1 | #!/usr/bin/perl |
---|
2 | # ========================================================= # |
---|
3 | # # |
---|
4 | # File : findNewContent.pl # |
---|
5 | # Purpose : mark species by field content # |
---|
6 | # # |
---|
7 | # Coded by Ralf Westram (coder@reallysoft.de) in Dec 22 # |
---|
8 | # http://www.arb-home.de/ # |
---|
9 | # # |
---|
10 | # ========================================================= # |
---|
11 | |
---|
12 | use strict; |
---|
13 | use warnings; |
---|
14 | |
---|
15 | BEGIN { |
---|
16 | if (not exists $ENV{'ARBHOME'}) { die "Environment variable \$ARBHOME has to be defined"; } |
---|
17 | my $arbhome = $ENV{'ARBHOME'}; |
---|
18 | push @INC, "$arbhome/lib"; |
---|
19 | push @INC, "$arbhome/PERL_SCRIPTS/lib"; |
---|
20 | 1; |
---|
21 | } |
---|
22 | |
---|
23 | use ARB; |
---|
24 | use tools; |
---|
25 | |
---|
26 | sub findNewContent($) { |
---|
27 | my ($field) = @_; |
---|
28 | |
---|
29 | my $gb_main = ARB::open(":","r"); |
---|
30 | $gb_main || expectError('db connect (no running ARB?)'); |
---|
31 | |
---|
32 | dieOnError(ARB::begin_transaction($gb_main), 'begin_transaction'); |
---|
33 | |
---|
34 | my %marked_content = (); |
---|
35 | my %unmarked_content = (); |
---|
36 | |
---|
37 | my $markedCount = 0; # counts previously marked species |
---|
38 | my $allCount = 0; # count all species |
---|
39 | |
---|
40 | for (my $gb_species = BIO::first_species($gb_main); |
---|
41 | $gb_species; |
---|
42 | $gb_species = BIO::next_species($gb_species)) |
---|
43 | { |
---|
44 | ++$allCount; |
---|
45 | |
---|
46 | my $content = BIO::read_string($gb_species, $field); |
---|
47 | if (not $content) { $content = ''; } |
---|
48 | |
---|
49 | my $marked = ARB::read_flag($gb_species); |
---|
50 | if ($marked==1) { |
---|
51 | ++$markedCount; |
---|
52 | $marked_content{$content} = 1; |
---|
53 | } |
---|
54 | else { |
---|
55 | $unmarked_content{$content} = 1; |
---|
56 | } |
---|
57 | } |
---|
58 | |
---|
59 | my $marked_content = scalar(keys %marked_content); |
---|
60 | my $unmarked_content = scalar(keys %unmarked_content); |
---|
61 | |
---|
62 | my $notmarkedCount = $allCount-$markedCount; # species which were unmarked |
---|
63 | |
---|
64 | if ($markedCount==0 or $notmarkedCount==0) { |
---|
65 | print "Error: a subset of all species has to be marked (marked=$markedCount, unmarked=$notmarkedCount)\n"; |
---|
66 | } |
---|
67 | else { |
---|
68 | print "Different content detected in species subsets:\n"; |
---|
69 | print sprintf('- unmarked:%6i (in%6i species; %5.2f ~use)'."\n", $unmarked_content, $notmarkedCount, $notmarkedCount/$unmarked_content); |
---|
70 | print sprintf('- marked: %6i (in%6i species; %5.2f ~use)'."\n", $marked_content, $markedCount, $markedCount/$marked_content); |
---|
71 | |
---|
72 | my $unmarkedCount = 0; # count species where mark has been removed |
---|
73 | |
---|
74 | for (my $gb_species = BIO::first_species($gb_main); |
---|
75 | $gb_species; |
---|
76 | $gb_species = BIO::next_species($gb_species)) { |
---|
77 | my $marked = ARB::read_flag($gb_species); |
---|
78 | if ($marked==1) { |
---|
79 | my $content = BIO::read_string($gb_species, $field); |
---|
80 | if (not $content) { $content = ''; } |
---|
81 | |
---|
82 | if (exists $unmarked_content{$content}) { # already seen in old subset |
---|
83 | ARB::write_flag($gb_species, 0); # => unmark species |
---|
84 | ++$unmarkedCount; |
---|
85 | } |
---|
86 | } |
---|
87 | } |
---|
88 | |
---|
89 | print "Unmarked $unmarkedCount of previously $markedCount marked species.\n"; |
---|
90 | my $remain = $markedCount - $unmarkedCount; |
---|
91 | print "Species with new content remained marked: $remain\n"; |
---|
92 | } |
---|
93 | |
---|
94 | ARB::commit_transaction($gb_main); |
---|
95 | ARB::close($gb_main); |
---|
96 | } |
---|
97 | |
---|
98 | sub show_usage() { |
---|
99 | print "Purpose: Find new content in field of a subset of species.\n"; |
---|
100 | print "Usage: findNewContent.pl <field>\n"; |
---|
101 | print "\n"; |
---|
102 | print "What this script does:\n"; |
---|
103 | print "- detect contents of <field> in subsets of marked and unmarked species.\n"; |
---|
104 | print "- unmark all species which have <field> content already seen in at least one unmarked species.\n"; |
---|
105 | print "\n"; |
---|
106 | print "Missing fields will be treated as if field had an empty string as content.\n"; |
---|
107 | print "=> when you specify a non-existing field, this script will simply unmark all species.\n"; |
---|
108 | } |
---|
109 | |
---|
110 | sub main() { |
---|
111 | my $args = scalar(@ARGV); |
---|
112 | if ($args!=1) { |
---|
113 | show_usage(); |
---|
114 | } |
---|
115 | else { |
---|
116 | my $field = $ARGV[0]; |
---|
117 | findNewContent($field); |
---|
118 | } |
---|
119 | } |
---|
120 | |
---|
121 | main(); |
---|