| 1 | #!/usr/bin/perl |
|---|
| 2 | # ========================================================= # |
|---|
| 3 | # # |
|---|
| 4 | # File : findNewContent.pl # |
|---|
| 5 | # Purpose : mark species by field content # |
|---|
| 6 | # # |
|---|
| 7 | # Coded by Ralf Westram (coder@reallysoft.de) in Dec 22 # |
|---|
| 8 | # http://www.arb-home.de/ # |
|---|
| 9 | # # |
|---|
| 10 | # ========================================================= # |
|---|
| 11 | |
|---|
| 12 | use strict; |
|---|
| 13 | use warnings; |
|---|
| 14 | |
|---|
| 15 | BEGIN { |
|---|
| 16 | if (not exists $ENV{'ARBHOME'}) { die "Environment variable \$ARBHOME has to be defined"; } |
|---|
| 17 | my $arbhome = $ENV{'ARBHOME'}; |
|---|
| 18 | push @INC, "$arbhome/lib"; |
|---|
| 19 | push @INC, "$arbhome/PERL_SCRIPTS/lib"; |
|---|
| 20 | 1; |
|---|
| 21 | } |
|---|
| 22 | |
|---|
| 23 | use ARB; |
|---|
| 24 | use tools; |
|---|
| 25 | |
|---|
| 26 | sub findNewContent($) { |
|---|
| 27 | my ($field) = @_; |
|---|
| 28 | |
|---|
| 29 | my $gb_main = ARB::open(":","r"); |
|---|
| 30 | $gb_main || expectError('db connect (no running ARB?)'); |
|---|
| 31 | |
|---|
| 32 | dieOnError(ARB::begin_transaction($gb_main), 'begin_transaction'); |
|---|
| 33 | |
|---|
| 34 | my %marked_content = (); |
|---|
| 35 | my %unmarked_content = (); |
|---|
| 36 | |
|---|
| 37 | my $markedCount = 0; # counts previously marked species |
|---|
| 38 | my $allCount = 0; # count all species |
|---|
| 39 | |
|---|
| 40 | for (my $gb_species = BIO::first_species($gb_main); |
|---|
| 41 | $gb_species; |
|---|
| 42 | $gb_species = BIO::next_species($gb_species)) |
|---|
| 43 | { |
|---|
| 44 | ++$allCount; |
|---|
| 45 | |
|---|
| 46 | my $content = BIO::read_string($gb_species, $field); |
|---|
| 47 | if (not $content) { $content = ''; } |
|---|
| 48 | |
|---|
| 49 | my $marked = ARB::read_flag($gb_species); |
|---|
| 50 | if ($marked==1) { |
|---|
| 51 | ++$markedCount; |
|---|
| 52 | $marked_content{$content} = 1; |
|---|
| 53 | } |
|---|
| 54 | else { |
|---|
| 55 | $unmarked_content{$content} = 1; |
|---|
| 56 | } |
|---|
| 57 | } |
|---|
| 58 | |
|---|
| 59 | my $marked_content = scalar(keys %marked_content); |
|---|
| 60 | my $unmarked_content = scalar(keys %unmarked_content); |
|---|
| 61 | |
|---|
| 62 | my $notmarkedCount = $allCount-$markedCount; # species which were unmarked |
|---|
| 63 | |
|---|
| 64 | if ($markedCount==0 or $notmarkedCount==0) { |
|---|
| 65 | print "Error: a subset of all species has to be marked (marked=$markedCount, unmarked=$notmarkedCount)\n"; |
|---|
| 66 | } |
|---|
| 67 | else { |
|---|
| 68 | print "Different content detected in species subsets:\n"; |
|---|
| 69 | print sprintf('- unmarked:%6i (in%6i species; %5.2f ~use)'."\n", $unmarked_content, $notmarkedCount, $notmarkedCount/$unmarked_content); |
|---|
| 70 | print sprintf('- marked: %6i (in%6i species; %5.2f ~use)'."\n", $marked_content, $markedCount, $markedCount/$marked_content); |
|---|
| 71 | |
|---|
| 72 | my $unmarkedCount = 0; # count species where mark has been removed |
|---|
| 73 | |
|---|
| 74 | for (my $gb_species = BIO::first_species($gb_main); |
|---|
| 75 | $gb_species; |
|---|
| 76 | $gb_species = BIO::next_species($gb_species)) { |
|---|
| 77 | my $marked = ARB::read_flag($gb_species); |
|---|
| 78 | if ($marked==1) { |
|---|
| 79 | my $content = BIO::read_string($gb_species, $field); |
|---|
| 80 | if (not $content) { $content = ''; } |
|---|
| 81 | |
|---|
| 82 | if (exists $unmarked_content{$content}) { # already seen in old subset |
|---|
| 83 | ARB::write_flag($gb_species, 0); # => unmark species |
|---|
| 84 | ++$unmarkedCount; |
|---|
| 85 | } |
|---|
| 86 | } |
|---|
| 87 | } |
|---|
| 88 | |
|---|
| 89 | print "Unmarked $unmarkedCount of previously $markedCount marked species.\n"; |
|---|
| 90 | my $remain = $markedCount - $unmarkedCount; |
|---|
| 91 | print "Species with new content remained marked: $remain\n"; |
|---|
| 92 | } |
|---|
| 93 | |
|---|
| 94 | ARB::commit_transaction($gb_main); |
|---|
| 95 | ARB::close($gb_main); |
|---|
| 96 | } |
|---|
| 97 | |
|---|
| 98 | sub show_usage() { |
|---|
| 99 | print "Purpose: Find new content in field of a subset of species.\n"; |
|---|
| 100 | print "Usage: findNewContent.pl <field>\n"; |
|---|
| 101 | print "\n"; |
|---|
| 102 | print "What this script does:\n"; |
|---|
| 103 | print "- detect contents of <field> in subsets of marked and unmarked species.\n"; |
|---|
| 104 | print "- unmark all species which have <field> content already seen in at least one unmarked species.\n"; |
|---|
| 105 | print "\n"; |
|---|
| 106 | print "Missing fields will be treated as if field had an empty string as content.\n"; |
|---|
| 107 | print "=> when you specify a non-existing field, this script will simply unmark all species.\n"; |
|---|
| 108 | } |
|---|
| 109 | |
|---|
| 110 | sub main() { |
|---|
| 111 | my $args = scalar(@ARGV); |
|---|
| 112 | if ($args!=1) { |
|---|
| 113 | show_usage(); |
|---|
| 114 | } |
|---|
| 115 | else { |
|---|
| 116 | my $field = $ARGV[0]; |
|---|
| 117 | findNewContent($field); |
|---|
| 118 | } |
|---|
| 119 | } |
|---|
| 120 | |
|---|
| 121 | main(); |
|---|