source: trunk/PERL_SCRIPTS/SPECIES/findNewContent.pl

Last change on this file was 19355, checked in by westram, 2 years ago
  • adds script to detect new field content in marked subset of species (compared with unmarked subset).
  • fix whitespace only in listMarkedSpecies.pl.
  • Property svn:executable set to *
File size: 3.9 KB
Line 
1#!/usr/bin/perl
2# ========================================================= #
3#                                                           #
4#   File      : findNewContent.pl                           #
5#   Purpose   : mark species by field content               #
6#                                                           #
7#   Coded by Ralf Westram (coder@reallysoft.de) in Dec 22   #
8#   http://www.arb-home.de/                                 #
9#                                                           #
10# ========================================================= #
11
12use strict;
13use warnings;
14
15BEGIN {
16  if (not exists $ENV{'ARBHOME'}) { die "Environment variable \$ARBHOME has to be defined"; }
17  my $arbhome = $ENV{'ARBHOME'};
18  push @INC, "$arbhome/lib";
19  push @INC, "$arbhome/PERL_SCRIPTS/lib";
20  1;
21}
22
23use ARB;
24use tools;
25
26sub findNewContent($) {
27  my ($field) = @_;
28
29  my $gb_main = ARB::open(":","r");
30  $gb_main || expectError('db connect (no running ARB?)');
31
32  dieOnError(ARB::begin_transaction($gb_main), 'begin_transaction');
33
34  my %marked_content   = ();
35  my %unmarked_content = ();
36
37  my $markedCount = 0; # counts previously marked species
38  my $allCount    = 0; # count all species
39
40  for (my $gb_species = BIO::first_species($gb_main);
41       $gb_species;
42       $gb_species = BIO::next_species($gb_species))
43    {
44      ++$allCount;
45
46      my $content = BIO::read_string($gb_species, $field);
47      if (not $content) { $content = ''; }
48
49      my $marked = ARB::read_flag($gb_species);
50      if ($marked==1) {
51        ++$markedCount;
52        $marked_content{$content} = 1;
53      }
54      else {
55        $unmarked_content{$content} = 1;
56      }
57    }
58
59  my $marked_content   = scalar(keys %marked_content);
60  my $unmarked_content = scalar(keys %unmarked_content);
61
62  my $notmarkedCount = $allCount-$markedCount; # species which were unmarked
63
64  if ($markedCount==0 or $notmarkedCount==0) {
65    print "Error: a subset of all species has to be marked (marked=$markedCount, unmarked=$notmarkedCount)\n";
66  }
67  else {
68    print "Different content detected in species subsets:\n";
69    print sprintf('- unmarked:%6i (in%6i species; %5.2f ~use)'."\n", $unmarked_content, $notmarkedCount, $notmarkedCount/$unmarked_content);
70    print sprintf('- marked:  %6i (in%6i species; %5.2f ~use)'."\n", $marked_content,   $markedCount,    $markedCount/$marked_content);
71
72    my $unmarkedCount = 0; # count species where mark has been removed
73
74    for (my $gb_species = BIO::first_species($gb_main);
75         $gb_species;
76         $gb_species = BIO::next_species($gb_species)) {
77      my $marked = ARB::read_flag($gb_species);
78      if ($marked==1) {
79        my $content = BIO::read_string($gb_species, $field);
80        if (not $content) { $content = ''; }
81
82        if (exists $unmarked_content{$content}) { # already seen in old subset
83          ARB::write_flag($gb_species, 0); # => unmark species
84          ++$unmarkedCount;
85        }
86      }
87    }
88
89    print "Unmarked $unmarkedCount of previously $markedCount marked species.\n";
90    my $remain = $markedCount - $unmarkedCount;
91    print "Species with new content remained marked: $remain\n";
92  }
93
94  ARB::commit_transaction($gb_main);
95  ARB::close($gb_main);
96}
97
98sub show_usage() {
99  print "Purpose: Find new content in field of a subset of species.\n";
100  print "Usage: findNewContent.pl <field>\n";
101  print "\n";
102  print "What this script does:\n";
103  print "- detect contents of <field> in subsets of marked and unmarked species.\n";
104  print "- unmark all species which have <field> content already seen in at least one unmarked species.\n";
105  print "\n";
106  print "Missing fields will be treated as if field had an empty string as content.\n";
107  print "=> when you specify a non-existing field, this script will simply unmark all species.\n";
108}
109
110sub main() {
111  my $args = scalar(@ARGV);
112  if ($args!=1) {
113    show_usage();
114  }
115  else {
116    my $field = $ARGV[0];
117    findNewContent($field);
118  }
119}
120
121main();
Note: See TracBrowser for help on using the repository browser.