| 1 | #!/usr/bin/perl |
|---|
| 2 | # ========================================================= # |
|---|
| 3 | # # |
|---|
| 4 | # File : databaseReport.pl # |
|---|
| 5 | # Purpose : create generalized database report # |
|---|
| 6 | # # |
|---|
| 7 | # Coded by Ralf Westram (coder@reallysoft.de) in Jul 25 # |
|---|
| 8 | # http://www.arb-home.de/ # |
|---|
| 9 | # # |
|---|
| 10 | # ========================================================= # |
|---|
| 11 | # |
|---|
| 12 | # Script to analyze and summarize ARB database contents. |
|---|
| 13 | # |
|---|
| 14 | # Generates structured reports about: |
|---|
| 15 | # - Field usage and protection levels |
|---|
| 16 | # - Species counts and marking statistics |
|---|
| 17 | # - Alignments, trees, SAIs, and selection areas |
|---|
| 18 | # |
|---|
| 19 | # Intended for diagnostic and audit purposes in ARB workflows. |
|---|
| 20 | # |
|---|
| 21 | # ========================================================= # |
|---|
| 22 | |
|---|
| 23 | use strict; |
|---|
| 24 | use warnings; |
|---|
| 25 | |
|---|
| 26 | BEGIN { |
|---|
| 27 | if (not exists $ENV{'ARBHOME'}) { die "Environment variable \$ARBHOME has to be defined"; } |
|---|
| 28 | my $arbhome = $ENV{'ARBHOME'}; |
|---|
| 29 | push @INC, "$arbhome/lib"; |
|---|
| 30 | push @INC, "$arbhome/PERL_SCRIPTS/lib"; |
|---|
| 31 | 1; |
|---|
| 32 | } |
|---|
| 33 | |
|---|
| 34 | use ARB; |
|---|
| 35 | use tools; |
|---|
| 36 | |
|---|
| 37 | # utility functions to return the max/min of two values: |
|---|
| 38 | sub max($$) { |
|---|
| 39 | my ($a, $b) = @_; |
|---|
| 40 | $a<$b ? $b : $a; |
|---|
| 41 | } |
|---|
| 42 | sub min($$) { |
|---|
| 43 | my ($a, $b) = @_; |
|---|
| 44 | $a<$b ? $a : $b; |
|---|
| 45 | } |
|---|
| 46 | |
|---|
| 47 | sub report_stamped($$) { |
|---|
| 48 | # Prints a timestamped message to the output stream. |
|---|
| 49 | my ($OUT, $message) = @_; |
|---|
| 50 | |
|---|
| 51 | my $stamp = scalar(localtime); |
|---|
| 52 | print $OUT "$stamp | $message\n"; |
|---|
| 53 | } |
|---|
| 54 | sub start_section($$$) { |
|---|
| 55 | # Begins a new report section with optional underlining. |
|---|
| 56 | my ($OUT, $section, $underlined) = @_; |
|---|
| 57 | print $OUT "\n$section:\n"; |
|---|
| 58 | print $OUT '=' x (length($section)+1)."\n" if $underlined; |
|---|
| 59 | print $OUT "\n"; |
|---|
| 60 | } |
|---|
| 61 | |
|---|
| 62 | # functions to read values of required or optional database entries: |
|---|
| 63 | sub read_required_entry($$$$$) { |
|---|
| 64 | my ($parent, $key, $item_desc, $key_desc, $reader) = @_; |
|---|
| 65 | my $gb_entry = ARB::entry($parent, $key); |
|---|
| 66 | die "$item_desc lacks '$key'" unless $gb_entry; |
|---|
| 67 | my $value = $reader->($gb_entry); |
|---|
| 68 | expectError("read $key_desc of $item_desc") unless defined $value; |
|---|
| 69 | return $value; |
|---|
| 70 | } |
|---|
| 71 | sub read_required_string_entry($$$$) { |
|---|
| 72 | return read_required_entry($_[0], $_[1], $_[2], $_[3], \&ARB::read_string); |
|---|
| 73 | } |
|---|
| 74 | sub read_required_int_entry($$$$) { |
|---|
| 75 | return read_required_entry($_[0], $_[1], $_[2], $_[3], \&ARB::read_int); |
|---|
| 76 | } |
|---|
| 77 | sub read_optional_string_entry($$) { |
|---|
| 78 | my ($parent, $key) = @_; |
|---|
| 79 | my $gb_entry = ARB::entry($parent, $key); |
|---|
| 80 | return undef unless $gb_entry; |
|---|
| 81 | |
|---|
| 82 | my $value = ARB::read_string($gb_entry); |
|---|
| 83 | expectError("read value of '$key'") unless defined $value; |
|---|
| 84 | return $value; |
|---|
| 85 | } |
|---|
| 86 | |
|---|
| 87 | |
|---|
| 88 | sub scanFieldsRecursive($); |
|---|
| 89 | sub scanFieldsRecursive($) { |
|---|
| 90 | # Recursively scans all subnodes of the current item. |
|---|
| 91 | # Collects paths to leaf fields and their associated protection levels. |
|---|
| 92 | my ($gb_item) = @_; |
|---|
| 93 | my %childs = (); |
|---|
| 94 | for (my $gb_child = ARB::child($gb_item); $gb_child; $gb_child = ARB::nextChild($gb_child)) { |
|---|
| 95 | my $key = ARB::read_key($gb_child); |
|---|
| 96 | |
|---|
| 97 | my %sub = scanFieldsRecursive($gb_child); |
|---|
| 98 | if (%sub) { |
|---|
| 99 | foreach (keys %sub) { |
|---|
| 100 | $childs{$key.'/'.$_} = $sub{$_}; |
|---|
| 101 | } |
|---|
| 102 | } |
|---|
| 103 | else { |
|---|
| 104 | my $prot = ARB::read_security_write($gb_child); |
|---|
| 105 | $childs{$key} = $prot; |
|---|
| 106 | } |
|---|
| 107 | } |
|---|
| 108 | return %childs; # key=recursive field, value=protection |
|---|
| 109 | } |
|---|
| 110 | |
|---|
| 111 | sub report_fields_by_name($\%\%\%\%\%) { |
|---|
| 112 | # Outputs all collected fields sorted by name. |
|---|
| 113 | # Includes information about data type, protection level and |
|---|
| 114 | # the database browsers visibility status (hidden/unknown). |
|---|
| 115 | my ($OUT, $fields_r, $hidden_r, $type_r, $protMin_r, $protMax_r) = @_; |
|---|
| 116 | |
|---|
| 117 | my $typeCodes = "-bcif-B-CIFlSS-%"; # copied from GB_type_2_char [c++] |
|---|
| 118 | |
|---|
| 119 | start_section($OUT, "Fields by name", 0); |
|---|
| 120 | print $OUT " TYPE NAME\n\n"; |
|---|
| 121 | my @fieldsByName = sort { lc($a) cmp lc($b); } keys %$fields_r; |
|---|
| 122 | foreach (@fieldsByName) { |
|---|
| 123 | my $prot = undef; |
|---|
| 124 | if (not defined $$protMin_r{$_}) { $prot .= '-'; } |
|---|
| 125 | elsif ($$protMin_r{$_} == $$protMax_r{$_}) { $prot .= sprintf("%i", $$protMin_r{$_}); } |
|---|
| 126 | else { $prot .= sprintf("%i-%i", $$protMin_r{$_}, $$protMax_r{$_}); } |
|---|
| 127 | |
|---|
| 128 | my $typeChar = undef; |
|---|
| 129 | my $note = ''; |
|---|
| 130 | if (not defined $$type_r{$_}) { |
|---|
| 131 | $typeChar = '?'; |
|---|
| 132 | $note .= '[unknown]'; |
|---|
| 133 | } |
|---|
| 134 | else { |
|---|
| 135 | die "expected known type for '$_'" if not defined $$type_r{$_}; |
|---|
| 136 | my $type = $$type_r{$_}; |
|---|
| 137 | $typeChar = substr($typeCodes, $type, 1); |
|---|
| 138 | |
|---|
| 139 | if (defined $$hidden_r{$_}) { |
|---|
| 140 | if ($$hidden_r{$_}) { |
|---|
| 141 | $note .= '[hidden]'; |
|---|
| 142 | } |
|---|
| 143 | } |
|---|
| 144 | } |
|---|
| 145 | print $OUT sprintf(" %s%-3s %-10s %s\n", $typeChar, $prot, $note, $_); |
|---|
| 146 | } |
|---|
| 147 | } |
|---|
| 148 | |
|---|
| 149 | sub report_fields_by_frequency($\%$) { |
|---|
| 150 | # Outputs all collected fields sorted by descending frequency. |
|---|
| 151 | # Useful to identify commonly or rarely used fields. |
|---|
| 152 | my ($OUT, $fields_r, $speciesCount) = @_; |
|---|
| 153 | |
|---|
| 154 | my $frequencyDigits = max($speciesCount =~ tr/0-9//, 9); |
|---|
| 155 | |
|---|
| 156 | start_section($OUT, "Fields by frequency", 0); |
|---|
| 157 | my @fieldsByFrequency = sort { |
|---|
| 158 | $fields_r->{$b} <=> $fields_r->{$a} || lc($a) cmp lc($b); |
|---|
| 159 | } keys %{$fields_r}; |
|---|
| 160 | |
|---|
| 161 | print $OUT sprintf("%*s %8s %s\n\n", $frequencyDigits, "COUNT", "FREQ%", "NAME"); |
|---|
| 162 | foreach (@fieldsByFrequency) { |
|---|
| 163 | my $count = $fields_r->{$_}; |
|---|
| 164 | my $percentage = int($count * 10000 / $speciesCount) / 100; |
|---|
| 165 | print $OUT sprintf("%*i %6.2f%% %s\n", $frequencyDigits, $count, $percentage, $_); |
|---|
| 166 | } |
|---|
| 167 | } |
|---|
| 168 | |
|---|
| 169 | # -------------------------------------------------------------------------------- |
|---|
| 170 | # globals collected by collectOccurringFields(). |
|---|
| 171 | # used by reportSpeciesFields() and reportAlignments(). |
|---|
| 172 | |
|---|
| 173 | my $fieldsWereCollected = 0; |
|---|
| 174 | |
|---|
| 175 | my $speciesCount = 0; |
|---|
| 176 | my $markedSpecies = 0; |
|---|
| 177 | |
|---|
| 178 | my %fields = (); # parsed from species (key=fieldname, value=count) |
|---|
| 179 | my %protMin = (); # key=fieldname, value=min.protection |
|---|
| 180 | my %protMax = (); # key=fieldname, value=max.protection |
|---|
| 181 | |
|---|
| 182 | # -------------------------------------------------------------------------------- |
|---|
| 183 | |
|---|
| 184 | sub collectOccurringFields($) { |
|---|
| 185 | # Traverses the entire database and collects statistics on field usage and protection. |
|---|
| 186 | # (this function must be called before any report_* function) |
|---|
| 187 | my ($gb_main) = @_; |
|---|
| 188 | |
|---|
| 189 | # scan all species and collect occurring fields: |
|---|
| 190 | for (my $gb_species = BIO::first_species($gb_main); |
|---|
| 191 | $gb_species; |
|---|
| 192 | $gb_species = BIO::next_species($gb_species)) { |
|---|
| 193 | $speciesCount++; |
|---|
| 194 | my $marked = ARB::read_flag($gb_species); |
|---|
| 195 | if ($marked==1) { |
|---|
| 196 | $markedSpecies++; |
|---|
| 197 | } |
|---|
| 198 | my %childs = scanFieldsRecursive($gb_species); |
|---|
| 199 | foreach (keys %childs) { |
|---|
| 200 | my $seen = $fields{$_}; |
|---|
| 201 | my $protect = $childs{$_}; |
|---|
| 202 | if (defined $seen) { |
|---|
| 203 | $fields{$_} = $seen+1; |
|---|
| 204 | $protMax{$_} = max($protMax{$_}, $protect); |
|---|
| 205 | $protMin{$_} = min($protMin{$_}, $protect); |
|---|
| 206 | } |
|---|
| 207 | else { |
|---|
| 208 | $fields{$_} = 1; |
|---|
| 209 | $protMin{$_} = $protect; |
|---|
| 210 | $protMax{$_} = $protect; |
|---|
| 211 | } |
|---|
| 212 | } |
|---|
| 213 | } |
|---|
| 214 | |
|---|
| 215 | $fieldsWereCollected = 1; |
|---|
| 216 | } |
|---|
| 217 | |
|---|
| 218 | sub reportSpeciesFields($$) { |
|---|
| 219 | # Scans all known fields from /presets/key_data (=refreshed fields). |
|---|
| 220 | # Considers all occurring fields (scanned by collectOccurringFields). |
|---|
| 221 | # |
|---|
| 222 | # Prints an overview about species & fields, and two field lists, one sorted |
|---|
| 223 | # alphabetically, the other by frequency. |
|---|
| 224 | my ($gb_main, $OUT) = @_; |
|---|
| 225 | |
|---|
| 226 | die "collectOccurringFields not called" if $fieldsWereCollected==0; |
|---|
| 227 | |
|---|
| 228 | my @keys = (); # keys parsed from /presets/key_data |
|---|
| 229 | my %hidden = (); # key=fieldname, value=1 if hidden |
|---|
| 230 | my %type = (); # key=fieldname, value=type number |
|---|
| 231 | |
|---|
| 232 | # search known keys (aka refreshed fields): |
|---|
| 233 | my $gb_keys = ARB::search($gb_main, '/presets/key_data', 'NONE'); |
|---|
| 234 | die 'failed to find key-data' if not $gb_keys; |
|---|
| 235 | for (my $gb_key = ARB::entry($gb_keys, 'key'); $gb_key; $gb_key = ARB::nextEntry($gb_key)) { |
|---|
| 236 | my $name = read_required_string_entry($gb_key, 'key_name', 'key', 'name'); |
|---|
| 237 | my $type = read_required_int_entry($gb_key, 'key_type', 'key', 'type'); |
|---|
| 238 | |
|---|
| 239 | # types are defined in ../../ARBDB/arbdb.h@GB_TYPES |
|---|
| 240 | |
|---|
| 241 | if ($type != 15) { # ignore containers |
|---|
| 242 | my $hidden = read_required_int_entry($gb_key, 'key_hidden', 'key', 'hidden-flag'); |
|---|
| 243 | |
|---|
| 244 | push @keys, $name; |
|---|
| 245 | $hidden{$name} = $hidden; |
|---|
| 246 | $type{$name} = $type; |
|---|
| 247 | |
|---|
| 248 | # add fields which are present in key-data but not occurring in fields with zero counter. |
|---|
| 249 | if (not defined $fields{$name}) { |
|---|
| 250 | $fields{$name} = 0; |
|---|
| 251 | } |
|---|
| 252 | } |
|---|
| 253 | } |
|---|
| 254 | |
|---|
| 255 | start_section($OUT, "Species information", 1); |
|---|
| 256 | |
|---|
| 257 | my $fieldsCount = scalar(keys %fields); |
|---|
| 258 | |
|---|
| 259 | print $OUT " Species: $speciesCount\n"; |
|---|
| 260 | print $OUT " Marked: $markedSpecies\n"; |
|---|
| 261 | print $OUT " Fields: $fieldsCount\n"; |
|---|
| 262 | |
|---|
| 263 | report_fields_by_name($OUT, %fields, %hidden, %type, %protMin, %protMax); |
|---|
| 264 | report_fields_by_frequency($OUT, %fields, $speciesCount); |
|---|
| 265 | } |
|---|
| 266 | |
|---|
| 267 | sub reportAlignments($$) { |
|---|
| 268 | # Outputs information about all known alignments defined in /presets/alignment. |
|---|
| 269 | # For each alignment, shows name, type, length and usage in the DB. |
|---|
| 270 | my ($gb_main, $OUT) = @_; |
|---|
| 271 | |
|---|
| 272 | die "collectOccurringFields not called" if $fieldsWereCollected==0; |
|---|
| 273 | |
|---|
| 274 | start_section($OUT, "Alignment information", 1); |
|---|
| 275 | |
|---|
| 276 | my $gb_presets = ARB::search($gb_main, '/presets', 'NONE'); |
|---|
| 277 | die 'failed to find alignment-data' if not $gb_presets; |
|---|
| 278 | |
|---|
| 279 | my @alignment = (); |
|---|
| 280 | my %type = (); # key=name, value=type |
|---|
| 281 | my %length = (); # key=name, value=length |
|---|
| 282 | |
|---|
| 283 | my $maxOccur = 0; |
|---|
| 284 | my $maxLength = 0; |
|---|
| 285 | |
|---|
| 286 | for (my $gb_ali = ARB::entry($gb_presets, 'alignment'); $gb_ali; $gb_ali = ARB::nextEntry($gb_ali)) { # read all alignments |
|---|
| 287 | my $name = read_required_string_entry($gb_ali, 'alignment_name', 'alignment', 'name'); |
|---|
| 288 | my $type = read_required_string_entry($gb_ali, 'alignment_type', 'alignment', 'type'); |
|---|
| 289 | my $length = read_required_int_entry($gb_ali, 'alignment_len', 'alignment', 'length'); |
|---|
| 290 | |
|---|
| 291 | push @alignment, $name; |
|---|
| 292 | $type{$name} = $type; |
|---|
| 293 | $length{$name} = $length; |
|---|
| 294 | |
|---|
| 295 | $maxLength = max($maxLength, $length); |
|---|
| 296 | } |
|---|
| 297 | |
|---|
| 298 | # extract occurring alignments from fields: |
|---|
| 299 | my %seen = (); |
|---|
| 300 | foreach (keys %fields) { |
|---|
| 301 | if (/^(ali_[^\/]*)\/data$/o) { |
|---|
| 302 | my $occur = $fields{$_}; |
|---|
| 303 | $seen{$1} = $occur; |
|---|
| 304 | $maxOccur = max($maxOccur, $occur); |
|---|
| 305 | } |
|---|
| 306 | } |
|---|
| 307 | |
|---|
| 308 | my @all_ali = keys %seen; |
|---|
| 309 | push @all_ali, @alignment; |
|---|
| 310 | { |
|---|
| 311 | my %all_ali = map { $_ => 1; } @all_ali; |
|---|
| 312 | @all_ali = sort keys %all_ali; |
|---|
| 313 | } |
|---|
| 314 | |
|---|
| 315 | my $OCCUR = "SPECIES"; |
|---|
| 316 | my $LENGTH = "LENGTH"; |
|---|
| 317 | |
|---|
| 318 | my $occurDigits = max(($maxOccur =~ tr/0-9//), length($OCCUR)); |
|---|
| 319 | my $lengthDigits = max(($maxLength =~ tr/0-9//), length($LENGTH)); |
|---|
| 320 | |
|---|
| 321 | print $OUT sprintf(" %*s %8s %4s %*s %s\n\n", |
|---|
| 322 | $occurDigits, $OCCUR, "FREQ%", |
|---|
| 323 | "TYPE", |
|---|
| 324 | $lengthDigits, $LENGTH, |
|---|
| 325 | "NAME"); |
|---|
| 326 | |
|---|
| 327 | foreach (@all_ali) { |
|---|
| 328 | my $occur = $seen{$_}; |
|---|
| 329 | if (not defined $occur) { $occur = 0; } |
|---|
| 330 | my $percentage = int($occur * 10000 / $speciesCount) / 100; |
|---|
| 331 | |
|---|
| 332 | print $OUT sprintf(" %*s %6.2f%% %4s %*s %s\n", |
|---|
| 333 | $occurDigits, $occur, $percentage, |
|---|
| 334 | $type{$_}, |
|---|
| 335 | $lengthDigits, $length{$_}, |
|---|
| 336 | $_); |
|---|
| 337 | } |
|---|
| 338 | } |
|---|
| 339 | |
|---|
| 340 | sub countGroups($) { |
|---|
| 341 | # Counts the number of taxonomic groups defined in the passed tree. |
|---|
| 342 | my ($gb_tree) = @_; |
|---|
| 343 | |
|---|
| 344 | my $count = 0; |
|---|
| 345 | for (my $gb_node = ARB::entry($gb_tree, 'node'); $gb_node; $gb_node = ARB::nextEntry($gb_node)) { # read all nodes (=groups) |
|---|
| 346 | $count++; |
|---|
| 347 | } |
|---|
| 348 | return $count; |
|---|
| 349 | } |
|---|
| 350 | |
|---|
| 351 | sub reportTrees($$) { |
|---|
| 352 | # Lists all trees. |
|---|
| 353 | # Reports how many species and groups are contained in each tree. |
|---|
| 354 | my ($gb_main, $OUT) = @_; |
|---|
| 355 | |
|---|
| 356 | start_section($OUT, "Tree information", 1); |
|---|
| 357 | |
|---|
| 358 | my $gb_trees = ARB::search($gb_main, '/tree_data', 'NONE'); |
|---|
| 359 | die 'failed to find tree-data' if not $gb_trees; |
|---|
| 360 | |
|---|
| 361 | my @trees = (); |
|---|
| 362 | my %nodeCount = (); |
|---|
| 363 | my %groupCount = (); |
|---|
| 364 | my $maxNodeCount = 0; |
|---|
| 365 | my $maxGroupCount = 0; |
|---|
| 366 | |
|---|
| 367 | for (my $gb_sub = ARB::child($gb_trees); $gb_sub; $gb_sub = ARB::nextChild($gb_sub)) { # read all childs |
|---|
| 368 | my $name = ARB::read_key($gb_sub); # the tree name is "stored" in the key name :-( |
|---|
| 369 | expectError('read name of tree') if not $name; |
|---|
| 370 | |
|---|
| 371 | my $isTree = substr($name, 0, 5) eq 'tree_'; |
|---|
| 372 | die "entry is no tree: '$name'" if not $isTree; |
|---|
| 373 | |
|---|
| 374 | push @trees, $name; |
|---|
| 375 | |
|---|
| 376 | my $nodeCount = read_required_int_entry($gb_sub, 'nnodes', 'tree', 'nodeCount'); |
|---|
| 377 | my $groupCount = countGroups($gb_sub); |
|---|
| 378 | |
|---|
| 379 | $nodeCount{$name} = $nodeCount; |
|---|
| 380 | $groupCount{$name} = $groupCount; |
|---|
| 381 | |
|---|
| 382 | $maxNodeCount = max($maxNodeCount, $nodeCount); |
|---|
| 383 | $maxGroupCount = max($maxGroupCount, $groupCount); |
|---|
| 384 | } |
|---|
| 385 | |
|---|
| 386 | my $nodeCountDigits = max(($maxNodeCount =~ tr/0-9//), 3); |
|---|
| 387 | my $groupCountDigits = max(($maxGroupCount =~ tr/0-9//), 4); |
|---|
| 388 | |
|---|
| 389 | print $OUT sprintf(" %*s %*s %s\n\n", |
|---|
| 390 | $nodeCountDigits+1, "LEAF", |
|---|
| 391 | $groupCountDigits+1, "GROUP", |
|---|
| 392 | "NAME"); |
|---|
| 393 | |
|---|
| 394 | foreach (sort @trees) { |
|---|
| 395 | my $nodeCount = $nodeCount{$_}; |
|---|
| 396 | my $groupCount = $groupCount{$_}; |
|---|
| 397 | print $OUT sprintf(" %*i %*i %s\n", |
|---|
| 398 | $nodeCountDigits, $nodeCount, |
|---|
| 399 | $groupCountDigits, $groupCount, |
|---|
| 400 | $_); |
|---|
| 401 | } |
|---|
| 402 | } |
|---|
| 403 | |
|---|
| 404 | sub reportSAIs($$) { |
|---|
| 405 | # Reports all SAIs (sequence associated information) in the database. |
|---|
| 406 | # Each SAI includes group, name and all alignments for which it has associated data. |
|---|
| 407 | my ($gb_main, $OUT) = @_; |
|---|
| 408 | |
|---|
| 409 | start_section($OUT, "SAI information", 1); |
|---|
| 410 | |
|---|
| 411 | my $gb_SAIs = ARB::search($gb_main, '/extended_data', 'NONE'); |
|---|
| 412 | die 'failed to find SAI-data' if not $gb_SAIs; |
|---|
| 413 | |
|---|
| 414 | my @SAIs = (); |
|---|
| 415 | my %group = (); # key=sainame, value=groupname |
|---|
| 416 | my %alis = (); # key=sainame, value=ali-names |
|---|
| 417 | |
|---|
| 418 | my $maxGroupLen = 0; |
|---|
| 419 | my $maxNameLen = 0; |
|---|
| 420 | |
|---|
| 421 | for (my $gb_SAI = ARB::entry($gb_SAIs, 'extended'); $gb_SAI; $gb_SAI = ARB::nextEntry($gb_SAI)) { |
|---|
| 422 | my $name = read_required_string_entry($gb_SAI, 'name', 'SAI', 'name'); |
|---|
| 423 | push @SAIs, $name; |
|---|
| 424 | $maxNameLen = max($maxNameLen, length($name)); |
|---|
| 425 | |
|---|
| 426 | my $groupname = read_optional_string_entry($gb_SAI, 'sai_group'); |
|---|
| 427 | if (defined $groupname) { |
|---|
| 428 | $group{$name} = $groupname; |
|---|
| 429 | $maxGroupLen = max($maxGroupLen, length($groupname)); |
|---|
| 430 | } |
|---|
| 431 | |
|---|
| 432 | my @ali = (); |
|---|
| 433 | for (my $gb_sub = ARB::child($gb_SAI); $gb_sub; $gb_sub = ARB::nextChild($gb_sub)) { |
|---|
| 434 | my $key = ARB::read_key($gb_sub); |
|---|
| 435 | expectError('entry w/o key') if not $key; |
|---|
| 436 | if ($key =~ /^ali_/o) { |
|---|
| 437 | push @ali, $'; |
|---|
| 438 | } |
|---|
| 439 | } |
|---|
| 440 | $alis{$name} = join ' ', sort @ali; |
|---|
| 441 | } |
|---|
| 442 | |
|---|
| 443 | if ($maxGroupLen>0) { |
|---|
| 444 | $maxGroupLen += 4; # 2 for brackets + 2 spaces |
|---|
| 445 | } |
|---|
| 446 | |
|---|
| 447 | my @sortedSAIs = sort { |
|---|
| 448 | (defined $group{$b} <=> defined $group{$a}) |
|---|
| 449 | || |
|---|
| 450 | (defined $group{$a} # safe to compare only if both are defined |
|---|
| 451 | ? (lc($group{$a}) cmp lc($group{$b}) |
|---|
| 452 | || |
|---|
| 453 | $group{$a} cmp $group{$b}) |
|---|
| 454 | : 0) |
|---|
| 455 | || |
|---|
| 456 | lc($a) cmp lc($b); |
|---|
| 457 | } @SAIs; |
|---|
| 458 | |
|---|
| 459 | print $OUT sprintf(" %-*s%-*s %s\n\n", |
|---|
| 460 | int($maxGroupLen), $maxGroupLen ? "GROUP" : "", |
|---|
| 461 | int($maxNameLen), "NAME", |
|---|
| 462 | "ALIGNMENTS"); |
|---|
| 463 | foreach (@sortedSAIs) { |
|---|
| 464 | my $groupname = $group{$_}; |
|---|
| 465 | if (defined $groupname) { $groupname = '['.$groupname.']'; } |
|---|
| 466 | else { $groupname = ''; } |
|---|
| 467 | |
|---|
| 468 | print $OUT sprintf(" %-*s%-*s %s\n", |
|---|
| 469 | int($maxGroupLen), $groupname, |
|---|
| 470 | int($maxNameLen), $_, |
|---|
| 471 | $alis{$_}); |
|---|
| 472 | } |
|---|
| 473 | } |
|---|
| 474 | |
|---|
| 475 | sub parse_area_data($) { |
|---|
| 476 | # Extracts and parses an area-part of a species selection. |
|---|
| 477 | # Counts the number of contained species, SAIs and groups. |
|---|
| 478 | # see also similar code in ../../ARBDB/ad_config.cxx@PARSE_SELECTION |
|---|
| 479 | |
|---|
| 480 | my ($area) = @_; |
|---|
| 481 | my ($species, $sai, $groups) = (0, 0, 0); |
|---|
| 482 | |
|---|
| 483 | my $sep = chr(1); |
|---|
| 484 | my @items = split $sep, $area; |
|---|
| 485 | |
|---|
| 486 | die "area string is expected to start with separator" |
|---|
| 487 | if $items[0] ne ''; |
|---|
| 488 | shift @items; # drop first |
|---|
| 489 | |
|---|
| 490 | foreach (@items) { |
|---|
| 491 | my $type = substr($_, 0, 1); |
|---|
| 492 | if ($type eq 'L') { $species++; } |
|---|
| 493 | elsif ($type eq 'S') { $sai++; } |
|---|
| 494 | elsif ($type eq 'F' or $type eq 'G') { $groups++; } |
|---|
| 495 | elsif ($type ne 'E') { |
|---|
| 496 | print "Warning: Invalid type $type in token '$_' (while parsing selection)\n"; |
|---|
| 497 | } |
|---|
| 498 | } |
|---|
| 499 | |
|---|
| 500 | return ($species, $sai, $groups); |
|---|
| 501 | } |
|---|
| 502 | |
|---|
| 503 | sub parse_config_data($$) { |
|---|
| 504 | # Parses and sums up data from both configuration areas. |
|---|
| 505 | my ($top_area, $middle_area) = @_; |
|---|
| 506 | my ($tspecies, $tsai, $tgroups) = parse_area_data($top_area); |
|---|
| 507 | my ($mspecies, $msai, $mgroups) = parse_area_data($middle_area); |
|---|
| 508 | return ($tspecies+$mspecies, $tsai+$msai, $tgroups+$mgroups); |
|---|
| 509 | } |
|---|
| 510 | |
|---|
| 511 | sub reportSelections($$) { |
|---|
| 512 | # Reports information about species selections. |
|---|
| 513 | # For each entry, shows name and number of selected species, SAIs and groups. |
|---|
| 514 | my ($gb_main, $OUT) = @_; |
|---|
| 515 | |
|---|
| 516 | start_section($OUT, "Selection information", 1); |
|---|
| 517 | |
|---|
| 518 | my $gb_selections = ARB::search($gb_main, '/configuration_data', 'NONE'); |
|---|
| 519 | die 'failed to find configuration_data' if not $gb_selections; |
|---|
| 520 | |
|---|
| 521 | my @speciesSelections = (); |
|---|
| 522 | |
|---|
| 523 | my $maxSpecies = 0; |
|---|
| 524 | my $maxSai = 0; |
|---|
| 525 | my $maxGroups = 0; |
|---|
| 526 | |
|---|
| 527 | my %species = (); # key=configname, value=speciescount |
|---|
| 528 | my %sai = (); # key=configname, value=saicount |
|---|
| 529 | my %groups = (); # key=configname, value=groupscount |
|---|
| 530 | |
|---|
| 531 | for (my $gb_sel = ARB::entry($gb_selections, 'configuration'); $gb_sel; $gb_sel = ARB::nextEntry($gb_sel)) { |
|---|
| 532 | my $name = read_required_string_entry($gb_sel, 'name', 'speciesSelection', 'name'); |
|---|
| 533 | push @speciesSelections, $name; |
|---|
| 534 | |
|---|
| 535 | my $top_area = read_optional_string_entry($gb_sel, 'top_area') // ""; # if result is undef => fallback to empty string |
|---|
| 536 | my $middle_area = read_optional_string_entry($gb_sel, 'middle_area') // ""; |
|---|
| 537 | |
|---|
| 538 | my ($species, $sai, $groups) = parse_config_data($top_area, $middle_area); |
|---|
| 539 | |
|---|
| 540 | $species{$name} = $species; |
|---|
| 541 | $sai{$name} = $sai; |
|---|
| 542 | $groups{$name} = $groups; |
|---|
| 543 | |
|---|
| 544 | $maxSpecies = max($maxSpecies, $species); |
|---|
| 545 | $maxSai = max($maxSai, $sai); |
|---|
| 546 | $maxGroups = max($maxGroups, $groups); |
|---|
| 547 | } |
|---|
| 548 | |
|---|
| 549 | my @sortedSelections = sort { |
|---|
| 550 | (($b eq 'default_configuration') <=> ($a eq 'default_configuration')) |
|---|
| 551 | || |
|---|
| 552 | lc($a) cmp lc($b) |
|---|
| 553 | } @speciesSelections; |
|---|
| 554 | |
|---|
| 555 | my $SPECIES = "SPEC"; |
|---|
| 556 | my $SAI = "SAI"; |
|---|
| 557 | my $GROUPS = "GRP"; |
|---|
| 558 | |
|---|
| 559 | my $speciesDigits = max(($maxSpecies =~ tr/0-9//), length($SPECIES)); |
|---|
| 560 | my $saiDigits = max(($maxSai =~ tr/0-9//), length($SAI)); |
|---|
| 561 | my $groupsDigits = max(($maxGroups =~ tr/0-9//), length($GROUPS)); |
|---|
| 562 | |
|---|
| 563 | print $OUT sprintf(" %*s %*s %*s %s\n\n", |
|---|
| 564 | $speciesDigits, $SPECIES, |
|---|
| 565 | $saiDigits, $SAI, |
|---|
| 566 | $groupsDigits, $GROUPS, |
|---|
| 567 | "NAME"); |
|---|
| 568 | foreach (@sortedSelections) { |
|---|
| 569 | print $OUT sprintf(" %*s %*s %*s %s\n", |
|---|
| 570 | $speciesDigits, $species{$_}, |
|---|
| 571 | $saiDigits, $sai{$_}, |
|---|
| 572 | $groupsDigits, $groups{$_}, |
|---|
| 573 | $_); |
|---|
| 574 | } |
|---|
| 575 | } |
|---|
| 576 | |
|---|
| 577 | sub generateReport($$) { |
|---|
| 578 | # Main function that orchestrates report generation. |
|---|
| 579 | # Opens the database, calls data collection, and generates all report sections. |
|---|
| 580 | my ($dbFile, $reportFile) = @_; |
|---|
| 581 | |
|---|
| 582 | my $gb_main = ARB::open($dbFile,"r"); |
|---|
| 583 | $gb_main || expectError('db connect ('.(($dbFile eq ':') ? 'no running ARB?' : "could not open database '$dbFile'").')'); |
|---|
| 584 | dieOnError(ARB::begin_transaction($gb_main), 'begin_transaction'); |
|---|
| 585 | |
|---|
| 586 | my $dbname; |
|---|
| 587 | if ($dbFile eq ':') { |
|---|
| 588 | $dbname = 'database running in arb'; |
|---|
| 589 | } |
|---|
| 590 | else { |
|---|
| 591 | if ($dbFile =~ /[^\/]*$/o) { $dbname = $&; } |
|---|
| 592 | else { $dbname = $dbFile; } |
|---|
| 593 | } |
|---|
| 594 | |
|---|
| 595 | open(REPORT, '>'.$reportFile) || die "Failed to write to $reportFile (Reason: $!)"; |
|---|
| 596 | report_stamped(\*REPORT, "analysing database '$dbname'"); |
|---|
| 597 | |
|---|
| 598 | collectOccurringFields($gb_main); |
|---|
| 599 | |
|---|
| 600 | reportTrees($gb_main, \*REPORT); |
|---|
| 601 | reportSAIs($gb_main, \*REPORT); |
|---|
| 602 | reportAlignments($gb_main, \*REPORT); |
|---|
| 603 | reportSelections($gb_main, \*REPORT); |
|---|
| 604 | reportSpeciesFields($gb_main, \*REPORT); |
|---|
| 605 | |
|---|
| 606 | print REPORT "\n"; |
|---|
| 607 | report_stamped(\*REPORT, "end of report"); |
|---|
| 608 | close(REPORT); |
|---|
| 609 | |
|---|
| 610 | ARB::commit_transaction($gb_main); |
|---|
| 611 | ARB::close($gb_main); |
|---|
| 612 | } |
|---|
| 613 | |
|---|
| 614 | sub show_usage() { |
|---|
| 615 | # Prints command line usage information. |
|---|
| 616 | print <<"USAGE"; |
|---|
| 617 | Purpose: |
|---|
| 618 | Generate a structured and comparable report from an ARB database. |
|---|
| 619 | |
|---|
| 620 | Usage: |
|---|
| 621 | databaseReport.pl <database> <report> |
|---|
| 622 | |
|---|
| 623 | Arguments: |
|---|
| 624 | <database> Path to the ARB database to analyze. |
|---|
| 625 | Use ':' to select the currently running database (much slower). |
|---|
| 626 | <report> Name of the report to create. |
|---|
| 627 | |
|---|
| 628 | Description: |
|---|
| 629 | This script analyzes selected contents of an ARB database and writes a |
|---|
| 630 | standardized report. Such reports are suitable for visual inspection and |
|---|
| 631 | comparison across different databases using diff-viewers like meld or kompare. |
|---|
| 632 | |
|---|
| 633 | The generated report includes: |
|---|
| 634 | - Field usage, types and protection levels. |
|---|
| 635 | - Species counts, marks and selections. |
|---|
| 636 | - Alignments, trees and SAIs. |
|---|
| 637 | |
|---|
| 638 | Example: |
|---|
| 639 | ./databaseReport.pl my_db.arb my_report.txt |
|---|
| 640 | |
|---|
| 641 | Note: |
|---|
| 642 | Using ':' to access the currently running database is supported, but significantly slower. |
|---|
| 643 | Prefer using a specific database path when possible. |
|---|
| 644 | USAGE |
|---|
| 645 | } |
|---|
| 646 | |
|---|
| 647 | sub main() { |
|---|
| 648 | # Handles argument parsing and output file handling. |
|---|
| 649 | my $args = scalar(@ARGV); |
|---|
| 650 | if ($args!=2) { |
|---|
| 651 | show_usage(); |
|---|
| 652 | } |
|---|
| 653 | else { |
|---|
| 654 | my $dbFile = $ARGV[0]; |
|---|
| 655 | my $reportFile = $ARGV[1]; |
|---|
| 656 | |
|---|
| 657 | generateReport($dbFile, $reportFile); |
|---|
| 658 | } |
|---|
| 659 | } |
|---|
| 660 | |
|---|
| 661 | main(); |
|---|
| 662 | |
|---|