| 1 | // =============================================================== // |
|---|
| 2 | // // |
|---|
| 3 | // File : NT_species_set.h // |
|---|
| 4 | // Purpose : // |
|---|
| 5 | // // |
|---|
| 6 | // Institute of Microbiology (Technical University Munich) // |
|---|
| 7 | // http://www.arb-home.de/ // |
|---|
| 8 | // // |
|---|
| 9 | // =============================================================== // |
|---|
| 10 | |
|---|
| 11 | #ifndef NT_SPECIES_SET_H |
|---|
| 12 | #define NT_SPECIES_SET_H |
|---|
| 13 | |
|---|
| 14 | #ifndef NT_TREE_CMP_H |
|---|
| 15 | #include "NT_tree_cmp.h" |
|---|
| 16 | #endif |
|---|
| 17 | #ifndef ARBTOOLS_H |
|---|
| 18 | #include <arbtools.h> |
|---|
| 19 | #endif |
|---|
| 20 | #ifndef AP_TREE_HXX |
|---|
| 21 | #include <AP_Tree.hxx> |
|---|
| 22 | #endif |
|---|
| 23 | |
|---|
// forward declarations:
class arb_progress; // only referenced via pointer in this header
class RSpecSet;     // defined further below
class TSpecSet;     // defined further below
|---|
| 27 | |
|---|
// @@@ improve compare logic:
// - species sets (and their bitstrings) should only contain species that occur in both trees
// - species that occur only in the RSpecSet-tree shall be stored in RSpecSet (as is done in TSpecSet::unfound_species_count)
// - a small penalty shall be assigned for them (as is done for TSpecSet)
|---|
| 32 | |
|---|
| 33 | class SpecSetRegistry : virtual Noncopyable { |
|---|
| 34 | long species_counter; // number of species added to hash |
|---|
| 35 | long nspecies; |
|---|
| 36 | long nsets; // number of RSpecSet added to 'sets' |
|---|
| 37 | |
|---|
| 38 | RSpecSet **sets; |
|---|
| 39 | int set_bits[256]; |
|---|
| 40 | |
|---|
| 41 | GroupMatchScorer scorer; |
|---|
| 42 | arb_progress *progress; |
|---|
| 43 | GB_HASH *species_hash; // contains [1..N] |
|---|
| 44 | unsigned char *tmp_bitstring; |
|---|
| 45 | |
|---|
| 46 | int max_nsets() const { return leafs_2_innerNodes(nspecies, ROOTED); } |
|---|
| 47 | |
|---|
| 48 | void dump_bitstring(const char *tag, unsigned char *bs); |
|---|
| 49 | |
|---|
| 50 | void add(const char *species_name); // max nspecies |
|---|
| 51 | void add(RSpecSet *rset); // max 2 * nspecies |
|---|
| 52 | |
|---|
| 53 | double search_and_remember_best_match_and_log_errors(const TSpecSet *tset, FILE *log); |
|---|
| 54 | |
|---|
| 55 | #if defined(UNIT_TESTS) |
|---|
| 56 | friend void TEST_species_sets(); |
|---|
| 57 | #endif |
|---|
| 58 | |
|---|
| 59 | public: |
|---|
| 60 | SpecSetRegistry(long nspecies_, arb_progress *progress_, const GroupMatchScorer& scorer_); |
|---|
| 61 | ~SpecSetRegistry(); |
|---|
| 62 | void finish(GB_ERROR& error); // call before destruction to retrieve errors |
|---|
| 63 | |
|---|
| 64 | long bitstring_bytes() const { return (nspecies-1)/8 + 1; } |
|---|
| 65 | long bitstring_longs() const { return (bitstring_bytes()-1)/sizeof(long) + 1; } |
|---|
| 66 | |
|---|
| 67 | unsigned char *allocate_bitstring() const { return ARB_calloc<unsigned char>(bitstring_longs()*sizeof(long)); } |
|---|
| 68 | |
|---|
| 69 | long get_species_index(const char *species_name) const { return GBS_read_hash(species_hash, species_name); } |
|---|
| 70 | RSpecSet *registerTree(AP_tree *node); |
|---|
| 71 | |
|---|
| 72 | RSpecSet *search_best_match(const TSpecSet *tset, GroupPenalty& min_penalty); |
|---|
| 73 | TSpecSet *find_best_matches_info(AP_tree *node, FILE *log, bool compare_node_info); |
|---|
| 74 | GB_ERROR write_node_information(FILE *log, bool delete_old_nodes, GroupsToTransfer what, const char *aci); |
|---|
| 75 | |
|---|
| 76 | void setScorer(const GroupMatchScorer& newScorer) { scorer = newScorer; } |
|---|
| 77 | }; |
|---|
| 78 | |
|---|
| 79 | |
|---|
| 80 | class SpecSet : virtual Noncopyable { |
|---|
| 81 | protected: |
|---|
| 82 | // SpecSet should only be used by derived classes |
|---|
| 83 | |
|---|
| 84 | int known_members; // number of registered members |
|---|
| 85 | |
|---|
| 86 | void init(AP_tree *nodei, const SpecSetRegistry& ssr); |
|---|
| 87 | |
|---|
| 88 | SpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const char *species_name); // create from species.. |
|---|
| 89 | SpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const SpecSet *l, const SpecSet *r); // ..or from two subsets |
|---|
| 90 | ~SpecSet(); |
|---|
| 91 | |
|---|
| 92 | public: |
|---|
| 93 | // @@@ make member private |
|---|
| 94 | unsigned char *bitstring; |
|---|
| 95 | AP_tree *set_node; // node in tree (from which SpecSet was initialized) |
|---|
| 96 | |
|---|
| 97 | bool is_leaf_set() const { return set_node && set_node->is_leaf(); } // @@@ might be wrong for zombies |
|---|
| 98 | int get_known_members() const { return known_members; } |
|---|
| 99 | }; |
|---|
| 100 | |
|---|
| 101 | class RSpecSet : public SpecSet { // derived from Noncopyable |
|---|
| 102 | // set registered in SpecSetRegistry |
|---|
| 103 | AP_tree *best_node; // node in other tree |
|---|
| 104 | GroupPenalty best_match; // result of matching 'this' versus TSpecSet of best_node |
|---|
| 105 | |
|---|
| 106 | public: |
|---|
| 107 | RSpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const char *species_name); // create from species.. |
|---|
| 108 | RSpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const RSpecSet *l, const RSpecSet *r); // ..or from two subsets |
|---|
| 109 | |
|---|
| 110 | void storeBetterMatch(const GroupPenalty& match, AP_tree *matched_node) { |
|---|
| 111 | // if 'this' was detected as best match for any TSpecSet of other (not registered) tree, |
|---|
| 112 | // -> store match in best_match + node of TSpecSet in best_node: |
|---|
| 113 | |
|---|
| 114 | nt_assert(!best_match.betterThan(match)); // avoid overwriting with worse match |
|---|
| 115 | |
|---|
| 116 | best_match = match; |
|---|
| 117 | best_node = matched_node; |
|---|
| 118 | } |
|---|
| 119 | |
|---|
| 120 | int size() const { return known_members; } // only contains known members by definition |
|---|
| 121 | const GroupPenalty& bestMatch() const { return best_match; } |
|---|
| 122 | AP_tree* matchedNode() const { return best_node; } |
|---|
| 123 | }; |
|---|
| 124 | |
|---|
| 125 | class TSpecSet : public SpecSet { // derived from Noncopyable |
|---|
| 126 | // set tested against sets in registry |
|---|
| 127 | |
|---|
| 128 | int unfound_species_count; // species missing in SpecSetRegistry |
|---|
| 129 | public: |
|---|
| 130 | TSpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const char *species_name); // create from species.. |
|---|
| 131 | TSpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const TSpecSet *l, const TSpecSet *r); // ..or from two subsets |
|---|
| 132 | |
|---|
| 133 | int size() const { return known_members + unfound_species_count; } |
|---|
| 134 | int get_unknown_members() const { return unfound_species_count; } |
|---|
| 135 | }; |
|---|
| 136 | |
|---|
| 137 | #else |
|---|
| 138 | #error NT_species_set.h included twice |
|---|
| 139 | #endif // NT_SPECIES_SET_H |
|---|