1 | // =============================================================== // |
---|
2 | // // |
---|
3 | // File : NT_species_set.h // |
---|
4 | // Purpose : // |
---|
5 | // // |
---|
6 | // Institute of Microbiology (Technical University Munich) // |
---|
7 | // http://www.arb-home.de/ // |
---|
8 | // // |
---|
9 | // =============================================================== // |
---|
10 | |
---|
11 | #ifndef NT_SPECIES_SET_H |
---|
12 | #define NT_SPECIES_SET_H |
---|
13 | |
---|
14 | #ifndef NT_TREE_CMP_H |
---|
15 | #include "NT_tree_cmp.h" |
---|
16 | #endif |
---|
17 | #ifndef ARBTOOLS_H |
---|
18 | #include <arbtools.h> |
---|
19 | #endif |
---|
20 | #ifndef AP_TREE_HXX |
---|
21 | #include <AP_Tree.hxx> |
---|
22 | #endif |
---|
23 | |
---|
24 | class RSpecSet; |
---|
25 | class TSpecSet; |
---|
26 | class arb_progress; |
---|
27 | |
---|
28 | // @@@ improve compare logic: |
---|
29 | // - species sets (and bitstrings) should only contain species that occur in both trees |
---|
30 | // - species that occur only in RSpecSet-tree shall be stored in RSpecSet (like done in TSpecSet::unfound_species_count) |
---|
31 | // - a small penalty shall be assigned (as done for TSpecSet) |
---|
32 | |
---|
33 | class SpecSetRegistry : virtual Noncopyable { |
---|
34 | long species_counter; // number of species added to hash |
---|
35 | long nspecies; |
---|
36 | long nsets; // number of RSpecSet added to 'sets' |
---|
37 | |
---|
38 | RSpecSet **sets; |
---|
39 | int set_bits[256]; |
---|
40 | |
---|
41 | GroupMatchScorer scorer; |
---|
42 | arb_progress *progress; |
---|
43 | GB_HASH *species_hash; // contains [1..N] |
---|
44 | unsigned char *tmp_bitstring; |
---|
45 | |
---|
46 | int max_nsets() const { return leafs_2_innerNodes(nspecies, ROOTED); } |
---|
47 | |
---|
48 | void dump_bitstring(const char *tag, unsigned char *bs); |
---|
49 | |
---|
50 | void add(const char *species_name); // max nspecies |
---|
51 | void add(RSpecSet *rset); // max 2 * nspecies |
---|
52 | |
---|
53 | double search_and_remember_best_match_and_log_errors(const TSpecSet *tset, FILE *log); |
---|
54 | |
---|
55 | #if defined(UNIT_TESTS) |
---|
56 | friend void TEST_species_sets(); |
---|
57 | #endif |
---|
58 | |
---|
59 | public: |
---|
60 | SpecSetRegistry(long nspecies_, arb_progress *progress_, const GroupMatchScorer& scorer_); |
---|
61 | ~SpecSetRegistry(); |
---|
62 | void finish(GB_ERROR& error); // call before destruction to retrieve errors |
---|
63 | |
---|
64 | long bitstring_bytes() const { return (nspecies-1)/8 + 1; } |
---|
65 | long bitstring_longs() const { return (bitstring_bytes()-1)/sizeof(long) + 1; } |
---|
66 | |
---|
67 | unsigned char *allocate_bitstring() const { return ARB_calloc<unsigned char>(bitstring_longs()*sizeof(long)); } |
---|
68 | |
---|
69 | long get_species_index(const char *species_name) const { return GBS_read_hash(species_hash, species_name); } |
---|
70 | RSpecSet *registerTree(AP_tree *node); |
---|
71 | |
---|
72 | RSpecSet *search_best_match(const TSpecSet *tset, GroupPenalty& min_penalty); |
---|
73 | TSpecSet *find_best_matches_info(AP_tree *node, FILE *log, bool compare_node_info); |
---|
74 | GB_ERROR write_node_information(FILE *log, bool delete_old_nodes, GroupsToTransfer what, const char *aci); |
---|
75 | |
---|
76 | void setScorer(const GroupMatchScorer& newScorer) { scorer = newScorer; } |
---|
77 | }; |
---|
78 | |
---|
79 | |
---|
80 | class SpecSet : virtual Noncopyable { |
---|
81 | protected: |
---|
82 | // SpecSet should only be used by derived classes |
---|
83 | |
---|
84 | int known_members; // number of registered members |
---|
85 | |
---|
86 | void init(AP_tree *nodei, const SpecSetRegistry& ssr); |
---|
87 | |
---|
88 | SpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const char *species_name); // create from species.. |
---|
89 | SpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const SpecSet *l, const SpecSet *r); // ..or from two subsets |
---|
90 | ~SpecSet(); |
---|
91 | |
---|
92 | public: |
---|
93 | // @@@ make member private |
---|
94 | unsigned char *bitstring; |
---|
95 | AP_tree *set_node; // node in tree (from which SpecSet was initialized) |
---|
96 | |
---|
97 | bool is_leaf_set() const { return set_node && set_node->is_leaf(); } // @@@ might be wrong for zombies |
---|
98 | int get_known_members() const { return known_members; } |
---|
99 | }; |
---|
100 | |
---|
101 | class RSpecSet : public SpecSet { // derived from Noncopyable |
---|
102 | // set registered in SpecSetRegistry |
---|
103 | AP_tree *best_node; // node in other tree |
---|
104 | GroupPenalty best_match; // result of matching 'this' versus TSpecSet of best_node |
---|
105 | |
---|
106 | public: |
---|
107 | RSpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const char *species_name); // create from species.. |
---|
108 | RSpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const RSpecSet *l, const RSpecSet *r); // ..or from two subsets |
---|
109 | |
---|
110 | void storeBetterMatch(const GroupPenalty& match, AP_tree *matched_node) { |
---|
111 | // if 'this' was detected as best match for any TSpecSet of other (not registered) tree, |
---|
112 | // -> store match in best_match + node of TSpecSet in best_node: |
---|
113 | |
---|
114 | nt_assert(!best_match.betterThan(match)); // avoid overwriting with worse match |
---|
115 | |
---|
116 | best_match = match; |
---|
117 | best_node = matched_node; |
---|
118 | } |
---|
119 | |
---|
120 | int size() const { return known_members; } // only contains known members by definition |
---|
121 | const GroupPenalty& bestMatch() const { return best_match; } |
---|
122 | AP_tree* matchedNode() const { return best_node; } |
---|
123 | }; |
---|
124 | |
---|
125 | class TSpecSet : public SpecSet { // derived from Noncopyable |
---|
126 | // set tested against sets in registry |
---|
127 | |
---|
128 | int unfound_species_count; // species missing in SpecSetRegistry |
---|
129 | public: |
---|
130 | TSpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const char *species_name); // create from species.. |
---|
131 | TSpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const TSpecSet *l, const TSpecSet *r); // ..or from two subsets |
---|
132 | |
---|
133 | int size() const { return known_members + unfound_species_count; } |
---|
134 | int get_unknown_members() const { return unfound_species_count; } |
---|
135 | }; |
---|
136 | |
---|
137 | #else |
---|
138 | #error NT_species_set.h included twice |
---|
139 | #endif // NT_SPECIES_SET_H |
---|