1 | // ============================================================= // |
---|
2 | // // |
---|
3 | // File : CT_hash.cxx // |
---|
4 | // Purpose : // |
---|
5 | // // |
---|
6 | // Institute of Microbiology (Technical University Munich) // |
---|
7 | // http://www.arb-home.de/ // |
---|
8 | // // |
---|
9 | // ============================================================= // |
---|
10 | |
---|
11 | #include "CT_hash.hxx" |
---|
12 | #include "CT_ntree.hxx" |
---|
13 | #include "CT_ctree.hxx" |
---|
14 | #include <cmath> |
---|
15 | |
---|
16 | #include <arb_sort.h> |
---|
17 | |
---|
18 | PartRegistry::~PartRegistry() { |
---|
19 | for (PartSet::iterator p = parts.begin(); p != parts.end(); ++p) delete *p; |
---|
20 | for (PartSet::iterator p = artificial_parts.begin(); p != artificial_parts.end(); ++p) delete *p; |
---|
21 | for (PartVector::iterator p = sorted.begin(); p != sorted.end(); ++p) delete *p; |
---|
22 | } |
---|
23 | |
---|
24 | PART *PartRegistry::get_part() { |
---|
25 | /*! each call to get_part() returns the next part from the sorted PART-list. |
---|
26 | * build_sorted_list() has to be called once before. |
---|
27 | */ |
---|
28 | |
---|
29 | arb_assert(retrieval_phase()); // call build_sorted_list() before retrieving |
---|
30 | |
---|
31 | PART *p = NULp; |
---|
32 | if (retrieved<sorted.size()) { |
---|
33 | std::swap(p, sorted[retrieved++]); |
---|
34 | } |
---|
35 | return p; |
---|
36 | } |
---|
37 | |
---|
38 | const PART *PartRegistry::peek_part(int idx) const { |
---|
39 | arb_assert(idx>=0 && size_t(idx)<size()); |
---|
40 | arb_assert(retrieval_phase()); // call build_sorted_list() before retrieving |
---|
41 | |
---|
42 | return sorted[idx]; |
---|
43 | } |
---|
44 | |
---|
45 | |
---|
46 | void PartRegistry::put_part_from_complete_tree(PART*& part, const TreeNode *node) { |
---|
47 | //! insert part in PartRegistry (destroys/reassigns part) |
---|
48 | |
---|
49 | arb_assert(registration_phase()); |
---|
50 | part->standardize(); |
---|
51 | |
---|
52 | PartSet::iterator found = parts.find(part); |
---|
53 | |
---|
54 | if (found != parts.end()) { |
---|
55 | (*found)->addWeightAndLength(part); |
---|
56 | delete part; |
---|
57 | } |
---|
58 | else { |
---|
59 | part->set_origin(node); |
---|
60 | parts.insert(part); |
---|
61 | } |
---|
62 | part = NULp; |
---|
63 | } |
---|
64 | |
---|
65 | void PartRegistry::put_artificial_part(PART*& part) { |
---|
66 | arb_assert(registration_phase()); |
---|
67 | part->standardize(); |
---|
68 | |
---|
69 | PartSet::iterator found = artificial_parts.find(part); |
---|
70 | if (found != artificial_parts.end()) { |
---|
71 | (*found)->addWeightAndLength(part); |
---|
72 | delete part; |
---|
73 | } |
---|
74 | else { |
---|
75 | artificial_parts.insert(part); |
---|
76 | } |
---|
77 | part = NULp; |
---|
78 | } |
---|
79 | |
---|
80 | void PartRegistry::put_part_from_partial_tree(PART*& part, const PART *partialTree, const TreeNode *node) { |
---|
81 | //! insert part from partialTree into PartRegistry (destroys/reassigns part) |
---|
82 | // |
---|
83 | // Example: |
---|
84 | // |
---|
85 | // 'part': A---B (A=part, B=invs) |
---|
86 | // set of missing species: C |
---|
87 | // |
---|
88 | // A |
---|
89 | // / |
---|
90 | // assumed tree: C---+ |
---|
91 | // \ . |
---|
92 | // B |
---|
93 | // |
---|
94 | // inserted partitions: A---CB |
---|
95 | // AC---B |
---|
96 | // |
---|
97 | // Both partitions are inserted here. Since they mutually exclude each other, |
---|
98 | // only one of them can and will be inserted into the result tree. |
---|
99 | // |
---|
100 | // The algorithm assumes that all species missing in the partialTree, |
---|
101 | // reside on the bigger side of each branch of the partialTree. |
---|
102 | // |
---|
103 | // Therefore the partition representing that state is added with normal weight. |
---|
104 | // The opposite ("missing on smaller side of branch") is added with reduced |
---|
105 | // probability. |
---|
106 | // |
---|
107 | // Note: The PART 'C---AB' might exists at EACH branch in the deconstructed tree |
---|
108 | // and is inserted once globally (in deconstruct_partial_rootnode). |
---|
109 | |
---|
110 | arb_assert(registration_phase()); |
---|
111 | |
---|
112 | PART *invs = part->clone(); |
---|
113 | invs->invertInSuperset(partialTree); |
---|
114 | |
---|
115 | int diff = part->get_members() - invs->get_members(); |
---|
116 | if (diff != 0) { |
---|
117 | if (diff<0) std::swap(part, invs); |
---|
118 | // now part contains bigger partition |
---|
119 | |
---|
120 | // Note: adding 'part' means "add unknown species to smaller side of the partition" |
---|
121 | int edges_in_part = leafs_2_edges(part->get_members()+1, ROOTED); |
---|
122 | int edges_in_invs = leafs_2_edges(invs->get_members()+1, ROOTED); |
---|
123 | |
---|
124 | double part_prob = double(edges_in_invs)/edges_in_part; // probability for one unkown species to reside inside smaller set |
---|
125 | arb_assert(part_prob>0.0 && part_prob<1.0); |
---|
126 | |
---|
127 | int unknownCount = partialTree->get_nonmembers(); |
---|
128 | double all_in_part_prob = pow(part_prob, unknownCount); // probability ALL unkown species reside inside smaller set |
---|
129 | |
---|
130 | part->set_faked_weight(all_in_part_prob); // make "unknown in smaller" unlikely |
---|
131 | // -> reduces chance to be picked and bootstrap IF picked |
---|
132 | } |
---|
133 | // else, if both partitions have equal size -> simply add both |
---|
134 | |
---|
135 | put_part_from_complete_tree(part, node); // inserts 'A---CB' (i.e. missing species added to invs, which is smaller) |
---|
136 | put_part_from_complete_tree(invs, node); // inserts 'B---CA' (i.e. missing species added to part) |
---|
137 | } |
---|
138 | |
---|
139 | inline bool insertionOrder_less(const PART *p1, const PART *p2) { |
---|
140 | return p1->insertionOrder_cmp(p2)<0; |
---|
141 | } |
---|
142 | |
---|
143 | void PartRegistry::build_sorted_list(double overall_weight) { |
---|
144 | //! sort the parts into insertion order. |
---|
145 | |
---|
146 | arb_assert(!parts.empty()); // nothing has been added |
---|
147 | arb_assert(registration_phase()); |
---|
148 | |
---|
149 | merge_artificial_parts(); |
---|
150 | |
---|
151 | copy(parts.begin(), parts.end(), std::back_insert_iterator<PartVector>(sorted)); |
---|
152 | sort(sorted.begin(), sorted.end(), insertionOrder_less); |
---|
153 | |
---|
154 | parts.clear(); |
---|
155 | |
---|
156 | for (PartVector::iterator pi = sorted.begin(); pi != sorted.end(); ++pi) { |
---|
157 | (*pi)->takeMean(overall_weight); |
---|
158 | } |
---|
159 | |
---|
160 | arb_assert(retrieval_phase()); |
---|
161 | } |
---|
162 | |
---|
163 | void PartRegistry::merge_artificial_parts() { |
---|
164 | /*! 'artificial_parts' contains those partitions that did not occur in |
---|
165 | * one PARTIAL input tree. |
---|
166 | * |
---|
167 | * Not-yet-registered artificial_parts will be registered here. |
---|
168 | * Their weight is faked to zero (aka "never occurs"). |
---|
169 | * |
---|
170 | * Already registered artificial_parts are only skipped -> their bootstrap |
---|
171 | * will be set correctly (just like when the branch as such does not exist |
---|
172 | * in a non-partial tree). |
---|
173 | */ |
---|
174 | |
---|
175 | for (PartSet::iterator arti = artificial_parts.begin(); arti != artificial_parts.end(); ++arti) { |
---|
176 | if (parts.find(*arti) == parts.end()) { |
---|
177 | // partition was not added by any known edge -> add plain artificial edge |
---|
178 | (*arti)->set_faked_weight(0.0); // never occurs (implicates length=1.0; see get_len()) |
---|
179 | parts.insert(*arti); |
---|
180 | } |
---|
181 | else { |
---|
182 | delete *arti; |
---|
183 | } |
---|
184 | } |
---|
185 | artificial_parts.clear(); |
---|
186 | } |
---|
187 | |
---|