1 | // =============================================================== // |
---|
2 | // // |
---|
3 | // File : DI_clustertree.cxx // |
---|
4 | // Purpose : // |
---|
5 | // // |
---|
6 | // Coded by Ralf Westram (coder@reallysoft.de) in October 2009 // |
---|
7 | // Institute of Microbiology (Technical University Munich) // |
---|
8 | // http://www.arb-home.de/ // |
---|
9 | // // |
---|
10 | // =============================================================== // |
---|
11 | |
---|
12 | #include "di_clustertree.hxx" |
---|
13 | #include <arb_progress.h> |
---|
14 | #include <set> |
---|
15 | #include <cmath> |
---|
16 | #include <limits> |
---|
17 | |
---|
18 | using namespace std; |
---|
19 | |
---|
20 | typedef std::set<LeafRelation> SortedPairValues; // iterator runs from small to big values |
---|
21 | |
---|
22 | // ------------------------ |
---|
23 | // ClusterTreeRoot |
---|
24 | |
---|
25 | |
---|
26 | ClusterTreeRoot::ClusterTreeRoot(AliView *aliview, AP_sequence *seqTemplate_, AP_FLOAT maxDistance_, size_t minClusterSize_) |
---|
27 | : ARB_seqtree_root(aliview, seqTemplate_, false), |
---|
28 | maxDistance(maxDistance_), |
---|
29 | minClusterSize(minClusterSize_) |
---|
30 | {} |
---|
31 | |
---|
32 | #if defined(DEBUG) |
---|
33 | struct ClusterStats { |
---|
34 | size_t clusters; |
---|
35 | size_t subclusters; |
---|
36 | size_t loadedSequences; |
---|
37 | |
---|
38 | ClusterStats() : |
---|
39 | clusters(0), |
---|
40 | subclusters(0), |
---|
41 | loadedSequences(0) |
---|
42 | {} |
---|
43 | }; |
---|
44 | |
---|
45 | static void collectStats(ClusterTree *node, ClusterStats *stats) { |
---|
46 | switch (node->get_state()) { |
---|
47 | case CS_IS_CLUSTER: stats->clusters++; break; |
---|
48 | case CS_SUB_CLUSTER: stats->subclusters++; break; |
---|
49 | default: break; |
---|
50 | } |
---|
51 | if (node->is_leaf()) { |
---|
52 | stats->loadedSequences += node->hasSequence(); |
---|
53 | } |
---|
54 | else { |
---|
55 | collectStats(node->get_leftson(), stats); |
---|
56 | collectStats(node->get_rightson(), stats); |
---|
57 | } |
---|
58 | } |
---|
59 | #endif // DEBUG |
---|
60 | |
---|
61 | GB_ERROR ClusterTreeRoot::find_clusters() { |
---|
62 | ClusterTree *root = get_root_node(); |
---|
63 | |
---|
64 | root->init_tree(); |
---|
65 | #if defined(DEBUG) |
---|
66 | printf("----------------------------------------\n"); |
---|
67 | printf("maxDistance: %f\n", maxDistance); |
---|
68 | printf("minClusterSize: %u\n", minClusterSize); |
---|
69 | printf("Leafs in tree: %u\n", root->get_leaf_count()); |
---|
70 | printf("Possible clusters: %u\n", root->get_cluster_count()); |
---|
71 | #endif // DEBUG |
---|
72 | |
---|
73 | arb_progress cluster_progress(long(root->get_cluster_count())); |
---|
74 | cluster_progress.auto_subtitles("Cluster"); |
---|
75 | |
---|
76 | GB_ERROR error = NULp; |
---|
77 | |
---|
78 | try { |
---|
79 | root->detect_clusters(cluster_progress); |
---|
80 | } |
---|
81 | catch (const char *msg) { error = msg; } |
---|
82 | catch (...) { cl_assert(0); } |
---|
83 | |
---|
84 | if (error) { |
---|
85 | // avoid further access after error |
---|
86 | change_root(root, NULp); |
---|
87 | UNCOVERED(); |
---|
88 | destroy(root); |
---|
89 | } |
---|
90 | else { |
---|
91 | #if defined(DEBUG) |
---|
92 | ClusterStats stats; |
---|
93 | collectStats(root, &stats); |
---|
94 | |
---|
95 | printf("Found clusters:::::::: %zu\n", stats.clusters); |
---|
96 | printf("Found subclusters::::: %zu\n", stats.subclusters); |
---|
97 | printf("Loaded sequences:::::: %zu (%5.2f%%)\n", |
---|
98 | stats.loadedSequences, |
---|
99 | (100.0*stats.loadedSequences)/root->get_leaf_count()); |
---|
100 | |
---|
101 | #if defined(TRACE_DIST_CALC) |
---|
102 | size_t distances = root->get_calculated_distances(); |
---|
103 | size_t leafs = root->get_leaf_count(); |
---|
104 | size_t existingDistances = (leafs*leafs)/2-1; |
---|
105 | |
---|
106 | printf("Calculated distances:: %zu (%5.2f%%)\n", |
---|
107 | distances, |
---|
108 | (100.0*distances)/existingDistances); |
---|
109 | #endif // TRACE_DIST_CALC |
---|
110 | |
---|
111 | #endif // DEBUG |
---|
112 | } |
---|
113 | |
---|
114 | return error; |
---|
115 | } |
---|
116 | |
---|
117 | // -------------------- |
---|
118 | // ClusterTree |
---|
119 | |
---|
120 | void ClusterTree::init_tree() { |
---|
121 | cl_assert(state == CS_UNKNOWN); |
---|
122 | |
---|
123 | #if defined(TRACE_DIST_CALC) |
---|
124 | calculatedDistances = 0; |
---|
125 | #endif // TRACE_DIST_CALC |
---|
126 | |
---|
127 | if (get_father()) { |
---|
128 | depth = get_father()->get_depth()+1; |
---|
129 | } |
---|
130 | else { |
---|
131 | depth = 1; |
---|
132 | cl_assert(is_root_node()); |
---|
133 | } |
---|
134 | |
---|
135 | if (is_leaf()) { |
---|
136 | leaf_count = 1; |
---|
137 | clus_count = 0; |
---|
138 | state = CS_TOO_SMALL; |
---|
139 | } |
---|
140 | else { |
---|
141 | ClusterTree *lson = get_leftson(); |
---|
142 | ClusterTree *rson = get_rightson(); |
---|
143 | |
---|
144 | lson->init_tree(); |
---|
145 | rson->init_tree(); |
---|
146 | |
---|
147 | leaf_count = lson->get_leaf_count() + rson->get_leaf_count(); |
---|
148 | |
---|
149 | ClusterTreeRoot *troot = get_tree_root(); |
---|
150 | if (leaf_count<troot->get_minClusterSize()) { |
---|
151 | state = CS_TOO_SMALL; |
---|
152 | clus_count = 0; |
---|
153 | } |
---|
154 | else { |
---|
155 | state = CS_MAYBE_CLUSTER; |
---|
156 | clus_count = lson->get_cluster_count() + rson->get_cluster_count() + 1; |
---|
157 | min_bases = numeric_limits<AP_FLOAT>::infinity(); |
---|
158 | } |
---|
159 | } |
---|
160 | |
---|
161 | cl_assert(state != CS_UNKNOWN); |
---|
162 | } |
---|
163 | |
---|
164 | void ClusterTree::detect_clusters(arb_progress& progress) { |
---|
165 | if (state == CS_MAYBE_CLUSTER) { |
---|
166 | cl_assert(!is_leaf()); |
---|
167 | |
---|
168 | ClusterTree *lson = get_leftson(); |
---|
169 | ClusterTree *rson = get_rightson(); |
---|
170 | |
---|
171 | lson->detect_clusters(progress); |
---|
172 | rson->detect_clusters(progress); |
---|
173 | |
---|
174 | #if defined(TRACE_DIST_CALC) |
---|
175 | calculatedDistances += get_leftson()->calculatedDistances; |
---|
176 | calculatedDistances += get_rightson()->calculatedDistances; |
---|
177 | #endif // TRACE_DIST_CALC |
---|
178 | |
---|
179 | if (lson->get_state() == CS_NO_CLUSTER || rson->get_state() == CS_NO_CLUSTER) { |
---|
180 | // one son is no cluster -> can't be cluster myself |
---|
181 | state = CS_NO_CLUSTER; |
---|
182 | } |
---|
183 | else { |
---|
184 | state = CS_IS_CLUSTER; // assume this is a cluster |
---|
185 | |
---|
186 | // Get set of branches (sorted by branch distance) |
---|
187 | |
---|
188 | get_branch_dists(); // calculates branchDists |
---|
189 | SortedPairValues pairsByBranchDistance; // biggest branch distances at end |
---|
190 | { |
---|
191 | LeafRelationCIter bd_end = branchDists->end(); |
---|
192 | for (LeafRelationCIter bd = branchDists->begin(); bd != bd_end; ++bd) { |
---|
193 | pairsByBranchDistance.insert(LeafRelation(bd->first, bd->second)); |
---|
194 | } |
---|
195 | } |
---|
196 | |
---|
197 | cl_assert(pairsByBranchDistance.size() == possible_relations()); |
---|
198 | |
---|
199 | // calculate real sequence distance |
---|
200 | // * stop when distance > maxDistance is detected |
---|
201 | // * calculate longest branchdistance first |
---|
202 | // (Assumption is: big branch distance <=> big sequence distance) |
---|
203 | |
---|
204 | AP_FLOAT maxDistance = get_tree_root()->get_maxDistance(); |
---|
205 | AP_FLOAT worstDistanceSeen = -1.0; |
---|
206 | |
---|
207 | cl_assert(!sequenceDists); |
---|
208 | sequenceDists = new LeafRelations; |
---|
209 | |
---|
210 | SortedPairValues::const_reverse_iterator pair_end = pairsByBranchDistance.rend(); |
---|
211 | for (SortedPairValues::const_reverse_iterator pair = pairsByBranchDistance.rbegin(); pair != pair_end; ++pair) { |
---|
212 | AP_FLOAT realDist = get_seqDist(pair->get_pair()); |
---|
213 | |
---|
214 | if (realDist>worstDistanceSeen) { |
---|
215 | worstDistanceSeen = realDist; |
---|
216 | |
---|
217 | delete worstKnownDistance; |
---|
218 | worstKnownDistance = new TwoLeafs(pair->get_pair()); |
---|
219 | |
---|
220 | if (realDist>maxDistance) { // big distance found -> not a cluster |
---|
221 | state = CS_NO_CLUSTER; |
---|
222 | break; |
---|
223 | } |
---|
224 | } |
---|
225 | } |
---|
226 | |
---|
227 | #if defined(TRACE_DIST_CALC) |
---|
228 | calculatedDistances += sequenceDists->size(); |
---|
229 | #endif // TRACE_DIST_CALC |
---|
230 | |
---|
231 | if (state == CS_IS_CLUSTER) { |
---|
232 | #if defined(DEBUG) |
---|
233 | // test whether all distances have been calculated |
---|
234 | cl_assert(!is_leaf()); |
---|
235 | size_t knownDistances = sequenceDists->size() + |
---|
236 | get_leftson()->known_seqDists() + |
---|
237 | get_rightson()->known_seqDists(); |
---|
238 | |
---|
239 | cl_assert(knownDistances == possible_relations()); |
---|
240 | #endif // DEBUG |
---|
241 | |
---|
242 | get_leftson()->state = CS_SUB_CLUSTER; |
---|
243 | get_rightson()->state = CS_SUB_CLUSTER; |
---|
244 | } |
---|
245 | |
---|
246 | } |
---|
247 | |
---|
248 | cl_assert(state == CS_NO_CLUSTER || state == CS_IS_CLUSTER); // detection failure |
---|
249 | |
---|
250 | lson->oblivion(false); |
---|
251 | rson->oblivion(false); |
---|
252 | |
---|
253 | progress.inc(); |
---|
254 | if (progress.aborted()) throw "aborted on userrequest"; |
---|
255 | } |
---|
256 | |
---|
257 | cl_assert(state != CS_MAYBE_CLUSTER); |
---|
258 | } |
---|
259 | |
---|
260 | void ClusterTree::oblivion(bool forgetDistances) { |
---|
261 | delete branchDepths; |
---|
262 | branchDepths = NULp; |
---|
263 | |
---|
264 | delete branchDists; |
---|
265 | branchDists = NULp; |
---|
266 | |
---|
267 | if (state == CS_NO_CLUSTER) { |
---|
268 | if (forgetDistances) { |
---|
269 | delete sequenceDists; |
---|
270 | sequenceDists = NULp; |
---|
271 | } |
---|
272 | forgetDistances = true; |
---|
273 | } |
---|
274 | else { |
---|
275 | cl_assert(state & (CS_IS_CLUSTER|CS_SUB_CLUSTER|CS_TOO_SMALL)); |
---|
276 | } |
---|
277 | |
---|
278 | if (!is_leaf()) { |
---|
279 | get_leftson()->oblivion(forgetDistances); |
---|
280 | get_rightson()->oblivion(forgetDistances); |
---|
281 | } |
---|
282 | } |
---|
283 | |
---|
284 | void ClusterTree::calc_branch_depths() { |
---|
285 | cl_assert(!branchDepths); |
---|
286 | branchDepths = new NodeValues; |
---|
287 | if (is_leaf()) { |
---|
288 | (*branchDepths)[this] = 0.0; |
---|
289 | } |
---|
290 | else { |
---|
291 | for (int s = 0; s<2; s++) { |
---|
292 | const NodeValues *sonDepths = s ? get_leftson()->get_branch_depths() : get_rightson()->get_branch_depths(); |
---|
293 | AP_FLOAT lenToSon = s ? leftlen : rightlen; |
---|
294 | |
---|
295 | NodeValues::const_iterator sd_end = sonDepths->end(); |
---|
296 | for (NodeValues::const_iterator sd = sonDepths->begin(); sd != sd_end; ++sd) { |
---|
297 | (*branchDepths)[sd->first] = sd->second+lenToSon; |
---|
298 | } |
---|
299 | } |
---|
300 | } |
---|
301 | } |
---|
302 | |
---|
303 | void ClusterTree::calc_branch_dists() { |
---|
304 | cl_assert(!branchDists); |
---|
305 | if (!is_leaf()) { |
---|
306 | branchDists = new LeafRelations; |
---|
307 | |
---|
308 | for (int s = 0; s<2; s++) { |
---|
309 | ClusterTree *son = s ? get_leftson() : get_rightson(); |
---|
310 | const LeafRelations *sonDists = son->get_branch_dists(); |
---|
311 | |
---|
312 | cl_assert(sonDists || son->is_leaf()); |
---|
313 | if (sonDists) branchDists->insert(sonDists->begin(), sonDists->end()); |
---|
314 | } |
---|
315 | |
---|
316 | const NodeValues *ldepths = get_leftson()->get_branch_depths(); |
---|
317 | const NodeValues *rdepths = get_rightson()->get_branch_depths(); |
---|
318 | |
---|
319 | NodeValues::const_iterator l_end = ldepths->end(); |
---|
320 | NodeValues::const_iterator r_end = rdepths->end(); |
---|
321 | |
---|
322 | for (NodeValues::const_iterator ld = ldepths->begin(); ld != l_end; ++ld) { |
---|
323 | AP_FLOAT llen = ld->second+leftlen; |
---|
324 | for (NodeValues::const_iterator rd = rdepths->begin(); rd != r_end; ++rd) { |
---|
325 | AP_FLOAT rlen = rd->second+rightlen; |
---|
326 | |
---|
327 | (*branchDists)[TwoLeafs(ld->first, rd->first)] = llen+rlen; |
---|
328 | } |
---|
329 | } |
---|
330 | |
---|
331 | cl_assert(branchDists->size() == possible_relations()); |
---|
332 | } |
---|
333 | } |
---|
334 | |
---|
335 | AP_FLOAT ClusterTree::get_seqDist(const TwoLeafs& pair) { |
---|
336 | // searches or calculates the distance between the sequences of leafs in 'pair' |
---|
337 | |
---|
338 | const AP_FLOAT *found = has_seqDist(pair); |
---|
339 | if (!found) { |
---|
340 | const ClusterTree *commonFather = pair.first()->commonFatherWith(pair.second()); |
---|
341 | |
---|
342 | while (commonFather != this) { |
---|
343 | found = commonFather->has_seqDist(pair); |
---|
344 | if (found) break; |
---|
345 | |
---|
346 | commonFather = commonFather->get_father(); |
---|
347 | cl_assert(commonFather); // first commonFather was not inside this! |
---|
348 | } |
---|
349 | } |
---|
350 | |
---|
351 | if (!found) { |
---|
352 | // calculate the distance |
---|
353 | |
---|
354 | const AP_combinableSeq *seq1 = pair.first()->get_seq(); |
---|
355 | const AP_combinableSeq *seq2 = pair.second()->get_seq(); |
---|
356 | |
---|
357 | seq1->lazy_load_sequence(); |
---|
358 | seq2->lazy_load_sequence(); |
---|
359 | |
---|
360 | { |
---|
361 | AP_combinableSeq *ancestor = seq1->dup(); |
---|
362 | |
---|
363 | Mutations mutations = ancestor->combine_seq(seq1, seq2); |
---|
364 | AP_FLOAT wbc1 = seq1->weighted_base_count(); |
---|
365 | AP_FLOAT wbc2 = seq2->weighted_base_count(); |
---|
366 | AP_FLOAT minBaseCount = wbc1 < wbc2 ? wbc1 : wbc2; |
---|
367 | AP_FLOAT dist; |
---|
368 | |
---|
369 | if (minBaseCount>0) { |
---|
370 | dist = mutations / minBaseCount; |
---|
371 | if (minBaseCount < min_bases) min_bases = minBaseCount; |
---|
372 | } |
---|
373 | else { // got at least one empty sequence -> no distance |
---|
374 | dist = 0.0; |
---|
375 | min_bases = minBaseCount; |
---|
376 | } |
---|
377 | |
---|
378 | #if defined(DEBUG) && 0 |
---|
379 | const char *name1 = pair.first()->name; |
---|
380 | const char *name2 = pair.second()->name; |
---|
381 | printf("%s <-?-> %s: mutations=%5.2f wbc1=%5.2f wbc2=%5.2f mbc=%5.2f dist=%5.2f\n", |
---|
382 | name1, name2, mutations, wbc1, wbc2, minBaseCount, dist); |
---|
383 | #endif // DEBUG |
---|
384 | |
---|
385 | cl_assert(!is_nan_or_inf(dist)); |
---|
386 | |
---|
387 | (*sequenceDists)[pair] = dist; |
---|
388 | delete ancestor; |
---|
389 | } |
---|
390 | |
---|
391 | found = has_seqDist(pair); |
---|
392 | cl_assert(found); |
---|
393 | } |
---|
394 | |
---|
395 | cl_assert(found); |
---|
396 | return *found; |
---|
397 | } |
---|
398 | |
---|
399 | const AP_FLOAT *ClusterTree::has_seqDist(const TwoLeafs& pair) const { |
---|
400 | if (sequenceDists) { |
---|
401 | LeafRelationCIter found = sequenceDists->find(pair); |
---|
402 | if (found != sequenceDists->end()) { |
---|
403 | return &(found->second); |
---|
404 | } |
---|
405 | } |
---|
406 | return NULp; |
---|
407 | } |
---|
408 | |
---|
409 | const ClusterTree *ClusterTree::commonFatherWith(const ClusterTree *other) const { |
---|
410 | int depth_diff = get_depth() - other->get_depth(); |
---|
411 | if (depth_diff) { |
---|
412 | if (depth_diff<0) { // this is less deep than other |
---|
413 | return other->get_father()->commonFatherWith(this); |
---|
414 | } |
---|
415 | else { // other is less deep than this |
---|
416 | return get_father()->commonFatherWith(other); |
---|
417 | } |
---|
418 | } |
---|
419 | else { // both at same depth |
---|
420 | if (this == other) { // common father reached ? |
---|
421 | return this; |
---|
422 | } |
---|
423 | else { // otherwise check fathers |
---|
424 | return get_father()->commonFatherWith(other->get_father()); |
---|
425 | } |
---|
426 | } |
---|
427 | } |
---|
428 | |
---|