| 1 | // =============================================================== // |
|---|
| 2 | // // |
|---|
| 3 | // File : DI_clustertree.cxx // |
|---|
| 4 | // Purpose : // |
|---|
| 5 | // // |
|---|
| 6 | // Coded by Ralf Westram (coder@reallysoft.de) in October 2009 // |
|---|
| 7 | // Institute of Microbiology (Technical University Munich) // |
|---|
| 8 | // http://www.arb-home.de/ // |
|---|
| 9 | // // |
|---|
| 10 | // =============================================================== // |
|---|
| 11 | |
|---|
| 12 | #include "di_clustertree.hxx" |
|---|
| 13 | #include <arb_progress.h> |
|---|
| 14 | #include <set> |
|---|
| 15 | #include <cmath> |
|---|
| 16 | #include <limits> |
|---|
| 17 | |
|---|
| 18 | using namespace std; |
|---|
| 19 | |
|---|
| 20 | typedef std::set<LeafRelation> SortedPairValues; // iterator runs from small to big values |
|---|
| 21 | |
|---|
| 22 | // ------------------------ |
|---|
| 23 | // ClusterTreeRoot |
|---|
| 24 | |
|---|
| 25 | |
|---|
| 26 | ClusterTreeRoot::ClusterTreeRoot(AliView *aliview, AP_sequence *seqTemplate_, AP_FLOAT maxDistance_, size_t minClusterSize_) |
|---|
| 27 | : ARB_seqtree_root(aliview, seqTemplate_, false), |
|---|
| 28 | maxDistance(maxDistance_), |
|---|
| 29 | minClusterSize(minClusterSize_) |
|---|
| 30 | {} |
|---|
| 31 | |
|---|
| 32 | #if defined(DEBUG) |
|---|
| 33 | struct ClusterStats { |
|---|
| 34 | size_t clusters; |
|---|
| 35 | size_t subclusters; |
|---|
| 36 | size_t loadedSequences; |
|---|
| 37 | |
|---|
| 38 | ClusterStats() : |
|---|
| 39 | clusters(0), |
|---|
| 40 | subclusters(0), |
|---|
| 41 | loadedSequences(0) |
|---|
| 42 | {} |
|---|
| 43 | }; |
|---|
| 44 | |
|---|
| 45 | static void collectStats(ClusterTree *node, ClusterStats *stats) { |
|---|
| 46 | switch (node->get_state()) { |
|---|
| 47 | case CS_IS_CLUSTER: stats->clusters++; break; |
|---|
| 48 | case CS_SUB_CLUSTER: stats->subclusters++; break; |
|---|
| 49 | default: break; |
|---|
| 50 | } |
|---|
| 51 | if (node->is_leaf()) { |
|---|
| 52 | stats->loadedSequences += node->hasSequence(); |
|---|
| 53 | } |
|---|
| 54 | else { |
|---|
| 55 | collectStats(node->get_leftson(), stats); |
|---|
| 56 | collectStats(node->get_rightson(), stats); |
|---|
| 57 | } |
|---|
| 58 | } |
|---|
| 59 | #endif // DEBUG |
|---|
| 60 | |
|---|
| 61 | GB_ERROR ClusterTreeRoot::find_clusters() { |
|---|
| 62 | ClusterTree *root = get_root_node(); |
|---|
| 63 | |
|---|
| 64 | root->init_tree(); |
|---|
| 65 | #if defined(DEBUG) |
|---|
| 66 | printf("----------------------------------------\n"); |
|---|
| 67 | printf("maxDistance: %f\n", maxDistance); |
|---|
| 68 | printf("minClusterSize: %u\n", minClusterSize); |
|---|
| 69 | printf("Leafs in tree: %u\n", root->get_leaf_count()); |
|---|
| 70 | printf("Possible clusters: %u\n", root->get_cluster_count()); |
|---|
| 71 | #endif // DEBUG |
|---|
| 72 | |
|---|
| 73 | arb_progress cluster_progress(long(root->get_cluster_count())); |
|---|
| 74 | cluster_progress.auto_subtitles("Cluster"); |
|---|
| 75 | |
|---|
| 76 | GB_ERROR error = NULp; |
|---|
| 77 | |
|---|
| 78 | try { |
|---|
| 79 | root->detect_clusters(cluster_progress); |
|---|
| 80 | } |
|---|
| 81 | catch (const char *msg) { error = msg; } |
|---|
| 82 | catch (...) { cl_assert(0); } |
|---|
| 83 | |
|---|
| 84 | if (error) { |
|---|
| 85 | // avoid further access after error |
|---|
| 86 | change_root(root, NULp); |
|---|
| 87 | UNCOVERED(); |
|---|
| 88 | destroy(root); |
|---|
| 89 | } |
|---|
| 90 | else { |
|---|
| 91 | #if defined(DEBUG) |
|---|
| 92 | ClusterStats stats; |
|---|
| 93 | collectStats(root, &stats); |
|---|
| 94 | |
|---|
| 95 | printf("Found clusters:::::::: %zu\n", stats.clusters); |
|---|
| 96 | printf("Found subclusters::::: %zu\n", stats.subclusters); |
|---|
| 97 | printf("Loaded sequences:::::: %zu (%5.2f%%)\n", |
|---|
| 98 | stats.loadedSequences, |
|---|
| 99 | (100.0*stats.loadedSequences)/root->get_leaf_count()); |
|---|
| 100 | |
|---|
| 101 | #if defined(TRACE_DIST_CALC) |
|---|
| 102 | size_t distances = root->get_calculated_distances(); |
|---|
| 103 | size_t leafs = root->get_leaf_count(); |
|---|
| 104 | size_t existingDistances = (leafs*leafs)/2-1; |
|---|
| 105 | |
|---|
| 106 | printf("Calculated distances:: %zu (%5.2f%%)\n", |
|---|
| 107 | distances, |
|---|
| 108 | (100.0*distances)/existingDistances); |
|---|
| 109 | #endif // TRACE_DIST_CALC |
|---|
| 110 | |
|---|
| 111 | #endif // DEBUG |
|---|
| 112 | } |
|---|
| 113 | |
|---|
| 114 | return error; |
|---|
| 115 | } |
|---|
| 116 | |
|---|
| 117 | // -------------------- |
|---|
| 118 | // ClusterTree |
|---|
| 119 | |
|---|
| 120 | void ClusterTree::init_tree() { |
|---|
| 121 | cl_assert(state == CS_UNKNOWN); |
|---|
| 122 | |
|---|
| 123 | #if defined(TRACE_DIST_CALC) |
|---|
| 124 | calculatedDistances = 0; |
|---|
| 125 | #endif // TRACE_DIST_CALC |
|---|
| 126 | |
|---|
| 127 | if (get_father()) { |
|---|
| 128 | depth = get_father()->get_depth()+1; |
|---|
| 129 | } |
|---|
| 130 | else { |
|---|
| 131 | depth = 1; |
|---|
| 132 | cl_assert(is_root_node()); |
|---|
| 133 | } |
|---|
| 134 | |
|---|
| 135 | if (is_leaf()) { |
|---|
| 136 | leaf_count = 1; |
|---|
| 137 | clus_count = 0; |
|---|
| 138 | state = CS_TOO_SMALL; |
|---|
| 139 | } |
|---|
| 140 | else { |
|---|
| 141 | ClusterTree *lson = get_leftson(); |
|---|
| 142 | ClusterTree *rson = get_rightson(); |
|---|
| 143 | |
|---|
| 144 | lson->init_tree(); |
|---|
| 145 | rson->init_tree(); |
|---|
| 146 | |
|---|
| 147 | leaf_count = lson->get_leaf_count() + rson->get_leaf_count(); |
|---|
| 148 | |
|---|
| 149 | ClusterTreeRoot *troot = get_tree_root(); |
|---|
| 150 | if (leaf_count<troot->get_minClusterSize()) { |
|---|
| 151 | state = CS_TOO_SMALL; |
|---|
| 152 | clus_count = 0; |
|---|
| 153 | } |
|---|
| 154 | else { |
|---|
| 155 | state = CS_MAYBE_CLUSTER; |
|---|
| 156 | clus_count = lson->get_cluster_count() + rson->get_cluster_count() + 1; |
|---|
| 157 | min_bases = numeric_limits<AP_FLOAT>::infinity(); |
|---|
| 158 | } |
|---|
| 159 | } |
|---|
| 160 | |
|---|
| 161 | cl_assert(state != CS_UNKNOWN); |
|---|
| 162 | } |
|---|
| 163 | |
|---|
| 164 | void ClusterTree::detect_clusters(arb_progress& progress) { |
|---|
| 165 | if (state == CS_MAYBE_CLUSTER) { |
|---|
| 166 | cl_assert(!is_leaf()); |
|---|
| 167 | |
|---|
| 168 | ClusterTree *lson = get_leftson(); |
|---|
| 169 | ClusterTree *rson = get_rightson(); |
|---|
| 170 | |
|---|
| 171 | lson->detect_clusters(progress); |
|---|
| 172 | rson->detect_clusters(progress); |
|---|
| 173 | |
|---|
| 174 | #if defined(TRACE_DIST_CALC) |
|---|
| 175 | calculatedDistances += get_leftson()->calculatedDistances; |
|---|
| 176 | calculatedDistances += get_rightson()->calculatedDistances; |
|---|
| 177 | #endif // TRACE_DIST_CALC |
|---|
| 178 | |
|---|
| 179 | if (lson->get_state() == CS_NO_CLUSTER || rson->get_state() == CS_NO_CLUSTER) { |
|---|
| 180 | // one son is no cluster -> can't be cluster myself |
|---|
| 181 | state = CS_NO_CLUSTER; |
|---|
| 182 | } |
|---|
| 183 | else { |
|---|
| 184 | state = CS_IS_CLUSTER; // assume this is a cluster |
|---|
| 185 | |
|---|
| 186 | // Get set of branches (sorted by branch distance) |
|---|
| 187 | |
|---|
| 188 | get_branch_dists(); // calculates branchDists |
|---|
| 189 | SortedPairValues pairsByBranchDistance; // biggest branch distances at end |
|---|
| 190 | { |
|---|
| 191 | LeafRelationCIter bd_end = branchDists->end(); |
|---|
| 192 | for (LeafRelationCIter bd = branchDists->begin(); bd != bd_end; ++bd) { |
|---|
| 193 | pairsByBranchDistance.insert(LeafRelation(bd->first, bd->second)); |
|---|
| 194 | } |
|---|
| 195 | } |
|---|
| 196 | |
|---|
| 197 | cl_assert(pairsByBranchDistance.size() == possible_relations()); |
|---|
| 198 | |
|---|
| 199 | // calculate real sequence distance |
|---|
| 200 | // * stop when distance > maxDistance is detected |
|---|
| 201 | // * calculate longest branchdistance first |
|---|
| 202 | // (Assumption is: big branch distance <=> big sequence distance) |
|---|
| 203 | |
|---|
| 204 | AP_FLOAT maxDistance = get_tree_root()->get_maxDistance(); |
|---|
| 205 | AP_FLOAT worstDistanceSeen = -1.0; |
|---|
| 206 | |
|---|
| 207 | cl_assert(!sequenceDists); |
|---|
| 208 | sequenceDists = new LeafRelations; |
|---|
| 209 | |
|---|
| 210 | SortedPairValues::const_reverse_iterator pair_end = pairsByBranchDistance.rend(); |
|---|
| 211 | for (SortedPairValues::const_reverse_iterator pair = pairsByBranchDistance.rbegin(); pair != pair_end; ++pair) { |
|---|
| 212 | AP_FLOAT realDist = get_seqDist(pair->get_pair()); |
|---|
| 213 | |
|---|
| 214 | if (realDist>worstDistanceSeen) { |
|---|
| 215 | worstDistanceSeen = realDist; |
|---|
| 216 | |
|---|
| 217 | delete worstKnownDistance; |
|---|
| 218 | worstKnownDistance = new TwoLeafs(pair->get_pair()); |
|---|
| 219 | |
|---|
| 220 | if (realDist>maxDistance) { // big distance found -> not a cluster |
|---|
| 221 | state = CS_NO_CLUSTER; |
|---|
| 222 | break; |
|---|
| 223 | } |
|---|
| 224 | } |
|---|
| 225 | } |
|---|
| 226 | |
|---|
| 227 | #if defined(TRACE_DIST_CALC) |
|---|
| 228 | calculatedDistances += sequenceDists->size(); |
|---|
| 229 | #endif // TRACE_DIST_CALC |
|---|
| 230 | |
|---|
| 231 | if (state == CS_IS_CLUSTER) { |
|---|
| 232 | #if defined(DEBUG) |
|---|
| 233 | // test whether all distances have been calculated |
|---|
| 234 | cl_assert(!is_leaf()); |
|---|
| 235 | size_t knownDistances = sequenceDists->size() + |
|---|
| 236 | get_leftson()->known_seqDists() + |
|---|
| 237 | get_rightson()->known_seqDists(); |
|---|
| 238 | |
|---|
| 239 | cl_assert(knownDistances == possible_relations()); |
|---|
| 240 | #endif // DEBUG |
|---|
| 241 | |
|---|
| 242 | get_leftson()->state = CS_SUB_CLUSTER; |
|---|
| 243 | get_rightson()->state = CS_SUB_CLUSTER; |
|---|
| 244 | } |
|---|
| 245 | |
|---|
| 246 | } |
|---|
| 247 | |
|---|
| 248 | cl_assert(state == CS_NO_CLUSTER || state == CS_IS_CLUSTER); // detection failure |
|---|
| 249 | |
|---|
| 250 | lson->oblivion(false); |
|---|
| 251 | rson->oblivion(false); |
|---|
| 252 | |
|---|
| 253 | progress.inc(); |
|---|
| 254 | if (progress.aborted()) throw "aborted on userrequest"; |
|---|
| 255 | } |
|---|
| 256 | |
|---|
| 257 | cl_assert(state != CS_MAYBE_CLUSTER); |
|---|
| 258 | } |
|---|
| 259 | |
|---|
| 260 | void ClusterTree::oblivion(bool forgetDistances) { |
|---|
| 261 | delete branchDepths; |
|---|
| 262 | branchDepths = NULp; |
|---|
| 263 | |
|---|
| 264 | delete branchDists; |
|---|
| 265 | branchDists = NULp; |
|---|
| 266 | |
|---|
| 267 | if (state == CS_NO_CLUSTER) { |
|---|
| 268 | if (forgetDistances) { |
|---|
| 269 | delete sequenceDists; |
|---|
| 270 | sequenceDists = NULp; |
|---|
| 271 | } |
|---|
| 272 | forgetDistances = true; |
|---|
| 273 | } |
|---|
| 274 | else { |
|---|
| 275 | cl_assert(state & (CS_IS_CLUSTER|CS_SUB_CLUSTER|CS_TOO_SMALL)); |
|---|
| 276 | } |
|---|
| 277 | |
|---|
| 278 | if (!is_leaf()) { |
|---|
| 279 | get_leftson()->oblivion(forgetDistances); |
|---|
| 280 | get_rightson()->oblivion(forgetDistances); |
|---|
| 281 | } |
|---|
| 282 | } |
|---|
| 283 | |
|---|
| 284 | void ClusterTree::calc_branch_depths() { |
|---|
| 285 | cl_assert(!branchDepths); |
|---|
| 286 | branchDepths = new NodeValues; |
|---|
| 287 | if (is_leaf()) { |
|---|
| 288 | (*branchDepths)[this] = 0.0; |
|---|
| 289 | } |
|---|
| 290 | else { |
|---|
| 291 | for (int s = 0; s<2; s++) { |
|---|
| 292 | const NodeValues *sonDepths = s ? get_leftson()->get_branch_depths() : get_rightson()->get_branch_depths(); |
|---|
| 293 | AP_FLOAT lenToSon = s ? leftlen : rightlen; |
|---|
| 294 | |
|---|
| 295 | NodeValues::const_iterator sd_end = sonDepths->end(); |
|---|
| 296 | for (NodeValues::const_iterator sd = sonDepths->begin(); sd != sd_end; ++sd) { |
|---|
| 297 | (*branchDepths)[sd->first] = sd->second+lenToSon; |
|---|
| 298 | } |
|---|
| 299 | } |
|---|
| 300 | } |
|---|
| 301 | } |
|---|
| 302 | |
|---|
| 303 | void ClusterTree::calc_branch_dists() { |
|---|
| 304 | cl_assert(!branchDists); |
|---|
| 305 | if (!is_leaf()) { |
|---|
| 306 | branchDists = new LeafRelations; |
|---|
| 307 | |
|---|
| 308 | for (int s = 0; s<2; s++) { |
|---|
| 309 | ClusterTree *son = s ? get_leftson() : get_rightson(); |
|---|
| 310 | const LeafRelations *sonDists = son->get_branch_dists(); |
|---|
| 311 | |
|---|
| 312 | cl_assert(sonDists || son->is_leaf()); |
|---|
| 313 | if (sonDists) branchDists->insert(sonDists->begin(), sonDists->end()); |
|---|
| 314 | } |
|---|
| 315 | |
|---|
| 316 | const NodeValues *ldepths = get_leftson()->get_branch_depths(); |
|---|
| 317 | const NodeValues *rdepths = get_rightson()->get_branch_depths(); |
|---|
| 318 | |
|---|
| 319 | NodeValues::const_iterator l_end = ldepths->end(); |
|---|
| 320 | NodeValues::const_iterator r_end = rdepths->end(); |
|---|
| 321 | |
|---|
| 322 | for (NodeValues::const_iterator ld = ldepths->begin(); ld != l_end; ++ld) { |
|---|
| 323 | AP_FLOAT llen = ld->second+leftlen; |
|---|
| 324 | for (NodeValues::const_iterator rd = rdepths->begin(); rd != r_end; ++rd) { |
|---|
| 325 | AP_FLOAT rlen = rd->second+rightlen; |
|---|
| 326 | |
|---|
| 327 | (*branchDists)[TwoLeafs(ld->first, rd->first)] = llen+rlen; |
|---|
| 328 | } |
|---|
| 329 | } |
|---|
| 330 | |
|---|
| 331 | cl_assert(branchDists->size() == possible_relations()); |
|---|
| 332 | } |
|---|
| 333 | } |
|---|
| 334 | |
|---|
| 335 | AP_FLOAT ClusterTree::get_seqDist(const TwoLeafs& pair) { |
|---|
| 336 | // searches or calculates the distance between the sequences of leafs in 'pair' |
|---|
| 337 | |
|---|
| 338 | const AP_FLOAT *found = has_seqDist(pair); |
|---|
| 339 | if (!found) { |
|---|
| 340 | const ClusterTree *commonFather = pair.first()->commonFatherWith(pair.second()); |
|---|
| 341 | |
|---|
| 342 | while (commonFather != this) { |
|---|
| 343 | found = commonFather->has_seqDist(pair); |
|---|
| 344 | if (found) break; |
|---|
| 345 | |
|---|
| 346 | commonFather = commonFather->get_father(); |
|---|
| 347 | cl_assert(commonFather); // first commonFather was not inside this! |
|---|
| 348 | } |
|---|
| 349 | } |
|---|
| 350 | |
|---|
| 351 | if (!found) { |
|---|
| 352 | // calculate the distance |
|---|
| 353 | |
|---|
| 354 | const AP_combinableSeq *seq1 = pair.first()->get_seq(); |
|---|
| 355 | const AP_combinableSeq *seq2 = pair.second()->get_seq(); |
|---|
| 356 | |
|---|
| 357 | seq1->lazy_load_sequence(); |
|---|
| 358 | seq2->lazy_load_sequence(); |
|---|
| 359 | |
|---|
| 360 | { |
|---|
| 361 | AP_combinableSeq *ancestor = seq1->dup(); |
|---|
| 362 | |
|---|
| 363 | Mutations mutations = ancestor->combine_seq(seq1, seq2); |
|---|
| 364 | AP_FLOAT wbc1 = seq1->weighted_base_count(); |
|---|
| 365 | AP_FLOAT wbc2 = seq2->weighted_base_count(); |
|---|
| 366 | AP_FLOAT minBaseCount = wbc1 < wbc2 ? wbc1 : wbc2; |
|---|
| 367 | AP_FLOAT dist; |
|---|
| 368 | |
|---|
| 369 | if (minBaseCount>0) { |
|---|
| 370 | dist = mutations / minBaseCount; |
|---|
| 371 | if (minBaseCount < min_bases) min_bases = minBaseCount; |
|---|
| 372 | } |
|---|
| 373 | else { // got at least one empty sequence -> no distance |
|---|
| 374 | dist = 0.0; |
|---|
| 375 | min_bases = minBaseCount; |
|---|
| 376 | } |
|---|
| 377 | |
|---|
| 378 | #if defined(DEBUG) && 0 |
|---|
| 379 | const char *name1 = pair.first()->name; |
|---|
| 380 | const char *name2 = pair.second()->name; |
|---|
| 381 | printf("%s <-?-> %s: mutations=%5.2f wbc1=%5.2f wbc2=%5.2f mbc=%5.2f dist=%5.2f\n", |
|---|
| 382 | name1, name2, mutations, wbc1, wbc2, minBaseCount, dist); |
|---|
| 383 | #endif // DEBUG |
|---|
| 384 | |
|---|
| 385 | cl_assert(!is_nan_or_inf(dist)); |
|---|
| 386 | |
|---|
| 387 | (*sequenceDists)[pair] = dist; |
|---|
| 388 | delete ancestor; |
|---|
| 389 | } |
|---|
| 390 | |
|---|
| 391 | found = has_seqDist(pair); |
|---|
| 392 | cl_assert(found); |
|---|
| 393 | } |
|---|
| 394 | |
|---|
| 395 | cl_assert(found); |
|---|
| 396 | return *found; |
|---|
| 397 | } |
|---|
| 398 | |
|---|
| 399 | const AP_FLOAT *ClusterTree::has_seqDist(const TwoLeafs& pair) const { |
|---|
| 400 | if (sequenceDists) { |
|---|
| 401 | LeafRelationCIter found = sequenceDists->find(pair); |
|---|
| 402 | if (found != sequenceDists->end()) { |
|---|
| 403 | return &(found->second); |
|---|
| 404 | } |
|---|
| 405 | } |
|---|
| 406 | return NULp; |
|---|
| 407 | } |
|---|
| 408 | |
|---|
| 409 | const ClusterTree *ClusterTree::commonFatherWith(const ClusterTree *other) const { |
|---|
| 410 | int depth_diff = get_depth() - other->get_depth(); |
|---|
| 411 | if (depth_diff) { |
|---|
| 412 | if (depth_diff<0) { // this is less deep than other |
|---|
| 413 | return other->get_father()->commonFatherWith(this); |
|---|
| 414 | } |
|---|
| 415 | else { // other is less deep than this |
|---|
| 416 | return get_father()->commonFatherWith(other); |
|---|
| 417 | } |
|---|
| 418 | } |
|---|
| 419 | else { // both at same depth |
|---|
| 420 | if (this == other) { // common father reached ? |
|---|
| 421 | return this; |
|---|
| 422 | } |
|---|
| 423 | else { // otherwise check fathers |
|---|
| 424 | return get_father()->commonFatherWith(other->get_father()); |
|---|
| 425 | } |
|---|
| 426 | } |
|---|
| 427 | } |
|---|
| 428 | |
|---|