| 1 | // =============================================================== // |
|---|
| 2 | // // |
|---|
| 3 | // File : di_clustertree.hxx // |
|---|
| 4 | // Purpose : Tree structure used for cluster detection // |
|---|
| 5 | // // |
|---|
| 6 | // Coded by Ralf Westram (coder@reallysoft.de) in October 2009 // |
|---|
| 7 | // Institute of Microbiology (Technical University Munich) // |
|---|
| 8 | // http://www.arb-home.de/ // |
|---|
| 9 | // // |
|---|
| 10 | // =============================================================== // |
|---|
| 11 | |
|---|
| 12 | #ifndef DI_CLUSTERTREE_HXX |
|---|
| 13 | #define DI_CLUSTERTREE_HXX |
|---|
| 14 | |
|---|
| 15 | #ifndef ARB_TREE_HXX |
|---|
| 16 | #include <ARB_Tree.hxx> |
|---|
| 17 | #endif |
|---|
| 18 | #ifndef AP_SEQUENCE_HXX |
|---|
| 19 | #include <AP_sequence.hxx> |
|---|
| 20 | #endif |
|---|
| 21 | #ifndef _GLIBCXX_MAP |
|---|
| 22 | #include <map> |
|---|
| 23 | #endif |
|---|
| 24 | |
|---|
| 25 | |
|---|
| 26 | #define cl_assert(cond) arb_assert(cond) |
|---|
| 27 | |
|---|
| 28 | class ClusterTree; |
|---|
| 29 | class arb_progress; |
|---|
| 30 | |
|---|
| 31 | enum ClusterState { |
|---|
| 32 | CS_UNKNOWN = 0, // initial state |
|---|
| 33 | CS_TOO_SMALL = 1, // cluster is too small |
|---|
| 34 | CS_MAYBE_CLUSTER = 2, // need to test whether this is a cluster |
|---|
| 35 | CS_NO_CLUSTER = 4, // not a cluster (only worstKnownDistance is known) |
|---|
| 36 | CS_IS_CLUSTER = 8, // subtree is cluster (all sequence distances are known) |
|---|
| 37 | CS_SUB_CLUSTER = 16, // like CS_IS_CLUSTER, but father is cluster as well |
|---|
| 38 | }; |
|---|
| 39 | |
|---|
| 40 | // ------------------------ |
|---|
| 41 | // ClusterTreeRoot |
|---|
| 42 | |
|---|
| 43 | class ClusterTreeRoot FINAL_TYPE : public ARB_seqtree_root { |
|---|
| 44 | AP_FLOAT maxDistance; // max. allowed distance inside cluster |
|---|
| 45 | unsigned minClusterSize; // min. size of cluster (number of leafs) |
|---|
| 46 | |
|---|
| 47 | public: |
|---|
| 48 | ClusterTreeRoot(AliView *aliview, AP_sequence *seqTemplate_, AP_FLOAT maxDistance_, size_t minClusterSize_); |
|---|
| 49 | ~ClusterTreeRoot() OVERRIDE { predelete(); } |
|---|
| 50 | |
|---|
| 51 | inline TreeNode *makeNode() const OVERRIDE; |
|---|
| 52 | inline void destroyNode(TreeNode *node) const OVERRIDE; |
|---|
| 53 | |
|---|
| 54 | DEFINE_DOWNCAST_ACCESSORS(ClusterTree, get_root_node, ARB_seqtree_root::get_root_node()); |
|---|
| 55 | |
|---|
| 56 | GB_ERROR find_clusters(); |
|---|
| 57 | unsigned get_minClusterSize() const { return minClusterSize; } |
|---|
| 58 | AP_FLOAT get_maxDistance() const { return maxDistance; } |
|---|
| 59 | }; |
|---|
| 60 | |
|---|
| 61 | // ------------------ |
|---|
| 62 | // LeafPairs |
|---|
| 63 | |
|---|
| 64 | class TwoLeafs { |
|---|
| 65 | ClusterTree *ct1, *ct2; // ct1<ct2! |
|---|
| 66 | |
|---|
| 67 | public: |
|---|
| 68 | TwoLeafs(ClusterTree *c1, ClusterTree *c2) : |
|---|
| 69 | ct1(c1<c2 ? c1 : c2), |
|---|
| 70 | ct2(c1<c2 ? c2 : c1) |
|---|
| 71 | {} |
|---|
| 72 | |
|---|
| 73 | const ClusterTree *first() const { return ct1; } |
|---|
| 74 | const ClusterTree *second() const { return ct2; } |
|---|
| 75 | |
|---|
| 76 | bool operator<(const TwoLeafs& other) const { |
|---|
| 77 | return ct1 == other.ct1 ? ct2<other.ct2 : ct1<other.ct1; |
|---|
| 78 | } |
|---|
| 79 | }; |
|---|
| 80 | |
|---|
| 81 | class LeafRelation { |
|---|
| 82 | const TwoLeafs *pair; |
|---|
| 83 | const AP_FLOAT value; |
|---|
| 84 | public: |
|---|
| 85 | LeafRelation(const TwoLeafs& pair_, AP_FLOAT value_) : |
|---|
| 86 | pair(&pair_), |
|---|
| 87 | value(value_) |
|---|
| 88 | {} |
|---|
| 89 | |
|---|
| 90 | bool operator<(const LeafRelation& other) const { |
|---|
| 91 | if (value == other.value) return *pair < *other.pair; |
|---|
| 92 | return value < other.value; |
|---|
| 93 | } |
|---|
| 94 | |
|---|
| 95 | const TwoLeafs& get_pair() const { return *pair; } |
|---|
| 96 | }; |
|---|
| 97 | |
|---|
| 98 | typedef std::map<ClusterTree*, AP_FLOAT> NodeValues; |
|---|
| 99 | typedef std::map<TwoLeafs, AP_FLOAT> LeafRelations; |
|---|
| 100 | typedef LeafRelations::const_iterator LeafRelationCIter; |
|---|
| 101 | |
|---|
| 102 | // -------------------- |
|---|
| 103 | // ClusterTree |
|---|
| 104 | |
|---|
| 105 | |
|---|
| 106 | #if defined(DEBUG) |
|---|
| 107 | #define TRACE_DIST_CALC |
|---|
| 108 | #endif // DEBUG |
|---|
| 109 | |
|---|
| 110 | class ClusterTree FINAL_TYPE : public ARB_countedTree { // derived from a Noncopyable |
|---|
| 111 | ClusterState state; |
|---|
| 112 | |
|---|
| 113 | unsigned leaf_count; // number of leafs in subtree |
|---|
| 114 | unsigned clus_count; // number of clusters at and in subtree |
|---|
| 115 | unsigned depth; // depth of node ( 1 == root ) |
|---|
| 116 | AP_FLOAT min_bases; // min. bases used for comparing two members |
|---|
| 117 | |
|---|
| 118 | NodeValues *branchDepths; // leaf-depths (distance from this each leaf) |
|---|
| 119 | LeafRelations *branchDists; // distance (branch) between two leafs |
|---|
| 120 | LeafRelations *sequenceDists; // real distance between sequences of two leafs |
|---|
| 121 | |
|---|
| 122 | TwoLeafs *worstKnownDistance; |
|---|
| 123 | |
|---|
| 124 | void calc_branch_depths(); |
|---|
| 125 | void calc_branch_dists(); |
|---|
| 126 | |
|---|
| 127 | #if defined(TRACE_DIST_CALC) |
|---|
| 128 | unsigned calculatedDistances; |
|---|
| 129 | #endif // TRACE_DIST_CALC |
|---|
| 130 | |
|---|
| 131 | unsigned get_depth() const { return depth; } |
|---|
| 132 | bool knows_seqDists() const { return state & (CS_IS_CLUSTER|CS_SUB_CLUSTER); } |
|---|
| 133 | unsigned possible_relations() const { return (leaf_count*(leaf_count-1)) / 2; } |
|---|
| 134 | unsigned known_seqDists() const { return knows_seqDists() ? possible_relations() : 0; } |
|---|
| 135 | |
|---|
| 136 | const NodeValues *get_branch_depths() { |
|---|
| 137 | if (!branchDepths) calc_branch_depths(); |
|---|
| 138 | return branchDepths; |
|---|
| 139 | } |
|---|
| 140 | |
|---|
| 141 | const LeafRelations *get_branch_dists() { |
|---|
| 142 | if (!branchDists) calc_branch_dists(); |
|---|
| 143 | return branchDists; |
|---|
| 144 | } |
|---|
| 145 | |
|---|
| 146 | AP_FLOAT get_seqDist(const TwoLeafs& pair); |
|---|
| 147 | const AP_FLOAT *has_seqDist(const TwoLeafs& pair) const; |
|---|
| 148 | const ClusterTree *commonFatherWith(const ClusterTree *other) const; |
|---|
| 149 | |
|---|
| 150 | void oblivion(bool forgetDistances); // forget unneeded data |
|---|
| 151 | |
|---|
| 152 | protected: |
|---|
| 153 | ~ClusterTree() OVERRIDE { |
|---|
| 154 | delete worstKnownDistance; |
|---|
| 155 | delete sequenceDists; |
|---|
| 156 | delete branchDists; |
|---|
| 157 | delete branchDepths; |
|---|
| 158 | } |
|---|
| 159 | friend class ClusterTreeRoot; |
|---|
| 160 | public: |
|---|
| 161 | explicit ClusterTree(ClusterTreeRoot *tree_root_) : |
|---|
| 162 | ARB_countedTree(tree_root_), |
|---|
| 163 | state(CS_UNKNOWN), |
|---|
| 164 | leaf_count(0), |
|---|
| 165 | clus_count(0), |
|---|
| 166 | depth(0), |
|---|
| 167 | min_bases(-1.0), |
|---|
| 168 | branchDepths(NULp), |
|---|
| 169 | branchDists(NULp), |
|---|
| 170 | sequenceDists(NULp), |
|---|
| 171 | worstKnownDistance(NULp) |
|---|
| 172 | {} |
|---|
| 173 | |
|---|
| 174 | DEFINE_TREE_ACCESSORS(ClusterTreeRoot, ClusterTree); |
|---|
| 175 | OVERRIDE_SEQ_ACCESSORS(AP_combinableSeq,ARB_countedTree); |
|---|
| 176 | |
|---|
| 177 | unsigned get_cluster_count() const { return clus_count; } |
|---|
| 178 | unsigned get_leaf_count() const OVERRIDE { return leaf_count; } |
|---|
| 179 | |
|---|
| 180 | #if defined(TRACE_DIST_CALC) |
|---|
| 181 | unsigned get_calculated_distances() const { return calculatedDistances; } |
|---|
| 182 | #endif // TRACE_DIST_CALC |
|---|
| 183 | |
|---|
| 184 | ClusterState get_state() const { return state; } |
|---|
| 185 | |
|---|
| 186 | void init_tree() OVERRIDE; |
|---|
| 187 | void detect_clusters(arb_progress& progress); |
|---|
| 188 | |
|---|
| 189 | const LeafRelations *get_sequence_dists() const { return sequenceDists; } |
|---|
| 190 | |
|---|
| 191 | AP_FLOAT get_min_bases() const { return min_bases; } |
|---|
| 192 | }; |
|---|
| 193 | |
|---|
| 194 | inline TreeNode *ClusterTreeRoot::makeNode() const { return new ClusterTree(const_cast<ClusterTreeRoot*>(this)); } |
|---|
| 195 | inline void ClusterTreeRoot::destroyNode(TreeNode *node) const { delete DOWNCAST(ClusterTree*, node); } |
|---|
| 196 | |
|---|
| 197 | class UseAnyTree : public ARB_tree_predicate { |
|---|
| 198 | bool selects(const ARB_seqtree&) const OVERRIDE { return true; } |
|---|
| 199 | }; |
|---|
| 200 | |
|---|
| 201 | #else |
|---|
| 202 | #error di_clustertree.hxx included twice |
|---|
| 203 | #endif // DI_CLUSTERTREE_HXX |
|---|