1 | // =============================================================== // |
---|
2 | // // |
---|
3 | // File : di_clustertree.hxx // |
---|
4 | // Purpose : Tree structure used for cluster detection // |
---|
5 | // // |
---|
6 | // Coded by Ralf Westram (coder@reallysoft.de) in October 2009 // |
---|
7 | // Institute of Microbiology (Technical University Munich) // |
---|
8 | // http://www.arb-home.de/ // |
---|
9 | // // |
---|
10 | // =============================================================== // |
---|
11 | |
---|
12 | #ifndef DI_CLUSTERTREE_HXX |
---|
13 | #define DI_CLUSTERTREE_HXX |
---|
14 | |
---|
15 | #ifndef ARB_TREE_HXX |
---|
16 | #include <ARB_Tree.hxx> |
---|
17 | #endif |
---|
18 | #ifndef AP_SEQUENCE_HXX |
---|
19 | #include <AP_sequence.hxx> |
---|
20 | #endif |
---|
21 | #ifndef _GLIBCXX_MAP |
---|
22 | #include <map> |
---|
23 | #endif |
---|
24 | |
---|
25 | |
---|
26 | #define cl_assert(cond) arb_assert(cond) |
---|
27 | |
---|
28 | class ClusterTree; |
---|
29 | class arb_progress; |
---|
30 | |
---|
31 | enum ClusterState { |
---|
32 | CS_UNKNOWN = 0, // initial state |
---|
33 | CS_TOO_SMALL = 1, // cluster is too small |
---|
34 | CS_MAYBE_CLUSTER = 2, // need to test whether this is a cluster |
---|
35 | CS_NO_CLUSTER = 4, // not a cluster (only worstKnownDistance is known) |
---|
36 | CS_IS_CLUSTER = 8, // subtree is cluster (all sequence distances are known) |
---|
37 | CS_SUB_CLUSTER = 16, // like CS_IS_CLUSTER, but father is cluster as well |
---|
38 | }; |
---|
39 | |
---|
40 | // ------------------------ |
---|
41 | // ClusterTreeRoot |
---|
42 | |
---|
43 | class ClusterTreeRoot : public ARB_seqtree_root { |
---|
44 | AP_FLOAT maxDistance; // max. allowed distance inside cluster |
---|
45 | unsigned minClusterSize; // min. size of cluster (number of leafs) |
---|
46 | |
---|
47 | public: |
---|
48 | ClusterTreeRoot(AliView *aliview, AP_sequence *seqTemplate_, AP_FLOAT maxDistance_, size_t minClusterSize_); |
---|
49 | virtual ~ClusterTreeRoot() OVERRIDE {} |
---|
50 | |
---|
51 | DEFINE_DOWNCAST_ACCESSORS(ClusterTree, get_root_node, ARB_seqtree_root::get_root_node()); |
---|
52 | |
---|
53 | GB_ERROR find_clusters(); |
---|
54 | unsigned get_minClusterSize() const { return minClusterSize; } |
---|
55 | AP_FLOAT get_maxDistance() const { return maxDistance; } |
---|
56 | }; |
---|
57 | |
---|
58 | // ------------------ |
---|
59 | // LeafPairs |
---|
60 | |
---|
61 | class TwoLeafs { |
---|
62 | ClusterTree *ct1, *ct2; // ct1<ct2! |
---|
63 | |
---|
64 | public: |
---|
65 | TwoLeafs(ClusterTree *c1, ClusterTree *c2) |
---|
66 | : ct1(c1<c2 ? c1 : c2) |
---|
67 | , ct2(c1<c2 ? c2 : c1) |
---|
68 | {} |
---|
69 | |
---|
70 | const ClusterTree *first() const { return ct1; } |
---|
71 | const ClusterTree *second() const { return ct2; } |
---|
72 | |
---|
73 | bool operator<(const TwoLeafs& other) const { |
---|
74 | return ct1 == other.ct1 ? ct2<other.ct2 : ct1<other.ct1; |
---|
75 | } |
---|
76 | }; |
---|
77 | |
---|
78 | class LeafRelation { |
---|
79 | const TwoLeafs *pair; |
---|
80 | const AP_FLOAT value; |
---|
81 | public: |
---|
82 | LeafRelation(const TwoLeafs& pair_, AP_FLOAT value_) |
---|
83 | : pair(&pair_) |
---|
84 | , value(value_) |
---|
85 | {} |
---|
86 | |
---|
87 | bool operator<(const LeafRelation& other) const { |
---|
88 | if (value == other.value) return *pair < *other.pair; |
---|
89 | return value < other.value; |
---|
90 | } |
---|
91 | |
---|
92 | const TwoLeafs& get_pair() const { return *pair; } |
---|
93 | }; |
---|
94 | |
---|
95 | typedef std::map<ClusterTree*, AP_FLOAT> NodeValues; |
---|
96 | typedef std::map<TwoLeafs, AP_FLOAT> LeafRelations; |
---|
97 | typedef LeafRelations::const_iterator LeafRelationCIter; |
---|
98 | |
---|
99 | // -------------------- |
---|
100 | // ClusterTree |
---|
101 | |
---|
102 | |
---|
103 | #if defined(DEBUG) |
---|
104 | #define TRACE_DIST_CALC |
---|
105 | #endif // DEBUG |
---|
106 | |
---|
107 | class ClusterTree : public ARB_countedTree { // derived from a Noncopyable |
---|
108 | ClusterState state; |
---|
109 | |
---|
110 | unsigned leaf_count; // number of leafs in subtree |
---|
111 | unsigned clus_count; // number of clusters at and in subtree |
---|
112 | unsigned depth; // depth of node ( 1 == root ) |
---|
113 | AP_FLOAT min_bases; // min. bases used for comparing two members |
---|
114 | |
---|
115 | NodeValues *branchDepths; // leaf-depths (distance from this each leaf) |
---|
116 | LeafRelations *branchDists; // distance (branch) between two leafs |
---|
117 | LeafRelations *sequenceDists; // real distance between sequences of two leafs |
---|
118 | |
---|
119 | TwoLeafs *worstKnownDistance; |
---|
120 | |
---|
121 | void calc_branch_depths(); |
---|
122 | void calc_branch_dists(); |
---|
123 | |
---|
124 | #if defined(TRACE_DIST_CALC) |
---|
125 | unsigned calculatedDistances; |
---|
126 | #endif // TRACE_DIST_CALC |
---|
127 | |
---|
128 | unsigned get_depth() const { return depth; } |
---|
129 | bool knows_seqDists() const { return state & (CS_IS_CLUSTER|CS_SUB_CLUSTER); } |
---|
130 | unsigned possible_relations() const { return (leaf_count*(leaf_count-1)) / 2; } |
---|
131 | unsigned known_seqDists() const { return knows_seqDists() ? possible_relations() : 0; } |
---|
132 | |
---|
133 | const NodeValues *get_branch_depths() { |
---|
134 | if (!branchDepths) calc_branch_depths(); |
---|
135 | return branchDepths; |
---|
136 | } |
---|
137 | |
---|
138 | const LeafRelations *get_branch_dists() { |
---|
139 | if (!branchDists) calc_branch_dists(); |
---|
140 | return branchDists; |
---|
141 | } |
---|
142 | |
---|
143 | AP_FLOAT get_seqDist(const TwoLeafs& pair); |
---|
144 | const AP_FLOAT *has_seqDist(const TwoLeafs& pair) const; |
---|
145 | const ClusterTree *commonFatherWith(const ClusterTree *other) const; |
---|
146 | |
---|
147 | void oblivion(bool forgetDistances); // forget unneeded data |
---|
148 | |
---|
149 | public: |
---|
150 | explicit ClusterTree(ClusterTreeRoot *tree_root_) |
---|
151 | : ARB_countedTree(tree_root_) |
---|
152 | , state(CS_UNKNOWN) |
---|
153 | , leaf_count(0) |
---|
154 | , clus_count(0) |
---|
155 | , depth(0) |
---|
156 | , min_bases(-1.0) |
---|
157 | , branchDepths(NULL) |
---|
158 | , branchDists(NULL) |
---|
159 | , sequenceDists(NULL) |
---|
160 | , worstKnownDistance(NULL) |
---|
161 | {} |
---|
162 | |
---|
163 | ~ClusterTree() OVERRIDE { |
---|
164 | delete worstKnownDistance; |
---|
165 | delete sequenceDists; |
---|
166 | delete branchDists; |
---|
167 | delete branchDepths; |
---|
168 | } |
---|
169 | DEFINE_TREE_ACCESSORS(ClusterTreeRoot, ClusterTree); |
---|
170 | |
---|
171 | unsigned get_cluster_count() const { return clus_count; } |
---|
172 | unsigned get_leaf_count() const OVERRIDE { return leaf_count; } |
---|
173 | |
---|
174 | #if defined(TRACE_DIST_CALC) |
---|
175 | unsigned get_calculated_distances() const { return calculatedDistances; } |
---|
176 | #endif // TRACE_DIST_CALC |
---|
177 | |
---|
178 | ClusterState get_state() const { return state; } |
---|
179 | |
---|
180 | void init_tree() OVERRIDE; |
---|
181 | void detect_clusters(arb_progress& progress); |
---|
182 | |
---|
183 | const LeafRelations *get_sequence_dists() const { return sequenceDists; } |
---|
184 | |
---|
185 | AP_FLOAT get_min_bases() const { return min_bases; } |
---|
186 | }; |
---|
187 | |
---|
188 | class ClusterTreeNodeFactory : public RootedTreeNodeFactory { |
---|
189 | RootedTree *makeNode(TreeRoot *root) const OVERRIDE { |
---|
190 | return new ClusterTree(DOWNCAST(ClusterTreeRoot*, root)); |
---|
191 | } |
---|
192 | }; |
---|
193 | |
---|
194 | class UseAnyTree : public ARB_tree_predicate { |
---|
195 | bool selects(const ARB_seqtree&) const OVERRIDE { return true; } |
---|
196 | }; |
---|
197 | |
---|
198 | #else |
---|
199 | #error di_clustertree.hxx included twice |
---|
200 | #endif // DI_CLUSTERTREE_HXX |
---|