1 | // ============================================================= // |
---|
2 | // // |
---|
3 | // File : CT_part.cxx // |
---|
4 | // Purpose : // |
---|
5 | // // |
---|
6 | // Institute of Microbiology (Technical University Munich) // |
---|
7 | // http://www.arb-home.de/ // |
---|
8 | // // |
---|
9 | // ============================================================= // |
---|
10 | |
---|
11 | /* This module is designed to organize the data structure partitions |
---|
12 | partitions represent the edges of a tree */ |
---|
13 | // the partitions are implemented as an array of longs |
---|
14 | // Each leaf in a GBT-Tree is represented as one Bit in the Partition |
---|
15 | |
---|
16 | #include "CT_part.hxx" |
---|
17 | #include <arbdbt.h> |
---|
18 | #include <arb_strarray.h> |
---|
19 | |
---|
20 | #define BITS_PER_PELEM (sizeof(PELEM)*8) |
---|
21 | |
---|
22 | #if defined(DUMP_PART_INIT) || defined(UNIT_TESTS) |
---|
23 | static const char *readable_cutmask(PELEM mask) { |
---|
24 | static char readable[BITS_PER_PELEM+1]; |
---|
25 | memset(readable, '0', BITS_PER_PELEM); |
---|
26 | readable[BITS_PER_PELEM] = 0; |
---|
27 | |
---|
28 | for (int b = BITS_PER_PELEM-1; b >= 0; --b) { |
---|
29 | if (mask&1) readable[b] = '1'; |
---|
30 | mask = mask>>1; |
---|
31 | } |
---|
32 | return readable; |
---|
33 | } |
---|
34 | #endif |
---|
35 | |
---|
36 | PartitionSize::PartitionSize(const int len) |
---|
37 | : cutmask(0), |
---|
38 | longs((((len + 7) / 8)+sizeof(PELEM)-1) / sizeof(PELEM)), |
---|
39 | bits(len), |
---|
40 | id(0) |
---|
41 | { |
---|
42 | /*! Function to initialize the global variables above |
---|
43 | * @param len number of bits the part should content |
---|
44 | * |
---|
45 | * result: calculate cutmask, longs, plen |
---|
46 | */ |
---|
47 | |
---|
48 | int j = len % BITS_PER_PELEM; |
---|
49 | if (!j) j += BITS_PER_PELEM; |
---|
50 | |
---|
51 | for (int i=0; i<j; i++) { |
---|
52 | cutmask <<= 1; |
---|
53 | cutmask |= 1; |
---|
54 | } |
---|
55 | |
---|
56 | #if defined(DEBUG) |
---|
57 | size_t possible = longs*BITS_PER_PELEM; |
---|
58 | arb_assert((possible-bits)<BITS_PER_PELEM); // longs is too big (wasted space) |
---|
59 | |
---|
60 | #if defined(DUMP_PART_INIT) |
---|
61 | printf("leafs=%i\n", len); |
---|
62 | printf("cutmask='%s'\n", readable_cutmask(cutmask)); |
---|
63 | printf("longs=%i (can hold %zu bits)\n", longs, possible); |
---|
64 | printf("bits=%i\n", bits); |
---|
65 | #endif |
---|
66 | #endif |
---|
67 | } |
---|
68 | |
---|
69 | #if defined(NTREE_DEBUG_FUNCTIONS) |
---|
70 | |
---|
71 | static const CharPtrArray *namesPtr = NULp; |
---|
72 | |
---|
73 | void PART::start_pretty_printing(const CharPtrArray& names) { namesPtr = &names; } |
---|
74 | void PART::stop_pretty_printing() { namesPtr = NULp; } |
---|
75 | |
---|
76 | void PART::print() const { |
---|
77 | // ! Testfunction to print a part |
---|
78 | int k = 0; |
---|
79 | const int longs = get_longs(); |
---|
80 | const int plen = info->get_bits(); |
---|
81 | |
---|
82 | if (namesPtr) { |
---|
83 | const CharPtrArray& names = *namesPtr; |
---|
84 | for (int part = 0; part<=1; ++part) { |
---|
85 | // bool first = true; |
---|
86 | for (int i=0; i<longs; i++) { |
---|
87 | PELEM el = 1; |
---|
88 | for (int j=0; k<plen && size_t(j)<sizeof(PELEM)*8; j++, k++) { |
---|
89 | bool bitset = p[i] & el; |
---|
90 | if (bitset == part) { |
---|
91 | const char *name = names[k]; |
---|
92 | #if 1 |
---|
93 | fputc(name[0], stdout); // first char of name |
---|
94 | #else |
---|
95 | if (!first) fputc(',', stdout); |
---|
96 | else first = false; |
---|
97 | fputs(name, stdout); // full name |
---|
98 | #endif |
---|
99 | } |
---|
100 | el <<= 1; |
---|
101 | } |
---|
102 | } |
---|
103 | if (!part) { |
---|
104 | fputs("---", stdout); |
---|
105 | k = 0; |
---|
106 | } |
---|
107 | } |
---|
108 | } |
---|
109 | else { |
---|
110 | for (int i=0; i<longs; i++) { |
---|
111 | PELEM el = 1; |
---|
112 | for (int j=0; k<plen && size_t(j)<sizeof(PELEM)*8; j++, k++) { |
---|
113 | bool bitset = p[i] & el; |
---|
114 | fputc('0'+bitset, stdout); |
---|
115 | el <<= 1; |
---|
116 | } |
---|
117 | } |
---|
118 | } |
---|
119 | |
---|
120 | printf(" len=%.5f prob=%5.1f%% w.len=%.5f leaf=%i dist2center=%i\n", |
---|
121 | len, weight*100.0, get_len(), is_leaf_edge(), distance_to_tree_center()); |
---|
122 | } |
---|
123 | #endif |
---|
124 | |
---|
125 | PART *PartitionSize::create_root() const { |
---|
126 | /*! build a partition that totally consists of 111111...1111 that is needed to |
---|
127 | * build the root of a specific ntree |
---|
128 | */ |
---|
129 | |
---|
130 | PART *p = new PART(this, 1.0); |
---|
131 | p->invert(); |
---|
132 | arb_assert(p->is_valid()); |
---|
133 | return p; |
---|
134 | } |
---|
135 | |
---|
136 | bool PART::overlaps_with(const PART *other) const { |
---|
137 | /*! test if two parts overlap (i.e. share common bits) |
---|
138 | */ |
---|
139 | |
---|
140 | arb_assert(is_valid()); |
---|
141 | arb_assert(other->is_valid()); |
---|
142 | |
---|
143 | int longs = get_longs(); |
---|
144 | for (int i=0; i<longs; i++) { |
---|
145 | if (p[i] & other->p[i]) return true; |
---|
146 | } |
---|
147 | return false; |
---|
148 | } |
---|
149 | |
---|
150 | void PART::invert() { |
---|
151 | //! invert a part |
---|
152 | // |
---|
153 | // Each edge in a tree connects two subtrees. |
---|
154 | // These subtrees are represented by inverse partitions |
---|
155 | |
---|
156 | arb_assert(is_valid()); |
---|
157 | |
---|
158 | int longs = get_longs(); |
---|
159 | for (int i=0; i<longs; i++) { // LOOP_VECTORIZED |
---|
160 | p[i] = ~p[i]; |
---|
161 | } |
---|
162 | p[longs-1] &= get_cutmask(); |
---|
163 | |
---|
164 | members = get_maxsize()-members; |
---|
165 | |
---|
166 | arb_assert(is_valid()); |
---|
167 | } |
---|
168 | |
---|
169 | void PART::invertInSuperset(const PART *superset) { |
---|
170 | arb_assert(is_valid()); |
---|
171 | arb_assert(is_subset_of(superset)); |
---|
172 | |
---|
173 | int longs = get_longs(); |
---|
174 | for (int i=0; i<longs; i++) { // LOOP_VECTORIZED |
---|
175 | p[i] = p[i] ^ superset->p[i]; |
---|
176 | } |
---|
177 | p[longs-1] &= get_cutmask(); |
---|
178 | |
---|
179 | members = superset->get_members()-members; |
---|
180 | |
---|
181 | arb_assert(is_valid()); |
---|
182 | arb_assert(is_subset_of(superset)); |
---|
183 | } |
---|
184 | |
---|
185 | |
---|
186 | void PART::add_members_from(const PART *source) { |
---|
187 | //! destination = source or destination |
---|
188 | arb_assert(source->is_valid()); |
---|
189 | arb_assert(is_valid()); |
---|
190 | |
---|
191 | bool distinct = disjunct_from(source); |
---|
192 | |
---|
193 | int longs = get_longs(); |
---|
194 | for (int i=0; i<longs; i++) { // LOOP_VECTORIZED |
---|
195 | p[i] |= source->p[i]; |
---|
196 | } |
---|
197 | |
---|
198 | if (distinct) { |
---|
199 | members += source->members; |
---|
200 | } |
---|
201 | else { |
---|
202 | members = count_members(); |
---|
203 | } |
---|
204 | |
---|
205 | arb_assert(is_valid()); |
---|
206 | } |
---|
207 | |
---|
208 | |
---|
209 | bool PART::equals(const PART *other) const { |
---|
210 | /*! return true if p1 and p2 are equal |
---|
211 | */ |
---|
212 | arb_assert(is_valid()); |
---|
213 | arb_assert(other->is_valid()); |
---|
214 | |
---|
215 | int longs = get_longs(); |
---|
216 | for (int i=0; i<longs; i++) { |
---|
217 | if (p[i] != other->p[i]) return false; |
---|
218 | } |
---|
219 | return true; |
---|
220 | } |
---|
221 | |
---|
222 | |
---|
223 | unsigned PART::key() const { |
---|
224 | //! calculate a hashkey from part |
---|
225 | arb_assert(is_valid()); |
---|
226 | |
---|
227 | PELEM ph = 0; |
---|
228 | int longs = get_longs(); |
---|
229 | for (int i=0; i<longs; i++) { // LOOP_VECTORIZED |
---|
230 | ph ^= p[i]; |
---|
231 | } |
---|
232 | |
---|
233 | return ph; |
---|
234 | } |
---|
235 | |
---|
236 | int PART::count_members() const { |
---|
237 | //! count the number of leafs in partition |
---|
238 | int leafs = 0; |
---|
239 | int longs = get_longs(); |
---|
240 | for (int i = 0; i<longs; ++i) { |
---|
241 | PELEM e = p[i]; |
---|
242 | |
---|
243 | if (i == (longs-1)) e = e&get_cutmask(); |
---|
244 | |
---|
245 | for (size_t b = 0; b<(sizeof(e)*8); ++b) { |
---|
246 | if (e&1) leafs++; |
---|
247 | e = e>>1; |
---|
248 | } |
---|
249 | } |
---|
250 | return leafs; |
---|
251 | } |
---|
252 | |
---|
253 | bool PART::is_standardized() const { // @@@ inline |
---|
254 | /*! true if PART is in standard representation. |
---|
255 | * @see standardize() |
---|
256 | */ |
---|
257 | |
---|
258 | // may be any criteria which differs between PART and its inverse |
---|
259 | // if you change the criteria, generated trees will change |
---|
260 | // (because branch-insertion-order is affected) |
---|
261 | |
---|
262 | return bit_is_set(0); |
---|
263 | } |
---|
264 | |
---|
265 | void PART::standardize() { |
---|
266 | /*! standardize the partition |
---|
267 | * |
---|
268 | * Generally two PARTs are equivalent, if one is the inverted version of the other. |
---|
269 | * A standardized PART is equal for equivalent PARTs, i.e. may be used as key (as done in PartRegistry) |
---|
270 | */ |
---|
271 | arb_assert(is_valid()); |
---|
272 | if (!is_standardized()) { |
---|
273 | invert(); |
---|
274 | arb_assert(is_standardized()); |
---|
275 | } |
---|
276 | arb_assert(is_valid()); |
---|
277 | } |
---|
278 | |
---|
279 | int PART::index() const { |
---|
280 | /*! calculate the first bit set in p, |
---|
281 | * |
---|
282 | * this is only useful if only one bit is set, |
---|
283 | * this is used to identify leafs in a ntree |
---|
284 | * |
---|
285 | * ATTENTION: p has to exist |
---|
286 | */ |
---|
287 | arb_assert(is_valid()); |
---|
288 | arb_assert(is_leaf_edge()); |
---|
289 | |
---|
290 | int pos = 0; |
---|
291 | int longs = get_longs(); |
---|
292 | for (int i=0; i<longs; i++) { |
---|
293 | PELEM p_temp = p[i]; |
---|
294 | pos = i * sizeof(PELEM) * 8; |
---|
295 | if (p_temp) { |
---|
296 | for (; p_temp; p_temp >>= 1, pos++) { |
---|
297 | ; |
---|
298 | } |
---|
299 | break; |
---|
300 | } |
---|
301 | } |
---|
302 | return pos-1; |
---|
303 | } |
---|
304 | |
---|
305 | int PART::insertionOrder_cmp(const PART *other) const { |
---|
306 | // defines order in which edges will be inserted into the consensus tree |
---|
307 | |
---|
308 | if (this == other) return 0; |
---|
309 | |
---|
310 | int cmp = is_leaf_edge() - other->is_leaf_edge(); |
---|
311 | |
---|
312 | if (!cmp) { |
---|
313 | cmp = -double_cmp(weight, other->weight); // insert bigger weight first |
---|
314 | if (!cmp) { |
---|
315 | int centerdist1 = distance_to_tree_center(); |
---|
316 | int centerdist2 = other->distance_to_tree_center(); |
---|
317 | |
---|
318 | cmp = centerdist1-centerdist2; // insert central edges first |
---|
319 | |
---|
320 | if (!cmp) { |
---|
321 | cmp = -double_cmp(get_len(), other->get_len()); // NOW REALLY insert bigger len first |
---|
322 | // (change affected test results: increased in-tree-distance, |
---|
323 | // but reduced parsimony value of result-trees) |
---|
324 | if (!cmp) { |
---|
325 | cmp = id - other->id; // strict by definition |
---|
326 | arb_assert(cmp); |
---|
327 | } |
---|
328 | } |
---|
329 | } |
---|
330 | } |
---|
331 | |
---|
332 | return cmp; |
---|
333 | } |
---|
334 | inline int PELEM_cmp(const PELEM& p1, const PELEM& p2) { |
---|
335 | return p1<p2 ? -1 : (p1>p2 ? 1 : 0); |
---|
336 | } |
---|
337 | |
---|
338 | int PART::topological_cmp(const PART *other) const { |
---|
339 | // define a strict order on topologies defined by edges |
---|
340 | |
---|
341 | if (this == other) return 0; |
---|
342 | |
---|
343 | arb_assert(is_standardized()); |
---|
344 | arb_assert(other->is_standardized()); |
---|
345 | |
---|
346 | int cmp = members - other->members; |
---|
347 | if (!cmp) { |
---|
348 | int longs = get_longs(); |
---|
349 | for (int i = 0; !cmp && i<longs; ++i) { |
---|
350 | cmp = PELEM_cmp(p[i], other->p[i]); |
---|
351 | } |
---|
352 | } |
---|
353 | |
---|
354 | arb_assert(contradicted(cmp, equals(other))); |
---|
355 | |
---|
356 | return cmp; |
---|
357 | } |
---|
358 | |
---|
359 | // -------------------------------------------------------------------------------- |
---|
360 | |
---|
361 | #ifdef UNIT_TESTS |
---|
362 | #ifndef TEST_UNIT_H |
---|
363 | #include <test_unit.h> |
---|
364 | #endif |
---|
365 | |
---|
366 | void TEST_PartRegistry() { |
---|
367 | { |
---|
368 | PartitionSize reg(0); |
---|
369 | TEST_EXPECT_EQUAL(reg.get_bits(), 0); |
---|
370 | TEST_EXPECT_EQUAL(reg.get_longs(), 0); |
---|
371 | // cutmask doesnt matter |
---|
372 | } |
---|
373 | |
---|
374 | { |
---|
375 | PartitionSize reg(1); |
---|
376 | TEST_EXPECT_EQUAL(reg.get_bits(), 1); |
---|
377 | TEST_EXPECT_EQUAL(reg.get_longs(), 1); |
---|
378 | TEST_EXPECT_EQUAL(readable_cutmask(reg.get_cutmask()), "00000000000000000000000000000001"); |
---|
379 | } |
---|
380 | |
---|
381 | { |
---|
382 | PartitionSize reg(31); |
---|
383 | TEST_EXPECT_EQUAL(reg.get_bits(), 31); |
---|
384 | TEST_EXPECT_EQUAL(reg.get_longs(), 1); |
---|
385 | TEST_EXPECT_EQUAL(readable_cutmask(reg.get_cutmask()), "01111111111111111111111111111111"); |
---|
386 | } |
---|
387 | |
---|
388 | { |
---|
389 | PartitionSize reg(32); |
---|
390 | TEST_EXPECT_EQUAL(reg.get_bits(), 32); |
---|
391 | TEST_EXPECT_EQUAL(reg.get_longs(), 1); |
---|
392 | TEST_EXPECT_EQUAL(readable_cutmask(reg.get_cutmask()), "11111111111111111111111111111111"); |
---|
393 | } |
---|
394 | |
---|
395 | { |
---|
396 | PartitionSize reg(33); |
---|
397 | TEST_EXPECT_EQUAL(reg.get_bits(), 33); |
---|
398 | TEST_EXPECT_EQUAL(reg.get_longs(), 2); |
---|
399 | TEST_EXPECT_EQUAL(readable_cutmask(reg.get_cutmask()), "00000000000000000000000000000001"); |
---|
400 | } |
---|
401 | |
---|
402 | { |
---|
403 | PartitionSize reg(95); |
---|
404 | TEST_EXPECT_EQUAL(reg.get_bits(), 95); |
---|
405 | TEST_EXPECT_EQUAL(reg.get_longs(), 3); |
---|
406 | TEST_EXPECT_EQUAL(readable_cutmask(reg.get_cutmask()), "01111111111111111111111111111111"); |
---|
407 | } |
---|
408 | } |
---|
409 | |
---|
410 | #endif // UNIT_TESTS |
---|
411 | |
---|
412 | // -------------------------------------------------------------------------------- |
---|
413 | |
---|
414 | |
---|