1 | // =============================================================== // |
---|
2 | // // |
---|
3 | // File : NT_tree_cmp.cxx // |
---|
4 | // Purpose : // |
---|
5 | // // |
---|
6 | // Institute of Microbiology (Technical University Munich) // |
---|
7 | // http://www.arb-home.de/ // |
---|
8 | // // |
---|
9 | // =============================================================== // |
---|
10 | |
---|
11 | #include "NT_species_set.h" |
---|
12 | #include <aw_msg.hxx> |
---|
13 | #include <gb_aci.h> |
---|
14 | #include <gb_aci_impl.h> |
---|
15 | #include <arb_progress.h> |
---|
16 | #include <arb_defs.h> |
---|
17 | #include <string> |
---|
18 | #include <climits> |
---|
19 | |
---|
20 | using namespace std; |
---|
21 | |
---|
22 | SpecSetRegistry::SpecSetRegistry(long nspecies_, arb_progress *progress_, const GroupMatchScorer& scorer_) : |
---|
23 | species_counter(0), |
---|
24 | nspecies(nspecies_), |
---|
25 | nsets(0), |
---|
26 | sets(ARB_calloc<RSpecSet*>(max_nsets())), |
---|
27 | scorer(scorer_), |
---|
28 | progress(progress_), |
---|
29 | species_hash(GBS_create_hash(nspecies, GB_IGNORE_CASE)) |
---|
30 | { |
---|
31 | for (int i=0; i<256; i++) { |
---|
32 | int j = i; |
---|
33 | int count = 0; |
---|
34 | while (j) { // count bits |
---|
35 | if (j&1) count ++; |
---|
36 | j = j>>1; |
---|
37 | } |
---|
38 | set_bits[i] = count; |
---|
39 | } |
---|
40 | tmp_bitstring = allocate_bitstring(); |
---|
41 | } |
---|
42 | |
---|
43 | SpecSetRegistry::~SpecSetRegistry() { |
---|
44 | for (int i=0; i<nsets; i++) delete sets[i]; |
---|
45 | free(tmp_bitstring); |
---|
46 | free(sets); |
---|
47 | GBS_free_hash(species_hash); |
---|
48 | } |
---|
49 | |
---|
50 | void SpecSetRegistry::add(const char *species_name) { |
---|
51 | if (GBS_read_hash(species_hash, species_name)) { |
---|
52 | aw_message(GBS_global_string("Warning: Species '%s' is found more than once in tree", species_name)); |
---|
53 | } |
---|
54 | else { |
---|
55 | GBS_write_hash(species_hash, species_name, ++species_counter); |
---|
56 | nt_assert(species_counter<=nspecies); // more species added than planned |
---|
57 | nt_assert(species_counter<=nspecies); |
---|
58 | } |
---|
59 | } |
---|
60 | |
---|
61 | void SpecSetRegistry::add(RSpecSet *rset) { |
---|
62 | nt_assert(rset->size()>1); // do NOT register leafs (only inner nodes allowed) |
---|
63 | nt_assert(nsets<max_nsets()); |
---|
64 | sets[nsets++] = rset; |
---|
65 | } |
---|
66 | |
---|
67 | void SpecSetRegistry::dump_bitstring(const char *tag, unsigned char *bs) { |
---|
68 | fprintf(stderr, "%s: ", tag); |
---|
69 | const long bbytes = bitstring_bytes(); |
---|
70 | for (long i = 0; i<bbytes; ++i) { |
---|
71 | int val = bs[i]; |
---|
72 | for (int b = 0; b<8; ++b) { |
---|
73 | fputc(val&128 ? 'X' : '-', stderr); |
---|
74 | val<<=1; |
---|
75 | } |
---|
76 | } |
---|
77 | fputc('\n', stderr); |
---|
78 | } |
---|
79 | |
---|
80 | GroupPenalty GroupMatchScorer::calcPenalty(long removed, long added, long commonSpecies) { |
---|
81 | // - can be calculated from removed/added/commonSpecies |
---|
82 | // - shall be reported (result param?) |
---|
83 | |
---|
84 | if (commonSpecies) { |
---|
85 | long oldGroupSize = removed+commonSpecies; |
---|
86 | long newGroupSize = added+commonSpecies; |
---|
87 | double ingroupRatio = 1.0 - removed/double(oldGroupSize); // = percent of old members in new group |
---|
88 | double outgroupRatio = added/double(newGroupSize); // = percent of non-group-members in new group |
---|
89 | |
---|
90 | if (insideLimits(ingroupRatio, outgroupRatio)) { |
---|
91 | double penalty = |
---|
92 | // abs. penalty: |
---|
93 | removed*ingroupPep + |
---|
94 | added*outgroupPep + |
---|
95 | // rel. penalty: |
---|
96 | (1-ingroupRatio)*ingroupInvRelPen + |
---|
97 | outgroupRatio*outgroupRelPen; |
---|
98 | |
---|
99 | return GroupPenalty(penalty, ingroupRatio, outgroupRatio, oldGroupSize); |
---|
100 | } |
---|
101 | } |
---|
102 | return GroupPenalty::NoMatch(); |
---|
103 | } |
---|
104 | |
---|
105 | GroupPenalty GroupMatchScorer::matchGroups(const TSpecSet& sourceSet, const RSpecSet& targetSet, long commonSpecies, long overallSpecies) { |
---|
106 | /*! calculate score of group match (tests normal and keeled insertion, reports better) |
---|
107 | * @param sourceSet tested set (= named group in source tree (if moving groups)) |
---|
108 | * @param targetSet registered set (= subtree of target tree (if moving groups)) |
---|
109 | * @param commonSpecies number of species common in sourceSet and targetSet |
---|
110 | * @param overallSpecies number of registered species (in SpecSetRegistry) |
---|
111 | * @return better GroupPenalty |
---|
112 | */ |
---|
113 | |
---|
114 | long removed = sourceSet.get_known_members() - commonSpecies; |
---|
115 | long added = targetSet.get_known_members() - commonSpecies; |
---|
116 | |
---|
117 | GroupPenalty match = calcPenalty(removed, added, commonSpecies); |
---|
118 | nt_assert(implicated(match.doesMatch(), (removed+added)<overallSpecies)); |
---|
119 | |
---|
120 | long targetSize_keeled = overallSpecies - targetSet.get_known_members(); |
---|
121 | long commonSpecies_keeled = sourceSet.get_known_members() - commonSpecies; |
---|
122 | long removed_keeled = commonSpecies; |
---|
123 | long added_keeled = targetSize_keeled - commonSpecies_keeled; |
---|
124 | |
---|
125 | GroupPenalty match_keeled = calcPenalty(removed_keeled, added_keeled, commonSpecies_keeled); |
---|
126 | match_keeled.mark_as_keeled(); |
---|
127 | if (match_keeled.doesMatch()) { // do not add to NoMatch! |
---|
128 | match_keeled.addPenalty(keelPenalty); |
---|
129 | } |
---|
130 | nt_assert(implicated(match_keeled.doesMatch(), (removed_keeled+added_keeled)<overallSpecies)); |
---|
131 | |
---|
132 | if (match_keeled.betterThan(match)) { |
---|
133 | // @@@ if this happens, the match semantic is inverted: if moving a group -> group should be keeled (by caller)! |
---|
134 | // related to #512 + #451 |
---|
135 | return match_keeled; |
---|
136 | } |
---|
137 | |
---|
138 | return match; |
---|
139 | } |
---|
140 | |
---|
141 | double GroupMatchScorer::calcUnknownMembersPenalty(const TSpecSet& sourceSet) const { |
---|
142 | return sourceSet.get_unknown_members() * unfoundPep; // penalty for unknown members in source tree |
---|
143 | } |
---|
144 | |
---|
145 | GB_ERROR GroupMatchScorer::check_validity() const { |
---|
146 | if (ingroupPep == 0.0 && ingroupInvRelPen == 0.0) { |
---|
147 | return "one ingroup penalty has to be different from zero"; |
---|
148 | } |
---|
149 | if (outgroupPep == 0.0 && outgroupRelPen == 0.0) { |
---|
150 | return "one outgroup penalty has to be different from zero"; |
---|
151 | } |
---|
152 | |
---|
153 | if (ingroupPep<0.0 || outgroupPep<0.0 || ingroupInvRelPen<0.0 || outgroupRelPen<0.0) { |
---|
154 | return "invalid negative in/outgroup penalty"; |
---|
155 | } |
---|
156 | |
---|
157 | if (!ingroup.isValid() || !outgroup.isValid()) { |
---|
158 | return "invalid limits"; |
---|
159 | } |
---|
160 | |
---|
161 | return NULp; |
---|
162 | } |
---|
163 | |
---|
164 | RSpecSet *SpecSetRegistry::search_best_match(const TSpecSet *tset, GroupPenalty& min_penalty) { |
---|
165 | // returns best match for 'tset' (against all registered sets). |
---|
166 | // NULp if no match found. |
---|
167 | // sets 'min_penalty' to penalty found for returned set. |
---|
168 | |
---|
169 | RSpecSet *bset = NULp; |
---|
170 | min_penalty = GroupPenalty::NoMatch(); |
---|
171 | |
---|
172 | const long bbytes = bitstring_bytes(); |
---|
173 | const long blongs = bitstring_longs(); |
---|
174 | |
---|
175 | unsigned char * const tbs = tset->bitstring; |
---|
176 | long * const tbl = (long*)tbs; |
---|
177 | long * const tmp_bl = (long*)tmp_bitstring; |
---|
178 | |
---|
179 | for (long i=nsets-1; i>=0; i--) { |
---|
180 | long same = 0; // amount of known members present in both sets |
---|
181 | |
---|
182 | RSpecSet *rset = sets[i]; |
---|
183 | |
---|
184 | unsigned char * const rbs = rset->bitstring; |
---|
185 | long * const rbl = (long*)rbs; |
---|
186 | |
---|
187 | for (long j = 0; j<blongs; ++j) { // LOOP_VECTORIZED |
---|
188 | tmp_bl[j] = tbl[j] & rbl[j]; |
---|
189 | } |
---|
190 | for (long j = 0; j<bbytes; ++j) { // (Note: failed to optimize: gcc wont vectorize double-indexed access) |
---|
191 | same += set_bits[tmp_bitstring[j]]; |
---|
192 | } |
---|
193 | |
---|
194 | GroupPenalty penalty = scorer.matchGroups(*tset, *rset, same, nspecies); |
---|
195 | if (penalty.betterThan(min_penalty)) { |
---|
196 | min_penalty = penalty; |
---|
197 | bset = rset; |
---|
198 | } |
---|
199 | progress->inc(); |
---|
200 | } |
---|
201 | |
---|
202 | return bset; |
---|
203 | } |
---|
204 | |
---|
205 | void GroupPenalty::registerUnfound(const GroupMatchScorer& scorer, const TSpecSet& tset) { |
---|
206 | addPenalty(scorer.calcUnknownMembersPenalty(tset)); |
---|
207 | nt_assert(unknown == -1); // registerUnfound shall be called exactly once! |
---|
208 | unknown = tset.get_unknown_members(); |
---|
209 | } |
---|
210 | |
---|
211 | double SpecSetRegistry::search_and_remember_best_match_and_log_errors(const TSpecSet *tset, FILE *log) { |
---|
212 | /*! search for best matching registered subtree. |
---|
213 | * @param tset tested set |
---|
214 | * @param log if != NULp -> log move information (not allowed when "comparing topo"!). |
---|
215 | * @return the penalty for best matching subtree (plus a small penalty for unregistered species) |
---|
216 | * or -1 if best matching subtree is "too bad" (i.e. if no matching subtree was found). |
---|
217 | * |
---|
218 | * if return value >= 0 -> stores best_match & best_node (inside best match for 'tset' found in other tree) |
---|
219 | * |
---|
220 | * When moving node info, 'tset' represents a (named!) subtree of the source tree. |
---|
221 | * When comparing topologies, 'tset' represents a subtree of the destination tree (trees are swapped in .@NTREE_move_tree_info) |
---|
222 | */ |
---|
223 | |
---|
224 | GroupPenalty match; |
---|
225 | RSpecSet *bset = search_best_match(tset, match); |
---|
226 | |
---|
227 | if (!bset) { |
---|
228 | // no good match found. |
---|
229 | // atm this happens for small groups (e.g. with 2 species) + a special placement of these species in tree. |
---|
230 | // in the future 'finding no match' will get normal behavior. |
---|
231 | return -1; |
---|
232 | } |
---|
233 | |
---|
234 | AP_tree *earlierFoundGroup = bset->matchedNode(); // group previously considered for same destination node |
---|
235 | AP_tree *currentGroup = tset->set_node; |
---|
236 | |
---|
237 | match.registerUnfound(scorer, *tset); |
---|
238 | if (match.betterThan(bset->bestMatch())) { |
---|
239 | nt_assert(!bset->bestMatch().isPerfectMatch()); // shall never supersede optimally placed group! |
---|
240 | if (earlierFoundGroup && log) { |
---|
241 | nt_assert(earlierFoundGroup->name && currentGroup->name); |
---|
242 | fprintf(log, "Group '%s' skipped (got superseded by group '%s'; same best target positions)\n", |
---|
243 | earlierFoundGroup->name, |
---|
244 | currentGroup->name); |
---|
245 | } |
---|
246 | |
---|
247 | bset->storeBetterMatch(match, currentGroup); |
---|
248 | } |
---|
249 | else { |
---|
250 | nt_assert(!match.isPerfectMatch()); // found two groups with optimal placement. should be impossible by design! can be caused by invalid penalties |
---|
251 | if (log) { |
---|
252 | nt_assert(earlierFoundGroup->name && currentGroup->name); |
---|
253 | fprintf(log, "Group '%s' skipped (does NOT supersede group '%s'; same best target positions)\n", |
---|
254 | currentGroup->name, |
---|
255 | earlierFoundGroup->name); |
---|
256 | } |
---|
257 | } |
---|
258 | |
---|
259 | return match.get_penalty(); |
---|
260 | } |
---|
261 | |
---|
262 | void SpecSet::init(AP_tree *nodei, const SpecSetRegistry& ssr) { |
---|
263 | bitstring = ssr.allocate_bitstring(); |
---|
264 | known_members = 0; |
---|
265 | set_node = nodei; |
---|
266 | } |
---|
267 | |
---|
268 | SpecSet::SpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const char *species_name) { |
---|
269 | init(nodei, ssr); |
---|
270 | |
---|
271 | long species_index = ssr.get_species_index(species_name); |
---|
272 | if (species_index) { |
---|
273 | nt_assert(species_index>0); |
---|
274 | --species_index; // [1..N] -> [0..N-1] |
---|
275 | nt_assert((species_index/8) < ssr.bitstring_bytes()); |
---|
276 | bitstring[species_index/8] |= 1 << (species_index % 8); |
---|
277 | known_members = 1; |
---|
278 | } |
---|
279 | } |
---|
280 | |
---|
281 | SpecSet::SpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const SpecSet *l, const SpecSet *r) { |
---|
282 | init(nodei, ssr); |
---|
283 | |
---|
284 | const long *lbs = (const long *)l->bitstring; |
---|
285 | const long *rbs = (const long *)r->bitstring; |
---|
286 | long *dbs = (long *)bitstring; |
---|
287 | |
---|
288 | for (long j = ssr.bitstring_longs()-1; j>=0; j--) { // LOOP_VECTORIZED=2 |
---|
289 | dbs[j] = lbs[j] | rbs[j]; |
---|
290 | } |
---|
291 | known_members = l->known_members + r->known_members; |
---|
292 | } |
---|
293 | |
---|
294 | SpecSet::~SpecSet() { |
---|
295 | free(bitstring); |
---|
296 | } |
---|
297 | |
---|
298 | RSpecSet::RSpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const char *species_name) : |
---|
299 | SpecSet(nodei, ssr, species_name), |
---|
300 | best_node(NULp) |
---|
301 | {} |
---|
302 | RSpecSet::RSpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const RSpecSet *l, const RSpecSet *r) : |
---|
303 | SpecSet(nodei, ssr, l, r), |
---|
304 | best_node(NULp) |
---|
305 | {} |
---|
306 | TSpecSet::TSpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const char *species_name) : |
---|
307 | SpecSet(nodei, ssr, species_name), |
---|
308 | unfound_species_count(1-known_members) |
---|
309 | { |
---|
310 | } |
---|
311 | TSpecSet::TSpecSet(AP_tree *nodei, const SpecSetRegistry& ssr, const TSpecSet *l, const TSpecSet *r) : |
---|
312 | SpecSet(nodei, ssr, l, r), |
---|
313 | unfound_species_count(l->unfound_species_count + r->unfound_species_count) |
---|
314 | { |
---|
315 | } |
---|
316 | |
---|
317 | RSpecSet *SpecSetRegistry::registerTree(AP_tree *node) { |
---|
318 | RSpecSet *ss; |
---|
319 | // Warning: confusing resource handling: |
---|
320 | // - leafs are returned "NOT owned by anybody" |
---|
321 | // - inner nodes are added to and owned by this->sets |
---|
322 | |
---|
323 | if (node->is_leaf()) { |
---|
324 | this->add(node->name); |
---|
325 | ss = new RSpecSet(node, *this, node->name); |
---|
326 | nt_assert(ss->is_leaf_set()); |
---|
327 | } |
---|
328 | else { |
---|
329 | RSpecSet *ls = registerTree(node->get_leftson()); |
---|
330 | RSpecSet *rs = registerTree(node->get_rightson()); |
---|
331 | |
---|
332 | ss = new RSpecSet(node, *this, ls, rs); |
---|
333 | this->add(ss); |
---|
334 | nt_assert(!ss->is_leaf_set()); |
---|
335 | |
---|
336 | if (ls->is_leaf_set()) delete ls; |
---|
337 | if (rs->is_leaf_set()) delete rs; |
---|
338 | } |
---|
339 | return ss; |
---|
340 | } |
---|
341 | |
---|
342 | TSpecSet *SpecSetRegistry::find_best_matches_info(AP_tree *node, FILE *log, bool compare_node_info) { |
---|
343 | /* Go through all nodes of subtree 'node' (part of the source tree if copying info) |
---|
344 | * and search for the best matching registered node (in this->sets; which belong to dest tree if copying info). |
---|
345 | * |
---|
346 | * [Note: If comparing topologies, source- and dest-trees were exchanged by caller.] |
---|
347 | * |
---|
348 | * Named groups (if copying info) resp. all nodes (if comparing topologies) |
---|
349 | * get compared versus all registered nodes and the best matching node is detected. |
---|
350 | * The compared node is stored inside the best matching node. |
---|
351 | */ |
---|
352 | |
---|
353 | nt_assert(contradicted(log, compare_node_info)); |
---|
354 | |
---|
355 | TSpecSet *ss = NULp; |
---|
356 | if (node->is_leaf()) { |
---|
357 | ss = new TSpecSet(node, *this, node->name); |
---|
358 | } |
---|
359 | else { |
---|
360 | TSpecSet *ls = find_best_matches_info(node->get_leftson(), log, compare_node_info); |
---|
361 | TSpecSet *rs = ls ? find_best_matches_info(node->get_rightson(), log, compare_node_info) : NULp; |
---|
362 | |
---|
363 | if (rs) { |
---|
364 | ss = new TSpecSet(node, *this, ls, rs); |
---|
365 | if (compare_node_info) { |
---|
366 | nt_assert(!log); // does not work / not allowed |
---|
367 | |
---|
368 | double penalty = search_and_remember_best_match_and_log_errors(ss, log); |
---|
369 | int ipenalty = int(penalty); |
---|
370 | |
---|
371 | // the #-sign is important (otherwise TREE_write_Newick will not work correctly; interference with bootstrap values; refer to #787 + #767) |
---|
372 | if (ipenalty>0) node->use_as_remark(GBS_global_string_copy("# %i", ipenalty)); |
---|
373 | else if (ipenalty<0) node->set_remark("# no match"); |
---|
374 | else node->remove_remark(); |
---|
375 | } |
---|
376 | else { |
---|
377 | if (node->name) { |
---|
378 | double penalty = search_and_remember_best_match_and_log_errors(ss, log); |
---|
379 | if (penalty<0) { |
---|
380 | nt_assert(log); |
---|
381 | fprintf(log, "Group '%s' doesn't fit to any destination subtree.\n", node->name); |
---|
382 | } |
---|
383 | } |
---|
384 | } |
---|
385 | } |
---|
386 | delete rs; |
---|
387 | delete ls; |
---|
388 | } |
---|
389 | |
---|
390 | if (ss && progress->aborted()) { |
---|
391 | delete ss; |
---|
392 | ss = NULp; // abort recursion after user-abort |
---|
393 | } |
---|
394 | return ss; |
---|
395 | } |
---|
396 | |
---|
397 | // ----------------------------- |
---|
398 | // local ACI extension |
---|
399 | |
---|
400 | class GroupXfer_callenv : public GBL_call_env { |
---|
401 | RefPtr<const char> name; |
---|
402 | const GroupPenalty& match; |
---|
403 | |
---|
404 | int oldGroupSize; |
---|
405 | int newGroupSize; |
---|
406 | |
---|
407 | public: |
---|
408 | GroupXfer_callenv(const GBL_env& env_, const char *name_, const GroupPenalty& match_, int oldGroupSize_, int newGroupSize_) : |
---|
409 | GBL_call_env(NULp, env_), |
---|
410 | name(name_), |
---|
411 | match(match_), |
---|
412 | oldGroupSize(oldGroupSize_), |
---|
413 | newGroupSize(newGroupSize_) |
---|
414 | {} |
---|
415 | |
---|
416 | const char *get_group_name() const { return name; } |
---|
417 | const GroupPenalty& get_match() const { return match; } |
---|
418 | int get_old_group_size() const { return oldGroupSize; } |
---|
419 | int get_new_group_size() const { return newGroupSize; } |
---|
420 | }; |
---|
421 | |
---|
422 | inline const GroupXfer_callenv& custom_env(GBL_command_arguments *args) { return DOWNCAST_REFERENCE(const GroupXfer_callenv, args->get_callEnv()); } |
---|
423 | inline const GroupPenalty& custom_match(GBL_command_arguments *args) { return custom_env(args).get_match(); } |
---|
424 | |
---|
425 | using namespace GBL_IMPL; |
---|
426 | |
---|
427 | #define IMPL_FORMATVALUE_CMD(args,fmt,value) \ |
---|
428 | COMMAND_DROPS_INPUT_STREAMS(args); \ |
---|
429 | GB_ERROR error = check_no_parameter(args); \ |
---|
430 | if (!error) FORMAT_2_OUT(args, fmt, value); \ |
---|
431 | return error |
---|
432 | |
---|
433 | static GB_ERROR gxl_groupname(GBL_command_arguments *args) { IMPL_FORMATVALUE_CMD(args, "%s", custom_env(args).get_group_name()); } |
---|
434 | static GB_ERROR gxl_penalty(GBL_command_arguments *args) { IMPL_FORMATVALUE_CMD(args, "%f", custom_match(args).get_penalty()); } |
---|
435 | static GB_ERROR gxl_ingroup(GBL_command_arguments *args) { IMPL_FORMATVALUE_CMD(args, "%.1f%%", custom_match(args).get_ingroup_ratio()*100.0); } |
---|
436 | static GB_ERROR gxl_outgroup(GBL_command_arguments *args) { IMPL_FORMATVALUE_CMD(args, "%.1f%%", custom_match(args).get_outgroup_ratio()*100.0); } |
---|
437 | static GB_ERROR gxl_oldsize(GBL_command_arguments *args) { IMPL_FORMATVALUE_CMD(args, "%i", custom_env(args).get_old_group_size()); } |
---|
438 | static GB_ERROR gxl_newsize(GBL_command_arguments *args) { IMPL_FORMATVALUE_CMD(args, "%i", custom_env(args).get_new_group_size()); } |
---|
439 | static GB_ERROR gxl_unknown(GBL_command_arguments *args) { IMPL_FORMATVALUE_CMD(args, "%i", custom_match(args).get_unknown()); } |
---|
440 | static GB_ERROR gxl_keeled(GBL_command_arguments *args) { IMPL_FORMATVALUE_CMD(args, "%i", int(custom_match(args).shouldHaveBeenKeeled())); } |
---|
441 | |
---|
442 | static GBL_command_definition groupXfer_command_table[] = { |
---|
443 | { "groupname", gxl_groupname }, |
---|
444 | { "penalty", gxl_penalty }, |
---|
445 | { "ingroup", gxl_ingroup }, |
---|
446 | { "outgroup", gxl_outgroup }, |
---|
447 | { "oldsize", gxl_oldsize }, |
---|
448 | { "newsize", gxl_newsize }, |
---|
449 | { "unknown", gxl_unknown }, |
---|
450 | { "keeled", gxl_keeled }, |
---|
451 | { NULp, NULp } |
---|
452 | }; |
---|
453 | |
---|
454 | static const GBL_command_lookup_table& get_GroupXfer_customized_ACI_commands() { |
---|
455 | static GBL_custom_command_lookup_table clt(groupXfer_command_table, |
---|
456 | ARRAY_ELEMS(groupXfer_command_table)-1, |
---|
457 | ACI_get_standard_commands()); |
---|
458 | return clt; |
---|
459 | } |
---|
460 | |
---|
461 | |
---|
462 | |
---|
463 | GB_ERROR SpecSetRegistry::write_node_information(FILE *log, bool delete_old_nodes, GroupsToTransfer what, const char *aci) { |
---|
464 | GB_ERROR error = NULp; |
---|
465 | |
---|
466 | if (log) fputs("\nDetailed group changes:\n\n", log); |
---|
467 | |
---|
468 | GBL_env env(NULp, // have no item here |
---|
469 | NULp, // use no treename |
---|
470 | get_GroupXfer_customized_ACI_commands()); |
---|
471 | |
---|
472 | for (long j=this->nsets-1; j>=0 && !error; j--) { |
---|
473 | RSpecSet * const cset = this->sets[j]; |
---|
474 | |
---|
475 | char *old_group_name = NULp; // !=NULp -> old node has been deleted |
---|
476 | AP_tree *sourceNode = cset->matchedNode(); |
---|
477 | AP_tree *targetNode = cset->set_node; |
---|
478 | bool insert_new_node = sourceNode && sourceNode->name; |
---|
479 | |
---|
480 | if (what == XFER_GROUPS_WITH_MARKED && insert_new_node) { |
---|
481 | if (!targetNode->contains_marked_species()) insert_new_node = false; // @@@ this tests the wrong tree (the target tree)! (#792) |
---|
482 | } |
---|
483 | |
---|
484 | if (targetNode->gb_node && (delete_old_nodes || insert_new_node)) { // There is already a node, delete old |
---|
485 | if (!targetNode->name) { |
---|
486 | GBDATA *gb_name = GB_entry(targetNode->gb_node, "group_name"); |
---|
487 | if (gb_name) { |
---|
488 | targetNode->name = GB_read_string(gb_name); |
---|
489 | } |
---|
490 | else { |
---|
491 | targetNode->name = ARB_strdup("<gb_node w/o name>"); // @@@ happens whenever any other node info is present (e.g. linewidth or angles; node gets deleted when delete_old_nodes==true); #793 |
---|
492 | } |
---|
493 | } |
---|
494 | |
---|
495 | old_group_name = ARB_strdup(targetNode->name); // store old_group_name to rename new inserted node |
---|
496 | |
---|
497 | error = GB_delete(targetNode->gb_node); |
---|
498 | if (!error) { |
---|
499 | targetNode->gb_node = NULp; |
---|
500 | freenull(targetNode->name); |
---|
501 | } |
---|
502 | } |
---|
503 | |
---|
504 | if (!error) { |
---|
505 | if (insert_new_node) { |
---|
506 | targetNode->gb_node = GB_create_container(targetNode->get_tree_root()->get_gb_tree(), "node"); |
---|
507 | error = GB_copy_dropProtectMarksAndTempstate(targetNode->gb_node, sourceNode->gb_node); // @@@ this copies more info than group-name (e.g. linewidth or angles; unwanted!); #793 |
---|
508 | // @@@ need a function which copies/moves/deletes group-related info (between two nodes). currently affected fields: 'group_name', 'grouped', 'keeled'; #793 |
---|
509 | |
---|
510 | if (!error) { |
---|
511 | GBDATA *gb_name = GB_entry(targetNode->gb_node, "group_name"); |
---|
512 | nt_assert(gb_name); |
---|
513 | if (gb_name) { |
---|
514 | const GroupPenalty& match = cset->bestMatch(); |
---|
515 | |
---|
516 | int oldGroupsize = match.get_groupsize(); |
---|
517 | int newGroupsize = cset->size(); |
---|
518 | |
---|
519 | char *new_group_name = NULp; // new wanted groupname |
---|
520 | { |
---|
521 | char *source_group_name = GB_read_string(gb_name); // existing groupname (in group-container copied from source tree) |
---|
522 | if (aci) { |
---|
523 | GroupXfer_callenv callEnv(env, source_group_name, match, oldGroupsize, newGroupsize); |
---|
524 | new_group_name = GB_command_interpreter_in_env("", aci, callEnv); |
---|
525 | if (!new_group_name) error = GB_await_error(); |
---|
526 | } |
---|
527 | else { |
---|
528 | reassign(new_group_name, source_group_name); |
---|
529 | } |
---|
530 | free(source_group_name); |
---|
531 | } |
---|
532 | |
---|
533 | if (!error) { |
---|
534 | nt_assert(new_group_name); |
---|
535 | if (old_group_name) { // overwrite existing group |
---|
536 | if (!delete_old_nodes) { |
---|
537 | if (strcmp(old_group_name, new_group_name) != 0) { // old and new name differ |
---|
538 | char *combined_name = GBS_global_string_copy("%s [was: %s]", new_group_name, old_group_name); |
---|
539 | if (log) fprintf(log, "Destination group '%s' overwritten by '%s'", old_group_name, combined_name); |
---|
540 | reassign(new_group_name, combined_name); |
---|
541 | } |
---|
542 | else { // name matches existing |
---|
543 | if (log) fprintf(log, "Group '%s' remains unchanged", old_group_name); |
---|
544 | } |
---|
545 | } |
---|
546 | else { |
---|
547 | if (log) { |
---|
548 | if (strcmp(old_group_name, new_group_name) == 0) { |
---|
549 | fprintf(log, "Group '%s' remains unchanged", old_group_name); |
---|
550 | } |
---|
551 | else { |
---|
552 | fprintf(log, "Destination group '%s' overwritten by '%s'", old_group_name, new_group_name); |
---|
553 | } |
---|
554 | } |
---|
555 | } |
---|
556 | } |
---|
557 | else { // no group at destination -> insert |
---|
558 | if (log) fprintf(log, "Group '%s' inserted", new_group_name); |
---|
559 | } |
---|
560 | } |
---|
561 | |
---|
562 | if (!error) error = GBT_write_group_name(gb_name, new_group_name, true); // always write wanted target name |
---|
563 | |
---|
564 | if (log) { |
---|
565 | if (error) { |
---|
566 | fprintf(log, " (Failed! Reason: %s)\n", error); |
---|
567 | } |
---|
568 | else if (match.isPerfectMatch()) { |
---|
569 | fputc('\n', log); |
---|
570 | } |
---|
571 | else { |
---|
572 | fprintf(log, " (not placed optimal; penalty=%f; in=%.1f%%/%i; out=%.1f%%/%i%s)\n", |
---|
573 | match.get_penalty(), |
---|
574 | match.get_ingroup_ratio() *100.0, oldGroupsize, |
---|
575 | match.get_outgroup_ratio()*100.0, newGroupsize, |
---|
576 | match.shouldHaveBeenKeeled() ? "; WARNING: matched inverse set, i.e. rest of tree!" : ""); |
---|
577 | } |
---|
578 | } |
---|
579 | |
---|
580 | free(new_group_name); |
---|
581 | } |
---|
582 | } |
---|
583 | } |
---|
584 | else { |
---|
585 | nt_assert(implicated(old_group_name, delete_old_nodes)); |
---|
586 | if (old_group_name && log) { |
---|
587 | fprintf(log, "Destination group '%s' removed\n", old_group_name); |
---|
588 | } |
---|
589 | } |
---|
590 | } |
---|
591 | free(old_group_name); |
---|
592 | } |
---|
593 | return error; |
---|
594 | } |
---|
595 | |
---|
596 | void SpecSetRegistry::finish(GB_ERROR& error) { |
---|
597 | if (!error) error = progress->error_if_aborted(); |
---|
598 | progress->done(); |
---|
599 | } |
---|
600 | |
---|
601 | GB_ERROR NTREE_move_tree_info(GBDATA *gb_main, const char *tree_source, const char *tree_dest, const char *log_file, GroupTransferMode mode, GroupsToTransfer what, const GroupMatchScorer& scorer, const char *aci) { |
---|
602 | GB_ERROR error = NULp; |
---|
603 | FILE *log = NULp; |
---|
604 | |
---|
605 | if (!contradicted(mode == COMPARE_TOPOLOGY, log_file)) { |
---|
606 | GBK_terminatef("invalid use of function NTREE_move_tree_info: mode=%i log_file=%s\n", int(mode), log_file); |
---|
607 | } |
---|
608 | |
---|
609 | if (aci && !aci[0]) aci = NULp; // handle "empty ACI" like "no ACI" |
---|
610 | |
---|
611 | if (mode == COMPARE_TOPOLOGY) { |
---|
612 | // info is written into 'tree_source' in .@find_best_matches_info |
---|
613 | // (but we want to modify destination tree - like 'mode node info' does) |
---|
614 | std::swap(tree_source, tree_dest); |
---|
615 | } |
---|
616 | |
---|
617 | if (log_file) { |
---|
618 | nt_assert(mode == REMOVE_EXISTING_GROUPS || mode == KEEP_OLD_NAMES); |
---|
619 | log = fopen(log_file, "w"); |
---|
620 | if (log) { |
---|
621 | const char *transferredGroups = what == XFER_GROUPS_WITH_MARKED ? "groups containing marked" : "all"; |
---|
622 | const char *overwriteMode = NULp; |
---|
623 | |
---|
624 | switch (mode) { |
---|
625 | case COMPARE_TOPOLOGY: nt_assert(0); break; |
---|
626 | case REMOVE_EXISTING_GROUPS: overwriteMode = "remove existing groups"; break; |
---|
627 | case KEEP_OLD_NAMES: overwriteMode = "keep old names"; break; |
---|
628 | } |
---|
629 | |
---|
630 | nt_assert(transferredGroups && overwriteMode); |
---|
631 | |
---|
632 | fprintf(log, // start of log |
---|
633 | "LOGFILE: transfer group information\n" |
---|
634 | "\n" |
---|
635 | " Source tree '%s'\n" |
---|
636 | "Destination tree '%s'\n" |
---|
637 | "\n" |
---|
638 | "transferred groups: %s\n" |
---|
639 | " overwrite mode: %s\n" |
---|
640 | "\n", |
---|
641 | tree_source, |
---|
642 | tree_dest, |
---|
643 | transferredGroups, |
---|
644 | overwriteMode); |
---|
645 | } |
---|
646 | else { |
---|
647 | error = GB_IO_error("writing", log_file); |
---|
648 | } |
---|
649 | } |
---|
650 | |
---|
651 | GB_begin_transaction(gb_main); |
---|
652 | |
---|
653 | AP_tree_root rsource(new AliView(gb_main), NULp, false, NULp); |
---|
654 | AP_tree_root rdest (new AliView(gb_main), NULp, false, NULp); |
---|
655 | AP_tree_root& rsave = (mode == COMPARE_TOPOLOGY) ? rsource : rdest; |
---|
656 | arb_progress progress(mode == COMPARE_TOPOLOGY ? "Compare topologies" : (mode == REMOVE_EXISTING_GROUPS ? "Copy group information" : "Add group information")); |
---|
657 | |
---|
658 | if (!error) error = scorer.check_validity(); |
---|
659 | if (!error) error = rsource.loadFromDB(tree_source); |
---|
660 | if (!error) error = rsource.linkToDB(NULp, NULp); |
---|
661 | if (!error) { |
---|
662 | error = rdest.loadFromDB(tree_dest); |
---|
663 | if (!error) error = rdest.linkToDB(NULp, NULp); |
---|
664 | if (!error) { |
---|
665 | AP_tree *source = rsource.get_root_node(); |
---|
666 | AP_tree *dest = rdest.get_root_node(); |
---|
667 | |
---|
668 | int dest_leafs = dest->count_leafs(); |
---|
669 | int dest_nodes = leafs_2_nodes(dest_leafs, ROOTED); |
---|
670 | int dest_inodes = dest_nodes-dest_leafs; // inner nodes |
---|
671 | |
---|
672 | int source_leafs = source->count_leafs(); |
---|
673 | int source_nodes = leafs_2_nodes(source_leafs, ROOTED); |
---|
674 | int source_inodes = source_nodes-source_leafs; // inner nodes |
---|
675 | |
---|
676 | size_t compared_nodes = mode == COMPARE_TOPOLOGY ? source_inodes : source->count_clades(); |
---|
677 | size_t progress_steps = (compared_nodes + 2) * dest_inodes; // 2 for searching both sides of root |
---|
678 | |
---|
679 | arb_progress compare_progress(progress_steps); |
---|
680 | compare_progress.subtitle("Register topology"); |
---|
681 | |
---|
682 | { |
---|
683 | SpecSetRegistry ssr(dest_leafs, &compare_progress, scorer); |
---|
684 | |
---|
685 | ssr.registerTree(dest); |
---|
686 | |
---|
687 | if (source_leafs < 3) error = GB_export_errorf("tree '%s' has less than 3 species", tree_source); |
---|
688 | else { |
---|
689 | compare_progress.subtitle("Match subtrees"); |
---|
690 | |
---|
691 | TSpecSet *root_tsetl = ssr.find_best_matches_info(source->get_leftson(), log, mode == COMPARE_TOPOLOGY); |
---|
692 | TSpecSet *root_tsetr = root_tsetl ? ssr.find_best_matches_info(source->get_rightson(), log, mode == COMPARE_TOPOLOGY) : NULp; |
---|
693 | |
---|
694 | if (root_tsetr) { |
---|
695 | if (mode != COMPARE_TOPOLOGY) { |
---|
696 | compare_progress.subtitle("Write group information"); |
---|
697 | error = ssr.write_node_information(log, mode == REMOVE_EXISTING_GROUPS, what, aci); |
---|
698 | } |
---|
699 | |
---|
700 | if (!error) { |
---|
701 | GroupPenalty dummy; |
---|
702 | |
---|
703 | ssr.setScorer(GroupMatchScorer()); // otherwise may find no matches |
---|
704 | RSpecSet *new_root_rsetl = ssr.search_best_match(root_tsetl, dummy); |
---|
705 | RSpecSet *new_root_rsetr = ssr.search_best_match(root_tsetr, dummy); |
---|
706 | |
---|
707 | nt_assert(new_root_rsetl && new_root_rsetr); // should always report matches (even really bad ones!) |
---|
708 | |
---|
709 | AP_tree *new_rootl = new_root_rsetl->set_node; |
---|
710 | AP_tree *new_rootr = new_root_rsetr->set_node; |
---|
711 | |
---|
712 | // naive attempt to set root of target tree |
---|
713 | // @@@ setting root wont work very well.. |
---|
714 | // - if these two nodes are NOT adjacent (esp. if 'new_rootr' is less "good" than 'new_rootl') |
---|
715 | // - probably modifies topology even if not necessary; see ad_trees.cxx@NAIVE_ROOTING |
---|
716 | new_rootl->set_root(); |
---|
717 | new_rootr->set_root(); |
---|
718 | |
---|
719 | compare_progress.subtitle("Save trees"); |
---|
720 | |
---|
721 | AP_tree *saved_root = (mode == COMPARE_TOPOLOGY) ? source : new_rootr->get_root_node(); |
---|
722 | error = GBT_overwrite_tree(rsave.get_gb_tree(), saved_root); |
---|
723 | |
---|
724 | if (!error) { |
---|
725 | char *entry; |
---|
726 | if (mode == COMPARE_TOPOLOGY) { |
---|
727 | entry = GBS_global_string_copy("Compared topology with %s", tree_dest); // Note: the modified tree is 'tree_source'! |
---|
728 | } |
---|
729 | else { |
---|
730 | const char *copiedOrAdded = mode == REMOVE_EXISTING_GROUPS ? "Copied" : "Added"; |
---|
731 | |
---|
732 | entry = GBS_global_string_copy("%s node info %sfrom %s", |
---|
733 | copiedOrAdded, |
---|
734 | what == XFER_GROUPS_WITH_MARKED ? "of marked " : "", |
---|
735 | tree_source); |
---|
736 | } |
---|
737 | GBT_log_to_tree_remark(rsave.get_gb_tree(), entry, true); |
---|
738 | free(entry); |
---|
739 | } |
---|
740 | } |
---|
741 | } |
---|
742 | |
---|
743 | delete root_tsetl; |
---|
744 | delete root_tsetr; |
---|
745 | } |
---|
746 | |
---|
747 | ssr.finish(error); |
---|
748 | } |
---|
749 | if (error) compare_progress.done(); |
---|
750 | } |
---|
751 | } |
---|
752 | |
---|
753 | if (log) { |
---|
754 | if (error) fprintf(log, "\nError: %s\n", error); // write error to log as well |
---|
755 | else fputs("[done]\n", log); |
---|
756 | fclose(log); |
---|
757 | } |
---|
758 | |
---|
759 | return GB_end_transaction(gb_main, error); |
---|
760 | } |
---|
761 | |
---|
762 | // -------------------------------------------------------------------------------- |
---|
763 | |
---|
764 | #ifdef UNIT_TESTS |
---|
765 | #ifndef TEST_UNIT_H |
---|
766 | #include <test_unit.h> |
---|
767 | #endif |
---|
768 | |
---|
769 | void TEST_species_sets() { |
---|
770 | // SpecSetRegistry ssr(0, NULp); // does fatal error - fine |
---|
771 | |
---|
772 | GroupMatchScorer defaultScorer; |
---|
773 | { |
---|
774 | SpecSetRegistry s1(1, NULp, defaultScorer); |
---|
775 | TEST_EXPECT_EQUAL(s1.bitstring_bytes(), 1); |
---|
776 | TEST_EXPECT_EQUAL(s1.bitstring_longs(), 1); |
---|
777 | |
---|
778 | s1.add("one"); |
---|
779 | // s1.add("too much"); // fails assertion; fine |
---|
780 | TSpecSet t1(NULp, s1, "one"); |
---|
781 | } |
---|
782 | { |
---|
783 | SpecSetRegistry s8(8, NULp, defaultScorer); |
---|
784 | TEST_EXPECT_EQUAL(s8.bitstring_bytes(), 1); |
---|
785 | TEST_EXPECT_EQUAL(s8.bitstring_longs(), 1); |
---|
786 | |
---|
787 | for (int i = 1; i<=8; ++i) { |
---|
788 | s8.add(GBS_global_string("%i", i)); |
---|
789 | } |
---|
790 | // s8.add("too much"); // fails assertion; fine |
---|
791 | for (int i = 1; i<=8; ++i) { |
---|
792 | TSpecSet t(NULp, s8, GBS_global_string("%i", i)); |
---|
793 | } |
---|
794 | } |
---|
795 | { |
---|
796 | SpecSetRegistry s9(9, NULp, defaultScorer); |
---|
797 | TEST_EXPECT_EQUAL(s9.bitstring_bytes(), 2); |
---|
798 | TEST_EXPECT_EQUAL(s9.bitstring_longs(), 1); |
---|
799 | |
---|
800 | for (int i = 1; i<=9; ++i) { |
---|
801 | s9.add(GBS_global_string("%i", i)); |
---|
802 | } |
---|
803 | // s9.add("too much"); // fails assertion; fine |
---|
804 | for (int i = 1; i<=9; ++i) { |
---|
805 | TSpecSet t(NULp, s9, GBS_global_string("%i", i)); |
---|
806 | } |
---|
807 | } |
---|
808 | |
---|
809 | { |
---|
810 | SpecSetRegistry s64(64, NULp, defaultScorer); |
---|
811 | TEST_EXPECT_EQUAL(s64.bitstring_bytes(), 8); |
---|
812 | TEST_EXPECT_EQUAL(s64.bitstring_longs(), 1); |
---|
813 | |
---|
814 | for (int i = 1; i<=64; ++i) { |
---|
815 | s64.add(GBS_global_string("%i", i)); |
---|
816 | } |
---|
817 | // s64.add("too much"); // fails assertion; fine |
---|
818 | for (int i = 1; i<=64; ++i) { |
---|
819 | TSpecSet t(NULp, s64, GBS_global_string("%i", i)); |
---|
820 | } |
---|
821 | } |
---|
822 | { |
---|
823 | SpecSetRegistry s65(65, NULp, defaultScorer); |
---|
824 | TEST_EXPECT_EQUAL(s65.bitstring_bytes(), 9); |
---|
825 | TEST_EXPECT_EQUAL(s65.bitstring_longs(), 2); |
---|
826 | |
---|
827 | for (int i = 1; i<=65; ++i) { |
---|
828 | s65.add(GBS_global_string("%i", i)); |
---|
829 | } |
---|
830 | // s65.add("too much"); // fails assertion; fine |
---|
831 | for (int i = 1; i<=65; ++i) { |
---|
832 | TSpecSet t(NULp, s65, GBS_global_string("%i", i)); |
---|
833 | } |
---|
834 | } |
---|
835 | } |
---|
836 | |
---|
837 | #endif // UNIT_TESTS |
---|
838 | |
---|
839 | // -------------------------------------------------------------------------------- |
---|