1 | // =============================================================== // |
---|
2 | // // |
---|
3 | // File : NT_tree_cmp.cxx // |
---|
4 | // Purpose : // |
---|
5 | // // |
---|
6 | // Institute of Microbiology (Technical University Munich) // |
---|
7 | // http://www.arb-home.de/ // |
---|
8 | // // |
---|
9 | // =============================================================== // |
---|
10 | |
---|
11 | #include "NT_species_set.h" |
---|
12 | #include "NT_local.h" |
---|
13 | #include <aw_msg.hxx> |
---|
14 | #include <arb_progress.h> |
---|
15 | #include <string> |
---|
16 | #include <climits> |
---|
17 | |
---|
18 | using namespace std; |
---|
19 | |
---|
20 | AWT_species_set_root::AWT_species_set_root(GBDATA *gb_maini, long nspeciesi, arb_progress *progress_) { |
---|
21 | memset((char *)this, 0, sizeof(*this)); |
---|
22 | gb_main = gb_maini; |
---|
23 | nspecies = nspeciesi; |
---|
24 | progress = progress_; |
---|
25 | sets = (AWT_species_set **)GB_calloc(sizeof(AWT_species_set *), (size_t)leafs_2_nodes(nspecies, ROOTED)); |
---|
26 | |
---|
27 | for (int i=0; i<256; i++) { |
---|
28 | int j = i; |
---|
29 | int count = 0; |
---|
30 | while (j) { // count bits |
---|
31 | if (j&1) count ++; |
---|
32 | j = j>>1; |
---|
33 | } |
---|
34 | diff_bits[i] = count; |
---|
35 | } |
---|
36 | species_hash = GBS_create_hash(nspecies, GB_IGNORE_CASE); |
---|
37 | species_counter = 1; |
---|
38 | } |
---|
39 | |
---|
40 | AWT_species_set_root::~AWT_species_set_root() { |
---|
41 | for (int i=0; i<nsets; i++) delete sets[i]; |
---|
42 | free(sets); |
---|
43 | GBS_free_hash(species_hash); |
---|
44 | } |
---|
45 | |
---|
46 | void AWT_species_set_root::add(const char *species_name) { |
---|
47 | if (GBS_read_hash(species_hash, species_name)) { |
---|
48 | aw_message(GBS_global_string("Warning: Species '%s' is found more than once in tree", species_name)); |
---|
49 | } |
---|
50 | else { |
---|
51 | GBS_write_hash(species_hash, species_name, species_counter++); |
---|
52 | } |
---|
53 | } |
---|
54 | |
---|
55 | void AWT_species_set_root::add(AWT_species_set *set) { |
---|
56 | nt_assert(nsets<nspecies*2); |
---|
57 | sets[nsets++] = set; |
---|
58 | } |
---|
59 | |
---|
60 | AWT_species_set *AWT_species_set_root::search_best_match(const AWT_species_set *set, long& best_cost) { |
---|
61 | // returns best match for 'set' |
---|
62 | // sets 'best_cost' to minimum mismatches |
---|
63 | |
---|
64 | AWT_species_set *result = 0; |
---|
65 | best_cost = LONG_MAX; |
---|
66 | |
---|
67 | unsigned char *sbs = set->bitstring; |
---|
68 | for (long i=nsets-1; i>=0; i--) { |
---|
69 | long sum = 0; |
---|
70 | unsigned char *rsb = sets[i]->bitstring; |
---|
71 | |
---|
72 | for (long j=bitstring_bytes()-1; j>=0; j--) { |
---|
73 | sum += diff_bits[ sbs[j] ^ rsb[j] ]; |
---|
74 | } |
---|
75 | if (sum > nspecies/2) sum = nspecies - sum; // take always the minimum |
---|
76 | if (sum < best_cost) { |
---|
77 | best_cost = sum; |
---|
78 | result = sets[i]; |
---|
79 | } |
---|
80 | } |
---|
81 | return result; |
---|
82 | } |
---|
83 | |
---|
84 | int AWT_species_set_root::search_and_remember_best_match_and_log_errors(const AWT_species_set *set, FILE *log_file) { |
---|
85 | // set's best_cost & best_node (of best match for 'set' found in other tree) |
---|
86 | // returns the number of mismatches (plus a small penalty for missing species) |
---|
87 | // |
---|
88 | // When moving node info, 'set' represents a subtree of the source tree. |
---|
89 | // When comparing topologies, 'set' represents a subtree of the destination tree. |
---|
90 | |
---|
91 | long net_cost; |
---|
92 | AWT_species_set *bs = search_best_match(set, net_cost); |
---|
93 | |
---|
94 | double best_cost = net_cost + set->unfound_species_count * 0.0001; |
---|
95 | if (best_cost < bs->best_cost) { |
---|
96 | bs->best_cost = best_cost; |
---|
97 | bs->best_node = set->node; |
---|
98 | } |
---|
99 | if (log_file) { |
---|
100 | if (net_cost) { |
---|
101 | fprintf(log_file, "Group '%s' not placed optimal (%li errors)\n", |
---|
102 | set->node->name, |
---|
103 | net_cost); |
---|
104 | } |
---|
105 | } |
---|
106 | return net_cost; |
---|
107 | } |
---|
108 | |
---|
109 | void AWT_species_set::init(AP_tree *nodei, const AWT_species_set_root *ssr) { |
---|
110 | memset((char *)this, 0, sizeof(*this)); |
---|
111 | |
---|
112 | bitstring = (unsigned char *)GB_calloc(sizeof(char), ssr->bitstring_longs()*sizeof(long)); |
---|
113 | this->node = nodei; |
---|
114 | best_cost = 0x7fffffff; |
---|
115 | } |
---|
116 | |
---|
117 | AWT_species_set::AWT_species_set(AP_tree *nodei, const AWT_species_set_root *ssr, const char *species_name) { |
---|
118 | init(nodei, ssr); |
---|
119 | |
---|
120 | long species_index = GBS_read_hash(ssr->species_hash, species_name); |
---|
121 | if (species_index) { |
---|
122 | bitstring[species_index/8] |= 1 << (species_index % 8); |
---|
123 | } |
---|
124 | else { |
---|
125 | unfound_species_count = 1; |
---|
126 | } |
---|
127 | } |
---|
128 | |
---|
129 | AWT_species_set::AWT_species_set(AP_tree *nodei, const AWT_species_set_root *ssr, const AWT_species_set *l, const AWT_species_set *r) { |
---|
130 | init(nodei, ssr); |
---|
131 | |
---|
132 | const long *lbits = (const long *)l->bitstring; |
---|
133 | const long *rbits = (const long *)r->bitstring; |
---|
134 | long *dest = (long *)bitstring; |
---|
135 | |
---|
136 | for (long j = ssr->bitstring_longs()-1; j>=0; j--) { |
---|
137 | dest[j] = lbits[j] | rbits[j]; |
---|
138 | } |
---|
139 | unfound_species_count = l->unfound_species_count + r->unfound_species_count; |
---|
140 | } |
---|
141 | |
---|
142 | AWT_species_set::~AWT_species_set() { |
---|
143 | free(bitstring); |
---|
144 | } |
---|
145 | |
---|
146 | AWT_species_set *AWT_species_set_root::move_tree_2_ssr(AP_tree *node) { |
---|
147 | AWT_species_set *ss; |
---|
148 | // Warning: confusing resource handling: |
---|
149 | // - leafs are returned "NOT owned by anybody" |
---|
150 | // - inner nodes are added to and owned by this->sets |
---|
151 | |
---|
152 | if (node->is_leaf) { |
---|
153 | this->add(node->name); |
---|
154 | ss = new AWT_species_set(node, this, node->name); |
---|
155 | nt_assert(ss->is_leaf_set()); |
---|
156 | } |
---|
157 | else { |
---|
158 | AWT_species_set *ls = move_tree_2_ssr(node->get_leftson()); |
---|
159 | AWT_species_set *rs = move_tree_2_ssr(node->get_rightson()); |
---|
160 | |
---|
161 | ss = new AWT_species_set(node, this, ls, rs); |
---|
162 | this->add(ss); |
---|
163 | nt_assert(!ss->is_leaf_set()); |
---|
164 | |
---|
165 | if (ls->is_leaf_set()) delete ls; |
---|
166 | if (rs->is_leaf_set()) delete rs; |
---|
167 | } |
---|
168 | return ss; |
---|
169 | } |
---|
170 | |
---|
171 | AWT_species_set *AWT_species_set_root::find_best_matches_info(AP_tree *node, FILE *log, bool compare_node_info) { |
---|
172 | /* Go through all node of the source tree and search for the best |
---|
173 | * matching node in dest_tree (meaning searching ssr->sets) |
---|
174 | * If a match is found, set ssr->sets to this match. |
---|
175 | */ |
---|
176 | |
---|
177 | AWT_species_set *ss = NULL; |
---|
178 | if (node->is_leaf) { |
---|
179 | ss = new AWT_species_set(node, this, node->name); |
---|
180 | } |
---|
181 | else { |
---|
182 | AWT_species_set *ls = find_best_matches_info(node->get_leftson(), log, compare_node_info); |
---|
183 | AWT_species_set *rs = ls ? find_best_matches_info(node->get_rightson(), log, compare_node_info) : NULL; |
---|
184 | |
---|
185 | if (rs) { |
---|
186 | ss = new AWT_species_set(node, this, ls, rs); |
---|
187 | if (compare_node_info) { |
---|
188 | int mismatches = search_and_remember_best_match_and_log_errors(ss, log); |
---|
189 | // the #-sign is important (otherwise TREE_write_Newick will not work correctly; interference with bootstrap values!) |
---|
190 | char *new_remark = mismatches ? GBS_global_string_copy("# %i", mismatches) : NULL; |
---|
191 | node->use_as_remark(new_remark); |
---|
192 | } |
---|
193 | else { |
---|
194 | if (node->name) { |
---|
195 | search_and_remember_best_match_and_log_errors(ss, log); |
---|
196 | } |
---|
197 | } |
---|
198 | } |
---|
199 | delete rs; |
---|
200 | delete ls; |
---|
201 | } |
---|
202 | if (ss) { |
---|
203 | progress->inc(); |
---|
204 | if (progress->aborted()) { |
---|
205 | delete ss; |
---|
206 | ss = NULL; |
---|
207 | } |
---|
208 | } |
---|
209 | return ss; |
---|
210 | } |
---|
211 | |
---|
212 | GB_ERROR AWT_species_set_root::copy_node_information(FILE *log, bool delete_old_nodes, bool nodes_with_marked_only) { |
---|
213 | GB_ERROR error = NULL; |
---|
214 | |
---|
215 | if (log) fputs("\nDetailed group changes:\n\n", log); |
---|
216 | |
---|
217 | for (long j=this->nsets-1; j>=0 && !error; j--) { |
---|
218 | AWT_species_set *cset = this->sets[j]; |
---|
219 | |
---|
220 | char *old_group_name = NULL; |
---|
221 | bool insert_new_node = cset->best_node && cset->best_node->name; |
---|
222 | |
---|
223 | if (nodes_with_marked_only && insert_new_node) { |
---|
224 | if (!cset->node->contains_marked_species()) insert_new_node = false; |
---|
225 | } |
---|
226 | |
---|
227 | if (cset->node->gb_node && (delete_old_nodes || insert_new_node)) { // There is already a node, delete old |
---|
228 | if (cset->node->name == 0) { |
---|
229 | GBDATA *gb_name = GB_entry(cset->node->gb_node, "group_name"); |
---|
230 | if (gb_name) { |
---|
231 | cset->node->name = GB_read_string(gb_name); |
---|
232 | } |
---|
233 | else { |
---|
234 | cset->node->name = strdup("<gb_node w/o name>"); |
---|
235 | } |
---|
236 | } |
---|
237 | |
---|
238 | old_group_name = strdup(cset->node->name); // store old_group_name to rename new inserted node |
---|
239 | |
---|
240 | error = GB_delete(cset->node->gb_node); |
---|
241 | if (!error) { |
---|
242 | cset->node->gb_node = 0; |
---|
243 | freenull(cset->node->name); |
---|
244 | } |
---|
245 | } |
---|
246 | |
---|
247 | if (!error) { |
---|
248 | if (insert_new_node) { |
---|
249 | cset->node->gb_node = GB_create_container(cset->node->get_tree_root()->get_gb_tree(), "node"); |
---|
250 | error = GB_copy(cset->node->gb_node, cset->best_node->gb_node); |
---|
251 | if (!error) { |
---|
252 | GBDATA *gb_name = GB_entry(cset->node->gb_node, "group_name"); |
---|
253 | nt_assert(gb_name); |
---|
254 | if (gb_name) { |
---|
255 | char *best_group_name = GB_read_string(gb_name); |
---|
256 | if (old_group_name) { |
---|
257 | if (!delete_old_nodes) { |
---|
258 | if (strcmp(old_group_name, best_group_name) != 0) { // old and new name differ |
---|
259 | char *new_group_name = GBS_global_string_copy("%s [was: %s]", best_group_name, old_group_name); |
---|
260 | GB_write_string(gb_name, new_group_name); |
---|
261 | if (log) fprintf(log, "Destination group '%s' overwritten by '%s'\n", old_group_name, new_group_name); |
---|
262 | free(new_group_name); |
---|
263 | } |
---|
264 | else { |
---|
265 | if (log) fprintf(log, "Group '%s' remains unchanged\n", old_group_name); |
---|
266 | } |
---|
267 | } |
---|
268 | else { |
---|
269 | if (log) { |
---|
270 | if (strcmp(old_group_name, best_group_name) == 0) { |
---|
271 | fprintf(log, "Group '%s' remains unchanged\n", old_group_name); |
---|
272 | } |
---|
273 | else { |
---|
274 | fprintf(log, "Destination group '%s' overwritten by '%s'\n", old_group_name, best_group_name); |
---|
275 | } |
---|
276 | } |
---|
277 | } |
---|
278 | } |
---|
279 | else { |
---|
280 | if (log) fprintf(log, "Group '%s' inserted\n", best_group_name); |
---|
281 | } |
---|
282 | free(best_group_name); |
---|
283 | } |
---|
284 | } |
---|
285 | } |
---|
286 | else { |
---|
287 | if (old_group_name && log) fprintf(log, "Destination group '%s' removed\n", old_group_name); |
---|
288 | } |
---|
289 | } |
---|
290 | free(old_group_name); |
---|
291 | } |
---|
292 | return error; |
---|
293 | } |
---|
294 | |
---|
295 | void AWT_species_set_root::finish(GB_ERROR& error) { |
---|
296 | if (!error) error = progress->error_if_aborted(); |
---|
297 | progress->done(); |
---|
298 | } |
---|
299 | |
---|
300 | GB_ERROR AWT_move_info(GBDATA *gb_main, const char *tree_source, const char *tree_dest, const char *log_file, TreeInfoMode mode, bool nodes_with_marked_only) { |
---|
301 | GB_ERROR error = 0; |
---|
302 | FILE *log = 0; |
---|
303 | |
---|
304 | nt_assert(contradicted(mode == TREE_INFO_COMPARE, log_file)); |
---|
305 | |
---|
306 | if (mode == TREE_INFO_COMPARE) { |
---|
307 | // info is written into 'tree_source' |
---|
308 | // (but we want to modify destination tree - like 'mode node info' does) |
---|
309 | std::swap(tree_source, tree_dest); |
---|
310 | } |
---|
311 | |
---|
312 | if (log_file) { |
---|
313 | nt_assert(mode == TREE_INFO_COPY || mode == TREE_INFO_ADD); |
---|
314 | log = fopen(log_file, "w"); |
---|
315 | |
---|
316 | fprintf(log, |
---|
317 | "LOGFILE: %s node info\n" |
---|
318 | "\n" |
---|
319 | " Source tree '%s'\n" |
---|
320 | "Destination tree '%s'\n" |
---|
321 | "\n", |
---|
322 | mode == TREE_INFO_COPY ? "Copying" : "Adding", |
---|
323 | tree_source, tree_dest); |
---|
324 | } |
---|
325 | |
---|
326 | GB_begin_transaction(gb_main); |
---|
327 | |
---|
328 | AP_tree_root rsource(new AliView(gb_main), new AP_TreeNodeFactory, NULL, false); |
---|
329 | AP_tree_root rdest (new AliView(gb_main), new AP_TreeNodeFactory, NULL, false); |
---|
330 | AP_tree_root& rsave = (mode == TREE_INFO_COMPARE) ? rsource : rdest; |
---|
331 | arb_progress progress("Comparing Topologies"); |
---|
332 | |
---|
333 | error = rsource.loadFromDB(tree_source); |
---|
334 | if (!error) error = rsource.linkToDB(0, 0); |
---|
335 | if (!error) { |
---|
336 | error = rdest.loadFromDB(tree_dest); |
---|
337 | if (!error) error = rdest.linkToDB(0, 0); |
---|
338 | if (!error) { |
---|
339 | AP_tree *source = rsource.get_root_node(); |
---|
340 | AP_tree *dest = rdest.get_root_node(); |
---|
341 | |
---|
342 | long nspecies = dest->count_leafs(); |
---|
343 | long source_leafs = source->count_leafs(); |
---|
344 | long source_nodes = leafs_2_nodes(source_leafs, ROOTED); |
---|
345 | |
---|
346 | arb_progress compare_progress(source_nodes); |
---|
347 | compare_progress.subtitle("Comparing both trees"); |
---|
348 | |
---|
349 | AWT_species_set_root *ssr = new AWT_species_set_root(gb_main, nspecies, &compare_progress); |
---|
350 | |
---|
351 | ssr->move_tree_2_ssr(dest); |
---|
352 | |
---|
353 | if (source_leafs < 3) error = GB_export_error("Destination tree has less than 3 species"); |
---|
354 | else { |
---|
355 | AWT_species_set *root_setl = ssr->find_best_matches_info(source->get_leftson(), log, mode == TREE_INFO_COMPARE); |
---|
356 | AWT_species_set *root_setr = root_setl ? ssr->find_best_matches_info(source->get_rightson(), log, mode == TREE_INFO_COMPARE) : NULL; |
---|
357 | |
---|
358 | if (root_setr) { |
---|
359 | if (mode != TREE_INFO_COMPARE) { |
---|
360 | compare_progress.subtitle("Copying node information"); |
---|
361 | ssr->copy_node_information(log, mode == TREE_INFO_COPY, nodes_with_marked_only); |
---|
362 | } |
---|
363 | |
---|
364 | long dummy = 0; |
---|
365 | AWT_species_set *new_root_setl = ssr->search_best_match(root_setl, dummy); |
---|
366 | AWT_species_set *new_root_setr = ssr->search_best_match(root_setr, dummy); |
---|
367 | AP_tree *new_rootl = new_root_setl->node; |
---|
368 | AP_tree *new_rootr = new_root_setr->node; |
---|
369 | |
---|
370 | new_rootl->set_root(); // set root correctly |
---|
371 | new_rootr->set_root(); // set root correctly |
---|
372 | |
---|
373 | compare_progress.subtitle("Saving trees"); |
---|
374 | |
---|
375 | AP_tree *saved_root = (mode == TREE_INFO_COMPARE) ? source : new_rootr->get_root_node(); |
---|
376 | error = GBT_overwrite_tree(rsave.get_gb_tree(), saved_root); |
---|
377 | |
---|
378 | if (!error) { |
---|
379 | char *entry; |
---|
380 | if (mode == TREE_INFO_COMPARE) { |
---|
381 | entry = GBS_global_string_copy("Compared topology with %s", tree_dest); |
---|
382 | } |
---|
383 | else { |
---|
384 | const char *copiedOrAdded = mode == TREE_INFO_COPY ? "Copied" : "Added"; |
---|
385 | |
---|
386 | entry = GBS_global_string_copy("%s node info %sfrom %s", |
---|
387 | copiedOrAdded, |
---|
388 | nodes_with_marked_only ? "of marked " : "", |
---|
389 | tree_source); |
---|
390 | } |
---|
391 | GBT_log_to_tree_remark(rsave.get_gb_tree(), entry); |
---|
392 | free(entry); |
---|
393 | } |
---|
394 | } |
---|
395 | |
---|
396 | delete root_setl; |
---|
397 | delete root_setr; |
---|
398 | } |
---|
399 | |
---|
400 | ssr->finish(error); |
---|
401 | delete ssr; |
---|
402 | } |
---|
403 | } |
---|
404 | |
---|
405 | if (log) { |
---|
406 | if (error) fprintf(log, "\nError: %s\n", error); // write error to log as well |
---|
407 | fclose(log); |
---|
408 | } |
---|
409 | |
---|
410 | return GB_end_transaction(gb_main, error); |
---|
411 | } |
---|
412 | |
---|