1 | // =============================================================== // |
---|
2 | // // |
---|
3 | // File : NT_tree_cmp.cxx // |
---|
4 | // Purpose : // |
---|
5 | // // |
---|
6 | // Institute of Microbiology (Technical University Munich) // |
---|
7 | // http://www.arb-home.de/ // |
---|
8 | // // |
---|
9 | // =============================================================== // |
---|
10 | |
---|
11 | #include "NT_tree_cmp.h" |
---|
12 | #include "NT_local.h" |
---|
13 | |
---|
14 | #include <AP_Tree.hxx> |
---|
15 | #include <aw_msg.hxx> |
---|
16 | #include <arb_progress.h> |
---|
17 | #include <string> |
---|
18 | |
---|
19 | using namespace std; |
---|
20 | |
---|
21 | AWT_species_set_root::AWT_species_set_root(GBDATA *gb_maini, long nspeciesi, arb_progress *progress_) { |
---|
22 | memset((char *)this, 0, sizeof(*this)); |
---|
23 | gb_main = gb_maini; |
---|
24 | nspecies = nspeciesi; |
---|
25 | progress = progress_; |
---|
26 | sets = (AWT_species_set **)GB_calloc(sizeof(AWT_species_set *), (size_t)(nspecies*2+1)); |
---|
27 | |
---|
28 | int i; |
---|
29 | for (i=0; i<256; i++) { |
---|
30 | int j = i; |
---|
31 | int count = 0; |
---|
32 | while (j) { // count bits |
---|
33 | if (j&1) count ++; |
---|
34 | j = j>>1; |
---|
35 | } |
---|
36 | diff_bits[i] = count; |
---|
37 | } |
---|
38 | species_hash = GBS_create_hash(nspecies, GB_IGNORE_CASE); |
---|
39 | species_counter = 1; |
---|
40 | } |
---|
41 | |
---|
42 | AWT_species_set_root::~AWT_species_set_root() { |
---|
43 | int i; |
---|
44 | for (i=0; i<nsets; i++) { |
---|
45 | delete sets[i]; |
---|
46 | } |
---|
47 | delete sets; |
---|
48 | } |
---|
49 | |
---|
50 | void AWT_species_set_root::add(const char *species_name) { |
---|
51 | if (GBS_read_hash(species_hash, species_name)) { |
---|
52 | aw_message(GBS_global_string("Warning: Species '%s' is found more than once in tree", species_name)); |
---|
53 | } |
---|
54 | else { |
---|
55 | GBS_write_hash(species_hash, species_name, species_counter++); |
---|
56 | } |
---|
57 | } |
---|
58 | |
---|
59 | void AWT_species_set_root::add(AWT_species_set *set) { |
---|
60 | nt_assert(nsets<nspecies*2); |
---|
61 | sets[nsets++] = set; |
---|
62 | } |
---|
63 | |
---|
64 | AWT_species_set *AWT_species_set_root::search(AWT_species_set *set, long *best_co) { |
---|
65 | AWT_species_set *result = 0; |
---|
66 | long i; |
---|
67 | long best_cost = 0x7fffffff; |
---|
68 | unsigned char *sbs = set->bitstring; |
---|
69 | for (i=nsets-1; i>=0; i--) { |
---|
70 | long j; |
---|
71 | long sum = 0; |
---|
72 | unsigned char *rsb = sets[i]->bitstring; |
---|
73 | for (j=nspecies/8 -1 + 1; j>=0; j--) { |
---|
74 | sum += diff_bits[(sbs[j]) ^ (rsb[j])]; |
---|
75 | } |
---|
76 | if (sum > nspecies/2) sum = nspecies - sum; // take always the minimum |
---|
77 | if (sum < best_cost) { |
---|
78 | best_cost = sum; |
---|
79 | result = sets[i]; |
---|
80 | } |
---|
81 | } |
---|
82 | *best_co = best_cost; |
---|
83 | return result; |
---|
84 | } |
---|
85 | |
---|
86 | int AWT_species_set_root::search(AWT_species_set *set, FILE *log_file) { |
---|
87 | long net_cost; |
---|
88 | double best_cost; |
---|
89 | AWT_species_set *bs = this->search(set, &net_cost); |
---|
90 | best_cost = net_cost + set->unfound_species_count * 0.0001; |
---|
91 | if (best_cost < bs->best_cost) { |
---|
92 | bs->best_cost = best_cost; |
---|
93 | bs->best_node = set->node; |
---|
94 | } |
---|
95 | if (log_file) { |
---|
96 | if (net_cost) { |
---|
97 | fprintf(log_file, "Node '%s' placed not optimal, %li errors\n", |
---|
98 | set->node->name, |
---|
99 | net_cost); |
---|
100 | } |
---|
101 | } |
---|
102 | return net_cost; |
---|
103 | } |
---|
104 | |
---|
105 | AWT_species_set::AWT_species_set(AP_tree *nodei, AWT_species_set_root *ssr, char *species_name) { |
---|
106 | memset((char *)this, 0, sizeof(*this)); |
---|
107 | bitstring = (unsigned char *)GB_calloc(sizeof(char), size_t(ssr->nspecies/8)+sizeof(long)+1); |
---|
108 | long species_index = GBS_read_hash(ssr->species_hash, species_name); |
---|
109 | if (species_index) { |
---|
110 | bitstring[species_index/8] |= 1 << (species_index % 8); |
---|
111 | } |
---|
112 | else { |
---|
113 | unfound_species_count = 1; |
---|
114 | } |
---|
115 | this->node = nodei; |
---|
116 | best_cost = 0x7fffffff; |
---|
117 | } |
---|
118 | |
---|
119 | AWT_species_set::AWT_species_set(AP_tree *nodei, AWT_species_set_root *ssr, AWT_species_set *l, AWT_species_set *r) { |
---|
120 | memset((char *)this, 0, sizeof(*this)); |
---|
121 | this->node = node; |
---|
122 | long j = ssr->nspecies/8+1; // @@@ unused |
---|
123 | bitstring = (unsigned char *)GB_calloc(sizeof(char), size_t(ssr->nspecies/8)+5); |
---|
124 | long *lbits = (long *)l->bitstring; |
---|
125 | long *rbits = (long *)r->bitstring; |
---|
126 | long *dest = (long *)bitstring; |
---|
127 | for (j = ssr->nspecies/8/sizeof(long) - 1 + 1; j>=0; j--) { |
---|
128 | dest[j] = lbits[j] | rbits[j]; |
---|
129 | } |
---|
130 | unfound_species_count = l->unfound_species_count + r->unfound_species_count; |
---|
131 | this->node = nodei; |
---|
132 | best_cost = 0x7fffffff; |
---|
133 | } |
---|
134 | |
---|
135 | AWT_species_set::~AWT_species_set() { |
---|
136 | free(bitstring); |
---|
137 | } |
---|
138 | |
---|
139 | AWT_species_set *AWT_species_set_root::move_tree_2_ssr(AP_tree *node) { |
---|
140 | AWT_species_set *ss; |
---|
141 | if (node->is_leaf) { |
---|
142 | this->add(node->name); |
---|
143 | ss = new AWT_species_set(node, this, node->name); |
---|
144 | } |
---|
145 | else { |
---|
146 | AWT_species_set *ls = move_tree_2_ssr(node->get_leftson()); |
---|
147 | AWT_species_set *rs = move_tree_2_ssr(node->get_rightson()); |
---|
148 | ss = new AWT_species_set(node, this, ls, rs); |
---|
149 | this->add(ss); |
---|
150 | } |
---|
151 | return ss; |
---|
152 | } |
---|
153 | |
---|
154 | AWT_species_set *AWT_species_set_root::find_best_matches_info(AP_tree *tree_source, FILE *log, bool compare_node_info) { |
---|
155 | /* Go through all node of the source tree and search for the best |
---|
156 | * matching node in dest_tree (meaning searching ssr->sets) |
---|
157 | * If a match is found, set ssr->sets to this match) |
---|
158 | */ |
---|
159 | AWT_species_set *ss = NULL; |
---|
160 | if (tree_source->is_leaf) { |
---|
161 | ss = new AWT_species_set(tree_source, this, tree_source->name); |
---|
162 | } |
---|
163 | else { |
---|
164 | AWT_species_set *ls = find_best_matches_info(tree_source->get_leftson(), log, compare_node_info); |
---|
165 | AWT_species_set *rs = ls ? find_best_matches_info(tree_source->get_rightson(), log, compare_node_info) : NULL; |
---|
166 | |
---|
167 | if (rs) { |
---|
168 | ss = new AWT_species_set(tree_source, this, ls, rs); // Generate new bitstring |
---|
169 | if (compare_node_info) { |
---|
170 | int mismatches = this->search(ss, log); // Search optimal position |
---|
171 | delete ss->node->remark_branch; |
---|
172 | ss->node->remark_branch = 0; |
---|
173 | if (mismatches) { |
---|
174 | char remark[20]; |
---|
175 | sprintf(remark, "# %i", mismatches); // the #-sign is important (otherwise TREE_write_Newick will not work correctly) |
---|
176 | ss->node->remark_branch = strdup(remark); |
---|
177 | } |
---|
178 | } |
---|
179 | else { |
---|
180 | if (tree_source->name) { |
---|
181 | this->search(ss, log); // Search optimal position |
---|
182 | } |
---|
183 | } |
---|
184 | } |
---|
185 | delete rs; |
---|
186 | delete ls; |
---|
187 | } |
---|
188 | if (ss) { |
---|
189 | progress->inc(); |
---|
190 | if (progress->aborted()) { |
---|
191 | delete ss; |
---|
192 | ss = NULL; |
---|
193 | } |
---|
194 | } |
---|
195 | return ss; // return bitstring for this node |
---|
196 | } |
---|
197 | |
---|
198 | GB_ERROR AWT_species_set_root::copy_node_information(FILE *log, bool delete_old_nodes, bool nodes_with_marked_only) { |
---|
199 | GB_ERROR error = 0; |
---|
200 | long j; |
---|
201 | |
---|
202 | for (j=this->nsets-1; j>=0; j--) { |
---|
203 | AWT_species_set *cset = this->sets[j]; |
---|
204 | char *old_group_name = 0; |
---|
205 | bool insert_new_node = cset->best_node && cset->best_node->name; |
---|
206 | |
---|
207 | if (nodes_with_marked_only && insert_new_node) { |
---|
208 | if (!cset->node->contains_marked_species()) insert_new_node = false; |
---|
209 | } |
---|
210 | |
---|
211 | if (cset->node->gb_node && (delete_old_nodes || insert_new_node)) { // There is already a node, delete old |
---|
212 | if (cset->node->name == 0) { |
---|
213 | GBDATA *gb_name = GB_entry(cset->node->gb_node, "group_name"); |
---|
214 | if (gb_name) { |
---|
215 | cset->node->name = GB_read_string(gb_name); |
---|
216 | } |
---|
217 | else { |
---|
218 | cset->node->name = strdup("<gb_node w/o name>"); |
---|
219 | } |
---|
220 | } |
---|
221 | |
---|
222 | old_group_name = strdup(cset->node->name); // store old_group_name to rename new inserted node |
---|
223 | #if defined(DEBUG) |
---|
224 | printf("delete node '%s'\n", cset->node->name); |
---|
225 | #endif // DEBUG |
---|
226 | |
---|
227 | error = GB_delete(cset->node->gb_node); |
---|
228 | if (error) break; |
---|
229 | |
---|
230 | if (log) fprintf(log, "Destination Tree not empty, destination node '%s' deleted\n", old_group_name); |
---|
231 | cset->node->gb_node = 0; |
---|
232 | cset->node->name = 0; |
---|
233 | } |
---|
234 | if (insert_new_node) { |
---|
235 | cset->node->gb_node = GB_create_container(cset->node->get_tree_root()->get_gb_tree(), "node"); |
---|
236 | error = GB_copy(cset->node->gb_node, cset->best_node->gb_node); |
---|
237 | if (error) break; |
---|
238 | |
---|
239 | GB_dump(cset->node->gb_node); |
---|
240 | |
---|
241 | GBDATA *gb_name = GB_entry(cset->node->gb_node, "group_name"); |
---|
242 | nt_assert(gb_name); |
---|
243 | if (gb_name) { |
---|
244 | char *name = GB_read_string(gb_name); |
---|
245 | |
---|
246 | if (old_group_name && strcmp(old_group_name, name)!=0 && !delete_old_nodes) { |
---|
247 | // there was a group with a different name and we are adding nodes |
---|
248 | string new_group_name = (string)name+" [was: "+old_group_name+"]"; |
---|
249 | GB_write_string(gb_name, new_group_name.c_str()); |
---|
250 | delete name; name = GB_read_string(gb_name); |
---|
251 | } |
---|
252 | #if defined(DEBUG) |
---|
253 | printf("insert node '%s'\n", name); |
---|
254 | #endif // DEBUG |
---|
255 | free(name); |
---|
256 | } |
---|
257 | } |
---|
258 | free(old_group_name); |
---|
259 | } |
---|
260 | |
---|
261 | return error; |
---|
262 | } |
---|
263 | |
---|
264 | void AWT_species_set_root::finish(GB_ERROR& error) { |
---|
265 | if (!error) error = progress->error_if_aborted(); |
---|
266 | progress->done(); |
---|
267 | } |
---|
268 | |
---|
269 | void AWT_move_info(GBDATA *gb_main, const char *tree_source, const char *tree_dest, const char *log_file, bool compare_node_info, bool delete_old_nodes, bool nodes_with_marked_only) { |
---|
270 | GB_ERROR error = 0; |
---|
271 | FILE *log = 0; |
---|
272 | |
---|
273 | if (log_file) { |
---|
274 | log = fopen(log_file, "w"); |
---|
275 | fprintf(log, |
---|
276 | "LOGFILE: %s node info\n" |
---|
277 | "\n" |
---|
278 | " Source tree '%s'\n" |
---|
279 | "Destination tree '%s'\n" |
---|
280 | "\n", |
---|
281 | delete_old_nodes ? "Moving" : "Adding", |
---|
282 | tree_source, tree_dest); |
---|
283 | } |
---|
284 | |
---|
285 | GB_begin_transaction(gb_main); |
---|
286 | |
---|
287 | AP_tree_root rsource(new AliView(gb_main), AP_tree(0), NULL, false); |
---|
288 | AP_tree_root rdest (new AliView(gb_main), AP_tree(0), NULL, false); |
---|
289 | |
---|
290 | arb_progress progress("Comparing Topologies"); |
---|
291 | |
---|
292 | error = rsource.loadFromDB(tree_source); |
---|
293 | if (!error) error = rsource.linkToDB(0, 0); |
---|
294 | if (!error) { |
---|
295 | error = rdest.loadFromDB(tree_dest); |
---|
296 | if (!error) error = rdest.linkToDB(0, 0); |
---|
297 | if (!error) { |
---|
298 | AP_tree *source = rsource.get_root_node(); |
---|
299 | AP_tree *dest = rdest.get_root_node(); |
---|
300 | |
---|
301 | long nspecies = dest->arb_tree_leafsum2(); |
---|
302 | long source_leafs = source->arb_tree_leafsum2(); |
---|
303 | long source_nodes = source_leafs*2-1; |
---|
304 | |
---|
305 | arb_progress compare_progress(source_nodes); |
---|
306 | compare_progress.subtitle("Comparing both trees"); |
---|
307 | |
---|
308 | AWT_species_set_root *ssr = new AWT_species_set_root(gb_main, nspecies, &compare_progress); |
---|
309 | |
---|
310 | ssr->move_tree_2_ssr(dest); |
---|
311 | |
---|
312 | if (source_leafs < 3) error = GB_export_error("Destination tree has less than 3 species"); |
---|
313 | else { |
---|
314 | AWT_species_set *root_setl = ssr->find_best_matches_info(source->get_leftson(), log, compare_node_info); |
---|
315 | AWT_species_set *root_setr = root_setl ? ssr->find_best_matches_info(source->get_rightson(), log, compare_node_info) : NULL; |
---|
316 | |
---|
317 | if (root_setr) { |
---|
318 | if (!compare_node_info) { |
---|
319 | compare_progress.subtitle("Copying node information"); |
---|
320 | ssr->copy_node_information(log, delete_old_nodes, nodes_with_marked_only); |
---|
321 | } |
---|
322 | |
---|
323 | long dummy = 0; |
---|
324 | AWT_species_set *new_root_setl = ssr->search(root_setl, &dummy); |
---|
325 | AWT_species_set *new_root_setr = ssr->search(root_setr, &dummy); |
---|
326 | AP_tree *new_rootl = (AP_tree *) new_root_setl->node; |
---|
327 | AP_tree *new_rootr = (AP_tree *) new_root_setr->node; |
---|
328 | |
---|
329 | new_rootl->set_root(); // set root correctly |
---|
330 | new_rootr->set_root(); // set root correctly |
---|
331 | |
---|
332 | compare_progress.subtitle("Saving trees"); |
---|
333 | |
---|
334 | AP_tree *root = new_rootr->get_root_node(); |
---|
335 | |
---|
336 | error = GBT_write_tree(gb_main, rdest.get_gb_tree(), 0, root->get_gbt_tree()); |
---|
337 | if (!error) error = GBT_write_tree(gb_main, rsource.get_gb_tree(), 0, source->get_gbt_tree()); |
---|
338 | } |
---|
339 | |
---|
340 | delete root_setl; |
---|
341 | delete root_setr; |
---|
342 | } |
---|
343 | |
---|
344 | ssr->finish(error); |
---|
345 | delete ssr; |
---|
346 | } |
---|
347 | } |
---|
348 | |
---|
349 | if (log) { |
---|
350 | if (error) fprintf(log, "\nError: %s\n", error); // write error to log as well |
---|
351 | fclose(log); |
---|
352 | } |
---|
353 | |
---|
354 | GB_end_transaction_show_error(gb_main, error, aw_message); |
---|
355 | } |
---|
356 | |
---|