| 1 | // =============================================================== // |
|---|
| 2 | // // |
|---|
| 3 | // File : NT_tree_cmp.cxx // |
|---|
| 4 | // Purpose : // |
|---|
| 5 | // // |
|---|
| 6 | // Institute of Microbiology (Technical University Munich) // |
|---|
| 7 | // http://www.arb-home.de/ // |
|---|
| 8 | // // |
|---|
| 9 | // =============================================================== // |
|---|
| 10 | |
|---|
| 11 | #include "NT_species_set.h" |
|---|
| 12 | #include "NT_local.h" |
|---|
| 13 | #include <aw_msg.hxx> |
|---|
| 14 | #include <arb_progress.h> |
|---|
| 15 | #include <string> |
|---|
| 16 | #include <climits> |
|---|
| 17 | |
|---|
| 18 | using namespace std; |
|---|
| 19 | |
|---|
| 20 | AWT_species_set_root::AWT_species_set_root(GBDATA *gb_maini, long nspeciesi, arb_progress *progress_) { |
|---|
| 21 | memset((char *)this, 0, sizeof(*this)); |
|---|
| 22 | gb_main = gb_maini; |
|---|
| 23 | nspecies = nspeciesi; |
|---|
| 24 | progress = progress_; |
|---|
| 25 | sets = (AWT_species_set **)GB_calloc(sizeof(AWT_species_set *), (size_t)leafs_2_nodes(nspecies, ROOTED)); |
|---|
| 26 | |
|---|
| 27 | for (int i=0; i<256; i++) { |
|---|
| 28 | int j = i; |
|---|
| 29 | int count = 0; |
|---|
| 30 | while (j) { // count bits |
|---|
| 31 | if (j&1) count ++; |
|---|
| 32 | j = j>>1; |
|---|
| 33 | } |
|---|
| 34 | diff_bits[i] = count; |
|---|
| 35 | } |
|---|
| 36 | species_hash = GBS_create_hash(nspecies, GB_IGNORE_CASE); |
|---|
| 37 | species_counter = 1; |
|---|
| 38 | } |
|---|
| 39 | |
|---|
| 40 | AWT_species_set_root::~AWT_species_set_root() { |
|---|
| 41 | for (int i=0; i<nsets; i++) delete sets[i]; |
|---|
| 42 | free(sets); |
|---|
| 43 | GBS_free_hash(species_hash); |
|---|
| 44 | } |
|---|
| 45 | |
|---|
| 46 | void AWT_species_set_root::add(const char *species_name) { |
|---|
| 47 | if (GBS_read_hash(species_hash, species_name)) { |
|---|
| 48 | aw_message(GBS_global_string("Warning: Species '%s' is found more than once in tree", species_name)); |
|---|
| 49 | } |
|---|
| 50 | else { |
|---|
| 51 | GBS_write_hash(species_hash, species_name, species_counter++); |
|---|
| 52 | } |
|---|
| 53 | } |
|---|
| 54 | |
|---|
| 55 | void AWT_species_set_root::add(AWT_species_set *set) { |
|---|
| 56 | nt_assert(nsets<nspecies*2); |
|---|
| 57 | sets[nsets++] = set; |
|---|
| 58 | } |
|---|
| 59 | |
|---|
| 60 | AWT_species_set *AWT_species_set_root::search_best_match(const AWT_species_set *set, long& best_cost) { |
|---|
| 61 | // returns best match for 'set' |
|---|
| 62 | // sets 'best_cost' to minimum mismatches |
|---|
| 63 | |
|---|
| 64 | AWT_species_set *result = 0; |
|---|
| 65 | best_cost = LONG_MAX; |
|---|
| 66 | |
|---|
| 67 | unsigned char *sbs = set->bitstring; |
|---|
| 68 | for (long i=nsets-1; i>=0; i--) { |
|---|
| 69 | long sum = 0; |
|---|
| 70 | unsigned char *rsb = sets[i]->bitstring; |
|---|
| 71 | |
|---|
| 72 | for (long j=bitstring_bytes()-1; j>=0; j--) { |
|---|
| 73 | sum += diff_bits[ sbs[j] ^ rsb[j] ]; |
|---|
| 74 | } |
|---|
| 75 | if (sum > nspecies/2) sum = nspecies - sum; // take always the minimum |
|---|
| 76 | if (sum < best_cost) { |
|---|
| 77 | best_cost = sum; |
|---|
| 78 | result = sets[i]; |
|---|
| 79 | } |
|---|
| 80 | } |
|---|
| 81 | return result; |
|---|
| 82 | } |
|---|
| 83 | |
|---|
| 84 | int AWT_species_set_root::search_and_remember_best_match_and_log_errors(const AWT_species_set *set, FILE *log_file) { |
|---|
| 85 | // set's best_cost & best_node (of best match for 'set' found in other tree) |
|---|
| 86 | // returns the number of mismatches (plus a small penalty for missing species) |
|---|
| 87 | // |
|---|
| 88 | // When moving node info, 'set' represents a subtree of the source tree. |
|---|
| 89 | // When comparing topologies, 'set' represents a subtree of the destination tree. |
|---|
| 90 | |
|---|
| 91 | long net_cost; |
|---|
| 92 | AWT_species_set *bs = search_best_match(set, net_cost); |
|---|
| 93 | |
|---|
| 94 | double best_cost = net_cost + set->unfound_species_count * 0.0001; |
|---|
| 95 | if (best_cost < bs->best_cost) { |
|---|
| 96 | bs->best_cost = best_cost; |
|---|
| 97 | bs->best_node = set->node; |
|---|
| 98 | } |
|---|
| 99 | if (log_file) { |
|---|
| 100 | if (net_cost) { |
|---|
| 101 | fprintf(log_file, "Group '%s' not placed optimal (%li errors)\n", |
|---|
| 102 | set->node->name, |
|---|
| 103 | net_cost); |
|---|
| 104 | } |
|---|
| 105 | } |
|---|
| 106 | return net_cost; |
|---|
| 107 | } |
|---|
| 108 | |
|---|
| 109 | void AWT_species_set::init(AP_tree *nodei, const AWT_species_set_root *ssr) { |
|---|
| 110 | memset((char *)this, 0, sizeof(*this)); |
|---|
| 111 | |
|---|
| 112 | bitstring = (unsigned char *)GB_calloc(sizeof(char), ssr->bitstring_longs()*sizeof(long)); |
|---|
| 113 | this->node = nodei; |
|---|
| 114 | best_cost = 0x7fffffff; |
|---|
| 115 | } |
|---|
| 116 | |
|---|
| 117 | AWT_species_set::AWT_species_set(AP_tree *nodei, const AWT_species_set_root *ssr, const char *species_name) { |
|---|
| 118 | init(nodei, ssr); |
|---|
| 119 | |
|---|
| 120 | long species_index = GBS_read_hash(ssr->species_hash, species_name); |
|---|
| 121 | if (species_index) { |
|---|
| 122 | bitstring[species_index/8] |= 1 << (species_index % 8); |
|---|
| 123 | } |
|---|
| 124 | else { |
|---|
| 125 | unfound_species_count = 1; |
|---|
| 126 | } |
|---|
| 127 | } |
|---|
| 128 | |
|---|
| 129 | AWT_species_set::AWT_species_set(AP_tree *nodei, const AWT_species_set_root *ssr, const AWT_species_set *l, const AWT_species_set *r) { |
|---|
| 130 | init(nodei, ssr); |
|---|
| 131 | |
|---|
| 132 | const long *lbits = (const long *)l->bitstring; |
|---|
| 133 | const long *rbits = (const long *)r->bitstring; |
|---|
| 134 | long *dest = (long *)bitstring; |
|---|
| 135 | |
|---|
| 136 | for (long j = ssr->bitstring_longs()-1; j>=0; j--) { |
|---|
| 137 | dest[j] = lbits[j] | rbits[j]; |
|---|
| 138 | } |
|---|
| 139 | unfound_species_count = l->unfound_species_count + r->unfound_species_count; |
|---|
| 140 | } |
|---|
| 141 | |
|---|
| 142 | AWT_species_set::~AWT_species_set() { |
|---|
| 143 | free(bitstring); |
|---|
| 144 | } |
|---|
| 145 | |
|---|
| 146 | AWT_species_set *AWT_species_set_root::move_tree_2_ssr(AP_tree *node) { |
|---|
| 147 | AWT_species_set *ss; |
|---|
| 148 | // Warning: confusing resource handling: |
|---|
| 149 | // - leafs are returned "NOT owned by anybody" |
|---|
| 150 | // - inner nodes are added to and owned by this->sets |
|---|
| 151 | |
|---|
| 152 | if (node->is_leaf) { |
|---|
| 153 | this->add(node->name); |
|---|
| 154 | ss = new AWT_species_set(node, this, node->name); |
|---|
| 155 | nt_assert(ss->is_leaf_set()); |
|---|
| 156 | } |
|---|
| 157 | else { |
|---|
| 158 | AWT_species_set *ls = move_tree_2_ssr(node->get_leftson()); |
|---|
| 159 | AWT_species_set *rs = move_tree_2_ssr(node->get_rightson()); |
|---|
| 160 | |
|---|
| 161 | ss = new AWT_species_set(node, this, ls, rs); |
|---|
| 162 | this->add(ss); |
|---|
| 163 | nt_assert(!ss->is_leaf_set()); |
|---|
| 164 | |
|---|
| 165 | if (ls->is_leaf_set()) delete ls; |
|---|
| 166 | if (rs->is_leaf_set()) delete rs; |
|---|
| 167 | } |
|---|
| 168 | return ss; |
|---|
| 169 | } |
|---|
| 170 | |
|---|
| 171 | AWT_species_set *AWT_species_set_root::find_best_matches_info(AP_tree *node, FILE *log, bool compare_node_info) { |
|---|
| 172 | /* Go through all node of the source tree and search for the best |
|---|
| 173 | * matching node in dest_tree (meaning searching ssr->sets) |
|---|
| 174 | * If a match is found, set ssr->sets to this match. |
|---|
| 175 | */ |
|---|
| 176 | |
|---|
| 177 | AWT_species_set *ss = NULL; |
|---|
| 178 | if (node->is_leaf) { |
|---|
| 179 | ss = new AWT_species_set(node, this, node->name); |
|---|
| 180 | } |
|---|
| 181 | else { |
|---|
| 182 | AWT_species_set *ls = find_best_matches_info(node->get_leftson(), log, compare_node_info); |
|---|
| 183 | AWT_species_set *rs = ls ? find_best_matches_info(node->get_rightson(), log, compare_node_info) : NULL; |
|---|
| 184 | |
|---|
| 185 | if (rs) { |
|---|
| 186 | ss = new AWT_species_set(node, this, ls, rs); |
|---|
| 187 | if (compare_node_info) { |
|---|
| 188 | int mismatches = search_and_remember_best_match_and_log_errors(ss, log); |
|---|
| 189 | // the #-sign is important (otherwise TREE_write_Newick will not work correctly; interference with bootstrap values!) |
|---|
| 190 | char *new_remark = mismatches ? GBS_global_string_copy("# %i", mismatches) : NULL; |
|---|
| 191 | node->use_as_remark(new_remark); |
|---|
| 192 | } |
|---|
| 193 | else { |
|---|
| 194 | if (node->name) { |
|---|
| 195 | search_and_remember_best_match_and_log_errors(ss, log); |
|---|
| 196 | } |
|---|
| 197 | } |
|---|
| 198 | } |
|---|
| 199 | delete rs; |
|---|
| 200 | delete ls; |
|---|
| 201 | } |
|---|
| 202 | if (ss) { |
|---|
| 203 | progress->inc(); |
|---|
| 204 | if (progress->aborted()) { |
|---|
| 205 | delete ss; |
|---|
| 206 | ss = NULL; |
|---|
| 207 | } |
|---|
| 208 | } |
|---|
| 209 | return ss; |
|---|
| 210 | } |
|---|
| 211 | |
|---|
| 212 | GB_ERROR AWT_species_set_root::copy_node_information(FILE *log, bool delete_old_nodes, bool nodes_with_marked_only) { |
|---|
| 213 | GB_ERROR error = NULL; |
|---|
| 214 | |
|---|
| 215 | if (log) fputs("\nDetailed group changes:\n\n", log); |
|---|
| 216 | |
|---|
| 217 | for (long j=this->nsets-1; j>=0 && !error; j--) { |
|---|
| 218 | AWT_species_set *cset = this->sets[j]; |
|---|
| 219 | |
|---|
| 220 | char *old_group_name = NULL; |
|---|
| 221 | bool insert_new_node = cset->best_node && cset->best_node->name; |
|---|
| 222 | |
|---|
| 223 | if (nodes_with_marked_only && insert_new_node) { |
|---|
| 224 | if (!cset->node->contains_marked_species()) insert_new_node = false; |
|---|
| 225 | } |
|---|
| 226 | |
|---|
| 227 | if (cset->node->gb_node && (delete_old_nodes || insert_new_node)) { // There is already a node, delete old |
|---|
| 228 | if (cset->node->name == 0) { |
|---|
| 229 | GBDATA *gb_name = GB_entry(cset->node->gb_node, "group_name"); |
|---|
| 230 | if (gb_name) { |
|---|
| 231 | cset->node->name = GB_read_string(gb_name); |
|---|
| 232 | } |
|---|
| 233 | else { |
|---|
| 234 | cset->node->name = strdup("<gb_node w/o name>"); |
|---|
| 235 | } |
|---|
| 236 | } |
|---|
| 237 | |
|---|
| 238 | old_group_name = strdup(cset->node->name); // store old_group_name to rename new inserted node |
|---|
| 239 | |
|---|
| 240 | error = GB_delete(cset->node->gb_node); |
|---|
| 241 | if (!error) { |
|---|
| 242 | cset->node->gb_node = 0; |
|---|
| 243 | freenull(cset->node->name); |
|---|
| 244 | } |
|---|
| 245 | } |
|---|
| 246 | |
|---|
| 247 | if (!error) { |
|---|
| 248 | if (insert_new_node) { |
|---|
| 249 | cset->node->gb_node = GB_create_container(cset->node->get_tree_root()->get_gb_tree(), "node"); |
|---|
| 250 | error = GB_copy(cset->node->gb_node, cset->best_node->gb_node); |
|---|
| 251 | if (!error) { |
|---|
| 252 | GBDATA *gb_name = GB_entry(cset->node->gb_node, "group_name"); |
|---|
| 253 | nt_assert(gb_name); |
|---|
| 254 | if (gb_name) { |
|---|
| 255 | char *best_group_name = GB_read_string(gb_name); |
|---|
| 256 | if (old_group_name) { |
|---|
| 257 | if (!delete_old_nodes) { |
|---|
| 258 | if (strcmp(old_group_name, best_group_name) != 0) { // old and new name differ |
|---|
| 259 | char *new_group_name = GBS_global_string_copy("%s [was: %s]", best_group_name, old_group_name); |
|---|
| 260 | GB_write_string(gb_name, new_group_name); |
|---|
| 261 | if (log) fprintf(log, "Destination group '%s' overwritten by '%s'\n", old_group_name, new_group_name); |
|---|
| 262 | free(new_group_name); |
|---|
| 263 | } |
|---|
| 264 | else { |
|---|
| 265 | if (log) fprintf(log, "Group '%s' remains unchanged\n", old_group_name); |
|---|
| 266 | } |
|---|
| 267 | } |
|---|
| 268 | else { |
|---|
| 269 | if (log) { |
|---|
| 270 | if (strcmp(old_group_name, best_group_name) == 0) { |
|---|
| 271 | fprintf(log, "Group '%s' remains unchanged\n", old_group_name); |
|---|
| 272 | } |
|---|
| 273 | else { |
|---|
| 274 | fprintf(log, "Destination group '%s' overwritten by '%s'\n", old_group_name, best_group_name); |
|---|
| 275 | } |
|---|
| 276 | } |
|---|
| 277 | } |
|---|
| 278 | } |
|---|
| 279 | else { |
|---|
| 280 | if (log) fprintf(log, "Group '%s' inserted\n", best_group_name); |
|---|
| 281 | } |
|---|
| 282 | free(best_group_name); |
|---|
| 283 | } |
|---|
| 284 | } |
|---|
| 285 | } |
|---|
| 286 | else { |
|---|
| 287 | if (old_group_name && log) fprintf(log, "Destination group '%s' removed\n", old_group_name); |
|---|
| 288 | } |
|---|
| 289 | } |
|---|
| 290 | free(old_group_name); |
|---|
| 291 | } |
|---|
| 292 | return error; |
|---|
| 293 | } |
|---|
| 294 | |
|---|
| 295 | void AWT_species_set_root::finish(GB_ERROR& error) { |
|---|
| 296 | if (!error) error = progress->error_if_aborted(); |
|---|
| 297 | progress->done(); |
|---|
| 298 | } |
|---|
| 299 | |
|---|
| 300 | GB_ERROR AWT_move_info(GBDATA *gb_main, const char *tree_source, const char *tree_dest, const char *log_file, TreeInfoMode mode, bool nodes_with_marked_only) { |
|---|
| 301 | GB_ERROR error = 0; |
|---|
| 302 | FILE *log = 0; |
|---|
| 303 | |
|---|
| 304 | nt_assert(contradicted(mode == TREE_INFO_COMPARE, log_file)); |
|---|
| 305 | |
|---|
| 306 | if (mode == TREE_INFO_COMPARE) { |
|---|
| 307 | // info is written into 'tree_source' |
|---|
| 308 | // (but we want to modify destination tree - like 'mode node info' does) |
|---|
| 309 | std::swap(tree_source, tree_dest); |
|---|
| 310 | } |
|---|
| 311 | |
|---|
| 312 | if (log_file) { |
|---|
| 313 | nt_assert(mode == TREE_INFO_COPY || mode == TREE_INFO_ADD); |
|---|
| 314 | log = fopen(log_file, "w"); |
|---|
| 315 | |
|---|
| 316 | fprintf(log, |
|---|
| 317 | "LOGFILE: %s node info\n" |
|---|
| 318 | "\n" |
|---|
| 319 | " Source tree '%s'\n" |
|---|
| 320 | "Destination tree '%s'\n" |
|---|
| 321 | "\n", |
|---|
| 322 | mode == TREE_INFO_COPY ? "Copying" : "Adding", |
|---|
| 323 | tree_source, tree_dest); |
|---|
| 324 | } |
|---|
| 325 | |
|---|
| 326 | GB_begin_transaction(gb_main); |
|---|
| 327 | |
|---|
| 328 | AP_tree_root rsource(new AliView(gb_main), new AP_TreeNodeFactory, NULL, false); |
|---|
| 329 | AP_tree_root rdest (new AliView(gb_main), new AP_TreeNodeFactory, NULL, false); |
|---|
| 330 | AP_tree_root& rsave = (mode == TREE_INFO_COMPARE) ? rsource : rdest; |
|---|
| 331 | arb_progress progress("Comparing Topologies"); |
|---|
| 332 | |
|---|
| 333 | error = rsource.loadFromDB(tree_source); |
|---|
| 334 | if (!error) error = rsource.linkToDB(0, 0); |
|---|
| 335 | if (!error) { |
|---|
| 336 | error = rdest.loadFromDB(tree_dest); |
|---|
| 337 | if (!error) error = rdest.linkToDB(0, 0); |
|---|
| 338 | if (!error) { |
|---|
| 339 | AP_tree *source = rsource.get_root_node(); |
|---|
| 340 | AP_tree *dest = rdest.get_root_node(); |
|---|
| 341 | |
|---|
| 342 | long nspecies = dest->count_leafs(); |
|---|
| 343 | long source_leafs = source->count_leafs(); |
|---|
| 344 | long source_nodes = leafs_2_nodes(source_leafs, ROOTED); |
|---|
| 345 | |
|---|
| 346 | arb_progress compare_progress(source_nodes); |
|---|
| 347 | compare_progress.subtitle("Comparing both trees"); |
|---|
| 348 | |
|---|
| 349 | AWT_species_set_root *ssr = new AWT_species_set_root(gb_main, nspecies, &compare_progress); |
|---|
| 350 | |
|---|
| 351 | ssr->move_tree_2_ssr(dest); |
|---|
| 352 | |
|---|
| 353 | if (source_leafs < 3) error = GB_export_error("Destination tree has less than 3 species"); |
|---|
| 354 | else { |
|---|
| 355 | AWT_species_set *root_setl = ssr->find_best_matches_info(source->get_leftson(), log, mode == TREE_INFO_COMPARE); |
|---|
| 356 | AWT_species_set *root_setr = root_setl ? ssr->find_best_matches_info(source->get_rightson(), log, mode == TREE_INFO_COMPARE) : NULL; |
|---|
| 357 | |
|---|
| 358 | if (root_setr) { |
|---|
| 359 | if (mode != TREE_INFO_COMPARE) { |
|---|
| 360 | compare_progress.subtitle("Copying node information"); |
|---|
| 361 | ssr->copy_node_information(log, mode == TREE_INFO_COPY, nodes_with_marked_only); |
|---|
| 362 | } |
|---|
| 363 | |
|---|
| 364 | long dummy = 0; |
|---|
| 365 | AWT_species_set *new_root_setl = ssr->search_best_match(root_setl, dummy); |
|---|
| 366 | AWT_species_set *new_root_setr = ssr->search_best_match(root_setr, dummy); |
|---|
| 367 | AP_tree *new_rootl = new_root_setl->node; |
|---|
| 368 | AP_tree *new_rootr = new_root_setr->node; |
|---|
| 369 | |
|---|
| 370 | new_rootl->set_root(); // set root correctly |
|---|
| 371 | new_rootr->set_root(); // set root correctly |
|---|
| 372 | |
|---|
| 373 | compare_progress.subtitle("Saving trees"); |
|---|
| 374 | |
|---|
| 375 | AP_tree *saved_root = (mode == TREE_INFO_COMPARE) ? source : new_rootr->get_root_node(); |
|---|
| 376 | error = GBT_overwrite_tree(rsave.get_gb_tree(), saved_root); |
|---|
| 377 | |
|---|
| 378 | if (!error) { |
|---|
| 379 | char *entry; |
|---|
| 380 | if (mode == TREE_INFO_COMPARE) { |
|---|
| 381 | entry = GBS_global_string_copy("Compared topology with %s", tree_dest); |
|---|
| 382 | } |
|---|
| 383 | else { |
|---|
| 384 | const char *copiedOrAdded = mode == TREE_INFO_COPY ? "Copied" : "Added"; |
|---|
| 385 | |
|---|
| 386 | entry = GBS_global_string_copy("%s node info %sfrom %s", |
|---|
| 387 | copiedOrAdded, |
|---|
| 388 | nodes_with_marked_only ? "of marked " : "", |
|---|
| 389 | tree_source); |
|---|
| 390 | } |
|---|
| 391 | GBT_log_to_tree_remark(rsave.get_gb_tree(), entry); |
|---|
| 392 | free(entry); |
|---|
| 393 | } |
|---|
| 394 | } |
|---|
| 395 | |
|---|
| 396 | delete root_setl; |
|---|
| 397 | delete root_setr; |
|---|
| 398 | } |
|---|
| 399 | |
|---|
| 400 | ssr->finish(error); |
|---|
| 401 | delete ssr; |
|---|
| 402 | } |
|---|
| 403 | } |
|---|
| 404 | |
|---|
| 405 | if (log) { |
|---|
| 406 | if (error) fprintf(log, "\nError: %s\n", error); // write error to log as well |
|---|
| 407 | fclose(log); |
|---|
| 408 | } |
|---|
| 409 | |
|---|
| 410 | return GB_end_transaction(gb_main, error); |
|---|
| 411 | } |
|---|
| 412 | |
|---|