| 1 | #include <stdio.h> |
|---|
| 2 | |
|---|
| 3 | #include <xml.hxx> |
|---|
| 4 | |
|---|
| 5 | #include <TreeRead.h> |
|---|
| 6 | #include <TreeWrite.h> |
|---|
| 7 | |
|---|
| 8 | using namespace std; |
|---|
| 9 | |
|---|
| 10 | #define tree_assert(cond) arb_assert(cond) |
|---|
| 11 | |
|---|
| 12 | static void export_tree_label(const char *label, FILE *out, TREE_node_quoting qmode) { |
|---|
| 13 | // writes a label into the Newick file |
|---|
| 14 | // label is quoted if necessary |
|---|
| 15 | // label may be an internal_node_label, a leaf_label or a root_label |
|---|
| 16 | tree_assert(label); |
|---|
| 17 | |
|---|
| 18 | const char *disallowed_chars = " \t\'\"()[]:;,"; // '(' is first problem_char |
|---|
| 19 | const char *problem_chars = disallowed_chars+4; |
|---|
| 20 | tree_assert(problem_chars[0] == '('); |
|---|
| 21 | |
|---|
| 22 | bool need_quotes = strpbrk(label, disallowed_chars) != NULL; |
|---|
| 23 | char used_quote = 0; |
|---|
| 24 | |
|---|
| 25 | if ((qmode & TREE_FORCE_QUOTES) || need_quotes) { |
|---|
| 26 | if (qmode&TREE_SINGLE_QUOTES) used_quote = '\''; |
|---|
| 27 | else if (qmode&TREE_DOUBLE_QUOTES) used_quote = '\"'; |
|---|
| 28 | } |
|---|
| 29 | |
|---|
| 30 | if (used_quote) { |
|---|
| 31 | bool force_replace = (qmode & TREE_FORCE_REPLACE); |
|---|
| 32 | |
|---|
| 33 | fputc(used_quote, out); |
|---|
| 34 | while (*label) { |
|---|
| 35 | char c = *label++; |
|---|
| 36 | if (c == used_quote || // replace used quote by an '_' if it appears inside label |
|---|
| 37 | (force_replace && strchr(problem_chars, c))) // replace all problematic characters if requested |
|---|
| 38 | { |
|---|
| 39 | c = '_'; |
|---|
| 40 | } |
|---|
| 41 | fputc(c, out); |
|---|
| 42 | } |
|---|
| 43 | fputc(used_quote, out); |
|---|
| 44 | } |
|---|
| 45 | else { |
|---|
| 46 | // unquoted label - always replace all problematic characters by '_' |
|---|
| 47 | for (int i = 0; label[i]; ++i) { |
|---|
| 48 | fputc(strchr(disallowed_chars, label[i]) ? '_' : label[i], out); |
|---|
| 49 | } |
|---|
| 50 | } |
|---|
| 51 | } |
|---|
| 52 | |
|---|
| 53 | |
|---|
| 54 | |
|---|
| 55 | // documentation of the Newick Format is in ../SOURCE_TOOLS/docs/newick_doc.html |
|---|
| 56 | |
|---|
inline void indentTo(int indent, FILE *out) {
    // writes two blanks per indentation level (nothing if 'indent' <= 0)
    for (int level = indent; level > 0; --level) {
        fputs("  ", out);
    }
}
|---|
| 63 | |
|---|
| 64 | static const char *export_tree_node_print(GBDATA *gb_main, FILE *out, GBT_TREE *tree, const char *tree_name, |
|---|
| 65 | bool pretty, int indent, |
|---|
| 66 | const TREE_node_text_gen *node_gen, bool save_branchlengths, |
|---|
| 67 | bool save_bootstraps, bool save_groupnames, TREE_node_quoting qmode) |
|---|
| 68 | { |
|---|
| 69 | const char *error = 0; |
|---|
| 70 | const char *buf; |
|---|
| 71 | |
|---|
| 72 | if (pretty) indentTo(indent, out); |
|---|
| 73 | |
|---|
| 74 | if (tree->is_leaf) { |
|---|
| 75 | if (node_gen) buf = node_gen->gen(gb_main, tree->gb_node,0,tree, tree_name); |
|---|
| 76 | else buf = tree->name; |
|---|
| 77 | |
|---|
| 78 | export_tree_label(buf, out, qmode); |
|---|
| 79 | } |
|---|
| 80 | else { |
|---|
| 81 | if (pretty) fputs("(\n", out); |
|---|
| 82 | else putc('(', out); |
|---|
| 83 | |
|---|
| 84 | error = export_tree_node_print(gb_main, out, tree->leftson, tree_name, pretty, indent+1, node_gen, save_branchlengths, save_bootstraps, save_groupnames, qmode); |
|---|
| 85 | if (save_branchlengths) fprintf(out, ":%.5f", tree->leftlen); |
|---|
| 86 | fputs(",\n", out); |
|---|
| 87 | |
|---|
| 88 | if (error) return error; |
|---|
| 89 | |
|---|
| 90 | error = export_tree_node_print(gb_main, out, tree->rightson, tree_name, pretty, indent+1, node_gen, save_branchlengths, save_bootstraps, save_groupnames, qmode); |
|---|
| 91 | if (save_branchlengths) fprintf(out, ":%.5f", tree->rightlen); |
|---|
| 92 | fputc('\n', out); |
|---|
| 93 | |
|---|
| 94 | if (pretty) indentTo(indent, out); |
|---|
| 95 | fputc(')', out); |
|---|
| 96 | |
|---|
| 97 | buf = 0; |
|---|
| 98 | char *bootstrap = 0; |
|---|
| 99 | |
|---|
| 100 | if (tree->remark_branch && save_bootstraps) { |
|---|
| 101 | const char *boot = tree->remark_branch; |
|---|
| 102 | if (boot[strlen(boot)-1] == '%') { // does remark_branch contain a bootstrap value ? |
|---|
| 103 | char *end = 0; |
|---|
| 104 | double val = strtod(boot, &end); |
|---|
| 105 | tree_assert(end[0] == '%'); // otherwise sth strange is contained in remark_branch |
|---|
| 106 | |
|---|
| 107 | boot = GBS_global_string("%i", int(val+0.5)); |
|---|
| 108 | } |
|---|
| 109 | bootstrap = strdup(boot); |
|---|
| 110 | } |
|---|
| 111 | |
|---|
| 112 | if (tree->name && save_groupnames) buf = tree->name; |
|---|
| 113 | |
|---|
| 114 | const char *print = 0; |
|---|
| 115 | if (buf) { |
|---|
| 116 | if (bootstrap) print = GBS_global_string("%s:%s", bootstrap, buf); |
|---|
| 117 | else print = buf; |
|---|
| 118 | } |
|---|
| 119 | else if (bootstrap) print = bootstrap; |
|---|
| 120 | |
|---|
| 121 | if (print) export_tree_label(print, out, qmode); |
|---|
| 122 | |
|---|
| 123 | free(bootstrap); |
|---|
| 124 | } |
|---|
| 125 | |
|---|
| 126 | return error; |
|---|
| 127 | } |
|---|
| 128 | |
|---|
| 129 | inline string buildNodeIdentifier(const string& parent_id, int& son_counter) { |
|---|
| 130 | ++son_counter; |
|---|
| 131 | if (parent_id.empty()) return GBS_global_string("n_%i", son_counter); |
|---|
| 132 | return GBS_global_string("%s.%i", parent_id.c_str(), son_counter); |
|---|
| 133 | } |
|---|
| 134 | |
|---|
| 135 | static const char *export_tree_node_print_xml(GBDATA *gb_main, GBT_TREE *tree, double my_length, const char *tree_name, |
|---|
| 136 | const TREE_node_text_gen *node_gen, bool skip_folded, const string& parent_id, int& parent_son_counter) { |
|---|
| 137 | const char *error = 0; |
|---|
| 138 | |
|---|
| 139 | if (tree->is_leaf) { |
|---|
| 140 | XML_Tag item_tag("ITEM"); |
|---|
| 141 | |
|---|
| 142 | item_tag.add_attribute("name", buildNodeIdentifier(parent_id, parent_son_counter)); |
|---|
| 143 | |
|---|
| 144 | item_tag.add_attribute("itemname", |
|---|
| 145 | node_gen |
|---|
| 146 | ? node_gen->gen(gb_main, tree->gb_node, 0, tree, tree_name) |
|---|
| 147 | : tree->name); |
|---|
| 148 | |
|---|
| 149 | item_tag.add_attribute("length", GBS_global_string("%.5f", my_length)); |
|---|
| 150 | } |
|---|
| 151 | else { |
|---|
| 152 | char *groupname = 0; |
|---|
| 153 | char *bootstrap = 0; |
|---|
| 154 | |
|---|
| 155 | if (tree->remark_branch) { |
|---|
| 156 | const char *boot = tree->remark_branch; |
|---|
| 157 | if (boot[0] && boot[strlen(boot)-1] == '%') { // does remark_branch contain a bootstrap value ? |
|---|
| 158 | char *end = 0; |
|---|
| 159 | double val = strtod(boot, &end); |
|---|
| 160 | |
|---|
| 161 | tree_assert(end[0] == '%'); // otherwise sth strange is contained in remark_branch |
|---|
| 162 | bootstrap = GBS_global_string_copy("%i", int(val+0.5)); |
|---|
| 163 | } |
|---|
| 164 | } |
|---|
| 165 | bool folded = false; |
|---|
| 166 | if (tree->name) { |
|---|
| 167 | const char *buf; |
|---|
| 168 | |
|---|
| 169 | if (node_gen) buf = node_gen->gen(gb_main, tree->gb_node,0,tree, tree_name); |
|---|
| 170 | else buf = tree->name; |
|---|
| 171 | |
|---|
| 172 | tree_assert(buf); |
|---|
| 173 | groupname = strdup(buf); |
|---|
| 174 | |
|---|
| 175 | GBDATA *gb_grouped = GB_entry(tree->gb_node, "grouped"); |
|---|
| 176 | if (gb_grouped) { |
|---|
| 177 | folded = GB_read_byte(gb_grouped); |
|---|
| 178 | } |
|---|
| 179 | } |
|---|
| 180 | |
|---|
| 181 | if (my_length || bootstrap || groupname ) { |
|---|
| 182 | bool hide_this_group = skip_folded && folded; // hide folded groups only if skip_folded is true |
|---|
| 183 | |
|---|
| 184 | XML_Tag branch_tag(hide_this_group ? "FOLDED_GROUP" : "BRANCH"); |
|---|
| 185 | string my_id = buildNodeIdentifier(parent_id, parent_son_counter); |
|---|
| 186 | |
|---|
| 187 | branch_tag.add_attribute("name", my_id); |
|---|
| 188 | |
|---|
| 189 | if (my_length) { |
|---|
| 190 | branch_tag.add_attribute("length", GBS_global_string("%.5f", my_length)); |
|---|
| 191 | } |
|---|
| 192 | if (bootstrap) { |
|---|
| 193 | branch_tag.add_attribute("bootstrap", bootstrap); |
|---|
| 194 | freeset(bootstrap, 0); |
|---|
| 195 | } |
|---|
| 196 | if (groupname) { |
|---|
| 197 | branch_tag.add_attribute("groupname", groupname); |
|---|
| 198 | freeset(groupname, 0); |
|---|
| 199 | if (folded) branch_tag.add_attribute("folded", "1"); |
|---|
| 200 | } |
|---|
| 201 | else { |
|---|
| 202 | tree_assert(!folded); |
|---|
| 203 | } |
|---|
| 204 | |
|---|
| 205 | int my_son_counter = 0; |
|---|
| 206 | if (hide_this_group) { |
|---|
| 207 | branch_tag.add_attribute("items_in_group", GBT_count_nodes(tree)); |
|---|
| 208 | } |
|---|
| 209 | else { |
|---|
| 210 | if (!error) error = export_tree_node_print_xml(gb_main, tree->leftson, tree->leftlen, tree_name, node_gen, skip_folded, my_id, my_son_counter); |
|---|
| 211 | if (!error) error = export_tree_node_print_xml(gb_main, tree->rightson, tree->rightlen, tree_name, node_gen, skip_folded, my_id, my_son_counter); |
|---|
| 212 | } |
|---|
| 213 | } |
|---|
| 214 | else { |
|---|
| 215 | if (!error) error = export_tree_node_print_xml(gb_main, tree->leftson, tree->leftlen, tree_name, node_gen, skip_folded, parent_id, parent_son_counter); |
|---|
| 216 | if (!error) error = export_tree_node_print_xml(gb_main, tree->rightson, tree->rightlen, tree_name, node_gen, skip_folded, parent_id, parent_son_counter); |
|---|
| 217 | } |
|---|
| 218 | } |
|---|
| 219 | |
|---|
| 220 | return error; |
|---|
| 221 | } |
|---|
| 222 | |
|---|
| 223 | GB_ERROR TREE_write_XML(GBDATA *gb_main, const char *db_name, const char *tree_name, const TREE_node_text_gen *node_gen, bool skip_folded, const char *path) { |
|---|
| 224 | GB_ERROR error = 0; |
|---|
| 225 | FILE *output = fopen(path, "w"); |
|---|
| 226 | |
|---|
| 227 | if (!output) error = GB_export_errorf("file '%s' could not be opened for writing", path); |
|---|
| 228 | else { |
|---|
| 229 | GB_transaction gb_dummy(gb_main); |
|---|
| 230 | |
|---|
| 231 | GBT_TREE *tree = GBT_read_tree(gb_main,tree_name,sizeof(GBT_TREE)); |
|---|
| 232 | if (!tree) error = GB_await_error(); |
|---|
| 233 | else { |
|---|
| 234 | error = GBT_link_tree(tree,gb_main,GB_TRUE, 0, 0); |
|---|
| 235 | if (!error && node_gen) node_gen->init(gb_main); |
|---|
| 236 | |
|---|
| 237 | if (!error) { |
|---|
| 238 | GBDATA *tree_cont = GBT_get_tree(gb_main,tree_name); |
|---|
| 239 | GBDATA *tree_remark = GB_entry(tree_cont, "remark"); |
|---|
| 240 | |
|---|
| 241 | XML_Document xml_doc("ARB_TREE", "arb_tree.dtd", output); |
|---|
| 242 | |
|---|
| 243 | xml_doc.add_attribute("database", db_name); |
|---|
| 244 | xml_doc.add_attribute("treename", tree_name); |
|---|
| 245 | xml_doc.add_attribute("export_date", GB_date_string()); |
|---|
| 246 | |
|---|
| 247 | if (tree_remark) { |
|---|
| 248 | char *remark = GB_read_string(tree_remark); |
|---|
| 249 | XML_Tag remark_tag("COMMENT"); |
|---|
| 250 | XML_Text remark_text(remark); |
|---|
| 251 | free(remark); |
|---|
| 252 | } |
|---|
| 253 | |
|---|
| 254 | int my_son_counter = 0; |
|---|
| 255 | error = export_tree_node_print_xml(gb_main,tree,0.0, tree_name, node_gen, skip_folded, "", my_son_counter); |
|---|
| 256 | } |
|---|
| 257 | } |
|---|
| 258 | fclose(output); |
|---|
| 259 | } |
|---|
| 260 | |
|---|
| 261 | return error; |
|---|
| 262 | } |
|---|
| 263 | |
|---|
| 264 | static char *complete_newick_comment(const char *comment) { |
|---|
| 265 | // ensure that all '[' in 'comment' are closed by corresponding ']' by inserting additional brackets |
|---|
| 266 | |
|---|
| 267 | int openBrackets = 0; |
|---|
| 268 | struct GBS_strstruct *out = GBS_stropen(strlen(comment)*1.1); |
|---|
| 269 | |
|---|
| 270 | for (int o = 0; comment[o]; ++o) { |
|---|
| 271 | switch (comment[o]) { |
|---|
| 272 | case '[': |
|---|
| 273 | openBrackets++; |
|---|
| 274 | break; |
|---|
| 275 | case ']': |
|---|
| 276 | if (openBrackets == 0) { |
|---|
| 277 | GBS_chrcat(out, '['); // insert one |
|---|
| 278 | } |
|---|
| 279 | else { |
|---|
| 280 | openBrackets--; |
|---|
| 281 | } |
|---|
| 282 | break; |
|---|
| 283 | |
|---|
| 284 | default: |
|---|
| 285 | break; |
|---|
| 286 | } |
|---|
| 287 | GBS_chrcat(out, comment[o]); |
|---|
| 288 | } |
|---|
| 289 | |
|---|
| 290 | while (openBrackets>0) { |
|---|
| 291 | GBS_chrcat(out, ']'); // insert one |
|---|
| 292 | openBrackets--; |
|---|
| 293 | } |
|---|
| 294 | |
|---|
| 295 | gb_assert(openBrackets == 0); |
|---|
| 296 | |
|---|
| 297 | return GBS_strclose(out); |
|---|
| 298 | } |
|---|
| 299 | |
|---|
| 300 | GB_ERROR TREE_write_Newick(GBDATA *gb_main, char *tree_name, const TREE_node_text_gen *node_gen, bool save_branchlengths, bool save_bootstraps, bool save_groupnames, bool pretty, TREE_node_quoting quoteMode, char *path) |
|---|
| 301 | { |
|---|
| 302 | GB_ERROR error = 0; |
|---|
| 303 | FILE *output = fopen(path, "w"); |
|---|
| 304 | |
|---|
| 305 | if (!output) error = GB_export_errorf("file '%s' could not be opened for writing", path); |
|---|
| 306 | else { |
|---|
| 307 | GB_transaction gb_dummy(gb_main); |
|---|
| 308 | |
|---|
| 309 | GBT_TREE *tree = GBT_read_tree(gb_main,tree_name,sizeof(GBT_TREE)); |
|---|
| 310 | if (!tree) error = GB_await_error(); |
|---|
| 311 | else { |
|---|
| 312 | error = GBT_link_tree(tree,gb_main,GB_TRUE, 0, 0); |
|---|
| 313 | if (!error && node_gen) node_gen->init(gb_main); |
|---|
| 314 | |
|---|
| 315 | if (!error) { |
|---|
| 316 | char *remark = 0; |
|---|
| 317 | GBDATA *tree_cont = GBT_get_tree(gb_main,tree_name); |
|---|
| 318 | GBDATA *tree_remark = GB_entry(tree_cont, "remark"); |
|---|
| 319 | |
|---|
| 320 | if (tree_remark) { |
|---|
| 321 | remark = GB_read_string(tree_remark); |
|---|
| 322 | } |
|---|
| 323 | { |
|---|
| 324 | const char *saved_to = GBS_global_string("%s saved to %s", tree_name, path); |
|---|
| 325 | freeset(remark, TREE_log_action_to_tree_comment(remark, saved_to)); |
|---|
| 326 | } |
|---|
| 327 | |
|---|
| 328 | if (remark) { |
|---|
| 329 | char *wellformed = complete_newick_comment(remark); |
|---|
| 330 | |
|---|
| 331 | tree_assert(wellformed); |
|---|
| 332 | |
|---|
| 333 | fputc('[', output); fputs(wellformed, output); fputs("]\n", output); |
|---|
| 334 | free(wellformed); |
|---|
| 335 | } |
|---|
| 336 | free(remark); |
|---|
| 337 | if (!error) { |
|---|
| 338 | error = export_tree_node_print(gb_main, output, tree, tree_name, pretty, 0, node_gen, save_branchlengths, save_bootstraps, save_groupnames, quoteMode); |
|---|
| 339 | } |
|---|
| 340 | } |
|---|
| 341 | |
|---|
| 342 | GBT_delete_tree(tree); |
|---|
| 343 | } |
|---|
| 344 | |
|---|
| 345 | fprintf(output, ";\n"); |
|---|
| 346 | fclose(output); |
|---|
| 347 | } |
|---|
| 348 | |
|---|
| 349 | return error; |
|---|
| 350 | } |
|---|
| 351 | |
|---|
| 352 | // -------------------------------------------------------------------------------- |
|---|
| 353 | |
|---|
static void export_tree_node_print_remove(char *str) {
    // replaces every single and double quote in 'str' by '.' (in place)
    for (char *p = str; *p; ++p) {
        switch (*p) {
            case '\'':
            case '\"':
                *p = '.';
                break;
            default:
                break;
        }
    }
}
|---|
| 361 | |
|---|
| 362 | static void export_tree_rek(GBT_TREE *tree, FILE *out, bool export_branchlens, bool dquot) { |
|---|
| 363 | if (tree->is_leaf) { |
|---|
| 364 | export_tree_node_print_remove(tree->name); |
|---|
| 365 | fprintf(out, |
|---|
| 366 | dquot ? " \"%s\" " : " '%s' ", |
|---|
| 367 | tree->name); |
|---|
| 368 | } |
|---|
| 369 | else { |
|---|
| 370 | fputc('(', out); |
|---|
| 371 | export_tree_rek(tree->leftson, out, export_branchlens, dquot); if (export_branchlens) fprintf(out, ":%.5f,", tree->leftlen); |
|---|
| 372 | export_tree_rek(tree->rightson, out, export_branchlens, dquot); if (export_branchlens) fprintf(out, ":%.5f", tree->rightlen); |
|---|
| 373 | fputc(')', out); |
|---|
| 374 | |
|---|
| 375 | if (tree->name) { |
|---|
| 376 | export_tree_node_print_remove(tree->name); |
|---|
| 377 | fprintf(out, |
|---|
| 378 | dquot ? "\"%s\"" : "'%s'", |
|---|
| 379 | tree->name); |
|---|
| 380 | } |
|---|
| 381 | } |
|---|
| 382 | } |
|---|
| 383 | |
|---|
| 384 | #if defined(DEBUG) |
|---|
| 385 | #warning maybe replace TREE_export_tree by TREE_write_Newick |
|---|
| 386 | /* need some additional parameters (no comment, trifurcation) */ |
|---|
| 387 | #endif /* DEBUG */ |
|---|
| 388 | |
|---|
| 389 | GB_ERROR TREE_export_tree(GBDATA *gb_main,FILE *out,GBT_TREE *tree, bool triple_root, bool export_branchlens, bool dquot) { |
|---|
| 390 | GBUSE(gb_main); |
|---|
| 391 | |
|---|
| 392 | if (triple_root){ |
|---|
| 393 | GBT_TREE *one,*two,*three; |
|---|
| 394 | if (tree->is_leaf){ |
|---|
| 395 | return GB_export_error("Tree is two small, minimum 3 nodes"); |
|---|
| 396 | } |
|---|
| 397 | if (tree->leftson->is_leaf && tree->rightson->is_leaf){ |
|---|
| 398 | return GB_export_error("Tree is two small, minimum 3 nodes"); |
|---|
| 399 | } |
|---|
| 400 | if (tree->leftson->is_leaf){ |
|---|
| 401 | one = tree->leftson; |
|---|
| 402 | two = tree->rightson->leftson; |
|---|
| 403 | three = tree->rightson->rightson; |
|---|
| 404 | }else{ |
|---|
| 405 | one = tree->leftson->leftson; |
|---|
| 406 | two = tree->leftson->rightson; |
|---|
| 407 | three = tree->rightson; |
|---|
| 408 | } |
|---|
| 409 | fputc('(', out); |
|---|
| 410 | export_tree_rek(one, out, export_branchlens, dquot); if (export_branchlens) fprintf(out, ":%.5f", 1.0); fputc(',', out); |
|---|
| 411 | export_tree_rek(two, out, export_branchlens, dquot); if (export_branchlens) fprintf(out, ":%.5f", 1.0); fputc(',', out); |
|---|
| 412 | export_tree_rek(three, out, export_branchlens, dquot); if (export_branchlens) fprintf(out, ":%.5f", 1.0); |
|---|
| 413 | fputc(')', out); |
|---|
| 414 | } |
|---|
| 415 | else { |
|---|
| 416 | export_tree_rek(tree, out, export_branchlens, dquot); |
|---|
| 417 | } |
|---|
| 418 | return 0; |
|---|
| 419 | } |
|---|