1 | // ============================================================ // |
---|
2 | // // |
---|
3 | // File : TreeRead.cxx // |
---|
4 | // Purpose : load tree from file // |
---|
5 | // // |
---|
6 | // Institute of Microbiology (Technical University Munich) // |
---|
7 | // www.arb-home.de // |
---|
8 | // // |
---|
9 | // ============================================================ // |
---|
10 | |
---|
11 | #include "TreeRead.h" |
---|
12 | #include <stdio.h> |
---|
13 | |
---|
14 | #define tree_assert(cond) arb_assert(cond) |
---|
15 | |
---|
16 | /******************************************************************************************** |
---|
17 | load a tree from file system |
---|
18 | ********************************************************************************************/ |
---|
19 | |
---|
20 | |
---|
21 | /* -------------------- */ |
---|
22 | /* TreeReader */ |
---|
23 | /* -------------------- */ |
---|
24 | |
---|
25 | enum tr_lfmode { LF_UNKNOWN, LF_N, LF_R, LF_NR, LF_RN, }; |
---|
26 | |
---|
27 | typedef struct { |
---|
28 | char *tree_file_name; |
---|
29 | FILE *in; |
---|
30 | int last_character; // may be EOF |
---|
31 | int line_cnt; |
---|
32 | struct GBS_strstruct *tree_comment; |
---|
33 | double max_found_branchlen; |
---|
34 | double max_found_bootstrap; |
---|
35 | GB_ERROR error; |
---|
36 | enum tr_lfmode lfmode; |
---|
37 | } TreeReader; |
---|
38 | |
---|
39 | static void setReaderError(TreeReader *reader, const char *message) { |
---|
40 | tree_assert(!reader->error); |
---|
41 | reader->error = GBS_global_string("Error reading %s:%i: %s", |
---|
42 | reader->tree_file_name, |
---|
43 | reader->line_cnt, |
---|
44 | message); |
---|
45 | } |
---|
46 | |
---|
47 | static char gbt_getc(TreeReader *reader) { |
---|
48 | // reads character from stream |
---|
49 | // - converts linefeeds for DOS- and MAC-textfiles |
---|
50 | // - increments line_cnt |
---|
51 | char c = getc(reader->in); |
---|
52 | int inc = 0; |
---|
53 | |
---|
54 | if (c == '\n') { |
---|
55 | switch (reader->lfmode) { |
---|
56 | case LF_UNKNOWN: reader->lfmode = LF_N; inc = 1; break; |
---|
57 | case LF_N: inc = 1; break; |
---|
58 | case LF_R: reader->lfmode = LF_RN; c = gbt_getc(reader); break; |
---|
59 | case LF_NR: c = gbt_getc(reader); break; |
---|
60 | case LF_RN: inc = 1; break; |
---|
61 | } |
---|
62 | } |
---|
63 | else if (c == '\r') { |
---|
64 | switch (reader->lfmode) { |
---|
65 | case LF_UNKNOWN: reader->lfmode = LF_R; inc = 1; break; |
---|
66 | case LF_R: inc = 1; break; |
---|
67 | case LF_N: reader->lfmode = LF_NR; c = gbt_getc(reader); break; |
---|
68 | case LF_RN: c = gbt_getc(reader); break; |
---|
69 | case LF_NR: inc = 1; break; |
---|
70 | } |
---|
71 | if (c == '\r') c = '\n'; // never report '\r' |
---|
72 | } |
---|
73 | if (inc) reader->line_cnt++; |
---|
74 | |
---|
75 | return c; |
---|
76 | } |
---|
77 | |
---|
78 | static char gbt_read_char(TreeReader *reader) { |
---|
79 | GB_BOOL done = GB_FALSE; |
---|
80 | int c = ' '; |
---|
81 | |
---|
82 | while (!done) { |
---|
83 | c = gbt_getc(reader); |
---|
84 | if (c == ' ' || c == '\t' || c == '\n') ; // skip |
---|
85 | else if (c == '[') { // collect tree comment(s) |
---|
86 | int openBrackets = 1; |
---|
87 | if (GBS_memoffset(reader->tree_comment)) { |
---|
88 | // not first comment -> do new line |
---|
89 | GBS_chrcat(reader->tree_comment, '\n'); |
---|
90 | } |
---|
91 | |
---|
92 | while (openBrackets && !reader->error) { |
---|
93 | c = gbt_getc(reader); |
---|
94 | switch (c) { |
---|
95 | case EOF: |
---|
96 | setReaderError(reader, "Reached end of file while reading comment"); |
---|
97 | break; |
---|
98 | case ']': |
---|
99 | openBrackets--; |
---|
100 | if (openBrackets) GBS_chrcat(reader->tree_comment, c); // write all but last closing brackets |
---|
101 | break; |
---|
102 | case '[': |
---|
103 | openBrackets++; |
---|
104 | // fall-through |
---|
105 | default: |
---|
106 | GBS_chrcat(reader->tree_comment, c); |
---|
107 | break; |
---|
108 | } |
---|
109 | } |
---|
110 | } |
---|
111 | else done = GB_TRUE; |
---|
112 | } |
---|
113 | |
---|
114 | reader->last_character = c; |
---|
115 | return c; |
---|
116 | } |
---|
117 | |
---|
118 | static char gbt_get_char(TreeReader *reader) { |
---|
119 | int c = gbt_getc(reader); |
---|
120 | reader->last_character = c; |
---|
121 | return c; |
---|
122 | } |
---|
123 | |
---|
124 | static TreeReader *newTreeReader(FILE *input, const char *file_name) { |
---|
125 | TreeReader *reader = (TreeReader*)GB_calloc(1, sizeof(*reader)); |
---|
126 | |
---|
127 | reader->tree_file_name = strdup(file_name); |
---|
128 | reader->in = input; |
---|
129 | reader->tree_comment = GBS_stropen(2048); |
---|
130 | reader->max_found_branchlen = -1; |
---|
131 | reader->max_found_bootstrap = -1; |
---|
132 | reader->line_cnt = 1; |
---|
133 | reader->lfmode = LF_UNKNOWN; |
---|
134 | |
---|
135 | gbt_read_char(reader); |
---|
136 | |
---|
137 | return reader; |
---|
138 | } |
---|
139 | |
---|
140 | static void freeTreeReader(TreeReader *reader) { |
---|
141 | free(reader->tree_file_name); |
---|
142 | if (reader->tree_comment) GBS_strforget(reader->tree_comment); |
---|
143 | free(reader); |
---|
144 | } |
---|
145 | |
---|
146 | static char *getTreeComment(TreeReader *reader) { |
---|
147 | /* can only be called once. Deletes the comment from TreeReader! */ |
---|
148 | char *comment = 0; |
---|
149 | if (reader->tree_comment) { |
---|
150 | comment = GBS_strclose(reader->tree_comment); |
---|
151 | reader->tree_comment = 0; |
---|
152 | } |
---|
153 | return comment; |
---|
154 | } |
---|
155 | |
---|
156 | |
---|
157 | /* ------------------------------------------------------------ |
---|
158 | * The following functions assume that the "current" character |
---|
159 | * has already been read into 'TreeReader->last_character' |
---|
160 | */ |
---|
161 | |
---|
162 | static void gbt_eat_white(TreeReader *reader) { |
---|
163 | int c = reader->last_character; |
---|
164 | while ((c == ' ') || (c == '\n') || (c == '\r') || (c == '\t')){ |
---|
165 | c = gbt_get_char(reader); |
---|
166 | } |
---|
167 | } |
---|
168 | |
---|
169 | static double gbt_read_number(TreeReader *reader) { |
---|
170 | char strng[256]; |
---|
171 | char *s = strng; |
---|
172 | int c = reader->last_character; |
---|
173 | double fl; |
---|
174 | |
---|
175 | while (((c<='9') && (c>='0')) || (c=='.') || (c=='-') || (c=='e') || (c=='E')) { |
---|
176 | *(s++) = c; |
---|
177 | c = gbt_get_char(reader); |
---|
178 | } |
---|
179 | *s = 0; |
---|
180 | fl = GB_atof(strng); |
---|
181 | |
---|
182 | gbt_eat_white(reader); |
---|
183 | |
---|
184 | return fl; |
---|
185 | } |
---|
186 | |
---|
187 | static char *gbt_read_quoted_string(TreeReader *reader){ |
---|
188 | /* Read in a quoted or unquoted string. |
---|
189 | * in quoted strings double quotes ('') are replaced by (') |
---|
190 | */ |
---|
191 | |
---|
192 | char buffer[1024]; |
---|
193 | char *s = buffer; |
---|
194 | int c = reader->last_character; |
---|
195 | |
---|
196 | if (c == '\'') { |
---|
197 | c = gbt_get_char(reader); |
---|
198 | while ( (c!= EOF) && (c!='\'') ) { |
---|
199 | gbt_lt_double_quot: |
---|
200 | *(s++) = c; |
---|
201 | if ((s-buffer) > 1000) { |
---|
202 | *s = 0; |
---|
203 | setReaderError(reader, GBS_global_string("Name '%s' longer than 1000 bytes", buffer)); |
---|
204 | return NULL; |
---|
205 | } |
---|
206 | c = gbt_get_char(reader); |
---|
207 | } |
---|
208 | if (c == '\'') { |
---|
209 | c = gbt_read_char(reader); |
---|
210 | if (c == '\'') goto gbt_lt_double_quot; |
---|
211 | } |
---|
212 | }else{ |
---|
213 | while ( c== '_') c = gbt_read_char(reader); |
---|
214 | while ( c== ' ') c = gbt_read_char(reader); |
---|
215 | while ( (c != ':') && (c!= EOF) && (c!=',') && |
---|
216 | (c!=';') && (c!= ')') ) |
---|
217 | { |
---|
218 | *(s++) = c; |
---|
219 | if ((s-buffer) > 1000) { |
---|
220 | *s = 0; |
---|
221 | setReaderError(reader, GBS_global_string("Name '%s' longer than 1000 bytes", buffer)); |
---|
222 | return NULL; |
---|
223 | } |
---|
224 | c = gbt_read_char(reader); |
---|
225 | } |
---|
226 | } |
---|
227 | *s = 0; |
---|
228 | return strdup(buffer); |
---|
229 | } |
---|
230 | |
---|
231 | static void setBranchName(TreeReader *reader, GBT_TREE *node, char *name) { |
---|
232 | /* detect bootstrap values */ |
---|
233 | /* name has to be stored in node or must be free'ed */ |
---|
234 | |
---|
235 | char *end = 0; |
---|
236 | double bootstrap = strtod(name, &end); |
---|
237 | |
---|
238 | if (end == name) { // no digits -> no bootstrap |
---|
239 | node->name = name; |
---|
240 | } |
---|
241 | else { |
---|
242 | bootstrap = bootstrap*100.0 + 0.5; // needed if bootstrap values are between 0.0 and 1.0 */ |
---|
243 | // downscaling in done later! |
---|
244 | |
---|
245 | if (bootstrap>reader->max_found_bootstrap) { |
---|
246 | reader->max_found_bootstrap = bootstrap; |
---|
247 | } |
---|
248 | |
---|
249 | tree_assert(node->remark_branch == 0); |
---|
250 | node->remark_branch = GBS_global_string_copy("%i%%", (int)bootstrap); |
---|
251 | |
---|
252 | if (end[0] != 0) { // sth behind bootstrap value |
---|
253 | if (end[0] == ':') ++end; // ARB format for nodes with bootstraps AND node name is 'bootstrap:nodename' |
---|
254 | node->name = strdup(end); |
---|
255 | } |
---|
256 | free(name); |
---|
257 | } |
---|
258 | } |
---|
259 | |
---|
260 | static bool gbt_readNameAndLength(TreeReader *reader, GBT_TREE *node, GBT_LEN *len) { |
---|
261 | /* reads the branch-length and -name |
---|
262 | '*len' should normally be initialized with TREE_DEFLEN_MARKER |
---|
263 | * returns the branch-length in 'len' and sets the branch-name of 'node' |
---|
264 | * returns GB_TRUE if successful, otherwise reader->error gets set |
---|
265 | */ |
---|
266 | |
---|
267 | bool done = false; |
---|
268 | while (!done && !reader->error) { |
---|
269 | switch (reader->last_character) { |
---|
270 | case ';': |
---|
271 | case ',': |
---|
272 | case ')': |
---|
273 | done = GB_TRUE; |
---|
274 | break; |
---|
275 | case ':': |
---|
276 | gbt_read_char(reader); /* drop ':' */ |
---|
277 | *len = gbt_read_number(reader); |
---|
278 | if (*len>reader->max_found_branchlen) { |
---|
279 | reader->max_found_branchlen = *len; |
---|
280 | } |
---|
281 | break; |
---|
282 | default: { |
---|
283 | char *branchName = gbt_read_quoted_string(reader); |
---|
284 | if (branchName) { |
---|
285 | setBranchName(reader, node, branchName); |
---|
286 | } |
---|
287 | else { |
---|
288 | setReaderError(reader, "Expected branch-name or one of ':;,)'"); |
---|
289 | } |
---|
290 | break; |
---|
291 | } |
---|
292 | } |
---|
293 | } |
---|
294 | |
---|
295 | return !reader->error; |
---|
296 | } |
---|
297 | |
---|
298 | static GBT_TREE *gbt_linkedTreeNode(GBT_TREE *left, GBT_LEN leftlen, GBT_TREE *right, GBT_LEN rightlen, int structuresize) { |
---|
299 | GBT_TREE *node = (GBT_TREE*)GB_calloc(1, structuresize); |
---|
300 | |
---|
301 | node->leftson = left; |
---|
302 | node->leftlen = leftlen; |
---|
303 | node->rightson = right; |
---|
304 | node->rightlen = rightlen; |
---|
305 | |
---|
306 | left->father = node; |
---|
307 | right->father = node; |
---|
308 | |
---|
309 | return node; |
---|
310 | } |
---|
311 | |
---|
312 | static GBT_TREE *gbt_load_tree_rek(TreeReader *reader, int structuresize, GBT_LEN *nodeLen) { |
---|
313 | GBT_TREE *node = 0; |
---|
314 | |
---|
315 | if (reader->last_character == '(') { |
---|
316 | gbt_read_char(reader); // drop the '(' |
---|
317 | |
---|
318 | GBT_LEN leftLen = TREE_DEFLEN_MARKER; |
---|
319 | GBT_TREE *left = gbt_load_tree_rek(reader, structuresize, &leftLen); |
---|
320 | |
---|
321 | tree_assert(left || reader->error); |
---|
322 | |
---|
323 | if (left) { |
---|
324 | if (gbt_readNameAndLength(reader, left, &leftLen)) { |
---|
325 | switch (reader->last_character) { |
---|
326 | case ')': /* single node */ |
---|
327 | *nodeLen = leftLen; |
---|
328 | node = left; |
---|
329 | left = 0; |
---|
330 | break; |
---|
331 | |
---|
332 | case ',': { |
---|
333 | GBT_LEN rightLen = TREE_DEFLEN_MARKER; |
---|
334 | GBT_TREE *right = 0; |
---|
335 | |
---|
336 | while (reader->last_character == ',' && !reader->error) { |
---|
337 | if (right) { /* multi-branch */ |
---|
338 | GBT_TREE *pair = gbt_linkedTreeNode(left, leftLen, right, rightLen, structuresize); |
---|
339 | |
---|
340 | left = pair; leftLen = 0; |
---|
341 | right = 0; rightLen = TREE_DEFLEN_MARKER; |
---|
342 | } |
---|
343 | |
---|
344 | gbt_read_char(reader); /* drop ',' */ |
---|
345 | right = gbt_load_tree_rek(reader, structuresize, &rightLen); |
---|
346 | if (right) gbt_readNameAndLength(reader, right, &rightLen); |
---|
347 | } |
---|
348 | |
---|
349 | if (reader->last_character == ')') { |
---|
350 | node = gbt_linkedTreeNode(left, leftLen, right, rightLen, structuresize); |
---|
351 | *nodeLen = TREE_DEFLEN_MARKER; |
---|
352 | |
---|
353 | left = 0; |
---|
354 | right = 0; |
---|
355 | |
---|
356 | gbt_read_char(reader); /* drop ')' */ |
---|
357 | } |
---|
358 | else { |
---|
359 | setReaderError(reader, "Expected one of ',)'"); |
---|
360 | } |
---|
361 | |
---|
362 | free(right); |
---|
363 | |
---|
364 | break; |
---|
365 | } |
---|
366 | |
---|
367 | default: |
---|
368 | setReaderError(reader, "Expected one of ',)'"); |
---|
369 | break; |
---|
370 | } |
---|
371 | } |
---|
372 | else { |
---|
373 | tree_assert(reader->error); |
---|
374 | } |
---|
375 | |
---|
376 | free(left); |
---|
377 | } |
---|
378 | } |
---|
379 | else { /* single node */ |
---|
380 | gbt_eat_white(reader); |
---|
381 | char *name = gbt_read_quoted_string(reader); |
---|
382 | if (name) { |
---|
383 | node = (GBT_TREE*)GB_calloc(1, structuresize); |
---|
384 | node->is_leaf = GB_TRUE; |
---|
385 | node->name = name; |
---|
386 | } |
---|
387 | else { |
---|
388 | setReaderError(reader, "Expected quoted string"); |
---|
389 | } |
---|
390 | } |
---|
391 | |
---|
392 | tree_assert(node || reader->error); |
---|
393 | return node; |
---|
394 | } |
---|
395 | |
---|
396 | |
---|
397 | |
---|
398 | GBT_TREE *TREE_load(const char *path, int structuresize, char **commentPtr, int allow_length_scaling, char **warningPtr) { |
---|
399 | /* Load a newick compatible tree from file 'path', |
---|
400 | structure size should be >0, see GBT_read_tree for more information |
---|
401 | if commentPtr != NULL -> set it to a malloc copy of all concatenated comments found in tree file |
---|
402 | if warningPtr != NULL -> set it to a malloc copy auto-scale-warning (if autoscaling happens) |
---|
403 | */ |
---|
404 | |
---|
405 | GBT_TREE *tree = 0; |
---|
406 | FILE *input = fopen(path, "rt"); |
---|
407 | GB_ERROR error = 0; |
---|
408 | |
---|
409 | if (!input) { |
---|
410 | error = GBS_global_string("No such file: %s", path); |
---|
411 | } |
---|
412 | else { |
---|
413 | const char *name_only = strrchr(path, '/'); |
---|
414 | if (name_only) ++name_only; |
---|
415 | else name_only = path; |
---|
416 | |
---|
417 | TreeReader *reader = newTreeReader(input, name_only); |
---|
418 | GBT_LEN rootNodeLen = TREE_DEFLEN_MARKER; /* root node has no length. only used as input to gbt_load_tree_rek*/ |
---|
419 | tree = gbt_load_tree_rek(reader, structuresize, &rootNodeLen); |
---|
420 | fclose(input); |
---|
421 | |
---|
422 | if (reader->error) { |
---|
423 | GBT_delete_tree(tree); |
---|
424 | tree = 0; |
---|
425 | error = reader->error; |
---|
426 | } |
---|
427 | |
---|
428 | if (tree) { |
---|
429 | double bootstrap_scale = 1.0; |
---|
430 | double branchlen_scale = 1.0; |
---|
431 | |
---|
432 | if (reader->max_found_bootstrap >= 101.0) { // bootstrap values were given in percent |
---|
433 | bootstrap_scale = 0.01; |
---|
434 | if (warningPtr) { |
---|
435 | *warningPtr = GBS_global_string_copy("Auto-scaling bootstrap values by factor %.2f (max. found bootstrap was %5.2f)", |
---|
436 | bootstrap_scale, reader->max_found_bootstrap); |
---|
437 | } |
---|
438 | } |
---|
439 | if (reader->max_found_branchlen >= 1.01) { // assume branchlengths have range [0;100] |
---|
440 | if (allow_length_scaling) { |
---|
441 | branchlen_scale = 0.01; |
---|
442 | if (warningPtr) { |
---|
443 | char *w = GBS_global_string_copy("Auto-scaling branchlengths by factor %.2f (max. found branchlength = %5.2f)", |
---|
444 | branchlen_scale, reader->max_found_branchlen); |
---|
445 | if (*warningPtr) { |
---|
446 | char *w2 = GBS_global_string_copy("%s\n%s", *warningPtr, w); |
---|
447 | |
---|
448 | free(*warningPtr); |
---|
449 | free(w); |
---|
450 | *warningPtr = w2; |
---|
451 | } |
---|
452 | else { |
---|
453 | *warningPtr = w; |
---|
454 | } |
---|
455 | } |
---|
456 | } |
---|
457 | } |
---|
458 | |
---|
459 | TREE_scale(tree, branchlen_scale, bootstrap_scale); // scale bootstraps and branchlengths |
---|
460 | |
---|
461 | if (commentPtr) { |
---|
462 | char *comment = getTreeComment(reader); |
---|
463 | |
---|
464 | const char *loaded_from = GBS_global_string("Loaded from %s", path); |
---|
465 | freeset(comment, TREE_log_action_to_tree_comment(comment, loaded_from)); |
---|
466 | |
---|
467 | tree_assert(*commentPtr == 0); |
---|
468 | *commentPtr = comment; |
---|
469 | } |
---|
470 | } |
---|
471 | |
---|
472 | freeTreeReader(reader); |
---|
473 | } |
---|
474 | |
---|
475 | tree_assert(tree||error); |
---|
476 | if (error) { |
---|
477 | GB_export_errorf("Import tree: %s", error); |
---|
478 | tree_assert(!tree); |
---|
479 | } |
---|
480 | |
---|
481 | return tree; |
---|
482 | } |
---|
483 | |
---|