1 | #include "muscle.h" |
---|
2 | #include "tree.h" |
---|
3 | #include "textfile.h" |
---|
4 | |
---|
5 | #define TRACE 0 |
---|
6 | |
---|
7 | // Tokens in Newick files are: |
---|
8 | // ( ) : , ; |
---|
9 | // string |
---|
10 | // 'string' |
---|
11 | // "string" |
---|
12 | // [ comment ] |
---|
13 | // |
---|
14 | // We can't safely distinguish between identifiers and floating point |
---|
15 | // numbers at the lexical level (because identifiers may be numeric, |
---|
16 | // or start with digits), so both edge lengths and identifiers are |
---|
17 | // returned as strings. |
---|
18 | |
---|
19 | const char *Tree::NTTStr(NEWICK_TOKEN_TYPE NTT) const |
---|
20 | { |
---|
21 | switch (NTT) |
---|
22 | { |
---|
23 | #define c(x) case NTT_##x: return #x; |
---|
24 | c(Unknown) |
---|
25 | c(Lparen) |
---|
26 | c(Rparen) |
---|
27 | c(Colon) |
---|
28 | c(Comma) |
---|
29 | c(Semicolon) |
---|
30 | c(String) |
---|
31 | c(SingleQuotedString) |
---|
32 | c(DoubleQuotedString) |
---|
33 | c(Comment) |
---|
34 | #undef c |
---|
35 | } |
---|
36 | return "??"; |
---|
37 | } |
---|
38 | |
---|
39 | NEWICK_TOKEN_TYPE Tree::GetToken(TextFile &File, char szToken[], unsigned uBytes) const |
---|
40 | { |
---|
41 | // Skip leading white space |
---|
42 | File.SkipWhite(); |
---|
43 | |
---|
44 | char c; |
---|
45 | File.GetCharX(c); |
---|
46 | |
---|
47 | // In case a single-character token |
---|
48 | szToken[0] = c; |
---|
49 | szToken[1] = 0; |
---|
50 | |
---|
51 | unsigned uBytesCopied = 0; |
---|
52 | NEWICK_TOKEN_TYPE TT; |
---|
53 | switch (c) |
---|
54 | { |
---|
55 | case '(': |
---|
56 | return NTT_Lparen; |
---|
57 | |
---|
58 | case ')': |
---|
59 | return NTT_Rparen; |
---|
60 | |
---|
61 | case ':': |
---|
62 | return NTT_Colon; |
---|
63 | |
---|
64 | case ';': |
---|
65 | return NTT_Semicolon; |
---|
66 | |
---|
67 | case ',': |
---|
68 | return NTT_Comma; |
---|
69 | |
---|
70 | case '\'': |
---|
71 | TT = NTT_SingleQuotedString; |
---|
72 | File.GetCharX(c); |
---|
73 | break; |
---|
74 | |
---|
75 | case '"': |
---|
76 | TT = NTT_DoubleQuotedString; |
---|
77 | File.GetCharX(c); |
---|
78 | break; |
---|
79 | |
---|
80 | case '[': |
---|
81 | TT = NTT_Comment; |
---|
82 | break; |
---|
83 | |
---|
84 | default: |
---|
85 | TT = NTT_String; |
---|
86 | break; |
---|
87 | } |
---|
88 | |
---|
89 | for (;;) |
---|
90 | { |
---|
91 | if (TT != NTT_Comment) |
---|
92 | { |
---|
93 | if (uBytesCopied < uBytes - 2) |
---|
94 | { |
---|
95 | szToken[uBytesCopied++] = c; |
---|
96 | szToken[uBytesCopied] = 0; |
---|
97 | } |
---|
98 | else |
---|
99 | Quit("Tree::GetToken: input buffer too small, token so far='%s'", szToken); |
---|
100 | } |
---|
101 | bool bEof = File.GetChar(c); |
---|
102 | if (bEof) |
---|
103 | return TT; |
---|
104 | |
---|
105 | switch (TT) |
---|
106 | { |
---|
107 | case NTT_String: |
---|
108 | if (0 != strchr("():;,", c)) |
---|
109 | { |
---|
110 | File.PushBack(c); |
---|
111 | return NTT_String; |
---|
112 | } |
---|
113 | if (isspace(c)) |
---|
114 | return NTT_String; |
---|
115 | break; |
---|
116 | |
---|
117 | case NTT_SingleQuotedString: |
---|
118 | if ('\'' == c) |
---|
119 | return NTT_String; |
---|
120 | break; |
---|
121 | |
---|
122 | case NTT_DoubleQuotedString: |
---|
123 | if ('"' == c) |
---|
124 | return NTT_String; |
---|
125 | break; |
---|
126 | |
---|
127 | case NTT_Comment: |
---|
128 | if (']' == c) |
---|
129 | return GetToken(File, szToken, uBytes); |
---|
130 | break; |
---|
131 | |
---|
132 | default: |
---|
133 | Quit("Tree::GetToken, invalid TT=%u", TT); |
---|
134 | } |
---|
135 | } |
---|
136 | } |
---|
137 | |
---|
138 | // NOTE: this hack must come after definition of Tree::GetToken. |
---|
139 | #if TRACE |
---|
140 | #define GetToken GetTokenVerbose |
---|
141 | #endif |
---|
142 | |
---|
143 | void Tree::FromFile(TextFile &File) |
---|
144 | { |
---|
145 | // Assume rooted. |
---|
146 | // If we discover that it is unrooted, will convert on the fly. |
---|
147 | CreateRooted(); |
---|
148 | |
---|
149 | double dEdgeLength; |
---|
150 | bool bEdgeLength = GetGroupFromFile(File, 0, &dEdgeLength); |
---|
151 | |
---|
152 | // Next token should be either ';' for rooted tree or ',' for unrooted. |
---|
153 | char szToken[16]; |
---|
154 | NEWICK_TOKEN_TYPE NTT = GetToken(File, szToken, sizeof(szToken)); |
---|
155 | |
---|
156 | // If rooted, all done. |
---|
157 | if (NTT_Semicolon == NTT) |
---|
158 | { |
---|
159 | if (bEdgeLength) |
---|
160 | Log(" *** Warning *** edge length on root group in Newick file %s\n", |
---|
161 | File.GetFileName()); |
---|
162 | Validate(); |
---|
163 | return; |
---|
164 | } |
---|
165 | |
---|
166 | if (NTT_Comma != NTT) |
---|
167 | Quit("Tree::FromFile, expected ';' or ',', got '%s'", szToken); |
---|
168 | |
---|
169 | const unsigned uThirdNode = UnrootFromFile(); |
---|
170 | bEdgeLength = GetGroupFromFile(File, uThirdNode, &dEdgeLength); |
---|
171 | if (bEdgeLength) |
---|
172 | SetEdgeLength(0, uThirdNode, dEdgeLength); |
---|
173 | Validate(); |
---|
174 | } |
---|
175 | |
---|
176 | // Return true if edge length for this group. |
---|
177 | bool Tree::GetGroupFromFile(TextFile &File, unsigned uNodeIndex, |
---|
178 | double *ptrdEdgeLength) |
---|
179 | { |
---|
180 | char szToken[1024]; |
---|
181 | NEWICK_TOKEN_TYPE NTT = GetToken(File, szToken, sizeof(szToken)); |
---|
182 | |
---|
183 | // Group is either leaf name or (left, right). |
---|
184 | if (NTT_String == NTT) |
---|
185 | { |
---|
186 | SetLeafName(uNodeIndex, szToken); |
---|
187 | #if TRACE |
---|
188 | Log("Group is leaf '%s'\n", szToken); |
---|
189 | #endif |
---|
190 | } |
---|
191 | else if (NTT_Lparen == NTT) |
---|
192 | { |
---|
193 | const unsigned uLeft = AppendBranch(uNodeIndex); |
---|
194 | const unsigned uRight = uLeft + 1; |
---|
195 | |
---|
196 | // Left sub-group... |
---|
197 | #if TRACE |
---|
198 | Log("Got '(', group is compound, expect left sub-group\n"); |
---|
199 | #endif |
---|
200 | double dEdgeLength; |
---|
201 | bool bLeftLength = GetGroupFromFile(File, uLeft, &dEdgeLength); |
---|
202 | #if TRACE |
---|
203 | if (bLeftLength) |
---|
204 | Log("Edge length for left sub-group: %.3g\n", dEdgeLength); |
---|
205 | else |
---|
206 | Log("No edge length for left sub-group\n"); |
---|
207 | #endif |
---|
208 | if (bLeftLength) |
---|
209 | SetEdgeLength(uNodeIndex, uLeft, dEdgeLength); |
---|
210 | |
---|
211 | // ... then comma ... |
---|
212 | #if TRACE |
---|
213 | Log("Expect comma\n"); |
---|
214 | #endif |
---|
215 | NTT = GetToken(File, szToken, sizeof(szToken)); |
---|
216 | if (NTT_Comma != NTT) |
---|
217 | Quit("Tree::GetGroupFromFile, expected ',', got '%s'", szToken); |
---|
218 | |
---|
219 | // ...then right sub-group... |
---|
220 | #if TRACE |
---|
221 | Log("Expect right sub-group\n"); |
---|
222 | #endif |
---|
223 | bool bRightLength = GetGroupFromFile(File, uRight, &dEdgeLength); |
---|
224 | if (bRightLength) |
---|
225 | SetEdgeLength(uNodeIndex, uRight, dEdgeLength); |
---|
226 | |
---|
227 | #if TRACE |
---|
228 | if (bRightLength) |
---|
229 | Log("Edge length for right sub-group: %.3g\n", dEdgeLength); |
---|
230 | else |
---|
231 | Log("No edge length for right sub-group\n"); |
---|
232 | #endif |
---|
233 | |
---|
234 | // ... then closing parenthesis. |
---|
235 | #if TRACE |
---|
236 | Log("Expect closing parenthesis (or comma if > 2-ary)\n"); |
---|
237 | #endif |
---|
238 | NTT = GetToken(File, szToken, sizeof(szToken)); |
---|
239 | if (NTT_Rparen == NTT) |
---|
240 | ; |
---|
241 | else if (NTT_Comma == NTT) |
---|
242 | { |
---|
243 | File.PushBack(','); |
---|
244 | return false; |
---|
245 | } |
---|
246 | else |
---|
247 | Quit("Tree::GetGroupFromFile, expected ')' or ',', got '%s'", szToken); |
---|
248 | } |
---|
249 | else |
---|
250 | Quit("Tree::GetGroupFromFile, expected '(' or leaf name, got '%s'", |
---|
251 | szToken); |
---|
252 | |
---|
253 | // Group may optionally be followed by edge length. |
---|
254 | bool bEof = File.SkipWhiteX(); |
---|
255 | if (bEof) |
---|
256 | return false; |
---|
257 | char c; |
---|
258 | File.GetCharX(c); |
---|
259 | #if TRACE |
---|
260 | Log("Character following group, could be colon, is '%c'\n", c); |
---|
261 | #endif |
---|
262 | if (':' == c) |
---|
263 | { |
---|
264 | NTT = GetToken(File, szToken, sizeof(szToken)); |
---|
265 | if (NTT_String != NTT) |
---|
266 | Quit("Tree::GetGroupFromFile, expected edge length, got '%s'", szToken); |
---|
267 | *ptrdEdgeLength = atof(szToken); |
---|
268 | return true; |
---|
269 | } |
---|
270 | File.PushBack(c); |
---|
271 | return false; |
---|
272 | } |
---|