1 | __VERSION__="ete2-2.2rev1026" |
---|
2 | # -*- coding: utf-8 -*- |
---|
3 | # #START_LICENSE########################################################### |
---|
4 | # |
---|
5 | # |
---|
6 | # This file is part of the Environment for Tree Exploration program |
---|
7 | # (ETE). http://ete.cgenomics.org |
---|
8 | # |
---|
9 | # ETE is free software: you can redistribute it and/or modify it |
---|
10 | # under the terms of the GNU General Public License as published by |
---|
11 | # the Free Software Foundation, either version 3 of the License, or |
---|
12 | # (at your option) any later version. |
---|
13 | # |
---|
14 | # ETE is distributed in the hope that it will be useful, but WITHOUT |
---|
15 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
---|
16 | # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public |
---|
17 | # License for more details. |
---|
18 | # |
---|
19 | # You should have received a copy of the GNU General Public License |
---|
20 | # along with ETE. If not, see <http://www.gnu.org/licenses/>. |
---|
21 | # |
---|
22 | # |
---|
23 | # ABOUT THE ETE PACKAGE |
---|
24 | # ===================== |
---|
25 | # |
---|
26 | # ETE is distributed under the GPL copyleft license (2008-2011). |
---|
27 | # |
---|
28 | # If you make use of ETE in published work, please cite: |
---|
29 | # |
---|
30 | # Jaime Huerta-Cepas, Joaquin Dopazo and Toni Gabaldon. |
---|
31 | # ETE: a python Environment for Tree Exploration. Jaime BMC |
---|
32 | # Bioinformatics 2010,:24doi:10.1186/1471-2105-11-24 |
---|
33 | # |
---|
34 | # Note that extra references to the specific methods implemented in |
---|
35 | # the toolkit are available in the documentation. |
---|
36 | # |
---|
37 | # More info at http://ete.cgenomics.org |
---|
38 | # |
---|
39 | # |
---|
40 | # #END_LICENSE############################################################# |
---|
41 | |
---|
42 | import re |
---|
43 | import os |
---|
44 | import base64 |
---|
45 | |
---|
46 | __all__ = ["read_newick", "write_newick", "print_supported_formats"] |
---|
47 | |
---|
48 | # Regular expressions used for reading newick format |
---|
49 | _ILEGAL_NEWICK_CHARS = ":;(),\[\]\t\n\r=" |
---|
50 | _NON_PRINTABLE_CHARS_RE = "[\x00-\x1f]+" |
---|
51 | |
---|
52 | _NHX_RE = "\[&&NHX:[^\]]*\]" |
---|
53 | _FLOAT_RE = "[+-]?\d+\.?\d*(?:[eE][-+]\d+)?" |
---|
54 | #_FLOAT_RE = "[+-]?\d+\.?\d*" |
---|
55 | _NAME_RE = "[^():,;\[\]]+" |
---|
56 | |
---|
57 | DEFAULT_DIST = 1.0 |
---|
58 | DEFAULT_NAME = '' |
---|
59 | DEFAULT_SUPPORT = 1.0 |
---|
60 | |
---|
61 | |
---|
62 | # Allowed formats. This table is used to read and write newick using |
---|
63 | # different convenctions. You can also add your own formats in an easy way. |
---|
64 | # |
---|
65 | # |
---|
66 | # FORMAT: [[LeafAttr1, LeafAttr1Type, Strict?], [LeafAttr2, LeafAttr2Type, Strict?],\ |
---|
67 | # [InternalAttr1, InternalAttr1Type, Strict?], [InternalAttr2, InternalAttr2Type, Strict?]] |
---|
68 | # |
---|
69 | # Attributes are placed in the newick as follows: |
---|
70 | # |
---|
71 | # .... ,LeafAttr1:LeafAttr2)InternalAttr1:InternalAttr2 ... |
---|
72 | # |
---|
73 | # |
---|
74 | # /-A |
---|
75 | # -NoName--| |
---|
76 | # | /-B |
---|
77 | # \C-------| |
---|
78 | # | /-D |
---|
79 | # \E-------| |
---|
80 | # \-G |
---|
81 | # |
---|
82 | # Format 0 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)1.000000:0.642905)1.000000:0.567737); |
---|
83 | # Format 1 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)E:0.642905)C:0.567737); |
---|
84 | # Format 2 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)1.000000:0.642905)1.000000:0.567737); |
---|
85 | # Format 3 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)E:0.642905)C:0.567737); |
---|
86 | # Format 4 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729))); |
---|
87 | # Format 5 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729):0.642905):0.567737); |
---|
88 | # Format 6 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)E)C); |
---|
89 | # Format 7 = (A,(B,(D,G)E)C); |
---|
90 | # Format 8 = (A,(B,(D,G))); |
---|
91 | # Format 9 = (,(,(,))); |
---|
92 | |
---|
93 | NW_FORMAT = { |
---|
94 | 0: [['name', str, True], ["dist", float, True], ['support', float, True], ["dist", float, True]], # Flexible with support |
---|
95 | 1: [['name', str, True], ["dist", float, True], ['name', str, True], ["dist", float, True]], # Flexible with internal node names |
---|
96 | 2: [['name', str, False], ["dist", float, False], ['support', float, False], ["dist", float, False]],# Strict with support values |
---|
97 | 3: [['name', str, False], ["dist", float, False], ['name', str, False], ["dist", float, False]], # Strict with internal node names |
---|
98 | 4: [['name', str, False], ["dist", float, False], [None, None, False], [None, None, False]], |
---|
99 | 5: [['name', str, False], ["dist", float, False], [None, None, False], ["dist", float, False]], |
---|
100 | 6: [['name', str, False], [None, None, False], [None, None, False], ["dist", float, False]], |
---|
101 | 7: [['name', str, False], ["dist", float, False], ["name", str, False], [None, None, False]], |
---|
102 | 8: [['name', str, False], [None, None, False], ["name", str, False], [None, None, False]], |
---|
103 | 9: [['name', str, False], [None, None, False], [None, None, False], [None, None, False]], # Only topology with node names |
---|
104 | 100: [[None, None, False], [None, None, False], [None, None, False], [None, None, False]] # Only Topology |
---|
105 | } |
---|
106 | |
---|
107 | |
---|
108 | def format_node(node, node_type, format): |
---|
109 | if node_type == "leaf": |
---|
110 | container1 = NW_FORMAT[format][0][0] |
---|
111 | container2 = NW_FORMAT[format][1][0] |
---|
112 | converterFn1 = NW_FORMAT[format][0][1] |
---|
113 | converterFn2 = NW_FORMAT[format][1][1] |
---|
114 | else: |
---|
115 | container1 = NW_FORMAT[format][2][0] |
---|
116 | container2 = NW_FORMAT[format][3][0] |
---|
117 | converterFn1 = NW_FORMAT[format][2][1] |
---|
118 | converterFn2 = NW_FORMAT[format][3][1] |
---|
119 | |
---|
120 | if converterFn1 == str: |
---|
121 | try: |
---|
122 | FIRST_PART = re.sub("["+_ILEGAL_NEWICK_CHARS+"]", "_", \ |
---|
123 | str(getattr(node, container1))) |
---|
124 | except (AttributeError, TypeError): |
---|
125 | FIRST_PART = "?" |
---|
126 | |
---|
127 | elif converterFn1 is None: |
---|
128 | FIRST_PART = "" |
---|
129 | else: |
---|
130 | try: |
---|
131 | #FIRST_PART = "%0.6f" %(converterFn2(getattr(node, container1))) |
---|
132 | FIRST_PART = "%g" %(converterFn2(getattr(node, container1))) |
---|
133 | except (ValueError, TypeError): |
---|
134 | FIRST_PART = "?" |
---|
135 | |
---|
136 | |
---|
137 | if converterFn2 == str: |
---|
138 | try: |
---|
139 | SECOND_PART = ":"+re.sub("["+_ILEGAL_NEWICK_CHARS+"]", "_", \ |
---|
140 | str(getattr(node, container2))) |
---|
141 | except (ValueError, TypeError): |
---|
142 | SECOND_PART = ":?" |
---|
143 | elif converterFn2 is None: |
---|
144 | SECOND_PART = "" |
---|
145 | else: |
---|
146 | try: |
---|
147 | #SECOND_PART = ":%0.6f" %(converterFn2(getattr(node, container2))) |
---|
148 | SECOND_PART = ":%g" %(converterFn2(getattr(node, container2))) |
---|
149 | except (ValueError, TypeError): |
---|
150 | SECOND_PART = ":?" |
---|
151 | |
---|
152 | return "%s%s" %(FIRST_PART, SECOND_PART) |
---|
153 | |
---|
154 | # Used to write into specific formats |
---|
155 | def node2leafformat(node, format): |
---|
156 | safe_name = re.sub("["+_ILEGAL_NEWICK_CHARS+"]", "_", \ |
---|
157 | str(getattr(node, "name"))) |
---|
158 | |
---|
159 | if format == 0 or format == 1 or format == 2 or format ==3: |
---|
160 | return "%s:%0.6f" %(safe_name, node.dist) |
---|
161 | elif format == 4 or format == 7: |
---|
162 | return ":%0.6f" %(node.dist) |
---|
163 | elif format == 5 or format == 6: |
---|
164 | return "%s" %(safe_name) |
---|
165 | |
---|
166 | def node2internalformat(node, format): |
---|
167 | safe_name = re.sub("["+_ILEGAL_NEWICK_CHARS+"]", "_", \ |
---|
168 | str(getattr(node, "name"))) |
---|
169 | if format == 0 or format == 1: |
---|
170 | return "%0.6f:%0.6f" %(node.support, node.dist) |
---|
171 | elif format == 2: |
---|
172 | return "%s:%0.6f" %(safe_name, node.dist) |
---|
173 | elif format == 3 or format == 4: |
---|
174 | return ":%0.6f" %(node.dist) |
---|
175 | elif format == 5: |
---|
176 | return "%s" %(safe_name) |
---|
177 | elif format == 6 or format == 7: |
---|
178 | return "" |
---|
179 | |
---|
180 | def print_supported_formats(): |
---|
181 | from ete2.coretype.tree import TreeNode |
---|
182 | t = TreeNode() |
---|
183 | t.populate(4, "ABCDEFGHI") |
---|
184 | print t |
---|
185 | for f in NW_FORMAT: |
---|
186 | print "Format", f,"=", write_newick(t, features=None, format=f) |
---|
187 | |
---|
188 | class NewickError(Exception): |
---|
189 | """Exception class designed for NewickIO errors.""" |
---|
190 | pass |
---|
191 | |
---|
192 | def read_newick(newick, root_node=None, format=0): |
---|
193 | """ Reads a newick tree from either a string or a file, and returns |
---|
194 | an ETE tree structure. |
---|
195 | |
---|
196 | A previously existent node object can be passed as the root of the |
---|
197 | tree, which means that all its new children will belong to the same |
---|
198 | class as the root(This allows to work with custom TreeNode |
---|
199 | objects). |
---|
200 | |
---|
201 | You can also take advantage from this behaviour to concatenate |
---|
202 | several tree structures. |
---|
203 | """ |
---|
204 | |
---|
205 | if root_node is None: |
---|
206 | from ete2.coretype.tree import TreeNode |
---|
207 | root_node = TreeNode() |
---|
208 | |
---|
209 | if isinstance(newick, basestring): |
---|
210 | if os.path.exists(newick): |
---|
211 | nw = open(newick, 'rU').read() |
---|
212 | else: |
---|
213 | nw = newick |
---|
214 | nw = nw.strip() |
---|
215 | if not nw.startswith('(') and nw.endswith(';'): |
---|
216 | return _read_node_data(nw, root_node, "single", format) |
---|
217 | |
---|
218 | elif not nw.startswith('(') or not nw.endswith(';'): |
---|
219 | raise NewickError, \ |
---|
220 | 'Unexisting tree file or Malformed newick tree structure.' |
---|
221 | else: |
---|
222 | return _read_newick_from_string(nw, root_node, format) |
---|
223 | |
---|
224 | else: |
---|
225 | raise NewickError, \ |
---|
226 | "'newick' argument must be either a filename or a newick string." |
---|
227 | |
---|
228 | def _read_newick_from_string(nw, root_node, format): |
---|
229 | """ Reads a newick string in the New Hampshire format. """ |
---|
230 | |
---|
231 | if nw.count('(') != nw.count(')'): |
---|
232 | raise NewickError, 'Parentheses do not match. Broken tree structure' |
---|
233 | |
---|
234 | # white spaces and separators are removed |
---|
235 | nw = re.sub("[\n\r\t]+", "", nw) |
---|
236 | |
---|
237 | current_parent = None |
---|
238 | |
---|
239 | # Ok, this is my own way of reading newick structures. I find it |
---|
240 | # more flexible and elegant than other docummented methods. Don't |
---|
241 | # know if I'm loosing much efficiency. It Starts by splitting the |
---|
242 | # structure using open parentheses. Each of the resulting chunks |
---|
243 | # represent an internal node. So for each chunk I create a new node |
---|
244 | # that hungs from the current parent node. Each internal node chunk |
---|
245 | # may contain information about terminal nodes hanging from the |
---|
246 | # internal and clossing parenthessis (closing previously opened |
---|
247 | # internal nodes). |
---|
248 | # |
---|
249 | # Enjoy. |
---|
250 | # by JHC ;) |
---|
251 | |
---|
252 | # Skip the first chunk. It is always == '' |
---|
253 | for internal_node in nw.split("(")[1:]: |
---|
254 | # If this is the root of tree, use the root_node instead of |
---|
255 | # creating it, otherwise make a new one. |
---|
256 | if current_parent is None: |
---|
257 | current_parent = root_node |
---|
258 | else: |
---|
259 | current_parent = current_parent.add_child() |
---|
260 | # We can only find leaf nodes within this chunk, since rest of |
---|
261 | # internal nodes will be in the next newick chunks |
---|
262 | possible_leaves = internal_node.split(",") |
---|
263 | for i, leaf in enumerate(possible_leaves): |
---|
264 | # Any resulting sub-chunk resulting from splitting by commas can |
---|
265 | # be considered (tpologically) as a child to the current parent |
---|
266 | # node. We only discard chunks if they are empty and in the last |
---|
267 | # possition, meaining that the next brother is not terminal bu |
---|
268 | # internal node (will be visited in the next newick chunk) |
---|
269 | if leaf.strip() == '' and i == len(possible_leaves)-1: |
---|
270 | continue |
---|
271 | # Leaf text strings may end with a variable number of clossing |
---|
272 | # parenthesis. For each ')' we read the information of the |
---|
273 | # current node, close it and go up one more node. |
---|
274 | clossing_nodes = leaf.split(")") |
---|
275 | # first par contain leaf info |
---|
276 | _read_node_data(clossing_nodes[0], current_parent, "leaf", format) |
---|
277 | # The next parts containg clossing nodes and info about the |
---|
278 | # internal nodes. |
---|
279 | if len(clossing_nodes)>1: |
---|
280 | for closing_internal in clossing_nodes[1:]: |
---|
281 | if closing_internal.strip() ==";": continue |
---|
282 | _read_node_data(closing_internal, current_parent, "internal", format) |
---|
283 | current_parent = current_parent.up |
---|
284 | return root_node |
---|
285 | |
---|
286 | def _parse_extra_features(node, NHX_string): |
---|
287 | """ Reads node's extra data form its NHX string. NHX uses this |
---|
288 | format: [&&NHX:prop1=value1:prop2=value2] """ |
---|
289 | NHX_string = NHX_string.replace("[&&NHX:", "") |
---|
290 | NHX_string = NHX_string.replace("]", "") |
---|
291 | for field in NHX_string.split(":"): |
---|
292 | try: |
---|
293 | pname, pvalue = field.split("=") |
---|
294 | except ValueError, e: |
---|
295 | print NHX_string, field.split("=") |
---|
296 | raise ValueError, e |
---|
297 | node.add_feature(pname, pvalue) |
---|
298 | |
---|
299 | def _read_node_data(subnw, current_node, node_type, format): |
---|
300 | """ Reads a leaf node from a subpart of the original newick |
---|
301 | tree """ |
---|
302 | |
---|
303 | if node_type == "leaf" or node_type == "single": |
---|
304 | if node_type == "leaf": |
---|
305 | node = current_node.add_child() |
---|
306 | else: |
---|
307 | node = current_node |
---|
308 | container1 = NW_FORMAT[format][0][0] |
---|
309 | container2 = NW_FORMAT[format][1][0] |
---|
310 | converterFn1 = NW_FORMAT[format][0][1] |
---|
311 | converterFn2 = NW_FORMAT[format][1][1] |
---|
312 | flexible1 = NW_FORMAT[format][0][2] |
---|
313 | flexible2 = NW_FORMAT[format][1][2] |
---|
314 | else: |
---|
315 | node = current_node |
---|
316 | container1 = NW_FORMAT[format][2][0] |
---|
317 | container2 = NW_FORMAT[format][3][0] |
---|
318 | converterFn1 = NW_FORMAT[format][2][1] |
---|
319 | converterFn2 = NW_FORMAT[format][3][1] |
---|
320 | flexible1 = NW_FORMAT[format][2][2] |
---|
321 | flexible2 = NW_FORMAT[format][3][2] |
---|
322 | |
---|
323 | if converterFn1 == str: |
---|
324 | FIRST_MATCH = "("+_NAME_RE+")" |
---|
325 | elif converterFn1 == float: |
---|
326 | FIRST_MATCH = "("+_FLOAT_RE+")" |
---|
327 | elif converterFn1 is None: |
---|
328 | FIRST_MATCH = '()' |
---|
329 | |
---|
330 | if converterFn2 == str: |
---|
331 | SECOND_MATCH = "(:"+_NAME_RE+")" |
---|
332 | elif converterFn2 == float: |
---|
333 | SECOND_MATCH = "(:"+_FLOAT_RE+")" |
---|
334 | elif converterFn2 is None: |
---|
335 | SECOND_MATCH = '()' |
---|
336 | |
---|
337 | if flexible1: |
---|
338 | FIRST_MATCH += "?" |
---|
339 | if flexible2: |
---|
340 | SECOND_MATCH += "?" |
---|
341 | |
---|
342 | MATCH = '%s\s*%s\s*(%s)?' % (FIRST_MATCH, SECOND_MATCH, _NHX_RE) |
---|
343 | data = re.match(MATCH, subnw) |
---|
344 | if data: |
---|
345 | data = data.groups() |
---|
346 | if data[0] is not None and data[0] != '': |
---|
347 | node.add_feature(container1, converterFn1(data[0].strip())) |
---|
348 | |
---|
349 | if data[1] is not None and data[1] != '': |
---|
350 | node.add_feature(container2, converterFn2(data[1][1:].strip())) |
---|
351 | |
---|
352 | if data[2] is not None \ |
---|
353 | and data[2].startswith("[&&NHX"): |
---|
354 | _parse_extra_features(node, data[2]) |
---|
355 | else: |
---|
356 | raise NewickError, "Unexpected leaf node format:\n\t"+ subnw[0:50] + "[%s]" %format |
---|
357 | return |
---|
358 | |
---|
359 | # def write_newick_recursive(node, features=None, format=1, _is_root=True): |
---|
360 | # """ Recursively reads a tree structure and returns its NHX |
---|
361 | # representation. """ |
---|
362 | # newick = "" |
---|
363 | # if not node.children: |
---|
364 | # safe_name = re.sub("["+_ILEGAL_NEWICK_CHARS+"]", "_", \ |
---|
365 | # str(getattr(node, "name"))) |
---|
366 | |
---|
367 | # newick += format_node(node, "leaf", format) |
---|
368 | # newick += _get_features_string(node, features) |
---|
369 | # #return newick |
---|
370 | |
---|
371 | # else: |
---|
372 | # if node.children: |
---|
373 | # newick+= "(" |
---|
374 | # for cnode in node.children: |
---|
375 | # newick += write_newick(cnode, features, format=format,\ |
---|
376 | # _is_root = False) |
---|
377 | # # After last child is processed, add closing string |
---|
378 | # if cnode == node.children[-1]: |
---|
379 | # newick += ")" |
---|
380 | # if node.up is not None: |
---|
381 | # newick += format_node(node, "internal", format) |
---|
382 | # newick += _get_features_string(node, features) |
---|
383 | # else: |
---|
384 | # newick += ',' |
---|
385 | |
---|
386 | # if _is_root: |
---|
387 | # newick += ";" |
---|
388 | # return newick |
---|
389 | |
---|
390 | def write_newick(rootnode, features=None, format=1, format_root_node=True, |
---|
391 | is_leaf_fn=None): |
---|
392 | """ Iteratively export a tree structure and returns its NHX |
---|
393 | representation. """ |
---|
394 | newick = [] |
---|
395 | leaf = is_leaf_fn if is_leaf_fn else lambda n: not bool(n.children) |
---|
396 | for postorder, node in rootnode.iter_prepostorder(is_leaf_fn=is_leaf_fn): |
---|
397 | if postorder: |
---|
398 | newick.append(")") |
---|
399 | if node.up is not None or format_root_node: |
---|
400 | newick.append(format_node(node, "internal", format)) |
---|
401 | newick.append(_get_features_string(node, features)) |
---|
402 | else: |
---|
403 | if node is not rootnode and node != node.up.children[0]: |
---|
404 | newick.append(",") |
---|
405 | |
---|
406 | if leaf(node): |
---|
407 | safe_name = re.sub("["+_ILEGAL_NEWICK_CHARS+"]", "_", \ |
---|
408 | str(getattr(node, "name"))) |
---|
409 | newick.append(format_node(node, "leaf", format)) |
---|
410 | newick.append(_get_features_string(node, features)) |
---|
411 | else: |
---|
412 | newick.append("(") |
---|
413 | |
---|
414 | newick.append(";") |
---|
415 | return ''.join(newick) |
---|
416 | |
---|
417 | def _get_features_string(self, features=None): |
---|
418 | """ Generates the extended newick string NHX with extra data about |
---|
419 | a node. """ |
---|
420 | string = "" |
---|
421 | if features is None: |
---|
422 | features = [] |
---|
423 | elif features == []: |
---|
424 | features = self.features |
---|
425 | |
---|
426 | for pr in features: |
---|
427 | if hasattr(self, pr): |
---|
428 | value = re.sub("["+_ILEGAL_NEWICK_CHARS+"]", "_", \ |
---|
429 | str(getattr(self, pr))) |
---|
430 | if string != "": |
---|
431 | string +=":" |
---|
432 | string +="%s=%s" %(pr, str(value)) |
---|
433 | if string != "": |
---|
434 | string = "[&&NHX:"+string+"]" |
---|
435 | |
---|
436 | return string |
---|