source: branches/items/GDE/SATIVA/sativa/epac/ete2/newick.py

Last change on this file was 12906, checked in by akozlov, 10 years ago

add sativa files and scripts for ARB integration

File size: 16.6 KB
Line 
1__VERSION__="ete2-2.2rev1026" 
2# -*- coding: utf-8 -*-
3# #START_LICENSE###########################################################
4#
5#
6# This file is part of the Environment for Tree Exploration program
7# (ETE).  http://ete.cgenomics.org
8
9# ETE is free software: you can redistribute it and/or modify it
10# under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13
14# ETE is distributed in the hope that it will be useful, but WITHOUT
15# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
17# License for more details.
18
19# You should have received a copy of the GNU General Public License
20# along with ETE.  If not, see <http://www.gnu.org/licenses/>.
21#
22#
23#                     ABOUT THE ETE PACKAGE
24#                     =====================
25#
26# ETE is distributed under the GPL copyleft license (2008-2011). 
27#
28# If you make use of ETE in published work, please cite:
29#
30# Jaime Huerta-Cepas, Joaquin Dopazo and Toni Gabaldon.
31# ETE: a python Environment for Tree Exploration. Jaime BMC
32# Bioinformatics 2010,:24doi:10.1186/1471-2105-11-24
33#
34# Note that extra references to the specific methods implemented in
35# the toolkit are available in the documentation.
36#
37# More info at http://ete.cgenomics.org
38#
39#
40# #END_LICENSE#############################################################
41
42import re
43import os
44import base64 
45
46__all__ = ["read_newick", "write_newick", "print_supported_formats"]
47
48# Regular expressions used for reading newick format
49_ILEGAL_NEWICK_CHARS = ":;(),\[\]\t\n\r="
50_NON_PRINTABLE_CHARS_RE = "[\x00-\x1f]+"
51
52_NHX_RE = "\[&&NHX:[^\]]*\]"
53_FLOAT_RE = "[+-]?\d+\.?\d*(?:[eE][-+]\d+)?"
54#_FLOAT_RE = "[+-]?\d+\.?\d*"
55_NAME_RE = "[^():,;\[\]]+"
56
57DEFAULT_DIST = 1.0
58DEFAULT_NAME = ''
59DEFAULT_SUPPORT = 1.0
60
61
62# Allowed formats. This table is used to read and write newick using
63# different convenctions. You can also add your own formats in an easy way.
64#
65#
66# FORMAT: [[LeafAttr1, LeafAttr1Type, Strict?], [LeafAttr2, LeafAttr2Type, Strict?],\
67#    [InternalAttr1, InternalAttr1Type, Strict?], [InternalAttr2, InternalAttr2Type, Strict?]]
68#
69# Attributes are placed in the newick as follows:
70#
71# .... ,LeafAttr1:LeafAttr2)InternalAttr1:InternalAttr2 ...
72#
73#
74#           /-A
75# -NoName--|
76#          |          /-B
77#           \C-------|
78#                    |          /-D
79#                     \E-------|
80#                               \-G
81#
82# Format 0 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)1.000000:0.642905)1.000000:0.567737);
83# Format 1 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)E:0.642905)C:0.567737);
84# Format 2 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)1.000000:0.642905)1.000000:0.567737);
85# Format 3 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)E:0.642905)C:0.567737);
86# Format 4 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)));
87# Format 5 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729):0.642905):0.567737);
88# Format 6 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)E)C);
89# Format 7 = (A,(B,(D,G)E)C);
90# Format 8 = (A,(B,(D,G)));
91# Format 9 = (,(,(,)));
92
93NW_FORMAT = {
94  0: [['name', str, True],  ["dist", float, True],    ['support', float, True],   ["dist", float, True]], # Flexible with support
95  1: [['name', str, True],  ["dist", float, True],    ['name', str, True],      ["dist", float, True]], # Flexible with internal node names
96  2: [['name', str, False], ["dist", float, False],   ['support', float, False],  ["dist", float, False]],# Strict with support values
97  3: [['name', str, False], ["dist", float, False],   ['name', str, False],     ["dist", float, False]], # Strict with internal node names
98  4: [['name', str, False], ["dist", float, False],   [None, None, False],        [None, None, False]],
99  5: [['name', str, False], ["dist", float, False],   [None, None, False],        ["dist", float, False]],
100  6: [['name', str, False], [None, None, False],      [None, None, False],        ["dist", float, False]],
101  7: [['name', str, False], ["dist", float, False],   ["name", str, False],       [None, None, False]],
102  8: [['name', str, False], [None, None, False],      ["name", str, False],       [None, None, False]],
103  9: [['name', str, False], [None, None, False],      [None, None, False],        [None, None, False]], # Only topology with node names
104  100: [[None, None, False],  [None, None, False],      [None, None, False],        [None, None, False]] # Only Topology
105}
106
107
108def format_node(node, node_type, format):
109    if node_type == "leaf":
110        container1 = NW_FORMAT[format][0][0]
111        container2 = NW_FORMAT[format][1][0]
112        converterFn1 = NW_FORMAT[format][0][1]
113        converterFn2 = NW_FORMAT[format][1][1]
114    else:
115        container1 = NW_FORMAT[format][2][0]
116        container2 = NW_FORMAT[format][3][0]
117        converterFn1 = NW_FORMAT[format][2][1]
118        converterFn2 = NW_FORMAT[format][3][1]
119
120    if converterFn1 == str:
121        try:
122            FIRST_PART = re.sub("["+_ILEGAL_NEWICK_CHARS+"]", "_", \
123                                  str(getattr(node, container1)))
124        except (AttributeError, TypeError):
125            FIRST_PART = "?"
126
127    elif converterFn1 is None:
128        FIRST_PART = ""
129    else:
130        try:
131            #FIRST_PART =  "%0.6f" %(converterFn2(getattr(node, container1)))
132            FIRST_PART =  "%g" %(converterFn2(getattr(node, container1)))
133        except (ValueError, TypeError):
134            FIRST_PART = "?"
135
136
137    if converterFn2 == str:
138        try:
139            SECOND_PART = ":"+re.sub("["+_ILEGAL_NEWICK_CHARS+"]", "_", \
140                                  str(getattr(node, container2)))
141        except (ValueError, TypeError):
142            SECOND_PART = ":?"
143    elif converterFn2 is None:
144        SECOND_PART = ""
145    else:
146        try:
147            #SECOND_PART = ":%0.6f" %(converterFn2(getattr(node, container2)))
148            SECOND_PART = ":%g" %(converterFn2(getattr(node, container2)))
149        except (ValueError, TypeError):
150            SECOND_PART = ":?"
151
152    return "%s%s" %(FIRST_PART, SECOND_PART)
153
154# Used to write into specific formats
155def node2leafformat(node, format):
156    safe_name = re.sub("["+_ILEGAL_NEWICK_CHARS+"]", "_", \
157                             str(getattr(node, "name")))
158
159    if format == 0 or format == 1 or format == 2 or format ==3:
160        return "%s:%0.6f" %(safe_name, node.dist)
161    elif format == 4 or format == 7:
162        return ":%0.6f" %(node.dist)
163    elif format == 5 or format == 6:
164        return "%s" %(safe_name)
165
166def node2internalformat(node, format):
167    safe_name = re.sub("["+_ILEGAL_NEWICK_CHARS+"]", "_", \
168                             str(getattr(node, "name")))
169    if format == 0 or format == 1:
170        return "%0.6f:%0.6f" %(node.support, node.dist)
171    elif format == 2:
172        return "%s:%0.6f" %(safe_name, node.dist)
173    elif format == 3 or format == 4:
174        return ":%0.6f" %(node.dist)
175    elif format == 5:
176        return "%s" %(safe_name)
177    elif format == 6 or format == 7:
178        return ""
179
180def print_supported_formats():
181    from ete2.coretype.tree import TreeNode
182    t = TreeNode()
183    t.populate(4, "ABCDEFGHI")
184    print t
185    for f in NW_FORMAT:
186        print "Format", f,"=", write_newick(t, features=None, format=f)
187
188class NewickError(Exception):
189    """Exception class designed for NewickIO errors."""
190    pass
191
192def read_newick(newick, root_node=None, format=0):
193    """ Reads a newick tree from either a string or a file, and returns
194    an ETE tree structure.
195
196    A previously existent node object can be passed as the root of the
197    tree, which means that all its new children will belong to the same
198    class as the root(This allows to work with custom TreeNode
199    objects).
200
201    You can also take advantage from this behaviour to concatenate
202    several tree structures.
203    """
204
205    if root_node is None:
206        from ete2.coretype.tree import TreeNode
207        root_node = TreeNode()
208
209    if isinstance(newick, basestring):
210        if os.path.exists(newick):
211            nw = open(newick, 'rU').read()
212        else:
213            nw = newick
214        nw = nw.strip()
215        if not nw.startswith('(') and nw.endswith(';'):
216            return _read_node_data(nw, root_node, "single", format)
217           
218        elif not nw.startswith('(') or not nw.endswith(';'):
219            raise NewickError, \
220            'Unexisting tree file or Malformed newick tree structure.'
221        else:
222            return _read_newick_from_string(nw, root_node, format)
223
224    else:
225        raise NewickError, \
226            "'newick' argument must be either a filename or a newick string."
227
228def _read_newick_from_string(nw, root_node, format):
229    """ Reads a newick string in the New Hampshire format. """
230
231    if nw.count('(') != nw.count(')'):
232        raise NewickError, 'Parentheses do not match. Broken tree structure'
233
234    # white spaces and separators are removed
235    nw = re.sub("[\n\r\t]+", "", nw)
236
237    current_parent = None
238
239    # Ok, this is my own way of reading newick structures. I find it
240    # more flexible and elegant than other docummented methods. Don't
241    # know if I'm loosing much efficiency. It Starts by splitting the
242    # structure using open parentheses. Each of the resulting chunks
243    # represent an internal node. So for each chunk I create a new node
244    # that hungs from the current parent node.  Each internal node chunk
245    # may contain information about terminal nodes hanging from the
246    # internal and clossing parenthessis (closing previously opened
247    # internal nodes).
248    #
249    # Enjoy.
250    # by JHC ;)
251
252    # Skip the first chunk. It is always == ''
253    for internal_node in nw.split("(")[1:]:
254        # If this is the root of tree, use the root_node instead of
255        # creating it, otherwise make a new one.
256        if current_parent is None:
257            current_parent = root_node
258        else:
259            current_parent = current_parent.add_child()
260        # We can only find leaf nodes within this chunk, since rest of
261        # internal nodes will be in the next newick chunks
262        possible_leaves = internal_node.split(",")
263        for i, leaf in enumerate(possible_leaves):
264            # Any resulting sub-chunk resulting from splitting by commas can
265            # be considered (tpologically) as a child to the current parent
266            # node. We only discard chunks if they are empty and in the last
267            # possition, meaining that the next brother is not terminal bu
268            # internal node (will be visited in the next newick chunk)
269            if leaf.strip() == '' and i == len(possible_leaves)-1:
270                continue
271            # Leaf text strings may end with a variable number of clossing
272            # parenthesis. For each ')' we read the information of the
273            # current node, close it and go up one more node.
274            clossing_nodes = leaf.split(")")
275            # first par contain leaf info
276            _read_node_data(clossing_nodes[0], current_parent, "leaf", format)
277            # The next parts containg clossing nodes and info about the
278            # internal nodes.
279            if len(clossing_nodes)>1:
280                for closing_internal in clossing_nodes[1:]:
281                    if closing_internal.strip() ==";": continue
282                    _read_node_data(closing_internal, current_parent, "internal", format)
283                    current_parent = current_parent.up
284    return root_node
285
286def _parse_extra_features(node, NHX_string):
287    """ Reads node's extra data form its NHX string. NHX uses this
288    format:  [&&NHX:prop1=value1:prop2=value2] """
289    NHX_string = NHX_string.replace("[&&NHX:", "")
290    NHX_string = NHX_string.replace("]", "")
291    for field in NHX_string.split(":"):
292        try:
293            pname, pvalue = field.split("=")
294        except ValueError, e:
295            print NHX_string, field.split("=")
296            raise ValueError, e
297        node.add_feature(pname, pvalue)
298
299def _read_node_data(subnw, current_node, node_type, format):
300    """ Reads a leaf node from a subpart of the original newick
301    tree """
302
303    if node_type == "leaf" or node_type == "single":
304        if node_type == "leaf":
305            node = current_node.add_child()
306        else:
307            node = current_node
308        container1 = NW_FORMAT[format][0][0]
309        container2 = NW_FORMAT[format][1][0]
310        converterFn1 = NW_FORMAT[format][0][1]
311        converterFn2 = NW_FORMAT[format][1][1]
312        flexible1 = NW_FORMAT[format][0][2]
313        flexible2 = NW_FORMAT[format][1][2]
314    else:
315        node = current_node
316        container1 = NW_FORMAT[format][2][0]
317        container2 = NW_FORMAT[format][3][0]
318        converterFn1 = NW_FORMAT[format][2][1]
319        converterFn2 = NW_FORMAT[format][3][1]
320        flexible1 = NW_FORMAT[format][2][2]
321        flexible2 = NW_FORMAT[format][3][2]
322
323    if converterFn1 == str:
324        FIRST_MATCH = "("+_NAME_RE+")"
325    elif converterFn1 == float:
326        FIRST_MATCH = "("+_FLOAT_RE+")"
327    elif converterFn1 is None:
328        FIRST_MATCH = '()'
329
330    if converterFn2 == str:
331        SECOND_MATCH = "(:"+_NAME_RE+")"
332    elif converterFn2 == float:
333        SECOND_MATCH = "(:"+_FLOAT_RE+")"
334    elif converterFn2 is None:
335        SECOND_MATCH = '()'
336
337    if flexible1:
338        FIRST_MATCH += "?"
339    if flexible2:
340        SECOND_MATCH += "?"
341
342    MATCH = '%s\s*%s\s*(%s)?' % (FIRST_MATCH, SECOND_MATCH, _NHX_RE)
343    data = re.match(MATCH, subnw)
344    if data:
345        data = data.groups()
346        if data[0] is not None and data[0] != '':
347            node.add_feature(container1, converterFn1(data[0].strip()))
348
349        if data[1] is not None and data[1] != '':
350            node.add_feature(container2, converterFn2(data[1][1:].strip()))
351
352        if data[2] is not None \
353                and data[2].startswith("[&&NHX"):
354            _parse_extra_features(node, data[2])
355    else:
356        raise NewickError, "Unexpected leaf node format:\n\t"+ subnw[0:50] + "[%s]" %format
357    return
358
359# def write_newick_recursive(node, features=None, format=1, _is_root=True):
360#     """ Recursively reads a tree structure and returns its NHX
361#     representation. """
362#     newick = ""
363#     if not node.children:
364#         safe_name = re.sub("["+_ILEGAL_NEWICK_CHARS+"]", "_", \
365#                                str(getattr(node, "name")))
366
367#         newick += format_node(node, "leaf", format)
368#         newick += _get_features_string(node, features)
369#         #return newick
370       
371#     else:
372#         if node.children:
373#             newick+= "("
374#         for cnode in node.children:
375#             newick += write_newick(cnode, features, format=format,\
376#                                      _is_root = False)
377#             # After last child is processed, add closing string
378#             if cnode == node.children[-1]:
379#                 newick += ")"
380#                 if node.up is not None:
381#                     newick += format_node(node, "internal", format)
382#                 newick += _get_features_string(node, features)
383#             else:
384#                 newick += ','
385               
386#     if _is_root:
387#         newick += ";"
388#     return newick
389
390def write_newick(rootnode, features=None, format=1, format_root_node=True,
391                 is_leaf_fn=None):
392    """ Iteratively export a tree structure and returns its NHX
393    representation. """
394    newick = []
395    leaf = is_leaf_fn if is_leaf_fn else lambda n: not bool(n.children)
396    for postorder, node in rootnode.iter_prepostorder(is_leaf_fn=is_leaf_fn):
397        if postorder:
398            newick.append(")")
399            if node.up is not None or format_root_node:
400                newick.append(format_node(node, "internal", format))
401                newick.append(_get_features_string(node, features))
402        else:
403            if node is not rootnode and node != node.up.children[0]:
404                newick.append(",")
405               
406            if leaf(node):
407                safe_name = re.sub("["+_ILEGAL_NEWICK_CHARS+"]", "_", \
408                               str(getattr(node, "name")))
409                newick.append(format_node(node, "leaf", format))
410                newick.append(_get_features_string(node, features))
411            else:
412                newick.append("(")
413
414    newick.append(";")
415    return ''.join(newick)
416   
417def _get_features_string(self, features=None):
418    """ Generates the extended newick string NHX with extra data about
419    a node. """
420    string = ""
421    if features is None:
422        features = []
423    elif features == []:
424        features = self.features
425
426    for pr in features:
427        if hasattr(self, pr):
428            value = re.sub("["+_ILEGAL_NEWICK_CHARS+"]", "_", \
429                             str(getattr(self, pr)))
430            if string != "":
431                string +=":"
432            string +="%s=%s"  %(pr, str(value))
433    if string != "":
434        string = "[&&NHX:"+string+"]"
435
436    return string
Note: See TracBrowser for help on using the repository browser.