Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

source: trunk/GDE/SATIVA/sativa/epac/ete2/newick.py

Visit:

Last change on this file was 12906, checked in by akozlov, 10 years ago
add sativa files and scripts for ARB integration
File size: 16.6 KB

Line
1	__VERSION__="ete2-2.2rev1026"
2	# -- coding: utf-8 --
3	# #START_LICENSE###########################################################
4	#
5	#
6	# This file is part of the Environment for Tree Exploration program
7	# (ETE). http://ete.cgenomics.org
8	#
9	# ETE is free software: you can redistribute it and/or modify it
10	# under the terms of the GNU General Public License as published by
11	# the Free Software Foundation, either version 3 of the License, or
12	# (at your option) any later version.
13	#
14	# ETE is distributed in the hope that it will be useful, but WITHOUT
15	# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16	# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
17	# License for more details.
18	#
19	# You should have received a copy of the GNU General Public License
20	# along with ETE. If not, see <http://www.gnu.org/licenses/>.
21	#
22	#
23	# ABOUT THE ETE PACKAGE
24	# =====================
25	#
26	# ETE is distributed under the GPL copyleft license (2008-2011).
27	#
28	# If you make use of ETE in published work, please cite:
29	#
30	# Jaime Huerta-Cepas, Joaquin Dopazo and Toni Gabaldon.
31	# ETE: a python Environment for Tree Exploration. Jaime BMC
32	# Bioinformatics 2010,:24doi:10.1186/1471-2105-11-24
33	#
34	# Note that extra references to the specific methods implemented in
35	# the toolkit are available in the documentation.
36	#
37	# More info at http://ete.cgenomics.org
38	#
39	#
40	# #END_LICENSE#############################################################
41
42	import re
43	import os
44	import base64
45
46	__all__ = ["read_newick", "write_newick", "print_supported_formats"]
47
48	# Regular expressions used for reading newick format
49	_ILEGAL_NEWICK_CHARS = ":;(),\[\]\t\n\r="
50	_NON_PRINTABLE_CHARS_RE = "[\x00-\x1f]+"
51
52	_NHX_RE = "\[&&NHX:[^\]]*\]"
53	_FLOAT_RE = "[+-]?\d+\.?\d*(?:[eE][-+]\d+)?"
54	#_FLOAT_RE = "[+-]?\d+\.?\d*"
55	_NAME_RE = "[^():,;\[\]]+"
56
57	DEFAULT_DIST = 1.0
58	DEFAULT_NAME = ''
59	DEFAULT_SUPPORT = 1.0
60
61
62	# Allowed formats. This table is used to read and write newick using
63	# different convenctions. You can also add your own formats in an easy way.
64	#
65	#
66	# FORMAT: [[LeafAttr1, LeafAttr1Type, Strict?], [LeafAttr2, LeafAttr2Type, Strict?],\
67	# [InternalAttr1, InternalAttr1Type, Strict?], [InternalAttr2, InternalAttr2Type, Strict?]]
68	#
69	# Attributes are placed in the newick as follows:
70	#
71	# .... ,LeafAttr1:LeafAttr2)InternalAttr1:InternalAttr2 ...
72	#
73	#
74	# /-A
75	# -NoName--\|
76	# \| /-B
77	# \C-------\|
78	# \| /-D
79	# \E-------\|
80	# \-G
81	#
82	# Format 0 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)1.000000:0.642905)1.000000:0.567737);
83	# Format 1 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)E:0.642905)C:0.567737);
84	# Format 2 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)1.000000:0.642905)1.000000:0.567737);
85	# Format 3 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)E:0.642905)C:0.567737);
86	# Format 4 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)));
87	# Format 5 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729):0.642905):0.567737);
88	# Format 6 = (A:0.350596,(B:0.728431,(D:0.609498,G:0.125729)E)C);
89	# Format 7 = (A,(B,(D,G)E)C);
90	# Format 8 = (A,(B,(D,G)));
91	# Format 9 = (,(,(,)));
92
93	NW_FORMAT = {
94	0: [['name', str, True], ["dist", float, True], ['support', float, True], ["dist", float, True]], # Flexible with support
95	1: [['name', str, True], ["dist", float, True], ['name', str, True], ["dist", float, True]], # Flexible with internal node names
96	2: [['name', str, False], ["dist", float, False], ['support', float, False], ["dist", float, False]],# Strict with support values
97	3: [['name', str, False], ["dist", float, False], ['name', str, False], ["dist", float, False]], # Strict with internal node names
98	4: [['name', str, False], ["dist", float, False], [None, None, False], [None, None, False]],
99	5: [['name', str, False], ["dist", float, False], [None, None, False], ["dist", float, False]],
100	6: [['name', str, False], [None, None, False], [None, None, False], ["dist", float, False]],
101	7: [['name', str, False], ["dist", float, False], ["name", str, False], [None, None, False]],
102	8: [['name', str, False], [None, None, False], ["name", str, False], [None, None, False]],
103	9: [['name', str, False], [None, None, False], [None, None, False], [None, None, False]], # Only topology with node names
104	100: [[None, None, False], [None, None, False], [None, None, False], [None, None, False]] # Only Topology
105	}
106
107
108	def format_node(node, node_type, format):
109	if node_type == "leaf":
110	container1 = NW_FORMAT[format][0][0]
111	container2 = NW_FORMAT[format][1][0]
112	converterFn1 = NW_FORMAT[format][0][1]
113	converterFn2 = NW_FORMAT[format][1][1]
114	else:
115	container1 = NW_FORMAT[format][2][0]
116	container2 = NW_FORMAT[format][3][0]
117	converterFn1 = NW_FORMAT[format][2][1]
118	converterFn2 = NW_FORMAT[format][3][1]
119
120	if converterFn1 == str:
121	try:
122	FIRST_PART = re.sub("["+_ILEGAL_NEWICK_CHARS+"]", "_", \
123	str(getattr(node, container1)))
124	except (AttributeError, TypeError):
125	FIRST_PART = "?"
126
127	elif converterFn1 is None:
128	FIRST_PART = ""
129	else:
130	try:
131	#FIRST_PART = "%0.6f" %(converterFn2(getattr(node, container1)))
132	FIRST_PART = "%g" %(converterFn2(getattr(node, container1)))
133	except (ValueError, TypeError):
134	FIRST_PART = "?"
135
136
137	if converterFn2 == str:
138	try:
139	SECOND_PART = ":"+re.sub("["+_ILEGAL_NEWICK_CHARS+"]", "_", \
140	str(getattr(node, container2)))
141	except (ValueError, TypeError):
142	SECOND_PART = ":?"
143	elif converterFn2 is None:
144	SECOND_PART = ""
145	else:
146	try:
147	#SECOND_PART = ":%0.6f" %(converterFn2(getattr(node, container2)))
148	SECOND_PART = ":%g" %(converterFn2(getattr(node, container2)))
149	except (ValueError, TypeError):
150	SECOND_PART = ":?"
151
152	return "%s%s" %(FIRST_PART, SECOND_PART)
153
154	# Used to write into specific formats
155	def node2leafformat(node, format):
156	safe_name = re.sub("["+_ILEGAL_NEWICK_CHARS+"]", "_", \
157	str(getattr(node, "name")))
158
159	if format == 0 or format == 1 or format == 2 or format ==3:
160	return "%s:%0.6f" %(safe_name, node.dist)
161	elif format == 4 or format == 7:
162	return ":%0.6f" %(node.dist)
163	elif format == 5 or format == 6:
164	return "%s" %(safe_name)
165
166	def node2internalformat(node, format):
167	safe_name = re.sub("["+_ILEGAL_NEWICK_CHARS+"]", "_", \
168	str(getattr(node, "name")))
169	if format == 0 or format == 1:
170	return "%0.6f:%0.6f" %(node.support, node.dist)
171	elif format == 2:
172	return "%s:%0.6f" %(safe_name, node.dist)
173	elif format == 3 or format == 4:
174	return ":%0.6f" %(node.dist)
175	elif format == 5:
176	return "%s" %(safe_name)
177	elif format == 6 or format == 7:
178	return ""
179
180	def print_supported_formats():
181	from ete2.coretype.tree import TreeNode
182	t = TreeNode()
183	t.populate(4, "ABCDEFGHI")
184	print t
185	for f in NW_FORMAT:
186	print "Format", f,"=", write_newick(t, features=None, format=f)
187
188	class NewickError(Exception):
189	"""Exception class designed for NewickIO errors."""
190	pass
191
192	def read_newick(newick, root_node=None, format=0):
193	""" Reads a newick tree from either a string or a file, and returns
194	an ETE tree structure.
195
196	A previously existent node object can be passed as the root of the
197	tree, which means that all its new children will belong to the same
198	class as the root(This allows to work with custom TreeNode
199	objects).
200
201	You can also take advantage from this behaviour to concatenate
202	several tree structures.
203	"""
204
205	if root_node is None:
206	from ete2.coretype.tree import TreeNode
207	root_node = TreeNode()
208
209	if isinstance(newick, basestring):
210	if os.path.exists(newick):
211	nw = open(newick, 'rU').read()
212	else:
213	nw = newick
214	nw = nw.strip()
215	if not nw.startswith('(') and nw.endswith(';'):
216	return _read_node_data(nw, root_node, "single", format)
217
218	elif not nw.startswith('(') or not nw.endswith(';'):
219	raise NewickError, \
220	'Unexisting tree file or Malformed newick tree structure.'
221	else:
222	return _read_newick_from_string(nw, root_node, format)
223
224	else:
225	raise NewickError, \
226	"'newick' argument must be either a filename or a newick string."
227
228	def _read_newick_from_string(nw, root_node, format):
229	""" Reads a newick string in the New Hampshire format. """
230
231	if nw.count('(') != nw.count(')'):
232	raise NewickError, 'Parentheses do not match. Broken tree structure'
233
234	# white spaces and separators are removed
235	nw = re.sub("[\n\r\t]+", "", nw)
236
237	current_parent = None
238
239	# Ok, this is my own way of reading newick structures. I find it
240	# more flexible and elegant than other docummented methods. Don't
241	# know if I'm loosing much efficiency. It Starts by splitting the
242	# structure using open parentheses. Each of the resulting chunks
243	# represent an internal node. So for each chunk I create a new node
244	# that hungs from the current parent node. Each internal node chunk
245	# may contain information about terminal nodes hanging from the
246	# internal and clossing parenthessis (closing previously opened
247	# internal nodes).
248	#
249	# Enjoy.
250	# by JHC ;)
251
252	# Skip the first chunk. It is always == ''
253	for internal_node in nw.split("(")[1:]:
254	# If this is the root of tree, use the root_node instead of
255	# creating it, otherwise make a new one.
256	if current_parent is None:
257	current_parent = root_node
258	else:
259	current_parent = current_parent.add_child()
260	# We can only find leaf nodes within this chunk, since rest of
261	# internal nodes will be in the next newick chunks
262	possible_leaves = internal_node.split(",")
263	for i, leaf in enumerate(possible_leaves):
264	# Any resulting sub-chunk resulting from splitting by commas can
265	# be considered (tpologically) as a child to the current parent
266	# node. We only discard chunks if they are empty and in the last
267	# possition, meaining that the next brother is not terminal bu
268	# internal node (will be visited in the next newick chunk)
269	if leaf.strip() == '' and i == len(possible_leaves)-1:
270	continue
271	# Leaf text strings may end with a variable number of clossing
272	# parenthesis. For each ')' we read the information of the
273	# current node, close it and go up one more node.
274	clossing_nodes = leaf.split(")")
275	# first par contain leaf info
276	_read_node_data(clossing_nodes[0], current_parent, "leaf", format)
277	# The next parts containg clossing nodes and info about the
278	# internal nodes.
279	if len(clossing_nodes)>1:
280	for closing_internal in clossing_nodes[1:]:
281	if closing_internal.strip() ==";": continue
282	_read_node_data(closing_internal, current_parent, "internal", format)
283	current_parent = current_parent.up
284	return root_node
285
286	def _parse_extra_features(node, NHX_string):
287	""" Reads node's extra data form its NHX string. NHX uses this
288	format: [&&NHX:prop1=value1:prop2=value2] """
289	NHX_string = NHX_string.replace("[&&NHX:", "")
290	NHX_string = NHX_string.replace("]", "")
291	for field in NHX_string.split(":"):
292	try:
293	pname, pvalue = field.split("=")
294	except ValueError, e:
295	print NHX_string, field.split("=")
296	raise ValueError, e
297	node.add_feature(pname, pvalue)
298
299	def _read_node_data(subnw, current_node, node_type, format):
300	""" Reads a leaf node from a subpart of the original newick
301	tree """
302
303	if node_type == "leaf" or node_type == "single":
304	if node_type == "leaf":
305	node = current_node.add_child()
306	else:
307	node = current_node
308	container1 = NW_FORMAT[format][0][0]
309	container2 = NW_FORMAT[format][1][0]
310	converterFn1 = NW_FORMAT[format][0][1]
311	converterFn2 = NW_FORMAT[format][1][1]
312	flexible1 = NW_FORMAT[format][0][2]
313	flexible2 = NW_FORMAT[format][1][2]
314	else:
315	node = current_node
316	container1 = NW_FORMAT[format][2][0]
317	container2 = NW_FORMAT[format][3][0]
318	converterFn1 = NW_FORMAT[format][2][1]
319	converterFn2 = NW_FORMAT[format][3][1]
320	flexible1 = NW_FORMAT[format][2][2]
321	flexible2 = NW_FORMAT[format][3][2]
322
323	if converterFn1 == str:
324	FIRST_MATCH = "("+_NAME_RE+")"
325	elif converterFn1 == float:
326	FIRST_MATCH = "("+_FLOAT_RE+")"
327	elif converterFn1 is None:
328	FIRST_MATCH = '()'
329
330	if converterFn2 == str:
331	SECOND_MATCH = "(:"+_NAME_RE+")"
332	elif converterFn2 == float:
333	SECOND_MATCH = "(:"+_FLOAT_RE+")"
334	elif converterFn2 is None:
335	SECOND_MATCH = '()'
336
337	if flexible1:
338	FIRST_MATCH += "?"
339	if flexible2:
340	SECOND_MATCH += "?"
341
342	MATCH = '%s\s%s\s(%s)?' % (FIRST_MATCH, SECOND_MATCH, _NHX_RE)
343	data = re.match(MATCH, subnw)
344	if data:
345	data = data.groups()
346	if data[0] is not None and data[0] != '':
347	node.add_feature(container1, converterFn1(data[0].strip()))
348
349	if data[1] is not None and data[1] != '':
350	node.add_feature(container2, converterFn2(data[1][1:].strip()))
351
352	if data[2] is not None \
353	and data[2].startswith("[&&NHX"):
354	_parse_extra_features(node, data[2])
355	else:
356	raise NewickError, "Unexpected leaf node format:\n\t"+ subnw[0:50] + "[%s]" %format
357	return
358
359	# def write_newick_recursive(node, features=None, format=1, _is_root=True):
360	# """ Recursively reads a tree structure and returns its NHX
361	# representation. """
362	# newick = ""
363	# if not node.children:
364	# safe_name = re.sub("["+_ILEGAL_NEWICK_CHARS+"]", "_", \
365	# str(getattr(node, "name")))
366
367	# newick += format_node(node, "leaf", format)
368	# newick += _get_features_string(node, features)
369	# #return newick
370
371	# else:
372	# if node.children:
373	# newick+= "("
374	# for cnode in node.children:
375	# newick += write_newick(cnode, features, format=format,\
376	# _is_root = False)
377	# # After last child is processed, add closing string
378	# if cnode == node.children[-1]:
379	# newick += ")"
380	# if node.up is not None:
381	# newick += format_node(node, "internal", format)
382	# newick += _get_features_string(node, features)
383	# else:
384	# newick += ','
385
386	# if _is_root:
387	# newick += ";"
388	# return newick
389
390	def write_newick(rootnode, features=None, format=1, format_root_node=True,
391	is_leaf_fn=None):
392	""" Iteratively export a tree structure and returns its NHX
393	representation. """
394	newick = []
395	leaf = is_leaf_fn if is_leaf_fn else lambda n: not bool(n.children)
396	for postorder, node in rootnode.iter_prepostorder(is_leaf_fn=is_leaf_fn):
397	if postorder:
398	newick.append(")")
399	if node.up is not None or format_root_node:
400	newick.append(format_node(node, "internal", format))
401	newick.append(_get_features_string(node, features))
402	else:
403	if node is not rootnode and node != node.up.children[0]:
404	newick.append(",")
405
406	if leaf(node):
407	safe_name = re.sub("["+_ILEGAL_NEWICK_CHARS+"]", "_", \
408	str(getattr(node, "name")))
409	newick.append(format_node(node, "leaf", format))
410	newick.append(_get_features_string(node, features))
411	else:
412	newick.append("(")
413
414	newick.append(";")
415	return ''.join(newick)
416
417	def _get_features_string(self, features=None):
418	""" Generates the extended newick string NHX with extra data about
419	a node. """
420	string = ""
421	if features is None:
422	features = []
423	elif features == []:
424	features = self.features
425
426	for pr in features:
427	if hasattr(self, pr):
428	value = re.sub("["+_ILEGAL_NEWICK_CHARS+"]", "_", \
429	str(getattr(self, pr)))
430	if string != "":
431	string +=":"
432	string +="%s=%s" %(pr, str(value))
433	if string != "":
434	string = "[&&NHX:"+string+"]"
435
436	return string

Note: See TracBrowser for help on using the repository browser.

Download in other formats: