Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

source: branches/ali/GDE/SATIVA/sativa/epac/ete2/phylip.py

Visit:

Last change on this file was 12906, checked in by akozlov, 11 years ago
add sativa files and scripts for ARB integration
File size: 7.9 KB

Line
1	__VERSION__="ete2-2.2rev1026"
2	# -- coding: utf-8 --
3	# #START_LICENSE###########################################################
4	#
5	#
6	# This file is part of the Environment for Tree Exploration program
7	# (ETE). http://ete.cgenomics.org
8	#
9	# ETE is free software: you can redistribute it and/or modify it
10	# under the terms of the GNU General Public License as published by
11	# the Free Software Foundation, either version 3 of the License, or
12	# (at your option) any later version.
13	#
14	# ETE is distributed in the hope that it will be useful, but WITHOUT
15	# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16	# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
17	# License for more details.
18	#
19	# You should have received a copy of the GNU General Public License
20	# along with ETE. If not, see <http://www.gnu.org/licenses/>.
21	#
22	#
23	# ABOUT THE ETE PACKAGE
24	# =====================
25	#
26	# ETE is distributed under the GPL copyleft license (2008-2011).
27	#
28	# If you make use of ETE in published work, please cite:
29	#
30	# Jaime Huerta-Cepas, Joaquin Dopazo and Toni Gabaldon.
31	# ETE: a python Environment for Tree Exploration. Jaime BMC
32	# Bioinformatics 2010,:24doi:10.1186/1471-2105-11-24
33	#
34	# Note that extra references to the specific methods implemented in
35	# the toolkit are available in the documentation.
36	#
37	# More info at http://ete.cgenomics.org
38	#
39	#
40	# #END_LICENSE#############################################################
41
42	import os
43	import re
44	from sys import stderr as STDERR
45
46	def read_phylip(source, interleaved=True, obj=None,
47	relaxed=False, fix_duplicates=True):
48	if obj is None:
49	from ete2.coretype import SeqGroup
50	SG = SeqGroup()
51	else:
52	SG = obj
53
54	# Prepares handle from which read sequences
55	if os.path.isfile(source):
56	_source = open(source, "rU")
57	else:
58	_source = iter(source.split("\n"))
59
60	nchar, ntax = None, None
61	counter = 0
62	id_counter = 0
63	for line in _source:
64	line = line.strip("\n")
65	# Passes comments and blank lines
66	if not line or line[0] == "#":
67	continue
68	# Reads head
69	if not nchar or not ntax:
70	m = re.match("^\s*(\d+)\s+(\d+)",line)
71	if m:
72	ntax = int (m.groups()[0])
73	nchar = int (m.groups()[1])
74	else:
75	raise Exception, \
76	"A first line with the alignment dimension is required"
77	# Reads sequences
78	else:
79	if not interleaved:
80	# Reads names and sequences
81	if SG.id2name.get(id_counter, None) is None:
82	if relaxed:
83	m = re.match("^([^ ]+)(.+)", line)
84	else:
85	m = re.match("^(.{10})(.+)", line)
86	if m:
87	name = m.groups()[0].strip()
88	if fix_duplicates and name in SG.name2id:
89	tag = str(len([k for k in SG.name2id.keys() \
90	if k.endswith(name)]))
91	old_name = name
92	# Tag is in the beginning to avoid being
93	# cut it by the 10 chars limit
94	name = tag+"_"+name
95	print >>STDERR, \
96	"Duplicated entry [%s] was renamed to [%s]" %\
97	(old_name, name)
98	SG.id2name[id_counter] = name
99	SG.name2id[name] = id_counter
100	SG.id2seq[id_counter] = ""
101	line = m.groups()[1]
102	else:
103	raise Exception, \
104	"Wrong phylip sequencial format."
105	SG.id2seq[id_counter] += re.sub("\s","", line)
106	if len(SG.id2seq[id_counter]) == nchar:
107	id_counter += 1
108	name = None
109	elif len(SG.id2seq[id_counter]) > nchar:
110	raise Exception, \
111	"Unexpected length of sequence [%s] [%s]." %(name,SG.id2seq[id_counter])
112	else:
113	if len(SG)<ntax:
114	if relaxed:
115	m = re.match("^([^ ]+)(.+)", line)
116	else:
117	m = re.match("^(.{10})(.+)",line)
118	if m:
119	name = m.groups()[0].strip()
120
121	seq = re.sub("\s","",m.groups()[1])
122	SG.id2seq[id_counter] = seq
123	SG.id2name[id_counter] = name
124	if fix_duplicates and name in SG.name2id:
125	tag = str(len([k for k in SG.name2id.keys() \
126	if k.endswith(name)]))
127	old_name = name
128	name = tag+"_"+name
129	print >>STDERR, \
130	"Duplicated entry [%s] was renamed to [%s]" %\
131	(old_name, name)
132	SG.name2id[name] = id_counter
133	id_counter += 1
134	else:
135	raise Exception, \
136	"Unexpected number of sequences."
137	else:
138	seq = re.sub("\s", "", line)
139	if id_counter == len(SG):
140	id_counter = 0
141	SG.id2seq[id_counter] += seq
142	id_counter += 1
143
144	if len(SG) != ntax:
145	raise Exception, \
146	"Unexpected number of sequences."
147
148	# Check lenght of all seqs
149	for i in SG.id2seq.keys():
150	if len(SG.id2seq[i]) != nchar:
151	raise Exception, \
152	"Unexpected lenght of sequence [%s]" %SG.id2name[i]
153
154	return SG
155
156	def write_phylip(aln, outfile=None, interleaved=True, relaxed=False):
157	width = 60
158	seq_visited = set([])
159
160	show_name_warning = False
161	lenghts = set((len(seq) for seq in aln.id2seq.values()))
162	if len(lenghts) >1:
163	raise Exception, "Phylip format requires sequences of equal lenght."
164	seqlength = lenghts.pop()
165
166	if not relaxed:
167	name_fix = 10
168	else:
169	name_fix = max([len(name) for name in aln.id2name.values()])
170
171	alg_lines = []
172	alg_text = " %d %d" %(len(aln), seqlength)
173	alg_lines.append(alg_text)
174	if interleaved:
175	visited = set([])
176	for i in xrange(0, seqlength, width):
177	for j in aln.id2name.iterkeys(): #xrange(len(aln)):
178	name = aln.id2name[j]
179	if not relaxed and len(name)>name_fix:
180	name = name[:name_fix]
181	show_name_warning = True
182
183	seq = aln.id2seq[j][i:i+width]
184	if j not in visited:
185	name_str = "%s " %name.ljust(name_fix)
186	visited.add(j)
187	else:
188	name_str = "".ljust(name_fix+3)
189
190	seq_str = ' '.join([seq[k:k+10] for k in xrange(0, len(seq), 10)])
191	line_str = "%s%s" %(name_str, seq_str)
192	alg_lines.append(line_str)
193	alg_lines.append("")
194	else:
195	for name, seq, comments in aln.iter_entries():
196	if not relaxed and len(name)>10:
197	name = name[:name_fix]
198	show_name_warning = True
199	line_str = "%s %s\n%s" %\
200	(name.ljust(name_fix), seq[0:width-name_fix-3], '\n'.join([seq[k:k+width] \
201	for k in xrange(width-name_fix-3, len(seq), width)]))
202	alg_lines.append(line_str)
203	alg_lines.append("")
204
205
206	if show_name_warning:
207	print >>STDERR, "Warning! Some sequence names were cut to 10 characters!!"
208	alg_text = '\n'.join(alg_lines)
209	if outfile is not None:
210	OUT = open(outfile, "w")
211	OUT.write(alg_text)
212	OUT.close()
213	else:
214	return alg_text

Note: See TracBrowser for help on using the repository browser.

Download in other formats: