Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

source: branches/items/GDE/SATIVA/sativa/epac/ete2/paml.py

Visit:

Last change on this file was 12906, checked in by akozlov, 11 years ago
add sativa files and scripts for ARB integration
File size: 5.3 KB

Line
1	__VERSION__="ete2-2.2rev1026"
2	# -- coding: utf-8 --
3	# #START_LICENSE###########################################################
4	#
5	#
6	# This file is part of the Environment for Tree Exploration program
7	# (ETE). http://ete.cgenomics.org
8	#
9	# ETE is free software: you can redistribute it and/or modify it
10	# under the terms of the GNU General Public License as published by
11	# the Free Software Foundation, either version 3 of the License, or
12	# (at your option) any later version.
13	#
14	# ETE is distributed in the hope that it will be useful, but WITHOUT
15	# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16	# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
17	# License for more details.
18	#
19	# You should have received a copy of the GNU General Public License
20	# along with ETE. If not, see <http://www.gnu.org/licenses/>.
21	#
22	#
23	# ABOUT THE ETE PACKAGE
24	# =====================
25	#
26	# ETE is distributed under the GPL copyleft license (2008-2011).
27	#
28	# If you make use of ETE in published work, please cite:
29	#
30	# Jaime Huerta-Cepas, Joaquin Dopazo and Toni Gabaldon.
31	# ETE: a python Environment for Tree Exploration.
32	# BMC Bioinformatics 2010, 11:24. doi:10.1186/1471-2105-11-24
33	#
34	# Note that extra references to the specific methods implemented in
35	# the toolkit are available in the documentation.
36	#
37	# More info at http://ete.cgenomics.org
38	#
39	#
40	# #END_LICENSE#############################################################
41
42
43	import os
44	import string
45	from sys import stderr as STDERR
46	from re import search
47
def read_paml(source, obj=None, header_delimiter="\t", fix_duplicates=True):
    """Read a collection of sequences encoded in PAML format.

    PAML format is something between PHYLIP and FASTA. Three layouts are
    accepted. A counts header followed by bare names::

        3 6
        seq1
        ATGATG
        seq2
        ATGATG
        seq3
        ATGATG

    a counts header followed by ``>``-prefixed names::

        3 6
        >seq1
        ATGATG
        >seq2
        ATGATG
        >seq3
        ATGATG

    or ``>``-prefixed names with no counts header::

        >seq1
        ATGATG
        >seq2
        ATGATG
        >seq3
        ATGATG

    Empty lines and lines starting with ``#`` are ignored. Spaces inside
    sequence lines are removed, and a sequence may span several lines.

    :param source: path to an existing file, or a string holding the
        sequences themselves.
    :param obj: container to fill; it must expose the ``id2seq``,
        ``id2name``, ``name2id`` and ``id2comment`` mappings. When None,
        a new ``ete2.coretype.seqgroup.SeqGroup`` is created.
    :param header_delimiter: separator between the sequence name and the
        extra header fields, which are stored as the sequence comment.
    :param fix_duplicates: when true, a name already seen in this call is
        prefixed with a numeric tag and a notice is written to stderr.

    :returns: the filled container, or None (after a notice on stderr)
        when the last name read has no sequence.
    :raises Exception: if a name in the middle of the input has no
        sequence, if the first meaningful line is neither a name nor a
        counts header, or if a sequence grows longer than the length
        announced in the counts header.
    """
    if obj is None:
        from ete2.coretype import seqgroup
        container = seqgroup.SeqGroup()
    else:
        container = obj

    names = set()
    seq_id = -1

    # Collect the input lines. The file is read inside a ``with`` block so
    # the handle is always released; ``splitlines`` copes with "\n", "\r\n"
    # and "\r" endings on both Python 2 and Python 3.
    if os.path.isfile(source):
        with open(source) as handle:
            lines = handle.read().splitlines()
    else:
        lines = source.split("\n")

    seq_name = None
    num_seq = 0
    len_seq = 0
    in_seq = False
    for line in lines:
        line = line.strip()
        if not line or line.startswith('#'):
            continue

        # A new entry starts at a '>' line or, when the counts header was
        # given, at the first line after a sequence reached its full length.
        if line.startswith('>') or (num_seq and len_seq and not in_seq):
            line = line.replace('>', '')
            # The previous name must have received some sequence.
            if seq_id > -1 and container.id2seq[seq_id] == "":
                raise Exception("No sequence found for " + seq_name)

            seq_id += 1
            header_fields = [field.strip()
                             for field in line.split(header_delimiter)]
            seq_name = header_fields[0]

            # Rename duplicated names so they do not overwrite each other.
            if fix_duplicates and seq_name in names:
                tag = str(len([known for known in container.name2id.keys()
                               if known.endswith(seq_name)]))
                old_name = seq_name
                seq_name = tag + "_" + seq_name
                STDERR.write("Duplicated entry [%s] was renamed to [%s]\n"
                             % (old_name, seq_name))

            container.id2seq[seq_id] = ""
            container.id2name[seq_id] = seq_name
            container.name2id[seq_name] = seq_id
            container.id2comment[seq_id] = header_fields[1:]
            names.add(seq_name)
            in_seq = True
        elif seq_name is None:
            # Before any name, only the "<num_seq> <len_seq>" header is
            # valid. At least one whitespace character is required between
            # the two counts so that a lone number is reported as a format
            # error instead of failing while unpacking.
            if search(r'^[0-9]+\s+[0-9]+$', line):
                num_seq, len_seq = [int(count) for count in line.split()]
                continue
            raise Exception("Error reading sequences: Wrong format.\n" + line)
        elif in_seq:
            # Drop inner spaces and append to the current sequence.
            container.id2seq[seq_id] += line.replace(" ", "")
            if len_seq:
                current_length = len(container.id2seq[seq_id])
                if current_length == len_seq:
                    in_seq = False
                elif current_length > len_seq:
                    raise Exception(
                        "Error reading sequences: Wrong sequence length.\n"
                        + line)

    if seq_name and container.id2seq[seq_id] == "":
        STDERR.write("%s has no sequence\n" % seq_name)
        return None

    # Everything ok
    return container
157
def write_paml(sequences, outfile=None, seqwidth=80):
    """Write a SeqGroup python object using PAML format.

    Entries are written in sorted order, because PAML labels the tree
    according to the order of the sequences.

    :param sequences: collection supporting ``len()``, ``get_entries()``
        and iteration over ``(name, sequence, comment)`` triples, where
        ``comment`` is a list of extra header fields.
    :param outfile: path of the file to write. When None, the text is
        returned instead.
    :param seqwidth: maximum number of characters per sequence line.

    :returns: the PAML text when ``outfile`` is None, otherwise None.
    """
    # The header announces the number of sequences and the length of the
    # first stored one; an empty collection is reported with length 0
    # rather than failing on the missing first entry.
    entries = sequences.get_entries()
    seq_length = len(entries[0][1]) if entries else 0
    header = ' %d %d\n' % (len(sequences), seq_length)

    # Each record is the tab-joined name and comments, then the sequence
    # wrapped at the requested width.
    records = ["%s\n%s" % ("\t".join([name] + comment),
                           _seq2str(seq, seqwidth))
               for name, seq, comment in sorted(sequences)]
    text = header + '\n'.join(records)

    if outfile is None:
        return text
    # ``with`` closes the file even if the write fails.
    with open(outfile, "w") as out:
        out.write(text)
172
def _seq2str(seq, seqwidth=80):
    """Split *seq* into newline-terminated lines of *seqwidth* characters.

    The last line may be shorter. An empty sequence gives an empty string.

    :param seq: sequence string to wrap.
    :param seqwidth: maximum number of characters per line.

    :returns: the wrapped sequence, every line ending with a newline.
    """
    # ``range`` exists on both Python 2 and 3; joining the chunks avoids
    # growing a string piece by piece.
    return "".join(seq[start:start + seqwidth] + "\n"
                   for start in range(0, len(seq), seqwidth))
178

Note: See TracBrowser for help on using the repository browser.

Download in other formats: