1 | __VERSION__="ete2-2.2rev1026" |
---|
2 | # -*- coding: utf-8 -*- |
---|
3 | # #START_LICENSE########################################################### |
---|
4 | # |
---|
5 | # |
---|
6 | # This file is part of the Environment for Tree Exploration program |
---|
7 | # (ETE). http://ete.cgenomics.org |
---|
8 | # |
---|
9 | # ETE is free software: you can redistribute it and/or modify it |
---|
10 | # under the terms of the GNU General Public License as published by |
---|
11 | # the Free Software Foundation, either version 3 of the License, or |
---|
12 | # (at your option) any later version. |
---|
13 | # |
---|
14 | # ETE is distributed in the hope that it will be useful, but WITHOUT |
---|
15 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
---|
16 | # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public |
---|
17 | # License for more details. |
---|
18 | # |
---|
19 | # You should have received a copy of the GNU General Public License |
---|
20 | # along with ETE. If not, see <http://www.gnu.org/licenses/>. |
---|
21 | # |
---|
22 | # |
---|
23 | # ABOUT THE ETE PACKAGE |
---|
24 | # ===================== |
---|
25 | # |
---|
26 | # ETE is distributed under the GPL copyleft license (2008-2011). |
---|
27 | # |
---|
28 | # If you make use of ETE in published work, please cite: |
---|
29 | # |
---|
30 | # Jaime Huerta-Cepas, Joaquin Dopazo and Toni Gabaldon. |
---|
31 | # ETE: a python Environment for Tree Exploration. Jaime BMC |
---|
32 | # Bioinformatics 2010,:24doi:10.1186/1471-2105-11-24 |
---|
33 | # |
---|
34 | # Note that extra references to the specific methods implemented in |
---|
35 | # the toolkit are available in the documentation. |
---|
36 | # |
---|
37 | # More info at http://ete.cgenomics.org |
---|
38 | # |
---|
39 | # |
---|
40 | # #END_LICENSE############################################################# |
---|
41 | |
---|
42 | import os |
---|
43 | import re |
---|
44 | from sys import stderr as STDERR |
---|
45 | |
---|
46 | def read_phylip(source, interleaved=True, obj=None, |
---|
47 | relaxed=False, fix_duplicates=True): |
---|
48 | if obj is None: |
---|
49 | from ete2.coretype import SeqGroup |
---|
50 | SG = SeqGroup() |
---|
51 | else: |
---|
52 | SG = obj |
---|
53 | |
---|
54 | # Prepares handle from which read sequences |
---|
55 | if os.path.isfile(source): |
---|
56 | _source = open(source, "rU") |
---|
57 | else: |
---|
58 | _source = iter(source.split("\n")) |
---|
59 | |
---|
60 | nchar, ntax = None, None |
---|
61 | counter = 0 |
---|
62 | id_counter = 0 |
---|
63 | for line in _source: |
---|
64 | line = line.strip("\n") |
---|
65 | # Passes comments and blank lines |
---|
66 | if not line or line[0] == "#": |
---|
67 | continue |
---|
68 | # Reads head |
---|
69 | if not nchar or not ntax: |
---|
70 | m = re.match("^\s*(\d+)\s+(\d+)",line) |
---|
71 | if m: |
---|
72 | ntax = int (m.groups()[0]) |
---|
73 | nchar = int (m.groups()[1]) |
---|
74 | else: |
---|
75 | raise Exception, \ |
---|
76 | "A first line with the alignment dimension is required" |
---|
77 | # Reads sequences |
---|
78 | else: |
---|
79 | if not interleaved: |
---|
80 | # Reads names and sequences |
---|
81 | if SG.id2name.get(id_counter, None) is None: |
---|
82 | if relaxed: |
---|
83 | m = re.match("^([^ ]+)(.+)", line) |
---|
84 | else: |
---|
85 | m = re.match("^(.{10})(.+)", line) |
---|
86 | if m: |
---|
87 | name = m.groups()[0].strip() |
---|
88 | if fix_duplicates and name in SG.name2id: |
---|
89 | tag = str(len([k for k in SG.name2id.keys() \ |
---|
90 | if k.endswith(name)])) |
---|
91 | old_name = name |
---|
92 | # Tag is in the beginning to avoid being |
---|
93 | # cut it by the 10 chars limit |
---|
94 | name = tag+"_"+name |
---|
95 | print >>STDERR, \ |
---|
96 | "Duplicated entry [%s] was renamed to [%s]" %\ |
---|
97 | (old_name, name) |
---|
98 | SG.id2name[id_counter] = name |
---|
99 | SG.name2id[name] = id_counter |
---|
100 | SG.id2seq[id_counter] = "" |
---|
101 | line = m.groups()[1] |
---|
102 | else: |
---|
103 | raise Exception, \ |
---|
104 | "Wrong phylip sequencial format." |
---|
105 | SG.id2seq[id_counter] += re.sub("\s","", line) |
---|
106 | if len(SG.id2seq[id_counter]) == nchar: |
---|
107 | id_counter += 1 |
---|
108 | name = None |
---|
109 | elif len(SG.id2seq[id_counter]) > nchar: |
---|
110 | raise Exception, \ |
---|
111 | "Unexpected length of sequence [%s] [%s]." %(name,SG.id2seq[id_counter]) |
---|
112 | else: |
---|
113 | if len(SG)<ntax: |
---|
114 | if relaxed: |
---|
115 | m = re.match("^([^ ]+)(.+)", line) |
---|
116 | else: |
---|
117 | m = re.match("^(.{10})(.+)",line) |
---|
118 | if m: |
---|
119 | name = m.groups()[0].strip() |
---|
120 | |
---|
121 | seq = re.sub("\s","",m.groups()[1]) |
---|
122 | SG.id2seq[id_counter] = seq |
---|
123 | SG.id2name[id_counter] = name |
---|
124 | if fix_duplicates and name in SG.name2id: |
---|
125 | tag = str(len([k for k in SG.name2id.keys() \ |
---|
126 | if k.endswith(name)])) |
---|
127 | old_name = name |
---|
128 | name = tag+"_"+name |
---|
129 | print >>STDERR, \ |
---|
130 | "Duplicated entry [%s] was renamed to [%s]" %\ |
---|
131 | (old_name, name) |
---|
132 | SG.name2id[name] = id_counter |
---|
133 | id_counter += 1 |
---|
134 | else: |
---|
135 | raise Exception, \ |
---|
136 | "Unexpected number of sequences." |
---|
137 | else: |
---|
138 | seq = re.sub("\s", "", line) |
---|
139 | if id_counter == len(SG): |
---|
140 | id_counter = 0 |
---|
141 | SG.id2seq[id_counter] += seq |
---|
142 | id_counter += 1 |
---|
143 | |
---|
144 | if len(SG) != ntax: |
---|
145 | raise Exception, \ |
---|
146 | "Unexpected number of sequences." |
---|
147 | |
---|
148 | # Check lenght of all seqs |
---|
149 | for i in SG.id2seq.keys(): |
---|
150 | if len(SG.id2seq[i]) != nchar: |
---|
151 | raise Exception, \ |
---|
152 | "Unexpected lenght of sequence [%s]" %SG.id2name[i] |
---|
153 | |
---|
154 | return SG |
---|
155 | |
---|
156 | def write_phylip(aln, outfile=None, interleaved=True, relaxed=False): |
---|
157 | width = 60 |
---|
158 | seq_visited = set([]) |
---|
159 | |
---|
160 | show_name_warning = False |
---|
161 | lenghts = set((len(seq) for seq in aln.id2seq.values())) |
---|
162 | if len(lenghts) >1: |
---|
163 | raise Exception, "Phylip format requires sequences of equal lenght." |
---|
164 | seqlength = lenghts.pop() |
---|
165 | |
---|
166 | if not relaxed: |
---|
167 | name_fix = 10 |
---|
168 | else: |
---|
169 | name_fix = max([len(name) for name in aln.id2name.values()]) |
---|
170 | |
---|
171 | alg_lines = [] |
---|
172 | alg_text = " %d %d" %(len(aln), seqlength) |
---|
173 | alg_lines.append(alg_text) |
---|
174 | if interleaved: |
---|
175 | visited = set([]) |
---|
176 | for i in xrange(0, seqlength, width): |
---|
177 | for j in aln.id2name.iterkeys(): #xrange(len(aln)): |
---|
178 | name = aln.id2name[j] |
---|
179 | if not relaxed and len(name)>name_fix: |
---|
180 | name = name[:name_fix] |
---|
181 | show_name_warning = True |
---|
182 | |
---|
183 | seq = aln.id2seq[j][i:i+width] |
---|
184 | if j not in visited: |
---|
185 | name_str = "%s " %name.ljust(name_fix) |
---|
186 | visited.add(j) |
---|
187 | else: |
---|
188 | name_str = "".ljust(name_fix+3) |
---|
189 | |
---|
190 | seq_str = ' '.join([seq[k:k+10] for k in xrange(0, len(seq), 10)]) |
---|
191 | line_str = "%s%s" %(name_str, seq_str) |
---|
192 | alg_lines.append(line_str) |
---|
193 | alg_lines.append("") |
---|
194 | else: |
---|
195 | for name, seq, comments in aln.iter_entries(): |
---|
196 | if not relaxed and len(name)>10: |
---|
197 | name = name[:name_fix] |
---|
198 | show_name_warning = True |
---|
199 | line_str = "%s %s\n%s" %\ |
---|
200 | (name.ljust(name_fix), seq[0:width-name_fix-3], '\n'.join([seq[k:k+width] \ |
---|
201 | for k in xrange(width-name_fix-3, len(seq), width)])) |
---|
202 | alg_lines.append(line_str) |
---|
203 | alg_lines.append("") |
---|
204 | |
---|
205 | |
---|
206 | if show_name_warning: |
---|
207 | print >>STDERR, "Warning! Some sequence names were cut to 10 characters!!" |
---|
208 | alg_text = '\n'.join(alg_lines) |
---|
209 | if outfile is not None: |
---|
210 | OUT = open(outfile, "w") |
---|
211 | OUT.write(alg_text) |
---|
212 | OUT.close() |
---|
213 | else: |
---|
214 | return alg_text |
---|