| 1 | __VERSION__="ete2-2.2rev1026" |
|---|
| 2 | # -*- coding: utf-8 -*- |
|---|
| 3 | # #START_LICENSE########################################################### |
|---|
| 4 | # |
|---|
| 5 | # |
|---|
| 6 | # This file is part of the Environment for Tree Exploration program |
|---|
| 7 | # (ETE). http://ete.cgenomics.org |
|---|
| 8 | # |
|---|
| 9 | # ETE is free software: you can redistribute it and/or modify it |
|---|
| 10 | # under the terms of the GNU General Public License as published by |
|---|
| 11 | # the Free Software Foundation, either version 3 of the License, or |
|---|
| 12 | # (at your option) any later version. |
|---|
| 13 | # |
|---|
| 14 | # ETE is distributed in the hope that it will be useful, but WITHOUT |
|---|
| 15 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
|---|
| 16 | # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public |
|---|
| 17 | # License for more details. |
|---|
| 18 | # |
|---|
| 19 | # You should have received a copy of the GNU General Public License |
|---|
| 20 | # along with ETE. If not, see <http://www.gnu.org/licenses/>. |
|---|
| 21 | # |
|---|
| 22 | # |
|---|
| 23 | # ABOUT THE ETE PACKAGE |
|---|
| 24 | # ===================== |
|---|
| 25 | # |
|---|
| 26 | # ETE is distributed under the GPL copyleft license (2008-2011). |
|---|
| 27 | # |
|---|
| 28 | # If you make use of ETE in published work, please cite: |
|---|
| 29 | # |
|---|
| 30 | # Jaime Huerta-Cepas, Joaquin Dopazo and Toni Gabaldon. |
|---|
| 31 | # ETE: a python Environment for Tree Exploration. Jaime BMC |
|---|
| 32 | # Bioinformatics 2010,:24doi:10.1186/1471-2105-11-24 |
|---|
| 33 | # |
|---|
| 34 | # Note that extra references to the specific methods implemented in |
|---|
| 35 | # the toolkit are available in the documentation. |
|---|
| 36 | # |
|---|
| 37 | # More info at http://ete.cgenomics.org |
|---|
| 38 | # |
|---|
| 39 | # |
|---|
| 40 | # #END_LICENSE############################################################# |
|---|
| 41 | |
|---|
| 42 | import os |
|---|
| 43 | import re |
|---|
| 44 | from sys import stderr as STDERR |
|---|
| 45 | |
|---|
| 46 | def read_phylip(source, interleaved=True, obj=None, |
|---|
| 47 | relaxed=False, fix_duplicates=True): |
|---|
| 48 | if obj is None: |
|---|
| 49 | from ete2.coretype import SeqGroup |
|---|
| 50 | SG = SeqGroup() |
|---|
| 51 | else: |
|---|
| 52 | SG = obj |
|---|
| 53 | |
|---|
| 54 | # Prepares handle from which read sequences |
|---|
| 55 | if os.path.isfile(source): |
|---|
| 56 | _source = open(source, "rU") |
|---|
| 57 | else: |
|---|
| 58 | _source = iter(source.split("\n")) |
|---|
| 59 | |
|---|
| 60 | nchar, ntax = None, None |
|---|
| 61 | counter = 0 |
|---|
| 62 | id_counter = 0 |
|---|
| 63 | for line in _source: |
|---|
| 64 | line = line.strip("\n") |
|---|
| 65 | # Passes comments and blank lines |
|---|
| 66 | if not line or line[0] == "#": |
|---|
| 67 | continue |
|---|
| 68 | # Reads head |
|---|
| 69 | if not nchar or not ntax: |
|---|
| 70 | m = re.match("^\s*(\d+)\s+(\d+)",line) |
|---|
| 71 | if m: |
|---|
| 72 | ntax = int (m.groups()[0]) |
|---|
| 73 | nchar = int (m.groups()[1]) |
|---|
| 74 | else: |
|---|
| 75 | raise Exception, \ |
|---|
| 76 | "A first line with the alignment dimension is required" |
|---|
| 77 | # Reads sequences |
|---|
| 78 | else: |
|---|
| 79 | if not interleaved: |
|---|
| 80 | # Reads names and sequences |
|---|
| 81 | if SG.id2name.get(id_counter, None) is None: |
|---|
| 82 | if relaxed: |
|---|
| 83 | m = re.match("^([^ ]+)(.+)", line) |
|---|
| 84 | else: |
|---|
| 85 | m = re.match("^(.{10})(.+)", line) |
|---|
| 86 | if m: |
|---|
| 87 | name = m.groups()[0].strip() |
|---|
| 88 | if fix_duplicates and name in SG.name2id: |
|---|
| 89 | tag = str(len([k for k in SG.name2id.keys() \ |
|---|
| 90 | if k.endswith(name)])) |
|---|
| 91 | old_name = name |
|---|
| 92 | # Tag is in the beginning to avoid being |
|---|
| 93 | # cut it by the 10 chars limit |
|---|
| 94 | name = tag+"_"+name |
|---|
| 95 | print >>STDERR, \ |
|---|
| 96 | "Duplicated entry [%s] was renamed to [%s]" %\ |
|---|
| 97 | (old_name, name) |
|---|
| 98 | SG.id2name[id_counter] = name |
|---|
| 99 | SG.name2id[name] = id_counter |
|---|
| 100 | SG.id2seq[id_counter] = "" |
|---|
| 101 | line = m.groups()[1] |
|---|
| 102 | else: |
|---|
| 103 | raise Exception, \ |
|---|
| 104 | "Wrong phylip sequencial format." |
|---|
| 105 | SG.id2seq[id_counter] += re.sub("\s","", line) |
|---|
| 106 | if len(SG.id2seq[id_counter]) == nchar: |
|---|
| 107 | id_counter += 1 |
|---|
| 108 | name = None |
|---|
| 109 | elif len(SG.id2seq[id_counter]) > nchar: |
|---|
| 110 | raise Exception, \ |
|---|
| 111 | "Unexpected length of sequence [%s] [%s]." %(name,SG.id2seq[id_counter]) |
|---|
| 112 | else: |
|---|
| 113 | if len(SG)<ntax: |
|---|
| 114 | if relaxed: |
|---|
| 115 | m = re.match("^([^ ]+)(.+)", line) |
|---|
| 116 | else: |
|---|
| 117 | m = re.match("^(.{10})(.+)",line) |
|---|
| 118 | if m: |
|---|
| 119 | name = m.groups()[0].strip() |
|---|
| 120 | |
|---|
| 121 | seq = re.sub("\s","",m.groups()[1]) |
|---|
| 122 | SG.id2seq[id_counter] = seq |
|---|
| 123 | SG.id2name[id_counter] = name |
|---|
| 124 | if fix_duplicates and name in SG.name2id: |
|---|
| 125 | tag = str(len([k for k in SG.name2id.keys() \ |
|---|
| 126 | if k.endswith(name)])) |
|---|
| 127 | old_name = name |
|---|
| 128 | name = tag+"_"+name |
|---|
| 129 | print >>STDERR, \ |
|---|
| 130 | "Duplicated entry [%s] was renamed to [%s]" %\ |
|---|
| 131 | (old_name, name) |
|---|
| 132 | SG.name2id[name] = id_counter |
|---|
| 133 | id_counter += 1 |
|---|
| 134 | else: |
|---|
| 135 | raise Exception, \ |
|---|
| 136 | "Unexpected number of sequences." |
|---|
| 137 | else: |
|---|
| 138 | seq = re.sub("\s", "", line) |
|---|
| 139 | if id_counter == len(SG): |
|---|
| 140 | id_counter = 0 |
|---|
| 141 | SG.id2seq[id_counter] += seq |
|---|
| 142 | id_counter += 1 |
|---|
| 143 | |
|---|
| 144 | if len(SG) != ntax: |
|---|
| 145 | raise Exception, \ |
|---|
| 146 | "Unexpected number of sequences." |
|---|
| 147 | |
|---|
| 148 | # Check lenght of all seqs |
|---|
| 149 | for i in SG.id2seq.keys(): |
|---|
| 150 | if len(SG.id2seq[i]) != nchar: |
|---|
| 151 | raise Exception, \ |
|---|
| 152 | "Unexpected lenght of sequence [%s]" %SG.id2name[i] |
|---|
| 153 | |
|---|
| 154 | return SG |
|---|
| 155 | |
|---|
| 156 | def write_phylip(aln, outfile=None, interleaved=True, relaxed=False): |
|---|
| 157 | width = 60 |
|---|
| 158 | seq_visited = set([]) |
|---|
| 159 | |
|---|
| 160 | show_name_warning = False |
|---|
| 161 | lenghts = set((len(seq) for seq in aln.id2seq.values())) |
|---|
| 162 | if len(lenghts) >1: |
|---|
| 163 | raise Exception, "Phylip format requires sequences of equal lenght." |
|---|
| 164 | seqlength = lenghts.pop() |
|---|
| 165 | |
|---|
| 166 | if not relaxed: |
|---|
| 167 | name_fix = 10 |
|---|
| 168 | else: |
|---|
| 169 | name_fix = max([len(name) for name in aln.id2name.values()]) |
|---|
| 170 | |
|---|
| 171 | alg_lines = [] |
|---|
| 172 | alg_text = " %d %d" %(len(aln), seqlength) |
|---|
| 173 | alg_lines.append(alg_text) |
|---|
| 174 | if interleaved: |
|---|
| 175 | visited = set([]) |
|---|
| 176 | for i in xrange(0, seqlength, width): |
|---|
| 177 | for j in aln.id2name.iterkeys(): #xrange(len(aln)): |
|---|
| 178 | name = aln.id2name[j] |
|---|
| 179 | if not relaxed and len(name)>name_fix: |
|---|
| 180 | name = name[:name_fix] |
|---|
| 181 | show_name_warning = True |
|---|
| 182 | |
|---|
| 183 | seq = aln.id2seq[j][i:i+width] |
|---|
| 184 | if j not in visited: |
|---|
| 185 | name_str = "%s " %name.ljust(name_fix) |
|---|
| 186 | visited.add(j) |
|---|
| 187 | else: |
|---|
| 188 | name_str = "".ljust(name_fix+3) |
|---|
| 189 | |
|---|
| 190 | seq_str = ' '.join([seq[k:k+10] for k in xrange(0, len(seq), 10)]) |
|---|
| 191 | line_str = "%s%s" %(name_str, seq_str) |
|---|
| 192 | alg_lines.append(line_str) |
|---|
| 193 | alg_lines.append("") |
|---|
| 194 | else: |
|---|
| 195 | for name, seq, comments in aln.iter_entries(): |
|---|
| 196 | if not relaxed and len(name)>10: |
|---|
| 197 | name = name[:name_fix] |
|---|
| 198 | show_name_warning = True |
|---|
| 199 | line_str = "%s %s\n%s" %\ |
|---|
| 200 | (name.ljust(name_fix), seq[0:width-name_fix-3], '\n'.join([seq[k:k+width] \ |
|---|
| 201 | for k in xrange(width-name_fix-3, len(seq), width)])) |
|---|
| 202 | alg_lines.append(line_str) |
|---|
| 203 | alg_lines.append("") |
|---|
| 204 | |
|---|
| 205 | |
|---|
| 206 | if show_name_warning: |
|---|
| 207 | print >>STDERR, "Warning! Some sequence names were cut to 10 characters!!" |
|---|
| 208 | alg_text = '\n'.join(alg_lines) |
|---|
| 209 | if outfile is not None: |
|---|
| 210 | OUT = open(outfile, "w") |
|---|
| 211 | OUT.write(alg_text) |
|---|
| 212 | OUT.close() |
|---|
| 213 | else: |
|---|
| 214 | return alg_text |
|---|