source: branches/help/GDE/SATIVA/sativa/epac/ete2/paml.py

Last change on this file was 12906, checked in by akozlov, 10 years ago

add sativa files and scripts for ARB integration

File size: 5.3 KB
Line 
1__VERSION__="ete2-2.2rev1026" 
2# -*- coding: utf-8 -*-
3# #START_LICENSE###########################################################
4#
5#
6# This file is part of the Environment for Tree Exploration program
7# (ETE).  http://ete.cgenomics.org
8
9# ETE is free software: you can redistribute it and/or modify it
10# under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13
14# ETE is distributed in the hope that it will be useful, but WITHOUT
15# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
17# License for more details.
18
19# You should have received a copy of the GNU General Public License
20# along with ETE.  If not, see <http://www.gnu.org/licenses/>.
21#
22#
23#                     ABOUT THE ETE PACKAGE
24#                     =====================
25#
26# ETE is distributed under the GPL copyleft license (2008-2011). 
27#
28# If you make use of ETE in published work, please cite:
29#
30# Jaime Huerta-Cepas, Joaquin Dopazo and Toni Gabaldon.
31# ETE: a python Environment for Tree Exploration. Jaime BMC
32# Bioinformatics 2010,:24doi:10.1186/1471-2105-11-24
33#
34# Note that extra references to the specific methods implemented in
35# the toolkit are available in the documentation.
36#
37# More info at http://ete.cgenomics.org
38#
39#
40# #END_LICENSE#############################################################
41
42
43import os
44import string
45from sys import stderr as STDERR
46from re import search
47
48def read_paml (source, obj=None, header_delimiter="\t", fix_duplicates=True):
49    """ Reads a collection of sequences econded in PAML format... that is, something between PHYLIP and fasta
50
51     3 6
52    seq1
53    ATGATG
54    seq2
55    ATGATG
56    seq3
57    ATGATG
58
59    or
60
61     3 6
62    >seq1
63    ATGATG
64    >seq2
65    ATGATG
66    >seq3
67    ATGATG
68
69    or
70
71    >seq1
72    ATGATG
73    >seq2
74    ATGATG
75    >seq3
76    ATGATG
77   
78    """
79
80    if obj is None:
81        from ete2.coretype import seqgroup
82        SC = seqgroup.SeqGroup()
83    else:
84        SC = obj
85
86    names = set([])
87    seq_id = -1
88
89    # Prepares handle from which read sequences
90    if os.path.isfile(source):
91        _source = open(source, "rU")
92    else:
93        _source = iter(source.split("\n"))
94
95    seq_name = None
96    num_seq = 0
97    len_seq = 0
98    in_seq  = False
99    for line in _source:
100        line = line.strip()
101        if line.startswith('#') or not line:
102            continue
103        # Reads seq number
104        elif line.startswith('>') or ((num_seq and len_seq) and not in_seq):
105            line = line.replace('>','')
106            # Checks if previous name had seq
107            if seq_id>-1 and SC.id2seq[seq_id] == "":
108                raise Exception, "No sequence found for "+seq_name
109
110            seq_id += 1
111            # Takes header info
112            seq_header_fields = map(string.strip, line.split(header_delimiter))
113            seq_name = seq_header_fields[0]
114
115            # Checks for duplicated seq names
116            if fix_duplicates and seq_name in names:
117                tag = str(len([k for k in SC.name2id.keys() if k.endswith(seq_name)]))
118                old_name = seq_name
119                seq_name = tag+"_"+seq_name
120                print >>STDERR, "Duplicated entry [%s] was renamed to [%s]" %(old_name, seq_name)
121
122            # stores seq_name
123            SC.id2seq[seq_id]     = ""
124            SC.id2name[seq_id]    = seq_name
125            SC.name2id[seq_name]  = seq_id
126            SC.id2comment[seq_id] = seq_header_fields[1:]
127            names.add(seq_name)
128            in_seq = True
129        else:
130            if seq_name is None:
131                if search ('^[0-9]+  *[0-9]+$', line):
132                    num_seq, len_seq = line.strip().split()
133                    num_seq = int(num_seq)
134                    len_seq = int(len_seq)
135                    continue
136                if line.startswith('\n'):
137                    continue
138                raise Exception, "Error reading sequences: Wrong format.\n"+line
139            elif in_seq:
140                # removes all white spaces in line
141                s = line.strip().replace(" ","")
142
143                # append to seq_string
144                SC.id2seq[seq_id] += s
145                if len_seq:
146                    if len(SC.id2seq[seq_id]) == len_seq:
147                        in_seq=False
148                    elif len(SC.id2seq[seq_id]) > len_seq:
149                        raise  Exception, "Error reading sequences: Wrong sequence length.\n"+line
150
151    if seq_name and SC.id2seq[seq_id] == "":
152        print >>STDERR, seq_name,"has no sequence"
153        return None
154
155    # Everything ok
156    return SC
157
158def write_paml(sequences, outfile = None, seqwidth = 80):
159    """
160    Writes a SeqGroup python object using PAML format.
161    sequences are ordered, because PAML labels tree according to this.
162    """
163    text =  ' %d %d\n' % (len (sequences), len (sequences.get_entries()[0][1]))
164    text += '\n'.join(["%s\n%s" %( "\t".join([name]+comment), _seq2str(seq)) for
165                       name, seq, comment in sorted(sequences)])
166    if outfile is not None:
167        OUT = open(outfile,"w")
168        OUT.write(text)
169        OUT.close()
170    else:
171        return text
172
173def _seq2str(seq, seqwidth = 80):
174    sequence = ""
175    for i in xrange(0,len(seq),seqwidth):
176        sequence+= seq[i:i+seqwidth] + "\n"
177    return sequence
178
Note: See TracBrowser for help on using the repository browser.