source: branches/stable/GDE/SATIVA/sativa/epac/ete2/seqgroup.py

Last change on this file was 14544, checked in by akozlov, 9 years ago
File size: 6.6 KB
Line 
1__VERSION__="ete2-2.2rev1026" 
2# -*- coding: utf-8 -*-
3# #START_LICENSE###########################################################
4#  !!!!This is a hacked version by Jiajie Zhang!!!!!
5#  !!!!The internally used sequence id is exposed!!!!!
6#
7# This file is part of the Environment for Tree Exploration program
8# (ETE).  http://ete.cgenomics.org
9
10# ETE is free software: you can redistribute it and/or modify it
11# under the terms of the GNU General Public License as published by
12# the Free Software Foundation, either version 3 of the License, or
13# (at your option) any later version.
14
15# ETE is distributed in the hope that it will be useful, but WITHOUT
16# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
17# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
18# License for more details.
19
20# You should have received a copy of the GNU General Public License
21# along with ETE.  If not, see <http://www.gnu.org/licenses/>.
22#
23#
24#                     ABOUT THE ETE PACKAGE
25#                     =====================
26#
27# ETE is distributed under the GPL copyleft license (2008-2011). 
28#
29# If you make use of ETE in published work, please cite:
30#
31# Jaime Huerta-Cepas, Joaquin Dopazo and Toni Gabaldon.
32# ETE: a python Environment for Tree Exploration. BMC
33# Bioinformatics 2010, 11:24. doi:10.1186/1471-2105-11-24
34#
35# Note that extra references to the specific methods implemented in
36# the toolkit are available in the documentation.
37#
38# More info at http://ete.cgenomics.org
39#
40#
41# #END_LICENSE#############################################################
42
43"""
44The 'seqgroup' module provides methods and classes to operate with
45Multiple Sequence Files, including Multiple Sequence Alignments.
46
47Currently, Fasta, Phylip sequential and Phylip interleaved formats are
48supported.
49"""
50
51from fasta import read_fasta, write_fasta, write_fasta_internal
52from paml import read_paml, write_paml
53from phylip import read_phylip, write_phylip
54
55__all__ = ["SeqGroup"]
56
class SeqGroup(object):
    """
    SeqGroup class can be used to store a set of sequences (aligned
    or not).

    Every sequence is stored under an internal integer id; four
    dictionaries (``id2name``, ``name2id``, ``id2comment`` and
    ``id2seq``) tie names, comments and sequences to that id.

    :argument sequences: Path to the file containing the sequences or,
        alternatively, the text string containing the same
        information. When ``None``, an empty group is created.

    :argument format: the format in which sequences are
        encoded. Current supported formats are: ``fasta``,
        ``fasta_internal``, ``phylip`` (phylip sequential),
        ``iphylip`` (phylip interleaved) and ``paml``. Phylip format
        forces sequence names to a maximum of 10 chars. To avoid this
        effect, you can use the relaxed phylip format:
        ``phylip_relaxed`` and ``iphylip_relaxed``. The format name is
        matched case-insensitively.

    :argument fix_duplicates: forwarded unchanged to the reader of the
        selected format.

    :argument kwargs: extra keyword arguments; they are only forwarded
        to the ``paml`` reader and writer.

    ::

     msf = ">seq1\\nAAAAAAAAAAA\\n>seq2\\nTTTTTTTTTTTTT\\n"
     seqs = SeqGroup(msf, format="fasta")
     print(seqs.get_seq("seq1"))
     """

    def __len__(self):
        """ Returns the number of stored sequences."""
        return len(self.id2seq)

    def __contains__(self, item):
        """ Returns True if a sequence named `item` is stored."""
        return item in self.name2id

    def __str__(self):
        """ Returns whatever write_fasta() produces for this group."""
        return write_fasta(self)

    def __iter__(self):
        """ Iterates over the tuples produced by iter_entries()."""
        return self.iter_entries()

    def __init__(self, sequences=None , format="fasta", fix_duplicates=True, **kwargs):
        # format name -> [reader, writer, extra keyword arguments that
        # are passed to both the reader and the writer]
        self.parsers = {
            "fasta": [read_fasta, write_fasta, {}],
            "fasta_internal": [read_fasta, write_fasta_internal, {}],
            "phylip": [read_phylip, write_phylip, {"interleaved":False, "relaxed":False}],
            "iphylip": [read_phylip, write_phylip, {"interleaved":True, "relaxed":False}],
            "phylip_relaxed": [read_phylip, write_phylip, {"interleaved":False, "relaxed":True}],
            "iphylip_relaxed": [read_phylip, write_phylip, {"interleaved":True, "relaxed":True}],
            "paml"   : [read_paml  , write_paml  , kwargs                   ]
            }

        self.id2name = {}
        self.name2id = {}
        self.id2comment= {}
        self.id2seq = {}

        # The format is only validated when there is something to
        # parse; an empty group accepts any format name.
        if sequences is not None:
            format = format.lower()
            if format not in self.parsers:
                raise ValueError("Unsupported format: [%s]" % format)
            read, _, args = self.parsers[format]
            # The reader fills the dictionaries of this object in place.
            read(sequences, obj=self, fix_duplicates=fix_duplicates, **args)

    def __repr__(self):
        return "SeqGroup (%s)" % hex(self.__hash__())

    def write(self, format="fasta", outfile=None):
        """ Returns the text representation of the sequences in the
        supplied given format (default=FASTA). If "outfile" argument is
        used, the result is written into the given path.

        Raises ValueError if the format is not supported."""

        format = format.lower()
        if format not in self.parsers:
            raise ValueError("Unsupported format: [%s]" % format)
        _, write, args = self.parsers[format]
        return write(self, outfile, **args)

    def iter_entries(self):
        """ Returns an iterator over all sequences in the
        collection. Each item is a 4-tuple with the sequence name,
        the sequence, the sequence comments (an empty list when none
        are stored) and the internal sequence id."""
        for seqid in self.id2seq:
            yield (self.id2name[seqid],
                   self.id2seq[seqid],
                   self.id2comment.get(seqid, []),
                   seqid)

    def get_seq(self, name):
        """ Returns the sequence associated to a given entry name.
        Raises KeyError if the name is unknown."""
        return self.id2seq[self.name2id[name]]

    def get_seqbyid(self, iid):
        """ Returns the sequence stored under the internal id `iid`."""
        return self.id2seq[iid]

    def get_comment(self, name):
        """ Returns the comments associated to a given entry name.

        An empty list is returned when the entry has no stored
        comments, matching iter_entries() and get_entries(). Raises
        KeyError if the name is unknown."""
        return self.id2comment.get(self.name2id[name], [])

    def get_name(self, sid):
        """ Returns the entry name stored under the internal id `sid`."""
        return self.id2name[sid]

    def get_entries(self):
        """ Returns the list of entries currently stored. Each entry
        is a (name, sequence, comments) tuple."""
        # Built as a real list so that the result is the same kind of
        # object on Python 2 and Python 3.
        return [(self.id2name[seqid], seq, self.id2comment.get(seqid, []))
                for seqid, seq in self.id2seq.items()]

    def set_seq(self, name, seq, comments = None):
        """Updates or adds a sequence.

        The name is stripped of surrounding whitespace; spaces, tabs
        and line breaks are removed from the sequence. An existing
        entry keeps its internal id and has its comments replaced; a
        new entry gets the highest known id plus one."""
        if comments is None:
            comments = []
        name = name.strip()
        for blank in (" ", "\t", "\n", "\r"):
            seq = seq.replace(blank, "")

        if name in self.name2id:
            seqid = self.name2id[name]
        else:
            # Only scan the known ids when a new one is really needed.
            seqid = max([0] + list(self.name2id.values())) + 1

        self.name2id[name] = seqid
        self.id2name[seqid] = name
        self.id2comment[seqid] = comments
        self.id2seq[seqid] = seq

    def add_name_prefix(self, prefix):
        """ Prepends `prefix` to the name of every stored entry."""
        # Build the renamed index first. Renaming entries one by one
        # inside name2id goes wrong when a prefixed name equals a name
        # that has not been renamed yet (e.g. "a" and "xa" with prefix
        # "x"), and fails when two ids share the same name.
        renamed = {}
        for old_name, seqid in self.name2id.items():
            renamed[prefix + old_name] = seqid
        self.name2id.clear()
        self.name2id.update(renamed)

        for seqid in list(self.id2name):
            self.id2name[seqid] = prefix + self.id2name[seqid]

    def has_seq(self, name):
        """ Returns True if a sequence named `name` is stored."""
        return (name in self.name2id)
Note: See TracBrowser for help on using the repository browser.