Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

seqgroup.py

Visit:

Last change on this file was 14544, checked in by akozlov, 10 years ago
partial merge from sativa branch log:branches/sativa@14543:14543
File size: 6.6 KB

Line
1	__VERSION__="ete2-2.2rev1026"
2	# -- coding: utf-8 --
3	# #START_LICENSE###########################################################
4	# !!!!This is a hacked version by Jiajie Zhang!!!!!
5	# !!!!The internally used sequence id is exposed!!!!!
6	#
7	# This file is part of the Environment for Tree Exploration program
8	# (ETE). http://ete.cgenomics.org
9	#
10	# ETE is free software: you can redistribute it and/or modify it
11	# under the terms of the GNU General Public License as published by
12	# the Free Software Foundation, either version 3 of the License, or
13	# (at your option) any later version.
14	#
15	# ETE is distributed in the hope that it will be useful, but WITHOUT
16	# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
17	# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
18	# License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with ETE. If not, see <http://www.gnu.org/licenses/>.
22	#
23	#
24	# ABOUT THE ETE PACKAGE
25	# =====================
26	#
27	# ETE is distributed under the GPL copyleft license (2008-2011).
28	#
29	# If you make use of ETE in published work, please cite:
30	#
31	# Jaime Huerta-Cepas, Joaquin Dopazo and Toni Gabaldon.
32	# ETE: a python Environment for Tree Exploration. Jaime BMC
33	# Bioinformatics 2010,:24doi:10.1186/1471-2105-11-24
34	#
35	# Note that extra references to the specific methods implemented in
36	# the toolkit are available in the documentation.
37	#
38	# More info at http://ete.cgenomics.org
39	#
40	#
41	# #END_LICENSE#############################################################
42
43	"""
44	The 'seqgroup' module provides methods and classes to operate with
45	Multiple Sequence Files, including Multiple Sequence Alignments.
46
47	Currently, Fasta, Phylip sequential and Phylip interleaved formats are
48	supported.
49	"""
50
51	from fasta import read_fasta, write_fasta, write_fasta_internal
52	from paml import read_paml, write_paml
53	from phylip import read_phylip, write_phylip
54
55	__all__ = ["SeqGroup"]
56
class SeqGroup(object):
    """
    SeqGroup class can be used to store a set of sequences (aligned
    or not).

    Sequences are kept in four dictionaries keyed by an internal
    integer sequence id (``id2name``, ``id2seq``, ``id2comment``) plus
    the reverse ``name2id`` lookup.

    :argument sequences: Path to the file containing the sequences or,
      alternatively, the text string containing the same
      information. When ``None`` an empty group is created.

    :argument format: the format in which sequences are
      encoded. Supported formats are: ``fasta``, ``fasta_internal``,
      ``phylip`` (phylip sequential), ``iphylip`` (phylip
      interleaved) and ``paml``. Phylip format forces sequence names
      to a maximum of 10 chars. To avoid this effect, you can use the
      relaxed phylip format: ``phylip_relaxed`` and
      ``iphylip_relaxed``. The name is matched case-insensitively.

    :argument fix_duplicates: forwarded unchanged to the reader of
      the selected format.

    :argument kwargs: extra keyword arguments, forwarded only to the
      ``paml`` reader and writer.

    ::

      msf = ">seq1\\nAAAAAAAAAAA\\n>seq2\\nTTTTTTTTTTTTT\\n"
      seqs = SeqGroup(msf, format="fasta")
      print(seqs.get_seq("seq1"))
    """

    def __len__(self):
        """Returns the number of stored sequences."""
        return len(self.id2seq)

    def __contains__(self, item):
        """Returns True if a sequence named `item` is stored."""
        return item in self.name2id

    def __str__(self):
        """Returns the FASTA text produced by write_fasta."""
        return write_fasta(self)

    def __iter__(self):
        """Iterates over the entries, see :func:`iter_entries`."""
        return self.iter_entries()

    def __init__(self, sequences=None, format="fasta", fix_duplicates=True, **kwargs):
        # Each entry is [reader, writer, extra keyword arguments].
        self.parsers = {
            "fasta": [read_fasta, write_fasta, {}],
            "fasta_internal": [read_fasta, write_fasta_internal, {}],
            "phylip": [read_phylip, write_phylip, {"interleaved": False, "relaxed": False}],
            "iphylip": [read_phylip, write_phylip, {"interleaved": True, "relaxed": False}],
            "phylip_relaxed": [read_phylip, write_phylip, {"interleaved": False, "relaxed": True}],
            "iphylip_relaxed": [read_phylip, write_phylip, {"interleaved": True, "relaxed": True}],
            "paml": [read_paml, write_paml, kwargs],
        }

        self.id2name = {}
        self.name2id = {}
        self.id2comment = {}
        self.id2seq = {}

        # The format is only validated when there is something to parse.
        if sequences is not None:
            format = format.lower()
            if format not in self.parsers:
                raise ValueError("Unsupported format: [%s]" % format)
            reader, _, args = self.parsers[format]
            # The reader fills this object's dictionaries in place.
            reader(sequences, obj=self, fix_duplicates=fix_duplicates, **args)

    def __repr__(self):
        return "SeqGroup (%s)" % hex(self.__hash__())

    def write(self, format="fasta", outfile=None):
        """ Returns the text representation of the sequences in the
        supplied given format (default=FASTA). If "outfile" argument is
        used, the result is written into the given path.

        Raises ValueError if `format` is not a supported format name."""

        format = format.lower()
        if format not in self.parsers:
            raise ValueError("Unsupported format: [%s]" % format)
        _, writer, args = self.parsers[format]
        return writer(self, outfile, **args)

    def iter_entries(self):
        """ Returns an iterator over all sequences in the
        collection. Each item is a tuple with the sequence name,
        sequence, sequence comments and the internal sequence id.
        Comments default to an empty list when none are stored."""
        # items() instead of iteritems() so it also runs on Python 3.
        for seqid, seq in self.id2seq.items():
            yield self.id2name[seqid], seq, self.id2comment.get(seqid, []), seqid

    def get_seq(self, name):
        """ Returns the sequence associated to a given entry name."""
        return self.id2seq[self.name2id[name]]

    def get_seqbyid(self, iid):
        """ Returns the sequence stored under the internal id `iid`."""
        return self.id2seq[iid]

    def get_comment(self, name):
        """ Returns the comments associated to a given entry name.

        An empty list is returned when the sequence has no stored
        comments, consistent with iter_entries and get_entries.
        Raises KeyError if `name` is unknown."""
        return self.id2comment.get(self.name2id[name], [])

    def get_name(self, sid):
        """ Returns the name stored under the internal id `sid`."""
        return self.id2name[sid]

    def get_entries(self):
        """ Returns the list of entries currently stored, as
        (name, sequence, comments) tuples."""
        # Built eagerly so a real list is returned on Python 3 too.
        return [(self.id2name[seqid], seq, self.id2comment.get(seqid, []))
                for seqid, seq in self.id2seq.items()]

    def set_seq(self, name, seq, comments=None):
        """Updates or adds a sequence.

        `name` is stripped of surrounding whitespace; spaces, tabs and
        line breaks are removed from `seq`. A new name gets an id one
        greater than the largest id in use (1 for an empty group)."""
        if comments is None:
            comments = []
        name = name.strip()
        for blank in (" ", "\t", "\n", "\r"):
            seq = seq.replace(blank, "")
        if name in self.name2id:
            # Existing entry: keep its id and skip the O(n) max() scan.
            seqid = self.name2id[name]
        else:
            # list() is needed because values() is a view on Python 3.
            seqid = max([0] + list(self.name2id.values())) + 1
        self.name2id[name] = seqid
        self.id2name[seqid] = name
        self.id2comment[seqid] = comments
        self.id2seq[seqid] = seq

    def add_name_prefix(self, prefix):
        """Prepends `prefix` to the name of every stored sequence."""
        # Iterate over a snapshot because id2name is rewritten in the loop.
        for sid, old_name in list(self.id2name.items()):
            new_name = prefix + old_name
            self.name2id[new_name] = self.name2id.pop(old_name)
            self.id2name[sid] = new_name

    def has_seq(self, name):
        """Returns True if a sequence named `name` is stored."""
        return name in self.name2id

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: branches/nameserver/GDE/SATIVA/sativa/epac/ete2/seqgroup.py

Download in other formats: