Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

source: branches/items/GDE/SATIVA/sativa/epac/ete2/fasta.py

Visit:

Last change on this file was 12906, checked in by akozlov, 10 years ago
add sativa files and scripts for ARB integration
File size: 4.8 KB

Line
1	__VERSION__="ete2-2.2rev1026"
2	# -- coding: utf-8 --
3	# #START_LICENSE###########################################################
4	#
5	#
6	# This file is part of the Environment for Tree Exploration program
7	# (ETE). http://ete.cgenomics.org
8	#
9	# ETE is free software: you can redistribute it and/or modify it
10	# under the terms of the GNU General Public License as published by
11	# the Free Software Foundation, either version 3 of the License, or
12	# (at your option) any later version.
13	#
14	# ETE is distributed in the hope that it will be useful, but WITHOUT
15	# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16	# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
17	# License for more details.
18	#
19	# You should have received a copy of the GNU General Public License
20	# along with ETE. If not, see <http://www.gnu.org/licenses/>.
21	#
22	#
23	# ABOUT THE ETE PACKAGE
24	# =====================
25	#
26	# ETE is distributed under the GPL copyleft license (2008-2011).
27	#
28	# If you make use of ETE in published work, please cite:
29	#
30	# Jaime Huerta-Cepas, Joaquin Dopazo and Toni Gabaldon.
31	# ETE: a python Environment for Tree Exploration. BMC
32	# Bioinformatics 2010, 11:24. doi:10.1186/1471-2105-11-24
33	#
34	# Note that extra references to the specific methods implemented in
35	# the toolkit are available in the documentation.
36	#
37	# More info at http://ete.cgenomics.org
38	#
39	#
40	# #END_LICENSE#############################################################
41
42	import os
43	import string
44	import textwrap
45	from sys import stderr as STDERR
46
def read_fasta(source, obj=None, header_delimiter="\t", fix_duplicates=True):
    """Read a collection of sequences encoded in FASTA format.

    Blank lines and lines starting with ``#`` are skipped. Each line
    starting with ``>`` opens a new record; every other line is appended
    (with spaces removed) to the sequence of the current record.

    :param source: path to an existing FASTA file, or a string holding the
        FASTA text itself (used whenever no file with that name exists).
    :param obj: container to fill. It must expose the ``id2seq``,
        ``id2name``, ``name2id`` and ``id2comment`` mappings. When None, a
        new ``ete2.coretype.seqgroup.SeqGroup`` is created.
    :param header_delimiter: separator between the sequence name and the
        comment fields inside a header line.
    :param fix_duplicates: when true, a name that was already read is
        prefixed with a numeric tag and a notice is written to stderr.
    :returns: the filled container, or None (after a message on stderr)
        when the last header is not followed by any sequence data.
    :raises Exception: if a header is directly followed by another header,
        or if sequence data appears before the first header.
    """
    if obj is None:
        # Imported lazily so that callers providing their own container do
        # not need the ete2 package to be importable.
        from ete2.coretype import seqgroup
        SC = seqgroup.SeqGroup()
    else:
        SC = obj

    names = set()
    seq_id = -1
    seq_name = None

    # Prepare the line iterator: an existing path is read from disk, any
    # other value is treated as the FASTA text itself.
    if os.path.isfile(source):
        import sys
        # Python 2 needs "U" for universal newlines; Python 3 text mode
        # already translates newlines and rejects "U" from 3.11 on.
        mode = "rU" if sys.version_info[0] < 3 else "r"
        handle = open(source, mode)
        lines = handle
    else:
        handle = None
        lines = source.split("\n")

    try:
        for line in lines:
            line = line.strip()
            if not line or line.startswith('#'):
                continue

            if line.startswith('>'):
                # A new header while the previous record is still empty
                # means that the previous record has no sequence.
                if seq_id > -1 and SC.id2seq[seq_id] == "":
                    raise Exception("No sequence found for " + seq_name)

                seq_id += 1
                header_fields = [field.strip() for field in
                                 line[1:].split(header_delimiter)]
                seq_name = header_fields[0]

                # Rename duplicated names using the number of already
                # stored names ending with this one as a prefix tag.
                if fix_duplicates and seq_name in names:
                    tag = str(len([k for k in SC.name2id.keys()
                                   if k.endswith(seq_name)]))
                    old_name = seq_name
                    seq_name = tag + "_" + seq_name
                    STDERR.write(
                        "Duplicated entry [%s] was renamed to [%s]\n"
                        % (old_name, seq_name))

                # Register the new (still empty) record.
                SC.id2seq[seq_id] = ""
                SC.id2name[seq_id] = seq_name
                SC.name2id[seq_name] = seq_id
                SC.id2comment[seq_id] = header_fields[1:]
                names.add(seq_name)
            else:
                if seq_name is None:
                    raise Exception("Error reading sequences: Wrong format.")

                # The line is already stripped; drop inner spaces and
                # append it to the current sequence.
                SC.id2seq[seq_id] += line.replace(" ", "")
    finally:
        # Close the file even when a format error is raised.
        if handle is not None:
            handle.close()

    # The last record cannot be validated inside the loop.
    if seq_name and SC.id2seq[seq_id] == "":
        STDERR.write("%s has no sequence\n" % seq_name)
        return None

    # Everything ok
    return SC
111
def write_fasta(sequences, outfile = None, seqwidth = 80):
    """Write a SeqGroup python object using FASTA format.

    Each record is written as a header line ``>name<TAB>comment...``
    followed by the sequence wrapped to ``seqwidth`` columns. Records are
    separated by a blank line.

    :param sequences: iterable of ``(name, seq, comment, sid)`` tuples,
        where ``comment`` is a list of strings.
    :param outfile: path of the file to write. When None, nothing is
        written and the FASTA text is returned instead.
    :param seqwidth: maximum number of sequence characters per line.
    :returns: the FASTA text when ``outfile`` is None, otherwise None.
    """
    # Sequences are single long words: they must be cut at exactly
    # ``seqwidth`` columns, without touching hyphens, tabs or whitespace.
    wrapper = textwrap.TextWrapper(
        width=seqwidth,
        break_long_words=True,
        break_on_hyphens=False,
        replace_whitespace=False,
        expand_tabs=False,
    )

    records = []
    for name, seq, comment, sid in sequences:
        header = "\t".join([name] + comment)
        records.append(">%s\n%s\n" % (header, wrapper.fill(seq)))
    text = '\n'.join(records)

    if outfile is None:
        return text

    # The context manager closes the file even if the write fails.
    with open(outfile, "w") as out:
        out.write(text)
130
def write_fasta_internal(sequences, outfile = None, seqwidth = 80):
    """Write a SeqGroup python object using FASTA format and internal ids.

    Same output layout as a regular FASTA dump, but each header starts
    with the record's ``sid`` (converted with ``str``) instead of its name:
    ``>sid<TAB>comment...`` followed by the sequence wrapped to
    ``seqwidth`` columns. Records are separated by a blank line.

    :param sequences: iterable of ``(name, seq, comment, sid)`` tuples,
        where ``comment`` is a list of strings.
    :param outfile: path of the file to write. When None, nothing is
        written and the FASTA text is returned instead.
    :param seqwidth: maximum number of sequence characters per line.
    :returns: the FASTA text when ``outfile`` is None, otherwise None.
    """
    # Sequences are single long words: they must be cut at exactly
    # ``seqwidth`` columns, without touching hyphens, tabs or whitespace.
    wrapper = textwrap.TextWrapper(
        width=seqwidth,
        break_long_words=True,
        break_on_hyphens=False,
        replace_whitespace=False,
        expand_tabs=False,
    )

    records = []
    for name, seq, comment, sid in sequences:
        # The name is deliberately ignored: the internal id labels the record.
        header = "\t".join([str(sid)] + comment)
        records.append(">%s\n%s\n" % (header, wrapper.fill(seq)))
    text = '\n'.join(records)

    if outfile is None:
        return text

    # The context manager closes the file even if the write fails.
    with open(outfile, "w") as out:
        out.write(text)

Note: See TracBrowser for help on using the repository browser.

Download in other formats: