source: branches/stable/GDE/SATIVA/sativa/epac/ete2/fasta.py

Last change on this file was 12906, checked in by akozlov, 10 years ago

add sativa files and scripts for ARB integration

File size: 4.8 KB
Line 
1__VERSION__="ete2-2.2rev1026" 
2# -*- coding: utf-8 -*-
3# #START_LICENSE###########################################################
4#
5#
6# This file is part of the Environment for Tree Exploration program
7# (ETE).  http://ete.cgenomics.org
8
9# ETE is free software: you can redistribute it and/or modify it
10# under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13
14# ETE is distributed in the hope that it will be useful, but WITHOUT
15# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
17# License for more details.
18
19# You should have received a copy of the GNU General Public License
20# along with ETE.  If not, see <http://www.gnu.org/licenses/>.
21#
22#
23#                     ABOUT THE ETE PACKAGE
24#                     =====================
25#
26# ETE is distributed under the GPL copyleft license (2008-2011). 
27#
28# If you make use of ETE in published work, please cite:
29#
30# Jaime Huerta-Cepas, Joaquin Dopazo and Toni Gabaldon.
31# ETE: a python Environment for Tree Exploration. BMC
32# Bioinformatics 2010, 11:24. doi:10.1186/1471-2105-11-24
33#
34# Note that extra references to the specific methods implemented in
35# the toolkit are available in the documentation.
36#
37# More info at http://ete.cgenomics.org
38#
39#
40# #END_LICENSE#############################################################
41
42import os
43import string
44import textwrap
45from sys import stderr as STDERR
46
def read_fasta(source, obj=None, header_delimiter="\t", fix_duplicates=True):
    """Read a collection of sequences encoded in FASTA format.

    :param source: path of an existing FASTA file, or a string holding
        the FASTA text itself (used whenever it is not the path of a file).
    :param obj: container to fill. It must expose the ``id2seq``,
        ``id2name``, ``name2id`` and ``id2comment`` mappings. When None,
        a new ``ete2.coretype.seqgroup.SeqGroup`` is created.
    :param header_delimiter: separator splitting a header line into the
        sequence name (first field) and its comments (remaining fields).
    :param fix_duplicates: when true, a name that was already read during
        this call is renamed to ``<number>_<name>`` and a notice is
        written to stderr.

    :returns: the filled container, or None (after a message on stderr)
        when the last header is not followed by any sequence.
    :raises Exception: when a header other than the last one has no
        sequence, or when sequence data appears before the first header.
    """

    if obj is None:
        from ete2.coretype import seqgroup
        SC = seqgroup.SeqGroup()
    else:
        SC = obj

    names = set()
    seq_id = -1
    seq_name = None
    # Pieces of the record being read; joined once per record so that long
    # multi-line sequences are not re-copied on every input line.
    chunks = []

    # Prepares the iterable from which lines are read
    handle = None
    if os.path.isfile(source):
        import sys
        # "U" asks Python 2 for universal newlines; Python 3 does that by
        # default and no longer accepts the flag.
        handle = open(source, "rU" if sys.version_info[0] < 3 else "r")
        lines = handle
    else:
        lines = source.split("\n")

    try:
        for line in lines:
            line = line.strip()
            # Blank lines and '#' comment lines are ignored
            if not line or line.startswith('#'):
                continue

            if line.startswith('>'):
                # Closes the previous record and checks that it had a seq
                if seq_id > -1:
                    SC.id2seq[seq_id] = "".join(chunks)
                    if SC.id2seq[seq_id] == "":
                        raise Exception("No sequence found for " + seq_name)

                seq_id += 1
                chunks = []

                # Takes header info
                fields = [field.strip()
                          for field in line[1:].split(header_delimiter)]
                seq_name = fields[0]

                # Checks for duplicated seq names
                if fix_duplicates and seq_name in names:
                    old_name = seq_name
                    count = len([k for k in SC.name2id.keys()
                                 if k.endswith(old_name)])
                    seq_name = "%d_%s" % (count, old_name)
                    # The counter alone may produce a name that is already
                    # stored (e.g. a real entry called "2_a"); keep counting
                    # so that no existing entry gets overwritten.
                    while seq_name in SC.name2id:
                        count += 1
                        seq_name = "%d_%s" % (count, old_name)
                    STDERR.write(
                        "Duplicated entry [%s] was renamed to [%s]\n"
                        % (old_name, seq_name))

                # Stores the new record
                SC.id2seq[seq_id] = ""
                SC.id2name[seq_id] = seq_name
                SC.name2id[seq_name] = seq_id
                SC.id2comment[seq_id] = fields[1:]
                names.add(seq_name)

            else:
                if seq_name is None:
                    raise Exception("Error reading sequences: Wrong format.")

                # Removes all white spaces in line and keeps the piece
                chunks.append(line.replace(" ", ""))
    finally:
        if handle is not None:
            handle.close()

    # Closes the last record
    if seq_id > -1:
        SC.id2seq[seq_id] = "".join(chunks)

    if seq_name and SC.id2seq[seq_id] == "":
        STDERR.write("%s has no sequence\n" % seq_name)
        return None

    # Everything ok
    return SC
111
def write_fasta(sequences, outfile = None, seqwidth = 80):
    """Write a collection of sequences using FASTA format.

    Each record is a ``>`` header line holding the sequence name and its
    comments joined by tabs, followed by the sequence wrapped to
    ``seqwidth`` characters per line. Records are separated by a blank
    line.

    :param sequences: iterable of ``(name, seq, comment, sid)`` tuples,
        where ``comment`` is a list of strings; ``sid`` is not used here.
    :param outfile: path of the file to write. When None, nothing is
        written and the FASTA text is returned instead.
    :param seqwidth: maximum number of sequence characters per line.

    :returns: the FASTA text when ``outfile`` is None, otherwise None.
    """

    # Sequences are single "words": they are cut exactly every `seqwidth`
    # characters, never at hyphens (gaps), and tabs are left untouched.
    wrapper = textwrap.TextWrapper(
        width=seqwidth,
        break_on_hyphens=False,
        replace_whitespace=False,
        expand_tabs=False,
        break_long_words=True,
    )

    records = []
    for name, seq, comment, sid in sequences:
        header = "\t".join([name] + comment)
        records.append(">%s\n%s\n" % (header, wrapper.fill(seq)))
    text = "\n".join(records)

    if outfile is None:
        return text

    # The context manager closes the file even if the write fails.
    with open(outfile, "w") as out:
        out.write(text)
130
def write_fasta_internal(sequences, outfile = None, seqwidth = 80):
    """Write a collection of sequences using FASTA format, named by id.

    Same layout as a regular FASTA dump, except that the header of each
    record starts with ``str(sid)`` instead of the sequence name. The
    comments follow, joined by tabs, and the sequence is wrapped to
    ``seqwidth`` characters per line. Records are separated by a blank
    line.

    :param sequences: iterable of ``(name, seq, comment, sid)`` tuples,
        where ``comment`` is a list of strings; ``name`` is not used here.
    :param outfile: path of the file to write. When None, nothing is
        written and the FASTA text is returned instead.
    :param seqwidth: maximum number of sequence characters per line.

    :returns: the FASTA text when ``outfile`` is None, otherwise None.
    """

    # Sequences are single "words": they are cut exactly every `seqwidth`
    # characters, never at hyphens (gaps), and tabs are left untouched.
    wrapper = textwrap.TextWrapper(
        width=seqwidth,
        break_on_hyphens=False,
        replace_whitespace=False,
        expand_tabs=False,
        break_long_words=True,
    )

    records = []
    for name, seq, comment, sid in sequences:
        header = "\t".join([str(sid)] + comment)
        records.append(">%s\n%s\n" % (header, wrapper.fill(seq)))
    text = "\n".join(records)

    if outfile is None:
        return text

    # The context manager closes the file even if the write fails.
    with open(outfile, "w") as out:
        out.write(text)
Note: See TracBrowser for help on using the repository browser.