source: branches/stable/GDE/SATIVA/sativa/epac/ete2/text_arraytable.py

Last change on this file was 12906, checked in by akozlov, 10 years ago

add sativa files and scripts for ARB integration

File size: 5.0 KB
Line 
1__VERSION__="ete2-2.2rev1026" 
2# -*- coding: utf-8 -*-
3# #START_LICENSE###########################################################
4#
5#
6# This file is part of the Environment for Tree Exploration program
7# (ETE).  http://ete.cgenomics.org
8
9# ETE is free software: you can redistribute it and/or modify it
10# under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13
14# ETE is distributed in the hope that it will be useful, but WITHOUT
15# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
17# License for more details.
18
19# You should have received a copy of the GNU General Public License
20# along with ETE.  If not, see <http://www.gnu.org/licenses/>.
21#
22#
23#                     ABOUT THE ETE PACKAGE
24#                     =====================
25#
26# ETE is distributed under the GPL copyleft license (2008-2011). 
27#
28# If you make use of ETE in published work, please cite:
29#
30# Jaime Huerta-Cepas, Joaquin Dopazo and Toni Gabaldon.
31# ETE: a python Environment for Tree Exploration. BMC
32# Bioinformatics 2010, 11:24. doi:10.1186/1471-2105-11-24
33#
34# Note that extra references to the specific methods implemented in
35# the toolkit are available in the documentation.
36#
37# More info at http://ete.cgenomics.org
38#
39#
40# #END_LICENSE#############################################################
41
42#! /usr/bin/env python
43
44import re
45from sys import stderr
46import numpy
47
48
49__all__ = ['read_arraytable', 'write_arraytable']
50
def read_arraytable(matrix_file, mtype="float", arraytable_object = None):
    """Read a tab-delimited text matrix into an array table.

    The input needs a header line whose first field starts with
    ``#NAMES`` (case-insensitive); the remaining fields of that line are
    the column names. Every later non-comment line holds a row name
    followed by exactly one value per column. Empty lines and other
    lines starting with ``#`` are skipped, and empty values are stored
    as NaN. A row or column name that was already seen is renamed by
    appending ``_<occurrence number>`` and a warning is written to
    stderr.

    :param matrix_file: path of the file to read or, when the string
        contains a newline, the matrix text itself.
    :param mtype: type stored in ``A.mtype`` and used to cast the values
        through ``numpy.ndarray.astype``.
    :param arraytable_object: table object to fill. When None, a new
        ``ete2.coretype.arraytable.ArrayTable`` is created.
    :returns: the filled table object.
    :raises ValueError: if a data line appears before the column names,
        or if a data line does not have one value per column.
    """

    if arraytable_object is None:
        from ete2.coretype import arraytable
        A = arraytable.ArrayTable()
    else:
        A = arraytable_object

    A.mtype = mtype
    temp_matrix = []
    rowname_counter = {}
    colname_counter = {}
    row_dup_flag = False
    col_dup_flag = False

    # If matrix_file has many lines, read it as the matrix itself;
    # otherwise treat it as a path. Only a file opened here is closed
    # here.
    opened_file = None
    text_lines = matrix_file.split("\n")
    if len(text_lines) > 1:
        matrix_data = text_lines
    else:
        opened_file = open(matrix_file)
        matrix_data = opened_file

    try:
        for line in matrix_data:
            # Clean up line
            line = line.strip("\n")
            # Skip empty lines
            if not line:
                continue
            # Get fields in line
            fields = line.split("\t")

            # Read column names
            if line[0] == '#' and re.match("#NAMES", fields[0], re.IGNORECASE):
                for colname in fields[1:]:
                    colname = colname.strip()

                    # Handle duplicated col names by adding a number
                    colname_counter[colname] = colname_counter.get(colname, 0) + 1
                    if colname in A.colValues:
                        colname += "_%d" % colname_counter[colname]
                        col_dup_flag = True
                    # Adds colname
                    A.colValues[colname] = None
                    A.colNames.append(colname)
                if col_dup_flag:
                    stderr.write("Duplicated column names were renamed.\n")

            # Skip comments
            elif line[0] == '#':
                continue

            # Read values (only when column names are loaded)
            elif A.colNames:
                # Checks shape
                if len(fields) - 1 != len(A.colNames):
                    raise ValueError("Invalid number of columns. Expecting:%d"
                                     % len(A.colNames))

                # Extracts row name and remove it from fields
                rowname = fields.pop(0).strip()

                # Handles duplicated row names by adding a number
                rowname_counter[rowname] = rowname_counter.get(rowname, 0) + 1
                if rowname in A.rowValues:
                    rowname += "_%d" % rowname_counter[rowname]
                    # This flag drives the warning emitted after the loop.
                    row_dup_flag = True

                # Adds row name
                A.rowValues[rowname] = None
                A.rowNames.append(rowname)

                # Reads row values; blank cells become NaN
                values = []
                for f in fields:
                    if f.strip() == "":
                        f = numpy.nan
                    values.append(f)
                temp_matrix.append(values)
            else:
                raise ValueError("Column names are required.")
    finally:
        if opened_file is not None:
            opened_file.close()

    if row_dup_flag:
        stderr.write("Duplicated row names were renamed.\n")

    # Convert all read lines into a numpy matrix
    vmatrix = numpy.array(temp_matrix).astype(A.mtype)

    # Updates indexes to link names and vectors in matrix
    A._link_names2matrix(vmatrix)
    return A
142
def write_arraytable(A, fname, colnames=None):
    """Write an array table to a tab-delimited text file.

    The first line written is ``#NAMES`` followed by the selected column
    names. Each following line holds a row name and that row's values
    for the selected columns, in the order of ``A.rowNames``.

    :param A: table object providing ``colNames``, ``rowNames`` and
        ``get_several_column_vectors``.
    :param fname: path of the file to create or overwrite.
    :param colnames: names of the columns to write. None or an empty
        sequence selects every column in ``A.colNames``.
    """
    # Both "not given" and "empty" mean all columns. Previously the
    # default None was turned into an empty selection.
    if colnames is None or len(colnames) == 0:
        colnames = A.colNames
    # Work on a list so the header concatenation also accepts tuples.
    colnames = list(colnames)

    # One vector per column comes back; swap axes to iterate by row.
    matrix = A.get_several_column_vectors(colnames)
    matrix = matrix.swapaxes(0, 1)

    OUT = open(fname, "w")
    try:
        OUT.write('\t'.join(["#NAMES"] + colnames) + "\n")
        for index, rname in enumerate(A.rowNames):
            row = [rname] + matrix[index].tolist()
            OUT.write('\t'.join(map(str, row)) + "\n")
    finally:
        # Close the file even if formatting a row fails.
        OUT.close()
Note: See TracBrowser for help on using the repository browser.