1 | __VERSION__="ete2-2.2rev1026" |
---|
2 | # -*- coding: utf-8 -*- |
---|
3 | # #START_LICENSE########################################################### |
---|
4 | # |
---|
5 | # |
---|
6 | # This file is part of the Environment for Tree Exploration program |
---|
7 | # (ETE). http://ete.cgenomics.org |
---|
8 | # |
---|
9 | # ETE is free software: you can redistribute it and/or modify it |
---|
10 | # under the terms of the GNU General Public License as published by |
---|
11 | # the Free Software Foundation, either version 3 of the License, or |
---|
12 | # (at your option) any later version. |
---|
13 | # |
---|
14 | # ETE is distributed in the hope that it will be useful, but WITHOUT |
---|
15 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
---|
16 | # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public |
---|
17 | # License for more details. |
---|
18 | # |
---|
19 | # You should have received a copy of the GNU General Public License |
---|
20 | # along with ETE. If not, see <http://www.gnu.org/licenses/>. |
---|
21 | # |
---|
22 | # |
---|
23 | # ABOUT THE ETE PACKAGE |
---|
24 | # ===================== |
---|
25 | # |
---|
26 | # ETE is distributed under the GPL copyleft license (2008-2011). |
---|
27 | # |
---|
28 | # If you make use of ETE in published work, please cite: |
---|
29 | # |
---|
30 | # Jaime Huerta-Cepas, Joaquin Dopazo and Toni Gabaldon. |
---|
31 | # ETE: a python Environment for Tree Exploration. BMC |
---|
32 | # Bioinformatics 2010, 11:24. doi:10.1186/1471-2105-11-24 |
---|
33 | # |
---|
34 | # Note that extra references to the specific methods implemented in |
---|
35 | # the toolkit are available in the documentation. |
---|
36 | # |
---|
37 | # More info at http://ete.cgenomics.org |
---|
38 | # |
---|
39 | # |
---|
40 | # #END_LICENSE############################################################# |
---|
41 | |
---|
42 | #! /usr/bin/env python |
---|
43 | |
---|
44 | import re |
---|
45 | from sys import stderr |
---|
46 | import numpy |
---|
47 | |
---|
48 | |
---|
49 | __all__ = ['read_arraytable', 'write_arraytable'] |
---|
50 | |
---|
def _dedup_name(name, seen_counts, taken):
    """Return a (name, renamed) pair, suffixing `name` if already taken.

    `seen_counts` tallies how often each raw name has been read so far;
    when `name` is already a key of `taken`, the tally is appended as
    `_<n>` (the second occurrence of "x" becomes "x_2").
    """
    seen_counts[name] = seen_counts.get(name, 0) + 1
    if name in taken:
        return "%s_%d" % (name, seen_counts[name]), True
    return name, False


def read_arraytable(matrix_file, mtype="float", arraytable_object=None):
    """Read a tab-delimited text matrix into an array table.

    The input must contain a header line whose first field starts with
    `#NAMES` (case-insensitive) followed by the column names. Every
    later non-comment line is a row: a row name followed by one value
    per column. Empty lines and other lines starting with `#` are
    skipped. Empty value fields are stored as NaN.

    Duplicated column or row names are made unique by appending a
    numeric suffix, and a warning is written to stderr.

    :param matrix_file: either a path to the matrix file, or the matrix
        text itself (any string containing a newline is parsed as
        text).
    :param mtype: type passed to numpy's `astype` for the value matrix.
    :param arraytable_object: existing table object to fill. When None,
        a new `ete2.coretype.arraytable.ArrayTable` is created.

    :returns: the filled table object.

    :raises ValueError: if a data row appears before the column names,
        or if a row does not have exactly one value per column.
    """
    if arraytable_object is None:
        from ete2.coretype import arraytable
        A = arraytable.ArrayTable()
    else:
        A = arraytable_object

    A.mtype = mtype
    temp_matrix = []
    rowname_counter = {}
    colname_counter = {}
    row_dup_flag = False
    col_dup_flag = False

    # A string containing a newline is treated as the matrix itself;
    # anything else is taken to be a file name.
    if "\n" in matrix_file:
        matrix_data = matrix_file.split("\n")
    else:
        handle = open(matrix_file)
        try:
            # Read everything up front so the file is closed even when
            # parsing fails further down.
            matrix_data = handle.readlines()
        finally:
            handle.close()

    for line in matrix_data:
        line = line.strip("\n")
        # Skip empty lines
        if not line:
            continue
        fields = line.split("\t")

        if line[0] == '#':
            # Header with column names; any other '#' line is a comment.
            if re.match("#NAMES", fields[0], re.IGNORECASE):
                for colname in fields[1:]:
                    colname, renamed = _dedup_name(
                        colname.strip(), colname_counter, A.colValues)
                    col_dup_flag = col_dup_flag or renamed
                    A.colValues[colname] = None
                    A.colNames.append(colname)
                if col_dup_flag:
                    stderr.write("Duplicated column names were renamed.\n")
            continue

        # Values can only be read once column names are known.
        if not A.colNames:
            raise ValueError("Column names are required.")

        # First field is the row name, the rest must match the columns.
        if len(fields) - 1 != len(A.colNames):
            raise ValueError(
                "Invalid number of columns. Expecting:%d" % len(A.colNames))

        rowname, renamed = _dedup_name(
            fields.pop(0).strip(), rowname_counter, A.rowValues)
        row_dup_flag = row_dup_flag or renamed
        A.rowValues[rowname] = None
        A.rowNames.append(rowname)

        # Empty cells become NaN; other cells stay as text until the
        # final astype() conversion.
        values = []
        for f in fields:
            if f.strip() == "":
                f = numpy.nan
            values.append(f)
        temp_matrix.append(values)

    if row_dup_flag:
        stderr.write("Duplicated row names were renamed.\n")

    # Convert all read lines into a numpy matrix
    vmatrix = numpy.array(temp_matrix).astype(A.mtype)

    # Hands the matrix to the table so it can link names and vectors
    # (implementation lives in the table object, not shown here).
    A._link_names2matrix(vmatrix)
    return A
---|
142 | |
---|
def write_arraytable(A, fname, colnames=None):
    """Write an array table to a tab-delimited text file.

    The output starts with a `#NAMES` header line listing the column
    names, followed by one line per row in `A.rowNames`: the row name
    and then its values, all separated by tabs.

    :param A: table object providing `colNames`, `rowNames` and
        `get_several_column_vectors`.
    :param fname: path of the file to write (overwritten if present).
    :param colnames: names of the columns to export, in output order.
        When None or empty, all columns in `A.colNames` are written.
    """
    # Both None and an empty sequence mean "export every column".
    if not colnames:
        colnames = A.colNames
    colnames = list(colnames)

    # Presumably returns one vector per requested column; swapping the
    # axes then yields one entry per row -- verify against ArrayTable.
    matrix = A.get_several_column_vectors(colnames)
    matrix = matrix.swapaxes(0, 1)

    OUT = open(fname, "w")
    try:
        OUT.write('\t'.join(["#NAMES"] + colnames) + "\n")
        for index, rname in enumerate(A.rowNames):
            row = [rname] + matrix[index].tolist()
            OUT.write('\t'.join(map(str, row)) + "\n")
    finally:
        # Close the file even if formatting or writing a row fails.
        OUT.close()
---|