| 1 | __VERSION__="ete2-2.2rev1026" |
|---|
| 2 | # -*- coding: utf-8 -*- |
|---|
| 3 | # #START_LICENSE########################################################### |
|---|
| 4 | # |
|---|
| 5 | # |
|---|
| 6 | # This file is part of the Environment for Tree Exploration program |
|---|
| 7 | # (ETE). http://ete.cgenomics.org |
|---|
| 8 | # |
|---|
| 9 | # ETE is free software: you can redistribute it and/or modify it |
|---|
| 10 | # under the terms of the GNU General Public License as published by |
|---|
| 11 | # the Free Software Foundation, either version 3 of the License, or |
|---|
| 12 | # (at your option) any later version. |
|---|
| 13 | # |
|---|
| 14 | # ETE is distributed in the hope that it will be useful, but WITHOUT |
|---|
| 15 | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY |
|---|
| 16 | # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public |
|---|
| 17 | # License for more details. |
|---|
| 18 | # |
|---|
| 19 | # You should have received a copy of the GNU General Public License |
|---|
| 20 | # along with ETE. If not, see <http://www.gnu.org/licenses/>. |
|---|
| 21 | # |
|---|
| 22 | # |
|---|
| 23 | # ABOUT THE ETE PACKAGE |
|---|
| 24 | # ===================== |
|---|
| 25 | # |
|---|
| 26 | # ETE is distributed under the GPL copyleft license (2008-2011). |
|---|
| 27 | # |
|---|
| 28 | # If you make use of ETE in published work, please cite: |
|---|
| 29 | # |
|---|
| 30 | # Jaime Huerta-Cepas, Joaquin Dopazo and Toni Gabaldon. |
|---|
| 31 | # ETE: a python Environment for Tree Exploration. BMC |
|---|
| 32 | # Bioinformatics 2010, 11:24. doi:10.1186/1471-2105-11-24 |
|---|
| 33 | # |
|---|
| 34 | # Note that extra references to the specific methods implemented in |
|---|
| 35 | # the toolkit are available in the documentation. |
|---|
| 36 | # |
|---|
| 37 | # More info at http://ete.cgenomics.org |
|---|
| 38 | # |
|---|
| 39 | # |
|---|
| 40 | # #END_LICENSE############################################################# |
|---|
| 41 | |
|---|
| 42 | #! /usr/bin/env python |
|---|
| 43 | |
|---|
| 44 | import re |
|---|
| 45 | from sys import stderr |
|---|
| 46 | import numpy |
|---|
| 47 | |
|---|
| 48 | |
|---|
| 49 | __all__ = ['read_arraytable', 'write_arraytable'] |
|---|
| 50 | |
|---|
def read_arraytable(matrix_file, mtype="float", arraytable_object = None):
    """Read a tab-delimited text matrix and load it into an array table.

    Expected layout: a header line whose first field matches ``#NAMES``
    (case-insensitive) followed by the column names, then one line per
    row made of a row name followed by one value per column.  Other
    lines starting with ``#`` are comments and empty lines are skipped.
    Empty value fields are stored as ``numpy.nan``.  Duplicated column
    or row names are made unique by appending ``_<occurrence number>``
    and a warning is written to stderr.

    Arguments:
      matrix_file: a string.  If it contains a newline it is parsed as
        the matrix text itself; otherwise it is used as the path of a
        file to open and read.
      mtype: dtype handed to ``numpy.ndarray.astype`` for the values
        (also stored as ``A.mtype``).
      arraytable_object: object to fill.  Its ``colValues``,
        ``colNames``, ``rowValues`` and ``rowNames`` attributes are
        updated and ``_link_names2matrix`` is called with the resulting
        numpy matrix.  When None, a new
        ``ete2.coretype.arraytable.ArrayTable`` is created.

    Returns:
      The filled array table object.

    Raises:
      ValueError: if a data line appears before any column names were
        read, or if a data line does not have exactly one value field
        per column name.
    """

    if arraytable_object is None:
        # Imported lazily, only when no target object is supplied.
        from ete2.coretype import arraytable
        A = arraytable.ArrayTable()
    else:
        A = arraytable_object

    A.mtype = mtype
    temp_matrix = []
    rowname_counter = {}
    colname_counter = {}
    row_dup_flag = False
    col_dup_flag = False

    # If matrix_file spans several lines, it is taken as the matrix
    # itself; otherwise it is a path.  The file is read inside a
    # context manager so the handle is always closed.
    if "\n" in matrix_file:
        matrix_data = matrix_file.split("\n")
    else:
        with open(matrix_file) as matrix_handle:
            matrix_data = matrix_handle.readlines()

    for line in matrix_data:
        # Clean up line
        line = line.strip("\n")
        # Skip empty lines
        if not line:
            continue
        # Get fields in line
        fields = line.split("\t")

        # Read column names
        if line[0] == '#' and re.match("#NAMES", fields[0], re.IGNORECASE):
            for colname in fields[1:]:
                colname = colname.strip()

                # Handle duplicated col names by adding a number
                colname_counter[colname] = colname_counter.get(colname, 0) + 1
                if colname in A.colValues:
                    colname += "_%d" % colname_counter[colname]
                    col_dup_flag = True
                # Adds colname
                A.colValues[colname] = None
                A.colNames.append(colname)
            if col_dup_flag:
                stderr.write("Duplicated column names were renamed.\n")

        # Skip comments
        elif line[0] == '#':
            continue

        # Read values (only when column names are loaded)
        elif A.colNames:
            # Checks shape: first field is the row name, the rest are values
            if len(fields) - 1 != len(A.colNames):
                raise ValueError("Invalid number of columns. Expecting:%d"
                                 % len(A.colNames))

            # Extracts row name and remove it from fields
            rowname = fields.pop(0).strip()

            # Handles duplicated row names by adding a number
            rowname_counter[rowname] = rowname_counter.get(rowname, 0) + 1
            if rowname in A.rowValues:
                rowname += "_%d" % rowname_counter[rowname]
                # Must set the flag that is checked after the loop,
                # otherwise the warning below is never emitted.
                row_dup_flag = True

            # Adds row name
            A.rowValues[rowname] = None
            A.rowNames.append(rowname)

            # Reads row values; blank fields become NaN
            values = []
            for f in fields:
                if f.strip() == "":
                    f = numpy.nan
                values.append(f)
            temp_matrix.append(values)
        else:
            raise ValueError("Column names are required.")

    if row_dup_flag:
        stderr.write("Duplicated row names were renamed.\n")

    # Convert all read lines into a numpy matrix
    vmatrix = numpy.array(temp_matrix).astype(A.mtype)

    # Updates indexes to link names and vectors in matrix
    A._link_names2matrix(vmatrix)
    return A
|---|
| 142 | |
|---|
def write_arraytable(A, fname, colnames=None):
    """Write an array table to a tab-delimited text file.

    The output starts with a ``#NAMES`` header line listing the written
    column names, followed by one line per entry of ``A.rowNames``
    holding the row name and that row's values.

    Arguments:
      A: array table object.  This function reads ``A.colNames`` and
        ``A.rowNames`` and calls ``A.get_several_column_vectors``,
        whose result is expected to be a 2-D numpy array with one
        entry per requested column (it is transposed here to get
        rows) -- NOTE(review): confirm against the ArrayTable class.
      fname: path of the output file (opened for writing, truncated).
      colnames: names of the columns to write.  None or an empty
        sequence means "all columns", i.e. ``A.colNames``.
    """
    # None and [] both select every column.  The previous code turned
    # None into [] and then skipped the fallback, so a default call
    # requested no columns at all.
    if not colnames:
        colnames = A.colNames

    matrix = A.get_several_column_vectors(colnames)
    # Column vectors -> one row of values per row name.
    matrix = matrix.swapaxes(0, 1)

    # The context manager closes the file even if writing fails.
    with open(fname, "w") as OUT:
        OUT.write('\t'.join(["#NAMES"] + list(colnames)) + "\n")
        for index, rname in enumerate(A.rowNames):
            row = [rname] + matrix[index].tolist()
            OUT.write('\t'.join(map(str, row)) + "\n")
|---|