source: branches/items/GDE/SATIVA/sativa/epac/ete2/arraytable.py

Last change on this file was 12906, checked in by akozlov, 10 years ago

add sativa files and scripts for ARB integration

File size: 8.8 KB
Line 
1__VERSION__="ete2-2.2rev1026" 
2# -*- coding: utf-8 -*-
3# #START_LICENSE###########################################################
4#
5#
6# This file is part of the Environment for Tree Exploration program
7# (ETE).  http://ete.cgenomics.org
8
9# ETE is free software: you can redistribute it and/or modify it
10# under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13
14# ETE is distributed in the hope that it will be useful, but WITHOUT
15# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
17# License for more details.
18
19# You should have received a copy of the GNU General Public License
20# along with ETE.  If not, see <http://www.gnu.org/licenses/>.
21#
22#
23#                     ABOUT THE ETE PACKAGE
24#                     =====================
25#
26# ETE is distributed under the GPL copyleft license (2008-2011). 
27#
28# If you make use of ETE in published work, please cite:
29#
30# Jaime Huerta-Cepas, Joaquin Dopazo and Toni Gabaldon.
31# ETE: a python Environment for Tree Exploration. Jaime BMC
32# Bioinformatics 2010,:24doi:10.1186/1471-2105-11-24
33#
34# Note that extra references to the specific methods implemented in
35# the toolkit are available in the documentation.
36#
37# More info at http://ete.cgenomics.org
38#
39#
40# #END_LICENSE#############################################################
41
42
43import sys
44import re
45import math
46from os import path
47
48import numpy
49from ete2.parser.text_arraytable import write_arraytable, read_arraytable
50
51__all__ = ["ArrayTable"]
52
53class ArrayTable(object):
54    """This object is thought to work with matrix datasets (like
55    microarrays). It allows to load the matrix an access easily to row
56    and column vectors. """
57
58    def __repr__(self):
59        return "ArrayTable (%s)" %hex(self.__hash__())
60
61    def __str__(self):
62        return str(self.matrix)
63
64    def __init__(self, matrix_file=None, mtype="float"):
65        self.colNames  = []
66        self.rowNames  = []
67        self.colValues = {}
68        self.rowValues = {}
69        self.matrix   = None
70        self.mtype = None
71
72        # If matrix file is supplied
73        if matrix_file is not None:
74            read_arraytable(matrix_file, \
75                            mtype=mtype, \
76                            arraytable_object = self)
77
78    def get_row_vector(self,rowname):
79        """ Returns the vector associated to the given row name """
80        return self.rowValues.get(rowname,None)
81
82
83    def get_column_vector(self,colname):
84        """ Returns the vector associated to the given column name """
85        return self.colValues.get(colname,None)
86
87
88    def get_several_column_vectors(self,colnames):
89        """ Returns a list of vectors associated to several column names """
90        vectors = [self.colValues[cname] for cname in colnames]
91        return numpy.array(vectors)
92
93    def get_several_row_vectors(self,rownames):
94        """ Returns a list vectors associated to several row names """
95        vectors = [self.rowValues[rname] for rname in rownames]
96        return numpy.array(vectors)
97
98    def remove_column(self,colname):
99        """Removes the given column form the current dataset """
100        col_value = self.colValues.pop(colname, None)
101        if col_value != None:
102            new_indexes = range(len(self.colNames))
103            index = self.colNames.index(colname)
104            self.colNames.pop(index)
105            new_indexes.pop(index)
106            newmatrix = self.matrix.swapaxes(0,1)
107            newmatrix = newmatrix[new_indexes].swapaxes(0,1)
108            self._link_names2matrix(newmatrix)
109
110    def merge_columns(self, groups, grouping_criterion):
111        """ Returns a new ArrayTable object in which columns are
112        merged according to a given criterion.
113
114        'groups' argument must be a dictionary in which keys are the
115        new column names, and each value is the list of current
116        column names to be merged.
117
118        'grouping_criterion' must be 'min', 'max' or 'mean', and
119        defines how numeric values will be merged.
120
121        Example:
122           my_groups = {'NewColumn':['column5', 'column6']}
123           new_Array = Array.merge_columns(my_groups, 'max')
124
125        """
126
127        if grouping_criterion == "max":
128            grouping_f = get_max_vector
129        elif grouping_criterion == "min":
130            grouping_f = get_min_vector
131        elif grouping_criterion == "mean":
132            grouping_f = get_mean_vector
133        else:
134            raise ValueError, "grouping_criterion not supported. Use max|min|mean "
135
136        grouped_array = self.__class__()
137        grouped_matrix = []
138        colNames = []
139        alltnames = set([])
140        for gname,tnames in groups.iteritems():
141            all_vectors=[]
142            for tn in tnames:
143                if tn not in self.colValues:
144                    raise ValueError, str(tn)+" column not found."
145                if tn in alltnames:
146                    raise ValueError, str(tn)+" duplicated column name for merging"
147                alltnames.add(tn)
148                vector = self.get_column_vector(tn).astype(float)
149                all_vectors.append(vector)
150            # Store the group vector = max expression of all items in group
151            grouped_matrix.append(grouping_f(all_vectors))
152            # store group name
153            colNames.append(gname)
154
155        for cname in self.colNames:
156            if cname not in alltnames:
157                grouped_matrix.append(self.get_column_vector(cname))
158                colNames.append(cname)
159
160        grouped_array.rowNames= self.rowNames
161        grouped_array.colNames= colNames
162        vmatrix = numpy.array(grouped_matrix).transpose()
163        grouped_array._link_names2matrix(vmatrix)
164        return grouped_array
165
166    def transpose(self):
167        """ Returns a new ArrayTable in which current matrix is transposed. """
168
169        transposedA = self.__class__()
170        transposedM = self.matrix.transpose()
171        transposedA.colNames = list(self.rowNames)
172        transposedA.rowNames = list(self.colNames)
173        transposedA._link_names2matrix(transposedM)
174
175        # Check that everything is ok
176        # for n in self.colNames:
177        #     print self.get_column_vector(n) ==  transposedA.get_row_vector(n)
178        # for n in self.rowNames:
179        #     print self.get_row_vector(n) ==  transposedA.get_column_vector(n)
180        return transposedA
181
182    def _link_names2matrix(self, m):
183        """ Synchronize curent column and row names to the given matrix"""
184        if len(self.rowNames) != m.shape[0]:
185            raise ValueError , "Expecting matrix with  %d rows" % m.size[0]
186
187        if len(self.colNames) != m.shape[1]:
188            raise ValueError , "Expecting matrix with  %d columns" % m.size[1]
189
190        self.matrix = m
191        self.colValues.clear()
192        self.rowValues.clear()
193        # link columns names to vectors
194        i = 0
195        for colname in self.colNames:
196            self.colValues[colname] = self.matrix[:,i]
197            i+=1
198        # link row names to vectors
199        i = 0
200        for rowname in self.rowNames:
201            self.rowValues[rowname] = self.matrix[i,:]
202            i+=1
203
204    def write(self, fname, colnames=None):
205        write_arraytable(self, fname, colnames=colnames)
206
207
208
209def get_centroid_dist(vcenter,vlist,fdist):
210    d = 0.0
211    for v in vlist:
212        d += fdist(v,vcenter)
213    return 2*(d / len(vlist))
214
215def get_average_centroid_linkage_dist(vcenter1,vlist1,vcenter2,vlist2,fdist):
216    d1,d2 = 0.0, 0.0
217    for v in vlist1:
218        d1 += fdist(v,vcenter2)
219    for v in vlist2:
220        d2 += fdist(v,vcenter1)
221    return (d1+d2) / (len(vlist1)+len(vlist2))
222
223def safe_mean(values):
224    """ Returns mean value discarding non finite values """
225    valid_values = []
226    for v in values:
227        if numpy.isfinite(v):
228            valid_values.append(v)
229    return numpy.mean(valid_values), numpy.std(valid_values)
230
231def safe_mean_vector(vectors):
232    """ Returns mean profile discarding non finite values """
233    # if only one vector, avg = itself
234    if len(vectors)==1:
235        return vectors[0], numpy.zeros(len(vectors[0]))
236    # Takes the vector length form the first item
237    length = len(vectors[0])
238
239    safe_mean = []
240    safe_std  = []
241
242    for pos in xrange(length):
243        pos_mean = []
244        for v in vectors:
245            if numpy.isfinite(v[pos]):
246                pos_mean.append(v[pos])
247        safe_mean.append(numpy.mean(pos_mean))
248        safe_std.append(numpy.std(pos_mean))
249    return safe_mean, safe_std
250
251def get_mean_vector(vlist):
252    a = numpy.array(vlist)
253    return numpy.mean(a,0)
254
255def get_median_vector(vlist):
256    a = numpy.array(vlist)
257    return numpy.median(a)
258
259def get_max_vector(vlist):
260    a = numpy.array(vlist)
261    return numpy.max(a,0)
262
263def get_min_vector(vlist):
264    a = numpy.array(vlist)
265    return numpy.min(a,0)
Note: See TracBrowser for help on using the repository browser.