Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

source: trunk/GDE/SATIVA/sativa/epac/ete2/arraytable.py

Visit:

Last change on this file was 12906, checked in by akozlov, 11 years ago
add sativa files and scripts for ARB integration
File size: 8.8 KB

Line
1	__VERSION__="ete2-2.2rev1026"
2	# -- coding: utf-8 --
3	# #START_LICENSE###########################################################
4	#
5	#
6	# This file is part of the Environment for Tree Exploration program
7	# (ETE). http://ete.cgenomics.org
8	#
9	# ETE is free software: you can redistribute it and/or modify it
10	# under the terms of the GNU General Public License as published by
11	# the Free Software Foundation, either version 3 of the License, or
12	# (at your option) any later version.
13	#
14	# ETE is distributed in the hope that it will be useful, but WITHOUT
15	# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16	# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
17	# License for more details.
18	#
19	# You should have received a copy of the GNU General Public License
20	# along with ETE. If not, see <http://www.gnu.org/licenses/>.
21	#
22	#
23	# ABOUT THE ETE PACKAGE
24	# =====================
25	#
26	# ETE is distributed under the GPL copyleft license (2008-2011).
27	#
28	# If you make use of ETE in published work, please cite:
29	#
30	# Jaime Huerta-Cepas, Joaquin Dopazo and Toni Gabaldon.
31	# ETE: a python Environment for Tree Exploration. BMC
32	# Bioinformatics 2010, 11:24. doi:10.1186/1471-2105-11-24
33	#
34	# Note that extra references to the specific methods implemented in
35	# the toolkit are available in the documentation.
36	#
37	# More info at http://ete.cgenomics.org
38	#
39	#
40	# #END_LICENSE#############################################################
41
42
43	import sys
44	import re
45	import math
46	from os import path
47
48	import numpy
49	from ete2.parser.text_arraytable import write_arraytable, read_arraytable
50
51	__all__ = ["ArrayTable"]
52
class ArrayTable(object):
    """Labelled matrix container for datasets such as microarrays.

    It loads a matrix and gives easy, name-based access to its row and
    column vectors.

    Attributes initialised by the constructor:
      colNames  -- list of column names, in matrix column order
      rowNames  -- list of row names, in matrix row order
      colValues -- dict: column name -> column slice of ``matrix``
      rowValues -- dict: row name -> row slice of ``matrix``
      matrix    -- the underlying 2D matrix (None while the table is empty)
      mtype     -- starts as None (presumably filled in by the reader;
                   verify against read_arraytable)
    """

    def __repr__(self):
        return "ArrayTable (%s)" % hex(self.__hash__())

    def __str__(self):
        return str(self.matrix)

    def __init__(self, matrix_file=None, mtype="float"):
        """Create an empty table and optionally populate it from a file.

        matrix_file -- when not None, it is passed to read_arraytable
                       together with this object, which the reader is
                       expected to fill in.
        mtype       -- forwarded unchanged to read_arraytable.
        """
        self.colNames = []
        self.rowNames = []
        self.colValues = {}
        self.rowValues = {}
        self.matrix = None
        self.mtype = None

        # If matrix file is supplied
        if matrix_file is not None:
            read_arraytable(matrix_file,
                            mtype=mtype,
                            arraytable_object=self)

    def get_row_vector(self, rowname):
        """ Returns the vector associated to the given row name, or
        None when the name is unknown. """
        return self.rowValues.get(rowname, None)

    def get_column_vector(self, colname):
        """ Returns the vector associated to the given column name, or
        None when the name is unknown. """
        return self.colValues.get(colname, None)

    def get_several_column_vectors(self, colnames):
        """ Returns a numpy array with the vectors associated to several
        column names (one vector per entry, in the requested order).

        Raises KeyError if any of the names is unknown. """
        vectors = [self.colValues[cname] for cname in colnames]
        return numpy.array(vectors)

    def get_several_row_vectors(self, rownames):
        """ Returns a numpy array with the vectors associated to several
        row names (one vector per entry, in the requested order).

        Raises KeyError if any of the names is unknown. """
        vectors = [self.rowValues[rname] for rname in rownames]
        return numpy.array(vectors)

    def remove_column(self, colname):
        """Removes the given column from the current dataset.

        Unknown column names are ignored. The matrix is replaced by a
        new one without the column and all name -> vector links are
        rebuilt.
        """
        # Identity test on purpose: comparing a numpy vector against
        # None with "!=" is evaluated element-wise and cannot be used
        # as a condition.
        if self.colValues.pop(colname, None) is None:
            return

        index = self.colNames.index(colname)
        # Indexes of the columns that stay, computed before the name
        # list shrinks.
        keep = [i for i in range(len(self.colNames)) if i != index]
        self.colNames.pop(index)
        self._link_names2matrix(self.matrix[:, keep])

    def merge_columns(self, groups, grouping_criterion):
        """ Returns a new ArrayTable object in which columns are
        merged according to a given criterion.

        'groups' argument must be a dictionary in which keys are the
        new column names, and each value is the list of current
        column names to be merged.

        'grouping_criterion' must be 'min', 'max' or 'mean', and
        defines how numeric values will be merged.

        Merged columns come first (in the iteration order of
        'groups'), followed by the columns that were not part of any
        group, in their current order.

        Raises ValueError if the criterion is not supported, if a
        column to merge does not exist, or if a column is listed more
        than once.

        Example:
           my_groups = {'NewColumn':['column5', 'column6']}
           new_Array = Array.merge_columns(my_groups, 'max')

        """

        if grouping_criterion == "max":
            grouping_f = get_max_vector
        elif grouping_criterion == "min":
            grouping_f = get_min_vector
        elif grouping_criterion == "mean":
            grouping_f = get_mean_vector
        else:
            raise ValueError("grouping_criterion not supported. Use max|min|mean ")

        grouped_array = self.__class__()
        grouped_matrix = []
        colNames = []
        alltnames = set()
        for gname, tnames in groups.items():
            all_vectors = []
            for tn in tnames:
                if tn not in self.colValues:
                    raise ValueError(str(tn) + " column not found.")
                if tn in alltnames:
                    raise ValueError(str(tn) + " duplicated column name for merging")
                alltnames.add(tn)
                vector = self.get_column_vector(tn).astype(float)
                all_vectors.append(vector)
            # Store the group vector = merged values of all items in group
            grouped_matrix.append(grouping_f(all_vectors))
            # store group name
            colNames.append(gname)

        # Columns that are not part of any group are carried over as is
        for cname in self.colNames:
            if cname not in alltnames:
                grouped_matrix.append(self.get_column_vector(cname))
                colNames.append(cname)

        # Copy the row names so that the new table does not share its
        # name list with this one (same policy as transpose()).
        grouped_array.rowNames = list(self.rowNames)
        grouped_array.colNames = colNames
        # Vectors were collected one per column, hence the transpose
        vmatrix = numpy.array(grouped_matrix).transpose()
        grouped_array._link_names2matrix(vmatrix)
        return grouped_array

    def transpose(self):
        """ Returns a new ArrayTable in which current matrix is transposed. """

        transposedA = self.__class__()
        transposedM = self.matrix.transpose()
        transposedA.colNames = list(self.rowNames)
        transposedA.rowNames = list(self.colNames)
        transposedA._link_names2matrix(transposedM)
        return transposedA

    def _link_names2matrix(self, m):
        """ Synchronize current column and row names to the given matrix.

        'm' becomes the table matrix and the name -> vector
        dictionaries are rebuilt from slices of it.

        Raises ValueError if the shape of 'm' does not match the
        number of row or column names.
        """
        expected_rows = len(self.rowNames)
        expected_cols = len(self.colNames)
        if expected_rows != m.shape[0]:
            raise ValueError("Expecting matrix with %d rows" % expected_rows)

        if expected_cols != m.shape[1]:
            raise ValueError("Expecting matrix with %d columns" % expected_cols)

        self.matrix = m
        self.colValues.clear()
        self.rowValues.clear()
        # link columns names to vectors
        for i, colname in enumerate(self.colNames):
            self.colValues[colname] = self.matrix[:, i]
        # link row names to vectors
        for i, rowname in enumerate(self.rowNames):
            self.rowValues[rowname] = self.matrix[i, :]

    def write(self, fname, colnames=None):
        """ Writes the table through write_arraytable, forwarding
        'fname' and the optional 'colnames' argument. """
        write_arraytable(self, fname, colnames=colnames)
206
207
208
def get_centroid_dist(vcenter, vlist, fdist):
    """ Returns twice the average distance, as measured by
    fdist(item, vcenter), between the items in vlist and vcenter. """
    total = 0.0
    for member in vlist:
        total += fdist(member, vcenter)
    average = total / len(vlist)
    return 2 * average
214
def get_average_centroid_linkage_dist(vcenter1, vlist1, vcenter2, vlist2, fdist):
    """ Returns the average cross distance between two groups: every
    item of vlist1 is compared with vcenter2 and every item of vlist2
    with vcenter1 (using fdist), and the summed distances are divided
    by the total number of items. """
    to_center2 = 0.0
    for member in vlist1:
        to_center2 += fdist(member, vcenter2)

    to_center1 = 0.0
    for member in vlist2:
        to_center1 += fdist(member, vcenter1)

    n_items = len(vlist1) + len(vlist2)
    return (to_center2 + to_center1) / n_items
222
def safe_mean(values):
    """ Returns a (mean, standard deviation) pair computed only over
    the finite items of 'values' (as decided by numpy.isfinite). """
    finite = [item for item in values if numpy.isfinite(item)]
    mean = numpy.mean(finite)
    std = numpy.std(finite)
    return mean, std
230
def safe_mean_vector(vectors):
    """ Returns the mean profile of several vectors, discarding non
    finite values.

    The result is a (means, stds) pair with one entry per vector
    position. Each entry is computed only over the values that
    numpy.isfinite accepts at that position; numpy.mean and numpy.std
    receive an empty list for positions with no finite value.

    When a single vector is given, that same vector object is returned
    as the mean, together with a vector of zeros as deviation.

    The number of positions is taken from the first vector.
    """
    # if only one vector, avg = itself
    if len(vectors) == 1:
        return vectors[0], numpy.zeros(len(vectors[0]))
    # Takes the vector length from the first item
    length = len(vectors[0])

    # Named so that they do not shadow the module-level safe_mean()
    means = []
    stds = []

    # range() instead of xrange(): same result, available on Python 2 and 3
    for pos in range(length):
        finite = [v[pos] for v in vectors if numpy.isfinite(v[pos])]
        means.append(numpy.mean(finite))
        stds.append(numpy.std(finite))
    return means, stds
250
def get_mean_vector(vlist):
    """ Returns the position-wise mean of the given vectors (mean
    along the first axis of the array built from vlist). """
    return numpy.mean(numpy.array(vlist), axis=0)
254
def get_median_vector(vlist):
    """ Returns the position-wise median of the given vectors.

    The median is taken along the first axis of the array built from
    vlist, so the result has one value per vector position, like the
    mean/max/min counterparts in this module. """
    a = numpy.array(vlist)
    # The axis must be explicit: without it numpy.median collapses the
    # whole array into a single scalar.
    return numpy.median(a, 0)
258
def get_max_vector(vlist):
    """ Returns the position-wise maximum of the given vectors
    (maximum along the first axis of the array built from vlist). """
    return numpy.max(numpy.array(vlist), axis=0)
262
def get_min_vector(vlist):
    """ Returns the position-wise minimum of the given vectors
    (minimum along the first axis of the array built from vlist). """
    return numpy.min(numpy.array(vlist), axis=0)

Note: See TracBrowser for help on using the repository browser.

Download in other formats: