source: tags/svn.1.5.4/RNACMA/Cma.cxx

Last change on this file was 8309, checked in by westram, 14 years ago
  • moved much code into static scope

(partly reverted by [8310])

File size: 14.3 KB
/*
 * Cma.cxx
 *
 *  Created on: Dec 16, 2009
 *      Author: Breno Faria
 */

#include <iostream>
#include <iomanip>
#include <cmath>    // for log2() and fabs()
#include <time.h>

#include "Cma.h"

//>--------------------------Global definitions--------------------------------

// Some local function prototypes.
void unifyCluster(int cluster1, int cluster2, VectorXi & result);

//>-----------------------local helper functions-------------------------------

/**
 * Helper function for Cma::computeClusters. Compares MITuples by
 * mutual information value.
 *
 * @param first: the first MITuple
 *
 * @param second: the second MITuple
 *
 * @returns whether the first element should be placed before the second.
 */
static bool compareMITuples(const MITuple& first, const MITuple& second) {
    return first.MI > second.MI;
}

/**
 * Helper function for Cma::computeClusters. Unites two clusters: every
 * position labelled cluster2 in result is relabelled to cluster1.
 *
 * @param cluster1: the index of the first cluster.
 *
 * @param cluster2: the index of the second cluster.
 *
 * @param result: the vector of cluster assignments, modified in place.
 */
void unifyCluster(int cluster1, int cluster2, VectorXi & result) {
    size_t size = result.size();
    for (size_t i = 0; i < size; ++i) {
        if (result[i] == cluster2) {
            result[i] = cluster1;
        }
    }
}

//>---------------------------Public methods-----------------------------------

/**
 * Constructor.
 *
 * @param alph: the vector containing the alphabet. Notice that letters of the
 *              alphabet might be longer than one character. This is not used
 *              so far, but limiting the length to 1 seems wrong...
 *
 * @param seq_num: the number of sequences.
 */
Cma::Cma(vector<string> & alph, int seq_num) {

    initAlphabet(alph);
    num_of_seqs = seq_num;

}

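/*
 * Illustrative usage sketch (not part of the original file; the variable
 * names and the threshold value 0.25 are assumptions). The public API below
 * is typically driven in this order:
 *
 *     vector<string> bases;                 // single-character RNA alphabet
 *     bases.push_back("A");
 *     bases.push_back("C");
 *     bases.push_back("G");
 *     bases.push_back("U");
 *
 *     VecVecType msa;                       // one vector<string> per sequence
 *     // ... fill msa from the alignment ...
 *
 *     Cma cma(bases, msa.size());
 *     MatrixXd mip       = cma.computeMutualInformationP(msa);
 *     list<MITuple> best = cma.compute_mituples(mip);
 *     VectorXi clusters  = cma.computeClusters(best, mip.cols(), 0.25);
 */
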
/**
 * Destructor.
 */
Cma::~Cma() {
}

/**
 * Computes the mutual information of position pairs in a set of
 * sequences.
 * Mutual information can be defined as:
 *
 * MI(pos1, pos2) = H(pos1) + H(pos2) - H(pos1, pos2),
 *
 * i.e. the sum of entropies at two positions minus the joint entropy
 * of these positions.
 *
 * @param seq: the multiple sequence alignment.
 *
 * @returns a matrix with the MI values.
 */
MatrixXd Cma::computeMutualInformation(VecVecType & seq) {

    JointEntropy = computeJointEntropy(seq);

    // computeJointEntropy fills only the upper triangle, so mirror it onto the
    // lower one; the diagonal would be counted twice, hence the subtraction.
    // eval() materialises the transposed copy first, avoiding Eigen's
    // transpose-aliasing pitfall when JointEntropy is read and written in the
    // same assignment.
    JointEntropy = JointEntropy + JointEntropy.adjoint().eval()
                   - JointEntropy.diagonal().asDiagonal();

    entropy = JointEntropy.diagonal();

    cout << "computing Mutual Information... ";
    flush(cout);
    clock_t start = clock();

    // Build Hx(i, j) = H(j) and Hy(i, j) = H(i) via an outer product with a
    // vector of ones, so MI = Hx + Hy - H(i, j) is computed for all pairs at once.
    VectorXd Ones = VectorXd::Ones(entropy.size());
    MatrixXd Hx = (entropy * Ones.transpose()).transpose();
    MatrixXd Hy = Hx.transpose();

    MutualInformation = Hx + Hy - JointEntropy;

    cout << "completed in " << ((double) clock() - start) / CLOCKS_PER_SEC
         << "s." << endl;

    return MutualInformation;

}

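/*
 * Worked example (illustrative, ignoring the epsilon smoothing and the
 * mismatch penalty used below): for two fully correlated binary columns,
 * e.g. column 1 = (A,A,C,C) and column 2 = (U,U,G,G), H(pos1) = H(pos2) =
 * 1 bit and H(pos1, pos2) = 1 bit, so MI = 1 + 1 - 1 = 1 bit. For two
 * independent uniform binary columns the joint entropy is 2 bits and
 * MI = 1 + 1 - 2 = 0.
 */
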
/**
 * Computes the noiseless mutual information, or MIp (see classdoc), which is
 * defined as:
 *
 * MIp(pos1, pos2) = MI(pos1, pos2) - (mMI(pos1) * mMI(pos2)) / mMI,
 *
 * where MIp(pos1, pos2) is the noiseless mutual information between
 * positions 1 and 2, and MI(pos1, pos2) the raw one. mMI(x) is the mean
 * mutual information of position x, defined as:
 *
 * mMI(pos1) = 1/m * \sum_{i \in positions} MI(pos1, i), where m is the
 * length of the sequences.
 *
 * mMI is the overall mean mutual information, defined as:
 *
 * mMI = 2/m^2 * \sum_{i \in positions} \sum_{j \in positions} MI(i, j),
 * where again m is the length of the sequences.
 *
 * @param seq: the multiple sequence alignment.
 *
 * @returns the noiseless mutual information.
 */
MatrixXd Cma::computeMutualInformationP(VecVecType & seq) {

    if (MutualInformation.size() == 0) {
        computeMutualInformation(seq);
    }

    cout << "computing noiseless Mutual Information (MIp)... ";
    flush(cout);
    clock_t start = clock();
    /*
     * We must remove negative MI values (the negative values are artificially
     * generated, see comments in computeJointEntropy).
     */
    for (int i = 0; i < MutualInformation.rows(); ++i) {
        for (int j = 0; j < MutualInformation.cols(); ++j) {
            if (MutualInformation(i, j) < 0.) {
                MutualInformation(i, j) = 0.;
            }
        }
    }

    VectorXd mMIs = computeMeanMutualInformationVector();
    double mMI = computeOveralMeanMutualInformation();

    // Subtract the background term mMI(i) * mMI(j) / mMI from every entry
    // at once via the outer product of the mMI vector.
    MutualInformationP = MutualInformation - ((mMIs * mMIs.transpose()) / mMI);

    cout << "completed in " << ((double) clock() - start) / CLOCKS_PER_SEC
         << "s." << endl;

    return MutualInformationP;

}

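/*
 * Illustrative numbers (assumed, not from the original file): if
 * MI(3, 17) = 0.9, mMI(3) = 0.4, mMI(17) = 0.5 and the overall mean
 * mMI = 0.25, then MIp(3, 17) = 0.9 - (0.4 * 0.5) / 0.25 = 0.1, i.e. most
 * of the raw signal is explained by the general background correlation of
 * the two columns and is subtracted away.
 */
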
/**
 * Given an MI-matrix, computes the sorted list of the highest MI values.
 *
 * @param mutualInformation: a matrix with MI (or MIp) measures.
 *
 * @returns the list of tuples of positions sorted by their MI value.
 */
list<MITuple> Cma::compute_mituples(MatrixXd mutualInformation) {
    size_t size = mutualInformation.cols();

    list<MITuple> mituples;

    // populate the mituples list with all entries in the upper triangle
    // of mutualInformation (the matrix is symmetric).
    for (size_t i = 0; i < size; ++i) {
        for (size_t j = i + 1; j < size; ++j) {
            MITuple curr;
            curr.MI = mutualInformation(i, j);
            curr.pos1 = i;
            curr.pos2 = j;
            mituples.push_back(curr);
        }
    }

    // sort mituples by MI-value, highest first.
    mituples.sort(compareMITuples);

    return mituples;
}

/**
 * Computes the clusters of positions that are most correlated, down to
 * the given threshold.
 *
 * @param mituples: the list of position pairs sorted by descending MI value,
 *                  as generated by Cma::compute_mituples.
 *
 * @param size: the original size of the alignment.
 *
 * @param threshold: the mutual information threshold. If two positions have
 *                   mi < threshold, they won't be united in one cluster.
 *
 * @returns a vector with the cluster indices for each position.
 */
VectorXi Cma::computeClusters(list<MITuple> mituples, size_t size,
                              double threshold) {

    cout << "computing clusters of correlated positions... ";
    flush(cout);

    VectorXi result = VectorXi::Zero(size);

    // compute the clusters: walk the pairs from highest to lowest MI and stop
    // at the threshold (or once every position has been assigned).
    int cluster = 1;
    int rest = int(size);
    for (list<MITuple>::iterator it = mituples.begin(); it != mituples.end(); ++it) {
        if (it->MI <= threshold || rest == 0) {
            break;
        }
        double result1 = abs(result[it->pos1]);
        double result2 = abs(result[it->pos2]);
        if (result1 < epsilon and result2 < epsilon) {
            // neither position is clustered yet: open a new cluster.
            result[it->pos1] = cluster;
            result[it->pos2] = cluster++;
            rest -= 2;
        }
        else if (result1 > epsilon and result2 < epsilon) {
            // pos1 is already clustered: pull pos2 into its cluster.
            result[it->pos2] = result[it->pos1];
            rest -= 1;
        }
        else if (result1 < epsilon and result2 > epsilon) {
            result[it->pos1] = result[it->pos2];
            rest -= 1;
        }
        else if (result1 > epsilon and result2 > epsilon
                 and fabs(result1 - result2) > epsilon) {
            // both positions sit in different clusters: merge them. fabs()
            // makes the merge independent of which cluster index is larger
            // (the original compared the plain difference, so merges were
            // missed whenever result2 > result1).
            unifyCluster(result[it->pos1], result[it->pos2], result);
        }
    }

    clusters = result;

    cout << "done." << endl;

    return result;
}

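/*
 * Illustrative result (assumed values): for a 6-column alignment a return
 * value of (1, 1, 0, 2, 2, 1) means columns 0, 1 and 5 form cluster 1,
 * columns 3 and 4 form cluster 2, and column 2 (entry 0) was never assigned
 * because none of its pairs reached the threshold.
 */
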
//>-------------------------getters and setters--------------------------------

MatrixXd Cma::getEntropy() {

    return entropy;

}

MatrixXd Cma::getJointEntropy() {

    return JointEntropy;

}

MatrixXd Cma::getMI() {

    return MutualInformation;

}

MatrixXd Cma::getMIp() {

    return MutualInformationP;

}

VectorXi Cma::getClusters() {

    return clusters;

}

//>----------------------------private methods---------------------------------

/**
 * Here we define the alphabet: every single letter, then every ordered pair
 * of letters, and finally the special key "total" get consecutive indices
 * in alphabet_map.
 */
void Cma::initAlphabet(vector<string> alph) {

    alphabet = vector<string> (alph);

    int i = 0;
    for (vector<string>::iterator it = alph.begin(); it != alph.end(); ++it) {
        alphabet_map[*it] = i;
        i++;
    }
    for (vector<string>::iterator it = alph.begin(); it != alph.end(); ++it) {
        for (vector<string>::iterator it2 = alph.begin(); it2 != alph.end(); ++it2) {
            alphabet_map[*it + *it2] = i;
            i++;
        }
    }
    alphabet_map["total"] = i;
}

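/*
 * Example layout (assuming the RNA alphabet "A", "C", "G", "U"):
 * "A" -> 0, "C" -> 1, "G" -> 2, "U" -> 3, then the 16 ordered pairs
 * "AA" -> 4, "AC" -> 5, ..., "UU" -> 19, and finally "total" -> 20.
 */
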
/**
 * Computes an approximation of the joint entropy for each pair of positions.
 *
 * The joint entropy of two positions in a set of aligned sequences
 * is defined as:
 * - ( p(A,A) * log2(p(A,A)) + p(A,C) * log2(p(A,C)) + ... +
 *     p(C,A) * log2(p(C,A)) + p(C,C) * log2(p(C,C)) + ... +
 *     ... +
 *     p(T,A) * log2(p(T,A)) + ... + p(T,T) * log2(p(T,T)) ),
 * where p(x,y) is the probability of observing base x at position 1 and
 * base y at position 2.
 *
 * @param seqs: the aligned sequences
 *
 * @returns a square matrix with the joint entropy values for
 *          each pair of positions.
 */
MatrixXd Cma::computeJointEntropy(const VecVecType & seqs) {

    MapMatrixType hist = buildJointHistogram(seqs);
    return Cma::computeJointEntropy(hist);

}

/**
 * Computes an approximation of the joint entropy for each pair of positions.
 * The measure also includes a heuristic to penalise mismatches in occurrences
 * (see the documentation of this project for further explanations).
 *
 * @param hist: the histogram of labels for each position in the sequence.
 *
 * @returns a square matrix with the joint entropy values for
 *          each pair of positions.
 */
MatrixXd Cma::computeJointEntropy(MapMatrixType & hist) {

    cout << "computing joint entropy (this may take a while)... ";
    flush(cout);
    clock_t start = clock();

    int hist_size = int(hist.size());
    MatrixXd result(hist_size, hist_size);
    result.setZero(hist_size, hist_size);
    for (unsigned int i = 0; i < (unsigned int) hist_size; ++i) {

        //progress "bar"
        if (hist_size > 30 and i % (hist_size / 30) == 0) {
            cout << "(" << setw(6) << setiosflags(ios::fixed)
                 << setprecision(2) << i / float(hist_size) * 100 << "%)";
            flush(cout);
        }

        for (unsigned int j = i; j < hist[i].size(); ++j) {
            double result_i_j = 0.;
            int total_i_j = hist[i][j][alphabet_map.at("total")];
            int total_i_i = hist[i][i][alphabet_map.at("total")];
            int total_j_j = hist[j][j][alphabet_map.at("total")];
            // sequences covering position i or j, but not both:
            int mismatches = total_i_i + total_j_j - 2 * total_i_j;
            if (total_i_j != 0) {
                // plain joint entropy over the observed pair counts ...
                for (map<int, int>::iterator it = hist[i][j].begin(); it
                         != hist[i][j].end(); ++it) {
                    double pair = double(it->second) + Cma::epsilon;
                    result_i_j += (pair / total_i_j) * log2(pair / total_i_j);
                }

                // ... plus the mismatch penalty heuristic mentioned above.
                result(i, j) = -result_i_j
                               + double(mismatches) / (total_i_i + total_j_j) * Cma::penalty;

            } else {
                /**
                 * We will only fall into this case if we have two positions which
                 * never occur simultaneously. In this case we cannot say anything about
                 * the MI.
                 * To guarantee that this joint entropy value will not come in our way in
                 * the MI calculation we set it artificially to 5.0, which is 1 greater than
                 * max(H(X) + H(Y)).
                 */
                result(i, j) = 5.;
            }

        }
        //progress "bar"
        if (hist_size > 30 and i % (hist_size / 30) == 0) {
            cout << "\b\b\b\b\b\b\b\b\b";
        }
    }
    flush(cout);
    cout << "completed in " << ((double) clock() - start) / CLOCKS_PER_SEC
         << "s." << endl;
    return result;

}

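/*
 * Worked example (illustrative, ignoring the epsilon smoothing): if two
 * positions co-occur in 4 sequences as the pairs AU, AU, CG, CG, then
 * p(A,U) = p(C,G) = 0.5 and the plain joint entropy is
 * -(0.5*log2(0.5) + 0.5*log2(0.5)) = 1 bit; since every sequence covers
 * both positions, mismatches = 0 and no penalty is added.
 */
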
/**
 * Checks whether the key is valid (if it pertains to the alphabet).
 *
 * @param key: the base to be checked.
 *
 * @return whether the key is valid (if it pertains to the alphabet).
 */
bool Cma::isValid(string & key) {

    map<string, int>::iterator it = alphabet_map.find(key);
    return (it != alphabet_map.end());

}

/**
 * This method computes a matrix of maps, one for each pair of positions of the
 * aligned sequences.
 * Each map has an entry for each pair of occurring letters to count the joint
 * occurrences of these letters.
 *
 * @param alignedSequences: a vector of string vectors, one per sequence.
 *
 * @returns a matrix of maps (see method description)
 */
MapMatrixType Cma::buildJointHistogram(const VecVecType & alignedSequences) {

    cout << "building histogram (this may take a while)... ";
    flush(cout);
    clock_t start = clock();

    if (alignedSequences.empty()) {
        throw "Error: Cma::buildJointHistogram: parameter empty.";
    }

    // The matrix is square; both dimensions equal the length of the
    // (aligned, hence equally long) sequences.
    MapMatrixType result(alignedSequences[0].size(), MapVecType(
                             alignedSequences[0].size()));

    int ii = 0;
    // for each sequence:
    for (VecVecType::const_iterator it = alignedSequences.begin(); it
             != alignedSequences.end(); ++it) {

        //progress "bar"
        cout << "(" << setw(6) << setiosflags(ios::fixed) << setprecision(2)
             << ii / float(num_of_seqs) * 100 << "%)";
        flush(cout);

        // for each first position in the sequence
        for (size_t i = 0; i < it->size(); ++i) {
            string key1 = it->at(i);
            if (isValid(key1)) {

                //for each second position in the sequence
                for (size_t j = i; j < it->size(); ++j) {
                    string key2 = it->at(j);
                    if (isValid(key2)) {
                        result[i][j][alphabet_map.at(key1 + key2)] += 1;
                        result[i][j][alphabet_map.at("total")] += 1;
                    }
                }
            }
        }
        cout << "\b\b\b\b\b\b\b\b\b";
        ii++;
    }
    flush(cout);
    cout << "completed in " << ((double) clock() - start) / CLOCKS_PER_SEC
         << "s." << endl;

    return result;
}

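/*
 * Example (illustrative): for the three aligned sequences "AC", "AC" and
 * "AU", the map result[0][1] ends up holding { "AC" -> 2, "AU" -> 1,
 * "total" -> 3 } (with the letter pairs stored via their alphabet_map
 * indices), i.e. the position pair (0, 1) was observed three times, twice
 * as A/C and once as A/U.
 */
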
/**
 * Computes the vector of mean mutual information values mMI.
 *
 * mMI(pos1) = 1/m * \sum_{i \in positions} MI(pos1, i), where m is the
 * length of the sequences.
 *
 * Note: the implementation divides by the number of sequences, not by the
 * alignment length m as stated above. Because the same constant is used
 * here and in computeOveralMeanMutualInformation, it cancels in the MIp
 * correction term (mMI(pos1) * mMI(pos2)) / mMI, so MIp is unaffected.
 *
 * @returns the vector of mean mutual information values mMI.
 */
VectorXd Cma::computeMeanMutualInformationVector() {

    return MutualInformation.rowwise().sum() / num_of_seqs;

}

/**
 * Computes the overall mean mutual information value.
 *
 * mMI = 2/m^2 * \sum_{i \in positions} \sum_{j \in positions} MI(i, j),
 * where again m is the length of the sequences (see the note on the divisor
 * in computeMeanMutualInformationVector above).
 *
 * @returns the overall mean mutual information value.
 */
double Cma::computeOveralMeanMutualInformation() {

    return 2. * MutualInformation.sum() / num_of_seqs / num_of_seqs;

}