source: branches/profile/RNACMA/Cma.cxx

Last change on this file was 8940, checked in by westram, 12 years ago
  • added MBI copyright
  • fixed include wrappers
  • fixed cppcheck warning
File size: 14.4 KB
/*
 * Cma.cxx
 *
 *  Created on: Dec 16, 2009
 *      Author: Breno Faria
 *
 *  Institute of Microbiology (Technical University Munich)
 *  http://www.arb-home.de/
 */

#include <iostream>
#include <iomanip>
#include <time.h>

#include "Cma.h"

//>--------------------------Global definitions--------------------------------

// Some local function prototypes.
void unifyCluster(int cluster1, int cluster2, VectorXi & result);

//>-----------------------local helper functions-------------------------------

/**
 * Helper function for Cma::computeClusters. Compares MITuples by
 * mutual information value.
 *
 * @param first: the first MITuple
 *
 * @param second: the second MITuple
 *
 * @returns whether the first element should be placed before the second.
 */
static bool compareMITuples(const MITuple& first, const MITuple& second) {
    return first.MI > second.MI;
}

/**
 * Helper function for Cma::computeClusters. Unites two clusters.
 *
 * @param cluster1: the index of the first cluster.
 *
 * @param cluster2: the index of the second cluster.
 *
 * @param result: the cluster assignment vector, updated in place.
 */
void unifyCluster(int cluster1, int cluster2, VectorXi & result) {
    size_t size = result.size();
    for (size_t i = 0; i < size; ++i) {
        if (result[i] == cluster2) {
            result[i] = cluster1;
        }
    }
}

//>---------------------------Public methods-----------------------------------

/**
 * Constructor.
 *
 * @param alph: the vector containing the alphabet. Note that letters of the
 *              alphabet may be longer than one character; this is not used
 *              so far, but limiting the length to 1 seems wrong...
 *
 * @param seq_num: the number of sequences.
 */
Cma::Cma(vector<string> & alph, int seq_num) {
    initAlphabet(alph);
    num_of_seqs = seq_num;
}

/**
 * Destructor.
 */
Cma::~Cma() {
}

/**
 * Computes the mutual information of position pairs in a set of
 * sequences.
 * Mutual information can be defined as:
 *
 * MI(pos1, pos2) = H(pos1) + H(pos2) - H(pos1, pos2),
 *
 * i.e. the sum of entropies at two positions minus the joint entropy
 * of these positions.
 *
 * @param seq: the multiple sequence alignment.
 *
 * @returns a matrix with the MI values.
 */
MatrixXd Cma::computeMutualInformation(VecVecType & seq) {

    JointEntropy = computeJointEntropy(seq);

    // mirror the upper triangle into the lower one; the diagonal would be
    // added twice, so subtract it once:
    JointEntropy = JointEntropy + JointEntropy.adjoint() - JointEntropy.diagonal().asDiagonal();

    entropy = JointEntropy.diagonal();

    cout << "computing Mutual Information... ";
    flush(cout);
    clock_t start = clock();

    // Hx(i, j) = H(pos j) and Hy(i, j) = H(pos i), so (Hx + Hy)(i, j) sums
    // the two marginal entropies:
    VectorXd Ones = VectorXd::Ones(entropy.size());
    MatrixXd Hx = (entropy * Ones.transpose()).transpose();
    MatrixXd Hy = Hx.transpose();

    MutualInformation = Hx + Hy - JointEntropy;

    cout << "completed in " << ((double) clock() - start) / CLOCKS_PER_SEC
         << "s." << endl;

    return MutualInformation;
}
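
// ---------------------------------------------------------------------------
// A minimal sketch, purely illustrative and not part of Cma: it checks the
// identity used above, MI = H(pos1) + H(pos2) - H(pos1, pos2), on a toy case
// of two perfectly correlated binary columns. Each marginal entropy is 1 bit
// and the joint entropy is also 1 bit, so MI comes out as 1 bit.
// 'demoMIIdentity' is a hypothetical name, not a Cma or ARB function.
static double demoMIIdentity() {
    // two equally likely letters per column; the columns always agree
    double p[2] = { 0.5, 0.5 };
    double Hx = 0., Hxy = 0.;
    for (int k = 0; k < 2; ++k) {
        Hx  -= p[k] * log2(p[k]);  // marginal entropy: 1 bit
        Hxy -= p[k] * log2(p[k]);  // joint entropy: only (0,0) and (1,1) occur
    }
    return Hx + Hx - Hxy;          // MI = 1 + 1 - 1 = 1 bit
}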

/**
 * Computes the noiseless mutual information, or MIp (see classdoc), which is
 * defined as:
 *
 * MIp(pos1, pos2) = MI(pos1, pos2) - (mMI(pos1) * mMI(pos2)) / mMI,
 *
 * where MIp is the noiseless mutual information between positions 1 and 2,
 * and MI likewise denotes the plain mutual information. mMI(x) is the mean
 * mutual information of position x, which is defined as:
 *
 * mMI(pos1) = 1 / m * \sum_{i \in positions} MI(pos1, i), where m is the
 * length of the sequences.
 *
 * mMI is the overall mean mutual information, defined as:
 *
 * mMI = 2 / m * \sum_{i \in positions} \sum_{j \in positions} MI(i, j), where
 * again m is the length of the sequences.
 *
 * @param seq: the multiple sequence alignment.
 *
 * @returns the noiseless mutual information.
 */
MatrixXd Cma::computeMutualInformationP(VecVecType & seq) {

    if (MutualInformation.size() == 0) {
        computeMutualInformation(seq);
    }

    cout << "computing noiseless Mutual Information (MIp)... ";
    flush(cout);
    clock_t start = clock();
    /*
     * We must remove negative MI values (the negative values are artificially
     * generated, see comments in computeJointEntropy).
     */
    for (int i = 0; i < MutualInformation.rows(); ++i) {
        for (int j = 0; j < MutualInformation.cols(); ++j) {
            if (MutualInformation(i, j) < 0.) {
                MutualInformation(i, j) = 0.;
            }
        }
    }

    VectorXd mMIs = computeMeanMutualInformationVector();
    double mMI = computeOveralMeanMutualInformation();

    MutualInformationP = MutualInformation - ((mMIs * mMIs.transpose()) / mMI);

    cout << "completed in " << ((double) clock() - start) / CLOCKS_PER_SEC
         << "s." << endl;

    return MutualInformationP;
}
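
// ---------------------------------------------------------------------------
// A minimal sketch, hypothetical and not part of Cma: the correction applied
// above, written out for a single pair (i, j). The product of the two
// positions' mean MIs, divided by the overall mean MI, is subtracted from
// MI(i, j). Note that the normalisation constant (m here, following the
// classdoc) cancels between numerator and denominator, so any consistent
// constant yields the same MIp entry.
static double demoMIpEntry(const MatrixXd & MI, int i, int j) {
    double m    = double(MI.rows());
    double mMIi = MI.row(i).sum() / m;      // mean MI of position i
    double mMIj = MI.row(j).sum() / m;      // mean MI of position j
    double mMI  = 2. * MI.sum() / (m * m);  // overall mean MI
    return MI(i, j) - (mMIi * mMIj) / mMI;  // the MIp entry
}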

/**
 * Given an MI-matrix, computes the list of position pairs sorted by
 * descending MI value.
 *
 * @param mutualInformation: a matrix with MI (or MIp) measures.
 *
 * @returns the list of tuples of positions sorted by their MI value.
 */
list<MITuple> Cma::compute_mituples(MatrixXd mutualInformation) {
    size_t size = mutualInformation.cols();

    list<MITuple> mituples;

    // populate the mituples list with all entries of the upper triangle
    // of mutualInformation (the matrix is symmetric).
    for (size_t i = 0; i < size; ++i) {
        for (size_t j = i + 1; j < size; ++j) {
            MITuple curr;
            curr.MI = mutualInformation(i, j);
            curr.pos1 = i;
            curr.pos2 = j;
            mituples.push_back(curr);
        }
    }

    // sort mituples by MI-value, highest first (see compareMITuples).
    mituples.sort(compareMITuples);

    return mituples;
}

/**
 * Computes the clusters of positions that are most correlated, up to
 * a threshold.
 *
 * @param mituples: the list of position pairs sorted by descending MI value,
 *                  as generated by Cma::compute_mituples.
 *
 * @param size: the original size of the alignment.
 *
 * @param threshold: the mutual information threshold. If two positions have
 *                   MI < threshold, they won't be united in one cluster.
 *
 * @returns a vector with the cluster indices for each position.
 */
VectorXi Cma::computeClusters(list<MITuple> mituples, size_t size,
                              double threshold) {

    cout << "computing clusters of correlated positions... ";
    flush(cout);

    VectorXi result = VectorXi::Zero(size);

    // compute the clusters: positions still unclustered carry index 0,
    // clustered positions carry the (positive) index of their cluster.
    int cluster = 1;
    int rest = int(size);
    for (list<MITuple>::iterator it = mituples.begin(); it != mituples.end(); ++it) {
        double result1 = abs(result[it->pos1]);
        double result2 = abs(result[it->pos2]);
        if (it->MI <= threshold || rest == 0) {
            break;
        }
        if (result1 < epsilon and result2 < epsilon) {
            // neither position clustered yet: open a new cluster for both.
            result[it->pos1] = cluster;
            result[it->pos2] = cluster++;
            rest -= 2;
        }
        else if (result1 > epsilon and result2 < epsilon) {
            // pos1 already clustered: pos2 joins its cluster.
            result[it->pos2] = result[it->pos1];
            rest -= 1;
        }
        else if (result1 < epsilon and result2 > epsilon) {
            // pos2 already clustered: pos1 joins its cluster.
            result[it->pos1] = result[it->pos2];
            rest -= 1;
        }
        else if (result1 > epsilon and result2 > epsilon
                 and fabs(result1 - result2) > epsilon) {
            // both positions clustered, but in different clusters: merge them.
            // (fabs: the original condition 'result1 - result2 > epsilon'
            // silently skipped the merge whenever pos1's index was the
            // smaller one.)
            unifyCluster(result[it->pos1], result[it->pos2], result);
        }
    }

    clusters = result;

    cout << "done." << endl;

    return result;
}
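
// ---------------------------------------------------------------------------
// A minimal usage sketch, hypothetical and not part of the build: wiring the
// public steps above together. The threshold 0.25 is an arbitrary example
// value, and 'demoClusterPipeline' is an invented name for illustration only.
static VectorXi demoClusterPipeline(Cma & cma, VecVecType & alignment) {
    MatrixXd mip       = cma.computeMutualInformationP(alignment); // MI with background noise removed
    list<MITuple> mits = cma.compute_mituples(mip);                // pairs, sorted by MIp descending
    size_t len         = alignment[0].size();                      // number of alignment positions
    return cma.computeClusters(mits, len, 0.25);                   // cluster index per position
}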

//>-------------------------getters and setters--------------------------------

MatrixXd Cma::getEntropy() {
    return entropy;
}

MatrixXd Cma::getJointEntropy() {
    return JointEntropy;
}

MatrixXd Cma::getMI() {
    return MutualInformation;
}

MatrixXd Cma::getMIp() {
    return MutualInformationP;
}

VectorXi Cma::getClusters() {
    return clusters;
}

//>----------------------------private methods---------------------------------

/**
 * Defines the alphabet: builds alphabet_map, which assigns a unique index
 * to every single letter, then to every ordered pair of letters, and
 * finally to the special key "total".
 */
void Cma::initAlphabet(vector<string> alph) {

    alphabet = vector<string> (alph);

    int i = 0;
    for (vector<string>::iterator it = alph.begin(); it != alph.end(); ++it) {
        alphabet_map[*it] = i;
        i++;
    }
    for (vector<string>::iterator it = alph.begin(); it != alph.end(); ++it) {
        for (vector<string>::iterator it2 = alph.begin(); it2 != alph.end(); ++it2) {
            alphabet_map[*it + *it2] = i;
            i++;
        }
    }
    alphabet_map["total"] = i;
}
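
// ---------------------------------------------------------------------------
// A minimal sketch, hypothetical and not part of Cma: the index scheme that
// initAlphabet produces for the RNA alphabet {A, C, G, U}. Single letters get
// indices 0..3, the ordered pairs "AA".."UU" get 4..19 (row by row), and
// "total" gets 20. E.g. "GC" maps to 4 + 2*4 + 1 = 13.
static void demoAlphabetIndices() {
    vector<string> alph;
    alph.push_back("A");
    alph.push_back("C");
    alph.push_back("G");
    alph.push_back("U");
    Cma cma(alph, 0);  // 0 sequences: only the alphabet map is built here
}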

/**
 * Computes an approximation of the joint entropy for each pair of positions.
 *
 * The joint entropy of two positions in a set of aligned sequences
 * is defined as:
 * - ( p(A,A) * log2(p(A,A)) + p(A,C) * log2(p(A,C)) + ... +
 *     p(C,A) * log2(p(C,A)) + p(C,C) * log2(p(C,C)) + ... +
 *     ... +
 *     p(T,A) * log2(p(T,A)) + ... + p(T,T) * log2(p(T,T)) ),
 * where p(x,y) is the probability of observing base x at position 1 and
 * base y at position 2.
 *
 * @param seqs: the aligned sequences
 *
 * @returns a square matrix with the joint entropy values for
 *          each pair of positions.
 */
MatrixXd Cma::computeJointEntropy(const VecVecType & seqs) {

    MapMatrixType hist = buildJointHistogram(seqs);
    return Cma::computeJointEntropy(hist);
}

/**
 * Computes an approximation of the joint entropy for each pair of positions.
 * The measure also includes a heuristic to penalise mismatches in occurrences
 * (see the documentation of this project for further explanations).
 *
 * @param hist: the histogram of labels for each position in the sequence.
 *
 * @returns a square matrix with the joint entropy values for
 *          each pair of positions.
 */
MatrixXd Cma::computeJointEntropy(MapMatrixType & hist) {

    cout << "computing joint entropy (this may take a while)... ";
    flush(cout);
    clock_t start = clock();

    int hist_size = int(hist.size());
    MatrixXd result(hist_size, hist_size);
    result.setZero(hist_size, hist_size);
    for (unsigned int i = 0; i < (unsigned int) hist_size; ++i) {

        // progress "bar"
        if (hist_size > 30 and i % (hist_size / 30) == 0) {
            cout << "(" << setw(6) << setiosflags(ios::fixed)
                 << setprecision(2) << i / float(hist_size) * 100 << "%)";
            flush(cout);
        }

        for (unsigned int j = i; j < hist[i].size(); ++j) {
            int total_i_j = hist[i][j][alphabet_map.at("total")];
            int total_i_i = hist[i][i][alphabet_map.at("total")];
            int total_j_j = hist[j][j][alphabet_map.at("total")];
            // sequences with a valid letter at i or j, but not at both:
            int mismatches = total_i_i + total_j_j - 2 * total_i_j;
            if (total_i_j != 0) {
                double result_i_j = 0.;
                for (map<int, int>::iterator it = hist[i][j].begin(); it
                         != hist[i][j].end(); ++it) {
                    // smoothed joint count of this letter pair:
                    double pair = double(it->second) + Cma::epsilon;
                    result_i_j += (pair / total_i_j) * log2(pair / total_i_j);
                }

                result(i, j) = -result_i_j + double(mismatches) / (total_i_i
                                                                   + total_j_j) * Cma::penalty;

            } else {
                /**
                 * We only fall into this case if we have two positions which
                 * never occur simultaneously. In this case we cannot say anything about
                 * the MI.
                 * To guarantee that this joint entropy value will not come in our way in
                 * the MI calculation we set it artificially to 5.0, which is 1 greater than
                 * max(H(X) + H(Y)).
                 */
                result(i, j) = 5.;
            }
        }
        // progress "bar"
        if (hist_size > 30 and i % (hist_size / 30) == 0) {
            cout << "\b\b\b\b\b\b\b\b\b";
        }
    }
    flush(cout);
    cout << "completed in " << ((double) clock() - start) / CLOCKS_PER_SEC
         << "s." << endl;
    return result;
}
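
// ---------------------------------------------------------------------------
// A minimal sketch, hypothetical and not part of Cma: the entropy term of the
// loop above for one pair of positions, with the smoothing constant and the
// mismatch penalty passed in explicitly (they correspond to Cma::epsilon and
// Cma::penalty used above).
static double demoPairEntropy(const map<int, int> & pairCounts, int total_i_j,
                              int total_i_i, int total_j_j,
                              double eps, double penalty) {
    double h = 0.;
    for (map<int, int>::const_iterator it = pairCounts.begin();
         it != pairCounts.end(); ++it) {
        double pair = double(it->second) + eps;            // smoothed pair count
        h -= (pair / total_i_j) * log2(pair / total_i_j);  // Shannon term
    }
    int mismatches = total_i_i + total_j_j - 2 * total_i_j;
    return h + double(mismatches) / (total_i_i + total_j_j) * penalty;
}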

/**
 * Checks whether the key pertains to the alphabet.
 *
 * @param key: the base to be checked.
 *
 * @returns true if the key is part of the alphabet.
 */
bool Cma::isValid(string & key) {

    map<string, int>::iterator it = alphabet_map.find(key);
    return (it != alphabet_map.end());
}

/**
 * This method computes a matrix of maps, one for each pair of positions of the
 * aligned sequences.
 * Each map has an entry for each pair of occurring letters to count the joint
 * occurrences of these letters.
 *
 * @param alignedSequences: an array of string vectors, containing the sequences.
 *
 * @returns a matrix of maps (see method description)
 */
MapMatrixType Cma::buildJointHistogram(const VecVecType & alignedSequences) {

    cout << "building histogram (this may take a while)... ";
    flush(cout);
    clock_t start = clock();

    if (alignedSequences.empty()) {
        throw "Error: Cma::buildJointHistogram: parameter empty.";
    }

    // here we define one dimension of the matrix as being the same as the
    // length of the sequences.
    MapMatrixType result(alignedSequences[0].size(), MapVecType(
                             alignedSequences[0].size()));

    int ii = 0;
    // for each sequence:
    for (VecVecType::const_iterator it = alignedSequences.begin(); it
             != alignedSequences.end(); ++it) {

        // progress "bar"
        cout << "(" << setw(6) << setiosflags(ios::fixed) << setprecision(2)
             << ii / float(num_of_seqs) * 100 << "%)";
        flush(cout);

        // for each first position in the sequence
        for (size_t i = 0; i < it->size(); ++i) {
            string key1 = it->at(i);
            if (isValid(key1)) {

                // for each second position in the sequence
                for (size_t j = i; j < it->size(); ++j) {
                    string key2 = it->at(j);
                    if (isValid(key2)) {
                        // count this letter pair and bump the pair total:
                        result[i][j][alphabet_map.at(key1 + key2)] += 1;
                        result[i][j][alphabet_map.at("total")] += 1;
                    }
                }
            }
        }
        cout << "\b\b\b\b\b\b\b\b\b";
        ii++;
    }
    flush(cout);
    cout << "completed in " << ((double) clock() - start) / CLOCKS_PER_SEC
         << "s." << endl;

    return result;
}
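
// ---------------------------------------------------------------------------
// A minimal sketch, hypothetical and not part of Cma, of the bookkeeping the
// histogram enables: with 10 sequences valid at position i, 7 valid at
// position j and 7 valid at both, the mismatch count consumed by
// computeJointEntropy is total_i_i + total_j_j - 2 * total_i_j = 10 + 7 - 14 = 3.
static int demoMismatchCount() {
    int total_i_i = 10;  // sequences with a valid letter at position i
    int total_j_j = 7;   // sequences with a valid letter at position j
    int total_i_j = 7;   // sequences with valid letters at both i and j
    return total_i_i + total_j_j - 2 * total_i_j;  // = 3 mismatching sequences
}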

/**
 * Computes the vector of mean mutual information values mMI.
 *
 * mMI(pos1) = 1 / m * \sum_{i \in positions} MI(pos1, i), where m is the
 * length of the sequences.
 *
 * Note: the implementation normalises by num_of_seqs (and by num_of_seqs
 * squared below) rather than by the sequence length m used in the formulas.
 * The normalisation constant cancels out of the MIp correction term, so MIp
 * is unaffected (see the sketch at the end of this file).
 *
 * @returns the vector of mean mutual information values mMI.
 */
VectorXd Cma::computeMeanMutualInformationVector() {
    return MutualInformation.rowwise().sum() / num_of_seqs;
}

/**
 * Computes the overall mean mutual information value.
 *
 * mMI = 2 / m * \sum_{i \in positions} \sum_{j \in positions} MI(i, j), where
 * again m is the length of the sequences.
 *
 * @returns the overall mean mutual information value.
 */
double Cma::computeOveralMeanMutualInformation() {
    return 2. * MutualInformation.sum() / num_of_seqs / num_of_seqs;
}
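
// ---------------------------------------------------------------------------
// A minimal sketch, hypothetical and not part of Cma, backing the note in
// computeMeanMutualInformationVector: the MIp correction term satisfies
// (S_i/c * S_j/c) / (2*T/c^2) = S_i*S_j / (2*T) for any constant c, so the
// choice of normalisation constant (num_of_seqs vs. sequence length) does
// not change MIp.
static bool demoNormalisationCancels(const MatrixXd & MI, int i, int j) {
    double Si = MI.row(i).sum(), Sj = MI.row(j).sum(), T = MI.sum();
    double c1 = 42., c2 = 1000.;  // two arbitrary normalisation constants
    double t1 = ((Si / c1) * (Sj / c1)) / (2. * T / (c1 * c1));
    double t2 = ((Si / c2) * (Sj / c2)) / (2. * T / (c2 * c2));
    return fabs(t1 - t2) < 1e-9;  // identical up to rounding
}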