source: tags/svn.1.5.4/RNACMA/Cma.cxx

Last change on this file was 8309, checked in by westram, 14 years ago
  • moved much code into static scope

(partly reverted by [8310])

File size: 14.3 KB
/*
 * Cma.cxx
 *
 *  Created on: Dec 16, 2009
 *      Author: Breno Faria
 */

#include <iostream>
#include <iomanip>
#include <cmath>    // for log2() and fabs()
#include <time.h>

#include "Cma.h"

//>--------------------------Global definitions--------------------------------

// Some local function prototypes.
void unifyCluster(int cluster1, int cluster2, VectorXi & result);

//>-----------------------local helper functions-------------------------------

/**
 * Helper function for Cma::computeClusters. Compares MITuples by
 * mutual information value.
 *
 * @param first: the first MITuple
 *
 * @param second: the second MITuple
 *
 * @returns whether the first element should be placed before the second.
 */
static bool compareMITuples(const MITuple& first, const MITuple& second) {
    return first.MI > second.MI;
}

/**
 * Helper function for Cma::computeClusters. Unites two clusters: every
 * position labelled cluster2 in result is relabelled to cluster1.
 *
 * @param cluster1: the index of the first cluster.
 *
 * @param cluster2: the index of the second cluster.
 *
 * @param result: the vector of cluster assignments, modified in place.
 */
void unifyCluster(int cluster1, int cluster2, VectorXi & result) {
    size_t size = result.size();
    for (size_t i = 0; i < size; ++i) {
        if (result[i] == cluster2) {
            result[i] = cluster1;
        }
    }
}

//>---------------------------Public methods-----------------------------------

/**
 * Constructor.
 *
 * @param alph: the vector containing the alphabet. Notice that letters of the
 *              alphabet might be longer than one character. This is not used
 *              so far, but limiting the length to 1 seems wrong...
 *
 * @param seq_num: the number of sequences.
 */
Cma::Cma(vector<string> & alph, int seq_num) {

    initAlphabet(alph);
    num_of_seqs = seq_num;

}

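/*
 * Illustrative usage sketch (not part of the original file; the variable
 * names and the threshold value 0.25 are assumptions). The public API below
 * is typically driven in this order:
 *
 *     vector<string> bases;                 // single-character RNA alphabet
 *     bases.push_back("A");
 *     bases.push_back("C");
 *     bases.push_back("G");
 *     bases.push_back("U");
 *
 *     VecVecType msa;                       // one vector<string> per sequence
 *     // ... fill msa from the alignment ...
 *
 *     Cma cma(bases, msa.size());
 *     MatrixXd mip       = cma.computeMutualInformationP(msa);
 *     list<MITuple> best = cma.compute_mituples(mip);
 *     VectorXi clusters  = cma.computeClusters(best, mip.cols(), 0.25);
 */
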
/**
 * Destructor.
 */
Cma::~Cma() {
}

/**
 * Computes the mutual information of position pairs in a set of
 * sequences.
 * Mutual information can be defined as:
 *
 * MI(pos1, pos2) = H(pos1) + H(pos2) - H(pos1, pos2),
 *
 * i.e. the sum of entropies at two positions minus the joint entropy
 * of these positions.
 *
 * @param seq: the multiple sequence alignment.
 *
 * @returns a matrix with the MI values.
 */
MatrixXd Cma::computeMutualInformation(VecVecType & seq) {

    JointEntropy = computeJointEntropy(seq);

    // computeJointEntropy fills only the upper triangle, so mirror it onto the
    // lower one; the diagonal would be counted twice, hence the subtraction.
    // eval() materialises the transposed copy first, avoiding Eigen's
    // transpose-aliasing pitfall when JointEntropy is read and written in the
    // same assignment.
    JointEntropy = JointEntropy + JointEntropy.adjoint().eval()
                   - JointEntropy.diagonal().asDiagonal();

    entropy = JointEntropy.diagonal();

    cout << "computing Mutual Information... ";
    flush(cout);
    clock_t start = clock();

    // Build Hx(i, j) = H(j) and Hy(i, j) = H(i) via an outer product with a
    // vector of ones, so MI = Hx + Hy - H(i, j) is computed for all pairs at once.
    VectorXd Ones = VectorXd::Ones(entropy.size());
    MatrixXd Hx = (entropy * Ones.transpose()).transpose();
    MatrixXd Hy = Hx.transpose();

    MutualInformation = Hx + Hy - JointEntropy;

    cout << "completed in " << ((double) clock() - start) / CLOCKS_PER_SEC
         << "s." << endl;

    return MutualInformation;

}

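/*
 * Worked example (illustrative, ignoring the epsilon smoothing and the
 * mismatch penalty used below): for two fully correlated binary columns,
 * e.g. column 1 = (A,A,C,C) and column 2 = (U,U,G,G), H(pos1) = H(pos2) =
 * 1 bit and H(pos1, pos2) = 1 bit, so MI = 1 + 1 - 1 = 1 bit. For two
 * independent uniform binary columns the joint entropy is 2 bits and
 * MI = 1 + 1 - 2 = 0.
 */
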
/**
 * Computes the noiseless mutual information, or MIp (see classdoc), which is
 * defined as:
 *
 * MIp(pos1, pos2) = MI(pos1, pos2) - (mMI(pos1) * mMI(pos2)) / mMI,
 *
 * where MIp(pos1, pos2) is the noiseless mutual information between
 * positions 1 and 2, and MI(pos1, pos2) the raw one. mMI(x) is the mean
 * mutual information of position x, defined as:
 *
 * mMI(pos1) = 1/m * \sum_{i \in positions} MI(pos1, i), where m is the
 * length of the sequences.
 *
 * mMI is the overall mean mutual information, defined as:
 *
 * mMI = 2/m^2 * \sum_{i \in positions} \sum_{j \in positions} MI(i, j),
 * where again m is the length of the sequences.
 *
 * @param seq: the multiple sequence alignment.
 *
 * @returns the noiseless mutual information.
 */
MatrixXd Cma::computeMutualInformationP(VecVecType & seq) {

    if (MutualInformation.size() == 0) {
        computeMutualInformation(seq);
    }

    cout << "computing noiseless Mutual Information (MIp)... ";
    flush(cout);
    clock_t start = clock();
    /*
     * We must remove negative MI values (the negative values are artificially
     * generated, see comments in computeJointEntropy).
     */
    for (int i = 0; i < MutualInformation.rows(); ++i) {
        for (int j = 0; j < MutualInformation.cols(); ++j) {
            if (MutualInformation(i, j) < 0.) {
                MutualInformation(i, j) = 0.;
            }
        }
    }

    VectorXd mMIs = computeMeanMutualInformationVector();
    double mMI = computeOveralMeanMutualInformation();

    // Subtract the background term mMI(i) * mMI(j) / mMI from every entry
    // at once via the outer product of the mMI vector.
    MutualInformationP = MutualInformation - ((mMIs * mMIs.transpose()) / mMI);

    cout << "completed in " << ((double) clock() - start) / CLOCKS_PER_SEC
         << "s." << endl;

    return MutualInformationP;

}

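/*
 * Illustrative numbers (assumed, not from the original file): if
 * MI(3, 17) = 0.9, mMI(3) = 0.4, mMI(17) = 0.5 and the overall mean
 * mMI = 0.25, then MIp(3, 17) = 0.9 - (0.4 * 0.5) / 0.25 = 0.1, i.e. most
 * of the raw signal is explained by the general background correlation of
 * the two columns and is subtracted away.
 */
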
/**
 * Given an MI-matrix, computes the sorted list of the highest MI values.
 *
 * @param mutualInformation: a matrix with MI (or MIp) measures.
 *
 * @returns the list of tuples of positions sorted by their MI value.
 */
list<MITuple> Cma::compute_mituples(MatrixXd mutualInformation) {
    size_t size = mutualInformation.cols();

    list<MITuple> mituples;

    // populate the mituples list with all entries in the upper triangle
    // of mutualInformation (the matrix is symmetric).
    for (size_t i = 0; i < size; ++i) {
        for (size_t j = i + 1; j < size; ++j) {
            MITuple curr;
            curr.MI = mutualInformation(i, j);
            curr.pos1 = i;
            curr.pos2 = j;
            mituples.push_back(curr);
        }
    }

    // sort mituples by MI-value, highest first.
    mituples.sort(compareMITuples);

    return mituples;
}

/**
 * Computes the clusters of positions that are most correlated, down to
 * the given threshold.
 *
 * @param mituples: the list of position pairs sorted by descending MI value,
 *                  as generated by Cma::compute_mituples.
 *
 * @param size: the original size of the alignment.
 *
 * @param threshold: the mutual information threshold. If two positions have
 *                   mi < threshold, they won't be united in one cluster.
 *
 * @returns a vector with the cluster indices for each position.
 */
VectorXi Cma::computeClusters(list<MITuple> mituples, size_t size,
                              double threshold) {

    cout << "computing clusters of correlated positions... ";
    flush(cout);

    VectorXi result = VectorXi::Zero(size);

    // compute the clusters: walk the pairs from highest to lowest MI and stop
    // at the threshold (or once every position has been assigned).
    int cluster = 1;
    int rest = int(size);
    for (list<MITuple>::iterator it = mituples.begin(); it != mituples.end(); ++it) {
        if (it->MI <= threshold || rest == 0) {
            break;
        }
        double result1 = abs(result[it->pos1]);
        double result2 = abs(result[it->pos2]);
        if (result1 < epsilon and result2 < epsilon) {
            // neither position is clustered yet: open a new cluster.
            result[it->pos1] = cluster;
            result[it->pos2] = cluster++;
            rest -= 2;
        }
        else if (result1 > epsilon and result2 < epsilon) {
            // pos1 is already clustered: pull pos2 into its cluster.
            result[it->pos2] = result[it->pos1];
            rest -= 1;
        }
        else if (result1 < epsilon and result2 > epsilon) {
            result[it->pos1] = result[it->pos2];
            rest -= 1;
        }
        else if (result1 > epsilon and result2 > epsilon
                 and fabs(result1 - result2) > epsilon) {
            // both positions sit in different clusters: merge them. fabs()
            // makes the merge independent of which cluster index is larger
            // (the original compared the plain difference, so merges were
            // missed whenever result2 > result1).
            unifyCluster(result[it->pos1], result[it->pos2], result);
        }
    }

    clusters = result;

    cout << "done." << endl;

    return result;
}

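/*
 * Illustrative result (assumed values): for a 6-column alignment a return
 * value of (1, 1, 0, 2, 2, 1) means columns 0, 1 and 5 form cluster 1,
 * columns 3 and 4 form cluster 2, and column 2 (entry 0) was never assigned
 * because none of its pairs reached the threshold.
 */
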
//>-------------------------getters and setters--------------------------------

MatrixXd Cma::getEntropy() {

    return entropy;

}

MatrixXd Cma::getJointEntropy() {

    return JointEntropy;

}

MatrixXd Cma::getMI() {

    return MutualInformation;

}

MatrixXd Cma::getMIp() {

    return MutualInformationP;

}

VectorXi Cma::getClusters() {

    return clusters;

}

//>----------------------------private methods---------------------------------

/**
 * Here we define the alphabet: every single letter, then every ordered pair
 * of letters, and finally the special key "total" get consecutive indices
 * in alphabet_map.
 */
void Cma::initAlphabet(vector<string> alph) {

    alphabet = vector<string> (alph);

    int i = 0;
    for (vector<string>::iterator it = alph.begin(); it != alph.end(); ++it) {
        alphabet_map[*it] = i;
        i++;
    }
    for (vector<string>::iterator it = alph.begin(); it != alph.end(); ++it) {
        for (vector<string>::iterator it2 = alph.begin(); it2 != alph.end(); ++it2) {
            alphabet_map[*it + *it2] = i;
            i++;
        }
    }
    alphabet_map["total"] = i;
}

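/*
 * Example layout (assuming the RNA alphabet "A", "C", "G", "U"):
 * "A" -> 0, "C" -> 1, "G" -> 2, "U" -> 3, then the 16 ordered pairs
 * "AA" -> 4, "AC" -> 5, ..., "UU" -> 19, and finally "total" -> 20.
 */
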
/**
 * Computes an approximation of the joint entropy for each pair of positions.
 *
 * The joint entropy of two positions in a set of aligned sequences
 * is defined as:
 * - ( p(A,A) * log2(p(A,A)) + p(A,C) * log2(p(A,C)) + ... +
 *     p(C,A) * log2(p(C,A)) + p(C,C) * log2(p(C,C)) + ... +
 *     ... +
 *     p(T,A) * log2(p(T,A)) + ... + p(T,T) * log2(p(T,T)) ),
 * where p(x,y) is the probability of observing base x at position 1 and
 * base y at position 2.
 *
 * @param seqs: the aligned sequences
 *
 * @returns a square matrix with the joint entropy values for
 *          each pair of positions.
 */
MatrixXd Cma::computeJointEntropy(const VecVecType & seqs) {

    MapMatrixType hist = buildJointHistogram(seqs);
    return Cma::computeJointEntropy(hist);

}

/**
 * Computes an approximation of the joint entropy for each pair of positions.
 * The measure also includes a heuristic to penalise mismatches in occurrences
 * (see the documentation of this project for further explanations).
 *
 * @param hist: the histogram of labels for each position in the sequence.
 *
 * @returns a square matrix with the joint entropy values for
 *          each pair of positions.
 */
MatrixXd Cma::computeJointEntropy(MapMatrixType & hist) {

    cout << "computing joint entropy (this may take a while)... ";
    flush(cout);
    clock_t start = clock();

    int hist_size = int(hist.size());
    MatrixXd result(hist_size, hist_size);
    result.setZero(hist_size, hist_size);
    for (unsigned int i = 0; i < (unsigned int) hist_size; ++i) {

        //progress "bar"
        if (hist_size > 30 and i % (hist_size / 30) == 0) {
            cout << "(" << setw(6) << setiosflags(ios::fixed)
                 << setprecision(2) << i / float(hist_size) * 100 << "%)";
            flush(cout);
        }

        for (unsigned int j = i; j < hist[i].size(); ++j) {
            double result_i_j = 0.;
            int total_i_j = hist[i][j][alphabet_map.at("total")];
            int total_i_i = hist[i][i][alphabet_map.at("total")];
            int total_j_j = hist[j][j][alphabet_map.at("total")];
            // sequences covering position i or j, but not both:
            int mismatches = total_i_i + total_j_j - 2 * total_i_j;
            if (total_i_j != 0) {
                // plain joint entropy over the observed pair counts ...
                for (map<int, int>::iterator it = hist[i][j].begin(); it
                         != hist[i][j].end(); ++it) {
                    double pair = double(it->second) + Cma::epsilon;
                    result_i_j += (pair / total_i_j) * log2(pair / total_i_j);
                }

                // ... plus the mismatch penalty heuristic mentioned above.
                result(i, j) = -result_i_j
                               + double(mismatches) / (total_i_i + total_j_j) * Cma::penalty;

            } else {
                /**
                 * We will only fall into this case if we have two positions which
                 * never occur simultaneously. In this case we cannot say anything about
                 * the MI.
                 * To guarantee that this joint entropy value will not come in our way in
                 * the MI calculation we set it artificially to 5.0, which is 1 greater than
                 * max(H(X) + H(Y)).
                 */
                result(i, j) = 5.;
            }

        }
        //progress "bar"
        if (hist_size > 30 and i % (hist_size / 30) == 0) {
            cout << "\b\b\b\b\b\b\b\b\b";
        }
    }
    flush(cout);
    cout << "completed in " << ((double) clock() - start) / CLOCKS_PER_SEC
         << "s." << endl;
    return result;

}

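/*
 * Worked example (illustrative, ignoring the epsilon smoothing): if two
 * positions co-occur in 4 sequences as the pairs AU, AU, CG, CG, then
 * p(A,U) = p(C,G) = 0.5 and the plain joint entropy is
 * -(0.5*log2(0.5) + 0.5*log2(0.5)) = 1 bit; since every sequence covers
 * both positions, mismatches = 0 and no penalty is added.
 */
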
/**
 * Checks whether the key is valid (if it pertains to the alphabet).
 *
 * @param key: the base to be checked.
 *
 * @return whether the key is valid (if it pertains to the alphabet).
 */
bool Cma::isValid(string & key) {

    map<string, int>::iterator it = alphabet_map.find(key);
    return (it != alphabet_map.end());

}

/**
 * This method computes a matrix of maps, one for each pair of positions of the
 * aligned sequences.
 * Each map has an entry for each pair of occurring letters to count the joint
 * occurrences of these letters.
 *
 * @param alignedSequences: a vector of string vectors, one per sequence.
 *
 * @returns a matrix of maps (see method description)
 */
MapMatrixType Cma::buildJointHistogram(const VecVecType & alignedSequences) {

    cout << "building histogram (this may take a while)... ";
    flush(cout);
    clock_t start = clock();

    if (alignedSequences.empty()) {
        throw "Error: Cma::buildJointHistogram: parameter empty.";
    }

    // The matrix is square; both dimensions equal the length of the
    // (aligned, hence equally long) sequences.
    MapMatrixType result(alignedSequences[0].size(), MapVecType(
                             alignedSequences[0].size()));

    int ii = 0;
    // for each sequence:
    for (VecVecType::const_iterator it = alignedSequences.begin(); it
             != alignedSequences.end(); ++it) {

        //progress "bar"
        cout << "(" << setw(6) << setiosflags(ios::fixed) << setprecision(2)
             << ii / float(num_of_seqs) * 100 << "%)";
        flush(cout);

        // for each first position in the sequence
        for (size_t i = 0; i < it->size(); ++i) {
            string key1 = it->at(i);
            if (isValid(key1)) {

                //for each second position in the sequence
                for (size_t j = i; j < it->size(); ++j) {
                    string key2 = it->at(j);
                    if (isValid(key2)) {
                        result[i][j][alphabet_map.at(key1 + key2)] += 1;
                        result[i][j][alphabet_map.at("total")] += 1;
                    }
                }
            }
        }
        cout << "\b\b\b\b\b\b\b\b\b";
        ii++;
    }
    flush(cout);
    cout << "completed in " << ((double) clock() - start) / CLOCKS_PER_SEC
         << "s." << endl;

    return result;
}

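/*
 * Example (illustrative): for the three aligned sequences "AC", "AC" and
 * "AU", the map result[0][1] ends up holding { "AC" -> 2, "AU" -> 1,
 * "total" -> 3 } (with the letter pairs stored via their alphabet_map
 * indices), i.e. the position pair (0, 1) was observed three times, twice
 * as A/C and once as A/U.
 */
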
/**
 * Computes the vector of mean mutual information values mMI.
 *
 * mMI(pos1) = 1/m * \sum_{i \in positions} MI(pos1, i), where m is the
 * length of the sequences.
 *
 * Note: the implementation divides by the number of sequences, not by the
 * alignment length m as stated above. Because the same constant is used
 * here and in computeOveralMeanMutualInformation, it cancels in the MIp
 * correction term (mMI(pos1) * mMI(pos2)) / mMI, so MIp is unaffected.
 *
 * @returns the vector of mean mutual information values mMI.
 */
VectorXd Cma::computeMeanMutualInformationVector() {

    return MutualInformation.rowwise().sum() / num_of_seqs;

}

/**
 * Computes the overall mean mutual information value.
 *
 * mMI = 2/m^2 * \sum_{i \in positions} \sum_{j \in positions} MI(i, j),
 * where again m is the length of the sequences (see the note on the divisor
 * in computeMeanMutualInformationVector above).
 *
 * @returns the overall mean mutual information value.
 */
double Cma::computeOveralMeanMutualInformation() {

    return 2. * MutualInformation.sum() / num_of_seqs / num_of_seqs;

}