source: branches/help/RNACMA/Cma.cxx

Last change on this file was 18730, checked in by westram, 3 years ago
  • remove trailing whitespace from c source.
File size: 14.4 KB
Line 
1/*
2 * Cma.cpp
3 *
4 *  Created on: Dec 16, 2009
5 *      Author: Breno Faria
6 *
7 *  Institute of Microbiology (Technical University Munich)
8 *  http://www.arb-home.de/
9 */
10
11#include <iostream>
12#include <iomanip>
13#include <time.h>
14
15#include "Cma.h"
16
17//>--------------------------Global definitions--------------------------------
18
19// Some local function prototypes.
20void unifyCluster(int cluster1, int cluster2, VectorXi & result);
21
22//>-----------------------local helper functions-------------------------------
23
24/**
25 * Helper function for Cma::computeClusters. Compares MITuples by
26 * mutual information value.
27 *
28 * @param first: the first MITuple
29 *
30 * @param second: the second MITuple
31 *
32 * @returns whether the first element should be placed before the second.
33 */
34static bool compareMITuples(MITuple first, MITuple second) {
35    return first.MI > second.MI;
36}
37
38/**
39 * Helper function for Cma::computeClusters. Unites two clusters.
40 *
41 * @param cluster1: the index of the first cluster.
42 *
43 * @param cluster2: the index of the second cluster.
44 */
45void unifyCluster(int cluster1, int cluster2, VectorXi & result) {
46    size_t size = result.size();
47    for (size_t i = 0; i < size; ++i) {
48        if (result[i] == cluster2) {
49            result[i] = cluster1;
50        }
51    }
52}
53
54//>---------------------------Public methods-----------------------------------
55
56/**
57 * Constructor.
58 *
59 * @param seq_num: the number of sequences.
60 *
61 * @param alph: the vector containing the alphabet. Notice that letters of the
62 *              alphabet might be longer than one character. This is not used
63 *              so far, but limiting the length to 1 seems wrong...
64 */
65Cma::Cma(vector<string> & alph, int seq_num) {
66
67    initAlphabet(alph);
68    num_of_seqs = seq_num;
69
70}
71
72/**
73 * Destructor.
74 */
75Cma::~Cma() {
76}
77
78/**
79 * Computes the mutual information of position pairs in a set of
80 * sequences.
81 * Mutual information can be defined as:
82 *
83 * MI(pos1, pos2) = H(pos1) + H(pos2) - H(pos1, pos2),
84 *
85 * i.e. the sum of entropies at two positions minus the joint entropy
86 * of these positions.
87 *
88 * @param seq: the multiple sequence alignment.
89 *
90 * @returns a matrix with the MI values.
91 */
92MatrixXd Cma::computeMutualInformation(VecVecType & seq) {
93
94    JointEntropy = computeJointEntropy(seq);
95
96    JointEntropy = JointEntropy + JointEntropy.adjoint() - JointEntropy.diagonal().asDiagonal();
97
98    entropy = JointEntropy.diagonal();
99
100    cout << "computing Mutual Information... ";
101    flush(cout);
102    clock_t start = clock();
103
104    VectorXd Ones = VectorXd::Ones(entropy.size());
105    MatrixXd Hx = (entropy * Ones.transpose()).transpose();
106    MatrixXd Hy = Hx.transpose();
107
108    MutualInformation = Hx + Hy - JointEntropy;
109
110    cout << "completed in " << ((double) clock() - start) / CLOCKS_PER_SEC
111         << "s." << endl;
112
113    return MutualInformation;
114
115}
116
117/**
118 * Computes the noiseless mutual information, or MIp (see classdoc), which is
119 * defined as:
120 *
121 * MIp(pos1, pos2) = MI(pos1, pos2) - (mMI(pos1) * mMI(pos2)) / mMI,
122 *
123 * where MIp is the noiseless mutual information between positions 1 and 2 and
124 * MI similarly. mMI(x) is the mean mutual information of position x, which is
125 * defined as:
126 *
127 * mMI(pos1) = 1 / m * \sum_{i \in positions} MI(pos1, i), where m is the
128 * length of the sequences.
129 *
130 * mMI is the overall mean mutual information, defined as:
131 *
132 * mMI = 2 / m * \sum_{i \in positions} \sum_{j in positions} MI(i, j), where
133 * again m is the length of the sequences.
134 *
135 * @param seq: the multiple sequence alignment.
136 *
137 * @returns the noiseless mutual information.
138 */
139MatrixXd Cma::computeMutualInformationP(VecVecType & seq) {
140
141    if (MutualInformation.size() == 0) {
142        computeMutualInformation(seq);
143    }
144
145    cout << "computing noiseless Mutual Information (MIp)... ";
146    flush(cout);
147    clock_t start = clock();
148    /*
149     * We must remove negative MI values (the negative values are artificially
150     * generated, see comments in computeJointEntropy).
151     */
152    for (int i = 0; i < MutualInformation.rows(); ++i) {
153        for (int j = 0; j < MutualInformation.cols(); ++j) {
154            if (MutualInformation(i, j) < 0.) {
155                MutualInformation(i, j) = 0.;
156            }
157        }
158    }
159
160    VectorXd mMIs = computeMeanMutualInformationVector();
161    double mMI = computeOveralMeanMutualInformation();
162
163    MutualInformationP = MutualInformation - ((mMIs * mMIs.transpose()) / mMI);
164
165    cout << "completed in " << ((double) clock() - start) / CLOCKS_PER_SEC
166         << "s." << endl;
167
168    return MutualInformationP;
169
170}
171
172/**
173 * Given a MI-matrix, computes the sorted list of the highest MI values.
174 *
175 * @param mutualInformation: a matrix with MI (or MIp) measures.
176 *
177 * @returns the list of tuples of positions sorted by their MI value.
178 */
179list<MITuple> Cma::compute_mituples(MatrixXd mutualInformation) {
180    size_t size = mutualInformation.cols();
181
182    list<MITuple> mituples;
183
184    // populate the mituples list with all entries in the upper triangular matrix
185    // of mutualInformation.
186    for (size_t i = 0; i < size; ++i) {
187        for (size_t j = i + 1; j < size; ++j) {
188            MITuple curr;
189            curr.MI = mutualInformation(i, j);
190            curr.pos1 = i;
191            curr.pos2 = j;
192            mituples.push_back(curr);
193        }
194    }
195
196    // sort mituples by MI-value
197    mituples.sort(compareMITuples);
198
199    return mituples;
200}
201
202/**
203 * Computes the clusters of positions that are most correlated, up to
204 * threshold.
205 *
206 * @param mituples: the mutualInformation matrix, as generated by
207 *                  Cma::computeMutualInformation.
208 *
209 * @param size: the original size of the alignment.
210 *
211 * @param threshold: the mutual information threshold. If two positions have
212 *                   mi < threshold, they won't be united in one cluster.
213 *
214 * @returns a vector with the cluster indices for each position.
215 */
216VectorXi Cma::computeClusters(list<MITuple> mituples, size_t size,
217                              double threshold) {
218
219    cout << "computing clusters of correlated positions... ";
220    flush(cout);
221
222    VectorXi result = VectorXi::Zero(size);
223
224    //compute the clusters.
225    int cluster = 1;
226    int rest = int(size);
227    for (list<MITuple>::iterator it = mituples.begin(); it != mituples.end(); ++it) {
228        double result1 = abs(result[it->pos1]);
229        double result2 = abs(result[it->pos2]);
230        if (it->MI <= threshold || rest == 0) {
231            break;
232        }
233        if (result1 < epsilon and result2 < epsilon) {
234            result[it->pos1] = cluster;
235            result[it->pos2] = cluster++;
236            rest -= 2;
237        }
238        else if (result1 > epsilon and result2 < epsilon) {
239            result[it->pos2] = result[it->pos1];
240            rest -= 1;
241        }
242        else if (result1 < epsilon and result2 > epsilon) {
243            result[it->pos1] = result[it->pos2];
244            rest -= 1;
245        }
246        else if (result1 > epsilon and result2 > epsilon && result1 - result2 > epsilon) {
247            unifyCluster(result[it->pos1], result[it->pos2], result);
248        }
249    }
250
251    clusters = result;
252
253    cout << "done." << endl;
254
255    return result;
256}
257
258//>-------------------------getters and setters--------------------------------
259
260MatrixXd Cma::getEntropy() {
261
262    return entropy;
263
264}
265
266MatrixXd Cma::getJointEntropy() {
267
268    return JointEntropy;
269
270}
271
272MatrixXd Cma::getMI() {
273
274    return MutualInformation;
275
276}
277
278MatrixXd Cma::getMIp() {
279
280    return MutualInformationP;
281
282}
283
284VectorXi Cma::getClusters() {
285
286    return clusters;
287
288}
289
290//>----------------------------private methods---------------------------------
291
292/**
293 * Here we define the alphabet.
294 */
295void Cma::initAlphabet(vector<string> alph) {
296
297    alphabet = vector<string> (alph);
298
299    int i = 0;
300    for (vector<string>::iterator it = alph.begin(); it != alph.end(); ++it) {
301        alphabet_map[*it] = i;
302        i++;
303    }
304    for (vector<string>::iterator it = alph.begin(); it != alph.end(); ++it) {
305        for (vector<string>::iterator it2 = alph.begin(); it2 != alph.end(); ++it2) {
306            alphabet_map[*it + *it2] = i;
307            i++;
308        }
309    }
310    alphabet_map["total"] = i;
311}
312
313/**
314 * Computes an approximation of the joint entropy for each pair of positions.
315 *
316 * The joint entropy of two positions in a set of aligned sequences
317 * is defined as:
318 * - ( p(A,A) * log2(p(A,A)) + p(A,C) * log2(p(A,C)) + ... +
319 *     p(C,A) * log2(p(C,A)) + p(C,C) * log2(p(C,A)) + ... +
320 *     ... +
321 *     p(T,A) * log2(p(T,A)) + ... + p(T) * log2(p(T,T)) ),
322 * where p(x,y) is the probability of observing base x at position 1 and
323 * base y at position 2.
324 *
325 * @param seqs: the aligned sequences
326 *
327 * @returns a square matrix with the joint entropy values for
328 *          each pair of positions.
329 */
330MatrixXd Cma::computeJointEntropy(const VecVecType & seqs) {
331
332    MapMatrixType hist = buildJointHistogram(seqs);
333    return Cma::computeJointEntropy(hist);
334
335}
336
337/**
338 * Computes an approximation of the joint entropy for each pair of positions.
339 * The measure also includes an heuristic to penalise mismatches in occurrences
340 * (see the documentation of this project for further explanations).
341 *
342 * @param hist: the histogram of labels for each position in the sequence.
343 *
344 * @returns a square matrix with the joint entropy values for
345 *          each pair of positions.
346 */
347MatrixXd Cma::computeJointEntropy(MapMatrixType & hist) {
348
349    cout << "computing joint entropy (this may take a while)... ";
350    flush(cout);
351    clock_t start = clock();
352
353    int hist_size = int(hist.size());
354    MatrixXd result(hist_size, hist_size);
355    result.setZero(hist_size, hist_size);
356    for (unsigned int i = 0; i < (unsigned int) hist_size; ++i) {
357
358        //progress "bar"
359        if (hist_size > 30 and i % (hist_size / 30) == 0) {
360            cout << "(" << setw(6) << setiosflags(ios::fixed)
361                 << setprecision(2) << i / float(hist_size) * 100 << "%)";
362            flush(cout);
363        }
364
365        for (unsigned int j = i; j < hist[i].size(); ++j) {
366            int total_i_j = hist[i][j][alphabet_map.at("total")];
367            int total_i_i = hist[i][i][alphabet_map.at("total")];
368            int total_j_j = hist[j][j][alphabet_map.at("total")];
369            int mismatches = total_i_i + total_j_j - 2 * total_i_j;
370            if (total_i_j != 0) {
371                double result_i_j = 0.;
372                for (map<int, int>::iterator it = hist[i][j].begin(); it
373                         != hist[i][j].end(); ++it) {
374                    double pair = double(it->second) + Cma::epsilon;
375                    result_i_j += (pair / total_i_j) * log2(pair / total_i_j);
376                }
377
378                result(i, j) = -result_i_j + double(mismatches) / (total_i_i
379                                                                   + total_j_j) * Cma::penalty;
380
381            } else {
382                /**
383                 * We will only fall into this case if we have two positions which
384                 * never occur simultaneously. In this case we cannot say anything about
385                 * the MI.
386                 * To guarantee that this joint entropy value will not come in our way in
387                 * the MI calculation we set it artificially to 5.0, which is 1 greater than
388                 * max(H(X) + H(Y)).
389                 */
390                result(i, j) = 5.;
391            }
392
393        }
394        //progress "bar"
395        if (hist_size > 30 and i % (hist_size / 30) == 0) {
396            cout << "\b\b\b\b\b\b\b\b\b";
397        }
398    }
399    flush(cout);
400    cout << "completed in " << ((double) clock() - start) / CLOCKS_PER_SEC
401         << "s." << endl;
402    return result;
403
404}
405
406/**
407 * Checks whether the key is valid (if it pertains to the alphabet).
408 *
409 * @param key: the base to be checked.
410 *
411 * @return whether the key is valid (if it pertains to the alphabet).
412 */
413bool Cma::isValid(string & key) {
414
415    map<string, int>::iterator it = alphabet_map.find(key);
416    return it != alphabet_map.end();
417
418}
419
420/**
421 * This method computes a matrix of maps, one for each pair of positions of the
422 * aligned sequences.
423 * Each map has an entry for each pair of occurring letters to count the joint
424 * occurrences of these letters.
425 *
426 * @param alignedSequences: an array of string vectors, containing the sequences.
427 *
428 * @returns a matrix of maps (see method description)
429 */
430MapMatrixType Cma::buildJointHistogram(const VecVecType & alignedSequences) {
431
432    cout << "building histogram (this may take a while)... ";
433    flush(cout);
434    clock_t start = clock();
435
436    if (alignedSequences.empty()) {
437        throw "no (marked?) sequences";
438    }
439
440    // here we defined one dimension of the matrix as being the same as the
441    // length of the sequences.
442    MapMatrixType result(alignedSequences[0].size(), MapVecType(
443                             alignedSequences[0].size()));
444
445    int ii = 0;
446    // for each sequence:
447    for (VecVecType::const_iterator it = alignedSequences.begin(); it
448             != alignedSequences.end(); ++it) {
449
450        cout << "(" << setw(6) << setiosflags(ios::fixed) << setprecision(2)
451             << ii / float(num_of_seqs) * 100 << "%)";
452        flush(cout);
453
454        // for each first position in the sequence
455        for (size_t i = 0; i < it->size(); ++i) {
456            string key1 = it->at(i);
457            if (isValid(key1)) {
458
459                //for each second position in the sequence
460                for (size_t j = i; j < it->size(); ++j) {
461                    string key2 = it->at(j);
462                    if (isValid(key2)) {
463                        result[i][j][alphabet_map.at(key1 + key2)] += 1;
464                        result[i][j][alphabet_map.at("total")] += 1;
465                    }
466                }
467            }
468        }
469        cout << "\b\b\b\b\b\b\b\b\b";
470        ii++;
471    }
472    flush(cout);
473    cout << "completed in " << ((double) clock() - start) / CLOCKS_PER_SEC
474         << "s." << endl;
475
476    return result;
477}
478
479/**
480 * Computes the vector of mean mutual information values mMI.
481 *
482 * mMI(pos1) = 1 / m * \sum_{i \in positions} MI(pos1, i), where m is the
483 * length of the sequences.
484 *
485 * @returns the vector of mean mutual information values mMI.
486 */
487VectorXd Cma::computeMeanMutualInformationVector() {
488
489    return MutualInformation.rowwise().sum() / num_of_seqs;
490
491}
492
493/**
494 * Computes the overall mean mutual information value.
495 *
496 * mMI = 2 / m * \sum_{i \in positions} \sum_{j in positions} MI(i, j), where
497 * again m is the length of the sequences.
498 *
499 * @returns the overall mean mutual information value.
500 */
501double Cma::computeOveralMeanMutualInformation() {
502
503    return 2. * MutualInformation.sum() / num_of_seqs / num_of_seqs;
504
505}
506
Note: See TracBrowser for help on using the repository browser.