source: tags/svn.1.5.4/RNACMA/Cma.h

Last change on this file was 8321, checked in by westram, 14 years ago
  • the daily wtf
File size: 4.5 KB
Line 
1/*
2 * Cma.h
3 *
4 * CMA stands for correlated mutation analysis. This class implements
5 * information-theoretical measures to compute correlations between
6 * positions in a set of aligned sequences.
7 * The correlation values computed are Mutual Information (MI) and noiseless
8 * Mutual Information (MIp). MIp is a correlation measure proposed in the
9 * paper "Mutual information without the influence of phylogeny or entropy
10 * dramatically improves residue contact prediction" by S. D. Dunn et al.,
11 * Bioinformatics Vol. 24 no. 3, 333-340, 2008.
12 * The clustering of correlated positions follows a simple heuristic, which
13 * can be found in Thomas Runkler, "Information Mining", chapter 5, 53-58,
14 * 2000.
15 *
16 *  Created on: Dec 16, 2009
17 *      Author: Breno Faria
18 */
19
20#ifndef CMA_H_
21#define CMA_H_
22
23#include <iostream>
24#include <eigen/Eigen/Eigen>
25#include <vector>
26#include <map>
27#include <math.h>
28#include <string>
29#include <list>
30
31using namespace std;
32using Eigen::VectorXd;
33using Eigen::VectorXi;
34using Eigen::MatrixXd;
35
36// Vector of maps to store the histogram of sequences.
37typedef vector<map<int, int> > MapVecType;
38// Vector of vectors to store the aligned sequences (each one represented by a vector)
39typedef vector<vector<string> > VecVecType; // @@@ string always is length 1. this should be vector<vector<char>> or vector<string>
40// Matrix of maps to store the joint histogram of sequences.
41typedef vector<MapVecType> MapMatrixType;
42
43// Struct representing the tuple (pos1, pos2, mutualInformation(pos1,pos2)).
44struct MITuple {
45    int pos1;
46    int pos2;
47    double MI;
48};
49
50
51
52class Cma {
53
54private:
55    /**
56     * This is used for numerical stability and to compare floating point numbers.
57     */
58    static const double epsilon = 1.0e-9;
59    /**
60     * See documentation of project, p. 7.
61     */
62    static const double penalty = 5.;
63    /**
64     * The alphabet considered.
65     */
66    vector<string> alphabet; // @@@ string always is length 1. this should be vector<char>
67    /**
68     * A map from alphabet letters to integers. This speeds things considerably up,
69     * and we use less memory.
70     */
71    map<string, int> alphabet_map; // @@@ string is length 1 most of the time, disregarding alphabet_map["total"] which is used as global counter. this should be vector<char>
72    /**
73     * The number of sequences in the MSA.
74     */
75    int num_of_seqs;
76    /**
77     * A vector with the entropies for eac position.
78     */
79    VectorXd entropy;
80    /**
81     * A matrix with all joint entropies.
82     */
83    MatrixXd JointEntropy;
84    /**
85     * A matrix with the MI values for each pair of positions.
86     */
87    MatrixXd MutualInformation;
88    /**
89     * A matrix with the MIp values for each pair of positions.
90     * See Dunn et al. "Mutual Information without the influence
91     * of phylogeny or entropy dramatically improves residue contact
92     * prediction", Bioinformatics Vol.24 no.3 333-340, 2008
93     */
94    MatrixXd MutualInformationP;
95    /**
96     * A vector with the cluster indices.
97     */
98    VectorXi clusters;
99
100    /**
101     * Initialises the alphabet.
102     */
103    void initAlphabet(vector<string> alph);
104    /**
105     * Builds the joint histogram of every pair of positions.
106     */
107    MapMatrixType buildJointHistogram(const VecVecType & alignedSequences);
108    /**
109     * Compute the joint entropy.
110     */
111    MatrixXd computeJointEntropy(MapMatrixType & hist);
112    MatrixXd computeJointEntropy(const VecVecType & seq);
113    /**
114     * Used to compute the MIp.
115     */
116    VectorXd computeMeanMutualInformationVector();
117    double computeOveralMeanMutualInformation();
118    /**
119     * Checks whether a letter pertains to the alphabet.
120     */
121    bool isValid(string & key1);
122
123
124public:
125
126    /**
127     * Computes the MI.
128     */
129    MatrixXd computeMutualInformation(VecVecType & seq);
130    /**
131     * Computes the MIp.
132     */
133    MatrixXd computeMutualInformationP(VecVecType & seq);
134    /**
135     * Computes the clusters.
136     */
137    VectorXi computeClusters(list<MITuple> mituples, size_t size, double threshold);
138    /**
139     * Transforms the upper triangular matrix of a symmetric matrix into a
140     * sortable list of MITuples.
141     */
142    list<MITuple> compute_mituples(MatrixXd mutualInformation);
143
144    /**
145     * Getters for the fields.
146     */
147    MatrixXd getMI();
148    MatrixXd getMIp();
149    MatrixXd getEntropy();
150    MatrixXd getJointEntropy();
151    VectorXi getClusters();
152
153    /**
154     * Birth and death...
155     */
156    Cma(vector<string> & alph, int seq_number);
157    virtual ~Cma();
158};
159
160#endif /* CMA_H_ */
Note: See TracBrowser for help on using the repository browser.