1 | /* |
---|
2 | * Cma.h |
---|
3 | * |
---|
4 | * CMA stands for correlated mutation analysis. This class implements |
---|
5 | * information-theoretical measures to compute correlations between |
---|
6 | * positions in a set of aligned sequences. |
---|
7 | * The correlation values computed are Mutual Information (MI) and noiseless |
---|
8 | * Mutual Information (MIp). MIp is a correlation measure proposed in the |
---|
9 | * paper "Mutual information without the influence of phylogeny or entropy |
---|
10 | * dramatically improves residue contact prediction" by S. D. Dunn et al., |
---|
11 | * Bioinformatics Vol. 24 no. 3, 333-340, 2008. |
---|
12 | * The clustering of correlated positions follows a simple heuristic, which |
---|
13 | * can be found in Thomas Runkler, "Information Mining", chapter 5, 53-58, |
---|
14 | * 2000. |
---|
15 | * |
---|
16 | * Created on: Dec 16, 2009 |
---|
17 | * Author: Breno Faria |
---|
18 | */ |
---|
19 | |
---|
20 | #ifndef CMA_H_ |
---|
21 | #define CMA_H_ |
---|
22 | |
---|
23 | #include <iostream> |
---|
24 | #include <eigen/Eigen/Eigen> |
---|
25 | #include <vector> |
---|
26 | #include <map> |
---|
27 | #include <math.h> |
---|
28 | #include <string> |
---|
29 | #include <list> |
---|
30 | |
---|
31 | using namespace std; |
---|
32 | using Eigen::VectorXd; |
---|
33 | using Eigen::VectorXi; |
---|
34 | using Eigen::MatrixXd; |
---|
35 | |
---|
36 | // Vector of maps to store the histogram of sequences. |
---|
37 | typedef vector<map<int, int> > MapVecType; |
---|
38 | // Vector of vectors to store the aligned sequences (each one represented by a vector) |
---|
39 | typedef vector<vector<string> > VecVecType; // @@@ string always is length 1. this should be vector<vector<char>> or vector<string> |
---|
40 | // Matrix of maps to store the joint histogram of sequences. |
---|
41 | typedef vector<MapVecType> MapMatrixType; |
---|
42 | |
---|
43 | // Struct representing the tuple (pos1, pos2, mutualInformation(pos1,pos2)). |
---|
44 | struct MITuple { |
---|
45 | int pos1; |
---|
46 | int pos2; |
---|
47 | double MI; |
---|
48 | }; |
---|
49 | |
---|
50 | |
---|
51 | |
---|
52 | class Cma { |
---|
53 | |
---|
54 | private: |
---|
55 | /** |
---|
56 | * This is used for numerical stability and to compare floating point numbers. |
---|
57 | */ |
---|
58 | static const double epsilon = 1.0e-9; |
---|
59 | /** |
---|
60 | * See documentation of project, p. 7. |
---|
61 | */ |
---|
62 | static const double penalty = 5.; |
---|
63 | /** |
---|
64 | * The alphabet considered. |
---|
65 | */ |
---|
66 | vector<string> alphabet; // @@@ string always is length 1. this should be vector<char> |
---|
67 | /** |
---|
68 | * A map from alphabet letters to integers. This speeds things considerably up, |
---|
69 | * and we use less memory. |
---|
70 | */ |
---|
71 | map<string, int> alphabet_map; // @@@ string is length 1 most of the time, disregarding alphabet_map["total"] which is used as global counter. this should be vector<char> |
---|
72 | /** |
---|
73 | * The number of sequences in the MSA. |
---|
74 | */ |
---|
75 | int num_of_seqs; |
---|
76 | /** |
---|
77 | * A vector with the entropies for eac position. |
---|
78 | */ |
---|
79 | VectorXd entropy; |
---|
80 | /** |
---|
81 | * A matrix with all joint entropies. |
---|
82 | */ |
---|
83 | MatrixXd JointEntropy; |
---|
84 | /** |
---|
85 | * A matrix with the MI values for each pair of positions. |
---|
86 | */ |
---|
87 | MatrixXd MutualInformation; |
---|
88 | /** |
---|
89 | * A matrix with the MIp values for each pair of positions. |
---|
90 | * See Dunn et al. "Mutual Information without the influence |
---|
91 | * of phylogeny or entropy dramatically improves residue contact |
---|
92 | * prediction", Bioinformatics Vol.24 no.3 333-340, 2008 |
---|
93 | */ |
---|
94 | MatrixXd MutualInformationP; |
---|
95 | /** |
---|
96 | * A vector with the cluster indices. |
---|
97 | */ |
---|
98 | VectorXi clusters; |
---|
99 | |
---|
100 | /** |
---|
101 | * Initialises the alphabet. |
---|
102 | */ |
---|
103 | void initAlphabet(vector<string> alph); |
---|
104 | /** |
---|
105 | * Builds the joint histogram of every pair of positions. |
---|
106 | */ |
---|
107 | MapMatrixType buildJointHistogram(const VecVecType & alignedSequences); |
---|
108 | /** |
---|
109 | * Compute the joint entropy. |
---|
110 | */ |
---|
111 | MatrixXd computeJointEntropy(MapMatrixType & hist); |
---|
112 | MatrixXd computeJointEntropy(const VecVecType & seq); |
---|
113 | /** |
---|
114 | * Used to compute the MIp. |
---|
115 | */ |
---|
116 | VectorXd computeMeanMutualInformationVector(); |
---|
117 | double computeOveralMeanMutualInformation(); |
---|
118 | /** |
---|
119 | * Checks whether a letter pertains to the alphabet. |
---|
120 | */ |
---|
121 | bool isValid(string & key1); |
---|
122 | |
---|
123 | |
---|
124 | public: |
---|
125 | |
---|
126 | /** |
---|
127 | * Computes the MI. |
---|
128 | */ |
---|
129 | MatrixXd computeMutualInformation(VecVecType & seq); |
---|
130 | /** |
---|
131 | * Computes the MIp. |
---|
132 | */ |
---|
133 | MatrixXd computeMutualInformationP(VecVecType & seq); |
---|
134 | /** |
---|
135 | * Computes the clusters. |
---|
136 | */ |
---|
137 | VectorXi computeClusters(list<MITuple> mituples, size_t size, double threshold); |
---|
138 | /** |
---|
139 | * Transforms the upper triangular matrix of a symmetric matrix into a |
---|
140 | * sortable list of MITuples. |
---|
141 | */ |
---|
142 | list<MITuple> compute_mituples(MatrixXd mutualInformation); |
---|
143 | |
---|
144 | /** |
---|
145 | * Getters for the fields. |
---|
146 | */ |
---|
147 | MatrixXd getMI(); |
---|
148 | MatrixXd getMIp(); |
---|
149 | MatrixXd getEntropy(); |
---|
150 | MatrixXd getJointEntropy(); |
---|
151 | VectorXi getClusters(); |
---|
152 | |
---|
153 | /** |
---|
154 | * Birth and death... |
---|
155 | */ |
---|
156 | Cma(vector<string> & alph, int seq_number); |
---|
157 | virtual ~Cma(); |
---|
158 | }; |
---|
159 | |
---|
160 | #endif /* CMA_H_ */ |
---|