1 | /* |
---|
2 | * Cma.h |
---|
3 | * |
---|
4 | * CMA stands for correlated mutation analysis. This class implements |
---|
5 | * information-theoretical measures to compute correlations between |
---|
6 | * positions in a set of aligned sequences. |
---|
7 | * The correlation values computed are Mutual Information (MI) and noiseless |
---|
8 | * Mutual Information (MIp). MIp is a correlation measure proposed in the |
---|
9 | * paper "Mutual information without the influence of phylogeny or entropy |
---|
10 | * dramatically improves residue contact prediction" by S. D. Dunn et al., |
---|
11 | * Bioinformatics Vol. 24 no. 3, 333-340, 2008. |
---|
12 | * The clustering of correlated positions follows a simple heuristic, which |
---|
13 | * can be found in Thomas Runkler, "Information Mining", chapter 5, 53-58, |
---|
14 | * 2000. |
---|
15 | * |
---|
16 | * Created on: Dec 16, 2009 |
---|
17 | * Author: Breno Faria |
---|
18 | * |
---|
19 | * Institute of Microbiology (Technical University Munich) |
---|
20 | * http://www.arb-home.de/ |
---|
21 | */ |
---|
22 | |
---|
23 | |
---|
24 | #ifndef CMA_H |
---|
25 | #define CMA_H |
---|
26 | |
---|
27 | #ifndef _GLIBCXX_IOSTREAM |
---|
28 | #include <iostream> |
---|
29 | #endif |
---|
30 | #ifndef _GLIBCXX_VECTOR |
---|
31 | #include <vector> |
---|
32 | #endif |
---|
33 | #ifndef _GLIBCXX_MAP |
---|
34 | #include <map> |
---|
35 | #endif |
---|
36 | #ifndef _GLIBCXX_CMATH |
---|
37 | #include <cmath> |
---|
38 | #endif |
---|
39 | #ifndef _GLIBCXX_STRING |
---|
40 | #include <string> |
---|
41 | #endif |
---|
42 | #ifndef _GLIBCXX_LIST |
---|
43 | #include <list> |
---|
44 | #endif |
---|
45 | |
---|
46 | #include <eigen/Eigen/Eigen> |
---|
47 | |
---|
48 | #ifndef CXXFORWARD_H |
---|
49 | #include <cxxforward.h> |
---|
50 | #endif |
---|
51 | |
---|
52 | using namespace std; // @@@ wtf! this really is a nono |
---|
53 | using Eigen::VectorXd; |
---|
54 | using Eigen::VectorXi; |
---|
55 | using Eigen::MatrixXd; |
---|
56 | |
---|
57 | // Vector of maps to store the histogram of sequences. |
---|
58 | typedef vector<map<int, int> > MapVecType; |
---|
59 | // Vector of vectors to store the aligned sequences (each one represented by a vector) |
---|
60 | typedef vector<vector<string> > VecVecType; // @@@ string always is length 1. this should be vector<vector<char>> or vector<string> |
---|
61 | // Matrix of maps to store the joint histogram of sequences. |
---|
62 | typedef vector<MapVecType> MapMatrixType; |
---|
63 | |
---|
64 | // Struct representing the tuple (pos1, pos2, mutualInformation(pos1,pos2)). |
---|
65 | struct MITuple { |
---|
66 | int pos1; |
---|
67 | int pos2; |
---|
68 | double MI; |
---|
69 | }; |
---|
70 | |
---|
71 | class Cma { |
---|
72 | |
---|
73 | private: |
---|
74 | /** |
---|
75 | * This is used for numerical stability and to compare floating point numbers. |
---|
76 | */ |
---|
77 | static CONSTEXPR double epsilon = 1.0e-9; |
---|
78 | /** |
---|
79 | * See documentation of project, p. 7. |
---|
80 | */ |
---|
81 | static CONSTEXPR double penalty = 5.; |
---|
82 | /** |
---|
83 | * The alphabet considered. |
---|
84 | */ |
---|
85 | vector<string> alphabet; // @@@ string always is length 1. this should be vector<char> |
---|
86 | /** |
---|
87 | * A map from alphabet letters to integers. This speeds things considerably up, |
---|
88 | * and we use less memory. |
---|
89 | */ |
---|
90 | map<string, int> alphabet_map; // @@@ string is length 1 most of the time, disregarding alphabet_map["total"] which is used as global counter. this should be vector<char> |
---|
91 | /** |
---|
92 | * The number of sequences in the MSA. |
---|
93 | */ |
---|
94 | int num_of_seqs; |
---|
95 | /** |
---|
96 | * A vector with the entropies for eac position. |
---|
97 | */ |
---|
98 | VectorXd entropy; |
---|
99 | /** |
---|
100 | * A matrix with all joint entropies. |
---|
101 | */ |
---|
102 | MatrixXd JointEntropy; |
---|
103 | /** |
---|
104 | * A matrix with the MI values for each pair of positions. |
---|
105 | */ |
---|
106 | MatrixXd MutualInformation; |
---|
107 | /** |
---|
108 | * A matrix with the MIp values for each pair of positions. |
---|
109 | * See Dunn et al. "Mutual Information without the influence |
---|
110 | * of phylogeny or entropy dramatically improves residue contact |
---|
111 | * prediction", Bioinformatics Vol.24 no.3 333-340, 2008 |
---|
112 | */ |
---|
113 | MatrixXd MutualInformationP; |
---|
114 | /** |
---|
115 | * A vector with the cluster indices. |
---|
116 | */ |
---|
117 | VectorXi clusters; |
---|
118 | |
---|
119 | /** |
---|
120 | * Initialises the alphabet. |
---|
121 | */ |
---|
122 | void initAlphabet(vector<string> alph); |
---|
123 | /** |
---|
124 | * Builds the joint histogram of every pair of positions. |
---|
125 | */ |
---|
126 | MapMatrixType buildJointHistogram(const VecVecType & alignedSequences); |
---|
127 | /** |
---|
128 | * Compute the joint entropy. |
---|
129 | */ |
---|
130 | MatrixXd computeJointEntropy(MapMatrixType & hist); |
---|
131 | MatrixXd computeJointEntropy(const VecVecType & seq); |
---|
132 | /** |
---|
133 | * Used to compute the MIp. |
---|
134 | */ |
---|
135 | VectorXd computeMeanMutualInformationVector(); |
---|
136 | double computeOveralMeanMutualInformation(); |
---|
137 | /** |
---|
138 | * Checks whether a letter pertains to the alphabet. |
---|
139 | */ |
---|
140 | bool isValid(string & key1); |
---|
141 | |
---|
142 | |
---|
143 | public: |
---|
144 | |
---|
145 | /** |
---|
146 | * Computes the MI. |
---|
147 | */ |
---|
148 | MatrixXd computeMutualInformation(VecVecType & seq); |
---|
149 | /** |
---|
150 | * Computes the MIp. |
---|
151 | */ |
---|
152 | MatrixXd computeMutualInformationP(VecVecType & seq); |
---|
153 | /** |
---|
154 | * Computes the clusters. |
---|
155 | */ |
---|
156 | VectorXi computeClusters(list<MITuple> mituples, size_t size, double threshold); |
---|
157 | /** |
---|
158 | * Transforms the upper triangular matrix of a symmetric matrix into a |
---|
159 | * sortable list of MITuples. |
---|
160 | */ |
---|
161 | list<MITuple> compute_mituples(MatrixXd mutualInformation); |
---|
162 | |
---|
163 | /** |
---|
164 | * Getters for the fields. |
---|
165 | */ |
---|
166 | MatrixXd getMI(); |
---|
167 | MatrixXd getMIp(); |
---|
168 | MatrixXd getEntropy(); |
---|
169 | MatrixXd getJointEntropy(); |
---|
170 | VectorXi getClusters(); |
---|
171 | |
---|
172 | /** |
---|
173 | * Birth and death... |
---|
174 | */ |
---|
175 | Cma(vector<string> & alph, int seq_number); |
---|
176 | virtual ~Cma(); |
---|
177 | }; |
---|
178 | |
---|
179 | #else |
---|
180 | #error Cma.h included twice |
---|
181 | #endif // CMA_H |
---|