| 1 | /* |
|---|
| 2 | * Cma.h |
|---|
| 3 | * |
|---|
| 4 | * CMA stands for correlated mutation analysis. This class implements |
|---|
| 5 | * information-theoretical measures to compute correlations between |
|---|
| 6 | * positions in a set of aligned sequences. |
|---|
| 7 | * The correlation values computed are Mutual Information (MI) and noiseless |
|---|
| 8 | * Mutual Information (MIp). MIp is a correlation measure proposed in the |
|---|
| 9 | * paper "Mutual information without the influence of phylogeny or entropy |
|---|
| 10 | * dramatically improves residue contact prediction" by S. D. Dunn et al., |
|---|
| 11 | * Bioinformatics Vol. 24 no. 3, 333-340, 2008. |
|---|
| 12 | * The clustering of correlated positions follows a simple heuristic, which |
|---|
| 13 | * can be found in Thomas Runkler, "Information Mining", chapter 5, 53-58, |
|---|
| 14 | * 2000. |
|---|
| 15 | * |
|---|
| 16 | * Created on: Dec 16, 2009 |
|---|
| 17 | * Author: Breno Faria |
|---|
| 18 | * |
|---|
| 19 | * Institute of Microbiology (Technical University Munich) |
|---|
| 20 | * http://www.arb-home.de/ |
|---|
| 21 | */ |
|---|
| 22 | |
|---|
| 23 | |
|---|
| 24 | #ifndef CMA_H |
|---|
| 25 | #define CMA_H |
|---|
| 26 | |
|---|
| 27 | #ifndef _GLIBCXX_IOSTREAM |
|---|
| 28 | #include <iostream> |
|---|
| 29 | #endif |
|---|
| 30 | #ifndef _GLIBCXX_VECTOR |
|---|
| 31 | #include <vector> |
|---|
| 32 | #endif |
|---|
| 33 | #ifndef _GLIBCXX_MAP |
|---|
| 34 | #include <map> |
|---|
| 35 | #endif |
|---|
| 36 | #ifndef _GLIBCXX_CMATH |
|---|
| 37 | #include <cmath> |
|---|
| 38 | #endif |
|---|
| 39 | #ifndef _GLIBCXX_STRING |
|---|
| 40 | #include <string> |
|---|
| 41 | #endif |
|---|
| 42 | #ifndef _GLIBCXX_LIST |
|---|
| 43 | #include <list> |
|---|
| 44 | #endif |
|---|
| 45 | |
|---|
| 46 | #include <eigen/Eigen/Eigen> |
|---|
| 47 | |
|---|
| 48 | #ifndef CXXFORWARD_H |
|---|
| 49 | #include <cxxforward.h> |
|---|
| 50 | #endif |
|---|
| 51 | |
|---|
| 52 | using namespace std; // @@@ wtf! this really is a nono |
|---|
| 53 | using Eigen::VectorXd; |
|---|
| 54 | using Eigen::VectorXi; |
|---|
| 55 | using Eigen::MatrixXd; |
|---|
| 56 | |
|---|
| 57 | // Vector of maps to store the histogram of sequences. |
|---|
| 58 | typedef vector<map<int, int> > MapVecType; |
|---|
| 59 | // Vector of vectors to store the aligned sequences (each one represented by a vector) |
|---|
| 60 | typedef vector<vector<string> > VecVecType; // @@@ string always is length 1. this should be vector<vector<char>> or vector<string> |
|---|
| 61 | // Matrix of maps to store the joint histogram of sequences. |
|---|
| 62 | typedef vector<MapVecType> MapMatrixType; |
|---|
| 63 | |
|---|
| 64 | // Struct representing the tuple (pos1, pos2, mutualInformation(pos1,pos2)). |
|---|
| 65 | struct MITuple { |
|---|
| 66 | int pos1; |
|---|
| 67 | int pos2; |
|---|
| 68 | double MI; |
|---|
| 69 | }; |
|---|
| 70 | |
|---|
| 71 | class Cma FINAL_TYPE { |
|---|
| 72 | |
|---|
| 73 | private: |
|---|
| 74 | /** |
|---|
| 75 | * This is used for numerical stability and to compare floating point numbers. |
|---|
| 76 | */ |
|---|
| 77 | static CONSTEXPR double epsilon = 1.0e-9; |
|---|
| 78 | /** |
|---|
| 79 | * See documentation of project, p. 7. |
|---|
| 80 | */ |
|---|
| 81 | static CONSTEXPR double penalty = 5.; |
|---|
| 82 | /** |
|---|
| 83 | * The alphabet considered. |
|---|
| 84 | */ |
|---|
| 85 | vector<string> alphabet; // @@@ string always is length 1. this should be vector<char> |
|---|
| 86 | /** |
|---|
| 87 | * A map from alphabet letters to integers. This speeds things considerably up, |
|---|
| 88 | * and we use less memory. |
|---|
| 89 | */ |
|---|
| 90 | map<string, int> alphabet_map; // @@@ string is length 1 most of the time, disregarding alphabet_map["total"] which is used as global counter. this should be vector<char> |
|---|
| 91 | /** |
|---|
| 92 | * The number of sequences in the MSA. |
|---|
| 93 | */ |
|---|
| 94 | int num_of_seqs; |
|---|
| 95 | /** |
|---|
| 96 | * A vector with the entropies for eac position. |
|---|
| 97 | */ |
|---|
| 98 | VectorXd entropy; |
|---|
| 99 | /** |
|---|
| 100 | * A matrix with all joint entropies. |
|---|
| 101 | */ |
|---|
| 102 | MatrixXd JointEntropy; |
|---|
| 103 | /** |
|---|
| 104 | * A matrix with the MI values for each pair of positions. |
|---|
| 105 | */ |
|---|
| 106 | MatrixXd MutualInformation; |
|---|
| 107 | /** |
|---|
| 108 | * A matrix with the MIp values for each pair of positions. |
|---|
| 109 | * See Dunn et al. "Mutual Information without the influence |
|---|
| 110 | * of phylogeny or entropy dramatically improves residue contact |
|---|
| 111 | * prediction", Bioinformatics Vol.24 no.3 333-340, 2008 |
|---|
| 112 | */ |
|---|
| 113 | MatrixXd MutualInformationP; |
|---|
| 114 | /** |
|---|
| 115 | * A vector with the cluster indices. |
|---|
| 116 | */ |
|---|
| 117 | VectorXi clusters; |
|---|
| 118 | |
|---|
| 119 | /** |
|---|
| 120 | * Initialises the alphabet. |
|---|
| 121 | */ |
|---|
| 122 | void initAlphabet(vector<string> alph); |
|---|
| 123 | /** |
|---|
| 124 | * Builds the joint histogram of every pair of positions. |
|---|
| 125 | */ |
|---|
| 126 | MapMatrixType buildJointHistogram(const VecVecType & alignedSequences); |
|---|
| 127 | /** |
|---|
| 128 | * Compute the joint entropy. |
|---|
| 129 | */ |
|---|
| 130 | MatrixXd computeJointEntropy(MapMatrixType & hist); |
|---|
| 131 | MatrixXd computeJointEntropy(const VecVecType & seq); |
|---|
| 132 | /** |
|---|
| 133 | * Used to compute the MIp. |
|---|
| 134 | */ |
|---|
| 135 | VectorXd computeMeanMutualInformationVector(); |
|---|
| 136 | double computeOveralMeanMutualInformation(); |
|---|
| 137 | /** |
|---|
| 138 | * Checks whether a letter pertains to the alphabet. |
|---|
| 139 | */ |
|---|
| 140 | bool isValid(string & key1); |
|---|
| 141 | |
|---|
| 142 | |
|---|
| 143 | public: |
|---|
| 144 | |
|---|
| 145 | /** |
|---|
| 146 | * Computes the MI. |
|---|
| 147 | */ |
|---|
| 148 | MatrixXd computeMutualInformation(VecVecType & seq); |
|---|
| 149 | /** |
|---|
| 150 | * Computes the MIp. |
|---|
| 151 | */ |
|---|
| 152 | MatrixXd computeMutualInformationP(VecVecType & seq); |
|---|
| 153 | /** |
|---|
| 154 | * Computes the clusters. |
|---|
| 155 | */ |
|---|
| 156 | VectorXi computeClusters(list<MITuple> mituples, size_t size, double threshold); |
|---|
| 157 | /** |
|---|
| 158 | * Transforms the upper triangular matrix of a symmetric matrix into a |
|---|
| 159 | * sortable list of MITuples. |
|---|
| 160 | */ |
|---|
| 161 | list<MITuple> compute_mituples(MatrixXd mutualInformation); |
|---|
| 162 | |
|---|
| 163 | /** |
|---|
| 164 | * Getters for the fields. |
|---|
| 165 | */ |
|---|
| 166 | MatrixXd getMI(); |
|---|
| 167 | MatrixXd getMIp(); |
|---|
| 168 | MatrixXd getEntropy(); |
|---|
| 169 | MatrixXd getJointEntropy(); |
|---|
| 170 | VectorXi getClusters(); |
|---|
| 171 | |
|---|
| 172 | /** |
|---|
| 173 | * Birth and death... |
|---|
| 174 | */ |
|---|
| 175 | Cma(vector<string> & alph, int seq_number); |
|---|
| 176 | virtual ~Cma(); |
|---|
| 177 | }; |
|---|
| 178 | |
|---|
| 179 | #else |
|---|
| 180 | #error Cma.h included twice |
|---|
| 181 | #endif // CMA_H |
|---|