source: branches/profile/RNACMA/Cma.h

Last change on this file was 9520, checked in by westram, 11 years ago
  • include cxxforward.h via arbtools.h (which declares Noncopyable)
File size: 4.8 KB
Line 
1/*
2 * Cma.h
3 *
4 * CMA stands for correlated mutation analysis. This class implements
5 * information-theoretical measures to compute correlations between
6 * positions in a set of aligned sequences.
7 * The correlation values computed are Mutual Information (MI) and noiseless
8 * Mutual Information (MIp). MIp is a correlation measure proposed in the
9 * paper "Mutual information without the influence of phylogeny or entropy
10 * dramatically improves residue contact prediction" by S. D. Dunn et al.,
11 * Bioinformatics Vol. 24 no. 3, 333-340, 2008.
12 * The clustering of correlated positions follows a simple heuristic, which
13 * can be found in Thomas Runkler, "Information Mining", chapter 5, 53-58,
14 * 2000.
15 *
16 *  Created on: Dec 16, 2009
17 *      Author: Breno Faria
18 *
19 *  Institute of Microbiology (Technical University Munich)
20 *  http://www.arb-home.de/
21 */
22
23 
24#ifndef CMA_H
25#define CMA_H
26
27#ifndef _GLIBCXX_IOSTREAM
28#include <iostream>
29#endif
30#ifndef _GLIBCXX_VECTOR
31#include <vector>
32#endif
33#ifndef _GLIBCXX_MAP
34#include <map>
35#endif
36#ifndef _GLIBCXX_CMATH
37#include <cmath>
38#endif
39#ifndef _GLIBCXX_STRING
40#include <string>
41#endif
42#ifndef _GLIBCXX_LIST
43#include <list>
44#endif
45
46#include <eigen/Eigen/Eigen>
47
48#ifndef CXXFORWARD_H
49#include <cxxforward.h>
50#endif
51
52using namespace std; // @@@ wtf! this really is a nono
53using Eigen::VectorXd;
54using Eigen::VectorXi;
55using Eigen::MatrixXd;
56
57// Vector of maps to store the histogram of sequences.
58typedef vector<map<int, int> > MapVecType;
59// Vector of vectors to store the aligned sequences (each one represented by a vector)
60typedef vector<vector<string> > VecVecType; // @@@ string always is length 1. this should be vector<vector<char>> or vector<string>
61// Matrix of maps to store the joint histogram of sequences.
62typedef vector<MapVecType> MapMatrixType;
63
64// Struct representing the tuple (pos1, pos2, mutualInformation(pos1,pos2)).
65struct MITuple {
66    int pos1;
67    int pos2;
68    double MI;
69};
70
71class Cma {
72
73private:
74    /**
75     * This is used for numerical stability and to compare floating point numbers.
76     */
77    static CONSTEXPR double epsilon = 1.0e-9;
78    /**
79     * See documentation of project, p. 7.
80     */
81    static CONSTEXPR double penalty = 5.;
82    /**
83     * The alphabet considered.
84     */
85    vector<string> alphabet; // @@@ string always is length 1. this should be vector<char>
86    /**
87     * A map from alphabet letters to integers. This speeds things considerably up,
88     * and we use less memory.
89     */
90    map<string, int> alphabet_map; // @@@ string is length 1 most of the time, disregarding alphabet_map["total"] which is used as global counter. this should be vector<char>
91    /**
92     * The number of sequences in the MSA.
93     */
94    int num_of_seqs;
95    /**
96     * A vector with the entropies for eac position.
97     */
98    VectorXd entropy;
99    /**
100     * A matrix with all joint entropies.
101     */
102    MatrixXd JointEntropy;
103    /**
104     * A matrix with the MI values for each pair of positions.
105     */
106    MatrixXd MutualInformation;
107    /**
108     * A matrix with the MIp values for each pair of positions.
109     * See Dunn et al. "Mutual Information without the influence
110     * of phylogeny or entropy dramatically improves residue contact
111     * prediction", Bioinformatics Vol.24 no.3 333-340, 2008
112     */
113    MatrixXd MutualInformationP;
114    /**
115     * A vector with the cluster indices.
116     */
117    VectorXi clusters;
118
119    /**
120     * Initialises the alphabet.
121     */
122    void initAlphabet(vector<string> alph);
123    /**
124     * Builds the joint histogram of every pair of positions.
125     */
126    MapMatrixType buildJointHistogram(const VecVecType & alignedSequences);
127    /**
128     * Compute the joint entropy.
129     */
130    MatrixXd computeJointEntropy(MapMatrixType & hist);
131    MatrixXd computeJointEntropy(const VecVecType & seq);
132    /**
133     * Used to compute the MIp.
134     */
135    VectorXd computeMeanMutualInformationVector();
136    double computeOveralMeanMutualInformation();
137    /**
138     * Checks whether a letter pertains to the alphabet.
139     */
140    bool isValid(string & key1);
141
142
143public:
144
145    /**
146     * Computes the MI.
147     */
148    MatrixXd computeMutualInformation(VecVecType & seq);
149    /**
150     * Computes the MIp.
151     */
152    MatrixXd computeMutualInformationP(VecVecType & seq);
153    /**
154     * Computes the clusters.
155     */
156    VectorXi computeClusters(list<MITuple> mituples, size_t size, double threshold);
157    /**
158     * Transforms the upper triangular matrix of a symmetric matrix into a
159     * sortable list of MITuples.
160     */
161    list<MITuple> compute_mituples(MatrixXd mutualInformation);
162
163    /**
164     * Getters for the fields.
165     */
166    MatrixXd getMI();
167    MatrixXd getMIp();
168    MatrixXd getEntropy();
169    MatrixXd getJointEntropy();
170    VectorXi getClusters();
171
172    /**
173     * Birth and death...
174     */
175    Cma(vector<string> & alph, int seq_number);
176    virtual ~Cma();
177};
178
179#else
180#error Cma.h included twice
181#endif // CMA_H
Note: See TracBrowser for help on using the repository browser.