| 1 | /* |
|---|
| 2 | * Analyser.cpp |
|---|
| 3 | * |
|---|
| 4 | * Responsible for interacting with the Cma code and Arb. |
|---|
| 5 | * |
|---|
| 6 | * Created on: Feb 15, 2010 |
|---|
| 7 | * Author: breno |
|---|
| 8 | * |
|---|
| 9 | * Institute of Microbiology (Technical University Munich) |
|---|
| 10 | * http://www.arb-home.de/ |
|---|
| 11 | */ |
|---|
| 12 | |
|---|
| 13 | #include "Analyser.h" |
|---|
| 14 | #include <time.h> |
|---|
| 15 | #include <iostream> |
|---|
| 16 | #include <iterator> |
|---|
| 17 | #include "dbconn.h" |
|---|
| 18 | |
|---|
| 19 | /** |
|---|
| 20 | * Constructor |
|---|
| 21 | */ |
|---|
| 22 | Analyser::Analyser() { |
|---|
| 23 | |
|---|
| 24 | //Definition of the alphabet |
|---|
| 25 | vector<string> alphabet(0); |
|---|
| 26 | alphabet.push_back("A"); |
|---|
| 27 | alphabet.push_back("C"); |
|---|
| 28 | alphabet.push_back("G"); |
|---|
| 29 | alphabet.push_back("T"); |
|---|
| 30 | |
|---|
| 31 | loader = new AlignedSequenceLoader; |
|---|
| 32 | VecVecType *seqs = loader->getSequences(); |
|---|
| 33 | |
|---|
| 34 | cma = new Cma(alphabet, seqs->size()); |
|---|
| 35 | } |
|---|
| 36 | |
|---|
| 37 | /** |
|---|
| 38 | * Destructor. |
|---|
| 39 | */ |
|---|
| 40 | Analyser::~Analyser() { |
|---|
| 41 | delete loader; |
|---|
| 42 | delete cma; |
|---|
| 43 | } |
|---|
| 44 | |
|---|
| 45 | /** |
|---|
| 46 | * Getter for the Cma object. |
|---|
| 47 | * |
|---|
| 48 | * @return the Cma object. |
|---|
| 49 | */ |
|---|
| 50 | Cma* Analyser::getCma() { |
|---|
| 51 | return cma; |
|---|
| 52 | } |
|---|
| 53 | |
|---|
| 54 | /** |
|---|
| 55 | * Returns the AlignedSequenceLoader object. |
|---|
| 56 | * |
|---|
| 57 | * @return the AlignedSequenceLoader object. |
|---|
| 58 | */ |
|---|
| 59 | AlignedSequenceLoader* Analyser::getLoader() { |
|---|
| 60 | return loader; |
|---|
| 61 | } |
|---|
| 62 | |
|---|
| 63 | /** |
|---|
| 64 | * Saves the clusters to the DB as SAI. |
|---|
| 65 | * |
|---|
| 66 | * @param clusters: the cluster indices. |
|---|
| 67 | * @param threshold: the clustering threshold used. |
|---|
| 68 | * |
|---|
| 69 | * @return an GB_ERROR if some DB transaction fails. |
|---|
| 70 | */ |
|---|
| 71 | GB_ERROR Analyser::saveSAI(vector<size_t> clusters, double threshold) { |
|---|
| 72 | |
|---|
| 73 | GBDATA *gb_main = runningDatabase(); |
|---|
| 74 | GB_ERROR error = GB_push_transaction(gb_main); |
|---|
| 75 | if (error) { |
|---|
| 76 | cout << "ERROR 1" << endl; |
|---|
| 77 | } |
|---|
| 78 | |
|---|
| 79 | char *al_name = GBT_get_default_alignment(gb_main); |
|---|
| 80 | |
|---|
| 81 | vector<char> clusts = normalizeClusters(clusters); |
|---|
| 82 | // build result string |
|---|
| 83 | stringstream ss; |
|---|
| 84 | copy(clusts.begin(), clusts.end(), ostream_iterator<char> (ss, "")); |
|---|
| 85 | string result = ss.str(); |
|---|
| 86 | |
|---|
| 87 | //save |
|---|
| 88 | GBDATA *gb_sai = GBT_find_or_create_SAI(gb_main, getSAIname()); |
|---|
| 89 | GBDATA *gb_data = GBT_add_data(gb_sai, al_name, "data", GB_STRING); |
|---|
| 90 | error = GB_write_string(gb_data, result.c_str()); |
|---|
| 91 | if (error) { |
|---|
| 92 | cout << "RNACMA-Error: " << error << "\n"; |
|---|
| 93 | exit(EXIT_FAILURE); |
|---|
| 94 | } |
|---|
| 95 | |
|---|
| 96 | GBDATA *gb_options = GBT_add_data(gb_sai, al_name, "_TYPE", GB_STRING); |
|---|
| 97 | stringstream options; |
|---|
| 98 | options << "CMA_CLUSTERING: [threshold: " << threshold << "]"; |
|---|
| 99 | error = GB_write_string(gb_options, options.str().c_str()); |
|---|
| 100 | |
|---|
| 101 | GB_commit_transaction(gb_main); |
|---|
| 102 | |
|---|
| 103 | return error; |
|---|
| 104 | } |
|---|
| 105 | |
|---|
| 106 | /** |
|---|
| 107 | * Gives clusters a reasonable name. The clustering algorithm may return |
|---|
| 108 | * clusters with indices 123 even though there are only 50 clusters. |
|---|
| 109 | * Here we normalise the cluster names, in this example we would have |
|---|
| 110 | * only clusters from 0..50 as result. |
|---|
| 111 | * |
|---|
| 112 | * @param clusters: the result of the clustering algorithm. |
|---|
| 113 | * |
|---|
| 114 | * @return the new cluster names (note that the cluster index is a char |
|---|
| 115 | * because we have to be able to show it in the SAI, where we |
|---|
| 116 | * are allowed to use only one character for each position). |
|---|
| 117 | */ |
|---|
| 118 | vector<char> Analyser::normalizeClusters(vector<size_t> clusters) { |
|---|
| 119 | |
|---|
| 120 | vector<char> result; |
|---|
| 121 | map<unsigned int, char> rename_map; |
|---|
| 122 | rename_map[0] = '-'; |
|---|
| 123 | char classes = '0'; |
|---|
| 124 | |
|---|
| 125 | for (vector<size_t>::iterator it = clusters.begin(); it != clusters.end(); ++it) { |
|---|
| 126 | if (rename_map.find(*it) == rename_map.end()) { |
|---|
| 127 | rename_map[*it] = classes++; |
|---|
| 128 | } |
|---|
| 129 | result.push_back(rename_map[*it]); |
|---|
| 130 | } |
|---|
| 131 | return result; |
|---|
| 132 | |
|---|
| 133 | } |
|---|
| 134 | |
|---|
| 135 | //-------------------------------- |
|---|
| 136 | |
|---|
| 137 | int main(void) { |
|---|
| 138 | cout |
|---|
| 139 | << "arb_rnacma -- correlated mutation analysis" << endl |
|---|
| 140 | << " computes clusters of correlated positions" << endl |
|---|
| 141 | << "(C) 2010 Lehrstuhl fuer Mikrobiologie, TU Muenchen" << endl |
|---|
| 142 | << "Written 2009/2010 by Breno Faria" << endl |
|---|
| 143 | << endl |
|---|
| 144 | << "arb_rnacma uses the eigen C++ library (http://eigen.tuxfamily.org/)" << endl |
|---|
| 145 | << "eigen is copyrighted by LGPL3" << endl |
|---|
| 146 | << endl; |
|---|
| 147 | |
|---|
| 148 | Analyser *a = new Analyser; |
|---|
| 149 | Cma *cma = a->getCma(); |
|---|
| 150 | |
|---|
| 151 | cma->computeMutualInformationP(*(a->getLoader()->getSequences())); |
|---|
| 152 | |
|---|
| 153 | list<MITuple> mituple = cma->compute_mituples(cma->getMIp()); |
|---|
| 154 | |
|---|
| 155 | cout << endl |
|---|
| 156 | << "The highest MI value was: " << mituple.begin()->MI |
|---|
| 157 | << " at position (" << mituple.begin()->pos1 << ", " |
|---|
| 158 | << mituple.begin()->pos2 << ")." << endl |
|---|
| 159 | << "(Note: pairs with MI-values down to the threshold will be joined in one cluster)" << endl; |
|---|
| 160 | |
|---|
| 161 | while (true) { |
|---|
| 162 | cout << endl |
|---|
| 163 | << "Press Ctrl-d to quit or" << endl |
|---|
| 164 | << "choose a threshold value for the clustering process: "; |
|---|
| 165 | |
|---|
| 166 | |
|---|
| 167 | string input; |
|---|
| 168 | cin >> input; |
|---|
| 169 | |
|---|
| 170 | if (input.empty()) { |
|---|
| 171 | cout << endl << "quit" << endl; |
|---|
| 172 | break; |
|---|
| 173 | } |
|---|
| 174 | |
|---|
| 175 | double threshold = strtod(input.c_str(), NULL); |
|---|
| 176 | cout << "Building clusters with threshold = " << threshold << endl; |
|---|
| 177 | |
|---|
| 178 | VectorXi cl = cma->computeClusters(mituple, size_t(cma->getMIp().cols()), threshold); |
|---|
| 179 | vector<size_t> *clusters = new vector<size_t> (0, 0); |
|---|
| 180 | AlignedSequenceLoader *l = a->getLoader(); |
|---|
| 181 | size_t i = 0; |
|---|
| 182 | size_t j = 0; |
|---|
| 183 | for (vector<size_t>::iterator it = l->getPositionMap()->begin(); it |
|---|
| 184 | != l->getPositionMap()->end(); ++it) { |
|---|
| 185 | while (i < *it) { |
|---|
| 186 | clusters->push_back(0); |
|---|
| 187 | i++; |
|---|
| 188 | } |
|---|
| 189 | clusters->push_back(cl[j]); |
|---|
| 190 | j++; |
|---|
| 191 | i++; |
|---|
| 192 | } |
|---|
| 193 | |
|---|
| 194 | GB_ERROR e = a->saveSAI(*clusters, threshold); |
|---|
| 195 | |
|---|
| 196 | if (e) { |
|---|
| 197 | cout << "Error" << endl; |
|---|
| 198 | } |
|---|
| 199 | |
|---|
| 200 | cout << "Saved results to SAI '" << Analyser::getSAIname() << "'" << endl |
|---|
| 201 | << "(To analyse the results, open the editor and visualise the clusters using the SAI annotations)" << endl; |
|---|
| 202 | } |
|---|
| 203 | |
|---|
| 204 | delete a; |
|---|
| 205 | |
|---|
| 206 | } |
|---|