| 1 | /* | 
|---|
| 2 | * Analyser.cpp | 
|---|
| 3 | * | 
|---|
| 4 | * Responsible for interacting with the Cma code and Arb. | 
|---|
| 5 | * | 
|---|
| 6 | *  Created on: Feb 15, 2010 | 
|---|
| 7 | *      Author: breno | 
|---|
| 8 | * | 
|---|
| 9 | *  Institute of Microbiology (Technical University Munich) | 
|---|
| 10 | *  http://www.arb-home.de/ | 
|---|
| 11 | */ | 
|---|
| 12 |  | 
|---|
| 13 | #include "Analyser.h" | 
|---|
| 14 |  | 
|---|
| 15 | #include <ctime> | 
|---|
| 16 | #include <iostream> | 
|---|
| 17 | #include <iterator> | 
|---|
| 18 |  | 
|---|
| 19 | /** | 
|---|
| 20 | * Constructor | 
|---|
| 21 | */ | 
|---|
| 22 | Analyser::Analyser(GBDATA *gb_main) : | 
|---|
| 23 | loader(NULp), | 
|---|
| 24 | cma(NULp) | 
|---|
| 25 | { | 
|---|
| 26 |  | 
|---|
| 27 | //Definition of the alphabet | 
|---|
| 28 | vector<string> alphabet(0); | 
|---|
| 29 | alphabet.push_back("A"); | 
|---|
| 30 | alphabet.push_back("C"); | 
|---|
| 31 | alphabet.push_back("G"); | 
|---|
| 32 | alphabet.push_back("T"); | 
|---|
| 33 |  | 
|---|
| 34 | loader = new AlignedSequenceLoader(gb_main); | 
|---|
| 35 | if (!loader->has_error()) { | 
|---|
| 36 | VecVecType *seqs = loader->getSequences(); | 
|---|
| 37 | cma = new Cma(alphabet, seqs->size()); | 
|---|
| 38 | } | 
|---|
| 39 | } | 
|---|
| 40 |  | 
|---|
| 41 | /** | 
|---|
| 42 | * Destructor. | 
|---|
| 43 | */ | 
|---|
| 44 | Analyser::~Analyser() { | 
|---|
| 45 | delete loader; | 
|---|
| 46 | delete cma; | 
|---|
| 47 | } | 
|---|
| 48 |  | 
|---|
| 49 | /** | 
|---|
| 50 | * Getter for the Cma object. | 
|---|
| 51 | * | 
|---|
| 52 | * @return the Cma object. | 
|---|
| 53 | */ | 
|---|
| 54 | Cma* Analyser::getCma() { | 
|---|
| 55 | arb_assert(!has_error()); | 
|---|
| 56 | return cma; | 
|---|
| 57 | } | 
|---|
| 58 |  | 
|---|
| 59 | /** | 
|---|
| 60 | * Returns the AlignedSequenceLoader object. | 
|---|
| 61 | * | 
|---|
| 62 | * @return the AlignedSequenceLoader object. | 
|---|
| 63 | */ | 
|---|
| 64 | AlignedSequenceLoader* Analyser::getLoader() { | 
|---|
| 65 | return loader; | 
|---|
| 66 | } | 
|---|
| 67 |  | 
|---|
| 68 | /** | 
|---|
| 69 | * Saves the clusters to the DB as SAI. | 
|---|
| 70 | * | 
|---|
| 71 | * @param clusters: the cluster indices. | 
|---|
| 72 | * @param threshold: the clustering threshold used. | 
|---|
| 73 | * | 
|---|
| 74 | * @return an GB_ERROR if some DB transaction fails. | 
|---|
| 75 | */ | 
|---|
| 76 | GB_ERROR Analyser::saveSAI(GBDATA *gb_main, vector<size_t> clusters, double threshold) { | 
|---|
| 77 | GB_ERROR error = GB_push_transaction(gb_main); | 
|---|
| 78 |  | 
|---|
| 79 | if (!error) { | 
|---|
| 80 | char *al_name = GBT_get_default_alignment(gb_main); | 
|---|
| 81 |  | 
|---|
| 82 | vector<char> clusts = normalizeClusters(clusters); | 
|---|
| 83 | // build result string | 
|---|
| 84 | stringstream ss; | 
|---|
| 85 | copy(clusts.begin(), clusts.end(), ostream_iterator<char> (ss, "")); | 
|---|
| 86 | string result = ss.str(); | 
|---|
| 87 |  | 
|---|
| 88 | //save | 
|---|
| 89 | GBDATA *gb_sai = GBT_find_or_create_SAI(gb_main, getSAIname()); | 
|---|
| 90 | GBDATA *gb_data = GBT_add_data(gb_sai, al_name, "data", GB_STRING); | 
|---|
| 91 | error = GB_write_string(gb_data, result.c_str()); | 
|---|
| 92 |  | 
|---|
| 93 | if (!error) { | 
|---|
| 94 | GBDATA       *gb_options = GBT_add_data(gb_sai, al_name, "_TYPE", GB_STRING); | 
|---|
| 95 | stringstream  options; | 
|---|
| 96 | options << "CMA_CLUSTERING: [threshold: " << threshold << "]"; | 
|---|
| 97 |  | 
|---|
| 98 | error = GB_write_string(gb_options, options.str().c_str()); | 
|---|
| 99 | } | 
|---|
| 100 | free(al_name); | 
|---|
| 101 | } | 
|---|
| 102 | error = GB_end_transaction(gb_main, error); | 
|---|
| 103 |  | 
|---|
| 104 | return error; | 
|---|
| 105 | } | 
|---|
| 106 |  | 
|---|
| 107 | /** | 
|---|
| 108 | * Gives clusters a reasonable name. The clustering algorithm may return | 
|---|
| 109 | * clusters with indices 123 even though there are only 50 clusters. | 
|---|
| 110 | * Here we normalise the cluster names, in this example we would have | 
|---|
| 111 | * only clusters from 0..50 as result. | 
|---|
| 112 | * | 
|---|
| 113 | * @param clusters: the result of the clustering algorithm. | 
|---|
| 114 | * | 
|---|
| 115 | * @return the new cluster names (note that the cluster index is a char | 
|---|
| 116 | *         because we have to be able to show it in the SAI, where we | 
|---|
| 117 | *         are allowed to use only one character for each position). | 
|---|
| 118 | */ | 
|---|
| 119 | vector<char> Analyser::normalizeClusters(vector<size_t> clusters) { | 
|---|
| 120 |  | 
|---|
| 121 | vector<char> result; | 
|---|
| 122 | map<unsigned int, char> rename_map; | 
|---|
| 123 | rename_map[0] = '-'; | 
|---|
| 124 | char classes = '0'; | 
|---|
| 125 |  | 
|---|
| 126 | for (vector<size_t>::iterator it = clusters.begin(); it != clusters.end(); ++it) { | 
|---|
| 127 | if (rename_map.find(*it) == rename_map.end()) { | 
|---|
| 128 | rename_map[*it] = classes++; | 
|---|
| 129 | } | 
|---|
| 130 | result.push_back(rename_map[*it]); | 
|---|
| 131 | } | 
|---|
| 132 | return result; | 
|---|
| 133 |  | 
|---|
| 134 | } | 
|---|
| 135 |  | 
|---|
| 136 | //-------------------------------- | 
|---|
| 137 |  | 
|---|
| 138 | GB_ERROR run_rnacma(GBDATA *gb_main, bool interactive, double threshold) { | 
|---|
| 139 | /*! interactive main loop of RNACMA. | 
|---|
| 140 | * @param gb_main the database. | 
|---|
| 141 | * @param interactive normally true. set to false for (non-interactive) unittesting. | 
|---|
| 142 | * @param threshold only used if !interactive (i.e. for unittesting) | 
|---|
| 143 | */ | 
|---|
| 144 |  | 
|---|
| 145 | GB_ERROR error = NULp; | 
|---|
| 146 | try { | 
|---|
| 147 | Analyser a(gb_main); | 
|---|
| 148 | error = a.get_error(); | 
|---|
| 149 |  | 
|---|
| 150 | if (!error) { | 
|---|
| 151 | Cma *cma = a.getCma(); | 
|---|
| 152 |  | 
|---|
| 153 | cma->computeMutualInformationP(*(a.getLoader()->getSequences())); | 
|---|
| 154 |  | 
|---|
| 155 | list<MITuple> mituple = cma->compute_mituples(cma->getMIp()); | 
|---|
| 156 |  | 
|---|
| 157 | cout << endl | 
|---|
| 158 | << "The highest MI value was: " << mituple.begin()->MI | 
|---|
| 159 | << " at position (" << mituple.begin()->pos1 << ", " | 
|---|
| 160 | << mituple.begin()->pos2 << ")." << endl | 
|---|
| 161 | << "(Note: pairs with MI-values down to the threshold will be joined in one cluster)" << endl; | 
|---|
| 162 |  | 
|---|
| 163 | while (true) { | 
|---|
| 164 | if (interactive) { | 
|---|
| 165 | cout << endl | 
|---|
| 166 | << "Press Ctrl-d to quit or" << endl | 
|---|
| 167 | << "choose a threshold value for the clustering process: "; | 
|---|
| 168 |  | 
|---|
| 169 |  | 
|---|
| 170 | string input; | 
|---|
| 171 | cin >> input; | 
|---|
| 172 |  | 
|---|
| 173 | if (input.empty()) { | 
|---|
| 174 | cout << endl << "quit" << endl; | 
|---|
| 175 | break; | 
|---|
| 176 | } | 
|---|
| 177 |  | 
|---|
| 178 | threshold = strtod(input.c_str(), NULp); | 
|---|
| 179 | } | 
|---|
| 180 | cout << "Building clusters with threshold = " << threshold << endl; | 
|---|
| 181 |  | 
|---|
| 182 | VectorXi cl = cma->computeClusters(mituple, size_t(cma->getMIp().cols()), threshold); | 
|---|
| 183 | vector<size_t> *clusters = new vector<size_t> (0, 0); | 
|---|
| 184 | AlignedSequenceLoader *l = a.getLoader(); | 
|---|
| 185 | size_t i = 0; | 
|---|
| 186 | size_t j = 0; | 
|---|
| 187 | for (vector<size_t>::const_iterator it = l->getPositionMap().begin(); it != l->getPositionMap().end(); ++it) { | 
|---|
| 188 | while (i < *it) { | 
|---|
| 189 | clusters->push_back(0); | 
|---|
| 190 | i++; | 
|---|
| 191 | } | 
|---|
| 192 | clusters->push_back(cl[j]); | 
|---|
| 193 | j++; | 
|---|
| 194 | i++; | 
|---|
| 195 | } | 
|---|
| 196 |  | 
|---|
| 197 | GB_ERROR e = a.saveSAI(gb_main, *clusters, threshold); | 
|---|
| 198 | delete clusters; | 
|---|
| 199 |  | 
|---|
| 200 | if (e) { | 
|---|
| 201 | cout << "Error while saving SAI: " << e << endl; | 
|---|
| 202 | throw e; | 
|---|
| 203 | } | 
|---|
| 204 | else { | 
|---|
| 205 | cout << "Saved results to SAI '" << Analyser::getSAIname() << "'" << endl | 
|---|
| 206 | << "(To analyse the results, open the editor and visualise the clusters using the SAI annotations)" << endl; | 
|---|
| 207 | } | 
|---|
| 208 |  | 
|---|
| 209 | if (!interactive) break; | 
|---|
| 210 | } | 
|---|
| 211 | } | 
|---|
| 212 | } | 
|---|
| 213 | catch (const char *err) { | 
|---|
| 214 | error = GBS_global_string("caught exception: %s", err); | 
|---|
| 215 | } | 
|---|
| 216 | return error; | 
|---|
| 217 | } | 
|---|
| 218 |  | 
|---|
| 219 | int ARB_main(int /*argc*/, char */*argv*/[]) { | 
|---|
| 220 | cout | 
|---|
| 221 | << "arb_rnacma -- correlated mutation analysis" << endl | 
|---|
| 222 | << "              computes clusters of correlated positions" << endl | 
|---|
| 223 | << "(C) 2010 The ARB-project" << endl | 
|---|
| 224 | << "Written 2009/2010 by Breno Faria" << endl | 
|---|
| 225 | << endl | 
|---|
| 226 | << "arb_rnacma uses the eigen C++ library (http://eigen.tuxfamily.org/)" << endl | 
|---|
| 227 | << "eigen is copyrighted by LGPL3" << endl | 
|---|
| 228 | << endl; | 
|---|
| 229 |  | 
|---|
| 230 | int exitcode = EXIT_SUCCESS; | 
|---|
| 231 | { | 
|---|
| 232 | GB_ERROR error = NULp; | 
|---|
| 233 | { | 
|---|
| 234 | GB_shell  shell; | 
|---|
| 235 | GBDATA   *gb_main = GB_open(":", "rwt"); | 
|---|
| 236 |  | 
|---|
| 237 | if (!gb_main) { | 
|---|
| 238 | error = GB_await_error(); | 
|---|
| 239 | } | 
|---|
| 240 | else { | 
|---|
| 241 | error = run_rnacma(gb_main, true, 0.0); | 
|---|
| 242 | } | 
|---|
| 243 | GB_close(gb_main); | 
|---|
| 244 | } | 
|---|
| 245 |  | 
|---|
| 246 | if (error) { | 
|---|
| 247 | cout << endl << "Error in arb_rnacma: " << error << ". terminating." << endl; | 
|---|
| 248 | exitcode = EXIT_FAILURE; | 
|---|
| 249 | } | 
|---|
| 250 | } | 
|---|
| 251 |  | 
|---|
| 252 | return exitcode; | 
|---|
| 253 | } | 
|---|
| 254 |  | 
|---|
| 255 | // -------------------------------------------------------------------------------- | 
|---|
| 256 |  | 
|---|
| 257 | #ifdef UNIT_TESTS | 
|---|
| 258 | #ifndef TEST_UNIT_H | 
|---|
| 259 | #include <test_unit.h> | 
|---|
| 260 | #endif | 
|---|
| 261 |  | 
|---|
| 262 | void TEST_RNACMA() { | 
|---|
| 263 | GB_shell shell; | 
|---|
| 264 | { | 
|---|
| 265 | const char *db      = "TEST_nuc_short.arb"; // ../UNIT_TESTER/run/TEST_nuc_short.arb | 
|---|
| 266 | const char *aliname = "ali_16s"; | 
|---|
| 267 | const char *sainame = "cma_clusters"; | 
|---|
| 268 |  | 
|---|
| 269 | GBDATA *gb_main; | 
|---|
| 270 | TEST_EXPECT_RESULT__NOERROREXPORTED(gb_main = GB_open(db, "rw")); | 
|---|
| 271 |  | 
|---|
| 272 | const int NO_OF_MARKED = 8; | 
|---|
| 273 | TEST_EXPECT_EQUAL(GBT_count_marked_species(gb_main), NO_OF_MARKED); | 
|---|
| 274 |  | 
|---|
| 275 | { | 
|---|
| 276 | GB_transaction ta(gb_main); | 
|---|
| 277 | TEST_EXPECT_NORESULT__NOERROREXPORTED(GBT_find_SAI(gb_main, sainame)); // expect SAI to be missing | 
|---|
| 278 | } | 
|---|
| 279 |  | 
|---|
| 280 | { | 
|---|
| 281 | GB_transaction ta(gb_main); | 
|---|
| 282 |  | 
|---|
| 283 | struct { | 
|---|
| 284 | double      threshold; | 
|---|
| 285 | const char *expected_clusters_data; | 
|---|
| 286 | } data[] = { | 
|---|
| 287 | { 0.5,   "--------------------------0-----10-----0---------------------23--041---000--5-55-----000060----------0600005---55575-7----5000140--3----0--0--------------2-6------0--0000--000--------0000------0-00----------0------0-00000-0-" }, | 
|---|
| 288 | { 0.666, "--------------------------0------1-----1----------------------2--13----456--7-88-----9::0;------------;004:8---888<7-<----8494-3---2----5--5----------------;----------16---199---------01---------------------9------:--::99---" }, | 
|---|
| 289 | { 0.833, "----------------------------------------------------------------------------0--1-----2-------------------------111-0---------------------------------------------------------3---------------------------------3------------2---" }, | 
|---|
| 290 | { 1.0,   "-------------------------------------------------------------------------------0--------------------------------00-----------------------------------------------------------1---------------------------------1----------------" }, | 
|---|
| 291 | { 1.5,   "--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------" }, | 
|---|
| 292 |  | 
|---|
| 293 | // output for testdata: "The highest MI value was: 1.48 at position (77, 106)." | 
|---|
| 294 | { -1.0, NULp } | 
|---|
| 295 | }; | 
|---|
| 296 |  | 
|---|
| 297 | TEST_EXPECT_NO_ERROR(run_rnacma(gb_main, false, data[0].threshold)); | 
|---|
| 298 |  | 
|---|
| 299 | GBDATA *gb_sai = GBT_find_SAI(gb_main, sainame); | 
|---|
| 300 | TEST_REJECT_NULL(gb_sai); | 
|---|
| 301 |  | 
|---|
| 302 | // test sai content: | 
|---|
| 303 | GBDATA *gb_sai_data = GBT_find_sequence(gb_sai, aliname); | 
|---|
| 304 | TEST_REJECT_NULL(gb_sai_data); | 
|---|
| 305 |  | 
|---|
| 306 | int idx = 0; | 
|---|
| 307 | while (1) { | 
|---|
| 308 | TEST_ANNOTATE(GBS_global_string("idx=%i", idx)); | 
|---|
| 309 | const char *sai_data = GB_read_char_pntr(gb_sai_data); | 
|---|
| 310 | TEST_EXPECT_EQUAL(sai_data, data[idx].expected_clusters_data); | 
|---|
| 311 |  | 
|---|
| 312 | ++idx; | 
|---|
| 313 | if (!data[idx].expected_clusters_data) break; // continue as long there are results to test | 
|---|
| 314 |  | 
|---|
| 315 | TEST_EXPECT_NO_ERROR(run_rnacma(gb_main, false, data[idx].threshold)); | 
|---|
| 316 | } | 
|---|
| 317 | } | 
|---|
| 318 |  | 
|---|
| 319 | const int SOME_THRESHOLD = 0.5; | 
|---|
| 320 |  | 
|---|
| 321 | // fail on invalid default alignment: | 
|---|
| 322 | { | 
|---|
| 323 | GB_transaction ta(gb_main); | 
|---|
| 324 | TEST_EXPECT_NO_ERROR(GBT_set_default_alignment(gb_main, "ali_fantasy")); | 
|---|
| 325 | TEST_EXPECT_ERROR_CONTAINS(run_rnacma(gb_main, false, SOME_THRESHOLD), "alignment 'ali_fantasy' not found"); | 
|---|
| 326 | ta.close("abort"); // undo change of default alignment | 
|---|
| 327 | } | 
|---|
| 328 |  | 
|---|
| 329 | // want failure if a marked sequence lacks data: | 
|---|
| 330 | for (int fail = 1; fail<=2; ++fail) { | 
|---|
| 331 | TEST_ANNOTATE(GBS_global_string("fail=%i", fail)); | 
|---|
| 332 |  | 
|---|
| 333 | GBDATA *gb_species; | 
|---|
| 334 | { | 
|---|
| 335 | GB_transaction ta(gb_main); | 
|---|
| 336 |  | 
|---|
| 337 | gb_species      = GBT_first_marked_species(gb_main); | 
|---|
| 338 | GBDATA *gb_data = GBT_find_sequence(gb_species, aliname); | 
|---|
| 339 |  | 
|---|
| 340 | switch (fail) { | 
|---|
| 341 | case 1: { | 
|---|
| 342 | GB_topSecurityLevel permit(gb_main); | 
|---|
| 343 | TEST_EXPECT_NO_ERROR(GB_delete(gb_data)); | 
|---|
| 344 | break; | 
|---|
| 345 | } | 
|---|
| 346 | case 2: { | 
|---|
| 347 | GBDATA *gb_ali = GB_get_father(gb_data); | 
|---|
| 348 | TEST_EXPECT_NO_ERROR(GB_delete(gb_ali)); | 
|---|
| 349 | break; | 
|---|
| 350 | } | 
|---|
| 351 | default: arb_assert(0); break; | 
|---|
| 352 | } | 
|---|
| 353 |  | 
|---|
| 354 | TEST_EXPECT_ERROR_CONTAINS(run_rnacma(gb_main, false, SOME_THRESHOLD), "species 'LcbPont2' has no data in selected alignment 'ali_16s'"); | 
|---|
| 355 |  | 
|---|
| 356 | ta.close("abort"); // undo changes | 
|---|
| 357 | } | 
|---|
| 358 | { | 
|---|
| 359 | // somehow the species flag is reset by the above (aborted) transaction. | 
|---|
| 360 | // @@@ could be a general database bug!? undo delete is known to be problematic (see #496) | 
|---|
| 361 | GB_transaction ta(gb_main); | 
|---|
| 362 | GB_write_flag(gb_species, 1); | 
|---|
| 363 | } | 
|---|
| 364 | } | 
|---|
| 365 | TEST_ANNOTATE(NULp); | 
|---|
| 366 | TEST_EXPECT_EQUAL(GBT_count_marked_species(gb_main), NO_OF_MARKED); | 
|---|
| 367 |  | 
|---|
| 368 | // @@@ provoke some more errors? | 
|---|
| 369 |  | 
|---|
| 370 | // test failure when no species marked: | 
|---|
| 371 | { | 
|---|
| 372 | GB_transaction ta(gb_main); | 
|---|
| 373 | GBT_mark_all(gb_main, 0); | 
|---|
| 374 | TEST_EXPECT_ERROR_CONTAINS(run_rnacma(gb_main, false, SOME_THRESHOLD), "caught exception: no (marked?) sequences"); | 
|---|
| 375 | ta.close("abort"); // undo changes | 
|---|
| 376 | } | 
|---|
| 377 |  | 
|---|
| 378 | // Note that marks are correctly restored by aborting TA | 
|---|
| 379 | TEST_EXPECT_EQUAL(GBT_count_marked_species(gb_main), NO_OF_MARKED); | 
|---|
| 380 |  | 
|---|
| 381 | GB_close(gb_main); | 
|---|
| 382 | } | 
|---|
| 383 | } | 
|---|
| 384 |  | 
|---|
| 385 | #endif // UNIT_TESTS | 
|---|
| 386 |  | 
|---|
| 387 | // -------------------------------------------------------------------------------- | 
|---|