| 1 | /* |
|---|
| 2 | * Analyser.cpp |
|---|
| 3 | * |
|---|
| 4 | * Responsible for interacting with the Cma code and Arb. |
|---|
| 5 | * |
|---|
| 6 | * Created on: Feb 15, 2010 |
|---|
| 7 | * Author: breno |
|---|
| 8 | * |
|---|
| 9 | * Institute of Microbiology (Technical University Munich) |
|---|
| 10 | * http://www.arb-home.de/ |
|---|
| 11 | */ |
|---|
| 12 | |
|---|
| 13 | #include "Analyser.h" |
|---|
| 14 | |
|---|
| 15 | #include <ctime> |
|---|
| 16 | #include <iostream> |
|---|
| 17 | #include <iterator> |
|---|
| 18 | |
|---|
| 19 | /** |
|---|
| 20 | * Constructor |
|---|
| 21 | */ |
|---|
| 22 | Analyser::Analyser(GBDATA *gb_main) : |
|---|
| 23 | loader(NULp), |
|---|
| 24 | cma(NULp) |
|---|
| 25 | { |
|---|
| 26 | |
|---|
| 27 | //Definition of the alphabet |
|---|
| 28 | vector<string> alphabet(0); |
|---|
| 29 | alphabet.push_back("A"); |
|---|
| 30 | alphabet.push_back("C"); |
|---|
| 31 | alphabet.push_back("G"); |
|---|
| 32 | alphabet.push_back("T"); |
|---|
| 33 | |
|---|
| 34 | loader = new AlignedSequenceLoader(gb_main); |
|---|
| 35 | if (!loader->has_error()) { |
|---|
| 36 | VecVecType *seqs = loader->getSequences(); |
|---|
| 37 | cma = new Cma(alphabet, seqs->size()); |
|---|
| 38 | } |
|---|
| 39 | } |
|---|
| 40 | |
|---|
| 41 | /** |
|---|
| 42 | * Destructor. |
|---|
| 43 | */ |
|---|
| 44 | Analyser::~Analyser() { |
|---|
| 45 | delete loader; |
|---|
| 46 | delete cma; |
|---|
| 47 | } |
|---|
| 48 | |
|---|
| 49 | /** |
|---|
| 50 | * Getter for the Cma object. |
|---|
| 51 | * |
|---|
| 52 | * @return the Cma object. |
|---|
| 53 | */ |
|---|
| 54 | Cma* Analyser::getCma() { |
|---|
| 55 | arb_assert(!has_error()); |
|---|
| 56 | return cma; |
|---|
| 57 | } |
|---|
| 58 | |
|---|
| 59 | /** |
|---|
| 60 | * Returns the AlignedSequenceLoader object. |
|---|
| 61 | * |
|---|
| 62 | * @return the AlignedSequenceLoader object. |
|---|
| 63 | */ |
|---|
| 64 | AlignedSequenceLoader* Analyser::getLoader() { |
|---|
| 65 | return loader; |
|---|
| 66 | } |
|---|
| 67 | |
|---|
| 68 | /** |
|---|
| 69 | * Saves the clusters to the DB as SAI. |
|---|
| 70 | * |
|---|
| 71 | * @param clusters: the cluster indices. |
|---|
| 72 | * @param threshold: the clustering threshold used. |
|---|
| 73 | * |
|---|
| 74 | * @return an GB_ERROR if some DB transaction fails. |
|---|
| 75 | */ |
|---|
| 76 | GB_ERROR Analyser::saveSAI(GBDATA *gb_main, vector<size_t> clusters, double threshold) { |
|---|
| 77 | GB_ERROR error = GB_push_transaction(gb_main); |
|---|
| 78 | |
|---|
| 79 | if (!error) { |
|---|
| 80 | char *al_name = GBT_get_default_alignment(gb_main); |
|---|
| 81 | if (!al_name) { |
|---|
| 82 | error = GB_await_error(); |
|---|
| 83 | } |
|---|
| 84 | else { |
|---|
| 85 | vector<char> clusts = normalizeClusters(clusters); |
|---|
| 86 | // build result string |
|---|
| 87 | stringstream ss; |
|---|
| 88 | copy(clusts.begin(), clusts.end(), ostream_iterator<char> (ss, "")); |
|---|
| 89 | string result = ss.str(); |
|---|
| 90 | |
|---|
| 91 | //save |
|---|
| 92 | GBDATA *gb_sai = GBT_find_or_create_SAI(gb_main, getSAIname()); |
|---|
| 93 | GBDATA *gb_data = GBT_add_data(gb_sai, al_name, "data", GB_STRING); |
|---|
| 94 | error = GB_write_string(gb_data, result.c_str()); |
|---|
| 95 | |
|---|
| 96 | if (!error) { |
|---|
| 97 | GBDATA *gb_options = GBT_add_data(gb_sai, al_name, "_TYPE", GB_STRING); |
|---|
| 98 | stringstream options; |
|---|
| 99 | options << "CMA_CLUSTERING: [threshold: " << threshold << "]"; |
|---|
| 100 | |
|---|
| 101 | error = GB_write_string(gb_options, options.str().c_str()); |
|---|
| 102 | } |
|---|
| 103 | free(al_name); |
|---|
| 104 | } |
|---|
| 105 | } |
|---|
| 106 | error = GB_end_transaction(gb_main, error); |
|---|
| 107 | |
|---|
| 108 | return error; |
|---|
| 109 | } |
|---|
| 110 | |
|---|
| 111 | /** |
|---|
| 112 | * Gives clusters a reasonable name. The clustering algorithm may return |
|---|
| 113 | * clusters with indices 123 even though there are only 50 clusters. |
|---|
| 114 | * Here we normalise the cluster names, in this example we would have |
|---|
| 115 | * only clusters from 0..50 as result. |
|---|
| 116 | * |
|---|
| 117 | * @param clusters: the result of the clustering algorithm. |
|---|
| 118 | * |
|---|
| 119 | * @return the new cluster names (note that the cluster index is a char |
|---|
| 120 | * because we have to be able to show it in the SAI, where we |
|---|
| 121 | * are allowed to use only one character for each position). |
|---|
| 122 | */ |
|---|
| 123 | vector<char> Analyser::normalizeClusters(vector<size_t> clusters) { |
|---|
| 124 | |
|---|
| 125 | vector<char> result; |
|---|
| 126 | map<unsigned int, char> rename_map; |
|---|
| 127 | rename_map[0] = '-'; |
|---|
| 128 | char classes = '0'; |
|---|
| 129 | |
|---|
| 130 | for (vector<size_t>::iterator it = clusters.begin(); it != clusters.end(); ++it) { |
|---|
| 131 | if (rename_map.find(*it) == rename_map.end()) { |
|---|
| 132 | rename_map[*it] = classes++; |
|---|
| 133 | } |
|---|
| 134 | result.push_back(rename_map[*it]); |
|---|
| 135 | } |
|---|
| 136 | return result; |
|---|
| 137 | |
|---|
| 138 | } |
|---|
| 139 | |
|---|
| 140 | //-------------------------------- |
|---|
| 141 | |
|---|
| 142 | GB_ERROR run_rnacma(GBDATA *gb_main, bool interactive, double threshold) { |
|---|
| 143 | /*! interactive main loop of RNACMA. |
|---|
| 144 | * @param gb_main the database. |
|---|
| 145 | * @param interactive normally true. set to false for (non-interactive) unittesting. |
|---|
| 146 | * @param threshold only used if !interactive (i.e. for unittesting) |
|---|
| 147 | */ |
|---|
| 148 | |
|---|
| 149 | GB_ERROR error = NULp; |
|---|
| 150 | try { |
|---|
| 151 | Analyser a(gb_main); |
|---|
| 152 | error = a.get_error(); |
|---|
| 153 | |
|---|
| 154 | if (!error) { |
|---|
| 155 | Cma *cma = a.getCma(); |
|---|
| 156 | |
|---|
| 157 | cma->computeMutualInformationP(*(a.getLoader()->getSequences())); |
|---|
| 158 | |
|---|
| 159 | list<MITuple> mituple = cma->compute_mituples(cma->getMIp()); |
|---|
| 160 | |
|---|
| 161 | cout << endl |
|---|
| 162 | << "The highest MI value was: " << mituple.begin()->MI |
|---|
| 163 | << " at position (" << mituple.begin()->pos1 << ", " |
|---|
| 164 | << mituple.begin()->pos2 << ")." << endl |
|---|
| 165 | << "(Note: pairs with MI-values down to the threshold will be joined in one cluster)" << endl; |
|---|
| 166 | |
|---|
| 167 | while (true) { |
|---|
| 168 | if (interactive) { |
|---|
| 169 | cout << endl |
|---|
| 170 | << "Press Ctrl-d to quit or" << endl |
|---|
| 171 | << "choose a threshold value for the clustering process: "; |
|---|
| 172 | |
|---|
| 173 | |
|---|
| 174 | string input; |
|---|
| 175 | cin >> input; |
|---|
| 176 | |
|---|
| 177 | if (input.empty()) { |
|---|
| 178 | cout << endl << "quit" << endl; |
|---|
| 179 | break; |
|---|
| 180 | } |
|---|
| 181 | |
|---|
| 182 | threshold = strtod(input.c_str(), NULp); |
|---|
| 183 | } |
|---|
| 184 | cout << "Building clusters with threshold = " << threshold << endl; |
|---|
| 185 | |
|---|
| 186 | VectorXi cl = cma->computeClusters(mituple, size_t(cma->getMIp().cols()), threshold); |
|---|
| 187 | vector<size_t> *clusters = new vector<size_t> (0, 0); |
|---|
| 188 | AlignedSequenceLoader *l = a.getLoader(); |
|---|
| 189 | size_t i = 0; |
|---|
| 190 | size_t j = 0; |
|---|
| 191 | for (vector<size_t>::const_iterator it = l->getPositionMap().begin(); it != l->getPositionMap().end(); ++it) { |
|---|
| 192 | while (i < *it) { |
|---|
| 193 | clusters->push_back(0); |
|---|
| 194 | i++; |
|---|
| 195 | } |
|---|
| 196 | clusters->push_back(cl[j]); |
|---|
| 197 | j++; |
|---|
| 198 | i++; |
|---|
| 199 | } |
|---|
| 200 | |
|---|
| 201 | GB_ERROR e = a.saveSAI(gb_main, *clusters, threshold); |
|---|
| 202 | delete clusters; |
|---|
| 203 | |
|---|
| 204 | if (e) { |
|---|
| 205 | cout << "Error while saving SAI: " << e << endl; |
|---|
| 206 | throw e; |
|---|
| 207 | } |
|---|
| 208 | else { |
|---|
| 209 | cout << "Saved results to SAI '" << Analyser::getSAIname() << "'" << endl |
|---|
| 210 | << "(To analyse the results, open the editor and visualise the clusters using the SAI annotations)" << endl; |
|---|
| 211 | } |
|---|
| 212 | |
|---|
| 213 | if (!interactive) break; |
|---|
| 214 | } |
|---|
| 215 | } |
|---|
| 216 | } |
|---|
| 217 | catch (const char *err) { |
|---|
| 218 | error = GBS_global_string("caught exception: %s", err); |
|---|
| 219 | } |
|---|
| 220 | return error; |
|---|
| 221 | } |
|---|
| 222 | |
|---|
| 223 | int ARB_main(int /*argc*/, char */*argv*/[]) { |
|---|
| 224 | cout |
|---|
| 225 | << "arb_rnacma -- correlated mutation analysis" << endl |
|---|
| 226 | << " computes clusters of correlated positions" << endl |
|---|
| 227 | << "(C) 2010 The ARB-project" << endl |
|---|
| 228 | << "Written 2009/2010 by Breno Faria" << endl |
|---|
| 229 | << endl |
|---|
| 230 | << "arb_rnacma uses the eigen C++ library (http://eigen.tuxfamily.org/)" << endl |
|---|
| 231 | << "eigen is copyrighted by LGPL3" << endl |
|---|
| 232 | << endl; |
|---|
| 233 | |
|---|
| 234 | int exitcode = EXIT_SUCCESS; |
|---|
| 235 | { |
|---|
| 236 | GB_ERROR error = NULp; |
|---|
| 237 | { |
|---|
| 238 | GB_shell shell; |
|---|
| 239 | GBDATA *gb_main = GB_open(":", "rwt"); |
|---|
| 240 | |
|---|
| 241 | if (!gb_main) { |
|---|
| 242 | error = GB_await_error(); |
|---|
| 243 | } |
|---|
| 244 | else { |
|---|
| 245 | error = run_rnacma(gb_main, true, 0.0); |
|---|
| 246 | } |
|---|
| 247 | GB_close(gb_main); |
|---|
| 248 | } |
|---|
| 249 | |
|---|
| 250 | if (error) { |
|---|
| 251 | cout << endl << "Error in arb_rnacma: " << error << ". terminating." << endl; |
|---|
| 252 | exitcode = EXIT_FAILURE; |
|---|
| 253 | } |
|---|
| 254 | } |
|---|
| 255 | |
|---|
| 256 | return exitcode; |
|---|
| 257 | } |
|---|
| 258 | |
|---|
| 259 | // -------------------------------------------------------------------------------- |
|---|
| 260 | |
|---|
| 261 | #ifdef UNIT_TESTS |
|---|
| 262 | #ifndef TEST_UNIT_H |
|---|
| 263 | #include <test_unit.h> |
|---|
| 264 | #endif |
|---|
| 265 | |
|---|
| 266 | void TEST_RNACMA() { |
|---|
| 267 | GB_shell shell; |
|---|
| 268 | { |
|---|
| 269 | const char *db = "TEST_nuc_short.arb"; // ../UNIT_TESTER/run/TEST_nuc_short.arb |
|---|
| 270 | const char *aliname = "ali_16s"; |
|---|
| 271 | const char *sainame = "cma_clusters"; |
|---|
| 272 | |
|---|
| 273 | GBDATA *gb_main; |
|---|
| 274 | TEST_EXPECT_RESULT__NOERROREXPORTED(gb_main = GB_open(db, "rw")); |
|---|
| 275 | |
|---|
| 276 | const int NO_OF_MARKED = 8; |
|---|
| 277 | TEST_EXPECT_EQUAL(GBT_count_marked_species(gb_main), NO_OF_MARKED); |
|---|
| 278 | |
|---|
| 279 | { |
|---|
| 280 | GB_transaction ta(gb_main); |
|---|
| 281 | TEST_EXPECT_NORESULT__NOERROREXPORTED(GBT_find_SAI(gb_main, sainame)); // expect SAI to be missing |
|---|
| 282 | } |
|---|
| 283 | |
|---|
| 284 | { |
|---|
| 285 | GB_transaction ta(gb_main); |
|---|
| 286 | |
|---|
| 287 | struct { |
|---|
| 288 | double threshold; |
|---|
| 289 | const char *expected_clusters_data; |
|---|
| 290 | } data[] = { |
|---|
| 291 | { 0.5, "--------------------------0-----10-----0---------------------23--041---000--5-55-----000060----------0600005---55575-7----5000140--3----0--0--------------2-6------0--0000--000--------0000------0-00----------0------0-00000-0-" }, |
|---|
| 292 | { 0.666, "--------------------------0------1-----1----------------------2--13----456--7-88-----9::0;------------;004:8---888<7-<----8494-3---2----5--5----------------;----------16---199---------01---------------------9------:--::99---" }, |
|---|
| 293 | { 0.833, "----------------------------------------------------------------------------0--1-----2-------------------------111-0---------------------------------------------------------3---------------------------------3------------2---" }, |
|---|
| 294 | { 1.0, "-------------------------------------------------------------------------------0--------------------------------00-----------------------------------------------------------1---------------------------------1----------------" }, |
|---|
| 295 | { 1.5, "--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------" }, |
|---|
| 296 | |
|---|
| 297 | // output for testdata: "The highest MI value was: 1.48 at position (77, 106)." |
|---|
| 298 | { -1.0, NULp } |
|---|
| 299 | }; |
|---|
| 300 | |
|---|
| 301 | TEST_EXPECT_NO_ERROR(run_rnacma(gb_main, false, data[0].threshold)); |
|---|
| 302 | |
|---|
| 303 | GBDATA *gb_sai = GBT_find_SAI(gb_main, sainame); |
|---|
| 304 | TEST_REJECT_NULL(gb_sai); |
|---|
| 305 | |
|---|
| 306 | // test sai content: |
|---|
| 307 | GBDATA *gb_sai_data = GBT_find_sequence(gb_sai, aliname); |
|---|
| 308 | TEST_REJECT_NULL(gb_sai_data); |
|---|
| 309 | |
|---|
| 310 | int idx = 0; |
|---|
| 311 | while (1) { |
|---|
| 312 | TEST_ANNOTATE(GBS_global_string("idx=%i", idx)); |
|---|
| 313 | const char *sai_data = GB_read_char_pntr(gb_sai_data); |
|---|
| 314 | TEST_EXPECT_EQUAL(sai_data, data[idx].expected_clusters_data); |
|---|
| 315 | |
|---|
| 316 | ++idx; |
|---|
| 317 | if (!data[idx].expected_clusters_data) break; // continue as long there are results to test |
|---|
| 318 | |
|---|
| 319 | TEST_EXPECT_NO_ERROR(run_rnacma(gb_main, false, data[idx].threshold)); |
|---|
| 320 | } |
|---|
| 321 | } |
|---|
| 322 | |
|---|
| 323 | const int SOME_THRESHOLD = 0.5; |
|---|
| 324 | |
|---|
| 325 | // fail on invalid default alignment: |
|---|
| 326 | { |
|---|
| 327 | GB_transaction ta(gb_main); |
|---|
| 328 | TEST_EXPECT_NO_ERROR(GBT_set_default_alignment(gb_main, "ali_fantasy")); |
|---|
| 329 | TEST_EXPECT_ERROR_CONTAINS(run_rnacma(gb_main, false, SOME_THRESHOLD), "alignment 'ali_fantasy' not found"); |
|---|
| 330 | ta.close("abort"); // undo change of default alignment |
|---|
| 331 | } |
|---|
| 332 | |
|---|
| 333 | // want failure if a marked sequence lacks data: |
|---|
| 334 | for (int fail = 1; fail<=2; ++fail) { |
|---|
| 335 | TEST_ANNOTATE(GBS_global_string("fail=%i", fail)); |
|---|
| 336 | |
|---|
| 337 | GBDATA *gb_species; |
|---|
| 338 | { |
|---|
| 339 | GB_transaction ta(gb_main); |
|---|
| 340 | |
|---|
| 341 | gb_species = GBT_first_marked_species(gb_main); |
|---|
| 342 | GBDATA *gb_data = GBT_find_sequence(gb_species, aliname); |
|---|
| 343 | |
|---|
| 344 | switch (fail) { |
|---|
| 345 | case 1: { |
|---|
| 346 | GB_topSecurityLevel permit(gb_main); |
|---|
| 347 | TEST_EXPECT_NO_ERROR(GB_delete(gb_data)); |
|---|
| 348 | break; |
|---|
| 349 | } |
|---|
| 350 | case 2: { |
|---|
| 351 | GBDATA *gb_ali = GB_get_father(gb_data); |
|---|
| 352 | TEST_EXPECT_NO_ERROR(GB_delete(gb_ali)); |
|---|
| 353 | break; |
|---|
| 354 | } |
|---|
| 355 | default: arb_assert(0); break; |
|---|
| 356 | } |
|---|
| 357 | |
|---|
| 358 | TEST_EXPECT_ERROR_CONTAINS(run_rnacma(gb_main, false, SOME_THRESHOLD), "species 'LcbPont2' has no data in selected alignment 'ali_16s'"); |
|---|
| 359 | |
|---|
| 360 | ta.close("abort"); // undo changes |
|---|
| 361 | } |
|---|
| 362 | { |
|---|
| 363 | // somehow the species flag is reset by the above (aborted) transaction. |
|---|
| 364 | // @@@ could be a general database bug!? undo delete is known to be problematic (see #496) |
|---|
| 365 | GB_transaction ta(gb_main); |
|---|
| 366 | GB_write_flag(gb_species, 1); |
|---|
| 367 | } |
|---|
| 368 | } |
|---|
| 369 | TEST_ANNOTATE(NULp); |
|---|
| 370 | TEST_EXPECT_EQUAL(GBT_count_marked_species(gb_main), NO_OF_MARKED); |
|---|
| 371 | |
|---|
| 372 | // @@@ provoke some more errors? |
|---|
| 373 | |
|---|
| 374 | // test failure when no species marked: |
|---|
| 375 | { |
|---|
| 376 | GB_transaction ta(gb_main); |
|---|
| 377 | GBT_mark_all(gb_main, 0); |
|---|
| 378 | TEST_EXPECT_ERROR_CONTAINS(run_rnacma(gb_main, false, SOME_THRESHOLD), "caught exception: no (marked?) sequences"); |
|---|
| 379 | ta.close("abort"); // undo changes |
|---|
| 380 | } |
|---|
| 381 | |
|---|
| 382 | // Note that marks are correctly restored by aborting TA |
|---|
| 383 | TEST_EXPECT_EQUAL(GBT_count_marked_species(gb_main), NO_OF_MARKED); |
|---|
| 384 | |
|---|
| 385 | GB_close(gb_main); |
|---|
| 386 | } |
|---|
| 387 | } |
|---|
| 388 | |
|---|
| 389 | #endif // UNIT_TESTS |
|---|
| 390 | |
|---|
| 391 | // -------------------------------------------------------------------------------- |
|---|