| 1 | // =============================================================== // |
|---|
| 2 | // // |
|---|
| 3 | // File : distanalyse.cxx // |
|---|
| 4 | // Purpose : // |
|---|
| 5 | // // |
|---|
| 6 | // Institute of Microbiology (Technical University Munich) // |
|---|
| 7 | // http://www.arb-home.de/ // |
|---|
| 8 | // // |
|---|
| 9 | // =============================================================== // |
|---|
| 10 | |
|---|
| 11 | #include "di_matr.hxx" |
|---|
| 12 | |
|---|
| 13 | using std::min; |
|---|
| 14 | using std::max; |
|---|
| 15 | using std::string; |
|---|
| 16 | |
|---|
| 17 | DI_TRANSFORMATION DI_MATRIX::detect_transformation(string& msg) { |
|---|
| 18 | di_assert(nentries>0); |
|---|
| 19 | |
|---|
| 20 | DI_TRANSFORMATION result = DI_TRANSFORMATION_NONE_DETECTED; |
|---|
| 21 | if (is_AA) { |
|---|
| 22 | if (nentries> 100) { |
|---|
| 23 | msg = "A lot of sequences!\n ==> fast Kimura selected! (instead of PAM)"; |
|---|
| 24 | result = DI_TRANSFORMATION_KIMURA; |
|---|
| 25 | } |
|---|
| 26 | else { |
|---|
| 27 | msg = "Only limited number of sequences!\n ==> slow PAM selected! (instead of Kimura)"; |
|---|
| 28 | result = DI_TRANSFORMATION_PAM; |
|---|
| 29 | } |
|---|
| 30 | } |
|---|
| 31 | else { |
|---|
| 32 | long mean_len = 0; |
|---|
| 33 | float min_gc = 9999.9; |
|---|
| 34 | float max_gc = 0.0; |
|---|
| 35 | long min_len = 9999999; |
|---|
| 36 | long max_len = 0; |
|---|
| 37 | |
|---|
| 38 | // calculate meanvalue of sequencelength: |
|---|
| 39 | for (size_t row=0; row<nentries; row++) { |
|---|
| 40 | const char *sequ = entries[row]->get_nucl_seq()->get_sequence(); |
|---|
| 41 | |
|---|
| 42 | size_t flen = aliview->get_length(); |
|---|
| 43 | |
|---|
| 44 | long act_gci = 0; |
|---|
| 45 | long act_len = 0; |
|---|
| 46 | |
|---|
| 47 | for (size_t pos=0; pos<flen; pos++) { |
|---|
| 48 | char ch = sequ[pos]; |
|---|
| 49 | if (ch == AP_C || ch == AP_G) act_gci++; |
|---|
| 50 | if (ch == AP_A || ch == AP_C || ch == AP_G || ch == AP_T) act_len++; |
|---|
| 51 | } |
|---|
| 52 | |
|---|
| 53 | mean_len += act_len; |
|---|
| 54 | |
|---|
| 55 | float act_gc = ((float) act_gci) / act_len; |
|---|
| 56 | |
|---|
| 57 | min_gc = min(min_gc, act_gc); |
|---|
| 58 | max_gc = max(max_gc, act_gc); |
|---|
| 59 | |
|---|
| 60 | min_len = min(min_len, act_len); |
|---|
| 61 | max_len = max(max_len, act_len); |
|---|
| 62 | } |
|---|
| 63 | |
|---|
| 64 | string warn; |
|---|
| 65 | if (min_len * 1.3 < max_len) { |
|---|
| 66 | warn = "Warning: The length of sequences differs significantly!\n" |
|---|
| 67 | "Be careful: Neighbour Joining is sensitive to this kind of \"error\""; |
|---|
| 68 | } |
|---|
| 69 | |
|---|
| 70 | mean_len /= nentries; |
|---|
| 71 | |
|---|
| 72 | if (mean_len < 100) { |
|---|
| 73 | msg = "Too short sequences!\n ==> No correction selected!"; |
|---|
| 74 | result = DI_TRANSFORMATION_NONE; |
|---|
| 75 | } |
|---|
| 76 | else if (mean_len < 300) { |
|---|
| 77 | msg = "Meanlength shorter than 300\n ==> Jukes Cantor selected!"; |
|---|
| 78 | result = DI_TRANSFORMATION_JUKES_CANTOR; |
|---|
| 79 | } |
|---|
| 80 | else if ((mean_len < 1000) || ((max_gc / min_gc) < 1.2)) { |
|---|
| 81 | const char *reason; |
|---|
| 82 | if (mean_len < 1000) reason = "Sequences are too short for Olsen!"; |
|---|
| 83 | else reason = GBS_global_string("Maximal GC (%f) : Minimal GC (%f) < 1.2", max_gc, min_gc); |
|---|
| 84 | |
|---|
| 85 | msg = GBS_global_string("%s ==> Felsenstein selected!", reason); |
|---|
| 86 | result = DI_TRANSFORMATION_FELSENSTEIN; |
|---|
| 87 | } |
|---|
| 88 | else { |
|---|
| 89 | msg = "Olsen selected!"; |
|---|
| 90 | result = DI_TRANSFORMATION_OLSEN; |
|---|
| 91 | } |
|---|
| 92 | |
|---|
| 93 | if (!warn.empty()) { |
|---|
| 94 | di_assert(!msg.empty()); |
|---|
| 95 | msg = warn+'\n'+msg; |
|---|
| 96 | } |
|---|
| 97 | } |
|---|
| 98 | |
|---|
| 99 | di_assert(!msg.empty()); |
|---|
| 100 | di_assert(result != DI_TRANSFORMATION_NONE_DETECTED); |
|---|
| 101 | |
|---|
| 102 | return result; |
|---|
| 103 | } |
|---|
| 104 | |
|---|