| 1 | #include <stdio.h> |
|---|
| 2 | #include <memory.h> |
|---|
| 3 | #include <string.h> |
|---|
| 4 | #include <math.h> |
|---|
| 5 | #include <iostream> |
|---|
| 6 | |
|---|
| 7 | #include <arbdb.h> |
|---|
| 8 | #include <arbdbt.h> |
|---|
| 9 | #include <awt_tree.hxx> |
|---|
| 10 | #include "awt_seq_dna.hxx" |
|---|
| 11 | #include "awt_parsimony_defaults.hxx" |
|---|
| 12 | |
|---|
| 13 | #ifndef ARB_ASSERT_H |
|---|
| 14 | #include <arb_assert.h> |
|---|
| 15 | #endif |
|---|
| 16 | #define awt_assert(bed) arb_assert(bed) |
|---|
| 17 | |
|---|
| 18 | char *AP_sequence_parsimony::table; |
|---|
| 19 | |
|---|
| 20 | /***************************** |
|---|
| 21 | |
|---|
| 22 | |
|---|
| 23 | ******************************/ |
|---|
| 24 | |
|---|
| 25 | AP_sequence_parsimony::AP_sequence_parsimony(AP_tree_root *rooti) : AP_sequence(rooti) |
|---|
| 26 | { |
|---|
| 27 | sequence = 0; |
|---|
| 28 | } |
|---|
| 29 | |
|---|
| 30 | AP_sequence_parsimony::~AP_sequence_parsimony(void) |
|---|
| 31 | { |
|---|
| 32 | if (sequence != 0) delete [] sequence; |
|---|
| 33 | sequence = 0; |
|---|
| 34 | } |
|---|
| 35 | |
|---|
| 36 | AP_sequence *AP_sequence_parsimony::dup(void) |
|---|
| 37 | { |
|---|
| 38 | return new AP_sequence_parsimony(root); |
|---|
| 39 | } |
|---|
| 40 | |
|---|
| 41 | void AP_sequence_parsimony::build_table(void) |
|---|
| 42 | { |
|---|
| 43 | table = (char *)AP_create_dna_to_ap_bases(); |
|---|
| 44 | } |
|---|
| 45 | |
|---|
| 46 | |
|---|
| 47 | |
|---|
| 48 | /************************************************************************ |
|---|
| 49 | combine(const AP_sequence *lefts, const AP_sequence *rights) |
|---|
| 50 | set(char *isequence) |
|---|
| 51 | |
|---|
| 52 | for wagner & fitch parsimony algorithm |
|---|
| 53 | is_set_flag is used by AP_tree_nlen::parsimony_rek() |
|---|
| 54 | *************************************************************************/ |
|---|
| 55 | |
|---|
| 56 | // #define SHOW_SEQ |
|---|
| 57 | |
|---|
| 58 | void AP_sequence_parsimony::set(const char *isequence) |
|---|
| 59 | { |
|---|
| 60 | // char *s,*d,*f1,c; |
|---|
| 61 | sequence_len = root->filter->real_len; |
|---|
| 62 | sequence = new char[sequence_len+1]; |
|---|
| 63 | memset(sequence,AP_N,(size_t)sequence_len+1); |
|---|
| 64 | // s = isequence; |
|---|
| 65 | // d = sequence; |
|---|
| 66 | // f1 = root->filter->filter_mask; |
|---|
| 67 | |
|---|
| 68 | const uchar *simplify = root->filter->simplify; |
|---|
| 69 | if (!table) this->build_table(); |
|---|
| 70 | |
|---|
| 71 | if (root->filter->bootstrap) { |
|---|
| 72 | int iseqlen = strlen(isequence); |
|---|
| 73 | |
|---|
| 74 | for (int i = 0; i<sequence_len; ++i) { |
|---|
| 75 | int pos = root->filter->bootstrap[i]; // enthaelt zufallsfolge |
|---|
| 76 | |
|---|
| 77 | // printf("i=%i pos=%i sequence_len=%li iseqlen=%i\n", i, pos, sequence_len, iseqlen); |
|---|
| 78 | |
|---|
| 79 | awt_assert(pos >= 0); |
|---|
| 80 | awt_assert(pos<iseqlen); |
|---|
| 81 | if (pos >= iseqlen) continue; |
|---|
| 82 | |
|---|
| 83 | unsigned char c = (unsigned char)isequence[pos]; // @@@ muss ueber mapping tabelle aufgefaltet werden 10/99 |
|---|
| 84 | |
|---|
| 85 | #if defined(SHOW_SEQ) |
|---|
| 86 | fputc(simplify[c], stdout); |
|---|
| 87 | #endif // SHOW_SEQ |
|---|
| 88 | |
|---|
| 89 | sequence[i] = table[simplify[c]]; |
|---|
| 90 | |
|---|
| 91 | } |
|---|
| 92 | |
|---|
| 93 | // int i; |
|---|
| 94 | // for (i=root->filter->real_len-1;i>=0;i--){ |
|---|
| 95 | // int pos = root->filter->bootstrap[i]; // enthaelt zufallsfolge |
|---|
| 96 | // if (pos >= iseqlen) continue; |
|---|
| 97 | // c = s[pos]; |
|---|
| 98 | // d[i] = table[simplify[c]]; |
|---|
| 99 | // } |
|---|
| 100 | |
|---|
| 101 | } |
|---|
| 102 | else { |
|---|
| 103 | char *filt = root->filter->filter_mask; |
|---|
| 104 | int left_bases = sequence_len; |
|---|
| 105 | int filter_len = root->filter->filter_len; |
|---|
| 106 | int oidx = 0; // for output sequence |
|---|
| 107 | |
|---|
| 108 | for (int idx = 0; idx<filter_len && left_bases; ++idx) { |
|---|
| 109 | if (filt[idx]) { |
|---|
| 110 | unsigned char c = (unsigned char)isequence[idx]; |
|---|
| 111 | sequence[oidx++] = table[simplify[c]]; |
|---|
| 112 | --left_bases; |
|---|
| 113 | #if defined(SHOW_SEQ) |
|---|
| 114 | fputc(simplify[c], stdout); |
|---|
| 115 | #endif // SHOW_SEQ |
|---|
| 116 | } |
|---|
| 117 | } |
|---|
| 118 | |
|---|
| 119 | // while ( (c = (*s++)) ) { |
|---|
| 120 | // if (!i) break; |
|---|
| 121 | // if (*(f++)) { // use current position ? |
|---|
| 122 | // i--; // decrement leftover positions |
|---|
| 123 | // *(d++) = table[simplify[c]]; |
|---|
| 124 | // } |
|---|
| 125 | // } |
|---|
| 126 | } |
|---|
| 127 | |
|---|
| 128 | #if defined(SHOW_SEQ) |
|---|
| 129 | fputc('\n', stdout); |
|---|
| 130 | #endif // SHOW_SEQ |
|---|
| 131 | |
|---|
| 132 | update = AP_timer(); |
|---|
| 133 | is_set_flag = AP_TRUE; |
|---|
| 134 | cashed_real_len = -1.0; |
|---|
| 135 | } |
|---|
| 136 | |
|---|
| 137 | AP_FLOAT AP_sequence_parsimony::combine( const AP_sequence *lefts, const AP_sequence *rights) { |
|---|
| 138 | // char *p1,*p2,*p; |
|---|
| 139 | // char c1,c2; |
|---|
| 140 | // long result; |
|---|
| 141 | |
|---|
| 142 | const AP_sequence_parsimony *left = (const AP_sequence_parsimony *)lefts; |
|---|
| 143 | const AP_sequence_parsimony *right = (const AP_sequence_parsimony *)rights; |
|---|
| 144 | |
|---|
| 145 | if (sequence == 0) { |
|---|
| 146 | sequence_len = root->filter->real_len; |
|---|
| 147 | sequence = new char[root->filter->real_len +1]; |
|---|
| 148 | } |
|---|
| 149 | |
|---|
| 150 | const char *p1 = left->sequence; |
|---|
| 151 | const char *p2 = right->sequence; |
|---|
| 152 | char *p = sequence; |
|---|
| 153 | |
|---|
| 154 | // result = 0; |
|---|
| 155 | // char *end_seq1 = p1 + sequence_len; |
|---|
| 156 | |
|---|
| 157 | GB_UINT4 *w = 0; |
|---|
| 158 | char *mutpsite = 0; |
|---|
| 159 | |
|---|
| 160 | if (mutation_per_site) { // count site specific mutations in mutation_per_site |
|---|
| 161 | w = root->weights->weights; |
|---|
| 162 | mutpsite = mutation_per_site; |
|---|
| 163 | } |
|---|
| 164 | else if (root->weights->dummy_weights) { // no weights, no mutation_per_site |
|---|
| 165 | ; |
|---|
| 166 | } |
|---|
| 167 | else { // weighted (but don't count mutation_per_site) |
|---|
| 168 | w = root->weights->weights; |
|---|
| 169 | } |
|---|
| 170 | |
|---|
| 171 | long result = 0; |
|---|
| 172 | |
|---|
| 173 | for (long idx = 0; idx<sequence_len; ++idx) { |
|---|
| 174 | char c1 = p1[idx]; |
|---|
| 175 | char c2 = p2[idx]; |
|---|
| 176 | |
|---|
| 177 | awt_assert(c1 != 0); // not a base and not a gap -- what should it be ? |
|---|
| 178 | awt_assert(c2 != 0); |
|---|
| 179 | |
|---|
| 180 | if ((c1&c2) == 0) { // bases are distinct (that means we count a mismatch) |
|---|
| 181 | p[idx] = c1|c2; // mix distinct bases |
|---|
| 182 | |
|---|
| 183 | #if !defined(MULTIPLE_GAPS_ARE_MULTIPLE_MUTATIONS) |
|---|
| 184 | // count multiple mutations as 1 mutation |
|---|
| 185 | // this was an experiment (don't use it, it does not work!) |
|---|
| 186 | if (p[idx]&AP_S) { // contains a gap |
|---|
| 187 | if (idx>0 && (p[idx-1]&AP_S)) { // last position also contained gap.. |
|---|
| 188 | if (((c1&AP_S) && (p1[idx-1]&AP_S)) || // ..in same sequence |
|---|
| 189 | ((c2&AP_S) && (p2[idx-1]&AP_S))) |
|---|
| 190 | { |
|---|
| 191 | if (!(p1[idx-1]&AP_S) || !(p2[idx-1]&AP_S)) { // if one of the sequences had no gap at previous position |
|---|
| 192 | continue; // skip multiple gaps |
|---|
| 193 | } |
|---|
| 194 | } |
|---|
| 195 | } |
|---|
| 196 | } |
|---|
| 197 | #endif // MULTIPLE_GAPS_ARE_MULTIPLE_MUTATIONS |
|---|
| 198 | |
|---|
| 199 | // now count mutation : |
|---|
| 200 | |
|---|
| 201 | if (mutpsite) mutpsite[idx]++; // count mutations per site (unweighted) |
|---|
| 202 | result += w ? w[idx] : 1; // count weighted or simple |
|---|
| 203 | } |
|---|
| 204 | else { |
|---|
| 205 | p[idx] = c1&c2; // store common bases for both subtrees |
|---|
| 206 | } |
|---|
| 207 | |
|---|
| 208 | |
|---|
| 209 | #if !defined(PROPAGATE_GAPS_UPWARDS) |
|---|
| 210 | // do not propagate mixed gaps upwards (they cause neg. branches) |
|---|
| 211 | // this was an experiment (don't use it, it does not work!) |
|---|
| 212 | if (p[idx] & AP_S) { |
|---|
| 213 | if (p[idx] != AP_S) { |
|---|
| 214 | p[idx] = AP_BASES(p[idx]^AP_S); |
|---|
| 215 | } |
|---|
| 216 | } |
|---|
| 217 | #endif // PROPAGATE_GAPS_UPWARDS |
|---|
| 218 | |
|---|
| 219 | awt_assert(p[idx] != 0); |
|---|
| 220 | } |
|---|
| 221 | |
|---|
| 222 | #if defined(DEBUG) && 0 |
|---|
| 223 | #define P1 75 |
|---|
| 224 | #define P2 90 |
|---|
| 225 | printf("Seq1: "); |
|---|
| 226 | for (long idx = P1; idx <= P2; ++idx) printf("%3i ", p1[idx]); |
|---|
| 227 | printf("\nSeq2: "); |
|---|
| 228 | for (long idx = P1; idx <= P2; ++idx) printf("%3i ", p2[idx]); |
|---|
| 229 | printf("\nCombine value: %f\n", float(result)); |
|---|
| 230 | #undef P1 |
|---|
| 231 | #undef P2 |
|---|
| 232 | #endif // DEBUG |
|---|
| 233 | |
|---|
| 234 | global_combineCount++; |
|---|
| 235 | this->is_set_flag = AP_TRUE; |
|---|
| 236 | cashed_real_len = -1.0; |
|---|
| 237 | |
|---|
| 238 | awt_assert(result >= 0.0); |
|---|
| 239 | |
|---|
| 240 | return (AP_FLOAT)result; |
|---|
| 241 | } |
|---|
| 242 | |
|---|
| 243 | |
|---|
| 244 | void AP_sequence_parsimony::partial_match(const AP_sequence* part_, long *overlapPtr, long *penaltyPtr) const { |
|---|
| 245 | // matches the partial sequences 'part_' against 'this' |
|---|
| 246 | // '*penaltyPtr' is set to the number of mismatches (possibly weighted) |
|---|
| 247 | // '*overlapPtr' is set to the number of base positions both sequences overlap |
|---|
| 248 | // example: |
|---|
| 249 | // fullseq 'XXX---XXX' 'XXX---XXX' |
|---|
| 250 | // partialseq '-XX---XX-' '---XXX---' |
|---|
| 251 | // overlap 7 3 |
|---|
| 252 | // |
|---|
| 253 | // algorithm is similar to AP_sequence_parsimony::combine() |
|---|
| 254 | |
|---|
| 255 | const AP_sequence_parsimony *part = (const AP_sequence_parsimony *)part_; |
|---|
| 256 | |
|---|
| 257 | const char *pf = sequence; |
|---|
| 258 | const char *pp = part->sequence; |
|---|
| 259 | |
|---|
| 260 | GB_UINT4 *w = 0; |
|---|
| 261 | if (!root->weights->dummy_weights) w = root->weights->weights; |
|---|
| 262 | |
|---|
| 263 | long min_end; // minimum of both last non-gap positions |
|---|
| 264 | |
|---|
| 265 | for (min_end = sequence_len-1; min_end >= 0; --min_end) { |
|---|
| 266 | char both = pf[min_end]|pp[min_end]; |
|---|
| 267 | if (both != AP_S) { // last non-gap found |
|---|
| 268 | if (pf[min_end] != AP_S) { // occurred in full sequence |
|---|
| 269 | for (; min_end >= 0; --min_end) { // search same in partial sequence |
|---|
| 270 | if (pp[min_end] != AP_S) break; |
|---|
| 271 | } |
|---|
| 272 | } |
|---|
| 273 | else { |
|---|
| 274 | awt_assert(pp[min_end] != AP_S); // occurred in partial sequence |
|---|
| 275 | for (; min_end >= 0; --min_end) { // search same in full sequence |
|---|
| 276 | if (pf[min_end] != AP_S) break; |
|---|
| 277 | } |
|---|
| 278 | } |
|---|
| 279 | break; |
|---|
| 280 | } |
|---|
| 281 | } |
|---|
| 282 | |
|---|
| 283 | long penalty = 0; |
|---|
| 284 | long overlap = 0; |
|---|
| 285 | |
|---|
| 286 | if (min_end >= 0) { |
|---|
| 287 | long max_start; // maximum of both first non-gap positions |
|---|
| 288 | for (max_start = 0; max_start <= min_end; ++max_start) { |
|---|
| 289 | char both = pf[max_start]|pp[max_start]; |
|---|
| 290 | if (both != AP_S) { // first non-gap found |
|---|
| 291 | if (pf[max_start] != AP_S) { // occurred in full sequence |
|---|
| 292 | for (; max_start <= min_end; ++max_start) { // search same in partial |
|---|
| 293 | if (pp[max_start] != AP_S) break; |
|---|
| 294 | } |
|---|
| 295 | } |
|---|
| 296 | else { |
|---|
| 297 | awt_assert(pp[max_start] != AP_S); // occurred in partial sequence |
|---|
| 298 | for (; max_start <= min_end; ++max_start) { // search same in full |
|---|
| 299 | if (pf[max_start] != AP_S) break; |
|---|
| 300 | } |
|---|
| 301 | } |
|---|
| 302 | break; |
|---|
| 303 | } |
|---|
| 304 | } |
|---|
| 305 | |
|---|
| 306 | if (max_start <= min_end) { // if sequences overlap |
|---|
| 307 | for (long idx = max_start; idx <= min_end; ++idx) { |
|---|
| 308 | if ((pf[idx]&pp[idx]) == 0) { // bases are distinct (aka mismatch) |
|---|
| 309 | penalty += w ? w[idx] : 1; |
|---|
| 310 | } |
|---|
| 311 | } |
|---|
| 312 | overlap = min_end-max_start+1; |
|---|
| 313 | } |
|---|
| 314 | } |
|---|
| 315 | |
|---|
| 316 | *overlapPtr = overlap; |
|---|
| 317 | *penaltyPtr = penalty; |
|---|
| 318 | } |
|---|
| 319 | |
|---|
| 320 | |
|---|
| 321 | |
|---|
| 322 | AP_FLOAT AP_sequence_parsimony::real_len(void) // count all bases |
|---|
| 323 | { |
|---|
| 324 | if (!sequence) return -1.0; |
|---|
| 325 | if (cashed_real_len>=0.0) return cashed_real_len; |
|---|
| 326 | char hits[256]; |
|---|
| 327 | long i; |
|---|
| 328 | long sum = 0; |
|---|
| 329 | unsigned char *p = (unsigned char*)sequence; |
|---|
| 330 | for (i=0;i<256;i++){ // count ambigous characters half |
|---|
| 331 | hits[i] = 1; |
|---|
| 332 | } |
|---|
| 333 | hits[AP_A] = 2; // real characters full |
|---|
| 334 | hits[AP_C] = 2; |
|---|
| 335 | hits[AP_G] = 2; |
|---|
| 336 | hits[AP_T] = 2; |
|---|
| 337 | hits[AP_S] = 0; // count no gaps |
|---|
| 338 | hits[AP_N] = 0; // no Ns |
|---|
| 339 | GB_UINT4 *w = root->weights->weights; |
|---|
| 340 | |
|---|
| 341 | for ( i = sequence_len; i ;i-- ) { // all but no gaps |
|---|
| 342 | sum += hits[*(p++)] * *(w++); |
|---|
| 343 | } |
|---|
| 344 | cashed_real_len = sum * .5; |
|---|
| 345 | return sum * .5; |
|---|
| 346 | } |
|---|