| 1 | #include <stdio.h> |
|---|
| 2 | #include <memory.h> |
|---|
| 3 | #include <string.h> |
|---|
| 4 | #include <ctype.h> |
|---|
| 5 | #include <limits.h> |
|---|
| 6 | |
|---|
| 7 | #include <arbdb.h> |
|---|
| 8 | #include <arbdbt.h> |
|---|
| 9 | #include <awt_tree.hxx> |
|---|
| 10 | #include "awt_seq_protein.hxx" |
|---|
| 11 | #include "awt_parsimony_defaults.hxx" |
|---|
| 12 | |
|---|
| 13 | #include <inline.h> |
|---|
| 14 | |
|---|
| 15 | #define awt_assert(bed) arb_assert(bed) |
|---|
| 16 | |
|---|
| 17 | // start of implementation of class AP_sequence_protein: |
|---|
| 18 | |
|---|
| 19 | AP_sequence_protein::AP_sequence_protein(AP_tree_root *tree_root) |
|---|
| 20 | : AP_sequence(tree_root) |
|---|
| 21 | { |
|---|
| 22 | sequence = 0; |
|---|
| 23 | } |
|---|
| 24 | |
|---|
| 25 | AP_sequence_protein::~AP_sequence_protein() |
|---|
| 26 | { |
|---|
| 27 | if (sequence != 0) delete sequence; |
|---|
| 28 | sequence = 0; |
|---|
| 29 | } |
|---|
| 30 | |
|---|
| 31 | AP_sequence *AP_sequence_protein::dup() |
|---|
| 32 | { |
|---|
| 33 | return new AP_sequence_protein(root); |
|---|
| 34 | } |
|---|
| 35 | |
|---|
| 36 | static AP_PROTEINS prot2AP_PROTEIN[26] = { |
|---|
| 37 | APP_A, |
|---|
| 38 | APP_B, |
|---|
| 39 | APP_C, |
|---|
| 40 | APP_D, |
|---|
| 41 | APP_E, |
|---|
| 42 | APP_F, |
|---|
| 43 | APP_G, |
|---|
| 44 | APP_H, |
|---|
| 45 | APP_I, |
|---|
| 46 | APP_ILLEGAL, |
|---|
| 47 | APP_K, |
|---|
| 48 | APP_L, |
|---|
| 49 | APP_M, |
|---|
| 50 | APP_N, |
|---|
| 51 | APP_ILLEGAL, |
|---|
| 52 | APP_P, |
|---|
| 53 | APP_Q, |
|---|
| 54 | APP_R, |
|---|
| 55 | APP_S, |
|---|
| 56 | APP_T, |
|---|
| 57 | APP_ILLEGAL, |
|---|
| 58 | APP_V, |
|---|
| 59 | APP_W, |
|---|
| 60 | APP_X, |
|---|
| 61 | APP_Y, |
|---|
| 62 | APP_Z |
|---|
| 63 | }; |
|---|
| 64 | |
|---|
| 65 | #define PROTEINS_TO_TEST 22 |
|---|
| 66 | |
|---|
| 67 | static AP_PROTEINS prot2test[PROTEINS_TO_TEST] = { // uses same indexing as prot_idx |
|---|
| 68 | APP_STAR, |
|---|
| 69 | APP_A, |
|---|
| 70 | APP_C, |
|---|
| 71 | APP_D, |
|---|
| 72 | APP_E, |
|---|
| 73 | APP_F, |
|---|
| 74 | APP_G, |
|---|
| 75 | APP_H, |
|---|
| 76 | APP_I, |
|---|
| 77 | APP_K, |
|---|
| 78 | APP_L, |
|---|
| 79 | APP_M, |
|---|
| 80 | APP_N, |
|---|
| 81 | APP_P, |
|---|
| 82 | APP_Q, |
|---|
| 83 | APP_R, |
|---|
| 84 | APP_S, |
|---|
| 85 | APP_T, |
|---|
| 86 | APP_V, |
|---|
| 87 | APP_W, |
|---|
| 88 | APP_Y, |
|---|
| 89 | APP_GAP |
|---|
| 90 | }; |
|---|
| 91 | |
|---|
| 92 | static int prot_idx[PROTEINS_TO_TEST] = { // uses same indexing as prot2test |
|---|
| 93 | // contains indexes for 'AWT_distance_meter->dist' |
|---|
| 94 | 0, // * |
|---|
| 95 | 1, // A |
|---|
| 96 | 3, // C |
|---|
| 97 | 4, // D |
|---|
| 98 | 5, // E |
|---|
| 99 | 6, // F |
|---|
| 100 | 7, // G |
|---|
| 101 | 8, // H |
|---|
| 102 | 9, // I |
|---|
| 103 | 10, // K |
|---|
| 104 | 11, // L |
|---|
| 105 | 12, // M |
|---|
| 106 | 13, // N |
|---|
| 107 | 14, // P |
|---|
| 108 | 15, // Q |
|---|
| 109 | 16, // R |
|---|
| 110 | 17, // S |
|---|
| 111 | 18, // T |
|---|
| 112 | 19, // V |
|---|
| 113 | 20, // W |
|---|
| 114 | 21, // Y |
|---|
| 115 | 23 // gap |
|---|
| 116 | }; |
|---|
| 117 | |
|---|
| 118 | static char prot_mindist[PROTEINS_TO_TEST][PROTEINS_TO_TEST]; |
|---|
| 119 | static int min_mutations_initialized_for_codenr = -1; |
|---|
| 120 | |
|---|
| 121 | static void update_min_mutations(int code_nr, const AWT_distance_meter *distance_meter) { |
|---|
| 122 | if (min_mutations_initialized_for_codenr != code_nr) { |
|---|
| 123 | for (int d = 0; d<PROTEINS_TO_TEST; ++d) { |
|---|
| 124 | int D = prot_idx[d]; |
|---|
| 125 | int D_bit = 1<<D; |
|---|
| 126 | |
|---|
| 127 | for (int s = 0; s<PROTEINS_TO_TEST; ++s) { |
|---|
| 128 | const AWT_PDP *dist = distance_meter->getDistance(prot_idx[s]); |
|---|
| 129 | |
|---|
| 130 | int i; |
|---|
| 131 | for (i = 0; i<3; ++i) { |
|---|
| 132 | if (dist->patd[i] & D_bit) break; |
|---|
| 133 | } |
|---|
| 134 | |
|---|
| 135 | prot_mindist[s][d] = char(i); |
|---|
| 136 | } |
|---|
| 137 | } |
|---|
| 138 | |
|---|
| 139 | min_mutations_initialized_for_codenr = code_nr; |
|---|
| 140 | } |
|---|
| 141 | } |
|---|
| 142 | |
|---|
| 143 | |
|---|
| 144 | #if defined(DEBUG) |
|---|
| 145 | // #define SHOW_SEQ |
|---|
| 146 | #endif // DEBUG |
|---|
| 147 | |
|---|
| 148 | void AP_sequence_protein::set(const char *isequence) |
|---|
| 149 | { |
|---|
| 150 | AWT_translator *translator = AWT_get_user_translator(this->root->gb_main); |
|---|
| 151 | update_min_mutations(translator->CodeNr(), translator->getDistanceMeter()); |
|---|
| 152 | |
|---|
| 153 | sequence_len = root->filter->real_len; |
|---|
| 154 | sequence = new AP_PROTEINS[sequence_len+1]; |
|---|
| 155 | |
|---|
| 156 | awt_assert(root->filter->bootstrap == 0); // bootstrapping not implemented for protein parsimony |
|---|
| 157 | |
|---|
| 158 | const uchar *simplify = root->filter->simplify; |
|---|
| 159 | char *filt = root->filter->filter_mask; |
|---|
| 160 | int left_bases = sequence_len; |
|---|
| 161 | long filter_len = root->filter->filter_len; |
|---|
| 162 | |
|---|
| 163 | awt_assert(filt); |
|---|
| 164 | |
|---|
| 165 | int oidx = 0; // index for output sequence |
|---|
| 166 | |
|---|
| 167 | for (int idx = 0; idx<filter_len && left_bases; ++idx) { |
|---|
| 168 | if (filt[idx]) { |
|---|
| 169 | char c = toupper(simplify[safeCharIndex(isequence[idx])]); |
|---|
| 170 | AP_PROTEINS p = APP_ILLEGAL; |
|---|
| 171 | |
|---|
| 172 | #if defined(SHOW_SEQ) |
|---|
| 173 | fputc(c, stdout); |
|---|
| 174 | #endif // SHOW_SEQ |
|---|
| 175 | |
|---|
| 176 | |
|---|
| 177 | if (c >= 'A' && c <= 'Z') p = prot2AP_PROTEIN[c-'A']; |
|---|
| 178 | else if (c == '-' || c == '.') p = APP_GAP; |
|---|
| 179 | else if (c == '*') p = APP_STAR; |
|---|
| 180 | |
|---|
| 181 | if (p == APP_ILLEGAL) { |
|---|
| 182 | aw_message(GBS_global_string("Illegal sequence character '%c' replaced by gap", c)); |
|---|
| 183 | p = APP_GAP; |
|---|
| 184 | } |
|---|
| 185 | |
|---|
| 186 | sequence[oidx++] = p; |
|---|
| 187 | --left_bases; |
|---|
| 188 | } |
|---|
| 189 | } |
|---|
| 190 | |
|---|
| 191 | awt_assert(oidx == sequence_len); |
|---|
| 192 | sequence[sequence_len] = APP_ILLEGAL; |
|---|
| 193 | |
|---|
| 194 | #if defined(SHOW_SEQ) |
|---|
| 195 | fputc('\n', stdout); |
|---|
| 196 | #endif // SHOW_SEQ |
|---|
| 197 | |
|---|
| 198 | update = AP_timer(); |
|---|
| 199 | is_set_flag = AP_TRUE; |
|---|
| 200 | cashed_real_len = -1.0; |
|---|
| 201 | } |
|---|
| 202 | |
|---|
| 203 | // for (int idx = 0; idx<sequence_len; ++idx) { |
|---|
| 204 | // char c = toupper(isequence[idx]); |
|---|
| 205 | // AP_PROTEINS p = APP_ILLEGAL; |
|---|
| 206 | |
|---|
| 207 | // if (c >= 'A' && c <= 'Z') p = prot2AP_PROTEIN[c-'A']; |
|---|
| 208 | // else if (c == '-' || c == '.') p = APP_GAP; |
|---|
| 209 | // else if (c == '*') p = APP_STAR; |
|---|
| 210 | |
|---|
| 211 | // if (p == APP_ILLEGAL) { |
|---|
| 212 | // awt_assert(0); |
|---|
| 213 | // p = APP_GAP; |
|---|
| 214 | // } |
|---|
| 215 | |
|---|
| 216 | // sequence[idx] = p; |
|---|
| 217 | // } |
|---|
| 218 | |
|---|
| 219 | |
|---|
| 220 | |
|---|
| 221 | AP_FLOAT AP_sequence_protein::combine( const AP_sequence * lefts, const AP_sequence *rights) |
|---|
| 222 | { |
|---|
| 223 | // this works similar to AP_sequence_parsimony::combine |
|---|
| 224 | |
|---|
| 225 | const AP_sequence_protein *left = (const AP_sequence_protein *)lefts; |
|---|
| 226 | const AP_sequence_protein *right = (const AP_sequence_protein *)rights; |
|---|
| 227 | |
|---|
| 228 | if (!sequence) { |
|---|
| 229 | sequence_len = root->filter->real_len; |
|---|
| 230 | sequence = new AP_PROTEINS[sequence_len +1]; |
|---|
| 231 | } |
|---|
| 232 | |
|---|
| 233 | const AP_PROTEINS *p1 = left->sequence; |
|---|
| 234 | const AP_PROTEINS *p2 = right->sequence; |
|---|
| 235 | AP_PROTEINS *p = sequence; |
|---|
| 236 | |
|---|
| 237 | GB_UINT4 *w = 0; |
|---|
| 238 | char *mutpsite = 0; |
|---|
| 239 | |
|---|
| 240 | if (mutation_per_site) { // count site specific mutations in mutation_per_site |
|---|
| 241 | w = root->weights->weights; |
|---|
| 242 | mutpsite = mutation_per_site; |
|---|
| 243 | } |
|---|
| 244 | else if (root->weights->dummy_weights) { // no weights, no mutation_per_site |
|---|
| 245 | ; |
|---|
| 246 | } |
|---|
| 247 | else { // weighted (but don't count mutation_per_site) |
|---|
| 248 | w = root->weights->weights; |
|---|
| 249 | } |
|---|
| 250 | |
|---|
| 251 | long result = 0; |
|---|
| 252 | // check if initialized for correct instance of translator: |
|---|
| 253 | awt_assert(min_mutations_initialized_for_codenr == AWT_get_user_translator()->CodeNr()); |
|---|
| 254 | |
|---|
| 255 | for (long idx = 0; idx<sequence_len; ++idx) { |
|---|
| 256 | AP_PROTEINS c1 = p1[idx]; |
|---|
| 257 | AP_PROTEINS c2 = p2[idx]; |
|---|
| 258 | |
|---|
| 259 | awt_assert(c1 != APP_ILLEGAL); |
|---|
| 260 | awt_assert(c2 != APP_ILLEGAL); |
|---|
| 261 | |
|---|
| 262 | if ((c1&c2) == 0) { // proteins are distinct |
|---|
| 263 | p[idx] = AP_PROTEINS(c1|c2); // mix distinct proteins |
|---|
| 264 | |
|---|
| 265 | int mutations = INT_MAX; |
|---|
| 266 | |
|---|
| 267 | if (p[idx]&APP_GAP) { // contains a gap |
|---|
| 268 | mutations = 1; // count first gap as mutation |
|---|
| 269 | // @@@ FIXME: rethink then line above. maybe it should be 3 mutations ? |
|---|
| 270 | |
|---|
| 271 | #if !defined(MULTIPLE_GAPS_ARE_MULTIPLE_MUTATIONS) |
|---|
| 272 | // count multiple mutations as 1 mutation |
|---|
| 273 | // this was an experiment (don't use it, it does not work!) |
|---|
| 274 | if (idx>0 && (p[idx-1]&APP_GAP)) { // last position also contained gap.. |
|---|
| 275 | if (((c1&APP_GAP) && (p1[idx-1]&APP_GAP)) || // ..in same sequence |
|---|
| 276 | ((c2&APP_GAP) && (p2[idx-1]&APP_GAP))) |
|---|
| 277 | { |
|---|
| 278 | if (!(p1[idx-1]&APP_GAP) || !(p2[idx-1]&APP_GAP)) { // if one of the sequences had no gap at previous position |
|---|
| 279 | mutations = 0; // skip multiple gaps |
|---|
| 280 | } |
|---|
| 281 | } |
|---|
| 282 | } |
|---|
| 283 | #endif // MULTIPLE_GAPS_ARE_MULTIPLE_MUTATIONS |
|---|
| 284 | } |
|---|
| 285 | else { |
|---|
| 286 | for (int t1 = 0; t1<PROTEINS_TO_TEST && mutations>1; ++t1) { // with all proteins to test |
|---|
| 287 | if (c1&prot2test[t1]) { // if protein is contained in subtree |
|---|
| 288 | for (int t2 = 0; t2<PROTEINS_TO_TEST; ++t2) { |
|---|
| 289 | if (c2&prot2test[t2]) { |
|---|
| 290 | int mut = prot_mindist[t1][t2]; |
|---|
| 291 | if (mut<mutations) { |
|---|
| 292 | mutations = mut; |
|---|
| 293 | if (mutations < 2) break; // minimum reached -- abort |
|---|
| 294 | } |
|---|
| 295 | } |
|---|
| 296 | } |
|---|
| 297 | } |
|---|
| 298 | } |
|---|
| 299 | } |
|---|
| 300 | |
|---|
| 301 | awt_assert(mutations >= 0 && mutations <= 3); |
|---|
| 302 | |
|---|
| 303 | if (mutpsite) mutpsite[idx] += mutations; // count mutations per site (unweighted) |
|---|
| 304 | result += mutations * (w ? w[idx] : 1); // count weighted or simple |
|---|
| 305 | |
|---|
| 306 | } |
|---|
| 307 | else { |
|---|
| 308 | p[idx] = AP_PROTEINS(c1&c2); // store common proteins for both subtrees |
|---|
| 309 | } |
|---|
| 310 | |
|---|
| 311 | #if !defined(PROPAGATE_GAPS_UPWARDS) |
|---|
| 312 | // do not propagate mixed gaps upwards (they cause neg. branches) |
|---|
| 313 | if (p[idx] & APP_GAP) { // contains gap |
|---|
| 314 | if (p[idx] != APP_GAP) { // is not real gap |
|---|
| 315 | p[idx] = AP_PROTEINS(p[idx]^APP_GAP); // remove the gap |
|---|
| 316 | } |
|---|
| 317 | } |
|---|
| 318 | #endif // PROPAGATE_GAPS_UPWARDS |
|---|
| 319 | } |
|---|
| 320 | |
|---|
| 321 | #if defined(DEBUG) && 0 |
|---|
| 322 | #define P1 75 |
|---|
| 323 | #define P2 90 |
|---|
| 324 | printf("Seq1: "); |
|---|
| 325 | for (long idx = P1; idx <= P2; ++idx) printf("%3i ", p1[idx]); |
|---|
| 326 | printf("\nSeq2: "); |
|---|
| 327 | for (long idx = P1; idx <= P2; ++idx) printf("%3i ", p2[idx]); |
|---|
| 328 | printf("\nCombine value: %f\n", float(result)); |
|---|
| 329 | #undef P1 |
|---|
| 330 | #undef P2 |
|---|
| 331 | #endif // DEBUG |
|---|
| 332 | |
|---|
| 333 | #if defined(DEBUG) && 0 |
|---|
| 334 | printf("\nCombine value: %f\n", float(result)); |
|---|
| 335 | #endif // DEBUG |
|---|
| 336 | |
|---|
| 337 | global_combineCount++; |
|---|
| 338 | this->is_set_flag = AP_TRUE; |
|---|
| 339 | cashed_real_len = -1.0; |
|---|
| 340 | |
|---|
| 341 | awt_assert(result >= 0.0); |
|---|
| 342 | |
|---|
| 343 | return (AP_FLOAT)result; |
|---|
| 344 | } |
|---|
| 345 | |
|---|
| 346 | void AP_sequence_protein::partial_match(const AP_sequence* part_, long *overlapPtr, long *penaltyPtr) const { |
|---|
| 347 | // matches the partial sequences 'part_' against 'this' |
|---|
| 348 | // '*penaltyPtr' is set to the number of mismatches (possibly weighted) |
|---|
| 349 | // '*overlapPtr' is set to the number of base positions both sequences overlap |
|---|
| 350 | // example: |
|---|
| 351 | // fullseq 'XXX---XXX' 'XXX---XXX' |
|---|
| 352 | // partialseq '-XX---XX-' '---XXX---' |
|---|
| 353 | // overlap 7 3 |
|---|
| 354 | // |
|---|
| 355 | // algorithm is similar to AP_sequence_protein::combine() |
|---|
| 356 | |
|---|
| 357 | const AP_sequence_protein *part = (const AP_sequence_protein *)part_; |
|---|
| 358 | const AP_PROTEINS *pf = sequence; |
|---|
| 359 | const AP_PROTEINS *pp = part->sequence; |
|---|
| 360 | |
|---|
| 361 | GB_UINT4 *w = 0; |
|---|
| 362 | if (!root->weights->dummy_weights) w = root->weights->weights; |
|---|
| 363 | |
|---|
| 364 | long min_end; // minimum of both last non-gap positions |
|---|
| 365 | |
|---|
| 366 | for (min_end = sequence_len-1; min_end >= 0; --min_end) { |
|---|
| 367 | AP_PROTEINS both = AP_PROTEINS(pf[min_end]|pp[min_end]); |
|---|
| 368 | if (both != APP_GAP) { // last non-gap found |
|---|
| 369 | if (pf[min_end] != APP_GAP) { // occurred in full sequence |
|---|
| 370 | for (; min_end >= 0; --min_end) { // search same in partial sequence |
|---|
| 371 | if (pp[min_end] != APP_GAP) break; |
|---|
| 372 | } |
|---|
| 373 | } |
|---|
| 374 | else { |
|---|
| 375 | awt_assert(pp[min_end] != APP_GAP); // occurred in partial sequence |
|---|
| 376 | for (; min_end >= 0; --min_end) { // search same in full sequence |
|---|
| 377 | if (pf[min_end] != APP_GAP) break; |
|---|
| 378 | } |
|---|
| 379 | } |
|---|
| 380 | break; |
|---|
| 381 | } |
|---|
| 382 | } |
|---|
| 383 | |
|---|
| 384 | long penalty = 0; |
|---|
| 385 | long overlap = 0; |
|---|
| 386 | |
|---|
| 387 | if (min_end >= 0) { |
|---|
| 388 | long max_start; // maximum of both first non-gap positions |
|---|
| 389 | for (max_start = 0; max_start <= min_end; ++max_start) { |
|---|
| 390 | AP_PROTEINS both = AP_PROTEINS(pf[max_start]|pp[max_start]); |
|---|
| 391 | if (both != APP_GAP) { // first non-gap found |
|---|
| 392 | if (pf[max_start] != APP_GAP) { // occurred in full sequence |
|---|
| 393 | for (; max_start <= min_end; ++max_start) { // search same in partial |
|---|
| 394 | if (pp[max_start] != APP_GAP) break; |
|---|
| 395 | } |
|---|
| 396 | } |
|---|
| 397 | else { |
|---|
| 398 | awt_assert(pp[max_start] != APP_GAP); // occurred in partial sequence |
|---|
| 399 | for (; max_start <= min_end; ++max_start) { // search same in full |
|---|
| 400 | if (pf[max_start] != APP_GAP) break; |
|---|
| 401 | } |
|---|
| 402 | } |
|---|
| 403 | break; |
|---|
| 404 | } |
|---|
| 405 | } |
|---|
| 406 | |
|---|
| 407 | if (max_start <= min_end) { // if sequences overlap |
|---|
| 408 | for (long idx = max_start; idx <= min_end; ++idx) { |
|---|
| 409 | if ((pf[idx]&pp[idx]) == 0) { // bases are distinct (aka mismatch) |
|---|
| 410 | int mutations; |
|---|
| 411 | if ((pf[idx]|pp[idx])&APP_GAP) { // one is a gap |
|---|
| 412 | mutations = 3; |
|---|
| 413 | } |
|---|
| 414 | else { |
|---|
| 415 | mutations = INT_MAX; |
|---|
| 416 | for (int t1 = 0; t1<PROTEINS_TO_TEST && mutations>1; ++t1) { // with all proteins to test |
|---|
| 417 | if (pf[idx] & prot2test[t1]) { // if protein is contained in subtree |
|---|
| 418 | for (int t2 = 0; t2<PROTEINS_TO_TEST; ++t2) { |
|---|
| 419 | if (pp[idx] & prot2test[t2]) { |
|---|
| 420 | int mut = prot_mindist[t1][t2]; |
|---|
| 421 | if (mut<mutations) { |
|---|
| 422 | mutations = mut; |
|---|
| 423 | if (mutations < 2) break; // minimum reached -- abort |
|---|
| 424 | } |
|---|
| 425 | } |
|---|
| 426 | } |
|---|
| 427 | } |
|---|
| 428 | } |
|---|
| 429 | } |
|---|
| 430 | penalty += (w ? w[idx] : 1)*mutations; |
|---|
| 431 | } |
|---|
| 432 | } |
|---|
| 433 | overlap = (min_end-max_start+1)*3; |
|---|
| 434 | } |
|---|
| 435 | } |
|---|
| 436 | |
|---|
| 437 | *overlapPtr = overlap; |
|---|
| 438 | *penaltyPtr = penalty; |
|---|
| 439 | } |
|---|
| 440 | |
|---|
| 441 | AP_FLOAT AP_sequence_protein::real_len() |
|---|
| 442 | { |
|---|
| 443 | if (!sequence) return -1.0; |
|---|
| 444 | if (cashed_real_len>=0.0) return cashed_real_len; |
|---|
| 445 | |
|---|
| 446 | long sum = 0; |
|---|
| 447 | for (long idx = 0; idx<sequence_len; ++idx) { |
|---|
| 448 | if (sequence[idx] != APP_GAP) { |
|---|
| 449 | ++sum; |
|---|
| 450 | } |
|---|
| 451 | } |
|---|
| 452 | cashed_real_len = sum; |
|---|
| 453 | return sum; |
|---|
| 454 | } |
|---|
| 455 | |
|---|
| 456 | // -end- of implementation of class AP_sequence_protein. |
|---|
| 457 | |
|---|
| 458 | |
|---|