1 | #include "AP_seq_protein.hxx" |
---|
2 | #include "AP_parsimony_defaults.hxx" |
---|
3 | #include <AP_pro_a_nucs.hxx> |
---|
4 | |
---|
5 | #include <AP_filter.hxx> |
---|
6 | #include <ARB_Tree.hxx> |
---|
7 | |
---|
8 | #include <arb_str.h> |
---|
9 | #include <climits> |
---|
10 | |
---|
11 | |
---|
12 | // #define ap_assert(bed) arb_assert(bed) |
---|
13 | |
---|
14 | AP_sequence_protein::AP_sequence_protein(const AliView *aliview) |
---|
15 | : AP_sequence(aliview) |
---|
16 | , seq_prot(NULL) |
---|
17 | { |
---|
18 | } |
---|
19 | |
---|
20 | AP_sequence_protein::~AP_sequence_protein() { |
---|
21 | delete [] seq_prot; |
---|
22 | } |
---|
23 | |
---|
24 | AP_sequence *AP_sequence_protein::dup() const { |
---|
25 | return new AP_sequence_protein(get_aliview()); |
---|
26 | } |
---|
27 | |
---|
28 | static AP_PROTEINS prot2AP_PROTEIN[26] = { |
---|
29 | APP_A, |
---|
30 | APP_B, |
---|
31 | APP_C, |
---|
32 | APP_D, |
---|
33 | APP_E, |
---|
34 | APP_F, |
---|
35 | APP_G, |
---|
36 | APP_H, |
---|
37 | APP_I, |
---|
38 | APP_ILLEGAL, |
---|
39 | APP_K, |
---|
40 | APP_L, |
---|
41 | APP_M, |
---|
42 | APP_N, |
---|
43 | APP_ILLEGAL, |
---|
44 | APP_P, |
---|
45 | APP_Q, |
---|
46 | APP_R, |
---|
47 | APP_S, |
---|
48 | APP_T, |
---|
49 | APP_ILLEGAL, |
---|
50 | APP_V, |
---|
51 | APP_W, |
---|
52 | APP_X, |
---|
53 | APP_Y, |
---|
54 | APP_Z |
---|
55 | }; |
---|
56 | |
---|
57 | #define PROTEINS_TO_TEST 22 |
---|
58 | |
---|
59 | static AP_PROTEINS prot2test[PROTEINS_TO_TEST] = { // uses same indexing as prot_idx |
---|
60 | APP_STAR, |
---|
61 | APP_A, |
---|
62 | APP_C, |
---|
63 | APP_D, |
---|
64 | APP_E, |
---|
65 | APP_F, |
---|
66 | APP_G, |
---|
67 | APP_H, |
---|
68 | APP_I, |
---|
69 | APP_K, |
---|
70 | APP_L, |
---|
71 | APP_M, |
---|
72 | APP_N, |
---|
73 | APP_P, |
---|
74 | APP_Q, |
---|
75 | APP_R, |
---|
76 | APP_S, |
---|
77 | APP_T, |
---|
78 | APP_V, |
---|
79 | APP_W, |
---|
80 | APP_Y, |
---|
81 | APP_GAP |
---|
82 | }; |
---|
83 | |
---|
84 | static int prot_idx[PROTEINS_TO_TEST] = { // uses same indexing as prot2test |
---|
85 | // contains indexes for 'AWT_distance_meter->dist' |
---|
86 | 0, // * |
---|
87 | 1, // A |
---|
88 | 3, // C |
---|
89 | 4, // D |
---|
90 | 5, // E |
---|
91 | 6, // F |
---|
92 | 7, // G |
---|
93 | 8, // H |
---|
94 | 9, // I |
---|
95 | 10, // K |
---|
96 | 11, // L |
---|
97 | 12, // M |
---|
98 | 13, // N |
---|
99 | 14, // P |
---|
100 | 15, // Q |
---|
101 | 16, // R |
---|
102 | 17, // S |
---|
103 | 18, // T |
---|
104 | 19, // V |
---|
105 | 20, // W |
---|
106 | 21, // Y |
---|
107 | 23 // gap |
---|
108 | }; |
---|
109 | |
---|
110 | static char prot_mindist[PROTEINS_TO_TEST][PROTEINS_TO_TEST]; |
---|
111 | static int min_mutations_initialized_for_codenr = -1; |
---|
112 | |
---|
113 | static void update_min_mutations(int code_nr, const AWT_distance_meter *distance_meter) { |
---|
114 | if (min_mutations_initialized_for_codenr != code_nr) { |
---|
115 | for (int d = 0; d<PROTEINS_TO_TEST; ++d) { |
---|
116 | int D = prot_idx[d]; |
---|
117 | int D_bit = 1<<D; |
---|
118 | |
---|
119 | for (int s = 0; s<PROTEINS_TO_TEST; ++s) { |
---|
120 | const AWT_PDP *dist = distance_meter->getDistance(prot_idx[s]); |
---|
121 | |
---|
122 | int i; |
---|
123 | for (i = 0; i<3; ++i) { |
---|
124 | if (dist->patd[i] & D_bit) break; |
---|
125 | } |
---|
126 | |
---|
127 | prot_mindist[s][d] = char(i); |
---|
128 | } |
---|
129 | } |
---|
130 | |
---|
131 | min_mutations_initialized_for_codenr = code_nr; |
---|
132 | } |
---|
133 | } |
---|
134 | |
---|
135 | |
---|
136 | #if defined(DEBUG) |
---|
137 | // #define SHOW_SEQ |
---|
138 | #endif // DEBUG |
---|
139 | |
---|
140 | void AP_sequence_protein::set(const char *isequence) { |
---|
141 | AWT_translator *translator = AWT_get_user_translator(get_aliview()->get_gb_main()); |
---|
142 | update_min_mutations(translator->CodeNr(), translator->getDistanceMeter()); |
---|
143 | |
---|
144 | size_t sequence_len = get_sequence_length(); |
---|
145 | seq_prot = new AP_PROTEINS[sequence_len+1]; |
---|
146 | |
---|
147 | ap_assert(!get_filter()->does_bootstrap()); // bootstrapping not implemented for protein parsimony |
---|
148 | |
---|
149 | const AP_filter *filt = get_filter(); |
---|
150 | const uchar *simplify = filt->get_simplify_table(); |
---|
151 | int left_bases = sequence_len; |
---|
152 | long filter_len = filt->get_length(); |
---|
153 | |
---|
154 | ap_assert(filt); |
---|
155 | |
---|
156 | size_t oidx = 0; // index for output sequence |
---|
157 | |
---|
158 | for (int idx = 0; idx<filter_len && left_bases; ++idx) { |
---|
159 | if (filt->use_position(idx)) { |
---|
160 | char c = toupper(simplify[safeCharIndex(isequence[idx])]); |
---|
161 | AP_PROTEINS p = APP_ILLEGAL; |
---|
162 | |
---|
163 | #if defined(SHOW_SEQ) |
---|
164 | fputc(c, stdout); |
---|
165 | #endif // SHOW_SEQ |
---|
166 | |
---|
167 | if (c >= 'A' && c <= 'Z') p = prot2AP_PROTEIN[c-'A']; |
---|
168 | else if (c == '-') p = APP_GAP; |
---|
169 | else if (c == '.') p = APP_X; |
---|
170 | else if (c == '*') p = APP_STAR; |
---|
171 | |
---|
172 | if (p == APP_ILLEGAL) { |
---|
173 | GB_warning(GBS_global_string("Illegal sequence character '%c' replaced by gap", c)); |
---|
174 | p = APP_GAP; |
---|
175 | } |
---|
176 | |
---|
177 | seq_prot[oidx++] = p; |
---|
178 | --left_bases; |
---|
179 | } |
---|
180 | } |
---|
181 | |
---|
182 | ap_assert(oidx == sequence_len); |
---|
183 | seq_prot[sequence_len] = APP_ILLEGAL; |
---|
184 | |
---|
185 | #if defined(SHOW_SEQ) |
---|
186 | fputc('\n', stdout); |
---|
187 | #endif // SHOW_SEQ |
---|
188 | |
---|
189 | mark_sequence_set(true); |
---|
190 | } |
---|
191 | |
---|
192 | void AP_sequence_protein::unset() { |
---|
193 | delete [] seq_prot; |
---|
194 | seq_prot = 0; |
---|
195 | mark_sequence_set(false); |
---|
196 | } |
---|
197 | |
---|
198 | #if !defined(AP_PARSIMONY_DEFAULTS_HXX) |
---|
199 | #error AP_parsimony_defaults.hxx not included |
---|
200 | #endif // AP_PARSIMONY_DEFAULTS_HXX |
---|
201 | |
---|
202 | |
---|
203 | AP_FLOAT AP_sequence_protein::combine(const AP_sequence *lefts, const AP_sequence *rights, char *mutation_per_site) { |
---|
204 | // this works similar to AP_sequence_parsimony::combine |
---|
205 | |
---|
206 | const AP_sequence_protein *left = (const AP_sequence_protein *)lefts; |
---|
207 | const AP_sequence_protein *right = (const AP_sequence_protein *)rights; |
---|
208 | |
---|
209 | size_t sequence_len = get_sequence_length(); |
---|
210 | if (!seq_prot) seq_prot = new AP_PROTEINS[sequence_len + 1]; |
---|
211 | |
---|
212 | const AP_PROTEINS *p1 = left->get_sequence(); |
---|
213 | const AP_PROTEINS *p2 = right->get_sequence(); |
---|
214 | AP_PROTEINS *p = seq_prot; |
---|
215 | const AP_weights *weights = get_weights(); |
---|
216 | char *mutpsite = mutation_per_site; |
---|
217 | |
---|
218 | long result = 0; |
---|
219 | // check if initialized for correct instance of translator: |
---|
220 | ap_assert(min_mutations_initialized_for_codenr == AWT_get_user_translator()->CodeNr()); |
---|
221 | |
---|
222 | for (size_t idx = 0; idx<sequence_len; ++idx) { |
---|
223 | AP_PROTEINS c1 = p1[idx]; |
---|
224 | AP_PROTEINS c2 = p2[idx]; |
---|
225 | |
---|
226 | ap_assert(c1 != APP_ILLEGAL); |
---|
227 | ap_assert(c2 != APP_ILLEGAL); |
---|
228 | |
---|
229 | if ((c1&c2) == 0) { // proteins are distinct |
---|
230 | p[idx] = AP_PROTEINS(c1|c2); // mix distinct proteins |
---|
231 | |
---|
232 | int mutations = INT_MAX; |
---|
233 | |
---|
234 | if (p[idx]&APP_GAP) { // contains a gap |
---|
235 | mutations = 1; // count first gap as mutation |
---|
236 | // @@@ FIXME: rethink the line above. maybe it should be 3 mutations ? |
---|
237 | |
---|
238 | #if !defined(MULTIPLE_GAPS_ARE_MULTIPLE_MUTATIONS) |
---|
239 | // count multiple mutations as 1 mutation |
---|
240 | // this was an experiment (don't use it, it does not work!) |
---|
241 | if (idx>0 && (p[idx-1]&APP_GAP)) { // last position also contained gap.. |
---|
242 | if (((c1&APP_GAP) && (p1[idx-1]&APP_GAP)) || // ..in same sequence |
---|
243 | ((c2&APP_GAP) && (p2[idx-1]&APP_GAP))) |
---|
244 | { |
---|
245 | if (!(p1[idx-1]&APP_GAP) || !(p2[idx-1]&APP_GAP)) { // if one of the sequences had no gap at previous position |
---|
246 | mutations = 0; // skip multiple gaps |
---|
247 | } |
---|
248 | } |
---|
249 | } |
---|
250 | #endif // MULTIPLE_GAPS_ARE_MULTIPLE_MUTATIONS |
---|
251 | } |
---|
252 | else { |
---|
253 | for (int t1 = 0; t1<PROTEINS_TO_TEST && mutations>1; ++t1) { // with all proteins to test |
---|
254 | if (c1&prot2test[t1]) { // if protein is contained in subtree |
---|
255 | for (int t2 = 0; t2<PROTEINS_TO_TEST; ++t2) { |
---|
256 | if (c2&prot2test[t2]) { |
---|
257 | int mut = prot_mindist[t1][t2]; |
---|
258 | if (mut<mutations) { |
---|
259 | mutations = mut; |
---|
260 | if (mutations < 2) break; // minimum reached -- abort |
---|
261 | } |
---|
262 | } |
---|
263 | } |
---|
264 | } |
---|
265 | } |
---|
266 | } |
---|
267 | |
---|
268 | ap_assert(mutations >= 0 && mutations <= 3); |
---|
269 | |
---|
270 | if (mutpsite) mutpsite[idx] += mutations; // count mutations per site (unweighted) |
---|
271 | result += mutations * weights->weight(idx); // count weighted or simple |
---|
272 | |
---|
273 | } |
---|
274 | else { |
---|
275 | p[idx] = AP_PROTEINS(c1&c2); // store common proteins for both subtrees |
---|
276 | } |
---|
277 | |
---|
278 | #if !defined(PROPAGATE_GAPS_UPWARDS) |
---|
279 | // do not propagate mixed gaps upwards (they cause neg. branches) |
---|
280 | if (p[idx] & APP_GAP) { // contains gap |
---|
281 | if (p[idx] != APP_GAP) { // is not real gap |
---|
282 | p[idx] = AP_PROTEINS(p[idx]^APP_GAP); // remove the gap |
---|
283 | } |
---|
284 | } |
---|
285 | #endif // PROPAGATE_GAPS_UPWARDS |
---|
286 | } |
---|
287 | |
---|
288 | #if defined(DEBUG) && 0 |
---|
289 | #define P1 75 |
---|
290 | #define P2 90 |
---|
291 | printf("Seq1: "); |
---|
292 | for (long idx = P1; idx <= P2; ++idx) printf("%3i ", p1[idx]); |
---|
293 | printf("\nSeq2: "); |
---|
294 | for (long idx = P1; idx <= P2; ++idx) printf("%3i ", p2[idx]); |
---|
295 | printf("\nCombine value: %f\n", float(result)); |
---|
296 | #undef P1 |
---|
297 | #undef P2 |
---|
298 | #endif // DEBUG |
---|
299 | |
---|
300 | #if defined(DEBUG) && 0 |
---|
301 | printf("\nCombine value: %f\n", float(result)); |
---|
302 | #endif // DEBUG |
---|
303 | |
---|
304 | inc_combine_count(); |
---|
305 | mark_sequence_set(true); |
---|
306 | |
---|
307 | ap_assert(result >= 0.0); |
---|
308 | return (AP_FLOAT)result; |
---|
309 | } |
---|
310 | |
---|
311 | void AP_sequence_protein::partial_match(const AP_sequence* part_, long *overlapPtr, long *penaltyPtr) const { |
---|
312 | // matches the partial sequences 'part_' against 'this' |
---|
313 | // '*penaltyPtr' is set to the number of mismatches (possibly weighted) |
---|
314 | // '*overlapPtr' is set to the number of base positions both sequences overlap |
---|
315 | // example: |
---|
316 | // fullseq 'XXX---XXX' 'XXX---XXX' |
---|
317 | // partialseq '-XX---XX-' '---XXX---' |
---|
318 | // overlap 7 3 |
---|
319 | // |
---|
320 | // algorithm is similar to AP_sequence_protein::combine() |
---|
321 | |
---|
322 | const AP_sequence_protein *part = (const AP_sequence_protein *)part_; |
---|
323 | |
---|
324 | const AP_PROTEINS *pf = get_sequence(); |
---|
325 | const AP_PROTEINS *pp = part->get_sequence(); |
---|
326 | const AP_weights *weights = get_weights(); |
---|
327 | |
---|
328 | long min_end; // minimum of both last non-gap positions |
---|
329 | for (min_end = get_sequence_length()-1; min_end >= 0; --min_end) { |
---|
330 | AP_PROTEINS both = AP_PROTEINS(pf[min_end]|pp[min_end]); |
---|
331 | if (both != APP_GAP) { // last non-gap found |
---|
332 | if (pf[min_end] != APP_GAP) { // occurred in full sequence |
---|
333 | for (; min_end >= 0; --min_end) { // search same in partial sequence |
---|
334 | if (pp[min_end] != APP_GAP) break; |
---|
335 | } |
---|
336 | } |
---|
337 | else { |
---|
338 | ap_assert(pp[min_end] != APP_GAP); // occurred in partial sequence |
---|
339 | for (; min_end >= 0; --min_end) { // search same in full sequence |
---|
340 | if (pf[min_end] != APP_GAP) break; |
---|
341 | } |
---|
342 | } |
---|
343 | break; |
---|
344 | } |
---|
345 | } |
---|
346 | |
---|
347 | long penalty = 0; |
---|
348 | long overlap = 0; |
---|
349 | |
---|
350 | if (min_end >= 0) { |
---|
351 | long max_start; // maximum of both first non-gap positions |
---|
352 | for (max_start = 0; max_start <= min_end; ++max_start) { |
---|
353 | AP_PROTEINS both = AP_PROTEINS(pf[max_start]|pp[max_start]); |
---|
354 | if (both != APP_GAP) { // first non-gap found |
---|
355 | if (pf[max_start] != APP_GAP) { // occurred in full sequence |
---|
356 | for (; max_start <= min_end; ++max_start) { // search same in partial |
---|
357 | if (pp[max_start] != APP_GAP) break; |
---|
358 | } |
---|
359 | } |
---|
360 | else { |
---|
361 | ap_assert(pp[max_start] != APP_GAP); // occurred in partial sequence |
---|
362 | for (; max_start <= min_end; ++max_start) { // search same in full |
---|
363 | if (pf[max_start] != APP_GAP) break; |
---|
364 | } |
---|
365 | } |
---|
366 | break; |
---|
367 | } |
---|
368 | } |
---|
369 | |
---|
370 | if (max_start <= min_end) { // if sequences overlap |
---|
371 | for (long idx = max_start; idx <= min_end; ++idx) { |
---|
372 | if ((pf[idx]&pp[idx]) == 0) { // bases are distinct (aka mismatch) |
---|
373 | int mutations; |
---|
374 | if ((pf[idx]|pp[idx])&APP_GAP) { // one is a gap |
---|
375 | mutations = 3; |
---|
376 | } |
---|
377 | else { |
---|
378 | mutations = INT_MAX; |
---|
379 | for (int t1 = 0; t1<PROTEINS_TO_TEST && mutations>1; ++t1) { // with all proteins to test |
---|
380 | if (pf[idx] & prot2test[t1]) { // if protein is contained in subtree |
---|
381 | for (int t2 = 0; t2<PROTEINS_TO_TEST; ++t2) { |
---|
382 | if (pp[idx] & prot2test[t2]) { |
---|
383 | int mut = prot_mindist[t1][t2]; |
---|
384 | if (mut<mutations) { |
---|
385 | mutations = mut; |
---|
386 | if (mutations < 2) break; // minimum reached -- abort |
---|
387 | } |
---|
388 | } |
---|
389 | } |
---|
390 | } |
---|
391 | } |
---|
392 | } |
---|
393 | penalty += weights->weight(idx)*mutations; |
---|
394 | } |
---|
395 | } |
---|
396 | overlap = (min_end-max_start+1)*3; |
---|
397 | } |
---|
398 | } |
---|
399 | |
---|
400 | *overlapPtr = overlap; |
---|
401 | *penaltyPtr = penalty; |
---|
402 | } |
---|
403 | |
---|
404 | AP_FLOAT AP_sequence_protein::count_weighted_bases() const { |
---|
405 | AP_FLOAT wcount; |
---|
406 | const AP_PROTEINS *sequence = get_sequence(); |
---|
407 | |
---|
408 | if (!sequence) wcount = -1.0; |
---|
409 | else { |
---|
410 | long sum = 0; |
---|
411 | size_t sequence_len = get_sequence_length(); |
---|
412 | |
---|
413 | const AP_weights *weights = get_weights(); |
---|
414 | |
---|
415 | for (size_t idx = 0; idx<sequence_len; ++idx) { |
---|
416 | if (sequence[idx] != APP_GAP) { |
---|
417 | sum += weights->weight(idx); |
---|
418 | } |
---|
419 | } |
---|
420 | wcount = sum; |
---|
421 | } |
---|
422 | return wcount; |
---|
423 | } |
---|
424 | |
---|
425 | |
---|
426 | |
---|
427 | |
---|
428 | |
---|