1 | // =============================================================== // |
---|
2 | // // |
---|
3 | // File : ali_profile.cxx // |
---|
4 | // Purpose : // |
---|
5 | // // |
---|
6 | // Institute of Microbiology (Technical University Munich) // |
---|
7 | // http://www.arb-home.de/ // |
---|
8 | // // |
---|
9 | // =============================================================== // |
---|
10 | |
---|
11 | #include "ali_profile.hxx" |
---|
12 | |
---|
13 | #include <BI_helix.hxx> |
---|
14 | #include <cctype> |
---|
15 | |
---|
16 | |
---|
17 | inline int ALI_PROFILE::is_binding_marker(char c) { |
---|
18 | return (c == '~' || c == 'x'); |
---|
19 | } |
---|
20 | |
---|
21 | |
---|
22 | ALI_TLIST<ali_family_member *> *ALI_PROFILE::find_family(ALI_SEQUENCE *Sequence, ALI_PROFILE_CONTEXT *context) { |
---|
23 | // find the family in the pt server |
---|
24 | char message_buffer[100]; |
---|
25 | ALI_PT &pt = (ALI_PT &) *(context->pt); |
---|
26 | ALI_ARBDB &arbdb = (ALI_ARBDB &) *(context->arbdb); |
---|
27 | ALI_SEQUENCE *seq; |
---|
28 | ali_family_member *family_member; |
---|
29 | ALI_TLIST<ali_family_member *> *family_list; |
---|
30 | ALI_TLIST<ali_pt_member *> *pt_fam_list; |
---|
31 | ALI_TLIST<ali_pt_member *> *pt_ext_list; |
---|
32 | ali_pt_member *pt_member; |
---|
33 | float weight, d; |
---|
34 | unsigned long number; |
---|
35 | |
---|
36 | // Initialization |
---|
37 | family_list = new ALI_TLIST<ali_family_member *>; |
---|
38 | |
---|
39 | ali_message("Searching for the family"); |
---|
40 | pt.find_family(Sequence, context->find_family_mode); |
---|
41 | ali_message("Family found"); |
---|
42 | |
---|
43 | pt_fam_list = pt.get_family_list(); |
---|
44 | pt_ext_list = pt.get_extension_list(); |
---|
45 | |
---|
46 | ali_message("Reading family:"); |
---|
47 | |
---|
48 | d = (context->ext_max_weight - 1.0) / (float) pt_fam_list->cardinality(); |
---|
49 | |
---|
50 | arbdb.begin_transaction(); |
---|
51 | |
---|
52 | // calculate the real family members |
---|
53 | number = 0; |
---|
54 | while (!pt_fam_list->is_empty()) { |
---|
55 | pt_member = pt_fam_list->first(); |
---|
56 | seq = arbdb.get_sequence(pt_member->name, context->mark_family_flag); |
---|
57 | if (seq) { |
---|
58 | weight = 1 + d * number; |
---|
59 | sprintf(message_buffer, "%s (weight = %f, matches = %d)", |
---|
60 | pt_member->name, weight, pt_member->matches); |
---|
61 | ali_message(message_buffer); |
---|
62 | family_member = new ali_family_member(seq, |
---|
63 | (float) pt_member->matches, |
---|
64 | weight); |
---|
65 | family_list->append_end(family_member); |
---|
66 | number++; |
---|
67 | } |
---|
68 | else { |
---|
69 | ali_warning("Sequence not found in Database (Sequence ignored)"); |
---|
70 | } |
---|
71 | pt_fam_list->delete_element(); |
---|
72 | } |
---|
73 | delete pt_fam_list; |
---|
74 | |
---|
75 | ali_message("Reading family extension:"); |
---|
76 | |
---|
77 | d = -1.0 * context->ext_max_weight / (float) pt_ext_list->cardinality(); |
---|
78 | |
---|
79 | // calculate the extension of the family |
---|
80 | number = 0; |
---|
81 | while (!pt_ext_list->is_empty()) { |
---|
82 | pt_member = pt_ext_list->first(); |
---|
83 | seq = arbdb.get_sequence(pt_member->name, |
---|
84 | context->mark_family_extension_flag); |
---|
85 | if (seq) { |
---|
86 | weight = context->ext_max_weight + d * number; |
---|
87 | sprintf(message_buffer, "%s (weight = %f, matches = %d)", |
---|
88 | pt_member->name, weight, pt_member->matches); |
---|
89 | ali_message(message_buffer); |
---|
90 | family_member = new ali_family_member(seq, |
---|
91 | (float) pt_member->matches, |
---|
92 | weight); |
---|
93 | family_list->append_end(family_member); |
---|
94 | number++; |
---|
95 | } |
---|
96 | else { |
---|
97 | ali_warning("Sequence not found in Database (Sequence ignored)"); |
---|
98 | } |
---|
99 | pt_ext_list->delete_element(); |
---|
100 | } |
---|
101 | delete pt_ext_list; |
---|
102 | |
---|
103 | arbdb.commit_transaction(); |
---|
104 | |
---|
105 | return family_list; |
---|
106 | } |
---|
107 | |
---|
108 | void ALI_PROFILE::calculate_costs(ALI_TLIST<ali_family_member *> *family_list, ALI_PROFILE_CONTEXT *context) { |
---|
109 | // calculate the costs for aligning against a family |
---|
110 | ali_family_member *family_member; |
---|
111 | float a[7], w[7], w_sum, sm[7][7]; |
---|
112 | float base_gap_weights[5], w_bg_sum; |
---|
113 | long members; |
---|
114 | size_t p; |
---|
115 | int i; |
---|
116 | size_t j; |
---|
117 | unsigned long *l; |
---|
118 | float *g; |
---|
119 | unsigned char **seq; |
---|
120 | long *seq_len; |
---|
121 | float (*w_Del)[], (*percent)[]; |
---|
122 | |
---|
123 | // allocate temporary memory |
---|
124 | members = family_list->cardinality(); |
---|
125 | l = (unsigned long *) CALLOC((unsigned int) members, sizeof(long)); |
---|
126 | g = (float *) CALLOC((unsigned int) members, sizeof(float)); |
---|
127 | seq = (unsigned char **) CALLOC((unsigned int) members, sizeof(char *)); |
---|
128 | seq_len = (long *) CALLOC((unsigned int) members, sizeof(long)); |
---|
129 | if (l == 0 || g == 0 || seq == 0 || seq_len == 0) |
---|
130 | ali_fatal_error("Out of memory"); |
---|
131 | |
---|
132 | // initialize arrays |
---|
133 | family_member = family_list->first(); |
---|
134 | prof_len = family_member->sequence->length(); |
---|
135 | seq[0] = family_member->sequence->sequence(); |
---|
136 | seq_len[0] = family_member->sequence->length(); |
---|
137 | g[0] = family_member->weight; |
---|
138 | i = 1; |
---|
139 | sub_costs_maximum = 0.0; |
---|
140 | |
---|
141 | while (family_list->is_next()) { |
---|
142 | family_member = family_list->next(); |
---|
143 | seq[i] = family_member->sequence->sequence(); |
---|
144 | seq_len[i] = family_member->sequence->length(); |
---|
145 | g[i] = family_member->weight; |
---|
146 | i++; |
---|
147 | if (prof_len < family_member->sequence->length()) { |
---|
148 | ali_warning("Family members have different length"); |
---|
149 | prof_len = family_member->sequence->length(); |
---|
150 | } |
---|
151 | } |
---|
152 | |
---|
153 | // Calculate the substitution cost matrix |
---|
154 | for (i = 0; i < 5; i++) |
---|
155 | for (j = 0; j < 5; j++) |
---|
156 | sm[i][j] = context->substitute_matrix[i][j]; |
---|
157 | |
---|
158 | // Initialize l-array (position of last base) |
---|
159 | for (i = 0; i < members; i++) |
---|
160 | l[i] = prof_len + 1; |
---|
161 | |
---|
162 | // allocate memory for costs |
---|
163 | |
---|
164 | base_weights = (float (**) [4]) CALLOC((unsigned int) prof_len, sizeof(float [4])); |
---|
165 | sub_costs = (float (**) [6]) CALLOC((unsigned int) prof_len, sizeof(float [6])); |
---|
166 | binding_costs = (float (*) [5][5]) CALLOC((unsigned int) 5, sizeof(float [5])); |
---|
167 | lmin = (unsigned long *) CALLOC((unsigned int) prof_len, sizeof(unsigned long)); |
---|
168 | lmax = (unsigned long *) CALLOC((unsigned int) prof_len, sizeof(unsigned long)); |
---|
169 | gap_costs = (float ***) CALLOC((unsigned int) prof_len, sizeof(float *)); |
---|
170 | gap_percents = (float***) CALLOC((unsigned int) prof_len, sizeof(float *)); |
---|
171 | |
---|
172 | if (binding_costs == 0 || sub_costs == 0 || lmin == 0 || lmax == 0 || |
---|
173 | gap_costs == 0 || gap_percents == 0 || base_weights == 0) { |
---|
174 | ali_fatal_error("Out of memory"); |
---|
175 | } |
---|
176 | |
---|
177 | // Copy the binding costs matrix |
---|
178 | w_bind_maximum = context->binding_matrix[0][0]; |
---|
179 | for (i = 0; i < 5; i++) |
---|
180 | for (j = 0; j < 5; j++) { |
---|
181 | (*binding_costs)[i][j] = context->binding_matrix[i][j]; |
---|
182 | if ((*binding_costs)[i][j] > w_bind_maximum) |
---|
183 | w_bind_maximum = (*binding_costs)[i][j]; |
---|
184 | } |
---|
185 | |
---|
186 | // calculate the costs for EVERY position |
---|
187 | ali_message("Calculating costs for substitution"); |
---|
188 | for (p = 0; p < prof_len; p++) { |
---|
189 | // Initialization |
---|
190 | for (i = 0; i < 7; i++) |
---|
191 | a[i] = w[i] = sm[5][i] = sm[i][5] = sm[6][i] = sm[i][6] = 0.0; |
---|
192 | for (i = 0; i < 6; i++) |
---|
193 | (*sub_costs)[p][i] = 0.0; |
---|
194 | w_sum = 0.0; |
---|
195 | w_bg_sum = 0.0; |
---|
196 | |
---|
197 | // Statistic consensus |
---|
198 | for (i = 0; i < members; i++) { |
---|
199 | if (p < size_t(seq_len[i])) { |
---|
200 | a[seq[i][p]]++; |
---|
201 | w[seq[i][p]] += g[i]; |
---|
202 | if (ali_is_real_base(seq[i][p])) |
---|
203 | w_sum += g[i]; |
---|
204 | if (ali_is_real_base(seq[i][p]) || ali_is_gap(seq[i][p])) |
---|
205 | w_bg_sum += g[i]; |
---|
206 | } |
---|
207 | else { |
---|
208 | a[ALI_DOT_CODE]++; |
---|
209 | w[ALI_DOT_CODE] += g[i]; |
---|
210 | } |
---|
211 | } |
---|
212 | |
---|
213 | // Relative weight of bases |
---|
214 | if (w_sum != 0) |
---|
215 | for (i = 0; i < 4; i++) |
---|
216 | (*base_weights)[p][i] = w[i] / w_sum; |
---|
217 | else |
---|
218 | for (i = 0; i < 4; i++) |
---|
219 | (*base_weights)[p][i] = 0.25; |
---|
220 | |
---|
221 | // Relative weight of bases and gaps |
---|
222 | if (w_bg_sum != 0) |
---|
223 | for (i = 0; i < 5; i++) |
---|
224 | base_gap_weights[i] = w[i] / w_bg_sum; |
---|
225 | else |
---|
226 | for (i = 0; i < 5; i++) |
---|
227 | base_gap_weights[i] = 0.2; |
---|
228 | |
---|
229 | // Expandation of substitute matrix (for 'n') |
---|
230 | for (j = 0; j < 5; j++) { |
---|
231 | for (i = 0; i < 4; i++) { |
---|
232 | sm[5][j] += (*base_weights)[p][i] * sm[i][j]; |
---|
233 | sm[j][5] += (*base_weights)[p][i] * sm[j][i]; |
---|
234 | } |
---|
235 | } |
---|
236 | for (i = 0; i < 4; i++) |
---|
237 | sm[5][5] += (*base_weights)[p][i] * sm[i][i]; |
---|
238 | |
---|
239 | // Expandation of substitute matrix (for '.') |
---|
240 | for (j = 0; j < 6; j++) |
---|
241 | for (i = 0; i < 5; i++) { |
---|
242 | sm[6][j] += base_gap_weights[i] * sm[i][j]; |
---|
243 | sm[j][6] += base_gap_weights[i] * sm[j][i]; |
---|
244 | } |
---|
245 | for (i = 0; i < 5; i++) |
---|
246 | sm[6][6] += base_gap_weights[i] * sm[i][i]; |
---|
247 | |
---|
248 | // Substitution costs |
---|
249 | for (i = 0; i < members; i++) { |
---|
250 | if (p < size_t(seq_len[i])) { |
---|
251 | for (j = 0; j < 6; j++) { |
---|
252 | (*sub_costs)[p][j] += g[i] * sm[seq[i][p]][j]; |
---|
253 | } |
---|
254 | } |
---|
255 | else { |
---|
256 | for (j = 0; j < 6; j++) { |
---|
257 | (*sub_costs)[p][j] += g[i] * sm[ALI_DOT_CODE][j]; |
---|
258 | } |
---|
259 | } |
---|
260 | } |
---|
261 | for (j = 0; j < 6; j++) { |
---|
262 | (*sub_costs)[p][j] /= members; |
---|
263 | if ((*sub_costs)[p][j] > sub_costs_maximum) |
---|
264 | sub_costs_maximum = (*sub_costs)[p][j]; |
---|
265 | } |
---|
266 | |
---|
267 | // Calculate dynamic deletion costs and percents of real gaps |
---|
268 | lmax[p] = 0; |
---|
269 | lmin[p] = p; |
---|
270 | for (i = 0; i < members; i++) |
---|
271 | if (l[i] < p) { |
---|
272 | if (lmin[p] > l[i]) |
---|
273 | lmin[p] = l[i]; |
---|
274 | if (lmax[p] < l[i]) |
---|
275 | lmax[p] = l[i]; |
---|
276 | } |
---|
277 | if (lmin[p] == p && lmax[p] == 0) { |
---|
278 | lmin[p] = lmax[p] = p; |
---|
279 | } |
---|
280 | |
---|
281 | w_Del = (float (*) []) CALLOC((unsigned int) (lmax[p]-lmin[p]+2), sizeof(float)); |
---|
282 | percent = (float (*) []) CALLOC((unsigned int) (lmax[p]-lmin[p]+2), sizeof(float)); |
---|
283 | if (w_Del == 0 || percent == 0) |
---|
284 | ali_fatal_error("Out of memory"); |
---|
285 | (*gap_costs)[p] = (float *) w_Del; |
---|
286 | (*gap_percents)[p] = (float *) percent; |
---|
287 | |
---|
288 | // Calculate dynamic deletion costs |
---|
289 | for (j = 0; j <= lmax[p] - lmin[p] + 1; j++) { |
---|
290 | (*w_Del)[j] = 0; |
---|
291 | for (i = 0; i < members; i++) { |
---|
292 | // Normal case |
---|
293 | if (p < size_t(seq_len[i])) { |
---|
294 | if (l[i] == prof_len + 1 || l[i] >= j + lmin[p]) { |
---|
295 | (*w_Del)[j] += g[i] * sm[seq[i][p]][4] * context->multi_gap_factor; |
---|
296 | } |
---|
297 | else { |
---|
298 | (*w_Del)[j] += g[i] * sm[seq[i][p]][4]; |
---|
299 | } |
---|
300 | } |
---|
301 | // expand sequence with dots |
---|
302 | else { |
---|
303 | if (l[i] >= j + lmin[p] && l[i] < prof_len+1) { |
---|
304 | (*w_Del)[j] += g[i] * sm[ALI_DOT_CODE][4] * context->multi_gap_factor; |
---|
305 | } |
---|
306 | else { |
---|
307 | (*w_Del)[j] += g[i] * sm[ALI_DOT_CODE][4]; |
---|
308 | } |
---|
309 | } |
---|
310 | } |
---|
311 | (*w_Del)[j] /= members; |
---|
312 | } |
---|
313 | |
---|
314 | // Update the l-array |
---|
315 | for (i = 0; i < members; i++) { |
---|
316 | if (!ali_is_gap(seq[i][p])) |
---|
317 | l[i] = p; |
---|
318 | } |
---|
319 | |
---|
320 | // Calculate percents of real gaps |
---|
321 | for (j = 0; j <= lmax[p] - lmin[p] + 1; j++) { |
---|
322 | (*percent)[j] = 0; |
---|
323 | for (i = 0; i < members; i++) { |
---|
324 | if (l[i] >= j + lmin[p] && l[i] < prof_len+1) { |
---|
325 | (*percent)[j] += 1.0; |
---|
326 | } |
---|
327 | } |
---|
328 | (*percent)[j] /= members; |
---|
329 | } |
---|
330 | } |
---|
331 | |
---|
332 | ali_message("Calculation finished"); |
---|
333 | |
---|
334 | free(l); |
---|
335 | free(g); |
---|
336 | free(seq); |
---|
337 | free(seq_len); |
---|
338 | } |
---|
339 | |
---|
340 | int ALI_PROFILE::find_next_helix(char h[], unsigned long h_len, |
---|
341 | unsigned long pos, |
---|
342 | unsigned long *helix_nr, |
---|
343 | unsigned long *start, unsigned long *end) |
---|
344 | { |
---|
345 | // find the next helix |
---|
346 | unsigned long i; |
---|
347 | |
---|
348 | for (i = pos; i < h_len && !isdigit(h[i]); i++) ; |
---|
349 | if (i >= h_len) return -1; |
---|
350 | |
---|
351 | *start = i; |
---|
352 | sscanf(&h[i], "%ld", helix_nr); |
---|
353 | for (; i < h_len && isdigit(h[i]); i++) ; |
---|
354 | for (; i < h_len && !isdigit(h[i]); i++) ; |
---|
355 | *end = i - 1; |
---|
356 | |
---|
357 | return 0; |
---|
358 | } |
---|
359 | |
---|
360 | int ALI_PROFILE::find_comp_helix(char h[], unsigned long h_len, |
---|
361 | unsigned long pos, unsigned long helix_nr, |
---|
362 | unsigned long *start, unsigned long *end) |
---|
363 | { |
---|
364 | // find the complementary part of a helix |
---|
365 | unsigned long nr, i; |
---|
366 | |
---|
367 | i = pos; |
---|
368 | do { |
---|
369 | for (; i < h_len && !isdigit(h[i]); i++) ; |
---|
370 | if (i >= h_len) return -1; |
---|
371 | *start = i; |
---|
372 | sscanf(&h[i], "%ld", &nr); |
---|
373 | for (; i < h_len && isdigit(h[i]); i++) ; |
---|
374 | } while (helix_nr != nr); |
---|
375 | |
---|
376 | for (; i < h_len && !isdigit(h[i]); i++) ; |
---|
377 | *end = i - 1; |
---|
378 | |
---|
379 | return 0; |
---|
380 | } |
---|
381 | |
---|
382 | void ALI_PROFILE::delete_comp_helix(char h1[], char h2[], unsigned long h_len, |
---|
383 | unsigned long start, unsigned long end) |
---|
384 | { |
---|
385 | unsigned long i; |
---|
386 | |
---|
387 | for (i = start; i < h_len && i <= end; i++) { |
---|
388 | h1[i] = '.'; |
---|
389 | h2[i] = '.'; |
---|
390 | } |
---|
391 | } |
---|
392 | |
---|
393 | |
---|
394 | void ALI_PROFILE::initialize_helix(ALI_PROFILE_CONTEXT *context) { |
---|
395 | // initialize the array, representing the helix |
---|
396 | const char *error_string; |
---|
397 | BI_helix bi_helix; |
---|
398 | |
---|
399 | unsigned long i; |
---|
400 | |
---|
401 | // read helix |
---|
402 | if ((error_string = bi_helix.init(context->arbdb->gb_main)) != 0) |
---|
403 | ali_warning(error_string); |
---|
404 | |
---|
405 | helix_len = bi_helix.size(); |
---|
406 | helix = (long **) CALLOC((unsigned int) helix_len, sizeof(long)); |
---|
407 | helix_borders = (char **) CALLOC((unsigned int) helix_len, sizeof(long)); |
---|
408 | if (helix == 0 || helix_borders == 0) ali_fatal_error("Out of memory"); |
---|
409 | |
---|
410 | // convert helix for internal use |
---|
411 | for (i = 0; i < helix_len; i++) |
---|
412 | if (bi_helix.pairtype(i) == HELIX_PAIR) |
---|
413 | (*helix)[i] = bi_helix.opposite_position(i); |
---|
414 | else |
---|
415 | (*helix)[i] = -1; |
---|
416 | } |
---|
417 | |
---|
418 | |
---|
419 | ALI_PROFILE::ALI_PROFILE(ALI_SEQUENCE *seq, ALI_PROFILE_CONTEXT *context) |
---|
420 | { |
---|
421 | char message_buffer[100]; |
---|
422 | ali_family_member *family_member; |
---|
423 | ALI_TLIST<ali_family_member *> *family_list; |
---|
424 | |
---|
425 | norm_sequence = new ALI_NORM_SEQUENCE(seq); |
---|
426 | |
---|
427 | multi_gap_factor = context->multi_gap_factor; |
---|
428 | |
---|
429 | initialize_helix(context); |
---|
430 | |
---|
431 | family_list = find_family(seq, context); |
---|
432 | if (family_list->is_empty()) { |
---|
433 | ali_error("Family not found (maybe incompatible PT and DB Servers)"); |
---|
434 | } |
---|
435 | |
---|
436 | calculate_costs(family_list, context); |
---|
437 | |
---|
438 | insert_cost = sub_costs_maximum * context->insert_factor; |
---|
439 | multi_insert_cost = insert_cost * context->multi_insert_factor; |
---|
440 | |
---|
441 | sprintf(message_buffer, "Multi gap factor = %f", multi_gap_factor); |
---|
442 | ali_message(message_buffer); |
---|
443 | sprintf(message_buffer, "Maximal substitution costs = %f", sub_costs_maximum); |
---|
444 | ali_message(message_buffer); |
---|
445 | sprintf(message_buffer, "Normal insert costs = %f", insert_cost); |
---|
446 | ali_message(message_buffer); |
---|
447 | sprintf(message_buffer, "Multiple insert costs = %f", multi_insert_cost); |
---|
448 | ali_message(message_buffer); |
---|
449 | |
---|
450 | // Delete the family list |
---|
451 | family_member = family_list->first(); |
---|
452 | delete family_member->sequence; |
---|
453 | delete family_member; |
---|
454 | while (family_list->is_next()) { |
---|
455 | family_member = family_list->next(); |
---|
456 | delete family_member->sequence; |
---|
457 | delete family_member; |
---|
458 | } |
---|
459 | delete family_list; |
---|
460 | } |
---|
461 | |
---|
462 | ALI_PROFILE::~ALI_PROFILE() |
---|
463 | { |
---|
464 | size_t i; |
---|
465 | |
---|
466 | free(helix); |
---|
467 | free(helix_borders); |
---|
468 | free(binding_costs); |
---|
469 | free(sub_costs); |
---|
470 | if (gap_costs) { |
---|
471 | for (i = 0; i < prof_len; i++) |
---|
472 | if ((*gap_costs)[i]) |
---|
473 | free((*gap_costs)[i]); |
---|
474 | free(gap_costs); |
---|
475 | } |
---|
476 | if (gap_percents) { |
---|
477 | for (i = 0; i < prof_len; i++) |
---|
478 | if ((*gap_percents)[i]) |
---|
479 | free((*gap_percents)[i]); |
---|
480 | free(gap_percents); |
---|
481 | } |
---|
482 | free(lmin); |
---|
483 | free(lmax); |
---|
484 | delete norm_sequence; |
---|
485 | } |
---|
486 | |
---|
487 | |
---|
488 | int ALI_PROFILE::is_in_helix(unsigned long pos, unsigned long *first, unsigned long *last) { |
---|
489 | // test whether a position is inside a helix |
---|
490 | long i; |
---|
491 | |
---|
492 | if (pos > helix_len) |
---|
493 | return 0; |
---|
494 | |
---|
495 | switch ((*helix_borders)[pos]) { |
---|
496 | case ALI_PROFILE_BORDER_LEFT: |
---|
497 | *first = pos; |
---|
498 | for (i = (long) pos + 1; i < (long) prof_len; i++) |
---|
499 | if ((*helix_borders)[i] == ALI_PROFILE_BORDER_RIGHT) { |
---|
500 | *last = (unsigned long) i; |
---|
501 | return 1; |
---|
502 | } |
---|
503 | ali_warning("Helix borders inconsistent (1)"); |
---|
504 | return 0; |
---|
505 | case ALI_PROFILE_BORDER_RIGHT: |
---|
506 | *last = pos; |
---|
507 | for (i = (long) pos - 1; i >= 0; i--) |
---|
508 | if ((*helix_borders)[i] == ALI_PROFILE_BORDER_LEFT) { |
---|
509 | *first = (unsigned long) i; |
---|
510 | return 1; |
---|
511 | } |
---|
512 | ali_warning("Helix borders inconsistent (2)"); |
---|
513 | return 0; |
---|
514 | default: |
---|
515 | for (i = (long) pos - 1; i >= 0; i--) |
---|
516 | switch ((*helix_borders)[i]) { |
---|
517 | case ALI_PROFILE_BORDER_RIGHT: |
---|
518 | return 0; |
---|
519 | case ALI_PROFILE_BORDER_LEFT: |
---|
520 | *first = (unsigned long) i; |
---|
521 | for (i = (long) pos + 1; i < (long) prof_len; i++) |
---|
522 | switch ((*helix_borders)[i]) { |
---|
523 | case ALI_PROFILE_BORDER_LEFT: |
---|
524 | ali_warning("Helix borders inconsistent (3)"); |
---|
525 | printf("pos = %ld\n", pos); |
---|
526 | return 0; |
---|
527 | case ALI_PROFILE_BORDER_RIGHT: |
---|
528 | *last = (unsigned long) i; |
---|
529 | return 1; |
---|
530 | } |
---|
531 | } |
---|
532 | } |
---|
533 | return 0; |
---|
534 | } |
---|
535 | |
---|
536 | int ALI_PROFILE::is_outside_helix(unsigned long pos, unsigned long *first, unsigned long *last) { |
---|
537 | // test, whether a position is outside a helix |
---|
538 | long i; |
---|
539 | |
---|
540 | switch ((*helix_borders)[pos]) { |
---|
541 | case ALI_PROFILE_BORDER_LEFT: |
---|
542 | return 0; |
---|
543 | case ALI_PROFILE_BORDER_RIGHT: |
---|
544 | return 0; |
---|
545 | default: |
---|
546 | for (i = (long) pos - 1; i >= 0; i--) |
---|
547 | switch ((*helix_borders)[i]) { |
---|
548 | case ALI_PROFILE_BORDER_LEFT: |
---|
549 | return 0; |
---|
550 | case ALI_PROFILE_BORDER_RIGHT: |
---|
551 | *first = (unsigned long) i + 1; |
---|
552 | for (i = (long) pos + 1; i < (long) prof_len; i++) |
---|
553 | switch ((*helix_borders)[i]) { |
---|
554 | case ALI_PROFILE_BORDER_LEFT: |
---|
555 | *last = (unsigned long) i - 1; |
---|
556 | return 1; |
---|
557 | case ALI_PROFILE_BORDER_RIGHT: |
---|
558 | ali_warning("Helix borders inconsistent [1]"); |
---|
559 | return 0; |
---|
560 | } |
---|
561 | *last = prof_len - 1; |
---|
562 | return 1; |
---|
563 | } |
---|
564 | *first = 0; |
---|
565 | for (i = (long) pos + 1; i < (long) prof_len; i++) |
---|
566 | switch ((*helix_borders)[i]) { |
---|
567 | case ALI_PROFILE_BORDER_LEFT: |
---|
568 | *last = (unsigned long) i - 1; |
---|
569 | return 1; |
---|
570 | case ALI_PROFILE_BORDER_RIGHT: |
---|
571 | ali_warning("Helix borders inconsistent [2]"); |
---|
572 | return 0; |
---|
573 | } |
---|
574 | *last = prof_len - 1; |
---|
575 | return 1; |
---|
576 | } |
---|
577 | } |
---|
578 | |
---|
579 | |
---|
580 | char *ALI_PROFILE::cheapest_sequence() { |
---|
581 | // generate a 'consensus sequence' |
---|
582 | |
---|
583 | char *seq; |
---|
584 | size_t p; |
---|
585 | int i, min_i; |
---|
586 | float min; |
---|
587 | |
---|
588 | |
---|
589 | seq = (char *) CALLOC((unsigned int) prof_len + 1, sizeof(char)); |
---|
590 | if (seq == 0) |
---|
591 | ali_fatal_error("Out of memory"); |
---|
592 | |
---|
593 | for (p = 0; p < prof_len; p++) { |
---|
594 | min = (*sub_costs)[p][0]; |
---|
595 | min_i = 0; |
---|
596 | for (i = 1; i < 5; i++) { |
---|
597 | if (min > (*sub_costs)[p][i]) { |
---|
598 | min = (*sub_costs)[p][i]; |
---|
599 | min_i = i; |
---|
600 | } |
---|
601 | else { |
---|
602 | if (min == (*sub_costs)[p][i]) |
---|
603 | min_i = -1; |
---|
604 | } |
---|
605 | } |
---|
606 | if (min_i >= 0) |
---|
607 | seq[p] = ali_number_to_base(min_i); |
---|
608 | else { |
---|
609 | if (min > 0) |
---|
610 | seq[p] = '*'; |
---|
611 | else |
---|
612 | seq[p] = '.'; |
---|
613 | } |
---|
614 | } |
---|
615 | seq[prof_len] = '\0'; |
---|
616 | |
---|
617 | return seq; |
---|
618 | } |
---|
619 | |
---|
620 | float ALI_PROFILE::w_binding(unsigned long first_seq_pos, ALI_SEQUENCE *seq) { |
---|
621 | // calculate the costs of a binding |
---|
622 | unsigned long pos_1_seq, pos_2_seq, last_seq_pos; |
---|
623 | long pos_compl; |
---|
624 | float costs = 0; |
---|
625 | |
---|
626 | last_seq_pos = first_seq_pos + seq->length() - 1; |
---|
627 | for (pos_1_seq = first_seq_pos; pos_1_seq <= last_seq_pos; pos_1_seq++) { |
---|
628 | pos_compl = (*helix)[pos_1_seq]; |
---|
629 | if (pos_compl >= 0) { |
---|
630 | pos_2_seq = (unsigned long) pos_compl; |
---|
631 | if (pos_2_seq > pos_1_seq && pos_2_seq <= last_seq_pos) |
---|
632 | costs += w_bind(pos_1_seq, seq->base(pos_1_seq), |
---|
633 | pos_2_seq, seq->base(pos_2_seq)); |
---|
634 | else |
---|
635 | if (pos_2_seq < first_seq_pos || pos_2_seq > last_seq_pos) |
---|
636 | costs += w_bind_maximum; |
---|
637 | } |
---|
638 | } |
---|
639 | |
---|
640 | return costs; |
---|
641 | } |
---|
642 | |
---|
643 | |
---|