| 1 | |
|---|
| 2 | #include <ctype.h> |
|---|
| 3 | // #include <malloc.h> |
|---|
| 4 | #include <stdlib.h> |
|---|
| 5 | |
|---|
| 6 | #include "ali_misc.hxx" |
|---|
| 7 | #include "ali_profile.hxx" |
|---|
| 8 | #include "ali_solution.hxx" |
|---|
| 9 | #include <BI_helix.hxx> |
|---|
| 10 | |
|---|
| 11 | |
|---|
| 12 | |
|---|
| 13 | GB_INLINE int ALI_PROFILE::is_binding_marker(char c) { |
|---|
| 14 | return (c == '~' || c == 'x'); |
|---|
| 15 | } |
|---|
| 16 | |
|---|
| 17 | |
|---|
| 18 | /* |
|---|
| 19 | * find the family in the pt server |
|---|
| 20 | */ |
|---|
| 21 | ALI_TLIST<ali_family_member *> *ALI_PROFILE::find_family(ALI_SEQUENCE *Sequence, |
|---|
| 22 | ALI_PROFILE_CONTEXT *context) |
|---|
| 23 | { |
|---|
| 24 | char message_buffer[100]; |
|---|
| 25 | ALI_PT &pt = (ALI_PT &) *(context->pt); |
|---|
| 26 | ALI_ARBDB &arbdb = (ALI_ARBDB &) *(context->arbdb); |
|---|
| 27 | ALI_SEQUENCE *seq; |
|---|
| 28 | ali_family_member *family_member; |
|---|
| 29 | ALI_TLIST<ali_family_member *> *family_list; |
|---|
| 30 | ALI_TLIST<ali_pt_member *> *pt_fam_list; |
|---|
| 31 | ALI_TLIST<ali_pt_member *> *pt_ext_list; |
|---|
| 32 | ali_pt_member *pt_member; |
|---|
| 33 | float weight, d; |
|---|
| 34 | unsigned long number; |
|---|
| 35 | |
|---|
| 36 | /* |
|---|
| 37 | * Initialisation |
|---|
| 38 | */ |
|---|
| 39 | family_list = new ALI_TLIST<ali_family_member *>; |
|---|
| 40 | |
|---|
| 41 | ali_message("Searching for the family"); |
|---|
| 42 | pt.find_family(Sequence,context->find_family_mode); |
|---|
| 43 | ali_message("Family found"); |
|---|
| 44 | |
|---|
| 45 | pt_fam_list = pt.get_family_list(); |
|---|
| 46 | pt_ext_list = pt.get_extension_list(); |
|---|
| 47 | |
|---|
| 48 | ali_message("Reading family:"); |
|---|
| 49 | |
|---|
| 50 | d = (context->ext_max_weight - 1.0) / (float) pt_fam_list->cardinality(); |
|---|
| 51 | |
|---|
| 52 | arbdb.begin_transaction(); |
|---|
| 53 | |
|---|
| 54 | /* |
|---|
| 55 | * calculate the real family members |
|---|
| 56 | */ |
|---|
| 57 | number = 0; |
|---|
| 58 | while (!pt_fam_list->is_empty()) { |
|---|
| 59 | pt_member = pt_fam_list->first(); |
|---|
| 60 | seq = arbdb.get_sequence(pt_member->name,context->mark_family_flag); |
|---|
| 61 | if (seq) { |
|---|
| 62 | weight = 1 + d * number; |
|---|
| 63 | sprintf(message_buffer,"%s (weight = %f, matches = %d)", |
|---|
| 64 | pt_member->name,weight,pt_member->matches); |
|---|
| 65 | ali_message(message_buffer); |
|---|
| 66 | family_member = new ali_family_member(seq, |
|---|
| 67 | (float) pt_member->matches, |
|---|
| 68 | weight); |
|---|
| 69 | family_list->append_end(family_member); |
|---|
| 70 | number++; |
|---|
| 71 | } |
|---|
| 72 | else { |
|---|
| 73 | ali_warning("Sequence not found in Database (Sequence ignored)"); |
|---|
| 74 | } |
|---|
| 75 | pt_fam_list->delete_element(); |
|---|
| 76 | } |
|---|
| 77 | delete pt_fam_list; |
|---|
| 78 | |
|---|
| 79 | ali_message("Reading family extension:"); |
|---|
| 80 | |
|---|
| 81 | d = -1.0 * context->ext_max_weight / (float) pt_ext_list->cardinality(); |
|---|
| 82 | |
|---|
| 83 | /* |
|---|
| 84 | * calculate the extension of the family |
|---|
| 85 | */ |
|---|
| 86 | number = 0; |
|---|
| 87 | while (!pt_ext_list->is_empty()) { |
|---|
| 88 | pt_member = pt_ext_list->first(); |
|---|
| 89 | seq = arbdb.get_sequence(pt_member->name, |
|---|
| 90 | context->mark_family_extension_flag); |
|---|
| 91 | if (seq) { |
|---|
| 92 | weight = context->ext_max_weight + d * number; |
|---|
| 93 | sprintf(message_buffer,"%s (weight = %f, matches = %d)", |
|---|
| 94 | pt_member->name,weight,pt_member->matches); |
|---|
| 95 | ali_message(message_buffer); |
|---|
| 96 | family_member = new ali_family_member(seq, |
|---|
| 97 | (float) pt_member->matches, |
|---|
| 98 | weight); |
|---|
| 99 | family_list->append_end(family_member); |
|---|
| 100 | number++; |
|---|
| 101 | } |
|---|
| 102 | else { |
|---|
| 103 | ali_warning("Sequence not found in Database (Sequence ignored)"); |
|---|
| 104 | } |
|---|
| 105 | pt_ext_list->delete_element(); |
|---|
| 106 | } |
|---|
| 107 | delete pt_ext_list; |
|---|
| 108 | |
|---|
| 109 | arbdb.commit_transaction(); |
|---|
| 110 | |
|---|
| 111 | return family_list; |
|---|
| 112 | } |
|---|
| 113 | |
|---|
| 114 | /* |
|---|
| 115 | * calculate the costs for aligning against a family |
|---|
| 116 | */ |
|---|
| 117 | void ALI_PROFILE::calculate_costs(ALI_TLIST<ali_family_member *> *family_list, |
|---|
| 118 | ALI_PROFILE_CONTEXT *context) |
|---|
| 119 | { |
|---|
| 120 | ali_family_member *family_member; |
|---|
| 121 | float a[7], w[7], w_sum, sm[7][7]; |
|---|
| 122 | float base_gap_weights[5], w_bg_sum; |
|---|
| 123 | long members; |
|---|
| 124 | size_t p; |
|---|
| 125 | int i; |
|---|
| 126 | size_t j; |
|---|
| 127 | unsigned long *l; |
|---|
| 128 | float *g; |
|---|
| 129 | unsigned char **seq; |
|---|
| 130 | long *seq_len; |
|---|
| 131 | float (*w_Del)[], (*percent)[]; |
|---|
| 132 | |
|---|
| 133 | /* |
|---|
| 134 | * allocate temporary memory |
|---|
| 135 | */ |
|---|
| 136 | members = family_list->cardinality(); |
|---|
| 137 | l = (unsigned long *) CALLOC((unsigned int) members,sizeof(long)); |
|---|
| 138 | g = (float *) CALLOC((unsigned int) members,sizeof(float)); |
|---|
| 139 | seq = (unsigned char ** ) CALLOC((unsigned int) members,sizeof(char *)); |
|---|
| 140 | seq_len = (long *) CALLOC((unsigned int) members,sizeof(long)); |
|---|
| 141 | if (l == 0 || g == 0 || seq == 0 || seq_len == 0) |
|---|
| 142 | ali_fatal_error("Out of memory"); |
|---|
| 143 | |
|---|
| 144 | /* |
|---|
| 145 | * initialize arrays |
|---|
| 146 | */ |
|---|
| 147 | family_member = family_list->first(); |
|---|
| 148 | prof_len = family_member->sequence->length(); |
|---|
| 149 | seq[0] = family_member->sequence->sequence(); |
|---|
| 150 | seq_len[0] = family_member->sequence->length(); |
|---|
| 151 | g[0] = family_member->weight; |
|---|
| 152 | i = 1; |
|---|
| 153 | sub_costs_maximum = 0.0; |
|---|
| 154 | |
|---|
| 155 | while (family_list->is_next()) { |
|---|
| 156 | family_member = family_list->next(); |
|---|
| 157 | seq[i] = family_member->sequence->sequence(); |
|---|
| 158 | seq_len[i] = family_member->sequence->length(); |
|---|
| 159 | g[i] = family_member->weight; |
|---|
| 160 | i++; |
|---|
| 161 | if (prof_len < family_member->sequence->length()) { |
|---|
| 162 | ali_warning("Family members have different length"); |
|---|
| 163 | prof_len = family_member->sequence->length(); |
|---|
| 164 | } |
|---|
| 165 | } |
|---|
| 166 | |
|---|
| 167 | /* |
|---|
| 168 | * Calculate the substitution cost matrix |
|---|
| 169 | */ |
|---|
| 170 | for (i = 0; i < 5; i++) |
|---|
| 171 | for (j = 0; j < 5; j++) |
|---|
| 172 | sm[i][j] = context->substitute_matrix[i][j]; |
|---|
| 173 | |
|---|
| 174 | /* |
|---|
| 175 | * Initialize l-array (position of last base) |
|---|
| 176 | */ |
|---|
| 177 | for (i = 0; i < members; i++) |
|---|
| 178 | l[i] = prof_len + 1; |
|---|
| 179 | |
|---|
| 180 | /* |
|---|
| 181 | * allocate memory for costs |
|---|
| 182 | */ |
|---|
| 183 | |
|---|
| 184 | base_weights = (float (**) [4]) CALLOC((unsigned int) prof_len, sizeof(float [4])); |
|---|
| 185 | //base_weights = (float (*) [1][4]) CALLOC((unsigned int) prof_len, sizeof(float [4])); |
|---|
| 186 | sub_costs = (float (**) [6]) CALLOC((unsigned int) prof_len, sizeof(float [6])); |
|---|
| 187 | //sub_costs = (float (*) [1][6]) CALLOC((unsigned int) prof_len, sizeof(float [6])); |
|---|
| 188 | binding_costs = (float (*) [5][5]) CALLOC((unsigned int) 5, sizeof(float [5])); |
|---|
| 189 | lmin = (unsigned long *) CALLOC((unsigned int) prof_len, sizeof(unsigned long)); |
|---|
| 190 | lmax = (unsigned long *) CALLOC((unsigned int) prof_len, sizeof(unsigned long)); |
|---|
| 191 | gap_costs = (float ***) CALLOC((unsigned int) prof_len, sizeof(float *)); |
|---|
| 192 | //gap_costs = (float *(*)[1]) CALLOC((unsigned int) prof_len, sizeof(float *)); |
|---|
| 193 | gap_percents = (float***) CALLOC((unsigned int) prof_len, sizeof(float *)); |
|---|
| 194 | //gap_percents = (float*(*)[1]) CALLOC((unsigned int) prof_len, sizeof(float *)); |
|---|
| 195 | if (binding_costs == 0 || sub_costs == 0 || lmin == 0 || lmax == 0 || |
|---|
| 196 | gap_costs == 0 || gap_percents == 0 || base_weights == 0) { |
|---|
| 197 | ali_fatal_error("Out of memory"); |
|---|
| 198 | } |
|---|
| 199 | |
|---|
| 200 | /* |
|---|
| 201 | * Copy the binding costs matrix |
|---|
| 202 | */ |
|---|
| 203 | w_bind_maximum = context->binding_matrix[0][0]; |
|---|
| 204 | for (i = 0; i < 5; i++) |
|---|
| 205 | for (j = 0; j < 5; j++) { |
|---|
| 206 | (*binding_costs)[i][j] = context->binding_matrix[i][j]; |
|---|
| 207 | if ((*binding_costs)[i][j] > w_bind_maximum) |
|---|
| 208 | w_bind_maximum = (*binding_costs)[i][j]; |
|---|
| 209 | } |
|---|
| 210 | |
|---|
| 211 | /* |
|---|
| 212 | * calculate the costs for EVERY position |
|---|
| 213 | */ |
|---|
| 214 | ali_message("Calculating costs for substitution"); |
|---|
| 215 | for (p = 0; p < prof_len; p++) { |
|---|
| 216 | /* |
|---|
| 217 | * Initialisation |
|---|
| 218 | */ |
|---|
| 219 | for (i = 0; i < 7; i++) |
|---|
| 220 | a[i] = w[i] = sm[5][i] = sm[i][5] = sm[6][i] = sm[i][6] = 0.0; |
|---|
| 221 | for (i = 0; i < 6; i++) |
|---|
| 222 | (*sub_costs)[p][i] = 0.0; |
|---|
| 223 | w_sum = 0.0; |
|---|
| 224 | w_bg_sum = 0.0; |
|---|
| 225 | |
|---|
| 226 | /* |
|---|
| 227 | * Statistic consensus |
|---|
| 228 | */ |
|---|
| 229 | for (i = 0; i < members; i++) { |
|---|
| 230 | if (p < size_t(seq_len[i])) { |
|---|
| 231 | a[seq[i][p]]++; |
|---|
| 232 | w[seq[i][p]] += g[i]; |
|---|
| 233 | if (ali_is_real_base(seq[i][p])) |
|---|
| 234 | w_sum += g[i]; |
|---|
| 235 | if (ali_is_real_base(seq[i][p]) || ali_is_gap(seq[i][p])) |
|---|
| 236 | w_bg_sum += g[i]; |
|---|
| 237 | } |
|---|
| 238 | else { |
|---|
| 239 | a[ALI_DOT_CODE]++; |
|---|
| 240 | w[ALI_DOT_CODE] += g[i]; |
|---|
| 241 | } |
|---|
| 242 | } |
|---|
| 243 | |
|---|
| 244 | /* |
|---|
| 245 | * Relative weight of bases |
|---|
| 246 | */ |
|---|
| 247 | if (w_sum != 0) |
|---|
| 248 | for (i = 0; i < 4; i++) |
|---|
| 249 | (*base_weights)[p][i] = w[i] / w_sum; |
|---|
| 250 | else |
|---|
| 251 | for (i = 0; i < 4; i++) |
|---|
| 252 | (*base_weights)[p][i] = 0.25; |
|---|
| 253 | |
|---|
| 254 | /* |
|---|
| 255 | * Relative weight of bases and gaps |
|---|
| 256 | */ |
|---|
| 257 | if (w_bg_sum != 0) |
|---|
| 258 | for (i = 0; i < 5; i++) |
|---|
| 259 | base_gap_weights[i] = w[i] / w_bg_sum; |
|---|
| 260 | else |
|---|
| 261 | for (i = 0; i < 5; i++) |
|---|
| 262 | base_gap_weights[i] = 0.2; |
|---|
| 263 | |
|---|
| 264 | /* |
|---|
| 265 | * Expandation of substitute matrix (for 'n') |
|---|
| 266 | */ |
|---|
| 267 | for (j = 0; j < 5; j++) { |
|---|
| 268 | for (i = 0; i < 4; i++) { |
|---|
| 269 | sm[5][j] += (*base_weights)[p][i] * sm[i][j]; |
|---|
| 270 | sm[j][5] += (*base_weights)[p][i] * sm[j][i]; |
|---|
| 271 | } |
|---|
| 272 | } |
|---|
| 273 | for (i = 0; i < 4; i++) |
|---|
| 274 | sm[5][5] += (*base_weights)[p][i] * sm[i][i]; |
|---|
| 275 | |
|---|
| 276 | /* |
|---|
| 277 | * Expandation of substitute matrix (for '.') |
|---|
| 278 | */ |
|---|
| 279 | for (j = 0; j < 6; j++) |
|---|
| 280 | for (i = 0; i < 5; i++) { |
|---|
| 281 | sm[6][j] += base_gap_weights[i] * sm[i][j]; |
|---|
| 282 | sm[j][6] += base_gap_weights[i] * sm[j][i]; |
|---|
| 283 | } |
|---|
| 284 | for (i = 0; i < 5; i++) |
|---|
| 285 | sm[6][6] += base_gap_weights[i] * sm[i][i]; |
|---|
| 286 | |
|---|
| 287 | /* |
|---|
| 288 | * Substitution costs |
|---|
| 289 | */ |
|---|
| 290 | for (i = 0; i < members; i++) { |
|---|
| 291 | if (p < size_t(seq_len[i])) { |
|---|
| 292 | for (j = 0; j < 6; j++) { |
|---|
| 293 | (*sub_costs)[p][j] += g[i] * sm[seq[i][p]][j]; |
|---|
| 294 | } |
|---|
| 295 | } else { |
|---|
| 296 | for (j = 0; j < 6; j++) { |
|---|
| 297 | (*sub_costs)[p][j] += g[i] * sm[ALI_DOT_CODE][j]; |
|---|
| 298 | } |
|---|
| 299 | } |
|---|
| 300 | } |
|---|
| 301 | for (j = 0; j < 6; j++) { |
|---|
| 302 | (*sub_costs)[p][j] /= members; |
|---|
| 303 | if ((*sub_costs)[p][j] > sub_costs_maximum) |
|---|
| 304 | sub_costs_maximum = (*sub_costs)[p][j]; |
|---|
| 305 | } |
|---|
| 306 | |
|---|
| 307 | /* |
|---|
| 308 | * Calculate dynamic deletion costs and percents of real gaps |
|---|
| 309 | */ |
|---|
| 310 | lmax[p] = 0; |
|---|
| 311 | lmin[p] = p; |
|---|
| 312 | for (i = 0; i < members; i++) |
|---|
| 313 | if (l[i] < p) { |
|---|
| 314 | if (lmin[p] > l[i]) |
|---|
| 315 | lmin[p] = l[i]; |
|---|
| 316 | if (lmax[p] < l[i]) |
|---|
| 317 | lmax[p] = l[i]; |
|---|
| 318 | } |
|---|
| 319 | if (lmin[p] == p && lmax[p] == 0) { |
|---|
| 320 | lmin[p] = lmax[p] = p; |
|---|
| 321 | } |
|---|
| 322 | |
|---|
| 323 | w_Del = (float (*) []) CALLOC((unsigned int) (lmax[p]-lmin[p]+2), sizeof(float)); |
|---|
| 324 | percent = (float (*) []) CALLOC((unsigned int) (lmax[p]-lmin[p]+2), sizeof(float)); |
|---|
| 325 | if (w_Del == 0 || percent == 0) |
|---|
| 326 | ali_fatal_error("Out of memory"); |
|---|
| 327 | (*gap_costs)[p] = (float *) w_Del; |
|---|
| 328 | (*gap_percents)[p] = (float *) percent; |
|---|
| 329 | |
|---|
| 330 | /* |
|---|
| 331 | * Calculate dynamic deletion costs |
|---|
| 332 | */ |
|---|
| 333 | for (j = 0; j <= lmax[p] - lmin[p] + 1; j++) { |
|---|
| 334 | (*w_Del)[j] = 0; |
|---|
| 335 | for (i = 0; i < members; i++) { |
|---|
| 336 | /* |
|---|
| 337 | * Normal case |
|---|
| 338 | */ |
|---|
| 339 | if (p < size_t(seq_len[i]) /* && !ali_is_dot(seq[i][p]) */) { |
|---|
| 340 | if (l[i] == prof_len + 1 || l[i] >= j + lmin[p]) { |
|---|
| 341 | (*w_Del)[j] += g[i] * sm[seq[i][p]][4] * context->multi_gap_factor; |
|---|
| 342 | }else{ |
|---|
| 343 | (*w_Del)[j] += g[i] * sm[seq[i][p]][4]; |
|---|
| 344 | } |
|---|
| 345 | } |
|---|
| 346 | /* |
|---|
| 347 | * expand sequence with dots |
|---|
| 348 | */ |
|---|
| 349 | else { |
|---|
| 350 | if (l[i] >= j + lmin[p] && l[i] < prof_len+1) { |
|---|
| 351 | (*w_Del)[j] += g[i] * sm[ALI_DOT_CODE][4] * context->multi_gap_factor; |
|---|
| 352 | }else{ |
|---|
| 353 | (*w_Del)[j] += g[i] * sm[ALI_DOT_CODE][4]; |
|---|
| 354 | } |
|---|
| 355 | } |
|---|
| 356 | } |
|---|
| 357 | (*w_Del)[j] /= members; |
|---|
| 358 | } |
|---|
| 359 | |
|---|
| 360 | /* |
|---|
| 361 | * Update the l-array |
|---|
| 362 | */ |
|---|
| 363 | for (i = 0; i < members; i++){ |
|---|
| 364 | if (!ali_is_gap(seq[i][p])) |
|---|
| 365 | l[i] = p; |
|---|
| 366 | } |
|---|
| 367 | |
|---|
| 368 | /* |
|---|
| 369 | * Calculate percents of real gaps |
|---|
| 370 | */ |
|---|
| 371 | for (j = 0; j <= lmax[p] - lmin[p] + 1; j++) { |
|---|
| 372 | (*percent)[j] = 0; |
|---|
| 373 | for (i = 0; i < members; i++) { |
|---|
| 374 | if (l[i] >= j + lmin[p] && l[i] < prof_len+1) { |
|---|
| 375 | (*percent)[j] += 1.0; |
|---|
| 376 | } |
|---|
| 377 | } |
|---|
| 378 | (*percent)[j] /= members; |
|---|
| 379 | } |
|---|
| 380 | } |
|---|
| 381 | |
|---|
| 382 | ali_message("Calculation finished"); |
|---|
| 383 | |
|---|
| 384 | free((char *) l); |
|---|
| 385 | free((char *) g); |
|---|
| 386 | free((char *) seq); |
|---|
| 387 | free((char *) seq_len); |
|---|
| 388 | } |
|---|
| 389 | |
|---|
| 390 | /* |
|---|
| 391 | * find the next helix |
|---|
| 392 | */ |
|---|
| 393 | int ALI_PROFILE::find_next_helix(char h[], unsigned long h_len, |
|---|
| 394 | unsigned long pos, |
|---|
| 395 | unsigned long *helix_nr, |
|---|
| 396 | unsigned long *start, unsigned long *end) |
|---|
| 397 | { |
|---|
| 398 | unsigned long i; |
|---|
| 399 | |
|---|
| 400 | for (i = pos; i < h_len && !isdigit(h[i]); i++) ; |
|---|
| 401 | if (i >= h_len) return -1; |
|---|
| 402 | |
|---|
| 403 | *start = i; |
|---|
| 404 | sscanf(&h[i],"%ld",helix_nr); |
|---|
| 405 | for (; i < h_len && isdigit(h[i]); i++) ; |
|---|
| 406 | for (; i < h_len && !isdigit(h[i]); i++) ; |
|---|
| 407 | *end = i - 1; |
|---|
| 408 | |
|---|
| 409 | return 0; |
|---|
| 410 | } |
|---|
| 411 | |
|---|
| 412 | /* |
|---|
| 413 | * find the complementary part of a helix |
|---|
| 414 | */ |
|---|
| 415 | int ALI_PROFILE::find_comp_helix(char h[], unsigned long h_len, |
|---|
| 416 | unsigned long pos, unsigned long helix_nr, |
|---|
| 417 | unsigned long *start, unsigned long *end) |
|---|
| 418 | { |
|---|
| 419 | unsigned long nr, i; |
|---|
| 420 | |
|---|
| 421 | i = pos; |
|---|
| 422 | do { |
|---|
| 423 | for (; i < h_len && !isdigit(h[i]); i++) ; |
|---|
| 424 | if (i >= h_len) return -1; |
|---|
| 425 | *start = i; |
|---|
| 426 | sscanf(&h[i],"%ld",&nr); |
|---|
| 427 | for (; i < h_len && isdigit(h[i]); i++) ; |
|---|
| 428 | } while (helix_nr != nr); |
|---|
| 429 | |
|---|
| 430 | for (; i < h_len && !isdigit(h[i]); i++) ; |
|---|
| 431 | *end = i - 1; |
|---|
| 432 | |
|---|
| 433 | return 0; |
|---|
| 434 | } |
|---|
| 435 | |
|---|
| 436 | void ALI_PROFILE::delete_comp_helix(char h1[], char h2[], unsigned long h_len, |
|---|
| 437 | unsigned long start, unsigned long end) |
|---|
| 438 | { |
|---|
| 439 | unsigned long i; |
|---|
| 440 | |
|---|
| 441 | for (i = start; i < h_len && i <= end; i++) { |
|---|
| 442 | h1[i] = '.'; |
|---|
| 443 | h2[i] = '.'; |
|---|
| 444 | } |
|---|
| 445 | } |
|---|
| 446 | |
|---|
| 447 | #if 0 |
|---|
| 448 | |
|---|
| 449 | int ALI_PROFILE::map_helix(char h[], unsigned long h_len, |
|---|
| 450 | unsigned long start1, unsigned long end1, |
|---|
| 451 | unsigned long start2, unsigned long end2) |
|---|
| 452 | { |
|---|
| 453 | unsigned long p1, p2; |
|---|
| 454 | unsigned long last1, last2; |
|---|
| 455 | |
|---|
| 456 | if (end1 >= h_len || end2 >= h_len || start1 > end1 || start2 > end2) |
|---|
| 457 | ali_fatal_error("Inconsistent parameters","ALI_PROFILE::map_helix()"); |
|---|
| 458 | |
|---|
| 459 | p1 = start1; |
|---|
| 460 | for (p2 = end2; p2 >= start2 && !is_binding_marker(h[p2]); p2--); |
|---|
| 461 | |
|---|
| 462 | (*helix_borders)[p1] = ALI_PROFILE_BORDER_LEFT; |
|---|
| 463 | (*helix_borders)[p2] = ALI_PROFILE_BORDER_RIGHT; |
|---|
| 464 | last1 = p1; |
|---|
| 465 | last2 = p2; |
|---|
| 466 | while (p1 <= end1 && p2 >= start2) { |
|---|
| 467 | (*helix)[p1] = p2; |
|---|
| 468 | (*helix)[p2] = p1; |
|---|
| 469 | last1 = p1; |
|---|
| 470 | last2 = p2; |
|---|
| 471 | for (p1++; p1 <= end1 && !is_binding_marker(h[p1]); p1++); |
|---|
| 472 | for (p2--; p2 >= start2 && !is_binding_marker(h[p2]); p2--); |
|---|
| 473 | } |
|---|
| 474 | (*helix_borders)[last1] = ALI_PROFILE_BORDER_RIGHT; |
|---|
| 475 | (*helix_borders)[last2] = ALI_PROFILE_BORDER_LEFT; |
|---|
| 476 | |
|---|
| 477 | if (p1 <= end1 || p2 >= start2) |
|---|
| 478 | return -1; |
|---|
| 479 | |
|---|
| 480 | return 0; |
|---|
| 481 | } |
|---|
| 482 | #endif |
|---|
| 483 | |
|---|
| 484 | /* |
|---|
| 485 | * initialize the array, representing the helix |
|---|
| 486 | */ |
|---|
| 487 | void ALI_PROFILE::initialize_helix(ALI_PROFILE_CONTEXT *context) |
|---|
| 488 | { |
|---|
| 489 | const char *error_string; |
|---|
| 490 | BI_helix bi_helix; |
|---|
| 491 | |
|---|
| 492 | unsigned long i; |
|---|
| 493 | |
|---|
| 494 | /* |
|---|
| 495 | * read helix |
|---|
| 496 | */ |
|---|
| 497 | if ((error_string = bi_helix.init(context->arbdb->gb_main)) != 0) |
|---|
| 498 | ali_warning(error_string); |
|---|
| 499 | |
|---|
| 500 | helix_len = bi_helix.size(); |
|---|
| 501 | helix = (long **) CALLOC((unsigned int) helix_len, sizeof(long)); |
|---|
| 502 | helix_borders = (char **) CALLOC((unsigned int) helix_len, sizeof(long)); |
|---|
| 503 | if (helix == 0 || helix_borders == 0) ali_fatal_error("Out of memory"); |
|---|
| 504 | |
|---|
| 505 | /* |
|---|
| 506 | * convert helix for internal use |
|---|
| 507 | */ |
|---|
| 508 | for (i = 0; i < helix_len; i++) |
|---|
| 509 | if (bi_helix.pairtype(i) == HELIX_PAIR) |
|---|
| 510 | (*helix)[i] = bi_helix.opposite_position(i); |
|---|
| 511 | else |
|---|
| 512 | (*helix)[i] = -1; |
|---|
| 513 | } |
|---|
| 514 | |
|---|
| 515 | |
|---|
| 516 | ALI_PROFILE::ALI_PROFILE(ALI_SEQUENCE *seq, ALI_PROFILE_CONTEXT *context) |
|---|
| 517 | { |
|---|
| 518 | char message_buffer[100]; |
|---|
| 519 | ali_family_member *family_member; |
|---|
| 520 | ALI_TLIST<ali_family_member *> *family_list; |
|---|
| 521 | |
|---|
| 522 | norm_sequence = new ALI_NORM_SEQUENCE(seq); |
|---|
| 523 | |
|---|
| 524 | multi_gap_factor = context->multi_gap_factor; |
|---|
| 525 | |
|---|
| 526 | initialize_helix(context); |
|---|
| 527 | |
|---|
| 528 | family_list = find_family(seq,context); |
|---|
| 529 | if (family_list->is_empty()) { |
|---|
| 530 | ali_error("Family not found (maybe incompatible PT and DB Servers)"); |
|---|
| 531 | } |
|---|
| 532 | |
|---|
| 533 | calculate_costs(family_list,context); |
|---|
| 534 | |
|---|
| 535 | insert_cost = sub_costs_maximum * context->insert_factor; |
|---|
| 536 | multi_insert_cost = insert_cost * context->multi_insert_factor; |
|---|
| 537 | |
|---|
| 538 | sprintf(message_buffer,"Multi gap factor = %f",multi_gap_factor); |
|---|
| 539 | ali_message(message_buffer); |
|---|
| 540 | sprintf(message_buffer,"Maximal substitution costs = %f",sub_costs_maximum); |
|---|
| 541 | ali_message(message_buffer); |
|---|
| 542 | sprintf(message_buffer,"Normal insert costs = %f",insert_cost); |
|---|
| 543 | ali_message(message_buffer); |
|---|
| 544 | sprintf(message_buffer,"Multiple insert costs = %f",multi_insert_cost); |
|---|
| 545 | ali_message(message_buffer); |
|---|
| 546 | |
|---|
| 547 | /* |
|---|
| 548 | * Delete the family list |
|---|
| 549 | */ |
|---|
| 550 | family_member = family_list->first(); |
|---|
| 551 | delete family_member->sequence; |
|---|
| 552 | delete family_member; |
|---|
| 553 | while (family_list->is_next()) { |
|---|
| 554 | family_member = family_list->next(); |
|---|
| 555 | delete family_member->sequence; |
|---|
| 556 | delete family_member; |
|---|
| 557 | } |
|---|
| 558 | delete family_list; |
|---|
| 559 | } |
|---|
| 560 | |
|---|
| 561 | ALI_PROFILE::~ALI_PROFILE(void) |
|---|
| 562 | { |
|---|
| 563 | size_t i; |
|---|
| 564 | |
|---|
| 565 | if (helix) |
|---|
| 566 | free((char *) helix); |
|---|
| 567 | if (helix_borders) |
|---|
| 568 | free((char *) helix_borders); |
|---|
| 569 | if (binding_costs) |
|---|
| 570 | free((char *) binding_costs); |
|---|
| 571 | if (sub_costs) |
|---|
| 572 | free((char *) sub_costs); |
|---|
| 573 | if (gap_costs) { |
|---|
| 574 | for (i = 0; i < prof_len; i++) |
|---|
| 575 | if ((*gap_costs)[i]) |
|---|
| 576 | free((char *) (*gap_costs)[i]); |
|---|
| 577 | free((char *) gap_costs); |
|---|
| 578 | } |
|---|
| 579 | if (gap_percents) { |
|---|
| 580 | for (i = 0; i < prof_len; i++) |
|---|
| 581 | if ((*gap_percents)[i]) |
|---|
| 582 | free((char *) (*gap_percents)[i]); |
|---|
| 583 | free((char *) gap_percents); |
|---|
| 584 | } |
|---|
| 585 | if (lmin) |
|---|
| 586 | free((char *) lmin); |
|---|
| 587 | if (lmax) |
|---|
| 588 | free((char *) lmax); |
|---|
| 589 | if (norm_sequence) |
|---|
| 590 | delete norm_sequence; |
|---|
| 591 | } |
|---|
| 592 | |
|---|
| 593 | |
|---|
| 594 | /* |
|---|
| 595 | * test whether a position is inside a helix |
|---|
| 596 | */ |
|---|
| 597 | int ALI_PROFILE::is_in_helix(unsigned long pos, |
|---|
| 598 | unsigned long *first, unsigned long *last) { |
|---|
| 599 | long i; |
|---|
| 600 | |
|---|
| 601 | if (pos > helix_len) |
|---|
| 602 | return 0; |
|---|
| 603 | |
|---|
| 604 | switch ((*helix_borders)[pos]) { |
|---|
| 605 | case ALI_PROFILE_BORDER_LEFT: |
|---|
| 606 | *first = pos; |
|---|
| 607 | for (i = (long) pos + 1; i < (long) prof_len; i++) |
|---|
| 608 | if ((*helix_borders)[i] == ALI_PROFILE_BORDER_RIGHT) { |
|---|
| 609 | *last = (unsigned long) i; |
|---|
| 610 | return 1; |
|---|
| 611 | } |
|---|
| 612 | ali_warning("Helix borders inconsistent (1)"); |
|---|
| 613 | return 0; |
|---|
| 614 | case ALI_PROFILE_BORDER_RIGHT: |
|---|
| 615 | *last = pos; |
|---|
| 616 | for (i = (long) pos - 1; i >= 0; i--) |
|---|
| 617 | if ((*helix_borders)[i] == ALI_PROFILE_BORDER_LEFT) { |
|---|
| 618 | *first = (unsigned long) i; |
|---|
| 619 | return 1; |
|---|
| 620 | } |
|---|
| 621 | ali_warning("Helix borders inconsistent (2)"); |
|---|
| 622 | return 0; |
|---|
| 623 | default: |
|---|
| 624 | for (i = (long) pos - 1; i >= 0; i--) |
|---|
| 625 | switch ((*helix_borders)[i]) { |
|---|
| 626 | case ALI_PROFILE_BORDER_RIGHT: |
|---|
| 627 | return 0; |
|---|
| 628 | case ALI_PROFILE_BORDER_LEFT: |
|---|
| 629 | *first = (unsigned long) i; |
|---|
| 630 | for (i = (long) pos + 1; i < (long) prof_len; i++) |
|---|
| 631 | switch ((*helix_borders)[i]) { |
|---|
| 632 | case ALI_PROFILE_BORDER_LEFT: |
|---|
| 633 | ali_warning("Helix borders inconsistent (3)"); |
|---|
| 634 | printf("pos = %ld\n",pos); |
|---|
| 635 | return 0; |
|---|
| 636 | case ALI_PROFILE_BORDER_RIGHT: |
|---|
| 637 | *last = (unsigned long) i; |
|---|
| 638 | return 1; |
|---|
| 639 | } |
|---|
| 640 | } |
|---|
| 641 | } |
|---|
| 642 | return 0; |
|---|
| 643 | } |
|---|
| 644 | |
|---|
| 645 | /* |
|---|
| 646 | * test, whether a position is outside a helix |
|---|
| 647 | */ |
|---|
| 648 | int ALI_PROFILE::is_outside_helix(unsigned long pos, |
|---|
| 649 | unsigned long *first, unsigned long *last) { |
|---|
| 650 | long i; |
|---|
| 651 | |
|---|
| 652 | switch ((*helix_borders)[pos]) { |
|---|
| 653 | case ALI_PROFILE_BORDER_LEFT: |
|---|
| 654 | return 0; |
|---|
| 655 | case ALI_PROFILE_BORDER_RIGHT: |
|---|
| 656 | return 0; |
|---|
| 657 | default: |
|---|
| 658 | for (i = (long) pos - 1; i >= 0; i--) |
|---|
| 659 | switch ((*helix_borders)[i]) { |
|---|
| 660 | case ALI_PROFILE_BORDER_LEFT: |
|---|
| 661 | return 0; |
|---|
| 662 | case ALI_PROFILE_BORDER_RIGHT: |
|---|
| 663 | *first = (unsigned long) i + 1; |
|---|
| 664 | for (i = (long) pos + 1; i < (long) prof_len; i++) |
|---|
| 665 | switch ((*helix_borders)[i]) { |
|---|
| 666 | case ALI_PROFILE_BORDER_LEFT: |
|---|
| 667 | *last = (unsigned long) i - 1; |
|---|
| 668 | return 1; |
|---|
| 669 | case ALI_PROFILE_BORDER_RIGHT: |
|---|
| 670 | ali_warning("Helix borders inconsistent [1]"); |
|---|
| 671 | return 0; |
|---|
| 672 | } |
|---|
| 673 | *last = prof_len - 1; |
|---|
| 674 | return 1; |
|---|
| 675 | } |
|---|
| 676 | *first = 0; |
|---|
| 677 | for (i = (long) pos + 1; i < (long) prof_len; i++) |
|---|
| 678 | switch ((*helix_borders)[i]) { |
|---|
| 679 | case ALI_PROFILE_BORDER_LEFT: |
|---|
| 680 | *last = (unsigned long) i - 1; |
|---|
| 681 | return 1; |
|---|
| 682 | case ALI_PROFILE_BORDER_RIGHT: |
|---|
| 683 | ali_warning("Helix borders inconsistent [2]"); |
|---|
| 684 | return 0; |
|---|
| 685 | } |
|---|
| 686 | *last = prof_len - 1; |
|---|
| 687 | return 1; |
|---|
| 688 | } |
|---|
| 689 | } |
|---|
| 690 | |
|---|
| 691 | |
|---|
| 692 | /* |
|---|
| 693 | * generate a 'consensus sequence' |
|---|
| 694 | */ |
|---|
| 695 | char *ALI_PROFILE::cheapest_sequence(void) |
|---|
| 696 | { |
|---|
| 697 | |
|---|
| 698 | char *seq; |
|---|
| 699 | size_t p; |
|---|
| 700 | int i, min_i; |
|---|
| 701 | float min; |
|---|
| 702 | |
|---|
| 703 | |
|---|
| 704 | seq = (char *) CALLOC((unsigned int) prof_len + 1, sizeof(char)); |
|---|
| 705 | if (seq == 0) |
|---|
| 706 | ali_fatal_error("Out of memory"); |
|---|
| 707 | |
|---|
| 708 | for (p = 0; p < prof_len; p++) { |
|---|
| 709 | min = (*sub_costs)[p][0]; |
|---|
| 710 | min_i = 0; |
|---|
| 711 | for (i = 1; i < 5; i++) { |
|---|
| 712 | if (min > (*sub_costs)[p][i]) { |
|---|
| 713 | min = (*sub_costs)[p][i]; |
|---|
| 714 | min_i = i; |
|---|
| 715 | } |
|---|
| 716 | else { |
|---|
| 717 | if (min == (*sub_costs)[p][i]) |
|---|
| 718 | min_i = -1; |
|---|
| 719 | } |
|---|
| 720 | } |
|---|
| 721 | if (min_i >= 0) |
|---|
| 722 | seq[p] = ali_number_to_base(min_i); |
|---|
| 723 | else { |
|---|
| 724 | if (min > 0) |
|---|
| 725 | seq[p] = '*'; |
|---|
| 726 | else |
|---|
| 727 | seq[p] = '.'; |
|---|
| 728 | } |
|---|
| 729 | } |
|---|
| 730 | seq[prof_len] = '\0'; |
|---|
| 731 | |
|---|
| 732 | return seq; |
|---|
| 733 | } |
|---|
| 734 | |
|---|
| 735 | /* |
|---|
| 736 | * calculate the costs of a binding |
|---|
| 737 | */ |
|---|
| 738 | float ALI_PROFILE::w_binding(unsigned long first_seq_pos, |
|---|
| 739 | ALI_SEQUENCE *seq) |
|---|
| 740 | { |
|---|
| 741 | unsigned long pos_1_seq, pos_2_seq, last_seq_pos; |
|---|
| 742 | long pos_compl; |
|---|
| 743 | float costs = 0; |
|---|
| 744 | |
|---|
| 745 | last_seq_pos = first_seq_pos + seq->length() - 1; |
|---|
| 746 | for (pos_1_seq = first_seq_pos; pos_1_seq <= last_seq_pos; pos_1_seq++) { |
|---|
| 747 | pos_compl = (*helix)[pos_1_seq]; |
|---|
| 748 | if (pos_compl >= 0) { |
|---|
| 749 | pos_2_seq = (unsigned long) pos_compl; |
|---|
| 750 | if (pos_2_seq > pos_1_seq && pos_2_seq <= last_seq_pos) |
|---|
| 751 | costs += w_bind(pos_1_seq, seq->base(pos_1_seq), |
|---|
| 752 | pos_2_seq, seq->base(pos_2_seq)); |
|---|
| 753 | else |
|---|
| 754 | if (pos_2_seq < first_seq_pos || pos_2_seq > last_seq_pos) |
|---|
| 755 | costs += w_bind_maximum; |
|---|
| 756 | } |
|---|
| 757 | } |
|---|
| 758 | |
|---|
| 759 | return costs; |
|---|
| 760 | } |
|---|
| 761 | |
|---|
| 762 | |
|---|