| 1 | // =============================================================== // |
|---|
| 2 | // // |
|---|
| 3 | // File : ali_profile.cxx // |
|---|
| 4 | // Purpose : // |
|---|
| 5 | // // |
|---|
| 6 | // Institute of Microbiology (Technical University Munich) // |
|---|
| 7 | // http://www.arb-home.de/ // |
|---|
| 8 | // // |
|---|
| 9 | // =============================================================== // |
|---|
| 10 | |
|---|
| 11 | #include "ali_profile.hxx" |
|---|
| 12 | |
|---|
| 13 | #include <BI_helix.hxx> |
|---|
| 14 | #include <cctype> |
|---|
| 15 | |
|---|
| 16 | |
|---|
| 17 | inline int ALI_PROFILE::is_binding_marker(char c) { |
|---|
| 18 | return (c == '~' || c == 'x'); |
|---|
| 19 | } |
|---|
| 20 | |
|---|
| 21 | |
|---|
| 22 | ALI_TLIST<ali_family_member *> *ALI_PROFILE::find_family(ALI_SEQUENCE *Sequence, ALI_PROFILE_CONTEXT *context) { |
|---|
| 23 | // find the family in the pt server |
|---|
| 24 | char message_buffer[100]; |
|---|
| 25 | ALI_PT &pt = (ALI_PT &) *(context->pt); |
|---|
| 26 | ALI_ARBDB &arbdb = (ALI_ARBDB &) *(context->arbdb); |
|---|
| 27 | ALI_SEQUENCE *seq; |
|---|
| 28 | ali_family_member *family_member; |
|---|
| 29 | ALI_TLIST<ali_family_member *> *family_list; |
|---|
| 30 | ALI_TLIST<ali_pt_member *> *pt_fam_list; |
|---|
| 31 | ALI_TLIST<ali_pt_member *> *pt_ext_list; |
|---|
| 32 | ali_pt_member *pt_member; |
|---|
| 33 | float weight, d; |
|---|
| 34 | unsigned long number; |
|---|
| 35 | |
|---|
| 36 | // Initialization |
|---|
| 37 | family_list = new ALI_TLIST<ali_family_member *>; |
|---|
| 38 | |
|---|
| 39 | ali_message("Searching for the family"); |
|---|
| 40 | pt.find_family(Sequence, context->find_family_mode); |
|---|
| 41 | ali_message("Family found"); |
|---|
| 42 | |
|---|
| 43 | pt_fam_list = pt.get_family_list(); |
|---|
| 44 | pt_ext_list = pt.get_extension_list(); |
|---|
| 45 | |
|---|
| 46 | ali_message("Reading family:"); |
|---|
| 47 | |
|---|
| 48 | d = (context->ext_max_weight - 1.0) / (float) pt_fam_list->cardinality(); |
|---|
| 49 | |
|---|
| 50 | arbdb.begin_transaction(); |
|---|
| 51 | |
|---|
| 52 | // calculate the real family members |
|---|
| 53 | number = 0; |
|---|
| 54 | while (!pt_fam_list->is_empty()) { |
|---|
| 55 | pt_member = pt_fam_list->first(); |
|---|
| 56 | seq = arbdb.get_sequence(pt_member->name, context->mark_family_flag); |
|---|
| 57 | if (seq) { |
|---|
| 58 | weight = 1 + d * number; |
|---|
| 59 | sprintf(message_buffer, "%s (weight = %f, matches = %d)", |
|---|
| 60 | pt_member->name, weight, pt_member->matches); |
|---|
| 61 | ali_message(message_buffer); |
|---|
| 62 | family_member = new ali_family_member(seq, |
|---|
| 63 | (float) pt_member->matches, |
|---|
| 64 | weight); |
|---|
| 65 | family_list->append_end(family_member); |
|---|
| 66 | number++; |
|---|
| 67 | } |
|---|
| 68 | else { |
|---|
| 69 | ali_warning("Sequence not found in Database (Sequence ignored)"); |
|---|
| 70 | } |
|---|
| 71 | pt_fam_list->delete_element(); |
|---|
| 72 | } |
|---|
| 73 | delete pt_fam_list; |
|---|
| 74 | |
|---|
| 75 | ali_message("Reading family extension:"); |
|---|
| 76 | |
|---|
| 77 | d = -1.0 * context->ext_max_weight / (float) pt_ext_list->cardinality(); |
|---|
| 78 | |
|---|
| 79 | // calculate the extension of the family |
|---|
| 80 | number = 0; |
|---|
| 81 | while (!pt_ext_list->is_empty()) { |
|---|
| 82 | pt_member = pt_ext_list->first(); |
|---|
| 83 | seq = arbdb.get_sequence(pt_member->name, |
|---|
| 84 | context->mark_family_extension_flag); |
|---|
| 85 | if (seq) { |
|---|
| 86 | weight = context->ext_max_weight + d * number; |
|---|
| 87 | sprintf(message_buffer, "%s (weight = %f, matches = %d)", |
|---|
| 88 | pt_member->name, weight, pt_member->matches); |
|---|
| 89 | ali_message(message_buffer); |
|---|
| 90 | family_member = new ali_family_member(seq, |
|---|
| 91 | (float) pt_member->matches, |
|---|
| 92 | weight); |
|---|
| 93 | family_list->append_end(family_member); |
|---|
| 94 | number++; |
|---|
| 95 | } |
|---|
| 96 | else { |
|---|
| 97 | ali_warning("Sequence not found in Database (Sequence ignored)"); |
|---|
| 98 | } |
|---|
| 99 | pt_ext_list->delete_element(); |
|---|
| 100 | } |
|---|
| 101 | delete pt_ext_list; |
|---|
| 102 | |
|---|
| 103 | arbdb.commit_transaction(); |
|---|
| 104 | |
|---|
| 105 | return family_list; |
|---|
| 106 | } |
|---|
| 107 | |
|---|
| 108 | void ALI_PROFILE::calculate_costs(ALI_TLIST<ali_family_member *> *family_list, ALI_PROFILE_CONTEXT *context) { |
|---|
| 109 | // calculate the costs for aligning against a family |
|---|
| 110 | ali_family_member *family_member; |
|---|
| 111 | float a[7], w[7], w_sum, sm[7][7]; |
|---|
| 112 | float base_gap_weights[5], w_bg_sum; |
|---|
| 113 | long members; |
|---|
| 114 | size_t p; |
|---|
| 115 | int i; |
|---|
| 116 | size_t j; |
|---|
| 117 | unsigned long *l; |
|---|
| 118 | float *g; |
|---|
| 119 | unsigned char **seq; |
|---|
| 120 | long *seq_len; |
|---|
| 121 | float (*w_Del)[], (*percent)[]; |
|---|
| 122 | |
|---|
| 123 | // allocate temporary memory |
|---|
| 124 | members = family_list->cardinality(); |
|---|
| 125 | l = (unsigned long *) CALLOC((unsigned int) members, sizeof(long)); |
|---|
| 126 | g = (float *) CALLOC((unsigned int) members, sizeof(float)); |
|---|
| 127 | seq = (unsigned char **) CALLOC((unsigned int) members, sizeof(char *)); |
|---|
| 128 | seq_len = (long *) CALLOC((unsigned int) members, sizeof(long)); |
|---|
| 129 | if (l == 0 || g == 0 || seq == 0 || seq_len == 0) |
|---|
| 130 | ali_fatal_error("Out of memory"); |
|---|
| 131 | |
|---|
| 132 | // initialize arrays |
|---|
| 133 | family_member = family_list->first(); |
|---|
| 134 | prof_len = family_member->sequence->length(); |
|---|
| 135 | seq[0] = family_member->sequence->sequence(); |
|---|
| 136 | seq_len[0] = family_member->sequence->length(); |
|---|
| 137 | g[0] = family_member->weight; |
|---|
| 138 | i = 1; |
|---|
| 139 | sub_costs_maximum = 0.0; |
|---|
| 140 | |
|---|
| 141 | while (family_list->is_next()) { |
|---|
| 142 | family_member = family_list->next(); |
|---|
| 143 | seq[i] = family_member->sequence->sequence(); |
|---|
| 144 | seq_len[i] = family_member->sequence->length(); |
|---|
| 145 | g[i] = family_member->weight; |
|---|
| 146 | i++; |
|---|
| 147 | if (prof_len < family_member->sequence->length()) { |
|---|
| 148 | ali_warning("Family members have different length"); |
|---|
| 149 | prof_len = family_member->sequence->length(); |
|---|
| 150 | } |
|---|
| 151 | } |
|---|
| 152 | |
|---|
| 153 | // Calculate the substitution cost matrix |
|---|
| 154 | for (i = 0; i < 5; i++) |
|---|
| 155 | for (j = 0; j < 5; j++) |
|---|
| 156 | sm[i][j] = context->substitute_matrix[i][j]; |
|---|
| 157 | |
|---|
| 158 | // Initialize l-array (position of last base) |
|---|
| 159 | for (i = 0; i < members; i++) |
|---|
| 160 | l[i] = prof_len + 1; |
|---|
| 161 | |
|---|
| 162 | // allocate memory for costs |
|---|
| 163 | |
|---|
| 164 | base_weights = (float (**) [4]) CALLOC((unsigned int) prof_len, sizeof(float [4])); |
|---|
| 165 | sub_costs = (float (**) [6]) CALLOC((unsigned int) prof_len, sizeof(float [6])); |
|---|
| 166 | binding_costs = (float (*) [5][5]) CALLOC((unsigned int) 5, sizeof(float [5])); |
|---|
| 167 | lmin = (unsigned long *) CALLOC((unsigned int) prof_len, sizeof(unsigned long)); |
|---|
| 168 | lmax = (unsigned long *) CALLOC((unsigned int) prof_len, sizeof(unsigned long)); |
|---|
| 169 | gap_costs = (float ***) CALLOC((unsigned int) prof_len, sizeof(float *)); |
|---|
| 170 | gap_percents = (float***) CALLOC((unsigned int) prof_len, sizeof(float *)); |
|---|
| 171 | |
|---|
| 172 | if (binding_costs == 0 || sub_costs == 0 || lmin == 0 || lmax == 0 || |
|---|
| 173 | gap_costs == 0 || gap_percents == 0 || base_weights == 0) { |
|---|
| 174 | ali_fatal_error("Out of memory"); |
|---|
| 175 | } |
|---|
| 176 | |
|---|
| 177 | // Copy the binding costs matrix |
|---|
| 178 | w_bind_maximum = context->binding_matrix[0][0]; |
|---|
| 179 | for (i = 0; i < 5; i++) |
|---|
| 180 | for (j = 0; j < 5; j++) { |
|---|
| 181 | (*binding_costs)[i][j] = context->binding_matrix[i][j]; |
|---|
| 182 | if ((*binding_costs)[i][j] > w_bind_maximum) |
|---|
| 183 | w_bind_maximum = (*binding_costs)[i][j]; |
|---|
| 184 | } |
|---|
| 185 | |
|---|
| 186 | // calculate the costs for EVERY position |
|---|
| 187 | ali_message("Calculating costs for substitution"); |
|---|
| 188 | for (p = 0; p < prof_len; p++) { |
|---|
| 189 | // Initialization |
|---|
| 190 | for (i = 0; i < 7; i++) |
|---|
| 191 | a[i] = w[i] = sm[5][i] = sm[i][5] = sm[6][i] = sm[i][6] = 0.0; |
|---|
| 192 | for (i = 0; i < 6; i++) |
|---|
| 193 | (*sub_costs)[p][i] = 0.0; |
|---|
| 194 | w_sum = 0.0; |
|---|
| 195 | w_bg_sum = 0.0; |
|---|
| 196 | |
|---|
| 197 | // Statistic consensus |
|---|
| 198 | for (i = 0; i < members; i++) { |
|---|
| 199 | if (p < size_t(seq_len[i])) { |
|---|
| 200 | a[seq[i][p]]++; |
|---|
| 201 | w[seq[i][p]] += g[i]; |
|---|
| 202 | if (ali_is_real_base(seq[i][p])) |
|---|
| 203 | w_sum += g[i]; |
|---|
| 204 | if (ali_is_real_base(seq[i][p]) || ali_is_gap(seq[i][p])) |
|---|
| 205 | w_bg_sum += g[i]; |
|---|
| 206 | } |
|---|
| 207 | else { |
|---|
| 208 | a[ALI_DOT_CODE]++; |
|---|
| 209 | w[ALI_DOT_CODE] += g[i]; |
|---|
| 210 | } |
|---|
| 211 | } |
|---|
| 212 | |
|---|
| 213 | // Relative weight of bases |
|---|
| 214 | if (w_sum != 0) |
|---|
| 215 | for (i = 0; i < 4; i++) |
|---|
| 216 | (*base_weights)[p][i] = w[i] / w_sum; |
|---|
| 217 | else |
|---|
| 218 | for (i = 0; i < 4; i++) |
|---|
| 219 | (*base_weights)[p][i] = 0.25; |
|---|
| 220 | |
|---|
| 221 | // Relative weight of bases and gaps |
|---|
| 222 | if (w_bg_sum != 0) |
|---|
| 223 | for (i = 0; i < 5; i++) |
|---|
| 224 | base_gap_weights[i] = w[i] / w_bg_sum; |
|---|
| 225 | else |
|---|
| 226 | for (i = 0; i < 5; i++) |
|---|
| 227 | base_gap_weights[i] = 0.2; |
|---|
| 228 | |
|---|
| 229 | // Expandation of substitute matrix (for 'n') |
|---|
| 230 | for (j = 0; j < 5; j++) { |
|---|
| 231 | for (i = 0; i < 4; i++) { |
|---|
| 232 | sm[5][j] += (*base_weights)[p][i] * sm[i][j]; |
|---|
| 233 | sm[j][5] += (*base_weights)[p][i] * sm[j][i]; |
|---|
| 234 | } |
|---|
| 235 | } |
|---|
| 236 | for (i = 0; i < 4; i++) |
|---|
| 237 | sm[5][5] += (*base_weights)[p][i] * sm[i][i]; |
|---|
| 238 | |
|---|
| 239 | // Expandation of substitute matrix (for '.') |
|---|
| 240 | for (j = 0; j < 6; j++) |
|---|
| 241 | for (i = 0; i < 5; i++) { |
|---|
| 242 | sm[6][j] += base_gap_weights[i] * sm[i][j]; |
|---|
| 243 | sm[j][6] += base_gap_weights[i] * sm[j][i]; |
|---|
| 244 | } |
|---|
| 245 | for (i = 0; i < 5; i++) |
|---|
| 246 | sm[6][6] += base_gap_weights[i] * sm[i][i]; |
|---|
| 247 | |
|---|
| 248 | // Substitution costs |
|---|
| 249 | for (i = 0; i < members; i++) { |
|---|
| 250 | if (p < size_t(seq_len[i])) { |
|---|
| 251 | for (j = 0; j < 6; j++) { |
|---|
| 252 | (*sub_costs)[p][j] += g[i] * sm[seq[i][p]][j]; |
|---|
| 253 | } |
|---|
| 254 | } |
|---|
| 255 | else { |
|---|
| 256 | for (j = 0; j < 6; j++) { |
|---|
| 257 | (*sub_costs)[p][j] += g[i] * sm[ALI_DOT_CODE][j]; |
|---|
| 258 | } |
|---|
| 259 | } |
|---|
| 260 | } |
|---|
| 261 | for (j = 0; j < 6; j++) { |
|---|
| 262 | (*sub_costs)[p][j] /= members; |
|---|
| 263 | if ((*sub_costs)[p][j] > sub_costs_maximum) |
|---|
| 264 | sub_costs_maximum = (*sub_costs)[p][j]; |
|---|
| 265 | } |
|---|
| 266 | |
|---|
| 267 | // Calculate dynamic deletion costs and percents of real gaps |
|---|
| 268 | lmax[p] = 0; |
|---|
| 269 | lmin[p] = p; |
|---|
| 270 | for (i = 0; i < members; i++) |
|---|
| 271 | if (l[i] < p) { |
|---|
| 272 | if (lmin[p] > l[i]) |
|---|
| 273 | lmin[p] = l[i]; |
|---|
| 274 | if (lmax[p] < l[i]) |
|---|
| 275 | lmax[p] = l[i]; |
|---|
| 276 | } |
|---|
| 277 | if (lmin[p] == p && lmax[p] == 0) { |
|---|
| 278 | lmin[p] = lmax[p] = p; |
|---|
| 279 | } |
|---|
| 280 | |
|---|
| 281 | w_Del = (float (*) []) CALLOC((unsigned int) (lmax[p]-lmin[p]+2), sizeof(float)); |
|---|
| 282 | percent = (float (*) []) CALLOC((unsigned int) (lmax[p]-lmin[p]+2), sizeof(float)); |
|---|
| 283 | if (w_Del == 0 || percent == 0) |
|---|
| 284 | ali_fatal_error("Out of memory"); |
|---|
| 285 | (*gap_costs)[p] = (float *) w_Del; |
|---|
| 286 | (*gap_percents)[p] = (float *) percent; |
|---|
| 287 | |
|---|
| 288 | // Calculate dynamic deletion costs |
|---|
| 289 | for (j = 0; j <= lmax[p] - lmin[p] + 1; j++) { |
|---|
| 290 | (*w_Del)[j] = 0; |
|---|
| 291 | for (i = 0; i < members; i++) { |
|---|
| 292 | // Normal case |
|---|
| 293 | if (p < size_t(seq_len[i])) { |
|---|
| 294 | if (l[i] == prof_len + 1 || l[i] >= j + lmin[p]) { |
|---|
| 295 | (*w_Del)[j] += g[i] * sm[seq[i][p]][4] * context->multi_gap_factor; |
|---|
| 296 | } |
|---|
| 297 | else { |
|---|
| 298 | (*w_Del)[j] += g[i] * sm[seq[i][p]][4]; |
|---|
| 299 | } |
|---|
| 300 | } |
|---|
| 301 | // expand sequence with dots |
|---|
| 302 | else { |
|---|
| 303 | if (l[i] >= j + lmin[p] && l[i] < prof_len+1) { |
|---|
| 304 | (*w_Del)[j] += g[i] * sm[ALI_DOT_CODE][4] * context->multi_gap_factor; |
|---|
| 305 | } |
|---|
| 306 | else { |
|---|
| 307 | (*w_Del)[j] += g[i] * sm[ALI_DOT_CODE][4]; |
|---|
| 308 | } |
|---|
| 309 | } |
|---|
| 310 | } |
|---|
| 311 | (*w_Del)[j] /= members; |
|---|
| 312 | } |
|---|
| 313 | |
|---|
| 314 | // Update the l-array |
|---|
| 315 | for (i = 0; i < members; i++) { |
|---|
| 316 | if (!ali_is_gap(seq[i][p])) |
|---|
| 317 | l[i] = p; |
|---|
| 318 | } |
|---|
| 319 | |
|---|
| 320 | // Calculate percents of real gaps |
|---|
| 321 | for (j = 0; j <= lmax[p] - lmin[p] + 1; j++) { |
|---|
| 322 | (*percent)[j] = 0; |
|---|
| 323 | for (i = 0; i < members; i++) { |
|---|
| 324 | if (l[i] >= j + lmin[p] && l[i] < prof_len+1) { |
|---|
| 325 | (*percent)[j] += 1.0; |
|---|
| 326 | } |
|---|
| 327 | } |
|---|
| 328 | (*percent)[j] /= members; |
|---|
| 329 | } |
|---|
| 330 | } |
|---|
| 331 | |
|---|
| 332 | ali_message("Calculation finished"); |
|---|
| 333 | |
|---|
| 334 | free(l); |
|---|
| 335 | free(g); |
|---|
| 336 | free(seq); |
|---|
| 337 | free(seq_len); |
|---|
| 338 | } |
|---|
| 339 | |
|---|
| 340 | int ALI_PROFILE::find_next_helix(char h[], unsigned long h_len, |
|---|
| 341 | unsigned long pos, |
|---|
| 342 | unsigned long *helix_nr, |
|---|
| 343 | unsigned long *start, unsigned long *end) |
|---|
| 344 | { |
|---|
| 345 | // find the next helix |
|---|
| 346 | unsigned long i; |
|---|
| 347 | |
|---|
| 348 | for (i = pos; i < h_len && !isdigit(h[i]); i++) ; |
|---|
| 349 | if (i >= h_len) return -1; |
|---|
| 350 | |
|---|
| 351 | *start = i; |
|---|
| 352 | sscanf(&h[i], "%ld", helix_nr); |
|---|
| 353 | for (; i < h_len && isdigit(h[i]); i++) ; |
|---|
| 354 | for (; i < h_len && !isdigit(h[i]); i++) ; |
|---|
| 355 | *end = i - 1; |
|---|
| 356 | |
|---|
| 357 | return 0; |
|---|
| 358 | } |
|---|
| 359 | |
|---|
| 360 | int ALI_PROFILE::find_comp_helix(char h[], unsigned long h_len, |
|---|
| 361 | unsigned long pos, unsigned long helix_nr, |
|---|
| 362 | unsigned long *start, unsigned long *end) |
|---|
| 363 | { |
|---|
| 364 | // find the complementary part of a helix |
|---|
| 365 | unsigned long nr, i; |
|---|
| 366 | |
|---|
| 367 | i = pos; |
|---|
| 368 | do { |
|---|
| 369 | for (; i < h_len && !isdigit(h[i]); i++) ; |
|---|
| 370 | if (i >= h_len) return -1; |
|---|
| 371 | *start = i; |
|---|
| 372 | sscanf(&h[i], "%ld", &nr); |
|---|
| 373 | for (; i < h_len && isdigit(h[i]); i++) ; |
|---|
| 374 | } while (helix_nr != nr); |
|---|
| 375 | |
|---|
| 376 | for (; i < h_len && !isdigit(h[i]); i++) ; |
|---|
| 377 | *end = i - 1; |
|---|
| 378 | |
|---|
| 379 | return 0; |
|---|
| 380 | } |
|---|
| 381 | |
|---|
| 382 | void ALI_PROFILE::delete_comp_helix(char h1[], char h2[], unsigned long h_len, |
|---|
| 383 | unsigned long start, unsigned long end) |
|---|
| 384 | { |
|---|
| 385 | unsigned long i; |
|---|
| 386 | |
|---|
| 387 | for (i = start; i < h_len && i <= end; i++) { |
|---|
| 388 | h1[i] = '.'; |
|---|
| 389 | h2[i] = '.'; |
|---|
| 390 | } |
|---|
| 391 | } |
|---|
| 392 | |
|---|
| 393 | |
|---|
| 394 | void ALI_PROFILE::initialize_helix(ALI_PROFILE_CONTEXT *context) { |
|---|
| 395 | // initialize the array, representing the helix |
|---|
| 396 | const char *error_string; |
|---|
| 397 | BI_helix bi_helix; |
|---|
| 398 | |
|---|
| 399 | unsigned long i; |
|---|
| 400 | |
|---|
| 401 | // read helix |
|---|
| 402 | if ((error_string = bi_helix.init(context->arbdb->gb_main)) != 0) |
|---|
| 403 | ali_warning(error_string); |
|---|
| 404 | |
|---|
| 405 | helix_len = bi_helix.size(); |
|---|
| 406 | helix = (long **) CALLOC((unsigned int) helix_len, sizeof(long)); |
|---|
| 407 | helix_borders = (char **) CALLOC((unsigned int) helix_len, sizeof(long)); |
|---|
| 408 | if (helix == 0 || helix_borders == 0) ali_fatal_error("Out of memory"); |
|---|
| 409 | |
|---|
| 410 | // convert helix for internal use |
|---|
| 411 | for (i = 0; i < helix_len; i++) |
|---|
| 412 | if (bi_helix.pairtype(i) == HELIX_PAIR) |
|---|
| 413 | (*helix)[i] = bi_helix.opposite_position(i); |
|---|
| 414 | else |
|---|
| 415 | (*helix)[i] = -1; |
|---|
| 416 | } |
|---|
| 417 | |
|---|
| 418 | |
|---|
| 419 | ALI_PROFILE::ALI_PROFILE(ALI_SEQUENCE *seq, ALI_PROFILE_CONTEXT *context) |
|---|
| 420 | { |
|---|
| 421 | char message_buffer[100]; |
|---|
| 422 | ali_family_member *family_member; |
|---|
| 423 | ALI_TLIST<ali_family_member *> *family_list; |
|---|
| 424 | |
|---|
| 425 | norm_sequence = new ALI_NORM_SEQUENCE(seq); |
|---|
| 426 | |
|---|
| 427 | multi_gap_factor = context->multi_gap_factor; |
|---|
| 428 | |
|---|
| 429 | initialize_helix(context); |
|---|
| 430 | |
|---|
| 431 | family_list = find_family(seq, context); |
|---|
| 432 | if (family_list->is_empty()) { |
|---|
| 433 | ali_error("Family not found (maybe incompatible PT and DB Servers)"); |
|---|
| 434 | } |
|---|
| 435 | |
|---|
| 436 | calculate_costs(family_list, context); |
|---|
| 437 | |
|---|
| 438 | insert_cost = sub_costs_maximum * context->insert_factor; |
|---|
| 439 | multi_insert_cost = insert_cost * context->multi_insert_factor; |
|---|
| 440 | |
|---|
| 441 | sprintf(message_buffer, "Multi gap factor = %f", multi_gap_factor); |
|---|
| 442 | ali_message(message_buffer); |
|---|
| 443 | sprintf(message_buffer, "Maximal substitution costs = %f", sub_costs_maximum); |
|---|
| 444 | ali_message(message_buffer); |
|---|
| 445 | sprintf(message_buffer, "Normal insert costs = %f", insert_cost); |
|---|
| 446 | ali_message(message_buffer); |
|---|
| 447 | sprintf(message_buffer, "Multiple insert costs = %f", multi_insert_cost); |
|---|
| 448 | ali_message(message_buffer); |
|---|
| 449 | |
|---|
| 450 | // Delete the family list |
|---|
| 451 | family_member = family_list->first(); |
|---|
| 452 | delete family_member->sequence; |
|---|
| 453 | delete family_member; |
|---|
| 454 | while (family_list->is_next()) { |
|---|
| 455 | family_member = family_list->next(); |
|---|
| 456 | delete family_member->sequence; |
|---|
| 457 | delete family_member; |
|---|
| 458 | } |
|---|
| 459 | delete family_list; |
|---|
| 460 | } |
|---|
| 461 | |
|---|
| 462 | ALI_PROFILE::~ALI_PROFILE() |
|---|
| 463 | { |
|---|
| 464 | size_t i; |
|---|
| 465 | |
|---|
| 466 | free(helix); |
|---|
| 467 | free(helix_borders); |
|---|
| 468 | free(binding_costs); |
|---|
| 469 | free(sub_costs); |
|---|
| 470 | if (gap_costs) { |
|---|
| 471 | for (i = 0; i < prof_len; i++) |
|---|
| 472 | if ((*gap_costs)[i]) |
|---|
| 473 | free((*gap_costs)[i]); |
|---|
| 474 | free(gap_costs); |
|---|
| 475 | } |
|---|
| 476 | if (gap_percents) { |
|---|
| 477 | for (i = 0; i < prof_len; i++) |
|---|
| 478 | if ((*gap_percents)[i]) |
|---|
| 479 | free((*gap_percents)[i]); |
|---|
| 480 | free(gap_percents); |
|---|
| 481 | } |
|---|
| 482 | free(lmin); |
|---|
| 483 | free(lmax); |
|---|
| 484 | delete norm_sequence; |
|---|
| 485 | } |
|---|
| 486 | |
|---|
| 487 | |
|---|
| 488 | int ALI_PROFILE::is_in_helix(unsigned long pos, unsigned long *first, unsigned long *last) { |
|---|
| 489 | // test whether a position is inside a helix |
|---|
| 490 | long i; |
|---|
| 491 | |
|---|
| 492 | if (pos > helix_len) |
|---|
| 493 | return 0; |
|---|
| 494 | |
|---|
| 495 | switch ((*helix_borders)[pos]) { |
|---|
| 496 | case ALI_PROFILE_BORDER_LEFT: |
|---|
| 497 | *first = pos; |
|---|
| 498 | for (i = (long) pos + 1; i < (long) prof_len; i++) |
|---|
| 499 | if ((*helix_borders)[i] == ALI_PROFILE_BORDER_RIGHT) { |
|---|
| 500 | *last = (unsigned long) i; |
|---|
| 501 | return 1; |
|---|
| 502 | } |
|---|
| 503 | ali_warning("Helix borders inconsistent (1)"); |
|---|
| 504 | return 0; |
|---|
| 505 | case ALI_PROFILE_BORDER_RIGHT: |
|---|
| 506 | *last = pos; |
|---|
| 507 | for (i = (long) pos - 1; i >= 0; i--) |
|---|
| 508 | if ((*helix_borders)[i] == ALI_PROFILE_BORDER_LEFT) { |
|---|
| 509 | *first = (unsigned long) i; |
|---|
| 510 | return 1; |
|---|
| 511 | } |
|---|
| 512 | ali_warning("Helix borders inconsistent (2)"); |
|---|
| 513 | return 0; |
|---|
| 514 | default: |
|---|
| 515 | for (i = (long) pos - 1; i >= 0; i--) |
|---|
| 516 | switch ((*helix_borders)[i]) { |
|---|
| 517 | case ALI_PROFILE_BORDER_RIGHT: |
|---|
| 518 | return 0; |
|---|
| 519 | case ALI_PROFILE_BORDER_LEFT: |
|---|
| 520 | *first = (unsigned long) i; |
|---|
| 521 | for (i = (long) pos + 1; i < (long) prof_len; i++) |
|---|
| 522 | switch ((*helix_borders)[i]) { |
|---|
| 523 | case ALI_PROFILE_BORDER_LEFT: |
|---|
| 524 | ali_warning("Helix borders inconsistent (3)"); |
|---|
| 525 | printf("pos = %ld\n", pos); |
|---|
| 526 | return 0; |
|---|
| 527 | case ALI_PROFILE_BORDER_RIGHT: |
|---|
| 528 | *last = (unsigned long) i; |
|---|
| 529 | return 1; |
|---|
| 530 | } |
|---|
| 531 | } |
|---|
| 532 | } |
|---|
| 533 | return 0; |
|---|
| 534 | } |
|---|
| 535 | |
|---|
| 536 | int ALI_PROFILE::is_outside_helix(unsigned long pos, unsigned long *first, unsigned long *last) { |
|---|
| 537 | // test, whether a position is outside a helix |
|---|
| 538 | long i; |
|---|
| 539 | |
|---|
| 540 | switch ((*helix_borders)[pos]) { |
|---|
| 541 | case ALI_PROFILE_BORDER_LEFT: |
|---|
| 542 | return 0; |
|---|
| 543 | case ALI_PROFILE_BORDER_RIGHT: |
|---|
| 544 | return 0; |
|---|
| 545 | default: |
|---|
| 546 | for (i = (long) pos - 1; i >= 0; i--) |
|---|
| 547 | switch ((*helix_borders)[i]) { |
|---|
| 548 | case ALI_PROFILE_BORDER_LEFT: |
|---|
| 549 | return 0; |
|---|
| 550 | case ALI_PROFILE_BORDER_RIGHT: |
|---|
| 551 | *first = (unsigned long) i + 1; |
|---|
| 552 | for (i = (long) pos + 1; i < (long) prof_len; i++) |
|---|
| 553 | switch ((*helix_borders)[i]) { |
|---|
| 554 | case ALI_PROFILE_BORDER_LEFT: |
|---|
| 555 | *last = (unsigned long) i - 1; |
|---|
| 556 | return 1; |
|---|
| 557 | case ALI_PROFILE_BORDER_RIGHT: |
|---|
| 558 | ali_warning("Helix borders inconsistent [1]"); |
|---|
| 559 | return 0; |
|---|
| 560 | } |
|---|
| 561 | *last = prof_len - 1; |
|---|
| 562 | return 1; |
|---|
| 563 | } |
|---|
| 564 | *first = 0; |
|---|
| 565 | for (i = (long) pos + 1; i < (long) prof_len; i++) |
|---|
| 566 | switch ((*helix_borders)[i]) { |
|---|
| 567 | case ALI_PROFILE_BORDER_LEFT: |
|---|
| 568 | *last = (unsigned long) i - 1; |
|---|
| 569 | return 1; |
|---|
| 570 | case ALI_PROFILE_BORDER_RIGHT: |
|---|
| 571 | ali_warning("Helix borders inconsistent [2]"); |
|---|
| 572 | return 0; |
|---|
| 573 | } |
|---|
| 574 | *last = prof_len - 1; |
|---|
| 575 | return 1; |
|---|
| 576 | } |
|---|
| 577 | } |
|---|
| 578 | |
|---|
| 579 | |
|---|
| 580 | char *ALI_PROFILE::cheapest_sequence() { |
|---|
| 581 | // generate a 'consensus sequence' |
|---|
| 582 | |
|---|
| 583 | char *seq; |
|---|
| 584 | size_t p; |
|---|
| 585 | int i, min_i; |
|---|
| 586 | float min; |
|---|
| 587 | |
|---|
| 588 | |
|---|
| 589 | seq = (char *) CALLOC((unsigned int) prof_len + 1, sizeof(char)); |
|---|
| 590 | if (seq == 0) |
|---|
| 591 | ali_fatal_error("Out of memory"); |
|---|
| 592 | |
|---|
| 593 | for (p = 0; p < prof_len; p++) { |
|---|
| 594 | min = (*sub_costs)[p][0]; |
|---|
| 595 | min_i = 0; |
|---|
| 596 | for (i = 1; i < 5; i++) { |
|---|
| 597 | if (min > (*sub_costs)[p][i]) { |
|---|
| 598 | min = (*sub_costs)[p][i]; |
|---|
| 599 | min_i = i; |
|---|
| 600 | } |
|---|
| 601 | else { |
|---|
| 602 | if (min == (*sub_costs)[p][i]) |
|---|
| 603 | min_i = -1; |
|---|
| 604 | } |
|---|
| 605 | } |
|---|
| 606 | if (min_i >= 0) |
|---|
| 607 | seq[p] = ali_number_to_base(min_i); |
|---|
| 608 | else { |
|---|
| 609 | if (min > 0) |
|---|
| 610 | seq[p] = '*'; |
|---|
| 611 | else |
|---|
| 612 | seq[p] = '.'; |
|---|
| 613 | } |
|---|
| 614 | } |
|---|
| 615 | seq[prof_len] = '\0'; |
|---|
| 616 | |
|---|
| 617 | return seq; |
|---|
| 618 | } |
|---|
| 619 | |
|---|
| 620 | float ALI_PROFILE::w_binding(unsigned long first_seq_pos, ALI_SEQUENCE *seq) { |
|---|
| 621 | // calculate the costs of a binding |
|---|
| 622 | unsigned long pos_1_seq, pos_2_seq, last_seq_pos; |
|---|
| 623 | long pos_compl; |
|---|
| 624 | float costs = 0; |
|---|
| 625 | |
|---|
| 626 | last_seq_pos = first_seq_pos + seq->length() - 1; |
|---|
| 627 | for (pos_1_seq = first_seq_pos; pos_1_seq <= last_seq_pos; pos_1_seq++) { |
|---|
| 628 | pos_compl = (*helix)[pos_1_seq]; |
|---|
| 629 | if (pos_compl >= 0) { |
|---|
| 630 | pos_2_seq = (unsigned long) pos_compl; |
|---|
| 631 | if (pos_2_seq > pos_1_seq && pos_2_seq <= last_seq_pos) |
|---|
| 632 | costs += w_bind(pos_1_seq, seq->base(pos_1_seq), |
|---|
| 633 | pos_2_seq, seq->base(pos_2_seq)); |
|---|
| 634 | else |
|---|
| 635 | if (pos_2_seq < first_seq_pos || pos_2_seq > last_seq_pos) |
|---|
| 636 | costs += w_bind_maximum; |
|---|
| 637 | } |
|---|
| 638 | } |
|---|
| 639 | |
|---|
| 640 | return costs; |
|---|
| 641 | } |
|---|
| 642 | |
|---|
| 643 | |
|---|