| 1 | // =============================================================== // |
|---|
| 2 | // // |
|---|
| 3 | // File : ali_profile.cxx // |
|---|
| 4 | // Purpose : // |
|---|
| 5 | // // |
|---|
| 6 | // Institute of Microbiology (Technical University Munich) // |
|---|
| 7 | // http://www.arb-home.de/ // |
|---|
| 8 | // // |
|---|
| 9 | // =============================================================== // |
|---|
| 10 | |
|---|
| 11 | #include "ali_profile.hxx" |
|---|
| 12 | |
|---|
| 13 | #include <BI_helix.hxx> |
|---|
| 14 | #include <cctype> |
|---|
| 15 | |
|---|
| 16 | |
|---|
| 17 | inline int ALI_PROFILE::is_binding_marker(char c) { |
|---|
| 18 | return c == '~' || c == 'x'; |
|---|
| 19 | } |
|---|
| 20 | |
|---|
| 21 | |
|---|
| 22 | ALI_TLIST<ali_family_member *> *ALI_PROFILE::find_family(ALI_SEQUENCE *Sequence, ALI_PROFILE_CONTEXT *context) { |
|---|
| 23 | // find the family in the pt server |
|---|
| 24 | char message_buffer[100]; |
|---|
| 25 | ALI_PT &pt = (ALI_PT &) *(context->pt); |
|---|
| 26 | ALI_ARBDB &arbdb = (ALI_ARBDB &) *(context->arbdb); |
|---|
| 27 | ALI_SEQUENCE *seq; |
|---|
| 28 | ali_family_member *family_member; |
|---|
| 29 | ALI_TLIST<ali_family_member *> *family_list; |
|---|
| 30 | ALI_TLIST<ali_pt_member *> *pt_fam_list; |
|---|
| 31 | ALI_TLIST<ali_pt_member *> *pt_ext_list; |
|---|
| 32 | ali_pt_member *pt_member; |
|---|
| 33 | float weight, d; |
|---|
| 34 | unsigned long number; |
|---|
| 35 | |
|---|
| 36 | // Initialization |
|---|
| 37 | family_list = new ALI_TLIST<ali_family_member *>; |
|---|
| 38 | |
|---|
| 39 | ali_message("Searching for the family"); |
|---|
| 40 | pt.find_family(Sequence, context->find_family_mode); |
|---|
| 41 | ali_message("Family found"); |
|---|
| 42 | |
|---|
| 43 | pt_fam_list = pt.get_family_list(); |
|---|
| 44 | pt_ext_list = pt.get_extension_list(); |
|---|
| 45 | |
|---|
| 46 | ali_message("Reading family:"); |
|---|
| 47 | |
|---|
| 48 | d = (context->ext_max_weight - 1.0) / (float) pt_fam_list->cardinality(); |
|---|
| 49 | |
|---|
| 50 | arbdb.begin_transaction(); |
|---|
| 51 | |
|---|
| 52 | // calculate the real family members |
|---|
| 53 | number = 0; |
|---|
| 54 | while (!pt_fam_list->is_empty()) { |
|---|
| 55 | pt_member = pt_fam_list->first(); |
|---|
| 56 | seq = arbdb.get_sequence(pt_member->name, context->mark_family_flag); |
|---|
| 57 | if (seq) { |
|---|
| 58 | weight = 1 + d * number; |
|---|
| 59 | sprintf(message_buffer, "%s (weight = %f, matches = %d)", |
|---|
| 60 | pt_member->name, weight, pt_member->matches); |
|---|
| 61 | ali_message(message_buffer); |
|---|
| 62 | family_member = new ali_family_member(seq, |
|---|
| 63 | (float) pt_member->matches, |
|---|
| 64 | weight); |
|---|
| 65 | family_list->append_end(family_member); |
|---|
| 66 | number++; |
|---|
| 67 | } |
|---|
| 68 | else { |
|---|
| 69 | ali_warning("Sequence not found in Database (Sequence ignored)"); |
|---|
| 70 | } |
|---|
| 71 | pt_fam_list->delete_element(); |
|---|
| 72 | } |
|---|
| 73 | delete pt_fam_list; |
|---|
| 74 | |
|---|
| 75 | ali_message("Reading family extension:"); |
|---|
| 76 | |
|---|
| 77 | d = -1.0 * context->ext_max_weight / (float) pt_ext_list->cardinality(); |
|---|
| 78 | |
|---|
| 79 | // calculate the extension of the family |
|---|
| 80 | number = 0; |
|---|
| 81 | while (!pt_ext_list->is_empty()) { |
|---|
| 82 | pt_member = pt_ext_list->first(); |
|---|
| 83 | seq = arbdb.get_sequence(pt_member->name, |
|---|
| 84 | context->mark_family_extension_flag); |
|---|
| 85 | if (seq) { |
|---|
| 86 | weight = context->ext_max_weight + d * number; |
|---|
| 87 | sprintf(message_buffer, "%s (weight = %f, matches = %d)", |
|---|
| 88 | pt_member->name, weight, pt_member->matches); |
|---|
| 89 | ali_message(message_buffer); |
|---|
| 90 | family_member = new ali_family_member(seq, |
|---|
| 91 | (float) pt_member->matches, |
|---|
| 92 | weight); |
|---|
| 93 | family_list->append_end(family_member); |
|---|
| 94 | number++; |
|---|
| 95 | } |
|---|
| 96 | else { |
|---|
| 97 | ali_warning("Sequence not found in Database (Sequence ignored)"); |
|---|
| 98 | } |
|---|
| 99 | pt_ext_list->delete_element(); |
|---|
| 100 | } |
|---|
| 101 | delete pt_ext_list; |
|---|
| 102 | |
|---|
| 103 | arbdb.commit_transaction(); |
|---|
| 104 | |
|---|
| 105 | return family_list; |
|---|
| 106 | } |
|---|
| 107 | |
|---|
| 108 | void ALI_PROFILE::calculate_costs(ALI_TLIST<ali_family_member *> *family_list, ALI_PROFILE_CONTEXT *context) { |
|---|
| 109 | // calculate the costs for aligning against a family |
|---|
| 110 | ali_family_member *family_member; |
|---|
| 111 | float a[7], w[7], w_sum, sm[7][7]; |
|---|
| 112 | float base_gap_weights[5], w_bg_sum; |
|---|
| 113 | long members; |
|---|
| 114 | size_t p; |
|---|
| 115 | int i; |
|---|
| 116 | size_t j; |
|---|
| 117 | unsigned long *l; |
|---|
| 118 | float *g; |
|---|
| 119 | unsigned char **seq; |
|---|
| 120 | long *seq_len; |
|---|
| 121 | float (*w_Del)[], (*percent)[]; |
|---|
| 122 | |
|---|
| 123 | // allocate temporary memory |
|---|
| 124 | members = family_list->cardinality(); |
|---|
| 125 | l = (unsigned long *) CALLOC((unsigned int) members, sizeof(long)); |
|---|
| 126 | g = (float *) CALLOC((unsigned int) members, sizeof(float)); |
|---|
| 127 | seq = (unsigned char **) CALLOC((unsigned int) members, sizeof(char *)); |
|---|
| 128 | seq_len = (long *) CALLOC((unsigned int) members, sizeof(long)); |
|---|
| 129 | ali_out_of_memory_if(!l || !g || !seq || !seq_len); |
|---|
| 130 | |
|---|
| 131 | // initialize arrays |
|---|
| 132 | family_member = family_list->first(); |
|---|
| 133 | prof_len = family_member->sequence->length(); |
|---|
| 134 | seq[0] = family_member->sequence->sequence(); |
|---|
| 135 | seq_len[0] = family_member->sequence->length(); |
|---|
| 136 | g[0] = family_member->weight; |
|---|
| 137 | i = 1; |
|---|
| 138 | sub_costs_maximum = 0.0; |
|---|
| 139 | |
|---|
| 140 | while (family_list->has_next()) { |
|---|
| 141 | family_member = family_list->next(); |
|---|
| 142 | seq[i] = family_member->sequence->sequence(); |
|---|
| 143 | seq_len[i] = family_member->sequence->length(); |
|---|
| 144 | g[i] = family_member->weight; |
|---|
| 145 | i++; |
|---|
| 146 | if (prof_len < family_member->sequence->length()) { |
|---|
| 147 | ali_warning("Family members have different length"); |
|---|
| 148 | prof_len = family_member->sequence->length(); |
|---|
| 149 | } |
|---|
| 150 | } |
|---|
| 151 | |
|---|
| 152 | // Calculate the substitution cost matrix |
|---|
| 153 | for (i = 0; i < 5; i++) |
|---|
| 154 | for (j = 0; j < 5; j++) |
|---|
| 155 | sm[i][j] = context->substitute_matrix[i][j]; |
|---|
| 156 | |
|---|
| 157 | // Initialize l-array (position of last base) |
|---|
| 158 | for (i = 0; i < members; i++) |
|---|
| 159 | l[i] = prof_len + 1; |
|---|
| 160 | |
|---|
| 161 | // allocate memory for costs |
|---|
| 162 | |
|---|
| 163 | base_weights = (float (**) [4]) CALLOC((unsigned int) prof_len, sizeof(float [4])); |
|---|
| 164 | sub_costs = (float (**) [6]) CALLOC((unsigned int) prof_len, sizeof(float [6])); |
|---|
| 165 | binding_costs = (float (*) [5][5]) CALLOC((unsigned int) 5, sizeof(float [5])); |
|---|
| 166 | lmin = (unsigned long *) CALLOC((unsigned int) prof_len, sizeof(unsigned long)); |
|---|
| 167 | lmax = (unsigned long *) CALLOC((unsigned int) prof_len, sizeof(unsigned long)); |
|---|
| 168 | gap_costs = (float ***) CALLOC((unsigned int) prof_len, sizeof(float *)); |
|---|
| 169 | gap_percents = (float***) CALLOC((unsigned int) prof_len, sizeof(float *)); |
|---|
| 170 | |
|---|
| 171 | ali_out_of_memory_if(!binding_costs || !sub_costs || !lmin || !lmax || !gap_costs || !gap_percents || !base_weights); |
|---|
| 172 | |
|---|
| 173 | // Copy the binding costs matrix |
|---|
| 174 | w_bind_maximum = context->binding_matrix[0][0]; |
|---|
| 175 | for (i = 0; i < 5; i++) |
|---|
| 176 | for (j = 0; j < 5; j++) { |
|---|
| 177 | (*binding_costs)[i][j] = context->binding_matrix[i][j]; |
|---|
| 178 | if ((*binding_costs)[i][j] > w_bind_maximum) |
|---|
| 179 | w_bind_maximum = (*binding_costs)[i][j]; |
|---|
| 180 | } |
|---|
| 181 | |
|---|
| 182 | // calculate the costs for EVERY position |
|---|
| 183 | ali_message("Calculating costs for substitution"); |
|---|
| 184 | for (p = 0; p < prof_len; p++) { |
|---|
| 185 | // Initialization |
|---|
| 186 | for (i = 0; i < 7; i++) |
|---|
| 187 | a[i] = w[i] = sm[5][i] = sm[i][5] = sm[6][i] = sm[i][6] = 0.0; |
|---|
| 188 | for (i = 0; i < 6; i++) |
|---|
| 189 | (*sub_costs)[p][i] = 0.0; |
|---|
| 190 | w_sum = 0.0; |
|---|
| 191 | w_bg_sum = 0.0; |
|---|
| 192 | |
|---|
| 193 | // Statistic consensus |
|---|
| 194 | for (i = 0; i < members; i++) { |
|---|
| 195 | if (p < size_t(seq_len[i])) { |
|---|
| 196 | a[seq[i][p]]++; |
|---|
| 197 | w[seq[i][p]] += g[i]; |
|---|
| 198 | if (ali_is_real_base(seq[i][p])) |
|---|
| 199 | w_sum += g[i]; |
|---|
| 200 | if (ali_is_real_base(seq[i][p]) || ali_is_gap(seq[i][p])) |
|---|
| 201 | w_bg_sum += g[i]; |
|---|
| 202 | } |
|---|
| 203 | else { |
|---|
| 204 | a[ALI_DOT_CODE]++; |
|---|
| 205 | w[ALI_DOT_CODE] += g[i]; |
|---|
| 206 | } |
|---|
| 207 | } |
|---|
| 208 | |
|---|
| 209 | // Relative weight of bases |
|---|
| 210 | if (w_sum != 0) |
|---|
| 211 | for (i = 0; i < 4; i++) |
|---|
| 212 | (*base_weights)[p][i] = w[i] / w_sum; |
|---|
| 213 | else |
|---|
| 214 | for (i = 0; i < 4; i++) |
|---|
| 215 | (*base_weights)[p][i] = 0.25; |
|---|
| 216 | |
|---|
| 217 | // Relative weight of bases and gaps |
|---|
| 218 | if (w_bg_sum != 0) |
|---|
| 219 | for (i = 0; i < 5; i++) |
|---|
| 220 | base_gap_weights[i] = w[i] / w_bg_sum; |
|---|
| 221 | else |
|---|
| 222 | for (i = 0; i < 5; i++) |
|---|
| 223 | base_gap_weights[i] = 0.2; |
|---|
| 224 | |
|---|
| 225 | // Expandation of substitute matrix (for 'n') |
|---|
| 226 | for (j = 0; j < 5; j++) { |
|---|
| 227 | for (i = 0; i < 4; i++) { |
|---|
| 228 | sm[5][j] += (*base_weights)[p][i] * sm[i][j]; |
|---|
| 229 | sm[j][5] += (*base_weights)[p][i] * sm[j][i]; |
|---|
| 230 | } |
|---|
| 231 | } |
|---|
| 232 | for (i = 0; i < 4; i++) |
|---|
| 233 | sm[5][5] += (*base_weights)[p][i] * sm[i][i]; |
|---|
| 234 | |
|---|
| 235 | // Expandation of substitute matrix (for '.') |
|---|
| 236 | for (j = 0; j < 6; j++) |
|---|
| 237 | for (i = 0; i < 5; i++) { |
|---|
| 238 | sm[6][j] += base_gap_weights[i] * sm[i][j]; |
|---|
| 239 | sm[j][6] += base_gap_weights[i] * sm[j][i]; |
|---|
| 240 | } |
|---|
| 241 | for (i = 0; i < 5; i++) |
|---|
| 242 | sm[6][6] += base_gap_weights[i] * sm[i][i]; |
|---|
| 243 | |
|---|
| 244 | // Substitution costs |
|---|
| 245 | for (i = 0; i < members; i++) { |
|---|
| 246 | if (p < size_t(seq_len[i])) { |
|---|
| 247 | for (j = 0; j < 6; j++) { |
|---|
| 248 | (*sub_costs)[p][j] += g[i] * sm[seq[i][p]][j]; |
|---|
| 249 | } |
|---|
| 250 | } |
|---|
| 251 | else { |
|---|
| 252 | for (j = 0; j < 6; j++) { |
|---|
| 253 | (*sub_costs)[p][j] += g[i] * sm[ALI_DOT_CODE][j]; |
|---|
| 254 | } |
|---|
| 255 | } |
|---|
| 256 | } |
|---|
| 257 | for (j = 0; j < 6; j++) { |
|---|
| 258 | (*sub_costs)[p][j] /= members; |
|---|
| 259 | if ((*sub_costs)[p][j] > sub_costs_maximum) |
|---|
| 260 | sub_costs_maximum = (*sub_costs)[p][j]; |
|---|
| 261 | } |
|---|
| 262 | |
|---|
| 263 | // Calculate dynamic deletion costs and percents of real gaps |
|---|
| 264 | lmax[p] = 0; |
|---|
| 265 | lmin[p] = p; |
|---|
| 266 | for (i = 0; i < members; i++) |
|---|
| 267 | if (l[i] < p) { |
|---|
| 268 | if (lmin[p] > l[i]) |
|---|
| 269 | lmin[p] = l[i]; |
|---|
| 270 | if (lmax[p] < l[i]) |
|---|
| 271 | lmax[p] = l[i]; |
|---|
| 272 | } |
|---|
| 273 | if (lmin[p] == p && lmax[p] == 0) { |
|---|
| 274 | lmin[p] = lmax[p] = p; |
|---|
| 275 | } |
|---|
| 276 | |
|---|
| 277 | w_Del = (float (*) []) CALLOC((unsigned int) (lmax[p]-lmin[p]+2), sizeof(float)); |
|---|
| 278 | percent = (float (*) []) CALLOC((unsigned int) (lmax[p]-lmin[p]+2), sizeof(float)); |
|---|
| 279 | ali_out_of_memory_if(!w_Del || !percent); |
|---|
| 280 | (*gap_costs)[p] = (float *) w_Del; |
|---|
| 281 | (*gap_percents)[p] = (float *) percent; |
|---|
| 282 | |
|---|
| 283 | // Calculate dynamic deletion costs |
|---|
| 284 | for (j = 0; j <= lmax[p] - lmin[p] + 1; j++) { |
|---|
| 285 | (*w_Del)[j] = 0; |
|---|
| 286 | for (i = 0; i < members; i++) { |
|---|
| 287 | // Normal case |
|---|
| 288 | if (p < size_t(seq_len[i])) { |
|---|
| 289 | if (l[i] == prof_len + 1 || l[i] >= j + lmin[p]) { |
|---|
| 290 | (*w_Del)[j] += g[i] * sm[seq[i][p]][4] * context->multi_gap_factor; |
|---|
| 291 | } |
|---|
| 292 | else { |
|---|
| 293 | (*w_Del)[j] += g[i] * sm[seq[i][p]][4]; |
|---|
| 294 | } |
|---|
| 295 | } |
|---|
| 296 | // expand sequence with dots |
|---|
| 297 | else { |
|---|
| 298 | if (l[i] >= j + lmin[p] && l[i] < prof_len+1) { |
|---|
| 299 | (*w_Del)[j] += g[i] * sm[ALI_DOT_CODE][4] * context->multi_gap_factor; |
|---|
| 300 | } |
|---|
| 301 | else { |
|---|
| 302 | (*w_Del)[j] += g[i] * sm[ALI_DOT_CODE][4]; |
|---|
| 303 | } |
|---|
| 304 | } |
|---|
| 305 | } |
|---|
| 306 | (*w_Del)[j] /= members; |
|---|
| 307 | } |
|---|
| 308 | |
|---|
| 309 | // Update the l-array |
|---|
| 310 | for (i = 0; i < members; i++) { |
|---|
| 311 | if (!ali_is_gap(seq[i][p])) |
|---|
| 312 | l[i] = p; |
|---|
| 313 | } |
|---|
| 314 | |
|---|
| 315 | // Calculate percents of real gaps |
|---|
| 316 | for (j = 0; j <= lmax[p] - lmin[p] + 1; j++) { |
|---|
| 317 | (*percent)[j] = 0; |
|---|
| 318 | for (i = 0; i < members; i++) { |
|---|
| 319 | if (l[i] >= j + lmin[p] && l[i] < prof_len+1) { |
|---|
| 320 | (*percent)[j] += 1.0; |
|---|
| 321 | } |
|---|
| 322 | } |
|---|
| 323 | (*percent)[j] /= members; |
|---|
| 324 | } |
|---|
| 325 | } |
|---|
| 326 | |
|---|
| 327 | ali_message("Calculation finished"); |
|---|
| 328 | |
|---|
| 329 | free(l); |
|---|
| 330 | free(g); |
|---|
| 331 | free(seq); |
|---|
| 332 | free(seq_len); |
|---|
| 333 | } |
|---|
| 334 | |
|---|
| 335 | int ALI_PROFILE::find_next_helix(char h[], unsigned long h_len, |
|---|
| 336 | unsigned long pos, |
|---|
| 337 | unsigned long *helix_nr, |
|---|
| 338 | unsigned long *start, unsigned long *end) |
|---|
| 339 | { |
|---|
| 340 | // find the next helix |
|---|
| 341 | unsigned long i; |
|---|
| 342 | |
|---|
| 343 | for (i = pos; i < h_len && !isdigit(h[i]); i++) ; |
|---|
| 344 | if (i >= h_len) return -1; |
|---|
| 345 | |
|---|
| 346 | *start = i; |
|---|
| 347 | sscanf(&h[i], "%ld", helix_nr); |
|---|
| 348 | for (; i < h_len && isdigit(h[i]); i++) ; |
|---|
| 349 | for (; i < h_len && !isdigit(h[i]); i++) ; |
|---|
| 350 | *end = i - 1; |
|---|
| 351 | |
|---|
| 352 | return 0; |
|---|
| 353 | } |
|---|
| 354 | |
|---|
| 355 | int ALI_PROFILE::find_comp_helix(char h[], unsigned long h_len, |
|---|
| 356 | unsigned long pos, unsigned long helix_nr, |
|---|
| 357 | unsigned long *start, unsigned long *end) |
|---|
| 358 | { |
|---|
| 359 | // find the complementary part of a helix |
|---|
| 360 | unsigned long nr, i; |
|---|
| 361 | |
|---|
| 362 | i = pos; |
|---|
| 363 | do { |
|---|
| 364 | for (; i < h_len && !isdigit(h[i]); i++) ; |
|---|
| 365 | if (i >= h_len) return -1; |
|---|
| 366 | *start = i; |
|---|
| 367 | sscanf(&h[i], "%ld", &nr); |
|---|
| 368 | for (; i < h_len && isdigit(h[i]); i++) ; |
|---|
| 369 | } while (helix_nr != nr); |
|---|
| 370 | |
|---|
| 371 | for (; i < h_len && !isdigit(h[i]); i++) ; |
|---|
| 372 | *end = i - 1; |
|---|
| 373 | |
|---|
| 374 | return 0; |
|---|
| 375 | } |
|---|
| 376 | |
|---|
| 377 | void ALI_PROFILE::delete_comp_helix(char h1[], char h2[], unsigned long h_len, |
|---|
| 378 | unsigned long start, unsigned long end) |
|---|
| 379 | { |
|---|
| 380 | unsigned long i; |
|---|
| 381 | |
|---|
| 382 | for (i = start; i < h_len && i <= end; i++) { |
|---|
| 383 | h1[i] = '.'; |
|---|
| 384 | h2[i] = '.'; |
|---|
| 385 | } |
|---|
| 386 | } |
|---|
| 387 | |
|---|
| 388 | |
|---|
| 389 | void ALI_PROFILE::initialize_helix(ALI_PROFILE_CONTEXT *context) { |
|---|
| 390 | // initialize the array, representing the helix |
|---|
| 391 | const char *error_string; |
|---|
| 392 | BI_helix bi_helix; |
|---|
| 393 | |
|---|
| 394 | unsigned long i; |
|---|
| 395 | |
|---|
| 396 | // read helix |
|---|
| 397 | if ((error_string = bi_helix.init(context->arbdb->gb_main))) |
|---|
| 398 | ali_warning(error_string); |
|---|
| 399 | |
|---|
| 400 | helix_len = bi_helix.size(); |
|---|
| 401 | helix = (long **) CALLOC((unsigned int) helix_len, sizeof(long)); |
|---|
| 402 | helix_borders = (char **) CALLOC((unsigned int) helix_len, sizeof(long)); |
|---|
| 403 | ali_out_of_memory_if(!helix || !helix_borders); |
|---|
| 404 | |
|---|
| 405 | // convert helix for internal use |
|---|
| 406 | for (i = 0; i < helix_len; i++) { |
|---|
| 407 | (*helix)[i] = bi_helix.is_pairpos(i) ? bi_helix.opposite_position(i) : -1; |
|---|
| 408 | } |
|---|
| 409 | } |
|---|
| 410 | |
|---|
| 411 | |
|---|
| 412 | ALI_PROFILE::ALI_PROFILE(ALI_SEQUENCE *seq, ALI_PROFILE_CONTEXT *context) { |
|---|
| 413 | char message_buffer[100]; |
|---|
| 414 | ali_family_member *family_member; |
|---|
| 415 | ALI_TLIST<ali_family_member *> *family_list; |
|---|
| 416 | |
|---|
| 417 | norm_sequence = new ALI_NORM_SEQUENCE(seq); |
|---|
| 418 | |
|---|
| 419 | multi_gap_factor = context->multi_gap_factor; |
|---|
| 420 | |
|---|
| 421 | initialize_helix(context); |
|---|
| 422 | |
|---|
| 423 | family_list = find_family(seq, context); |
|---|
| 424 | if (family_list->is_empty()) { |
|---|
| 425 | ali_error("Family not found (maybe incompatible PT and DB Servers)"); |
|---|
| 426 | } |
|---|
| 427 | |
|---|
| 428 | calculate_costs(family_list, context); |
|---|
| 429 | |
|---|
| 430 | insert_cost = sub_costs_maximum * context->insert_factor; |
|---|
| 431 | multi_insert_cost = insert_cost * context->multi_insert_factor; |
|---|
| 432 | |
|---|
| 433 | sprintf(message_buffer, "Multi gap factor = %f", multi_gap_factor); |
|---|
| 434 | ali_message(message_buffer); |
|---|
| 435 | sprintf(message_buffer, "Maximal substitution costs = %f", sub_costs_maximum); |
|---|
| 436 | ali_message(message_buffer); |
|---|
| 437 | sprintf(message_buffer, "Normal insert costs = %f", insert_cost); |
|---|
| 438 | ali_message(message_buffer); |
|---|
| 439 | sprintf(message_buffer, "Multiple insert costs = %f", multi_insert_cost); |
|---|
| 440 | ali_message(message_buffer); |
|---|
| 441 | |
|---|
| 442 | // Delete the family list |
|---|
| 443 | family_member = family_list->first(); |
|---|
| 444 | delete family_member->sequence; |
|---|
| 445 | delete family_member; |
|---|
| 446 | while (family_list->has_next()) { |
|---|
| 447 | family_member = family_list->next(); |
|---|
| 448 | delete family_member->sequence; |
|---|
| 449 | delete family_member; |
|---|
| 450 | } |
|---|
| 451 | delete family_list; |
|---|
| 452 | } |
|---|
| 453 | |
|---|
| 454 | ALI_PROFILE::~ALI_PROFILE() { |
|---|
| 455 | size_t i; |
|---|
| 456 | |
|---|
| 457 | free(helix); |
|---|
| 458 | free(helix_borders); |
|---|
| 459 | free(binding_costs); |
|---|
| 460 | free(sub_costs); |
|---|
| 461 | if (gap_costs) { |
|---|
| 462 | for (i = 0; i < prof_len; i++) |
|---|
| 463 | if ((*gap_costs)[i]) |
|---|
| 464 | free((*gap_costs)[i]); |
|---|
| 465 | free(gap_costs); |
|---|
| 466 | } |
|---|
| 467 | if (gap_percents) { |
|---|
| 468 | for (i = 0; i < prof_len; i++) |
|---|
| 469 | if ((*gap_percents)[i]) |
|---|
| 470 | free((*gap_percents)[i]); |
|---|
| 471 | free(gap_percents); |
|---|
| 472 | } |
|---|
| 473 | free(lmin); |
|---|
| 474 | free(lmax); |
|---|
| 475 | delete norm_sequence; |
|---|
| 476 | } |
|---|
| 477 | |
|---|
| 478 | |
|---|
| 479 | int ALI_PROFILE::is_in_helix(unsigned long pos, unsigned long *first, unsigned long *last) { |
|---|
| 480 | // test whether a position is inside a helix |
|---|
| 481 | long i; |
|---|
| 482 | |
|---|
| 483 | if (pos > helix_len) |
|---|
| 484 | return 0; |
|---|
| 485 | |
|---|
| 486 | switch ((*helix_borders)[pos]) { |
|---|
| 487 | case ALI_PROFILE_BORDER_LEFT: |
|---|
| 488 | *first = pos; |
|---|
| 489 | for (i = (long) pos + 1; i < (long) prof_len; i++) |
|---|
| 490 | if ((*helix_borders)[i] == ALI_PROFILE_BORDER_RIGHT) { |
|---|
| 491 | *last = (unsigned long) i; |
|---|
| 492 | return 1; |
|---|
| 493 | } |
|---|
| 494 | ali_warning("Helix borders inconsistent (1)"); |
|---|
| 495 | return 0; |
|---|
| 496 | case ALI_PROFILE_BORDER_RIGHT: |
|---|
| 497 | *last = pos; |
|---|
| 498 | for (i = (long) pos - 1; i >= 0; i--) |
|---|
| 499 | if ((*helix_borders)[i] == ALI_PROFILE_BORDER_LEFT) { |
|---|
| 500 | *first = (unsigned long) i; |
|---|
| 501 | return 1; |
|---|
| 502 | } |
|---|
| 503 | ali_warning("Helix borders inconsistent (2)"); |
|---|
| 504 | return 0; |
|---|
| 505 | default: |
|---|
| 506 | for (i = (long) pos - 1; i >= 0; i--) |
|---|
| 507 | switch ((*helix_borders)[i]) { |
|---|
| 508 | case ALI_PROFILE_BORDER_RIGHT: |
|---|
| 509 | return 0; |
|---|
| 510 | case ALI_PROFILE_BORDER_LEFT: |
|---|
| 511 | *first = (unsigned long) i; |
|---|
| 512 | for (i = (long) pos + 1; i < (long) prof_len; i++) |
|---|
| 513 | switch ((*helix_borders)[i]) { |
|---|
| 514 | case ALI_PROFILE_BORDER_LEFT: |
|---|
| 515 | ali_warning("Helix borders inconsistent (3)"); |
|---|
| 516 | printf("pos = %ld\n", pos); |
|---|
| 517 | return 0; |
|---|
| 518 | case ALI_PROFILE_BORDER_RIGHT: |
|---|
| 519 | *last = (unsigned long) i; |
|---|
| 520 | return 1; |
|---|
| 521 | } |
|---|
| 522 | } |
|---|
| 523 | } |
|---|
| 524 | return 0; |
|---|
| 525 | } |
|---|
| 526 | |
|---|
| 527 | int ALI_PROFILE::is_outside_helix(unsigned long pos, unsigned long *first, unsigned long *last) { |
|---|
| 528 | // test, whether a position is outside a helix |
|---|
| 529 | long i; |
|---|
| 530 | |
|---|
| 531 | switch ((*helix_borders)[pos]) { |
|---|
| 532 | case ALI_PROFILE_BORDER_LEFT: |
|---|
| 533 | return 0; |
|---|
| 534 | case ALI_PROFILE_BORDER_RIGHT: |
|---|
| 535 | return 0; |
|---|
| 536 | default: |
|---|
| 537 | for (i = (long) pos - 1; i >= 0; i--) |
|---|
| 538 | switch ((*helix_borders)[i]) { |
|---|
| 539 | case ALI_PROFILE_BORDER_LEFT: |
|---|
| 540 | return 0; |
|---|
| 541 | case ALI_PROFILE_BORDER_RIGHT: |
|---|
| 542 | *first = (unsigned long) i + 1; |
|---|
| 543 | for (i = (long) pos + 1; i < (long) prof_len; i++) |
|---|
| 544 | switch ((*helix_borders)[i]) { |
|---|
| 545 | case ALI_PROFILE_BORDER_LEFT: |
|---|
| 546 | *last = (unsigned long) i - 1; |
|---|
| 547 | return 1; |
|---|
| 548 | case ALI_PROFILE_BORDER_RIGHT: |
|---|
| 549 | ali_warning("Helix borders inconsistent [1]"); |
|---|
| 550 | return 0; |
|---|
| 551 | } |
|---|
| 552 | *last = prof_len - 1; |
|---|
| 553 | return 1; |
|---|
| 554 | } |
|---|
| 555 | *first = 0; |
|---|
| 556 | for (i = (long) pos + 1; i < (long) prof_len; i++) |
|---|
| 557 | switch ((*helix_borders)[i]) { |
|---|
| 558 | case ALI_PROFILE_BORDER_LEFT: |
|---|
| 559 | *last = (unsigned long) i - 1; |
|---|
| 560 | return 1; |
|---|
| 561 | case ALI_PROFILE_BORDER_RIGHT: |
|---|
| 562 | ali_warning("Helix borders inconsistent [2]"); |
|---|
| 563 | return 0; |
|---|
| 564 | } |
|---|
| 565 | *last = prof_len - 1; |
|---|
| 566 | return 1; |
|---|
| 567 | } |
|---|
| 568 | } |
|---|
| 569 | |
|---|
| 570 | |
|---|
| 571 | char *ALI_PROFILE::cheapest_sequence() { |
|---|
| 572 | // generate a 'consensus sequence' |
|---|
| 573 | |
|---|
| 574 | char *seq; |
|---|
| 575 | size_t p; |
|---|
| 576 | int i, min_i; |
|---|
| 577 | float min; |
|---|
| 578 | |
|---|
| 579 | |
|---|
| 580 | seq = (char *) CALLOC((unsigned int) prof_len + 1, sizeof(char)); |
|---|
| 581 | ali_out_of_memory_if(!seq); |
|---|
| 582 | |
|---|
| 583 | for (p = 0; p < prof_len; p++) { |
|---|
| 584 | min = (*sub_costs)[p][0]; |
|---|
| 585 | min_i = 0; |
|---|
| 586 | for (i = 1; i < 5; i++) { |
|---|
| 587 | if (min > (*sub_costs)[p][i]) { |
|---|
| 588 | min = (*sub_costs)[p][i]; |
|---|
| 589 | min_i = i; |
|---|
| 590 | } |
|---|
| 591 | else { |
|---|
| 592 | if (min == (*sub_costs)[p][i]) |
|---|
| 593 | min_i = -1; |
|---|
| 594 | } |
|---|
| 595 | } |
|---|
| 596 | if (min_i >= 0) |
|---|
| 597 | seq[p] = ali_number_to_base(min_i); |
|---|
| 598 | else { |
|---|
| 599 | if (min > 0) |
|---|
| 600 | seq[p] = '*'; |
|---|
| 601 | else |
|---|
| 602 | seq[p] = '.'; |
|---|
| 603 | } |
|---|
| 604 | } |
|---|
| 605 | seq[prof_len] = '\0'; |
|---|
| 606 | |
|---|
| 607 | return seq; |
|---|
| 608 | } |
|---|
| 609 | |
|---|
| 610 | float ALI_PROFILE::w_binding(unsigned long first_seq_pos, ALI_SEQUENCE *seq) { |
|---|
| 611 | // calculate the costs of a binding |
|---|
| 612 | unsigned long pos_1_seq, pos_2_seq, last_seq_pos; |
|---|
| 613 | long pos_compl; |
|---|
| 614 | float costs = 0; |
|---|
| 615 | |
|---|
| 616 | last_seq_pos = first_seq_pos + seq->length() - 1; |
|---|
| 617 | for (pos_1_seq = first_seq_pos; pos_1_seq <= last_seq_pos; pos_1_seq++) { |
|---|
| 618 | pos_compl = (*helix)[pos_1_seq]; |
|---|
| 619 | if (pos_compl >= 0) { |
|---|
| 620 | pos_2_seq = (unsigned long) pos_compl; |
|---|
| 621 | if (pos_2_seq > pos_1_seq && pos_2_seq <= last_seq_pos) |
|---|
| 622 | costs += w_bind(pos_1_seq, seq->base(pos_1_seq), |
|---|
| 623 | pos_2_seq, seq->base(pos_2_seq)); |
|---|
| 624 | else |
|---|
| 625 | if (pos_2_seq < first_seq_pos || pos_2_seq > last_seq_pos) |
|---|
| 626 | costs += w_bind_maximum; |
|---|
| 627 | } |
|---|
| 628 | } |
|---|
| 629 | |
|---|
| 630 | return costs; |
|---|
| 631 | } |
|---|
| 632 | |
|---|
| 633 | |
|---|