| 1 | #include <stdio.h> |
|---|
| 2 | #include <string.h> |
|---|
| 3 | #include <ctype.h> |
|---|
| 4 | #include <stdlib.h> |
|---|
| 5 | #include "clustalw.h" |
|---|
| 6 | |
|---|
| 7 | |
|---|
| 8 | /* |
|---|
| 9 | * Prototypes |
|---|
| 10 | */ |
|---|
| 11 | |
|---|
| 12 | /* |
|---|
| 13 | * Global Variables |
|---|
| 14 | */ |
|---|
| 15 | |
|---|
| 16 | extern double **tmat; |
|---|
| 17 | extern Boolean no_weights; |
|---|
| 18 | extern sint debug; |
|---|
| 19 | extern sint max_aa; |
|---|
| 20 | extern sint nseqs; |
|---|
| 21 | extern sint profile1_nseqs; |
|---|
| 22 | extern sint nsets; |
|---|
| 23 | extern sint **sets; |
|---|
| 24 | extern sint divergence_cutoff; |
|---|
| 25 | extern sint *seq_weight; |
|---|
| 26 | extern sint output_order, *output_index; |
|---|
| 27 | extern Boolean distance_tree; |
|---|
| 28 | extern char seqname[]; |
|---|
| 29 | extern sint *seqlen_array; |
|---|
| 30 | extern char **seq_array; |
|---|
| 31 | |
|---|
| 32 | sint malign(sint istart,char *phylip_name) /* full progressive alignment*/ |
|---|
| 33 | { |
|---|
| 34 | static sint *aligned; |
|---|
| 35 | static sint *group; |
|---|
| 36 | static sint ix; |
|---|
| 37 | |
|---|
| 38 | sint *maxid, max, sum; |
|---|
| 39 | sint *tree_weight; |
|---|
| 40 | sint i,j,set,iseq=0; |
|---|
| 41 | sint status,entries; |
|---|
| 42 | lint score = 0; |
|---|
| 43 | |
|---|
| 44 | |
|---|
| 45 | info("Start of Multiple Alignment"); |
|---|
| 46 | |
|---|
| 47 | /* get the phylogenetic tree from *.ph */ |
|---|
| 48 | |
|---|
| 49 | if (nseqs >= 2) |
|---|
| 50 | { |
|---|
| 51 | status = read_tree(phylip_name, (sint)0, nseqs); |
|---|
| 52 | if (status == 0) return((sint)0); |
|---|
| 53 | } |
|---|
| 54 | |
|---|
| 55 | /* calculate sequence weights according to branch lengths of the tree - |
|---|
| 56 | weights in global variable seq_weight normalised to sum to 100 */ |
|---|
| 57 | |
|---|
| 58 | calc_seq_weights((sint)0, nseqs, seq_weight); |
|---|
| 59 | |
|---|
| 60 | /* recalculate tmat matrix as percent similarity matrix */ |
|---|
| 61 | |
|---|
| 62 | status = calc_similarities(nseqs); |
|---|
| 63 | if (status == 0) return((sint)0); |
|---|
| 64 | |
|---|
| 65 | /* for each sequence, find the most closely related sequence */ |
|---|
| 66 | |
|---|
| 67 | maxid = (sint *)ckalloc( (nseqs+1) * sizeof (sint)); |
|---|
| 68 | for (i=1;i<=nseqs;i++) |
|---|
| 69 | { |
|---|
| 70 | maxid[i] = -1; |
|---|
| 71 | for (j=1;j<=nseqs;j++) |
|---|
| 72 | if (j!=i && maxid[i] < tmat[i][j]) maxid[i] = tmat[i][j]; |
|---|
| 73 | } |
|---|
| 74 | |
|---|
| 75 | /* group the sequences according to their relative divergence */ |
|---|
| 76 | |
|---|
| 77 | if (istart == 0) |
|---|
| 78 | { |
|---|
| 79 | sets = (sint **) ckalloc( (nseqs+1) * sizeof (sint *) ); |
|---|
| 80 | for(i=0;i<=nseqs;i++) |
|---|
| 81 | sets[i] = (sint *)ckalloc( (nseqs+1) * sizeof (sint) ); |
|---|
| 82 | |
|---|
| 83 | create_sets((sint)0,nseqs); |
|---|
| 84 | info("There are %d groups",(pint)nsets); |
|---|
| 85 | |
|---|
| 86 | /* clear the memory used for the phylogenetic tree */ |
|---|
| 87 | |
|---|
| 88 | if (nseqs >= 2) |
|---|
| 89 | clear_tree(NULL); |
|---|
| 90 | |
|---|
| 91 | /* start the multiple alignments......... */ |
|---|
| 92 | |
|---|
| 93 | info("Aligning..."); |
|---|
| 94 | |
|---|
| 95 | /* first pass, align closely related sequences first.... */ |
|---|
| 96 | |
|---|
| 97 | ix = 0; |
|---|
| 98 | aligned = (sint *)ckalloc( (nseqs+1) * sizeof (sint) ); |
|---|
| 99 | for (i=0;i<=nseqs;i++) aligned[i] = 0; |
|---|
| 100 | |
|---|
| 101 | for(set=1;set<=nsets;++set) |
|---|
| 102 | { |
|---|
| 103 | entries=0; |
|---|
| 104 | for (i=1;i<=nseqs;i++) |
|---|
| 105 | { |
|---|
| 106 | if ((sets[set][i] != 0) && (maxid[i] > divergence_cutoff)) |
|---|
| 107 | { |
|---|
| 108 | entries++; |
|---|
| 109 | if (aligned[i] == 0) |
|---|
| 110 | { |
|---|
| 111 | if (output_order==INPUT) |
|---|
| 112 | { |
|---|
| 113 | ++ix; |
|---|
| 114 | output_index[i] = i; |
|---|
| 115 | } |
|---|
| 116 | else output_index[++ix] = i; |
|---|
| 117 | aligned[i] = 1; |
|---|
| 118 | } |
|---|
| 119 | } |
|---|
| 120 | } |
|---|
| 121 | |
|---|
| 122 | if(entries > 0) score = prfalign(sets[set], aligned); |
|---|
| 123 | else score=0.0; |
|---|
| 124 | |
|---|
| 125 | |
|---|
| 126 | /* negative score means fatal error... exit now! */ |
|---|
| 127 | |
|---|
| 128 | if (score < 0) |
|---|
| 129 | { |
|---|
| 130 | return(-1); |
|---|
| 131 | } |
|---|
| 132 | if ((entries > 0) && (score > 0)) |
|---|
| 133 | info("Group %d: Sequences:%4d Score:%d", |
|---|
| 134 | (pint)set,(pint)entries,(pint)score); |
|---|
| 135 | else |
|---|
| 136 | info("Group %d: Delayed", |
|---|
| 137 | (pint)set); |
|---|
| 138 | } |
|---|
| 139 | |
|---|
| 140 | for (i=0;i<=nseqs;i++) |
|---|
| 141 | sets[i]=ckfree((void *)sets[i]); |
|---|
| 142 | sets=ckfree(sets); |
|---|
| 143 | } |
|---|
| 144 | else |
|---|
| 145 | { |
|---|
| 146 | /* clear the memory used for the phylogenetic tree */ |
|---|
| 147 | |
|---|
| 148 | if (nseqs >= 2) |
|---|
| 149 | clear_tree(NULL); |
|---|
| 150 | |
|---|
| 151 | aligned = (sint *)ckalloc( (nseqs+1) * sizeof (sint) ); |
|---|
| 152 | ix = 0; |
|---|
| 153 | for (i=1;i<=istart+1;i++) |
|---|
| 154 | { |
|---|
| 155 | aligned[i] = 1; |
|---|
| 156 | ++ix; |
|---|
| 157 | output_index[i] = i; |
|---|
| 158 | } |
|---|
| 159 | for (i=istart+2;i<=nseqs;i++) aligned[i] = 0; |
|---|
| 160 | } |
|---|
| 161 | |
|---|
| 162 | /* second pass - align remaining, more divergent sequences..... */ |
|---|
| 163 | |
|---|
| 164 | /* if not all sequences were aligned, for each unaligned sequence, |
|---|
| 165 | find it's closest pair amongst the aligned sequences. */ |
|---|
| 166 | |
|---|
| 167 | group = (sint *)ckalloc( (nseqs+1) * sizeof (sint)); |
|---|
| 168 | tree_weight = (sint *) ckalloc( (nseqs) * sizeof(sint) ); |
|---|
| 169 | for (i=0;i<nseqs;i++) |
|---|
| 170 | tree_weight[i] = seq_weight[i]; |
|---|
| 171 | |
|---|
| 172 | /* if we haven't aligned any sequences, in the first pass - align the |
|---|
| 173 | two most closely related sequences now */ |
|---|
| 174 | if(ix==0) |
|---|
| 175 | { |
|---|
| 176 | max = -1; |
|---|
| 177 | iseq = 0; |
|---|
| 178 | for (i=1;i<=nseqs;i++) |
|---|
| 179 | { |
|---|
| 180 | for (j=i+1;j<=nseqs;j++) |
|---|
| 181 | { |
|---|
| 182 | if (max < tmat[i][j]) |
|---|
| 183 | { |
|---|
| 184 | max = tmat[i][j]; |
|---|
| 185 | iseq = i; |
|---|
| 186 | } |
|---|
| 187 | } |
|---|
| 188 | } |
|---|
| 189 | aligned[iseq]=1; |
|---|
| 190 | if (output_order == INPUT) |
|---|
| 191 | { |
|---|
| 192 | ++ix; |
|---|
| 193 | output_index[iseq] = iseq; |
|---|
| 194 | } |
|---|
| 195 | else |
|---|
| 196 | output_index[++ix] = iseq; |
|---|
| 197 | } |
|---|
| 198 | |
|---|
| 199 | while (ix < nseqs) |
|---|
| 200 | { |
|---|
| 201 | for (i=1;i<=nseqs;i++) { |
|---|
| 202 | if (aligned[i] == 0) |
|---|
| 203 | { |
|---|
| 204 | maxid[i] = -1; |
|---|
| 205 | for (j=1;j<=nseqs;j++) |
|---|
| 206 | if ((maxid[i] < tmat[i][j]) && (aligned[j] != 0)) |
|---|
| 207 | maxid[i] = tmat[i][j]; |
|---|
| 208 | } |
|---|
| 209 | } |
|---|
| 210 | /* find the most closely related sequence to those already aligned */ |
|---|
| 211 | |
|---|
| 212 | max = -1; |
|---|
| 213 | iseq = 0; |
|---|
| 214 | for (i=1;i<=nseqs;i++) |
|---|
| 215 | { |
|---|
| 216 | if ((aligned[i] == 0) && (maxid[i] > max)) |
|---|
| 217 | { |
|---|
| 218 | max = maxid[i]; |
|---|
| 219 | iseq = i; |
|---|
| 220 | } |
|---|
| 221 | } |
|---|
| 222 | |
|---|
| 223 | |
|---|
| 224 | /* align this sequence to the existing alignment */ |
|---|
| 225 | /* weight sequences with percent identity with profile*/ |
|---|
| 226 | /* OR...., multiply sequence weights from tree by percent identity with new sequence */ |
|---|
| 227 | if(no_weights==FALSE) { |
|---|
| 228 | for (j=0;j<nseqs;j++) |
|---|
| 229 | if (aligned[j+1] != 0) |
|---|
| 230 | seq_weight[j] = tree_weight[j] * tmat[j+1][iseq]; |
|---|
| 231 | /* |
|---|
| 232 | Normalise the weights, such that the sum of the weights = INT_SCALE_FACTOR |
|---|
| 233 | */ |
|---|
| 234 | |
|---|
| 235 | sum = 0; |
|---|
| 236 | for (j=0;j<nseqs;j++) |
|---|
| 237 | if (aligned[j+1] != 0) |
|---|
| 238 | sum += seq_weight[j]; |
|---|
| 239 | if (sum == 0) |
|---|
| 240 | { |
|---|
| 241 | for (j=0;j<nseqs;j++) |
|---|
| 242 | seq_weight[j] = 1; |
|---|
| 243 | sum = j; |
|---|
| 244 | } |
|---|
| 245 | for (j=0;j<nseqs;j++) |
|---|
| 246 | if (aligned[j+1] != 0) |
|---|
| 247 | { |
|---|
| 248 | seq_weight[j] = (seq_weight[j] * INT_SCALE_FACTOR) / sum; |
|---|
| 249 | if (seq_weight[j] < 1) seq_weight[j] = 1; |
|---|
| 250 | } |
|---|
| 251 | } |
|---|
| 252 | |
|---|
| 253 | entries = 0; |
|---|
| 254 | for (j=1;j<=nseqs;j++) |
|---|
| 255 | if (aligned[j] != 0) |
|---|
| 256 | { |
|---|
| 257 | group[j] = 1; |
|---|
| 258 | entries++; |
|---|
| 259 | } |
|---|
| 260 | else if (iseq==j) |
|---|
| 261 | { |
|---|
| 262 | group[j] = 2; |
|---|
| 263 | entries++; |
|---|
| 264 | } |
|---|
| 265 | aligned[iseq] = 1; |
|---|
| 266 | |
|---|
| 267 | score = prfalign(group, aligned); |
|---|
| 268 | info("Sequence:%d Score:%d",(pint)iseq,(pint)score); |
|---|
| 269 | if (output_order == INPUT) |
|---|
| 270 | { |
|---|
| 271 | ++ix; |
|---|
| 272 | output_index[iseq] = iseq; |
|---|
| 273 | } |
|---|
| 274 | else |
|---|
| 275 | output_index[++ix] = iseq; |
|---|
| 276 | } |
|---|
| 277 | |
|---|
| 278 | group=ckfree((void *)group); |
|---|
| 279 | aligned=ckfree((void *)aligned); |
|---|
| 280 | maxid=ckfree((void *)maxid); |
|---|
| 281 | tree_weight=ckfree((void *)tree_weight); |
|---|
| 282 | |
|---|
| 283 | aln_score(); |
|---|
| 284 | |
|---|
| 285 | /* make the rest (output stuff) into routine clustal_out in file amenu.c */ |
|---|
| 286 | |
|---|
| 287 | return(nseqs); |
|---|
| 288 | |
|---|
| 289 | } |
|---|
| 290 | |
|---|
| 291 | sint seqalign(sint istart,char *phylip_name) /* sequence alignment to existing profile */ |
|---|
| 292 | { |
|---|
| 293 | static sint *aligned, *tree_weight; |
|---|
| 294 | static sint *group; |
|---|
| 295 | static sint ix; |
|---|
| 296 | |
|---|
| 297 | sint *maxid, max; |
|---|
| 298 | sint i,j,status,iseq; |
|---|
| 299 | sint sum,entries; |
|---|
| 300 | lint score = 0; |
|---|
| 301 | |
|---|
| 302 | |
|---|
| 303 | info("Start of Multiple Alignment"); |
|---|
| 304 | |
|---|
| 305 | /* get the phylogenetic tree from *.ph */ |
|---|
| 306 | |
|---|
| 307 | if (nseqs >= 2) |
|---|
| 308 | { |
|---|
| 309 | status = read_tree(phylip_name, (sint)0, nseqs); |
|---|
| 310 | if (status == 0) return(0); |
|---|
| 311 | } |
|---|
| 312 | |
|---|
| 313 | /* calculate sequence weights according to branch lengths of the tree - |
|---|
| 314 | weights in global variable seq_weight normalised to sum to 100 */ |
|---|
| 315 | |
|---|
| 316 | calc_seq_weights((sint)0, nseqs, seq_weight); |
|---|
| 317 | |
|---|
| 318 | tree_weight = (sint *) ckalloc( (nseqs) * sizeof(sint) ); |
|---|
| 319 | for (i=0;i<nseqs;i++) |
|---|
| 320 | tree_weight[i] = seq_weight[i]; |
|---|
| 321 | |
|---|
| 322 | /* recalculate tmat matrix as percent similarity matrix */ |
|---|
| 323 | |
|---|
| 324 | status = calc_similarities(nseqs); |
|---|
| 325 | if (status == 0) return((sint)0); |
|---|
| 326 | |
|---|
| 327 | /* for each sequence, find the most closely related sequence */ |
|---|
| 328 | |
|---|
| 329 | maxid = (sint *)ckalloc( (nseqs+1) * sizeof (sint)); |
|---|
| 330 | for (i=1;i<=nseqs;i++) |
|---|
| 331 | { |
|---|
| 332 | maxid[i] = -1; |
|---|
| 333 | for (j=1;j<=nseqs;j++) |
|---|
| 334 | if (maxid[i] < tmat[i][j]) maxid[i] = tmat[i][j]; |
|---|
| 335 | } |
|---|
| 336 | |
|---|
| 337 | /* clear the memory used for the phylogenetic tree */ |
|---|
| 338 | |
|---|
| 339 | if (nseqs >= 2) |
|---|
| 340 | clear_tree(NULL); |
|---|
| 341 | |
|---|
| 342 | aligned = (sint *)ckalloc( (nseqs+1) * sizeof (sint) ); |
|---|
| 343 | ix = 0; |
|---|
| 344 | for (i=1;i<=istart+1;i++) |
|---|
| 345 | { |
|---|
| 346 | aligned[i] = 1; |
|---|
| 347 | ++ix; |
|---|
| 348 | output_index[i] = i; |
|---|
| 349 | } |
|---|
| 350 | for (i=istart+2;i<=nseqs;i++) aligned[i] = 0; |
|---|
| 351 | |
|---|
| 352 | /* for each unaligned sequence, find it's closest pair amongst the |
|---|
| 353 | aligned sequences. */ |
|---|
| 354 | |
|---|
| 355 | group = (sint *)ckalloc( (nseqs+1) * sizeof (sint)); |
|---|
| 356 | |
|---|
| 357 | while (ix < nseqs) |
|---|
| 358 | { |
|---|
| 359 | if (ix > 0) |
|---|
| 360 | { |
|---|
| 361 | for (i=1;i<=nseqs;i++) { |
|---|
| 362 | if (aligned[i] == 0) |
|---|
| 363 | { |
|---|
| 364 | maxid[i] = -1; |
|---|
| 365 | for (j=1;j<=nseqs;j++) |
|---|
| 366 | if ((maxid[i] < tmat[i][j]) && (aligned[j] != 0)) |
|---|
| 367 | maxid[i] = tmat[i][j]; |
|---|
| 368 | } |
|---|
| 369 | } |
|---|
| 370 | } |
|---|
| 371 | |
|---|
| 372 | /* find the most closely related sequence to those already aligned */ |
|---|
| 373 | |
|---|
| 374 | max = -1; |
|---|
| 375 | for (i=1;i<=nseqs;i++) |
|---|
| 376 | { |
|---|
| 377 | if ((aligned[i] == 0) && (maxid[i] > max)) |
|---|
| 378 | { |
|---|
| 379 | max = maxid[i]; |
|---|
| 380 | iseq = i; |
|---|
| 381 | } |
|---|
| 382 | } |
|---|
| 383 | |
|---|
| 384 | /* align this sequence to the existing alignment */ |
|---|
| 385 | |
|---|
| 386 | entries = 0; |
|---|
| 387 | for (j=1;j<=nseqs;j++) |
|---|
| 388 | if (aligned[j] != 0) |
|---|
| 389 | { |
|---|
| 390 | group[j] = 1; |
|---|
| 391 | entries++; |
|---|
| 392 | } |
|---|
| 393 | else if (iseq==j) |
|---|
| 394 | { |
|---|
| 395 | group[j] = 2; |
|---|
| 396 | entries++; |
|---|
| 397 | } |
|---|
| 398 | aligned[iseq] = 1; |
|---|
| 399 | |
|---|
| 400 | |
|---|
| 401 | /* EITHER....., set sequence weights equal to percent identity with new sequence */ |
|---|
| 402 | /* |
|---|
| 403 | for (j=0;j<nseqs;j++) |
|---|
| 404 | seq_weight[j] = tmat[j+1][iseq]; |
|---|
| 405 | */ |
|---|
| 406 | /* OR...., multiply sequence weights from tree by percent identity with new sequence */ |
|---|
| 407 | for (j=0;j<nseqs;j++) |
|---|
| 408 | seq_weight[j] = tree_weight[j] * tmat[j+1][iseq]; |
|---|
| 409 | if (debug>1) |
|---|
| 410 | for (j=0;j<nseqs;j++) if (group[j+1] == 1)fprintf (stdout,"sequence %d: %d\n", j+1,tree_weight[j]); |
|---|
| 411 | /* |
|---|
| 412 | Normalise the weights, such that the sum of the weights = INT_SCALE_FACTOR |
|---|
| 413 | */ |
|---|
| 414 | |
|---|
| 415 | sum = 0; |
|---|
| 416 | for (j=0;j<nseqs;j++) |
|---|
| 417 | if (group[j+1] == 1) sum += seq_weight[j]; |
|---|
| 418 | if (sum == 0) |
|---|
| 419 | { |
|---|
| 420 | for (j=0;j<nseqs;j++) |
|---|
| 421 | seq_weight[j] = 1; |
|---|
| 422 | sum = j; |
|---|
| 423 | } |
|---|
| 424 | for (j=0;j<nseqs;j++) |
|---|
| 425 | { |
|---|
| 426 | seq_weight[j] = (seq_weight[j] * INT_SCALE_FACTOR) / sum; |
|---|
| 427 | if (seq_weight[j] < 1) seq_weight[j] = 1; |
|---|
| 428 | } |
|---|
| 429 | |
|---|
| 430 | if (debug > 1) { |
|---|
| 431 | fprintf(stdout,"new weights\n"); |
|---|
| 432 | for (j=0;j<nseqs;j++) if (group[j+1] == 1)fprintf( stdout,"sequence %d: %d\n", j+1,seq_weight[j]); |
|---|
| 433 | } |
|---|
| 434 | |
|---|
| 435 | score = prfalign(group, aligned); |
|---|
| 436 | info("Sequence:%d Score:%d",(pint)iseq,(pint)score); |
|---|
| 437 | if (output_order == INPUT) |
|---|
| 438 | { |
|---|
| 439 | ++ix; |
|---|
| 440 | output_index[iseq] = iseq; |
|---|
| 441 | } |
|---|
| 442 | else |
|---|
| 443 | output_index[++ix] = iseq; |
|---|
| 444 | } |
|---|
| 445 | |
|---|
| 446 | group=ckfree((void *)group); |
|---|
| 447 | aligned=ckfree((void *)aligned); |
|---|
| 448 | maxid=ckfree((void *)maxid); |
|---|
| 449 | |
|---|
| 450 | aln_score(); |
|---|
| 451 | |
|---|
| 452 | /* make the rest (output stuff) into routine clustal_out in file amenu.c */ |
|---|
| 453 | |
|---|
| 454 | return(nseqs); |
|---|
| 455 | |
|---|
| 456 | } |
|---|
| 457 | |
|---|
| 458 | |
|---|
| 459 | sint palign1(void) /* a profile alignment */ |
|---|
| 460 | { |
|---|
| 461 | sint i,j,temp; |
|---|
| 462 | sint entries; |
|---|
| 463 | sint *aligned, *group; |
|---|
| 464 | float dscore; |
|---|
| 465 | lint score; |
|---|
| 466 | |
|---|
| 467 | info("Start of Initial Alignment"); |
|---|
| 468 | |
|---|
| 469 | /* calculate sequence weights according to branch lengths of the tree - |
|---|
| 470 | weights in global variable seq_weight normalised to sum to INT_SCALE_FACTOR */ |
|---|
| 471 | |
|---|
| 472 | temp = INT_SCALE_FACTOR/nseqs; |
|---|
| 473 | for (i=0;i<nseqs;i++) seq_weight[i] = temp; |
|---|
| 474 | |
|---|
| 475 | distance_tree = FALSE; |
|---|
| 476 | |
|---|
| 477 | /* do the initial alignment......... */ |
|---|
| 478 | |
|---|
| 479 | group = (sint *)ckalloc( (nseqs+1) * sizeof (sint)); |
|---|
| 480 | |
|---|
| 481 | for(i=1; i<=profile1_nseqs; ++i) |
|---|
| 482 | group[i] = 1; |
|---|
| 483 | for(i=profile1_nseqs+1; i<=nseqs; ++i) |
|---|
| 484 | group[i] = 2; |
|---|
| 485 | entries = nseqs; |
|---|
| 486 | |
|---|
| 487 | aligned = (sint *)ckalloc( (nseqs+1) * sizeof (sint) ); |
|---|
| 488 | for (i=1;i<=nseqs;i++) aligned[i] = 1; |
|---|
| 489 | |
|---|
| 490 | score = prfalign(group, aligned); |
|---|
| 491 | info("Sequences:%d Score:%d",(pint)entries,(pint)score); |
|---|
| 492 | group=ckfree((void *)group); |
|---|
| 493 | aligned=ckfree((void *)aligned); |
|---|
| 494 | |
|---|
| 495 | for (i=1;i<=nseqs;i++) { |
|---|
| 496 | for (j=i+1;j<=nseqs;j++) { |
|---|
| 497 | dscore = countid(i,j); |
|---|
| 498 | tmat[i][j] = ((double)100.0 - (double)dscore)/(double)100.0; |
|---|
| 499 | tmat[j][i] = tmat[i][j]; |
|---|
| 500 | } |
|---|
| 501 | } |
|---|
| 502 | |
|---|
| 503 | return(nseqs); |
|---|
| 504 | } |
|---|
| 505 | |
|---|
| 506 | float countid(sint s1, sint s2) |
|---|
| 507 | { |
|---|
| 508 | char c1,c2; |
|---|
| 509 | sint i; |
|---|
| 510 | sint count,total; |
|---|
| 511 | float score; |
|---|
| 512 | |
|---|
| 513 | count = total = 0; |
|---|
| 514 | for (i=1;i<=seqlen_array[s1] && i<=seqlen_array[s2];i++) { |
|---|
| 515 | c1 = seq_array[s1][i]; |
|---|
| 516 | c2 = seq_array[s2][i]; |
|---|
| 517 | if ((c1>=0) && (c1<max_aa)) { |
|---|
| 518 | total++; |
|---|
| 519 | if (c1 == c2) count++; |
|---|
| 520 | } |
|---|
| 521 | |
|---|
| 522 | } |
|---|
| 523 | |
|---|
| 524 | if(total==0) score=0; |
|---|
| 525 | else |
|---|
| 526 | score = 100.0 * (float)count / (float)total; |
|---|
| 527 | return(score); |
|---|
| 528 | |
|---|
| 529 | } |
|---|
| 530 | |
|---|
| 531 | sint palign2(char *p1_tree_name,char *p2_tree_name) /* a profile alignment */ |
|---|
| 532 | { |
|---|
| 533 | sint i,j,sum,entries,status; |
|---|
| 534 | lint score; |
|---|
| 535 | sint *aligned, *group; |
|---|
| 536 | sint *maxid,*p1_weight,*p2_weight; |
|---|
| 537 | |
|---|
| 538 | info("Start of Multiple Alignment"); |
|---|
| 539 | |
|---|
| 540 | /* get the phylogenetic trees from *.ph */ |
|---|
| 541 | |
|---|
| 542 | if (profile1_nseqs >= 2) |
|---|
| 543 | { |
|---|
| 544 | status = read_tree(p1_tree_name, (sint)0, profile1_nseqs); |
|---|
| 545 | if (status == 0) return(0); |
|---|
| 546 | } |
|---|
| 547 | |
|---|
| 548 | /* calculate sequence weights according to branch lengths of the tree - |
|---|
| 549 | weights in global variable seq_weight normalised to sum to 100 */ |
|---|
| 550 | |
|---|
| 551 | p1_weight = (sint *) ckalloc( (profile1_nseqs) * sizeof(sint) ); |
|---|
| 552 | |
|---|
| 553 | calc_seq_weights((sint)0, profile1_nseqs, p1_weight); |
|---|
| 554 | |
|---|
| 555 | /* clear the memory for the phylogenetic tree */ |
|---|
| 556 | |
|---|
| 557 | if (profile1_nseqs >= 2) |
|---|
| 558 | clear_tree(NULL); |
|---|
| 559 | |
|---|
| 560 | if (nseqs-profile1_nseqs >= 2) |
|---|
| 561 | { |
|---|
| 562 | status = read_tree(p2_tree_name, profile1_nseqs, nseqs); |
|---|
| 563 | if (status == 0) return(0); |
|---|
| 564 | } |
|---|
| 565 | |
|---|
| 566 | p2_weight = (sint *) ckalloc( (nseqs) * sizeof(sint) ); |
|---|
| 567 | |
|---|
| 568 | calc_seq_weights(profile1_nseqs,nseqs, p2_weight); |
|---|
| 569 | |
|---|
| 570 | |
|---|
| 571 | /* clear the memory for the phylogenetic tree */ |
|---|
| 572 | |
|---|
| 573 | if (nseqs-profile1_nseqs >= 2) |
|---|
| 574 | clear_tree(NULL); |
|---|
| 575 | |
|---|
| 576 | /* convert tmat distances to similarities */ |
|---|
| 577 | |
|---|
| 578 | for (i=1;i<nseqs;i++) |
|---|
| 579 | for (j=i+1;j<=nseqs;j++) { |
|---|
| 580 | tmat[i][j]=100.0-tmat[i][j]*100.0; |
|---|
| 581 | tmat[j][i]=tmat[i][j]; |
|---|
| 582 | } |
|---|
| 583 | |
|---|
| 584 | |
|---|
| 585 | /* weight sequences with max percent identity with other profile*/ |
|---|
| 586 | |
|---|
| 587 | maxid = (sint *)ckalloc( (nseqs+1) * sizeof (sint)); |
|---|
| 588 | for (i=0;i<profile1_nseqs;i++) { |
|---|
| 589 | maxid[i] = 0; |
|---|
| 590 | for (j=profile1_nseqs+1;j<=nseqs;j++) |
|---|
| 591 | if(maxid[i]<tmat[i+1][j]) maxid[i] = tmat[i+1][j]; |
|---|
| 592 | seq_weight[i] = maxid[i]*p1_weight[i]; |
|---|
| 593 | } |
|---|
| 594 | |
|---|
| 595 | for (i=profile1_nseqs;i<nseqs;i++) { |
|---|
| 596 | maxid[i] = -1; |
|---|
| 597 | for (j=1;j<=profile1_nseqs;j++) |
|---|
| 598 | if(maxid[i]<tmat[i+1][j]) maxid[i] = tmat[i+1][j]; |
|---|
| 599 | seq_weight[i] = maxid[i]*p2_weight[i]; |
|---|
| 600 | } |
|---|
| 601 | /* |
|---|
| 602 | Normalise the weights, such that the sum of the weights = INT_SCALE_FACTOR |
|---|
| 603 | */ |
|---|
| 604 | |
|---|
| 605 | sum = 0; |
|---|
| 606 | for (j=0;j<nseqs;j++) |
|---|
| 607 | sum += seq_weight[j]; |
|---|
| 608 | if (sum == 0) |
|---|
| 609 | { |
|---|
| 610 | for (j=0;j<nseqs;j++) |
|---|
| 611 | seq_weight[j] = 1; |
|---|
| 612 | sum = j; |
|---|
| 613 | } |
|---|
| 614 | for (j=0;j<nseqs;j++) |
|---|
| 615 | { |
|---|
| 616 | seq_weight[j] = (seq_weight[j] * INT_SCALE_FACTOR) / sum; |
|---|
| 617 | if (seq_weight[j] < 1) seq_weight[j] = 1; |
|---|
| 618 | } |
|---|
| 619 | if (debug > 1) { |
|---|
| 620 | fprintf(stdout,"new weights\n"); |
|---|
| 621 | for (j=0;j<nseqs;j++) fprintf( stdout,"sequence %d: %d\n", j+1,seq_weight[j]); |
|---|
| 622 | } |
|---|
| 623 | |
|---|
| 624 | |
|---|
| 625 | /* do the alignment......... */ |
|---|
| 626 | |
|---|
| 627 | info("Aligning..."); |
|---|
| 628 | |
|---|
| 629 | group = (sint *)ckalloc( (nseqs+1) * sizeof (sint)); |
|---|
| 630 | |
|---|
| 631 | for(i=1; i<=profile1_nseqs; ++i) |
|---|
| 632 | group[i] = 1; |
|---|
| 633 | for(i=profile1_nseqs+1; i<=nseqs; ++i) |
|---|
| 634 | group[i] = 2; |
|---|
| 635 | entries = nseqs; |
|---|
| 636 | |
|---|
| 637 | aligned = (sint *)ckalloc( (nseqs+1) * sizeof (sint) ); |
|---|
| 638 | for (i=1;i<=nseqs;i++) aligned[i] = 1; |
|---|
| 639 | |
|---|
| 640 | score = prfalign(group, aligned); |
|---|
| 641 | info("Sequences:%d Score:%d",(pint)entries,(pint)score); |
|---|
| 642 | group=ckfree((void *)group); |
|---|
| 643 | p1_weight=ckfree((void *)p1_weight); |
|---|
| 644 | p2_weight=ckfree((void *)p2_weight); |
|---|
| 645 | aligned=ckfree((void *)aligned); |
|---|
| 646 | maxid=ckfree((void *)maxid); |
|---|
| 647 | |
|---|
| 648 | /* DES output_index = (int *)ckalloc( (nseqs+1) * sizeof (int)); */ |
|---|
| 649 | for (i=1;i<=nseqs;i++) output_index[i] = i; |
|---|
| 650 | |
|---|
| 651 | return(nseqs); |
|---|
| 652 | } |
|---|
| 653 | |
|---|