| 1 | #include <stdio.h> |
|---|
| 2 | #include <string.h> |
|---|
| 3 | #include <ctype.h> |
|---|
| 4 | #include <stdlib.h> |
|---|
| 5 | #include "clustalw.h" |
|---|
| 6 | |
|---|
| 7 | |
|---|
| 8 | /* |
|---|
| 9 | * Prototypes |
|---|
| 10 | */ |
|---|
| 11 | |
|---|
| 12 | /* |
|---|
| 13 | * Global Variables |
|---|
| 14 | */ |
|---|
| 15 | |
|---|
| 16 | extern double **tmat; |
|---|
| 17 | extern sint debug; |
|---|
| 18 | extern sint max_aa; |
|---|
| 19 | extern sint nseqs; |
|---|
| 20 | extern sint profile1_nseqs; |
|---|
| 21 | extern sint nsets; |
|---|
| 22 | extern sint **sets; |
|---|
| 23 | extern sint divergence_cutoff; |
|---|
| 24 | extern sint *seq_weight; |
|---|
| 25 | extern sint output_order, *output_index; |
|---|
| 26 | extern Boolean distance_tree; |
|---|
| 27 | extern char seqname[]; |
|---|
| 28 | extern sint *seqlen_array; |
|---|
| 29 | extern char **seq_array; |
|---|
| 30 | |
|---|
| 31 | sint malign(sint istart,char *phylip_name) /* full progressive alignment*/ |
|---|
| 32 | { |
|---|
| 33 | static sint *aligned; |
|---|
| 34 | static sint *group; |
|---|
| 35 | static sint ix; |
|---|
| 36 | |
|---|
| 37 | sint *maxid, max, sum; |
|---|
| 38 | sint *tree_weight; |
|---|
| 39 | sint i,j,set,iseq=0; |
|---|
| 40 | sint status,entries; |
|---|
| 41 | lint score = 0; |
|---|
| 42 | |
|---|
| 43 | |
|---|
| 44 | info("Start of Multiple Alignment"); |
|---|
| 45 | |
|---|
| 46 | seq_weight = (sint *) ckalloc( (nseqs) * sizeof(sint) ); |
|---|
| 47 | |
|---|
| 48 | /* get the phylogenetic tree from *.ph */ |
|---|
| 49 | |
|---|
| 50 | if (nseqs > 3) |
|---|
| 51 | { |
|---|
| 52 | status = read_tree(phylip_name, (sint)0, nseqs); |
|---|
| 53 | if (status == 0) return((sint)0); |
|---|
| 54 | } |
|---|
| 55 | |
|---|
| 56 | /* calculate sequence weights according to branch lengths of the tree - |
|---|
| 57 | weights in global variable seq_weight normalised to sum to 100 */ |
|---|
| 58 | |
|---|
| 59 | calc_seq_weights((sint)0, nseqs, seq_weight); |
|---|
| 60 | |
|---|
| 61 | /* recalculate tmat matrix as percent similarity matrix */ |
|---|
| 62 | |
|---|
| 63 | status = calc_similarities(nseqs); |
|---|
| 64 | if (status == 0) return((sint)0); |
|---|
| 65 | |
|---|
| 66 | /* for each sequence, find the most closely related sequence */ |
|---|
| 67 | |
|---|
| 68 | maxid = (sint *)ckalloc( (nseqs+1) * sizeof (sint)); |
|---|
| 69 | for (i=1;i<=nseqs;i++) |
|---|
| 70 | { |
|---|
| 71 | maxid[i] = 0; |
|---|
| 72 | for (j=1;j<=nseqs;j++) |
|---|
| 73 | if (maxid[i] < tmat[i][j]) maxid[i] = tmat[i][j]; |
|---|
| 74 | } |
|---|
| 75 | |
|---|
| 76 | /* group the sequences according to their relative divergence */ |
|---|
| 77 | |
|---|
| 78 | if (istart == 0) |
|---|
| 79 | { |
|---|
| 80 | sets = (sint **) ckalloc( (nseqs+1) * sizeof (sint *) ); |
|---|
| 81 | for(i=0;i<=nseqs;i++) |
|---|
| 82 | sets[i] = (sint *)ckalloc( (nseqs+1) * sizeof (sint) ); |
|---|
| 83 | |
|---|
| 84 | create_sets((sint)0,nseqs); |
|---|
| 85 | info("There are %d groups",(pint)nsets); |
|---|
| 86 | |
|---|
| 87 | /* clear the memory used for the phylogenetic tree */ |
|---|
| 88 | |
|---|
| 89 | if (nseqs > 3) |
|---|
| 90 | clear_tree(NULL); |
|---|
| 91 | |
|---|
| 92 | /* start the multiple alignments......... */ |
|---|
| 93 | |
|---|
| 94 | info("Aligning..."); |
|---|
| 95 | |
|---|
| 96 | /* first pass, align closely related sequences first.... */ |
|---|
| 97 | |
|---|
| 98 | ix = 0; |
|---|
| 99 | aligned = (sint *)ckalloc( (nseqs+1) * sizeof (sint) ); |
|---|
| 100 | for (i=0;i<=nseqs;i++) aligned[i] = 0; |
|---|
| 101 | |
|---|
| 102 | for(set=1;set<=nsets;++set) |
|---|
| 103 | { |
|---|
| 104 | entries=0; |
|---|
| 105 | for (i=1;i<=nseqs;i++) |
|---|
| 106 | { |
|---|
| 107 | if ((sets[set][i] != 0) && (maxid[i] > divergence_cutoff)) |
|---|
| 108 | { |
|---|
| 109 | entries++; |
|---|
| 110 | if (aligned[i] == 0) |
|---|
| 111 | { |
|---|
| 112 | if (output_order==INPUT) |
|---|
| 113 | { |
|---|
| 114 | ++ix; |
|---|
| 115 | output_index[i] = i; |
|---|
| 116 | } |
|---|
| 117 | else output_index[++ix] = i; |
|---|
| 118 | aligned[i] = 1; |
|---|
| 119 | } |
|---|
| 120 | } |
|---|
| 121 | } |
|---|
| 122 | |
|---|
| 123 | if(entries > 0) score = prfalign(sets[set], aligned); |
|---|
| 124 | else score=0.0; |
|---|
| 125 | |
|---|
| 126 | |
|---|
| 127 | /* negative score means fatal error... exit now! */ |
|---|
| 128 | |
|---|
| 129 | if (score < 0) |
|---|
| 130 | { |
|---|
| 131 | return(-1); |
|---|
| 132 | } |
|---|
| 133 | if ((entries > 0) && (score > 0)) |
|---|
| 134 | info("Group %d: Sequences:%4d Score:%d", |
|---|
| 135 | (pint)set,(pint)entries,(pint)score); |
|---|
| 136 | else |
|---|
| 137 | info("Group %d: Delayed", |
|---|
| 138 | (pint)set); |
|---|
| 139 | } |
|---|
| 140 | |
|---|
| 141 | for (i=0;i<=nseqs;i++) |
|---|
| 142 | sets[i]=ckfree((void *)sets[i]); |
|---|
| 143 | sets=ckfree(sets); |
|---|
| 144 | } |
|---|
| 145 | else |
|---|
| 146 | { |
|---|
| 147 | /* clear the memory used for the phylogenetic tree */ |
|---|
| 148 | |
|---|
| 149 | if (nseqs > 3) |
|---|
| 150 | clear_tree(NULL); |
|---|
| 151 | |
|---|
| 152 | aligned = (sint *)ckalloc( (nseqs+1) * sizeof (sint) ); |
|---|
| 153 | ix = 0; |
|---|
| 154 | for (i=1;i<=istart+1;i++) |
|---|
| 155 | { |
|---|
| 156 | aligned[i] = 1; |
|---|
| 157 | ++ix; |
|---|
| 158 | output_index[i] = i; |
|---|
| 159 | } |
|---|
| 160 | for (i=istart+2;i<=nseqs;i++) aligned[i] = 0; |
|---|
| 161 | } |
|---|
| 162 | |
|---|
| 163 | /* second pass - align remaining, more divergent sequences..... */ |
|---|
| 164 | |
|---|
| 165 | /* if not all sequences were aligned, for each unaligned sequence, |
|---|
| 166 | find it's closest pair amongst the aligned sequences. */ |
|---|
| 167 | |
|---|
| 168 | group = (sint *)ckalloc( (nseqs+1) * sizeof (sint)); |
|---|
| 169 | tree_weight = (sint *) ckalloc( (nseqs) * sizeof(sint) ); |
|---|
| 170 | for (i=0;i<nseqs;i++) |
|---|
| 171 | tree_weight[i] = seq_weight[i]; |
|---|
| 172 | |
|---|
| 173 | while (ix < nseqs) |
|---|
| 174 | { |
|---|
| 175 | if (ix > 0) |
|---|
| 176 | { |
|---|
| 177 | for (i=1;i<=nseqs;i++) { |
|---|
| 178 | if (aligned[i] == 0) |
|---|
| 179 | { |
|---|
| 180 | maxid[i] = 1; |
|---|
| 181 | for (j=1;j<=nseqs;j++) |
|---|
| 182 | if ((maxid[i] < tmat[i][j]) && (aligned[j] != 0)) |
|---|
| 183 | maxid[i] = tmat[i][j]; |
|---|
| 184 | } |
|---|
| 185 | } |
|---|
| 186 | } |
|---|
| 187 | |
|---|
| 188 | /* find the most closely related sequence to those already aligned */ |
|---|
| 189 | |
|---|
| 190 | max = 0; |
|---|
| 191 | iseq = 0; |
|---|
| 192 | for (i=1;i<=nseqs;i++) |
|---|
| 193 | { |
|---|
| 194 | if ((aligned[i] == 0) && (maxid[i] > max)) |
|---|
| 195 | { |
|---|
| 196 | max = maxid[i]; |
|---|
| 197 | iseq = i; |
|---|
| 198 | } |
|---|
| 199 | } |
|---|
| 200 | |
|---|
| 201 | /* align this sequence to the existing alignment */ |
|---|
| 202 | /* weight sequences with percent identity with profile*/ |
|---|
| 203 | /* OR...., multiply sequence weights from tree by percent identity with new sequence */ |
|---|
| 204 | for (j=0;j<nseqs;j++) |
|---|
| 205 | if (aligned[j+1] != 0) |
|---|
| 206 | seq_weight[j] = tree_weight[j] * tmat[j+1][iseq]; |
|---|
| 207 | /* |
|---|
| 208 | Normalise the weights, such that the sum of the weights = INT_SCALE_FACTOR |
|---|
| 209 | */ |
|---|
| 210 | |
|---|
| 211 | sum = 0; |
|---|
| 212 | for (j=0;j<nseqs;j++) |
|---|
| 213 | if (aligned[j+1] != 0) |
|---|
| 214 | sum += seq_weight[j]; |
|---|
| 215 | if (sum == 0) |
|---|
| 216 | { |
|---|
| 217 | for (j=0;j<nseqs;j++) |
|---|
| 218 | seq_weight[j] = 1; |
|---|
| 219 | sum = j; |
|---|
| 220 | } |
|---|
| 221 | for (j=0;j<nseqs;j++) |
|---|
| 222 | if (aligned[j+1] != 0) |
|---|
| 223 | { |
|---|
| 224 | seq_weight[j] = (seq_weight[j] * INT_SCALE_FACTOR) / sum; |
|---|
| 225 | if (seq_weight[j] < 1) seq_weight[j] = 1; |
|---|
| 226 | } |
|---|
| 227 | |
|---|
| 228 | entries = 0; |
|---|
| 229 | for (j=1;j<=nseqs;j++) |
|---|
| 230 | if (aligned[j] != 0) |
|---|
| 231 | { |
|---|
| 232 | group[j] = 1; |
|---|
| 233 | entries++; |
|---|
| 234 | } |
|---|
| 235 | else if (iseq==j) |
|---|
| 236 | { |
|---|
| 237 | group[j] = 2; |
|---|
| 238 | entries++; |
|---|
| 239 | } |
|---|
| 240 | aligned[iseq] = 1; |
|---|
| 241 | |
|---|
| 242 | score = prfalign(group, aligned); |
|---|
| 243 | info("Sequence:%d Score:%d",(pint)iseq,(pint)score); |
|---|
| 244 | if (output_order == INPUT) |
|---|
| 245 | { |
|---|
| 246 | ++ix; |
|---|
| 247 | output_index[iseq] = iseq; |
|---|
| 248 | } |
|---|
| 249 | else |
|---|
| 250 | output_index[++ix] = iseq; |
|---|
| 251 | } |
|---|
| 252 | |
|---|
| 253 | group=ckfree((void *)group); |
|---|
| 254 | seq_weight=ckfree((void *)seq_weight); |
|---|
| 255 | aligned=ckfree((void *)aligned); |
|---|
| 256 | maxid=ckfree((void *)maxid); |
|---|
| 257 | tree_weight=ckfree((void *)tree_weight); |
|---|
| 258 | |
|---|
| 259 | aln_score(); |
|---|
| 260 | |
|---|
| 261 | /* make the rest (output stuff) into routine clustal_out in file amenu.c */ |
|---|
| 262 | |
|---|
| 263 | return(nseqs); |
|---|
| 264 | |
|---|
| 265 | } |
|---|
| 266 | |
|---|
| 267 | sint seqalign(sint istart,char *phylip_name) /* sequence alignment to existing profile */ |
|---|
| 268 | { |
|---|
| 269 | static sint *aligned, *tree_weight; |
|---|
| 270 | static sint *group; |
|---|
| 271 | static sint ix; |
|---|
| 272 | |
|---|
| 273 | sint *maxid, max; |
|---|
| 274 | sint i,j,status,iseq; |
|---|
| 275 | sint sum,entries; |
|---|
| 276 | lint score = 0; |
|---|
| 277 | |
|---|
| 278 | |
|---|
| 279 | info("Start of Multiple Alignment"); |
|---|
| 280 | |
|---|
| 281 | seq_weight = (sint *) ckalloc( (nseqs) * sizeof(sint) ); |
|---|
| 282 | |
|---|
| 283 | /* get the phylogenetic tree from *.ph */ |
|---|
| 284 | |
|---|
| 285 | if (nseqs > 3) |
|---|
| 286 | { |
|---|
| 287 | status = read_tree(phylip_name, (sint)0, nseqs); |
|---|
| 288 | if (status == 0) return(0); |
|---|
| 289 | } |
|---|
| 290 | |
|---|
| 291 | /* calculate sequence weights according to branch lengths of the tree - |
|---|
| 292 | weights in global variable seq_weight normalised to sum to 100 */ |
|---|
| 293 | |
|---|
| 294 | calc_seq_weights((sint)0, nseqs, seq_weight); |
|---|
| 295 | |
|---|
| 296 | tree_weight = (sint *) ckalloc( (nseqs) * sizeof(sint) ); |
|---|
| 297 | for (i=0;i<nseqs;i++) |
|---|
| 298 | tree_weight[i] = seq_weight[i]; |
|---|
| 299 | |
|---|
| 300 | /* recalculate tmat matrix as percent similarity matrix */ |
|---|
| 301 | |
|---|
| 302 | status = calc_similarities(nseqs); |
|---|
| 303 | if (status == 0) return((sint)0); |
|---|
| 304 | |
|---|
| 305 | /* for each sequence, find the most closely related sequence */ |
|---|
| 306 | |
|---|
| 307 | maxid = (sint *)ckalloc( (nseqs+1) * sizeof (sint)); |
|---|
| 308 | for (i=1;i<=nseqs;i++) |
|---|
| 309 | { |
|---|
| 310 | maxid[i] = 0; |
|---|
| 311 | for (j=1;j<=nseqs;j++) |
|---|
| 312 | if (maxid[i] < tmat[i][j]) maxid[i] = tmat[i][j]; |
|---|
| 313 | } |
|---|
| 314 | |
|---|
| 315 | /* clear the memory used for the phylogenetic tree */ |
|---|
| 316 | |
|---|
| 317 | if (nseqs > 3) |
|---|
| 318 | clear_tree(NULL); |
|---|
| 319 | |
|---|
| 320 | aligned = (sint *)ckalloc( (nseqs+1) * sizeof (sint) ); |
|---|
| 321 | ix = 0; |
|---|
| 322 | for (i=1;i<=istart+1;i++) |
|---|
| 323 | { |
|---|
| 324 | aligned[i] = 1; |
|---|
| 325 | ++ix; |
|---|
| 326 | output_index[i] = i; |
|---|
| 327 | } |
|---|
| 328 | for (i=istart+2;i<=nseqs;i++) aligned[i] = 0; |
|---|
| 329 | |
|---|
| 330 | /* for each unaligned sequence, find it's closest pair amongst the |
|---|
| 331 | aligned sequences. */ |
|---|
| 332 | |
|---|
| 333 | group = (sint *)ckalloc( (nseqs+1) * sizeof (sint)); |
|---|
| 334 | |
|---|
| 335 | while (ix < nseqs) |
|---|
| 336 | { |
|---|
| 337 | if (ix > 0) |
|---|
| 338 | { |
|---|
| 339 | for (i=1;i<=nseqs;i++) { |
|---|
| 340 | if (aligned[i] == 0) |
|---|
| 341 | { |
|---|
| 342 | maxid[i] = 0; |
|---|
| 343 | for (j=1;j<=nseqs;j++) |
|---|
| 344 | if ((maxid[i] < tmat[i][j]) && (aligned[j] != 0)) |
|---|
| 345 | maxid[i] = tmat[i][j]; |
|---|
| 346 | } |
|---|
| 347 | } |
|---|
| 348 | } |
|---|
| 349 | |
|---|
| 350 | /* find the most closely related sequence to those already aligned */ |
|---|
| 351 | |
|---|
| 352 | max = 0; |
|---|
| 353 | for (i=1;i<=nseqs;i++) |
|---|
| 354 | { |
|---|
| 355 | if ((aligned[i] == 0) && (maxid[i] > max)) |
|---|
| 356 | { |
|---|
| 357 | max = maxid[i]; |
|---|
| 358 | iseq = i; |
|---|
| 359 | } |
|---|
| 360 | } |
|---|
| 361 | |
|---|
| 362 | /* align this sequence to the existing alignment */ |
|---|
| 363 | |
|---|
| 364 | entries = 0; |
|---|
| 365 | for (j=1;j<=nseqs;j++) |
|---|
| 366 | if (aligned[j] != 0) |
|---|
| 367 | { |
|---|
| 368 | group[j] = 1; |
|---|
| 369 | entries++; |
|---|
| 370 | } |
|---|
| 371 | else if (iseq==j) |
|---|
| 372 | { |
|---|
| 373 | group[j] = 2; |
|---|
| 374 | entries++; |
|---|
| 375 | } |
|---|
| 376 | aligned[iseq] = 1; |
|---|
| 377 | |
|---|
| 378 | |
|---|
| 379 | /* EITHER....., set sequence weights equal to percent identity with new sequence */ |
|---|
| 380 | /* |
|---|
| 381 | for (j=0;j<nseqs;j++) |
|---|
| 382 | seq_weight[j] = tmat[j+1][iseq]; |
|---|
| 383 | */ |
|---|
| 384 | /* OR...., multiply sequence weights from tree by percent identity with new sequence */ |
|---|
| 385 | for (j=0;j<nseqs;j++) |
|---|
| 386 | seq_weight[j] = tree_weight[j] * tmat[j+1][iseq]; |
|---|
| 387 | if (debug>1) |
|---|
| 388 | for (j=0;j<nseqs;j++) if (group[j+1] == 1)fprintf (stdout,"sequence %d: %d\n", j+1,tree_weight[j]); |
|---|
| 389 | /* |
|---|
| 390 | Normalise the weights, such that the sum of the weights = INT_SCALE_FACTOR |
|---|
| 391 | */ |
|---|
| 392 | |
|---|
| 393 | sum = 0; |
|---|
| 394 | for (j=0;j<nseqs;j++) |
|---|
| 395 | if (group[j+1] == 1) sum += seq_weight[j]; |
|---|
| 396 | if (sum == 0) |
|---|
| 397 | { |
|---|
| 398 | for (j=0;j<nseqs;j++) |
|---|
| 399 | seq_weight[j] = 1; |
|---|
| 400 | sum = j; |
|---|
| 401 | } |
|---|
| 402 | for (j=0;j<nseqs;j++) |
|---|
| 403 | { |
|---|
| 404 | seq_weight[j] = (seq_weight[j] * INT_SCALE_FACTOR) / sum; |
|---|
| 405 | if (seq_weight[j] < 1) seq_weight[j] = 1; |
|---|
| 406 | } |
|---|
| 407 | |
|---|
| 408 | if (debug > 1) { |
|---|
| 409 | fprintf(stdout,"new weights\n"); |
|---|
| 410 | for (j=0;j<nseqs;j++) if (group[j+1] == 1)fprintf( stdout,"sequence %d: %d\n", j+1,seq_weight[j]); |
|---|
| 411 | } |
|---|
| 412 | |
|---|
| 413 | score = prfalign(group, aligned); |
|---|
| 414 | info("Sequence:%d Score:%d",(pint)iseq,(pint)score); |
|---|
| 415 | if (output_order == INPUT) |
|---|
| 416 | { |
|---|
| 417 | ++ix; |
|---|
| 418 | output_index[iseq] = iseq; |
|---|
| 419 | } |
|---|
| 420 | else |
|---|
| 421 | output_index[++ix] = iseq; |
|---|
| 422 | } |
|---|
| 423 | |
|---|
| 424 | group=ckfree((void *)group); |
|---|
| 425 | seq_weight=ckfree((void *)seq_weight); |
|---|
| 426 | aligned=ckfree((void *)aligned); |
|---|
| 427 | maxid=ckfree((void *)maxid); |
|---|
| 428 | |
|---|
| 429 | aln_score(); |
|---|
| 430 | |
|---|
| 431 | /* make the rest (output stuff) into routine clustal_out in file amenu.c */ |
|---|
| 432 | |
|---|
| 433 | return(nseqs); |
|---|
| 434 | |
|---|
| 435 | } |
|---|
| 436 | |
|---|
| 437 | |
|---|
| 438 | sint palign1(void) /* a profile alignment */ |
|---|
| 439 | { |
|---|
| 440 | sint i,j,temp; |
|---|
| 441 | sint entries; |
|---|
| 442 | sint *aligned, *group; |
|---|
| 443 | float dscore; |
|---|
| 444 | lint score; |
|---|
| 445 | |
|---|
| 446 | info("Start of Initial Alignment"); |
|---|
| 447 | |
|---|
| 448 | /* calculate sequence weights according to branch lengths of the tree - |
|---|
| 449 | weights in global variable seq_weight normalised to sum to INT_SCALE_FACTOR */ |
|---|
| 450 | |
|---|
| 451 | seq_weight = (sint *) ckalloc( (nseqs) * sizeof(sint) ); |
|---|
| 452 | |
|---|
| 453 | temp = INT_SCALE_FACTOR/nseqs; |
|---|
| 454 | for (i=0;i<nseqs;i++) seq_weight[i] = temp; |
|---|
| 455 | |
|---|
| 456 | distance_tree = FALSE; |
|---|
| 457 | |
|---|
| 458 | /* do the initial alignment......... */ |
|---|
| 459 | |
|---|
| 460 | group = (sint *)ckalloc( (nseqs+1) * sizeof (sint)); |
|---|
| 461 | |
|---|
| 462 | for(i=1; i<=profile1_nseqs; ++i) |
|---|
| 463 | group[i] = 1; |
|---|
| 464 | for(i=profile1_nseqs+1; i<=nseqs; ++i) |
|---|
| 465 | group[i] = 2; |
|---|
| 466 | entries = nseqs; |
|---|
| 467 | |
|---|
| 468 | aligned = (sint *)ckalloc( (nseqs+1) * sizeof (sint) ); |
|---|
| 469 | for (i=1;i<=nseqs;i++) aligned[i] = 1; |
|---|
| 470 | |
|---|
| 471 | score = prfalign(group, aligned); |
|---|
| 472 | info("Sequences:%d Score:%d",(pint)entries,(pint)score); |
|---|
| 473 | group=ckfree((void *)group); |
|---|
| 474 | seq_weight=ckfree((void *)seq_weight); |
|---|
| 475 | aligned=ckfree((void *)aligned); |
|---|
| 476 | |
|---|
| 477 | for (i=1;i<=nseqs;i++) { |
|---|
| 478 | for (j=i+1;j<=nseqs;j++) { |
|---|
| 479 | dscore = countid(i,j); |
|---|
| 480 | tmat[i][j] = ((double)100.0 - (double)dscore)/(double)100.0; |
|---|
| 481 | tmat[j][i] = tmat[i][j]; |
|---|
| 482 | } |
|---|
| 483 | } |
|---|
| 484 | |
|---|
| 485 | return(nseqs); |
|---|
| 486 | } |
|---|
| 487 | |
|---|
| 488 | float countid(sint s1, sint s2) |
|---|
| 489 | { |
|---|
| 490 | char c1,c2; |
|---|
| 491 | sint i; |
|---|
| 492 | sint count,total; |
|---|
| 493 | float score; |
|---|
| 494 | |
|---|
| 495 | count = total = 0; |
|---|
| 496 | for (i=1;i<=seqlen_array[s1] && i<=seqlen_array[s2];i++) { |
|---|
| 497 | c1 = seq_array[s1][i]; |
|---|
| 498 | c2 = seq_array[s2][i]; |
|---|
| 499 | if ((c1>=0) && (c1<max_aa)) { |
|---|
| 500 | total++; |
|---|
| 501 | if (c1 == c2) count++; |
|---|
| 502 | } |
|---|
| 503 | |
|---|
| 504 | } |
|---|
| 505 | |
|---|
| 506 | score = 100.0 * (float)count / (float)total; |
|---|
| 507 | return(score); |
|---|
| 508 | |
|---|
| 509 | } |
|---|
| 510 | |
|---|
| 511 | sint palign2(char *p1_tree_name,char *p2_tree_name) /* a profile alignment */ |
|---|
| 512 | { |
|---|
| 513 | sint i,j,sum,entries,status; |
|---|
| 514 | lint score; |
|---|
| 515 | sint *aligned, *group; |
|---|
| 516 | sint *maxid,*p1_weight,*p2_weight; |
|---|
| 517 | sint dscore; |
|---|
| 518 | |
|---|
| 519 | info("Start of Multiple Alignment"); |
|---|
| 520 | |
|---|
| 521 | /* get the phylogenetic trees from *.ph */ |
|---|
| 522 | |
|---|
| 523 | if (profile1_nseqs > 3) |
|---|
| 524 | { |
|---|
| 525 | status = read_tree(p1_tree_name, (sint)0, profile1_nseqs); |
|---|
| 526 | if (status == 0) return(0); |
|---|
| 527 | } |
|---|
| 528 | |
|---|
| 529 | /* calculate sequence weights according to branch lengths of the tree - |
|---|
| 530 | weights in global variable seq_weight normalised to sum to 100 */ |
|---|
| 531 | |
|---|
| 532 | p1_weight = (sint *) ckalloc( (profile1_nseqs) * sizeof(sint) ); |
|---|
| 533 | |
|---|
| 534 | calc_seq_weights((sint)0, profile1_nseqs, p1_weight); |
|---|
| 535 | |
|---|
| 536 | /* clear the memory for the phylogenetic tree */ |
|---|
| 537 | |
|---|
| 538 | if (profile1_nseqs > 3) |
|---|
| 539 | clear_tree(NULL); |
|---|
| 540 | |
|---|
| 541 | if (nseqs-profile1_nseqs > 3) |
|---|
| 542 | { |
|---|
| 543 | status = read_tree(p2_tree_name, profile1_nseqs, nseqs); |
|---|
| 544 | if (status == 0) return(0); |
|---|
| 545 | } |
|---|
| 546 | |
|---|
| 547 | p2_weight = (sint *) ckalloc( (nseqs) * sizeof(sint) ); |
|---|
| 548 | |
|---|
| 549 | calc_seq_weights(profile1_nseqs,nseqs, p2_weight); |
|---|
| 550 | |
|---|
| 551 | |
|---|
| 552 | /* clear the memory for the phylogenetic tree */ |
|---|
| 553 | |
|---|
| 554 | if (nseqs-profile1_nseqs > 3) |
|---|
| 555 | clear_tree(NULL); |
|---|
| 556 | |
|---|
| 557 | /* convert tmat distances to similarities */ |
|---|
| 558 | |
|---|
| 559 | for (i=1;i<nseqs;i++) |
|---|
| 560 | for (j=i+1;j<=nseqs;j++) { |
|---|
| 561 | tmat[i][j]=100.0-tmat[i][j]*100.0; |
|---|
| 562 | tmat[j][i]=tmat[i][j]; |
|---|
| 563 | } |
|---|
| 564 | |
|---|
| 565 | |
|---|
| 566 | /* weight sequences with max percent identity with other profile*/ |
|---|
| 567 | seq_weight = (sint *) ckalloc( (nseqs) * sizeof(sint) ); |
|---|
| 568 | |
|---|
| 569 | maxid = (sint *)ckalloc( (nseqs+1) * sizeof (sint)); |
|---|
| 570 | for (i=0;i<profile1_nseqs;i++) { |
|---|
| 571 | maxid[i] = 0; |
|---|
| 572 | for (j=profile1_nseqs+1;j<=nseqs;j++) |
|---|
| 573 | if(maxid[i]<tmat[i+1][j]) maxid[i] = tmat[i+1][j]; |
|---|
| 574 | seq_weight[i] = maxid[i]*p1_weight[i]; |
|---|
| 575 | } |
|---|
| 576 | |
|---|
| 577 | for (i=profile1_nseqs;i<nseqs;i++) { |
|---|
| 578 | maxid[i] = 0; |
|---|
| 579 | for (j=1;j<=profile1_nseqs;j++) |
|---|
| 580 | if(maxid[i]<tmat[i+1][j]) maxid[i] = tmat[i+1][j]; |
|---|
| 581 | seq_weight[i] = maxid[i]*p2_weight[i]; |
|---|
| 582 | } |
|---|
| 583 | /* |
|---|
| 584 | Normalise the weights, such that the sum of the weights = INT_SCALE_FACTOR |
|---|
| 585 | */ |
|---|
| 586 | |
|---|
| 587 | sum = 0; |
|---|
| 588 | for (j=0;j<nseqs;j++) |
|---|
| 589 | sum += seq_weight[j]; |
|---|
| 590 | if (sum == 0) |
|---|
| 591 | { |
|---|
| 592 | for (j=0;j<nseqs;j++) |
|---|
| 593 | seq_weight[j] = 1; |
|---|
| 594 | sum = j; |
|---|
| 595 | } |
|---|
| 596 | for (j=0;j<nseqs;j++) |
|---|
| 597 | { |
|---|
| 598 | seq_weight[j] = (seq_weight[j] * INT_SCALE_FACTOR) / sum; |
|---|
| 599 | if (seq_weight[j] < 1) seq_weight[j] = 1; |
|---|
| 600 | } |
|---|
| 601 | if (debug > 1) { |
|---|
| 602 | fprintf(stdout,"new weights\n"); |
|---|
| 603 | for (j=0;j<nseqs;j++) fprintf( stdout,"sequence %d: %d\n", j+1,seq_weight[j]); |
|---|
| 604 | } |
|---|
| 605 | |
|---|
| 606 | |
|---|
| 607 | /* do the alignment......... */ |
|---|
| 608 | |
|---|
| 609 | info("Aligning..."); |
|---|
| 610 | |
|---|
| 611 | group = (sint *)ckalloc( (nseqs+1) * sizeof (sint)); |
|---|
| 612 | |
|---|
| 613 | for(i=1; i<=profile1_nseqs; ++i) |
|---|
| 614 | group[i] = 1; |
|---|
| 615 | for(i=profile1_nseqs+1; i<=nseqs; ++i) |
|---|
| 616 | group[i] = 2; |
|---|
| 617 | entries = nseqs; |
|---|
| 618 | |
|---|
| 619 | aligned = (sint *)ckalloc( (nseqs+1) * sizeof (sint) ); |
|---|
| 620 | for (i=1;i<=nseqs;i++) aligned[i] = 1; |
|---|
| 621 | |
|---|
| 622 | score = prfalign(group, aligned); |
|---|
| 623 | info("Sequences:%d Score:%d",(pint)entries,(pint)score); |
|---|
| 624 | group=ckfree((void *)group); |
|---|
| 625 | p1_weight=ckfree((void *)p1_weight); |
|---|
| 626 | p2_weight=ckfree((void *)p2_weight); |
|---|
| 627 | seq_weight=ckfree((void *)seq_weight); |
|---|
| 628 | aligned=ckfree((void *)aligned); |
|---|
| 629 | maxid=ckfree((void *)maxid); |
|---|
| 630 | |
|---|
| 631 | /* DES output_index = (int *)ckalloc( (nseqs+1) * sizeof (int)); */ |
|---|
| 632 | for (i=1;i<=nseqs;i++) output_index[i] = i; |
|---|
| 633 | |
|---|
| 634 | return(nseqs); |
|---|
| 635 | } |
|---|
| 636 | |
|---|