| 1 | #include "phylip.h" |
|---|
| 2 | #include "seq.h" |
|---|
| 3 | |
|---|
| 4 | /* version 3.6. (c) Copyright 1993-2002 by the University of Washington. |
|---|
| 5 | Written by Joseph Felsenstein, Akiko Fuseki, Sean Lamont, and Andrew Keeffe. |
|---|
| 6 | Permission is granted to copy and use this program provided no fee is |
|---|
| 7 | charged for it and provided that this copyright notice is not removed. */ |
|---|
| 8 | |
|---|
| 9 | #define iterationsd 100 /* number of iterates of EM for each distance */ |
|---|
| 10 | |
|---|
/* One entry of the per-category lookup table tbl[].
   inittable() fills rat and ratxv; the remaining fields are
   not written by inittable(). */
typedef struct valrec {
  double rat;      /* rate[i] for this category */
  double ratxv;    /* rate[i] * xv */
  double z1;
  double y1;
  double z1zz;
  double z1yy;
  double z1xv;
} valrec;
|---|
| 14 | |
|---|
| 15 | extern sequence y; |
|---|
| 16 | |
|---|
| 17 | Char infilename[FNMLNGTH], outfilename[FNMLNGTH], catfilename[FNMLNGTH], weightfilename[FNMLNGTH]; |
|---|
| 18 | long sites, categs, weightsum, datasets, ith, rcategs; |
|---|
| 19 | boolean freqsfrom, jukes, kimura, logdet, gama, invar, similarity, lower, f84, |
|---|
| 20 | weights, progress, ctgry, mulsets, justwts, firstset, baddists; |
|---|
| 21 | node **nodep; |
|---|
| 22 | double xi, xv, ttratio, ttratio0, freqa, freqc, freqg, freqt, freqr, freqy, |
|---|
| 23 | freqar, freqcy, freqgr, freqty, cvi, invarfrac, sumrates, fracchange; |
|---|
| 24 | steptr oldweight; |
|---|
| 25 | double rate[maxcategs]; |
|---|
| 26 | double **d; |
|---|
| 27 | double sumweightrat; /* these values were propagated */ |
|---|
| 28 | double *weightrat; /* to global values from */ |
|---|
| 29 | valrec tbl[maxcategs]; /* function makedists. */ |
|---|
| 30 | |
|---|
| 31 | |
|---|
#ifndef OLDC
/* function prototypes */
void   getoptions(void);
void   allocrest(void);
void   reallocsites(void);
void   doinit(void);
void   inputcategories(void);
void   printcategories(void);
void   inputoptions(void);
void   dnadist_sitesort(void);
void   dnadist_sitecombine(void);
void   dnadist_sitescrunch(void);
void   makeweights(void);
void   dnadist_makevalues(void);
void   dnadist_empiricalfreqs(void);
void   getinput(void);
void   inittable(void);
double lndet(double (*a)[4]);
void   makev(long m, long n, double *v);
void   makedists(void);
void   writedists(void);
/* end of function prototypes */
#endif
|---|
| 55 | |
|---|
| 56 | |
|---|
| 57 | void getoptions() |
|---|
| 58 | { |
|---|
| 59 | /* interactively set options */ |
|---|
| 60 | long loopcount, loopcount2; |
|---|
| 61 | Char ch, ch2; |
|---|
| 62 | boolean ttr; |
|---|
| 63 | |
|---|
| 64 | ctgry = false; |
|---|
| 65 | categs = 1; |
|---|
| 66 | cvi = 1.0; |
|---|
| 67 | rcategs = 1; |
|---|
| 68 | rate[0] = 1.0; |
|---|
| 69 | freqsfrom = true; |
|---|
| 70 | gama = false; |
|---|
| 71 | invar = false; |
|---|
| 72 | invarfrac = 0.0; |
|---|
| 73 | jukes = false; |
|---|
| 74 | justwts = false; |
|---|
| 75 | kimura = false; |
|---|
| 76 | logdet = false; |
|---|
| 77 | f84 = true; |
|---|
| 78 | lower = false; |
|---|
| 79 | similarity = false; |
|---|
| 80 | ttratio = 2.0; |
|---|
| 81 | ttr = false; |
|---|
| 82 | weights = false; |
|---|
| 83 | printdata = false; |
|---|
| 84 | progress = true; |
|---|
| 85 | interleaved = true; |
|---|
| 86 | loopcount = 0; |
|---|
| 87 | for (;;) { |
|---|
| 88 | cleerhome(); |
|---|
| 89 | printf("\nNucleic acid sequence Distance Matrix program,"); |
|---|
| 90 | printf(" version %s\n\n",VERSION); |
|---|
| 91 | printf("Settings for this run:\n"); |
|---|
| 92 | printf(" D Distance (F84, Kimura, Jukes-Cantor, LogDet)? %s\n", |
|---|
| 93 | kimura ? "Kimura 2-parameter" : |
|---|
| 94 | jukes ? "Jukes-Cantor" : |
|---|
| 95 | logdet ? "LogDet" : |
|---|
| 96 | similarity ? "Similarity table" : "F84"); |
|---|
| 97 | if (kimura || f84 || jukes) { |
|---|
| 98 | printf(" G Gamma distributed rates across sites? "); |
|---|
| 99 | if (gama) |
|---|
| 100 | printf("Yes\n"); |
|---|
| 101 | else { |
|---|
| 102 | if (invar) |
|---|
| 103 | printf("Gamma+Invariant\n"); |
|---|
| 104 | else |
|---|
| 105 | printf("No\n"); |
|---|
| 106 | } |
|---|
| 107 | } |
|---|
| 108 | if (kimura || f84) { |
|---|
| 109 | printf(" T Transition/transversion ratio?"); |
|---|
| 110 | if (!ttr) |
|---|
| 111 | printf(" 2.0\n"); |
|---|
| 112 | else |
|---|
| 113 | printf("%8.4f\n", ttratio); |
|---|
| 114 | } |
|---|
| 115 | if (!logdet && !similarity && !gama && !invar) { |
|---|
| 116 | printf(" C One category of substitution rates?"); |
|---|
| 117 | if (!ctgry || categs == 1) |
|---|
| 118 | printf(" Yes\n"); |
|---|
| 119 | else |
|---|
| 120 | printf(" %ld categories\n", categs); |
|---|
| 121 | } |
|---|
| 122 | printf(" W Use weights for sites?"); |
|---|
| 123 | if (weights) |
|---|
| 124 | printf(" Yes\n"); |
|---|
| 125 | else |
|---|
| 126 | printf(" No\n"); |
|---|
| 127 | if (f84) |
|---|
| 128 | printf(" F Use empirical base frequencies? %s\n", |
|---|
| 129 | (freqsfrom ? "Yes" : "No")); |
|---|
| 130 | printf(" L Form of distance matrix? %s\n", |
|---|
| 131 | (lower ? "Lower-triangular" : "Square")); |
|---|
| 132 | printf(" M Analyze multiple data sets?"); |
|---|
| 133 | if (mulsets) |
|---|
| 134 | printf(" Yes, %2ld %s\n", datasets, |
|---|
| 135 | (justwts ? "sets of weights" : "data sets")); |
|---|
| 136 | else |
|---|
| 137 | printf(" No\n"); |
|---|
| 138 | printf(" I Input sequences interleaved? %s\n", |
|---|
| 139 | (interleaved ? "Yes" : "No, sequential")); |
|---|
| 140 | printf(" 0 Terminal type (IBM PC, ANSI, none)? %s\n", |
|---|
| 141 | ibmpc ? "IBM PC" : ansi ? "ANSI" : "(none)"); |
|---|
| 142 | printf(" 1 Print out the data at start of run %s\n", |
|---|
| 143 | (printdata ? "Yes" : "No")); |
|---|
| 144 | printf(" 2 Print indications of progress of run %s\n", |
|---|
| 145 | (progress ? "Yes" : "No")); |
|---|
| 146 | printf("\n Y to accept these or type the letter for one to change\n"); |
|---|
| 147 | #ifdef WIN32 |
|---|
| 148 | phyFillScreenColor(); |
|---|
| 149 | #endif |
|---|
| 150 | scanf("%c%*[^\n]", &ch); |
|---|
| 151 | getchar(); |
|---|
| 152 | uppercase(&ch); |
|---|
| 153 | if (ch == 'Y') |
|---|
| 154 | break; |
|---|
| 155 | if ((f84 && (strchr("CFGWLDTMI012",ch) != NULL)) || |
|---|
| 156 | (kimura && (strchr("CGWLDTMI012",ch) != NULL)) || |
|---|
| 157 | (jukes && (strchr("CGWLDMI012",ch) != NULL)) || |
|---|
| 158 | ((logdet || similarity) && (strchr("WLDMI012",ch)) != NULL) || |
|---|
| 159 | (ctgry && (strchr("CFWLDTMI012",ch) != NULL))) { |
|---|
| 160 | switch (ch) { |
|---|
| 161 | |
|---|
| 162 | case 'D': |
|---|
| 163 | if (kimura) { |
|---|
| 164 | kimura = false; |
|---|
| 165 | jukes = true; |
|---|
| 166 | freqsfrom = false; |
|---|
| 167 | } else if (f84) { |
|---|
| 168 | f84 = false; |
|---|
| 169 | kimura = true; |
|---|
| 170 | freqsfrom = false; |
|---|
| 171 | } else if (logdet) { |
|---|
| 172 | logdet = false; |
|---|
| 173 | similarity = true; |
|---|
| 174 | } else if (similarity) { |
|---|
| 175 | similarity = false; |
|---|
| 176 | f84 = true; |
|---|
| 177 | freqsfrom = true; |
|---|
| 178 | } else { |
|---|
| 179 | jukes = false; |
|---|
| 180 | logdet = true; |
|---|
| 181 | freqsfrom = false; |
|---|
| 182 | } |
|---|
| 183 | break; |
|---|
| 184 | |
|---|
| 185 | case 'G': |
|---|
| 186 | if (!(gama || invar)) |
|---|
| 187 | gama = true; |
|---|
| 188 | else { |
|---|
| 189 | if (gama) { |
|---|
| 190 | gama = false; |
|---|
| 191 | invar = true; |
|---|
| 192 | } else { |
|---|
| 193 | if (invar) |
|---|
| 194 | invar = false; |
|---|
| 195 | } |
|---|
| 196 | } |
|---|
| 197 | break; |
|---|
| 198 | |
|---|
| 199 | |
|---|
| 200 | case 'C': |
|---|
| 201 | ctgry = !ctgry; |
|---|
| 202 | if (ctgry) { |
|---|
| 203 | initcatn(&categs); |
|---|
| 204 | initcategs(categs, rate); |
|---|
| 205 | } |
|---|
| 206 | break; |
|---|
| 207 | |
|---|
| 208 | case 'F': |
|---|
| 209 | freqsfrom = !freqsfrom; |
|---|
| 210 | if (!freqsfrom) |
|---|
| 211 | initfreqs(&freqa, &freqc, &freqg, &freqt); |
|---|
| 212 | break; |
|---|
| 213 | |
|---|
| 214 | case 'W': |
|---|
| 215 | weights = !weights; |
|---|
| 216 | break; |
|---|
| 217 | |
|---|
| 218 | case 'L': |
|---|
| 219 | lower = !lower; |
|---|
| 220 | break; |
|---|
| 221 | |
|---|
| 222 | case 'T': |
|---|
| 223 | ttr = !ttr; |
|---|
| 224 | if (ttr) |
|---|
| 225 | initratio(&ttratio); |
|---|
| 226 | break; |
|---|
| 227 | |
|---|
| 228 | case 'M': |
|---|
| 229 | mulsets = !mulsets; |
|---|
| 230 | if (mulsets) { |
|---|
| 231 | printf("Multiple data sets or multiple weights?"); |
|---|
| 232 | loopcount2 = 0; |
|---|
| 233 | do { |
|---|
| 234 | printf(" (type D or W)\n"); |
|---|
| 235 | #ifdef WIN32 |
|---|
| 236 | phyFillScreenColor(); |
|---|
| 237 | #endif |
|---|
| 238 | scanf("%c%*[^\n]", &ch2); |
|---|
| 239 | uppercase(&ch2); |
|---|
| 240 | getchar(); |
|---|
| 241 | countup(&loopcount2, 10); |
|---|
| 242 | } while ((ch2 != 'W') && (ch2 != 'D')); |
|---|
| 243 | justwts = (ch2 == 'W'); |
|---|
| 244 | if (justwts) |
|---|
| 245 | justweights(&datasets); |
|---|
| 246 | else |
|---|
| 247 | initdatasets(&datasets); |
|---|
| 248 | } |
|---|
| 249 | break; |
|---|
| 250 | |
|---|
| 251 | case 'I': |
|---|
| 252 | interleaved = !interleaved; |
|---|
| 253 | break; |
|---|
| 254 | |
|---|
| 255 | case '0': |
|---|
| 256 | initterminal(&ibmpc, &ansi); |
|---|
| 257 | break; |
|---|
| 258 | |
|---|
| 259 | case '1': |
|---|
| 260 | printdata = !printdata; |
|---|
| 261 | break; |
|---|
| 262 | |
|---|
| 263 | case '2': |
|---|
| 264 | progress = !progress; |
|---|
| 265 | break; |
|---|
| 266 | } |
|---|
| 267 | } else { |
|---|
| 268 | if (strchr("CFGWLDTMI012",ch) == NULL) |
|---|
| 269 | printf("Not a possible option!\n"); |
|---|
| 270 | else |
|---|
| 271 | printf("That option not allowed with these settings\n"); |
|---|
| 272 | printf("\nPress Enter or Return key to continue\n"); |
|---|
| 273 | getchar(); |
|---|
| 274 | } |
|---|
| 275 | countup(&loopcount, 100); |
|---|
| 276 | } |
|---|
| 277 | if (gama || invar) { |
|---|
| 278 | loopcount = 0; |
|---|
| 279 | do { |
|---|
| 280 | printf( |
|---|
| 281 | "\nCoefficient of variation of substitution rate among sites (must be positive)\n"); |
|---|
| 282 | printf( |
|---|
| 283 | " In gamma distribution parameters, this is 1/(square root of alpha)\n"); |
|---|
| 284 | #ifdef WIN32 |
|---|
| 285 | phyFillScreenColor(); |
|---|
| 286 | #endif |
|---|
| 287 | scanf("%lf%*[^\n]", &cvi); |
|---|
| 288 | getchar(); |
|---|
| 289 | countup(&loopcount, 10); |
|---|
| 290 | } while (cvi <= 0.0); |
|---|
| 291 | cvi = 1.0 / (cvi * cvi); |
|---|
| 292 | } |
|---|
| 293 | if (invar) { |
|---|
| 294 | loopcount = 0; |
|---|
| 295 | do { |
|---|
| 296 | printf("Fraction of invariant sites?\n"); |
|---|
| 297 | scanf("%lf%*[^\n]", &invarfrac); |
|---|
| 298 | getchar(); |
|---|
| 299 | countup (&loopcount, 10); |
|---|
| 300 | } while ((invarfrac <= 0.0) || (invarfrac >= 1.0)); |
|---|
| 301 | } |
|---|
| 302 | if (!printdata) |
|---|
| 303 | return; |
|---|
| 304 | fprintf(outfile, "\nNucleic acid sequence Distance Matrix program,"); |
|---|
| 305 | fprintf(outfile, " version %s\n\n",VERSION); |
|---|
| 306 | } /* getoptions */ |
|---|
| 307 | |
|---|
| 308 | |
|---|
| 309 | void allocrest() |
|---|
| 310 | { |
|---|
| 311 | long i; |
|---|
| 312 | |
|---|
| 313 | y = (Char **)Malloc(spp*sizeof(Char *)); |
|---|
| 314 | nodep = (node **)Malloc(spp*sizeof(node *)); |
|---|
| 315 | for (i = 0; i < spp; i++) { |
|---|
| 316 | y[i] = (Char *)Malloc(sites*sizeof(Char)); |
|---|
| 317 | nodep[i] = (node *)Malloc(sizeof(node)); |
|---|
| 318 | } |
|---|
| 319 | d = (double **)Malloc(spp*sizeof(double *)); |
|---|
| 320 | for (i = 0; i < spp; i++) |
|---|
| 321 | d[i] = (double*)Malloc(spp*sizeof(double)); |
|---|
| 322 | nayme = (naym *)Malloc(spp*sizeof(naym)); |
|---|
| 323 | category = (steptr)Malloc(sites*sizeof(long)); |
|---|
| 324 | oldweight = (steptr)Malloc(sites*sizeof(long)); |
|---|
| 325 | weight = (steptr)Malloc(sites*sizeof(long)); |
|---|
| 326 | alias = (steptr)Malloc(sites*sizeof(long)); |
|---|
| 327 | ally = (steptr)Malloc(sites*sizeof(long)); |
|---|
| 328 | location = (steptr)Malloc(sites*sizeof(long)); |
|---|
| 329 | weightrat = (double *)Malloc(sites*sizeof(double)); |
|---|
| 330 | } /* allocrest */ |
|---|
| 331 | |
|---|
| 332 | |
|---|
| 333 | void reallocsites() |
|---|
| 334 | {/* The amount of sites can change between runs |
|---|
| 335 | this function reallocates all the variables |
|---|
| 336 | whose size depends on the amount of sites */ |
|---|
| 337 | long i; |
|---|
| 338 | |
|---|
| 339 | for (i = 0; i < spp; i++) { |
|---|
| 340 | free(y[i]); |
|---|
| 341 | y[i] = (Char *)Malloc(sites*sizeof(Char)); |
|---|
| 342 | } |
|---|
| 343 | free(category); |
|---|
| 344 | free(oldweight); |
|---|
| 345 | free(weight); |
|---|
| 346 | free(alias); |
|---|
| 347 | free(ally); |
|---|
| 348 | free(location); |
|---|
| 349 | free(weightrat); |
|---|
| 350 | |
|---|
| 351 | category = (steptr)Malloc(sites*sizeof(long)); |
|---|
| 352 | oldweight = (steptr)Malloc(sites*sizeof(long)); |
|---|
| 353 | weight = (steptr)Malloc(sites*sizeof(long)); |
|---|
| 354 | alias = (steptr)Malloc(sites*sizeof(long)); |
|---|
| 355 | ally = (steptr)Malloc(sites*sizeof(long)); |
|---|
| 356 | location = (steptr)Malloc(sites*sizeof(long)); |
|---|
| 357 | weightrat = (double *)Malloc(sites*sizeof(double)); |
|---|
| 358 | } /* reallocsites */ |
|---|
| 359 | |
|---|
| 360 | |
|---|
| 361 | void doinit() |
|---|
| 362 | { |
|---|
| 363 | /* initializes variables */ |
|---|
| 364 | |
|---|
| 365 | inputnumbers(&spp, &sites, &nonodes, 1); |
|---|
| 366 | getoptions(); |
|---|
| 367 | if (printdata) |
|---|
| 368 | fprintf(outfile, "%2ld species, %3ld sites\n", spp, sites); |
|---|
| 369 | allocrest(); |
|---|
| 370 | } /* doinit */ |
|---|
| 371 | |
|---|
| 372 | |
|---|
| 373 | void inputcategories() |
|---|
| 374 | { |
|---|
| 375 | /* reads the categories for each site */ |
|---|
| 376 | long i; |
|---|
| 377 | Char ch; |
|---|
| 378 | |
|---|
| 379 | for (i = 1; i < nmlngth; i++) |
|---|
| 380 | gettc(infile); |
|---|
| 381 | for (i = 0; i < sites; i++) { |
|---|
| 382 | do { |
|---|
| 383 | if (eoln(infile)) |
|---|
| 384 | scan_eoln(infile); |
|---|
| 385 | ch = gettc(infile); |
|---|
| 386 | } while (ch == ' '); |
|---|
| 387 | category[i] = ch - '0'; |
|---|
| 388 | } |
|---|
| 389 | scan_eoln(infile); |
|---|
| 390 | } /* inputcategories */ |
|---|
| 391 | |
|---|
| 392 | |
|---|
| 393 | void printcategories() |
|---|
| 394 | { /* print out list of categories of sites */ |
|---|
| 395 | long i, j; |
|---|
| 396 | |
|---|
| 397 | fprintf(outfile, "Rate categories\n\n"); |
|---|
| 398 | for (i = 1; i <= nmlngth + 3; i++) |
|---|
| 399 | putc(' ', outfile); |
|---|
| 400 | for (i = 1; i <= sites; i++) { |
|---|
| 401 | fprintf(outfile, "%ld", category[i - 1]); |
|---|
| 402 | if (i % 60 == 0) { |
|---|
| 403 | putc('\n', outfile); |
|---|
| 404 | for (j = 1; j <= nmlngth + 3; j++) |
|---|
| 405 | putc(' ', outfile); |
|---|
| 406 | } else if (i % 10 == 0) |
|---|
| 407 | putc(' ', outfile); |
|---|
| 408 | } |
|---|
| 409 | fprintf(outfile, "\n\n"); |
|---|
| 410 | } /* printcategories */ |
|---|
| 411 | |
|---|
| 412 | |
|---|
| 413 | void inputoptions() |
|---|
| 414 | { |
|---|
| 415 | /* read options information */ |
|---|
| 416 | long i; |
|---|
| 417 | |
|---|
| 418 | if (!firstset && !justwts) { |
|---|
| 419 | samenumsp(&sites, ith); |
|---|
| 420 | reallocsites(); |
|---|
| 421 | } |
|---|
| 422 | for (i = 0; i < sites; i++) { |
|---|
| 423 | category[i] = 1; |
|---|
| 424 | oldweight[i] = 1; |
|---|
| 425 | } |
|---|
| 426 | if (justwts || weights) |
|---|
| 427 | inputweights(sites, oldweight, &weights); |
|---|
| 428 | if (printdata) |
|---|
| 429 | putc('\n', outfile); |
|---|
| 430 | if (jukes && printdata) |
|---|
| 431 | fprintf(outfile, " Jukes-Cantor Distance\n"); |
|---|
| 432 | if (kimura && printdata) |
|---|
| 433 | fprintf(outfile, " Kimura 2-parameter Distance\n"); |
|---|
| 434 | if (f84 && printdata) |
|---|
| 435 | fprintf(outfile, " F84 Distance\n"); |
|---|
| 436 | if (similarity) |
|---|
| 437 | fprintf(outfile, " \n Table of similarity between sequences\n"); |
|---|
| 438 | if (firstset && printdata && (kimura || f84)) |
|---|
| 439 | fprintf(outfile, "\nTransition/transversion ratio = %10.6f\n", ttratio); |
|---|
| 440 | if (ctgry && categs > 1) { |
|---|
| 441 | inputcategs(0, sites, category, categs, "DnaDist"); |
|---|
| 442 | if (printdata) |
|---|
| 443 | printcategs(outfile, sites, category, "Site categories"); |
|---|
| 444 | } else if (printdata && (categs > 1)) { |
|---|
| 445 | fprintf(outfile, "\nSite category Rate of change\n\n"); |
|---|
| 446 | for (i = 1; i <= categs; i++) |
|---|
| 447 | fprintf(outfile, "%12ld%13.3f\n", i, rate[i - 1]); |
|---|
| 448 | putc('\n', outfile); |
|---|
| 449 | printcategories(); |
|---|
| 450 | } |
|---|
| 451 | if ((jukes || kimura || logdet) && freqsfrom) { |
|---|
| 452 | printf(" WARNING: CANNOT USE EMPIRICAL BASE FREQUENCIES"); |
|---|
| 453 | printf(" WITH JUKES-CANTOR, KIMURA, JIN/NEI OR LOGDET DISTANCES\n"); |
|---|
| 454 | exxit(-1); |
|---|
| 455 | } |
|---|
| 456 | if (jukes) |
|---|
| 457 | ttratio = 0.5000001; |
|---|
| 458 | if (weights && printdata) |
|---|
| 459 | printweights(outfile, 0, sites, oldweight, "Sites"); |
|---|
| 460 | } /* inputoptions */ |
|---|
| 461 | |
|---|
| 462 | |
|---|
| 463 | void dnadist_sitesort() |
|---|
| 464 | { |
|---|
| 465 | /* Shell sort of sites lexicographically */ |
|---|
| 466 | long gap, i, j, jj, jg, k, itemp; |
|---|
| 467 | boolean flip, tied; |
|---|
| 468 | |
|---|
| 469 | gap = sites / 2; |
|---|
| 470 | while (gap > 0) { |
|---|
| 471 | for (i = gap + 1; i <= sites; i++) { |
|---|
| 472 | j = i - gap; |
|---|
| 473 | flip = true; |
|---|
| 474 | while (j > 0 && flip) { |
|---|
| 475 | jj = alias[j - 1]; |
|---|
| 476 | jg = alias[j + gap - 1]; |
|---|
| 477 | tied = (oldweight[jj - 1] == oldweight[jg - 1]); |
|---|
| 478 | flip = (oldweight[jj - 1] < oldweight[jg - 1] || |
|---|
| 479 | (tied && category[jj - 1] > category[jg - 1])); |
|---|
| 480 | tied = (tied && category[jj - 1] == category[jg - 1]); |
|---|
| 481 | k = 1; |
|---|
| 482 | while (k <= spp && tied) { |
|---|
| 483 | flip = (y[k - 1][jj - 1] > y[k - 1][jg - 1]); |
|---|
| 484 | tied = (tied && y[k - 1][jj - 1] == y[k - 1][jg - 1]); |
|---|
| 485 | k++; |
|---|
| 486 | } |
|---|
| 487 | if (!flip) |
|---|
| 488 | break; |
|---|
| 489 | itemp = alias[j - 1]; |
|---|
| 490 | alias[j - 1] = alias[j + gap - 1]; |
|---|
| 491 | alias[j + gap - 1] = itemp; |
|---|
| 492 | j -= gap; |
|---|
| 493 | } |
|---|
| 494 | } |
|---|
| 495 | gap /= 2; |
|---|
| 496 | } |
|---|
| 497 | } /* dnadist_sitesort */ |
|---|
| 498 | |
|---|
| 499 | |
|---|
| 500 | void dnadist_sitecombine() |
|---|
| 501 | { |
|---|
| 502 | /* combine sites that have identical patterns */ |
|---|
| 503 | long i, j, k; |
|---|
| 504 | boolean tied; |
|---|
| 505 | |
|---|
| 506 | i = 1; |
|---|
| 507 | while (i < sites) { |
|---|
| 508 | j = i + 1; |
|---|
| 509 | tied = true; |
|---|
| 510 | while (j <= sites && tied) { |
|---|
| 511 | tied = (oldweight[alias[i - 1] - 1] == oldweight[alias[j - 1] - 1] && |
|---|
| 512 | category[alias[i - 1] - 1] == category[alias[j - 1] - 1]); |
|---|
| 513 | k = 1; |
|---|
| 514 | while (k <= spp && tied) { |
|---|
| 515 | tied = (tied && |
|---|
| 516 | y[k - 1][alias[i - 1] - 1] == y[k - 1][alias[j - 1] - 1]); |
|---|
| 517 | k++; |
|---|
| 518 | } |
|---|
| 519 | if (!tied) |
|---|
| 520 | break; |
|---|
| 521 | ally[alias[j - 1] - 1] = alias[i - 1]; |
|---|
| 522 | j++; |
|---|
| 523 | } |
|---|
| 524 | i = j; |
|---|
| 525 | } |
|---|
| 526 | } /* dnadist_sitecombine */ |
|---|
| 527 | |
|---|
| 528 | |
|---|
| 529 | void dnadist_sitescrunch() |
|---|
| 530 | { |
|---|
| 531 | /* move so one representative of each pattern of |
|---|
| 532 | sites comes first */ |
|---|
| 533 | long i, j, itemp; |
|---|
| 534 | boolean done, found, completed; |
|---|
| 535 | |
|---|
| 536 | done = false; |
|---|
| 537 | i = 1; |
|---|
| 538 | j = 2; |
|---|
| 539 | while (!done) { |
|---|
| 540 | if (ally[alias[i - 1] - 1] != alias[i - 1]) { |
|---|
| 541 | if (j <= i) |
|---|
| 542 | j = i + 1; |
|---|
| 543 | if (j <= sites) { |
|---|
| 544 | do { |
|---|
| 545 | found = (ally[alias[j - 1] - 1] == alias[j - 1]); |
|---|
| 546 | j++; |
|---|
| 547 | completed = (j > sites); |
|---|
| 548 | if (j <= sites) |
|---|
| 549 | completed = (oldweight[alias[j - 1] - 1] == 0); |
|---|
| 550 | } while (!(found || completed)); |
|---|
| 551 | if (found) { |
|---|
| 552 | j--; |
|---|
| 553 | itemp = alias[i - 1]; |
|---|
| 554 | alias[i - 1] = alias[j - 1]; |
|---|
| 555 | alias[j - 1] = itemp; |
|---|
| 556 | } else |
|---|
| 557 | done = true; |
|---|
| 558 | } else |
|---|
| 559 | done = true; |
|---|
| 560 | } |
|---|
| 561 | i++; |
|---|
| 562 | done = (done || i >= sites); |
|---|
| 563 | } |
|---|
| 564 | } /* dnadist_sitescrunch */ |
|---|
| 565 | |
|---|
| 566 | |
|---|
| 567 | void makeweights() |
|---|
| 568 | { |
|---|
| 569 | /* make up weights vector to avoid duplicate computations */ |
|---|
| 570 | long i; |
|---|
| 571 | |
|---|
| 572 | for (i = 1; i <= sites; i++) { |
|---|
| 573 | alias[i - 1] = i; |
|---|
| 574 | ally[i - 1] = i; |
|---|
| 575 | weight[i - 1] = 0; |
|---|
| 576 | } |
|---|
| 577 | dnadist_sitesort(); |
|---|
| 578 | dnadist_sitecombine(); |
|---|
| 579 | dnadist_sitescrunch(); |
|---|
| 580 | endsite = 0; |
|---|
| 581 | for (i = 1; i <= sites; i++) { |
|---|
| 582 | if (ally[i - 1] == i && oldweight[i - 1] > 0) |
|---|
| 583 | endsite++; |
|---|
| 584 | } |
|---|
| 585 | for (i = 1; i <= endsite; i++) |
|---|
| 586 | location[alias[i - 1] - 1] = i; |
|---|
| 587 | weightsum = 0; |
|---|
| 588 | for (i = 0; i < sites; i++) |
|---|
| 589 | weightsum += oldweight[i]; |
|---|
| 590 | sumrates = 0.0; |
|---|
| 591 | for (i = 0; i < sites; i++) |
|---|
| 592 | sumrates += oldweight[i] * rate[category[i] - 1]; |
|---|
| 593 | for (i = 0; i < categs; i++) |
|---|
| 594 | rate[i] *= weightsum / sumrates; |
|---|
| 595 | for (i = 0; i < sites; i++) |
|---|
| 596 | weight[location[ally[i] - 1] - 1] += oldweight[i]; |
|---|
| 597 | } /* makeweights */ |
|---|
| 598 | |
|---|
| 599 | |
|---|
| 600 | void dnadist_makevalues() |
|---|
| 601 | { |
|---|
| 602 | /* set up fractional likelihoods at tips */ |
|---|
| 603 | long i, j, k; |
|---|
| 604 | bases b; |
|---|
| 605 | |
|---|
| 606 | for (i = 0; i < spp; i++) { |
|---|
| 607 | nodep[i]->x = (phenotype)Malloc(endsite*sizeof(ratelike)); |
|---|
| 608 | for (j = 0; j < endsite; j++) |
|---|
| 609 | nodep[i]->x[j] = (ratelike)Malloc(rcategs*sizeof(sitelike)); |
|---|
| 610 | } |
|---|
| 611 | for (k = 0; k < endsite; k++) { |
|---|
| 612 | j = alias[k]; |
|---|
| 613 | for (i = 0; i < spp; i++) { |
|---|
| 614 | for (b = A; (long)b <= (long)T; b = (bases)((long)b + 1)) |
|---|
| 615 | nodep[i]->x[k][0][(long)b - (long)A] = 0.0; |
|---|
| 616 | switch (y[i][j - 1]) { |
|---|
| 617 | |
|---|
| 618 | case 'A': |
|---|
| 619 | nodep[i]->x[k][0][0] = 1.0; |
|---|
| 620 | break; |
|---|
| 621 | |
|---|
| 622 | case 'C': |
|---|
| 623 | nodep[i]->x[k][0][(long)C - (long)A] = 1.0; |
|---|
| 624 | break; |
|---|
| 625 | |
|---|
| 626 | case 'G': |
|---|
| 627 | nodep[i]->x[k][0][(long)G - (long)A] = 1.0; |
|---|
| 628 | break; |
|---|
| 629 | |
|---|
| 630 | case 'T': |
|---|
| 631 | nodep[i]->x[k][0][(long)T - (long)A] = 1.0; |
|---|
| 632 | break; |
|---|
| 633 | |
|---|
| 634 | case 'U': |
|---|
| 635 | nodep[i]->x[k][0][(long)T - (long)A] = 1.0; |
|---|
| 636 | break; |
|---|
| 637 | |
|---|
| 638 | case 'M': |
|---|
| 639 | nodep[i]->x[k][0][0] = 1.0; |
|---|
| 640 | nodep[i]->x[k][0][(long)C - (long)A] = 1.0; |
|---|
| 641 | break; |
|---|
| 642 | |
|---|
| 643 | case 'R': |
|---|
| 644 | nodep[i]->x[k][0][0] = 1.0; |
|---|
| 645 | nodep[i]->x[k][0][(long)G - (long)A] = 1.0; |
|---|
| 646 | break; |
|---|
| 647 | |
|---|
| 648 | case 'W': |
|---|
| 649 | nodep[i]->x[k][0][0] = 1.0; |
|---|
| 650 | nodep[i]->x[k][0][(long)T - (long)A] = 1.0; |
|---|
| 651 | break; |
|---|
| 652 | |
|---|
| 653 | case 'S': |
|---|
| 654 | nodep[i]->x[k][0][(long)C - (long)A] = 1.0; |
|---|
| 655 | nodep[i]->x[k][0][(long)G - (long)A] = 1.0; |
|---|
| 656 | break; |
|---|
| 657 | |
|---|
| 658 | case 'Y': |
|---|
| 659 | nodep[i]->x[k][0][(long)C - (long)A] = 1.0; |
|---|
| 660 | nodep[i]->x[k][0][(long)T - (long)A] = 1.0; |
|---|
| 661 | break; |
|---|
| 662 | |
|---|
| 663 | case 'K': |
|---|
| 664 | nodep[i]->x[k][0][(long)G - (long)A] = 1.0; |
|---|
| 665 | nodep[i]->x[k][0][(long)T - (long)A] = 1.0; |
|---|
| 666 | break; |
|---|
| 667 | |
|---|
| 668 | case 'B': |
|---|
| 669 | nodep[i]->x[k][0][(long)C - (long)A] = 1.0; |
|---|
| 670 | nodep[i]->x[k][0][(long)G - (long)A] = 1.0; |
|---|
| 671 | nodep[i]->x[k][0][(long)T - (long)A] = 1.0; |
|---|
| 672 | break; |
|---|
| 673 | |
|---|
| 674 | case 'D': |
|---|
| 675 | nodep[i]->x[k][0][0] = 1.0; |
|---|
| 676 | nodep[i]->x[k][0][(long)G - (long)A] = 1.0; |
|---|
| 677 | nodep[i]->x[k][0][(long)T - (long)A] = 1.0; |
|---|
| 678 | break; |
|---|
| 679 | |
|---|
| 680 | case 'H': |
|---|
| 681 | nodep[i]->x[k][0][0] = 1.0; |
|---|
| 682 | nodep[i]->x[k][0][(long)C - (long)A] = 1.0; |
|---|
| 683 | nodep[i]->x[k][0][(long)T - (long)A] = 1.0; |
|---|
| 684 | break; |
|---|
| 685 | |
|---|
| 686 | case 'V': |
|---|
| 687 | nodep[i]->x[k][0][0] = 1.0; |
|---|
| 688 | nodep[i]->x[k][0][(long)C - (long)A] = 1.0; |
|---|
| 689 | nodep[i]->x[k][0][(long)G - (long)A] = 1.0; |
|---|
| 690 | break; |
|---|
| 691 | |
|---|
| 692 | case 'N': |
|---|
| 693 | for (b = A; (long)b <= (long)T; b = (bases)((long)b + 1)) |
|---|
| 694 | nodep[i]->x[k][0][(long)b - (long)A] = 1.0; |
|---|
| 695 | break; |
|---|
| 696 | |
|---|
| 697 | case 'X': |
|---|
| 698 | for (b = A; (long)b <= (long)T; b = (bases)((long)b + 1)) |
|---|
| 699 | nodep[i]->x[k][0][(long)b - (long)A] = 1.0; |
|---|
| 700 | break; |
|---|
| 701 | |
|---|
| 702 | case '?': |
|---|
| 703 | for (b = A; (long)b <= (long)T; b = (bases)((long)b + 1)) |
|---|
| 704 | nodep[i]->x[k][0][(long)b - (long)A] = 1.0; |
|---|
| 705 | break; |
|---|
| 706 | |
|---|
| 707 | case 'O': |
|---|
| 708 | for (b = A; (long)b <= (long)T; b = (bases)((long)b + 1)) |
|---|
| 709 | nodep[i]->x[k][0][(long)b - (long)A] = 1.0; |
|---|
| 710 | break; |
|---|
| 711 | |
|---|
| 712 | case '-': |
|---|
| 713 | for (b = A; (long)b <= (long)T; b = (bases)((long)b + 1)) |
|---|
| 714 | nodep[i]->x[k][0][(long)b - (long)A] = 1.0; |
|---|
| 715 | break; |
|---|
| 716 | } |
|---|
| 717 | } |
|---|
| 718 | } |
|---|
| 719 | } /* dnadist_makevalues */ |
|---|
| 720 | |
|---|
| 721 | |
|---|
| 722 | void dnadist_empiricalfreqs() |
|---|
| 723 | { |
|---|
| 724 | /* Get empirical base frequencies from the data */ |
|---|
| 725 | long i, j, k; |
|---|
| 726 | double sum, suma, sumc, sumg, sumt, w; |
|---|
| 727 | |
|---|
| 728 | freqa = 0.25; |
|---|
| 729 | freqc = 0.25; |
|---|
| 730 | freqg = 0.25; |
|---|
| 731 | freqt = 0.25; |
|---|
| 732 | for (k = 1; k <= 8; k++) { |
|---|
| 733 | suma = 0.0; |
|---|
| 734 | sumc = 0.0; |
|---|
| 735 | sumg = 0.0; |
|---|
| 736 | sumt = 0.0; |
|---|
| 737 | for (i = 0; i < spp; i++) { |
|---|
| 738 | for (j = 0; j < endsite; j++) { |
|---|
| 739 | w = weight[j]; |
|---|
| 740 | sum = freqa * nodep[i]->x[j][0][0]; |
|---|
| 741 | sum += freqc * nodep[i]->x[j][0][(long)C - (long)A]; |
|---|
| 742 | sum += freqg * nodep[i]->x[j][0][(long)G - (long)A]; |
|---|
| 743 | sum += freqt * nodep[i]->x[j][0][(long)T - (long)A]; |
|---|
| 744 | suma += w * freqa * nodep[i]->x[j][0][0] / sum; |
|---|
| 745 | sumc += w * freqc * nodep[i]->x[j][0][(long)C - (long)A] / sum; |
|---|
| 746 | sumg += w * freqg * nodep[i]->x[j][0][(long)G - (long)A] / sum; |
|---|
| 747 | sumt += w * freqt * nodep[i]->x[j][0][(long)T - (long)A] / sum; |
|---|
| 748 | } |
|---|
| 749 | } |
|---|
| 750 | sum = suma + sumc + sumg + sumt; |
|---|
| 751 | freqa = suma / sum; |
|---|
| 752 | freqc = sumc / sum; |
|---|
| 753 | freqg = sumg / sum; |
|---|
| 754 | freqt = sumt / sum; |
|---|
| 755 | } |
|---|
| 756 | } /* dnadist_empiricalfreqs */ |
|---|
| 757 | |
|---|
| 758 | |
|---|
| 759 | void getinput() |
|---|
| 760 | { |
|---|
| 761 | /* reads the input data */ |
|---|
| 762 | inputoptions(); |
|---|
| 763 | if ((!freqsfrom) && !logdet && !similarity) { |
|---|
| 764 | if (kimura || jukes) { |
|---|
| 765 | freqa = 0.25; |
|---|
| 766 | freqc = 0.25; |
|---|
| 767 | freqg = 0.25; |
|---|
| 768 | freqt = 0.25; |
|---|
| 769 | } |
|---|
| 770 | getbasefreqs(freqa, freqc, freqg, freqt, &freqr, &freqy, &freqar, &freqcy, |
|---|
| 771 | &freqgr, &freqty, &ttratio, &xi, &xv, &fracchange, |
|---|
| 772 | freqsfrom, printdata); |
|---|
| 773 | if (freqa < 0.00000001) { |
|---|
| 774 | freqa = 0.000001; |
|---|
| 775 | freqc = 0.999999*freqc; |
|---|
| 776 | freqg = 0.999999*freqg; |
|---|
| 777 | freqt = 0.999999*freqt; |
|---|
| 778 | } |
|---|
| 779 | if (freqc < 0.00000001) { |
|---|
| 780 | freqa = 0.999999*freqa; |
|---|
| 781 | freqc = 0.000001; |
|---|
| 782 | freqg = 0.999999*freqg; |
|---|
| 783 | freqt = 0.999999*freqt; |
|---|
| 784 | } |
|---|
| 785 | if (freqg < 0.00000001) { |
|---|
| 786 | freqa = 0.999999*freqa; |
|---|
| 787 | freqc = 0.999999*freqc; |
|---|
| 788 | freqg = 0.000001; |
|---|
| 789 | freqt = 0.999999*freqt; |
|---|
| 790 | } |
|---|
| 791 | if (freqt < 0.00000001) { |
|---|
| 792 | freqa = 0.999999*freqa; |
|---|
| 793 | freqc = 0.999999*freqc; |
|---|
| 794 | freqg = 0.999999*freqg; |
|---|
| 795 | freqt = 0.000001; |
|---|
| 796 | } |
|---|
| 797 | } |
|---|
| 798 | if (!justwts || firstset) |
|---|
| 799 | inputdata(sites); |
|---|
| 800 | makeweights(); |
|---|
| 801 | dnadist_makevalues(); |
|---|
| 802 | if (freqsfrom) { |
|---|
| 803 | dnadist_empiricalfreqs(); |
|---|
| 804 | getbasefreqs(freqa, freqc, freqg, freqt, &freqr, &freqy, &freqar, &freqcy, |
|---|
| 805 | &freqgr, &freqty, &ttratio, &xi, &xv, &fracchange, |
|---|
| 806 | freqsfrom, printdata); |
|---|
| 807 | } |
|---|
| 808 | } /* getinput */ |
|---|
| 809 | |
|---|
| 810 | |
|---|
| 811 | void inittable() |
|---|
| 812 | { |
|---|
| 813 | /* Define a lookup table. Precompute values and store in a table */ |
|---|
| 814 | long i; |
|---|
| 815 | |
|---|
| 816 | for (i = 0; i < categs; i++) { |
|---|
| 817 | tbl[i].rat = rate[i]; |
|---|
| 818 | tbl[i].ratxv = rate[i] * xv; |
|---|
| 819 | } |
|---|
| 820 | } /* inittable */ |
|---|
| 821 | |
|---|
| 822 | |
|---|
| 823 | double lndet(double (*a)[4]) |
|---|
| 824 | { |
|---|
| 825 | long i, j, k; |
|---|
| 826 | double temp, ld; |
|---|
| 827 | |
|---|
| 828 | /*Gauss-Jordan reduction -- invert matrix a in place, |
|---|
| 829 | overwriting previous contents of a. On exit, matrix a |
|---|
| 830 | contains the inverse, lndet contains the log of the determinant */ |
|---|
| 831 | ld = 1.0; |
|---|
| 832 | for (i = 0; i < 4; i++) { |
|---|
| 833 | ld *= a[i][i]; |
|---|
| 834 | temp = 1.0 / a[i][i]; |
|---|
| 835 | a[i][i] = 1.0; |
|---|
| 836 | for (j = 0; j < 4; j++) |
|---|
| 837 | a[i][j] *= temp; |
|---|
| 838 | for (j = 0; j < 4; j++) { |
|---|
| 839 | if (j != i) { |
|---|
| 840 | temp = a[j][i]; |
|---|
| 841 | a[j][i] = 0.0; |
|---|
| 842 | for (k = 0; k < 4; k++) |
|---|
| 843 | a[j][k] -= temp * a[i][k]; |
|---|
| 844 | } |
|---|
| 845 | } |
|---|
| 846 | } |
|---|
| 847 | if (ld <= 0.0) |
|---|
| 848 | return(99.0); |
|---|
| 849 | else |
|---|
| 850 | return(log(ld)); |
|---|
| 851 | } /* lndet */ |
|---|
| 852 | |
|---|
| 853 | |
|---|
| 854 | void makev(long m, long n, double *v) |
|---|
| 855 | { |
|---|
| 856 | /* compute one distance */ |
|---|
| 857 | long i, j, k, l, it, num1, num2, idx; |
|---|
| 858 | long numerator = 0, denominator = 0; |
|---|
| 859 | double sum, sum1, sum2, sumyr, lz, aa, bb, cc, vv=0, |
|---|
| 860 | p1, p2, p3, q1, q2, q3, tt, delta, slope, |
|---|
| 861 | xx1freqa, xx1freqc, xx1freqg, xx1freqt; |
|---|
| 862 | double *prod, *prod2, *prod3; |
|---|
| 863 | boolean quick, jukesquick, kimquick, logdetquick; |
|---|
| 864 | bases b; |
|---|
| 865 | node *p, *q; |
|---|
| 866 | sitelike xx1, xx2; |
|---|
| 867 | double basetable[4][4]; /* for quick logdet */ |
|---|
| 868 | double basefreq1[4], basefreq2[4]; |
|---|
| 869 | |
|---|
| 870 | p = nodep[m - 1]; |
|---|
| 871 | q = nodep[n - 1]; |
|---|
| 872 | quick = (!ctgry || categs == 1); |
|---|
| 873 | if (jukes || kimura || logdet || similarity) { |
|---|
| 874 | numerator = 0; |
|---|
| 875 | denominator = 0; |
|---|
| 876 | for (i = 0; i < endsite; i++) { |
|---|
| 877 | memcpy(xx1, p->x[i][0], sizeof(sitelike)); |
|---|
| 878 | memcpy(xx2, q->x[i][0], sizeof(sitelike)); |
|---|
| 879 | sum = 0.0; |
|---|
| 880 | sum1 = 0.0; |
|---|
| 881 | sum2 = 0.0; |
|---|
| 882 | for (b = A; (long)b <= (long)T; b = (bases)((long)b + 1)) { |
|---|
| 883 | sum1 += xx1[(long)b - (long)A]; |
|---|
| 884 | sum2 += xx2[(long)b - (long)A]; |
|---|
| 885 | sum += xx1[(long)b - (long)A] * xx2[(long)b - (long)A]; |
|---|
| 886 | } |
|---|
| 887 | quick = (quick && (sum1 == 1.0 || sum1 == 4.0) && |
|---|
| 888 | (sum2 == 1.0 || sum2 == 4.0)); |
|---|
| 889 | if (sum1 == 1.0 && sum2 == 1.0) { |
|---|
| 890 | numerator += (long)(weight[i] * sum); |
|---|
| 891 | denominator += weight[i]; |
|---|
| 892 | } |
|---|
| 893 | } |
|---|
| 894 | } |
|---|
| 895 | jukesquick = ((jukes || similarity) && quick); |
|---|
| 896 | kimquick = (kimura && quick); |
|---|
| 897 | logdetquick = (logdet && quick); |
|---|
| 898 | if (logdet && !quick) { |
|---|
| 899 | printf(" WARNING: CANNOT CALCULATE LOGDET DISTANCE\n"); |
|---|
| 900 | printf(" WITH PRESENT PROGRAM IF PARTIALLY AMBIGUOUS NUCLEOTIDES\n"); |
|---|
| 901 | baddists = true; |
|---|
| 902 | } |
|---|
| 903 | if (jukesquick && jukes && (numerator * 4 <= denominator)) { |
|---|
| 904 | printf("\nWARNING: INFINITE DISTANCE BETWEEN "); |
|---|
| 905 | printf(" SPECIES %3ld AND %3ld\n", m, n); |
|---|
| 906 | baddists = true; |
|---|
| 907 | } |
|---|
| 908 | if (jukesquick && invar |
|---|
| 909 | && (4 * (((double)numerator / denominator) - invarfrac) |
|---|
| 910 | <= (1.0 - invarfrac))) { |
|---|
| 911 | printf("\nWARNING: DIFFERENCE BETWEEN SPECIES %3ld AND %3ld", m, n); |
|---|
| 912 | printf(" TOO LARGE FOR INVARIABLE SITES\n"); |
|---|
| 913 | baddists = true; |
|---|
| 914 | } |
|---|
| 915 | if (jukesquick) { |
|---|
| 916 | if (similarity) |
|---|
| 917 | vv = (double)numerator / denominator; |
|---|
| 918 | else { |
|---|
| 919 | if (!gama && !invar) |
|---|
| 920 | vv = -0.75 * log((4.0*((double)numerator / denominator) - 1.0) / 3.0); |
|---|
| 921 | else if (!invar) |
|---|
| 922 | vv = 0.75 * cvi * (exp(-(1/cvi)* |
|---|
| 923 | log((4.0 * ((double)numerator / denominator) - 1.0) / 3.0)) - 1.0); |
|---|
| 924 | else |
|---|
| 925 | vv = 0.75 * cvi * (exp(-(1/cvi)* |
|---|
| 926 | log((4.0 * ((double)numerator / denominator - invarfrac)/ |
|---|
| 927 | (1.0-invarfrac) - 1.0) / 3.0)) - 1.0); |
|---|
| 928 | } |
|---|
| 929 | } |
|---|
| 930 | if (kimquick) { |
|---|
| 931 | num1 = 0; |
|---|
| 932 | num2 = 0; |
|---|
| 933 | denominator = 0; |
|---|
| 934 | for (i = 0; i < endsite; i++) { |
|---|
| 935 | memcpy(xx1, p->x[i][0], sizeof(sitelike)); |
|---|
| 936 | memcpy(xx2, q->x[i][0], sizeof(sitelike)); |
|---|
| 937 | sum = 0.0; |
|---|
| 938 | sum1 = 0.0; |
|---|
| 939 | sum2 = 0.0; |
|---|
| 940 | for (b = A; (long)b <= (long)T; b = (bases)((long)b + 1)) { |
|---|
| 941 | sum1 += xx1[(long)b - (long)A]; |
|---|
| 942 | sum2 += xx2[(long)b - (long)A]; |
|---|
| 943 | sum += xx1[(long)b - (long)A] * xx2[(long)b - (long)A]; |
|---|
| 944 | } |
|---|
| 945 | sumyr = (xx1[0] + xx1[(long)G - (long)A]) |
|---|
| 946 | * (xx2[0] + xx2[(long)G - (long)A]) + |
|---|
| 947 | (xx1[(long)C - (long)A] + xx1[(long)T - (long)A]) * |
|---|
| 948 | (xx2[(long)C - (long)A] + xx2[(long)T - (long)A]); |
|---|
| 949 | if (sum1 == 1.0 && sum2 == 1.0) { |
|---|
| 950 | num1 += (long)(weight[i] * sum); |
|---|
| 951 | num2 += (long)(weight[i] * (sumyr - sum)); |
|---|
| 952 | denominator += weight[i]; |
|---|
| 953 | } |
|---|
| 954 | } |
|---|
| 955 | tt = ((1.0 - (double)num1 / denominator)-invarfrac)/(1.0-invarfrac); |
|---|
| 956 | if (tt > 0.0) { |
|---|
| 957 | delta = 0.1; |
|---|
| 958 | tt = delta; |
|---|
| 959 | it = 0; |
|---|
| 960 | while (fabs(delta) > 0.00002 && it < iterationsd) { |
|---|
| 961 | it++; |
|---|
| 962 | if (!gama) { |
|---|
| 963 | p1 = exp(-tt); |
|---|
| 964 | p2 = exp(-xv * tt) - exp(-tt); |
|---|
| 965 | p3 = 1.0 - exp(-xv * tt); |
|---|
| 966 | } else { |
|---|
| 967 | p1 = exp(-cvi * log(1 + tt / cvi)); |
|---|
| 968 | p2 = exp(-cvi * log(1 + xv * tt / cvi)) |
|---|
| 969 | - exp(-cvi * log(1 + tt / cvi)); |
|---|
| 970 | p3 = 1.0 - exp(-cvi * log(1 + xv * tt / cvi)); |
|---|
| 971 | } |
|---|
| 972 | q1 = p1 + p2 / 2.0 + p3 / 4.0; |
|---|
| 973 | q2 = p2 / 2.0 + p3 / 4.0; |
|---|
| 974 | q3 = p3 / 2.0; |
|---|
| 975 | q1 = q1 * (1.0-invarfrac) + invarfrac; |
|---|
| 976 | q2 *= (1.0 - invarfrac); |
|---|
| 977 | q3 *= (1.0 - invarfrac); |
|---|
| 978 | if (!gama && !invar) |
|---|
| 979 | slope = 0.5 * exp(-tt) * (num2 / q2 - num1 / q1) + |
|---|
| 980 | 0.25 * xv * exp(-xv * tt) * |
|---|
| 981 | ((denominator - num1 - num2) * 2 / q3 - num2 / q2 - num1 / q1); |
|---|
| 982 | else |
|---|
| 983 | slope = 0.5 * (1 / (1 + tt / cvi)) * exp(-cvi * log(1 + tt / cvi)) * |
|---|
| 984 | (num2 / q2 - num1 / q1) + 0.25 * (xv / (1 + xv * tt / cvi)) * |
|---|
| 985 | exp(-cvi * log(1 + xv * tt / cvi)) * |
|---|
| 986 | ((denominator - num1 - num2) * 2 / q3 - num2 / q2 - num1 / q1); |
|---|
| 987 | slope *= (1.0-invarfrac); |
|---|
| 988 | if (slope < 0.0) |
|---|
| 989 | delta = fabs(delta) / -2.0; |
|---|
| 990 | else |
|---|
| 991 | delta = fabs(delta); |
|---|
| 992 | tt += delta; |
|---|
| 993 | } |
|---|
| 994 | } |
|---|
| 995 | if ((delta >= 0.1) && (!similarity)) { |
|---|
| 996 | printf("\nWARNING: DIFFERENCE BETWEEN SPECIES %3ld AND %3ld", m, n); |
|---|
| 997 | if (invar) |
|---|
| 998 | printf(" TOO LARGE FOR INVARIABLE SITES\n"); |
|---|
| 999 | else |
|---|
| 1000 | printf(" TOO LARGE TO ESTIMATE DISTANCE\n"); |
|---|
| 1001 | baddists = true; |
|---|
| 1002 | } |
|---|
| 1003 | vv = fracchange * tt; |
|---|
| 1004 | } |
|---|
| 1005 | if (!(jukesquick || kimquick || logdet)) { |
|---|
| 1006 | prod = (double *)Malloc(sites*sizeof(double)); |
|---|
| 1007 | prod2 = (double *)Malloc(sites*sizeof(double)); |
|---|
| 1008 | prod3 = (double *)Malloc(sites*sizeof(double)); |
|---|
| 1009 | for (i = 0; i < endsite; i++) { |
|---|
| 1010 | memcpy(xx1, p->x[i][0], sizeof(sitelike)); |
|---|
| 1011 | memcpy(xx2, q->x[i][0], sizeof(sitelike)); |
|---|
| 1012 | xx1freqa = xx1[0] * freqa; |
|---|
| 1013 | xx1freqc = xx1[(long)C - (long)A] * freqc; |
|---|
| 1014 | xx1freqg = xx1[(long)G - (long)A] * freqg; |
|---|
| 1015 | xx1freqt = xx1[(long)T - (long)A] * freqt; |
|---|
| 1016 | sum1 = xx1freqa + xx1freqc + xx1freqg + xx1freqt; |
|---|
| 1017 | sum2 = freqa * xx2[0] + freqc * xx2[(long)C - (long)A] + |
|---|
| 1018 | freqg * xx2[(long)G - (long)A] + freqt * xx2[(long)T - (long)A]; |
|---|
| 1019 | prod[i] = sum1 * sum2; |
|---|
| 1020 | prod2[i] = (xx1freqa + xx1freqg) * |
|---|
| 1021 | (xx2[0] * freqar + xx2[(long)G - (long)A] * freqgr) + |
|---|
| 1022 | (xx1freqc + xx1freqt) * |
|---|
| 1023 | (xx2[(long)C - (long)A] * freqcy + xx2[(long)T - (long)A] * freqty); |
|---|
| 1024 | prod3[i] = xx1freqa * xx2[0] + xx1freqc * xx2[(long)C - (long)A] + |
|---|
| 1025 | xx1freqg * xx2[(long)G - (long)A] + xx1freqt * xx2[(long)T - (long)A]; |
|---|
| 1026 | } |
|---|
| 1027 | tt = 0.1; |
|---|
| 1028 | delta = 0.1; |
|---|
| 1029 | it = 1; |
|---|
| 1030 | while (it < iterationsd && fabs(delta) > 0.00002) { |
|---|
| 1031 | slope = 0.0; |
|---|
| 1032 | if (tt > 0.0) { |
|---|
| 1033 | lz = -tt; |
|---|
| 1034 | for (i = 0; i < categs; i++) { |
|---|
| 1035 | if (!gama) { |
|---|
| 1036 | tbl[i].z1 = exp(tbl[i].ratxv * lz); |
|---|
| 1037 | tbl[i].z1zz = exp(tbl[i].rat * lz); |
|---|
| 1038 | } |
|---|
| 1039 | else { |
|---|
| 1040 | tbl[i].z1 = exp(-cvi*log(1.0-tbl[i].ratxv * lz/cvi)); |
|---|
| 1041 | tbl[i].z1zz = exp(-cvi*log(1.0-tbl[i].rat * lz/cvi)); |
|---|
| 1042 | } |
|---|
| 1043 | tbl[i].y1 = 1.0 - tbl[i].z1; |
|---|
| 1044 | tbl[i].z1yy = tbl[i].z1 - tbl[i].z1zz; |
|---|
| 1045 | tbl[i].z1xv = tbl[i].z1 * xv; |
|---|
| 1046 | } |
|---|
| 1047 | for (i = 0; i < endsite; i++) { |
|---|
| 1048 | idx = category[alias[i] - 1]; |
|---|
| 1049 | cc = prod[i]; |
|---|
| 1050 | bb = prod2[i]; |
|---|
| 1051 | aa = prod3[i]; |
|---|
| 1052 | if (!gama && !invar) |
|---|
| 1053 | slope += weightrat[i] * (tbl[idx - 1].z1zz * (bb - aa) + |
|---|
| 1054 | tbl[idx - 1].z1xv * (cc - bb)) / |
|---|
| 1055 | (aa * tbl[idx - 1].z1zz + bb * tbl[idx - 1].z1yy + |
|---|
| 1056 | cc * tbl[idx - 1].y1); |
|---|
| 1057 | else |
|---|
| 1058 | slope += (1.0-invarfrac) * weightrat[i] * ( |
|---|
| 1059 | ((tbl[idx-1].rat)/(1.0-tbl[idx-1].rat * lz/cvi)) |
|---|
| 1060 | * tbl[idx - 1].z1zz * (bb - aa) + |
|---|
| 1061 | ((tbl[idx-1].ratxv)/(1.0-tbl[idx-1].ratxv * lz/cvi)) |
|---|
| 1062 | * tbl[idx - 1].z1 * (cc - bb)) / |
|---|
| 1063 | (aa * ((1.0-invarfrac)*tbl[idx - 1].z1zz + invarfrac) |
|---|
| 1064 | + bb * (1.0-invarfrac)*tbl[idx - 1].z1yy |
|---|
| 1065 | + cc * (1.0-invarfrac)*tbl[idx - 1].y1); |
|---|
| 1066 | } |
|---|
| 1067 | } |
|---|
| 1068 | if (slope < 0.0) |
|---|
| 1069 | delta = fabs(delta) / -2.0; |
|---|
| 1070 | else |
|---|
| 1071 | delta = fabs(delta); |
|---|
| 1072 | tt += delta; |
|---|
| 1073 | it++; |
|---|
| 1074 | } |
|---|
| 1075 | if ((delta >= 0.1) && (!similarity)) { |
|---|
| 1076 | printf("\nWARNING: DIFFERENCE BETWEEN SPECIES %3ld AND %3ld", m, n); |
|---|
| 1077 | if (invar) |
|---|
| 1078 | printf(" TOO LARGE FOR INVARIABLE SITES\n"); |
|---|
| 1079 | else |
|---|
| 1080 | printf(" TOO LARGE TO ESTIMATE DISTANCE\n"); |
|---|
| 1081 | baddists = true; |
|---|
| 1082 | } |
|---|
| 1083 | vv = tt * fracchange; |
|---|
| 1084 | free(prod); |
|---|
| 1085 | free(prod2); |
|---|
| 1086 | free(prod3); |
|---|
| 1087 | } |
|---|
| 1088 | if (logdetquick) { /* compute logdet when no ambiguous nucleotides */ |
|---|
| 1089 | for (i = 0; i < 4; i++) { |
|---|
| 1090 | basefreq1[i] = 0.0; |
|---|
| 1091 | basefreq2[i] = 0.0; |
|---|
| 1092 | for (j = 0; j < 4; j++) |
|---|
| 1093 | basetable[i][j] = 0.0; |
|---|
| 1094 | } |
|---|
| 1095 | for (i = 0; i < endsite; i++) { |
|---|
| 1096 | for (k = 0; p->x[i][0][k] == 0.0; k++); |
|---|
| 1097 | basefreq1[k] += weight[i]; |
|---|
| 1098 | for (l = 0; q->x[i][0][l] == 0.0; l++); |
|---|
| 1099 | basefreq2[l] += weight[i]; |
|---|
| 1100 | basetable[k][l] += weight[i]; |
|---|
| 1101 | } |
|---|
| 1102 | vv = lndet(basetable); |
|---|
| 1103 | if (vv == 99.0) { |
|---|
| 1104 | printf("\nNegative or zero determinant for distance between species"); |
|---|
| 1105 | printf(" %ld and %ld\n", m, n); |
|---|
| 1106 | baddists = true; |
|---|
| 1107 | } |
|---|
| 1108 | vv = -0.25*(vv - 0.5*(log(basefreq1[0])+log(basefreq1[1]) |
|---|
| 1109 | +log(basefreq1[2])+log(basefreq1[3]) |
|---|
| 1110 | +log(basefreq2[0])+log(basefreq2[1]) |
|---|
| 1111 | +log(basefreq2[2])+log(basefreq2[3]))); |
|---|
| 1112 | } |
|---|
| 1113 | *v = vv; |
|---|
| 1114 | } /* makev */ |
|---|
| 1115 | |
|---|
| 1116 | |
|---|
| 1117 | void makedists() |
|---|
| 1118 | { |
|---|
| 1119 | /* compute distance matrix */ |
|---|
| 1120 | long i, j; |
|---|
| 1121 | double v; |
|---|
| 1122 | |
|---|
| 1123 | inittable(); |
|---|
| 1124 | for (i = 0; i < endsite; i++) |
|---|
| 1125 | weightrat[i] = weight[i] * rate[category[alias[i] - 1] - 1]; |
|---|
| 1126 | if (progress) { |
|---|
| 1127 | printf("Distances calculated for species\n"); |
|---|
| 1128 | #ifdef WIN32 |
|---|
| 1129 | phyFillScreenColor(); |
|---|
| 1130 | #endif |
|---|
| 1131 | } |
|---|
| 1132 | for (i = 0; i < spp; i++) |
|---|
| 1133 | if (similarity) |
|---|
| 1134 | d[i][i] = 1.0; |
|---|
| 1135 | else |
|---|
| 1136 | d[i][i] = 0.0; |
|---|
| 1137 | baddists = false; |
|---|
| 1138 | for (i = 1; i < spp; i++) { |
|---|
| 1139 | if (progress) { |
|---|
| 1140 | printf(" "); |
|---|
| 1141 | for (j = 0; j < nmlngth; j++) |
|---|
| 1142 | putchar(nayme[i - 1][j]); |
|---|
| 1143 | printf(" "); |
|---|
| 1144 | } |
|---|
| 1145 | for (j = i + 1; j <= spp; j++) { |
|---|
| 1146 | makev(i, j, &v); |
|---|
| 1147 | d[i - 1][j - 1] = v; |
|---|
| 1148 | d[j - 1][i - 1] = v; |
|---|
| 1149 | if (progress) { |
|---|
| 1150 | putchar('.'); |
|---|
| 1151 | fflush(stdout); |
|---|
| 1152 | } |
|---|
| 1153 | } |
|---|
| 1154 | if (progress) { |
|---|
| 1155 | putchar('\n'); |
|---|
| 1156 | #ifdef WIN32 |
|---|
| 1157 | phyFillScreenColor(); |
|---|
| 1158 | #endif |
|---|
| 1159 | } |
|---|
| 1160 | } |
|---|
| 1161 | if (baddists) |
|---|
| 1162 | exxit(-1); |
|---|
| 1163 | if (progress) { |
|---|
| 1164 | printf(" "); |
|---|
| 1165 | for (j = 0; j < nmlngth; j++) |
|---|
| 1166 | putchar(nayme[spp - 1][j]); |
|---|
| 1167 | putchar('\n'); |
|---|
| 1168 | } |
|---|
| 1169 | for (i = 0; i < spp; i++) { |
|---|
| 1170 | for (j = 0; j < endsite; j++) |
|---|
| 1171 | free(nodep[i]->x[j]); |
|---|
| 1172 | free(nodep[i]->x); |
|---|
| 1173 | } |
|---|
| 1174 | } /* makedists */ |
|---|
| 1175 | |
|---|
| 1176 | |
|---|
| 1177 | void writedists() |
|---|
| 1178 | { |
|---|
| 1179 | /* write out distances */ |
|---|
| 1180 | long i, j, k, n; |
|---|
| 1181 | |
|---|
| 1182 | if (!printdata && !similarity) |
|---|
| 1183 | fprintf(outfile, "%5ld\n", spp); |
|---|
| 1184 | else |
|---|
| 1185 | fprintf(outfile, "\n"); |
|---|
| 1186 | if (!similarity) { |
|---|
| 1187 | for (i = 0; i < spp; i++) { |
|---|
| 1188 | for (j = 0; j < nmlngth; j++) |
|---|
| 1189 | putc(nayme[i][j], outfile); |
|---|
| 1190 | if (lower) |
|---|
| 1191 | k = i; |
|---|
| 1192 | else |
|---|
| 1193 | k = spp; |
|---|
| 1194 | for (j = 1; j <= k; j++) { |
|---|
| 1195 | fprintf(outfile, "%8.4f", d[i][j - 1]); |
|---|
| 1196 | if ((j + 1) % 9 == 0 && j < k) |
|---|
| 1197 | putc('\n', outfile); |
|---|
| 1198 | } |
|---|
| 1199 | putc('\n', outfile); |
|---|
| 1200 | } |
|---|
| 1201 | } else { |
|---|
| 1202 | for (i = 0; i < spp; i += 8) { |
|---|
| 1203 | if ((i+8) < spp) |
|---|
| 1204 | n = i+8; |
|---|
| 1205 | else |
|---|
| 1206 | n = spp; |
|---|
| 1207 | fprintf(outfile, " "); |
|---|
| 1208 | for (j = i; j < n ; j++) { |
|---|
| 1209 | for (k = 0; k < (nmlngth-3); k++) |
|---|
| 1210 | putc(nayme[j][k], outfile); |
|---|
| 1211 | putc(' ', outfile); |
|---|
| 1212 | } |
|---|
| 1213 | putc('\n', outfile); |
|---|
| 1214 | for (j = 0; j < spp; j++) { |
|---|
| 1215 | for (k = 0; k < nmlngth; k++) |
|---|
| 1216 | putc(nayme[j][k], outfile); |
|---|
| 1217 | if ((i+8) < spp) |
|---|
| 1218 | n = i+8; |
|---|
| 1219 | else |
|---|
| 1220 | n = spp; |
|---|
| 1221 | for (k = i; k < n ; k++) |
|---|
| 1222 | fprintf(outfile, "%8.4f", d[j][k]); |
|---|
| 1223 | putc('\n', outfile); |
|---|
| 1224 | } |
|---|
| 1225 | putc('\n', outfile); |
|---|
| 1226 | } |
|---|
| 1227 | } |
|---|
| 1228 | if (progress) |
|---|
| 1229 | printf("\nDistances written to file \"%s\"\n\n", outfilename); |
|---|
| 1230 | } /* writedists */ |
|---|
| 1231 | |
|---|
| 1232 | |
|---|
| 1233 | int main(int argc, Char *argv[]) |
|---|
| 1234 | { /* DNA Distances by Maximum Likelihood */ |
|---|
| 1235 | #ifdef MAC |
|---|
| 1236 | argc = 1; /* macsetup("Dnadist",""); */ |
|---|
| 1237 | argv[0] = "Dnadist"; |
|---|
| 1238 | #endif |
|---|
| 1239 | init(argc, argv); |
|---|
| 1240 | openfile(&infile,INFILE,"input file","r",argv[0],infilename); |
|---|
| 1241 | openfile(&outfile,OUTFILE,"output file","w",argv[0],outfilename); |
|---|
| 1242 | |
|---|
| 1243 | ibmpc = IBMCRT; |
|---|
| 1244 | ansi = ANSICRT; |
|---|
| 1245 | mulsets = false; |
|---|
| 1246 | datasets = 1; |
|---|
| 1247 | firstset = true; |
|---|
| 1248 | doinit(); |
|---|
| 1249 | ttratio0 = ttratio; |
|---|
| 1250 | if (ctgry) |
|---|
| 1251 | openfile(&catfile,CATFILE,"categories file","r",argv[0],catfilename); |
|---|
| 1252 | if (weights || justwts) |
|---|
| 1253 | openfile(&weightfile,WEIGHTFILE,"weights file","r",argv[0],weightfilename); |
|---|
| 1254 | for (ith = 1; ith <= datasets; ith++) { |
|---|
| 1255 | ttratio = ttratio0; |
|---|
| 1256 | getinput(); |
|---|
| 1257 | if (ith == 1) |
|---|
| 1258 | firstset = false; |
|---|
| 1259 | if (datasets > 1 && progress) |
|---|
| 1260 | printf("Data set # %ld:\n\n",ith); |
|---|
| 1261 | makedists(); |
|---|
| 1262 | writedists(); |
|---|
| 1263 | } |
|---|
| 1264 | FClose(infile); |
|---|
| 1265 | FClose(outfile); |
|---|
| 1266 | #ifdef MAC |
|---|
| 1267 | fixmacfile(outfilename); |
|---|
| 1268 | #endif |
|---|
| 1269 | printf("Done.\n\n"); |
|---|
| 1270 | #ifdef WIN32 |
|---|
| 1271 | phyRestoreConsoleAttributes(); |
|---|
| 1272 | #endif |
|---|
| 1273 | return 0; |
|---|
| 1274 | } /* DNA Distances by Maximum Likelihood */ |
|---|
| 1275 | |
|---|