1 | #include "phylip.h" |
---|
2 | #include "seq.h" |
---|
3 | |
---|
4 | /* version 3.6. (c) Copyright 1993-2002 by the University of Washington. |
---|
5 | Written by Joseph Felsenstein, Akiko Fuseki, Sean Lamont, and Andrew Keeffe. |
---|
6 | Permission is granted to copy and use this program provided no fee is |
---|
7 | charged for it and provided that this copyright notice is not removed. */ |
---|
8 | |
---|
9 | #define iterationsd 100 /* number of iterates of EM for each distance */ |
---|
10 | |
---|
/* Per-rate-category working values; this is the element type of tbl[]. */
typedef struct valrec {
  double rat;     /* filled from rate[i] by inittable() */
  double ratxv;   /* filled from rate[i] * xv by inittable() */
  double z1;      /* z1 .. z1xv are not written by the code visible here */
  double y1;
  double z1zz;
  double z1yy;
  double z1xv;
} valrec;
---|
14 | |
---|
15 | extern sequence y; |
---|
16 | |
---|
17 | Char infilename[FNMLNGTH], outfilename[FNMLNGTH], catfilename[FNMLNGTH], weightfilename[FNMLNGTH]; |
---|
18 | long sites, categs, weightsum, datasets, ith, rcategs; |
---|
19 | boolean freqsfrom, jukes, kimura, logdet, gama, invar, similarity, lower, f84, |
---|
20 | weights, progress, ctgry, mulsets, justwts, firstset, baddists; |
---|
21 | node **nodep; |
---|
22 | double xi, xv, ttratio, ttratio0, freqa, freqc, freqg, freqt, freqr, freqy, |
---|
23 | freqar, freqcy, freqgr, freqty, cvi, invarfrac, sumrates, fracchange; |
---|
24 | steptr oldweight; |
---|
25 | double rate[maxcategs]; |
---|
26 | double **d; |
---|
27 | double sumweightrat; /* these values were propagated */ |
---|
28 | double *weightrat; /* to global values from */ |
---|
29 | valrec tbl[maxcategs]; /* function makedists. */ |
---|
30 | |
---|
31 | |
---|
#ifndef OLDC
/* prototypes for the functions of this program */
void   getoptions(void);
void   allocrest(void);
void   reallocsites(void);
void   doinit(void);
void   inputcategories(void);
void   printcategories(void);
void   inputoptions(void);
void   dnadist_sitesort(void);
void   dnadist_sitecombine(void);
void   dnadist_sitescrunch(void);
void   makeweights(void);
void   dnadist_makevalues(void);
void   dnadist_empiricalfreqs(void);
void   getinput(void);
void   inittable(void);
double lndet(double (*a)[4]);
void   makev(long m, long n, double *v);
void   makedists(void);
void   writedists(void);
#endif
---|
55 | |
---|
56 | |
---|
57 | void getoptions() |
---|
58 | { |
---|
59 | /* interactively set options */ |
---|
60 | long loopcount, loopcount2; |
---|
61 | Char ch, ch2; |
---|
62 | boolean ttr; |
---|
63 | |
---|
64 | ctgry = false; |
---|
65 | categs = 1; |
---|
66 | cvi = 1.0; |
---|
67 | rcategs = 1; |
---|
68 | rate[0] = 1.0; |
---|
69 | freqsfrom = true; |
---|
70 | gama = false; |
---|
71 | invar = false; |
---|
72 | invarfrac = 0.0; |
---|
73 | jukes = false; |
---|
74 | justwts = false; |
---|
75 | kimura = false; |
---|
76 | logdet = false; |
---|
77 | f84 = true; |
---|
78 | lower = false; |
---|
79 | similarity = false; |
---|
80 | ttratio = 2.0; |
---|
81 | ttr = false; |
---|
82 | weights = false; |
---|
83 | printdata = false; |
---|
84 | progress = true; |
---|
85 | interleaved = true; |
---|
86 | loopcount = 0; |
---|
87 | for (;;) { |
---|
88 | cleerhome(); |
---|
89 | printf("\nNucleic acid sequence Distance Matrix program,"); |
---|
90 | printf(" version %s\n\n",VERSION); |
---|
91 | printf("Settings for this run:\n"); |
---|
92 | printf(" D Distance (F84, Kimura, Jukes-Cantor, LogDet)? %s\n", |
---|
93 | kimura ? "Kimura 2-parameter" : |
---|
94 | jukes ? "Jukes-Cantor" : |
---|
95 | logdet ? "LogDet" : |
---|
96 | similarity ? "Similarity table" : "F84"); |
---|
97 | if (kimura || f84 || jukes) { |
---|
98 | printf(" G Gamma distributed rates across sites? "); |
---|
99 | if (gama) |
---|
100 | printf("Yes\n"); |
---|
101 | else { |
---|
102 | if (invar) |
---|
103 | printf("Gamma+Invariant\n"); |
---|
104 | else |
---|
105 | printf("No\n"); |
---|
106 | } |
---|
107 | } |
---|
108 | if (kimura || f84) { |
---|
109 | printf(" T Transition/transversion ratio?"); |
---|
110 | if (!ttr) |
---|
111 | printf(" 2.0\n"); |
---|
112 | else |
---|
113 | printf("%8.4f\n", ttratio); |
---|
114 | } |
---|
115 | if (!logdet && !similarity && !gama && !invar) { |
---|
116 | printf(" C One category of substitution rates?"); |
---|
117 | if (!ctgry || categs == 1) |
---|
118 | printf(" Yes\n"); |
---|
119 | else |
---|
120 | printf(" %ld categories\n", categs); |
---|
121 | } |
---|
122 | printf(" W Use weights for sites?"); |
---|
123 | if (weights) |
---|
124 | printf(" Yes\n"); |
---|
125 | else |
---|
126 | printf(" No\n"); |
---|
127 | if (f84) |
---|
128 | printf(" F Use empirical base frequencies? %s\n", |
---|
129 | (freqsfrom ? "Yes" : "No")); |
---|
130 | printf(" L Form of distance matrix? %s\n", |
---|
131 | (lower ? "Lower-triangular" : "Square")); |
---|
132 | printf(" M Analyze multiple data sets?"); |
---|
133 | if (mulsets) |
---|
134 | printf(" Yes, %2ld %s\n", datasets, |
---|
135 | (justwts ? "sets of weights" : "data sets")); |
---|
136 | else |
---|
137 | printf(" No\n"); |
---|
138 | printf(" I Input sequences interleaved? %s\n", |
---|
139 | (interleaved ? "Yes" : "No, sequential")); |
---|
140 | printf(" 0 Terminal type (IBM PC, ANSI, none)? %s\n", |
---|
141 | ibmpc ? "IBM PC" : ansi ? "ANSI" : "(none)"); |
---|
142 | printf(" 1 Print out the data at start of run %s\n", |
---|
143 | (printdata ? "Yes" : "No")); |
---|
144 | printf(" 2 Print indications of progress of run %s\n", |
---|
145 | (progress ? "Yes" : "No")); |
---|
146 | printf("\n Y to accept these or type the letter for one to change\n"); |
---|
147 | #ifdef WIN32 |
---|
148 | phyFillScreenColor(); |
---|
149 | #endif |
---|
150 | scanf("%c%*[^\n]", &ch); |
---|
151 | getchar(); |
---|
152 | uppercase(&ch); |
---|
153 | if (ch == 'Y') |
---|
154 | break; |
---|
155 | if ((f84 && (strchr("CFGWLDTMI012",ch) != NULL)) || |
---|
156 | (kimura && (strchr("CGWLDTMI012",ch) != NULL)) || |
---|
157 | (jukes && (strchr("CGWLDMI012",ch) != NULL)) || |
---|
158 | ((logdet || similarity) && (strchr("WLDMI012",ch)) != NULL) || |
---|
159 | (ctgry && (strchr("CFWLDTMI012",ch) != NULL))) { |
---|
160 | switch (ch) { |
---|
161 | |
---|
162 | case 'D': |
---|
163 | if (kimura) { |
---|
164 | kimura = false; |
---|
165 | jukes = true; |
---|
166 | freqsfrom = false; |
---|
167 | } else if (f84) { |
---|
168 | f84 = false; |
---|
169 | kimura = true; |
---|
170 | freqsfrom = false; |
---|
171 | } else if (logdet) { |
---|
172 | logdet = false; |
---|
173 | similarity = true; |
---|
174 | } else if (similarity) { |
---|
175 | similarity = false; |
---|
176 | f84 = true; |
---|
177 | freqsfrom = true; |
---|
178 | } else { |
---|
179 | jukes = false; |
---|
180 | logdet = true; |
---|
181 | freqsfrom = false; |
---|
182 | } |
---|
183 | break; |
---|
184 | |
---|
185 | case 'G': |
---|
186 | if (!(gama || invar)) |
---|
187 | gama = true; |
---|
188 | else { |
---|
189 | if (gama) { |
---|
190 | gama = false; |
---|
191 | invar = true; |
---|
192 | } else { |
---|
193 | if (invar) |
---|
194 | invar = false; |
---|
195 | } |
---|
196 | } |
---|
197 | break; |
---|
198 | |
---|
199 | |
---|
200 | case 'C': |
---|
201 | ctgry = !ctgry; |
---|
202 | if (ctgry) { |
---|
203 | initcatn(&categs); |
---|
204 | initcategs(categs, rate); |
---|
205 | } |
---|
206 | break; |
---|
207 | |
---|
208 | case 'F': |
---|
209 | freqsfrom = !freqsfrom; |
---|
210 | if (!freqsfrom) |
---|
211 | initfreqs(&freqa, &freqc, &freqg, &freqt); |
---|
212 | break; |
---|
213 | |
---|
214 | case 'W': |
---|
215 | weights = !weights; |
---|
216 | break; |
---|
217 | |
---|
218 | case 'L': |
---|
219 | lower = !lower; |
---|
220 | break; |
---|
221 | |
---|
222 | case 'T': |
---|
223 | ttr = !ttr; |
---|
224 | if (ttr) |
---|
225 | initratio(&ttratio); |
---|
226 | break; |
---|
227 | |
---|
228 | case 'M': |
---|
229 | mulsets = !mulsets; |
---|
230 | if (mulsets) { |
---|
231 | printf("Multiple data sets or multiple weights?"); |
---|
232 | loopcount2 = 0; |
---|
233 | do { |
---|
234 | printf(" (type D or W)\n"); |
---|
235 | #ifdef WIN32 |
---|
236 | phyFillScreenColor(); |
---|
237 | #endif |
---|
238 | scanf("%c%*[^\n]", &ch2); |
---|
239 | uppercase(&ch2); |
---|
240 | getchar(); |
---|
241 | countup(&loopcount2, 10); |
---|
242 | } while ((ch2 != 'W') && (ch2 != 'D')); |
---|
243 | justwts = (ch2 == 'W'); |
---|
244 | if (justwts) |
---|
245 | justweights(&datasets); |
---|
246 | else |
---|
247 | initdatasets(&datasets); |
---|
248 | } |
---|
249 | break; |
---|
250 | |
---|
251 | case 'I': |
---|
252 | interleaved = !interleaved; |
---|
253 | break; |
---|
254 | |
---|
255 | case '0': |
---|
256 | initterminal(&ibmpc, &ansi); |
---|
257 | break; |
---|
258 | |
---|
259 | case '1': |
---|
260 | printdata = !printdata; |
---|
261 | break; |
---|
262 | |
---|
263 | case '2': |
---|
264 | progress = !progress; |
---|
265 | break; |
---|
266 | } |
---|
267 | } else { |
---|
268 | if (strchr("CFGWLDTMI012",ch) == NULL) |
---|
269 | printf("Not a possible option!\n"); |
---|
270 | else |
---|
271 | printf("That option not allowed with these settings\n"); |
---|
272 | printf("\nPress Enter or Return key to continue\n"); |
---|
273 | getchar(); |
---|
274 | } |
---|
275 | countup(&loopcount, 100); |
---|
276 | } |
---|
277 | if (gama || invar) { |
---|
278 | loopcount = 0; |
---|
279 | do { |
---|
280 | printf( |
---|
281 | "\nCoefficient of variation of substitution rate among sites (must be positive)\n"); |
---|
282 | printf( |
---|
283 | " In gamma distribution parameters, this is 1/(square root of alpha)\n"); |
---|
284 | #ifdef WIN32 |
---|
285 | phyFillScreenColor(); |
---|
286 | #endif |
---|
287 | scanf("%lf%*[^\n]", &cvi); |
---|
288 | getchar(); |
---|
289 | countup(&loopcount, 10); |
---|
290 | } while (cvi <= 0.0); |
---|
291 | cvi = 1.0 / (cvi * cvi); |
---|
292 | } |
---|
293 | if (invar) { |
---|
294 | loopcount = 0; |
---|
295 | do { |
---|
296 | printf("Fraction of invariant sites?\n"); |
---|
297 | scanf("%lf%*[^\n]", &invarfrac); |
---|
298 | getchar(); |
---|
299 | countup (&loopcount, 10); |
---|
300 | } while ((invarfrac <= 0.0) || (invarfrac >= 1.0)); |
---|
301 | } |
---|
302 | if (!printdata) |
---|
303 | return; |
---|
304 | fprintf(outfile, "\nNucleic acid sequence Distance Matrix program,"); |
---|
305 | fprintf(outfile, " version %s\n\n",VERSION); |
---|
306 | } /* getoptions */ |
---|
307 | |
---|
308 | |
---|
309 | void allocrest() |
---|
310 | { |
---|
311 | long i; |
---|
312 | |
---|
313 | y = (Char **)Malloc(spp*sizeof(Char *)); |
---|
314 | nodep = (node **)Malloc(spp*sizeof(node *)); |
---|
315 | for (i = 0; i < spp; i++) { |
---|
316 | y[i] = (Char *)Malloc(sites*sizeof(Char)); |
---|
317 | nodep[i] = (node *)Malloc(sizeof(node)); |
---|
318 | } |
---|
319 | d = (double **)Malloc(spp*sizeof(double *)); |
---|
320 | for (i = 0; i < spp; i++) |
---|
321 | d[i] = (double*)Malloc(spp*sizeof(double)); |
---|
322 | nayme = (naym *)Malloc(spp*sizeof(naym)); |
---|
323 | category = (steptr)Malloc(sites*sizeof(long)); |
---|
324 | oldweight = (steptr)Malloc(sites*sizeof(long)); |
---|
325 | weight = (steptr)Malloc(sites*sizeof(long)); |
---|
326 | alias = (steptr)Malloc(sites*sizeof(long)); |
---|
327 | ally = (steptr)Malloc(sites*sizeof(long)); |
---|
328 | location = (steptr)Malloc(sites*sizeof(long)); |
---|
329 | weightrat = (double *)Malloc(sites*sizeof(double)); |
---|
330 | } /* allocrest */ |
---|
331 | |
---|
332 | |
---|
333 | void reallocsites() |
---|
334 | {/* The amount of sites can change between runs |
---|
335 | this function reallocates all the variables |
---|
336 | whose size depends on the amount of sites */ |
---|
337 | long i; |
---|
338 | |
---|
339 | for (i = 0; i < spp; i++) { |
---|
340 | free(y[i]); |
---|
341 | y[i] = (Char *)Malloc(sites*sizeof(Char)); |
---|
342 | } |
---|
343 | free(category); |
---|
344 | free(oldweight); |
---|
345 | free(weight); |
---|
346 | free(alias); |
---|
347 | free(ally); |
---|
348 | free(location); |
---|
349 | free(weightrat); |
---|
350 | |
---|
351 | category = (steptr)Malloc(sites*sizeof(long)); |
---|
352 | oldweight = (steptr)Malloc(sites*sizeof(long)); |
---|
353 | weight = (steptr)Malloc(sites*sizeof(long)); |
---|
354 | alias = (steptr)Malloc(sites*sizeof(long)); |
---|
355 | ally = (steptr)Malloc(sites*sizeof(long)); |
---|
356 | location = (steptr)Malloc(sites*sizeof(long)); |
---|
357 | weightrat = (double *)Malloc(sites*sizeof(double)); |
---|
358 | } /* reallocsites */ |
---|
359 | |
---|
360 | |
---|
361 | void doinit() |
---|
362 | { |
---|
363 | /* initializes variables */ |
---|
364 | |
---|
365 | inputnumbers(&spp, &sites, &nonodes, 1); |
---|
366 | getoptions(); |
---|
367 | if (printdata) |
---|
368 | fprintf(outfile, "%2ld species, %3ld sites\n", spp, sites); |
---|
369 | allocrest(); |
---|
370 | } /* doinit */ |
---|
371 | |
---|
372 | |
---|
373 | void inputcategories() |
---|
374 | { |
---|
375 | /* reads the categories for each site */ |
---|
376 | long i; |
---|
377 | Char ch; |
---|
378 | |
---|
379 | for (i = 1; i < nmlngth; i++) |
---|
380 | gettc(infile); |
---|
381 | for (i = 0; i < sites; i++) { |
---|
382 | do { |
---|
383 | if (eoln(infile)) |
---|
384 | scan_eoln(infile); |
---|
385 | ch = gettc(infile); |
---|
386 | } while (ch == ' '); |
---|
387 | category[i] = ch - '0'; |
---|
388 | } |
---|
389 | scan_eoln(infile); |
---|
390 | } /* inputcategories */ |
---|
391 | |
---|
392 | |
---|
393 | void printcategories() |
---|
394 | { /* print out list of categories of sites */ |
---|
395 | long i, j; |
---|
396 | |
---|
397 | fprintf(outfile, "Rate categories\n\n"); |
---|
398 | for (i = 1; i <= nmlngth + 3; i++) |
---|
399 | putc(' ', outfile); |
---|
400 | for (i = 1; i <= sites; i++) { |
---|
401 | fprintf(outfile, "%ld", category[i - 1]); |
---|
402 | if (i % 60 == 0) { |
---|
403 | putc('\n', outfile); |
---|
404 | for (j = 1; j <= nmlngth + 3; j++) |
---|
405 | putc(' ', outfile); |
---|
406 | } else if (i % 10 == 0) |
---|
407 | putc(' ', outfile); |
---|
408 | } |
---|
409 | fprintf(outfile, "\n\n"); |
---|
410 | } /* printcategories */ |
---|
411 | |
---|
412 | |
---|
413 | void inputoptions() |
---|
414 | { |
---|
415 | /* read options information */ |
---|
416 | long i; |
---|
417 | |
---|
418 | if (!firstset && !justwts) { |
---|
419 | samenumsp(&sites, ith); |
---|
420 | reallocsites(); |
---|
421 | } |
---|
422 | for (i = 0; i < sites; i++) { |
---|
423 | category[i] = 1; |
---|
424 | oldweight[i] = 1; |
---|
425 | } |
---|
426 | if (justwts || weights) |
---|
427 | inputweights(sites, oldweight, &weights); |
---|
428 | if (printdata) |
---|
429 | putc('\n', outfile); |
---|
430 | if (jukes && printdata) |
---|
431 | fprintf(outfile, " Jukes-Cantor Distance\n"); |
---|
432 | if (kimura && printdata) |
---|
433 | fprintf(outfile, " Kimura 2-parameter Distance\n"); |
---|
434 | if (f84 && printdata) |
---|
435 | fprintf(outfile, " F84 Distance\n"); |
---|
436 | if (similarity) |
---|
437 | fprintf(outfile, " \n Table of similarity between sequences\n"); |
---|
438 | if (firstset && printdata && (kimura || f84)) |
---|
439 | fprintf(outfile, "\nTransition/transversion ratio = %10.6f\n", ttratio); |
---|
440 | if (ctgry && categs > 1) { |
---|
441 | inputcategs(0, sites, category, categs, "DnaDist"); |
---|
442 | if (printdata) |
---|
443 | printcategs(outfile, sites, category, "Site categories"); |
---|
444 | } else if (printdata && (categs > 1)) { |
---|
445 | fprintf(outfile, "\nSite category Rate of change\n\n"); |
---|
446 | for (i = 1; i <= categs; i++) |
---|
447 | fprintf(outfile, "%12ld%13.3f\n", i, rate[i - 1]); |
---|
448 | putc('\n', outfile); |
---|
449 | printcategories(); |
---|
450 | } |
---|
451 | if ((jukes || kimura || logdet) && freqsfrom) { |
---|
452 | printf(" WARNING: CANNOT USE EMPIRICAL BASE FREQUENCIES"); |
---|
453 | printf(" WITH JUKES-CANTOR, KIMURA, JIN/NEI OR LOGDET DISTANCES\n"); |
---|
454 | exxit(-1); |
---|
455 | } |
---|
456 | if (jukes) |
---|
457 | ttratio = 0.5000001; |
---|
458 | if (weights && printdata) |
---|
459 | printweights(outfile, 0, sites, oldweight, "Sites"); |
---|
460 | } /* inputoptions */ |
---|
461 | |
---|
462 | |
---|
463 | void dnadist_sitesort() |
---|
464 | { |
---|
465 | /* Shell sort of sites lexicographically */ |
---|
466 | long gap, i, j, jj, jg, k, itemp; |
---|
467 | boolean flip, tied; |
---|
468 | |
---|
469 | gap = sites / 2; |
---|
470 | while (gap > 0) { |
---|
471 | for (i = gap + 1; i <= sites; i++) { |
---|
472 | j = i - gap; |
---|
473 | flip = true; |
---|
474 | while (j > 0 && flip) { |
---|
475 | jj = alias[j - 1]; |
---|
476 | jg = alias[j + gap - 1]; |
---|
477 | tied = (oldweight[jj - 1] == oldweight[jg - 1]); |
---|
478 | flip = (oldweight[jj - 1] < oldweight[jg - 1] || |
---|
479 | (tied && category[jj - 1] > category[jg - 1])); |
---|
480 | tied = (tied && category[jj - 1] == category[jg - 1]); |
---|
481 | k = 1; |
---|
482 | while (k <= spp && tied) { |
---|
483 | flip = (y[k - 1][jj - 1] > y[k - 1][jg - 1]); |
---|
484 | tied = (tied && y[k - 1][jj - 1] == y[k - 1][jg - 1]); |
---|
485 | k++; |
---|
486 | } |
---|
487 | if (!flip) |
---|
488 | break; |
---|
489 | itemp = alias[j - 1]; |
---|
490 | alias[j - 1] = alias[j + gap - 1]; |
---|
491 | alias[j + gap - 1] = itemp; |
---|
492 | j -= gap; |
---|
493 | } |
---|
494 | } |
---|
495 | gap /= 2; |
---|
496 | } |
---|
497 | } /* dnadist_sitesort */ |
---|
498 | |
---|
499 | |
---|
500 | void dnadist_sitecombine() |
---|
501 | { |
---|
502 | /* combine sites that have identical patterns */ |
---|
503 | long i, j, k; |
---|
504 | boolean tied; |
---|
505 | |
---|
506 | i = 1; |
---|
507 | while (i < sites) { |
---|
508 | j = i + 1; |
---|
509 | tied = true; |
---|
510 | while (j <= sites && tied) { |
---|
511 | tied = (oldweight[alias[i - 1] - 1] == oldweight[alias[j - 1] - 1] && |
---|
512 | category[alias[i - 1] - 1] == category[alias[j - 1] - 1]); |
---|
513 | k = 1; |
---|
514 | while (k <= spp && tied) { |
---|
515 | tied = (tied && |
---|
516 | y[k - 1][alias[i - 1] - 1] == y[k - 1][alias[j - 1] - 1]); |
---|
517 | k++; |
---|
518 | } |
---|
519 | if (!tied) |
---|
520 | break; |
---|
521 | ally[alias[j - 1] - 1] = alias[i - 1]; |
---|
522 | j++; |
---|
523 | } |
---|
524 | i = j; |
---|
525 | } |
---|
526 | } /* dnadist_sitecombine */ |
---|
527 | |
---|
528 | |
---|
529 | void dnadist_sitescrunch() |
---|
530 | { |
---|
531 | /* move so one representative of each pattern of |
---|
532 | sites comes first */ |
---|
533 | long i, j, itemp; |
---|
534 | boolean done, found, completed; |
---|
535 | |
---|
536 | done = false; |
---|
537 | i = 1; |
---|
538 | j = 2; |
---|
539 | while (!done) { |
---|
540 | if (ally[alias[i - 1] - 1] != alias[i - 1]) { |
---|
541 | if (j <= i) |
---|
542 | j = i + 1; |
---|
543 | if (j <= sites) { |
---|
544 | do { |
---|
545 | found = (ally[alias[j - 1] - 1] == alias[j - 1]); |
---|
546 | j++; |
---|
547 | completed = (j > sites); |
---|
548 | if (j <= sites) |
---|
549 | completed = (oldweight[alias[j - 1] - 1] == 0); |
---|
550 | } while (!(found || completed)); |
---|
551 | if (found) { |
---|
552 | j--; |
---|
553 | itemp = alias[i - 1]; |
---|
554 | alias[i - 1] = alias[j - 1]; |
---|
555 | alias[j - 1] = itemp; |
---|
556 | } else |
---|
557 | done = true; |
---|
558 | } else |
---|
559 | done = true; |
---|
560 | } |
---|
561 | i++; |
---|
562 | done = (done || i >= sites); |
---|
563 | } |
---|
564 | } /* dnadist_sitescrunch */ |
---|
565 | |
---|
566 | |
---|
567 | void makeweights() |
---|
568 | { |
---|
569 | /* make up weights vector to avoid duplicate computations */ |
---|
570 | long i; |
---|
571 | |
---|
572 | for (i = 1; i <= sites; i++) { |
---|
573 | alias[i - 1] = i; |
---|
574 | ally[i - 1] = i; |
---|
575 | weight[i - 1] = 0; |
---|
576 | } |
---|
577 | dnadist_sitesort(); |
---|
578 | dnadist_sitecombine(); |
---|
579 | dnadist_sitescrunch(); |
---|
580 | endsite = 0; |
---|
581 | for (i = 1; i <= sites; i++) { |
---|
582 | if (ally[i - 1] == i && oldweight[i - 1] > 0) |
---|
583 | endsite++; |
---|
584 | } |
---|
585 | for (i = 1; i <= endsite; i++) |
---|
586 | location[alias[i - 1] - 1] = i; |
---|
587 | weightsum = 0; |
---|
588 | for (i = 0; i < sites; i++) |
---|
589 | weightsum += oldweight[i]; |
---|
590 | sumrates = 0.0; |
---|
591 | for (i = 0; i < sites; i++) |
---|
592 | sumrates += oldweight[i] * rate[category[i] - 1]; |
---|
593 | for (i = 0; i < categs; i++) |
---|
594 | rate[i] *= weightsum / sumrates; |
---|
595 | for (i = 0; i < sites; i++) |
---|
596 | weight[location[ally[i] - 1] - 1] += oldweight[i]; |
---|
597 | } /* makeweights */ |
---|
598 | |
---|
599 | |
---|
600 | void dnadist_makevalues() |
---|
601 | { |
---|
602 | /* set up fractional likelihoods at tips */ |
---|
603 | long i, j, k; |
---|
604 | bases b; |
---|
605 | |
---|
606 | for (i = 0; i < spp; i++) { |
---|
607 | nodep[i]->x = (phenotype)Malloc(endsite*sizeof(ratelike)); |
---|
608 | for (j = 0; j < endsite; j++) |
---|
609 | nodep[i]->x[j] = (ratelike)Malloc(rcategs*sizeof(sitelike)); |
---|
610 | } |
---|
611 | for (k = 0; k < endsite; k++) { |
---|
612 | j = alias[k]; |
---|
613 | for (i = 0; i < spp; i++) { |
---|
614 | for (b = A; (long)b <= (long)T; b = (bases)((long)b + 1)) |
---|
615 | nodep[i]->x[k][0][(long)b - (long)A] = 0.0; |
---|
616 | switch (y[i][j - 1]) { |
---|
617 | |
---|
618 | case 'A': |
---|
619 | nodep[i]->x[k][0][0] = 1.0; |
---|
620 | break; |
---|
621 | |
---|
622 | case 'C': |
---|
623 | nodep[i]->x[k][0][(long)C - (long)A] = 1.0; |
---|
624 | break; |
---|
625 | |
---|
626 | case 'G': |
---|
627 | nodep[i]->x[k][0][(long)G - (long)A] = 1.0; |
---|
628 | break; |
---|
629 | |
---|
630 | case 'T': |
---|
631 | nodep[i]->x[k][0][(long)T - (long)A] = 1.0; |
---|
632 | break; |
---|
633 | |
---|
634 | case 'U': |
---|
635 | nodep[i]->x[k][0][(long)T - (long)A] = 1.0; |
---|
636 | break; |
---|
637 | |
---|
638 | case 'M': |
---|
639 | nodep[i]->x[k][0][0] = 1.0; |
---|
640 | nodep[i]->x[k][0][(long)C - (long)A] = 1.0; |
---|
641 | break; |
---|
642 | |
---|
643 | case 'R': |
---|
644 | nodep[i]->x[k][0][0] = 1.0; |
---|
645 | nodep[i]->x[k][0][(long)G - (long)A] = 1.0; |
---|
646 | break; |
---|
647 | |
---|
648 | case 'W': |
---|
649 | nodep[i]->x[k][0][0] = 1.0; |
---|
650 | nodep[i]->x[k][0][(long)T - (long)A] = 1.0; |
---|
651 | break; |
---|
652 | |
---|
653 | case 'S': |
---|
654 | nodep[i]->x[k][0][(long)C - (long)A] = 1.0; |
---|
655 | nodep[i]->x[k][0][(long)G - (long)A] = 1.0; |
---|
656 | break; |
---|
657 | |
---|
658 | case 'Y': |
---|
659 | nodep[i]->x[k][0][(long)C - (long)A] = 1.0; |
---|
660 | nodep[i]->x[k][0][(long)T - (long)A] = 1.0; |
---|
661 | break; |
---|
662 | |
---|
663 | case 'K': |
---|
664 | nodep[i]->x[k][0][(long)G - (long)A] = 1.0; |
---|
665 | nodep[i]->x[k][0][(long)T - (long)A] = 1.0; |
---|
666 | break; |
---|
667 | |
---|
668 | case 'B': |
---|
669 | nodep[i]->x[k][0][(long)C - (long)A] = 1.0; |
---|
670 | nodep[i]->x[k][0][(long)G - (long)A] = 1.0; |
---|
671 | nodep[i]->x[k][0][(long)T - (long)A] = 1.0; |
---|
672 | break; |
---|
673 | |
---|
674 | case 'D': |
---|
675 | nodep[i]->x[k][0][0] = 1.0; |
---|
676 | nodep[i]->x[k][0][(long)G - (long)A] = 1.0; |
---|
677 | nodep[i]->x[k][0][(long)T - (long)A] = 1.0; |
---|
678 | break; |
---|
679 | |
---|
680 | case 'H': |
---|
681 | nodep[i]->x[k][0][0] = 1.0; |
---|
682 | nodep[i]->x[k][0][(long)C - (long)A] = 1.0; |
---|
683 | nodep[i]->x[k][0][(long)T - (long)A] = 1.0; |
---|
684 | break; |
---|
685 | |
---|
686 | case 'V': |
---|
687 | nodep[i]->x[k][0][0] = 1.0; |
---|
688 | nodep[i]->x[k][0][(long)C - (long)A] = 1.0; |
---|
689 | nodep[i]->x[k][0][(long)G - (long)A] = 1.0; |
---|
690 | break; |
---|
691 | |
---|
692 | case 'N': |
---|
693 | for (b = A; (long)b <= (long)T; b = (bases)((long)b + 1)) |
---|
694 | nodep[i]->x[k][0][(long)b - (long)A] = 1.0; |
---|
695 | break; |
---|
696 | |
---|
697 | case 'X': |
---|
698 | for (b = A; (long)b <= (long)T; b = (bases)((long)b + 1)) |
---|
699 | nodep[i]->x[k][0][(long)b - (long)A] = 1.0; |
---|
700 | break; |
---|
701 | |
---|
702 | case '?': |
---|
703 | for (b = A; (long)b <= (long)T; b = (bases)((long)b + 1)) |
---|
704 | nodep[i]->x[k][0][(long)b - (long)A] = 1.0; |
---|
705 | break; |
---|
706 | |
---|
707 | case 'O': |
---|
708 | for (b = A; (long)b <= (long)T; b = (bases)((long)b + 1)) |
---|
709 | nodep[i]->x[k][0][(long)b - (long)A] = 1.0; |
---|
710 | break; |
---|
711 | |
---|
712 | case '-': |
---|
713 | for (b = A; (long)b <= (long)T; b = (bases)((long)b + 1)) |
---|
714 | nodep[i]->x[k][0][(long)b - (long)A] = 1.0; |
---|
715 | break; |
---|
716 | } |
---|
717 | } |
---|
718 | } |
---|
719 | } /* dnadist_makevalues */ |
---|
720 | |
---|
721 | |
---|
722 | void dnadist_empiricalfreqs() |
---|
723 | { |
---|
724 | /* Get empirical base frequencies from the data */ |
---|
725 | long i, j, k; |
---|
726 | double sum, suma, sumc, sumg, sumt, w; |
---|
727 | |
---|
728 | freqa = 0.25; |
---|
729 | freqc = 0.25; |
---|
730 | freqg = 0.25; |
---|
731 | freqt = 0.25; |
---|
732 | for (k = 1; k <= 8; k++) { |
---|
733 | suma = 0.0; |
---|
734 | sumc = 0.0; |
---|
735 | sumg = 0.0; |
---|
736 | sumt = 0.0; |
---|
737 | for (i = 0; i < spp; i++) { |
---|
738 | for (j = 0; j < endsite; j++) { |
---|
739 | w = weight[j]; |
---|
740 | sum = freqa * nodep[i]->x[j][0][0]; |
---|
741 | sum += freqc * nodep[i]->x[j][0][(long)C - (long)A]; |
---|
742 | sum += freqg * nodep[i]->x[j][0][(long)G - (long)A]; |
---|
743 | sum += freqt * nodep[i]->x[j][0][(long)T - (long)A]; |
---|
744 | suma += w * freqa * nodep[i]->x[j][0][0] / sum; |
---|
745 | sumc += w * freqc * nodep[i]->x[j][0][(long)C - (long)A] / sum; |
---|
746 | sumg += w * freqg * nodep[i]->x[j][0][(long)G - (long)A] / sum; |
---|
747 | sumt += w * freqt * nodep[i]->x[j][0][(long)T - (long)A] / sum; |
---|
748 | } |
---|
749 | } |
---|
750 | sum = suma + sumc + sumg + sumt; |
---|
751 | freqa = suma / sum; |
---|
752 | freqc = sumc / sum; |
---|
753 | freqg = sumg / sum; |
---|
754 | freqt = sumt / sum; |
---|
755 | } |
---|
756 | } /* dnadist_empiricalfreqs */ |
---|
757 | |
---|
758 | |
---|
759 | void getinput() |
---|
760 | { |
---|
761 | /* reads the input data */ |
---|
762 | inputoptions(); |
---|
763 | if ((!freqsfrom) && !logdet && !similarity) { |
---|
764 | if (kimura || jukes) { |
---|
765 | freqa = 0.25; |
---|
766 | freqc = 0.25; |
---|
767 | freqg = 0.25; |
---|
768 | freqt = 0.25; |
---|
769 | } |
---|
770 | getbasefreqs(freqa, freqc, freqg, freqt, &freqr, &freqy, &freqar, &freqcy, |
---|
771 | &freqgr, &freqty, &ttratio, &xi, &xv, &fracchange, |
---|
772 | freqsfrom, printdata); |
---|
773 | if (freqa < 0.00000001) { |
---|
774 | freqa = 0.000001; |
---|
775 | freqc = 0.999999*freqc; |
---|
776 | freqg = 0.999999*freqg; |
---|
777 | freqt = 0.999999*freqt; |
---|
778 | } |
---|
779 | if (freqc < 0.00000001) { |
---|
780 | freqa = 0.999999*freqa; |
---|
781 | freqc = 0.000001; |
---|
782 | freqg = 0.999999*freqg; |
---|
783 | freqt = 0.999999*freqt; |
---|
784 | } |
---|
785 | if (freqg < 0.00000001) { |
---|
786 | freqa = 0.999999*freqa; |
---|
787 | freqc = 0.999999*freqc; |
---|
788 | freqg = 0.000001; |
---|
789 | freqt = 0.999999*freqt; |
---|
790 | } |
---|
791 | if (freqt < 0.00000001) { |
---|
792 | freqa = 0.999999*freqa; |
---|
793 | freqc = 0.999999*freqc; |
---|
794 | freqg = 0.999999*freqg; |
---|
795 | freqt = 0.000001; |
---|
796 | } |
---|
797 | } |
---|
798 | if (!justwts || firstset) |
---|
799 | inputdata(sites); |
---|
800 | makeweights(); |
---|
801 | dnadist_makevalues(); |
---|
802 | if (freqsfrom) { |
---|
803 | dnadist_empiricalfreqs(); |
---|
804 | getbasefreqs(freqa, freqc, freqg, freqt, &freqr, &freqy, &freqar, &freqcy, |
---|
805 | &freqgr, &freqty, &ttratio, &xi, &xv, &fracchange, |
---|
806 | freqsfrom, printdata); |
---|
807 | } |
---|
808 | } /* getinput */ |
---|
809 | |
---|
810 | |
---|
811 | void inittable() |
---|
812 | { |
---|
813 | /* Define a lookup table. Precompute values and store in a table */ |
---|
814 | long i; |
---|
815 | |
---|
816 | for (i = 0; i < categs; i++) { |
---|
817 | tbl[i].rat = rate[i]; |
---|
818 | tbl[i].ratxv = rate[i] * xv; |
---|
819 | } |
---|
820 | } /* inittable */ |
---|
821 | |
---|
822 | |
---|
823 | double lndet(double (*a)[4]) |
---|
824 | { |
---|
825 | long i, j, k; |
---|
826 | double temp, ld; |
---|
827 | |
---|
828 | /*Gauss-Jordan reduction -- invert matrix a in place, |
---|
829 | overwriting previous contents of a. On exit, matrix a |
---|
830 | contains the inverse, lndet contains the log of the determinant */ |
---|
831 | ld = 1.0; |
---|
832 | for (i = 0; i < 4; i++) { |
---|
833 | ld *= a[i][i]; |
---|
834 | temp = 1.0 / a[i][i]; |
---|
835 | a[i][i] = 1.0; |
---|
836 | for (j = 0; j < 4; j++) |
---|
837 | a[i][j] *= temp; |
---|
838 | for (j = 0; j < 4; j++) { |
---|
839 | if (j != i) { |
---|
840 | temp = a[j][i]; |
---|
841 | a[j][i] = 0.0; |
---|
842 | for (k = 0; k < 4; k++) |
---|
843 | a[j][k] -= temp * a[i][k]; |
---|
844 | } |
---|
845 | } |
---|
846 | } |
---|
847 | if (ld <= 0.0) |
---|
848 | return(99.0); |
---|
849 | else |
---|
850 | return(log(ld)); |
---|
851 | } /* lndet */ |
---|
852 | |
---|
853 | |
---|
854 | void makev(long m, long n, double *v) |
---|
855 | { |
---|
856 | /* compute one distance */ |
---|
857 | long i, j, k, l, it, num1, num2, idx; |
---|
858 | long numerator = 0, denominator = 0; |
---|
859 | double sum, sum1, sum2, sumyr, lz, aa, bb, cc, vv=0, |
---|
860 | p1, p2, p3, q1, q2, q3, tt, delta, slope, |
---|
861 | xx1freqa, xx1freqc, xx1freqg, xx1freqt; |
---|
862 | double *prod, *prod2, *prod3; |
---|
863 | boolean quick, jukesquick, kimquick, logdetquick; |
---|
864 | bases b; |
---|
865 | node *p, *q; |
---|
866 | sitelike xx1, xx2; |
---|
867 | double basetable[4][4]; /* for quick logdet */ |
---|
868 | double basefreq1[4], basefreq2[4]; |
---|
869 | |
---|
870 | p = nodep[m - 1]; |
---|
871 | q = nodep[n - 1]; |
---|
872 | quick = (!ctgry || categs == 1); |
---|
873 | if (jukes || kimura || logdet || similarity) { |
---|
874 | numerator = 0; |
---|
875 | denominator = 0; |
---|
876 | for (i = 0; i < endsite; i++) { |
---|
877 | memcpy(xx1, p->x[i][0], sizeof(sitelike)); |
---|
878 | memcpy(xx2, q->x[i][0], sizeof(sitelike)); |
---|
879 | sum = 0.0; |
---|
880 | sum1 = 0.0; |
---|
881 | sum2 = 0.0; |
---|
882 | for (b = A; (long)b <= (long)T; b = (bases)((long)b + 1)) { |
---|
883 | sum1 += xx1[(long)b - (long)A]; |
---|
884 | sum2 += xx2[(long)b - (long)A]; |
---|
885 | sum += xx1[(long)b - (long)A] * xx2[(long)b - (long)A]; |
---|
886 | } |
---|
887 | quick = (quick && (sum1 == 1.0 || sum1 == 4.0) && |
---|
888 | (sum2 == 1.0 || sum2 == 4.0)); |
---|
889 | if (sum1 == 1.0 && sum2 == 1.0) { |
---|
890 | numerator += (long)(weight[i] * sum); |
---|
891 | denominator += weight[i]; |
---|
892 | } |
---|
893 | } |
---|
894 | } |
---|
895 | jukesquick = ((jukes || similarity) && quick); |
---|
896 | kimquick = (kimura && quick); |
---|
897 | logdetquick = (logdet && quick); |
---|
898 | if (logdet && !quick) { |
---|
899 | printf(" WARNING: CANNOT CALCULATE LOGDET DISTANCE\n"); |
---|
900 | printf(" WITH PRESENT PROGRAM IF PARTIALLY AMBIGUOUS NUCLEOTIDES\n"); |
---|
901 | baddists = true; |
---|
902 | } |
---|
903 | if (jukesquick && jukes && (numerator * 4 <= denominator)) { |
---|
904 | printf("\nWARNING: INFINITE DISTANCE BETWEEN "); |
---|
905 | printf(" SPECIES %3ld AND %3ld\n", m, n); |
---|
906 | baddists = true; |
---|
907 | } |
---|
908 | if (jukesquick && invar |
---|
909 | && (4 * (((double)numerator / denominator) - invarfrac) |
---|
910 | <= (1.0 - invarfrac))) { |
---|
911 | printf("\nWARNING: DIFFERENCE BETWEEN SPECIES %3ld AND %3ld", m, n); |
---|
912 | printf(" TOO LARGE FOR INVARIABLE SITES\n"); |
---|
913 | baddists = true; |
---|
914 | } |
---|
915 | if (jukesquick) { |
---|
916 | if (similarity) |
---|
917 | vv = (double)numerator / denominator; |
---|
918 | else { |
---|
919 | if (!gama && !invar) |
---|
920 | vv = -0.75 * log((4.0*((double)numerator / denominator) - 1.0) / 3.0); |
---|
921 | else if (!invar) |
---|
922 | vv = 0.75 * cvi * (exp(-(1/cvi)* |
---|
923 | log((4.0 * ((double)numerator / denominator) - 1.0) / 3.0)) - 1.0); |
---|
924 | else |
---|
925 | vv = 0.75 * cvi * (exp(-(1/cvi)* |
---|
926 | log((4.0 * ((double)numerator / denominator - invarfrac)/ |
---|
927 | (1.0-invarfrac) - 1.0) / 3.0)) - 1.0); |
---|
928 | } |
---|
929 | } |
---|
930 | if (kimquick) { |
---|
931 | num1 = 0; |
---|
932 | num2 = 0; |
---|
933 | denominator = 0; |
---|
934 | for (i = 0; i < endsite; i++) { |
---|
935 | memcpy(xx1, p->x[i][0], sizeof(sitelike)); |
---|
936 | memcpy(xx2, q->x[i][0], sizeof(sitelike)); |
---|
937 | sum = 0.0; |
---|
938 | sum1 = 0.0; |
---|
939 | sum2 = 0.0; |
---|
940 | for (b = A; (long)b <= (long)T; b = (bases)((long)b + 1)) { |
---|
941 | sum1 += xx1[(long)b - (long)A]; |
---|
942 | sum2 += xx2[(long)b - (long)A]; |
---|
943 | sum += xx1[(long)b - (long)A] * xx2[(long)b - (long)A]; |
---|
944 | } |
---|
945 | sumyr = (xx1[0] + xx1[(long)G - (long)A]) |
---|
946 | * (xx2[0] + xx2[(long)G - (long)A]) + |
---|
947 | (xx1[(long)C - (long)A] + xx1[(long)T - (long)A]) * |
---|
948 | (xx2[(long)C - (long)A] + xx2[(long)T - (long)A]); |
---|
949 | if (sum1 == 1.0 && sum2 == 1.0) { |
---|
950 | num1 += (long)(weight[i] * sum); |
---|
951 | num2 += (long)(weight[i] * (sumyr - sum)); |
---|
952 | denominator += weight[i]; |
---|
953 | } |
---|
954 | } |
---|
955 | tt = ((1.0 - (double)num1 / denominator)-invarfrac)/(1.0-invarfrac); |
---|
956 | if (tt > 0.0) { |
---|
957 | delta = 0.1; |
---|
958 | tt = delta; |
---|
959 | it = 0; |
---|
960 | while (fabs(delta) > 0.00002 && it < iterationsd) { |
---|
961 | it++; |
---|
962 | if (!gama) { |
---|
963 | p1 = exp(-tt); |
---|
964 | p2 = exp(-xv * tt) - exp(-tt); |
---|
965 | p3 = 1.0 - exp(-xv * tt); |
---|
966 | } else { |
---|
967 | p1 = exp(-cvi * log(1 + tt / cvi)); |
---|
968 | p2 = exp(-cvi * log(1 + xv * tt / cvi)) |
---|
969 | - exp(-cvi * log(1 + tt / cvi)); |
---|
970 | p3 = 1.0 - exp(-cvi * log(1 + xv * tt / cvi)); |
---|
971 | } |
---|
972 | q1 = p1 + p2 / 2.0 + p3 / 4.0; |
---|
973 | q2 = p2 / 2.0 + p3 / 4.0; |
---|
974 | q3 = p3 / 2.0; |
---|
975 | q1 = q1 * (1.0-invarfrac) + invarfrac; |
---|
976 | q2 *= (1.0 - invarfrac); |
---|
977 | q3 *= (1.0 - invarfrac); |
---|
978 | if (!gama && !invar) |
---|
979 | slope = 0.5 * exp(-tt) * (num2 / q2 - num1 / q1) + |
---|
980 | 0.25 * xv * exp(-xv * tt) * |
---|
981 | ((denominator - num1 - num2) * 2 / q3 - num2 / q2 - num1 / q1); |
---|
982 | else |
---|
983 | slope = 0.5 * (1 / (1 + tt / cvi)) * exp(-cvi * log(1 + tt / cvi)) * |
---|
984 | (num2 / q2 - num1 / q1) + 0.25 * (xv / (1 + xv * tt / cvi)) * |
---|
985 | exp(-cvi * log(1 + xv * tt / cvi)) * |
---|
986 | ((denominator - num1 - num2) * 2 / q3 - num2 / q2 - num1 / q1); |
---|
987 | slope *= (1.0-invarfrac); |
---|
988 | if (slope < 0.0) |
---|
989 | delta = fabs(delta) / -2.0; |
---|
990 | else |
---|
991 | delta = fabs(delta); |
---|
992 | tt += delta; |
---|
993 | } |
---|
994 | } |
---|
995 | if ((delta >= 0.1) && (!similarity)) { |
---|
996 | printf("\nWARNING: DIFFERENCE BETWEEN SPECIES %3ld AND %3ld", m, n); |
---|
997 | if (invar) |
---|
998 | printf(" TOO LARGE FOR INVARIABLE SITES\n"); |
---|
999 | else |
---|
1000 | printf(" TOO LARGE TO ESTIMATE DISTANCE\n"); |
---|
1001 | baddists = true; |
---|
1002 | } |
---|
1003 | vv = fracchange * tt; |
---|
1004 | } |
---|
1005 | if (!(jukesquick || kimquick || logdet)) { |
---|
1006 | prod = (double *)Malloc(sites*sizeof(double)); |
---|
1007 | prod2 = (double *)Malloc(sites*sizeof(double)); |
---|
1008 | prod3 = (double *)Malloc(sites*sizeof(double)); |
---|
1009 | for (i = 0; i < endsite; i++) { |
---|
1010 | memcpy(xx1, p->x[i][0], sizeof(sitelike)); |
---|
1011 | memcpy(xx2, q->x[i][0], sizeof(sitelike)); |
---|
1012 | xx1freqa = xx1[0] * freqa; |
---|
1013 | xx1freqc = xx1[(long)C - (long)A] * freqc; |
---|
1014 | xx1freqg = xx1[(long)G - (long)A] * freqg; |
---|
1015 | xx1freqt = xx1[(long)T - (long)A] * freqt; |
---|
1016 | sum1 = xx1freqa + xx1freqc + xx1freqg + xx1freqt; |
---|
1017 | sum2 = freqa * xx2[0] + freqc * xx2[(long)C - (long)A] + |
---|
1018 | freqg * xx2[(long)G - (long)A] + freqt * xx2[(long)T - (long)A]; |
---|
1019 | prod[i] = sum1 * sum2; |
---|
1020 | prod2[i] = (xx1freqa + xx1freqg) * |
---|
1021 | (xx2[0] * freqar + xx2[(long)G - (long)A] * freqgr) + |
---|
1022 | (xx1freqc + xx1freqt) * |
---|
1023 | (xx2[(long)C - (long)A] * freqcy + xx2[(long)T - (long)A] * freqty); |
---|
1024 | prod3[i] = xx1freqa * xx2[0] + xx1freqc * xx2[(long)C - (long)A] + |
---|
1025 | xx1freqg * xx2[(long)G - (long)A] + xx1freqt * xx2[(long)T - (long)A]; |
---|
1026 | } |
---|
1027 | tt = 0.1; |
---|
1028 | delta = 0.1; |
---|
1029 | it = 1; |
---|
1030 | while (it < iterationsd && fabs(delta) > 0.00002) { |
---|
1031 | slope = 0.0; |
---|
1032 | if (tt > 0.0) { |
---|
1033 | lz = -tt; |
---|
1034 | for (i = 0; i < categs; i++) { |
---|
1035 | if (!gama) { |
---|
1036 | tbl[i].z1 = exp(tbl[i].ratxv * lz); |
---|
1037 | tbl[i].z1zz = exp(tbl[i].rat * lz); |
---|
1038 | } |
---|
1039 | else { |
---|
1040 | tbl[i].z1 = exp(-cvi*log(1.0-tbl[i].ratxv * lz/cvi)); |
---|
1041 | tbl[i].z1zz = exp(-cvi*log(1.0-tbl[i].rat * lz/cvi)); |
---|
1042 | } |
---|
1043 | tbl[i].y1 = 1.0 - tbl[i].z1; |
---|
1044 | tbl[i].z1yy = tbl[i].z1 - tbl[i].z1zz; |
---|
1045 | tbl[i].z1xv = tbl[i].z1 * xv; |
---|
1046 | } |
---|
1047 | for (i = 0; i < endsite; i++) { |
---|
1048 | idx = category[alias[i] - 1]; |
---|
1049 | cc = prod[i]; |
---|
1050 | bb = prod2[i]; |
---|
1051 | aa = prod3[i]; |
---|
1052 | if (!gama && !invar) |
---|
1053 | slope += weightrat[i] * (tbl[idx - 1].z1zz * (bb - aa) + |
---|
1054 | tbl[idx - 1].z1xv * (cc - bb)) / |
---|
1055 | (aa * tbl[idx - 1].z1zz + bb * tbl[idx - 1].z1yy + |
---|
1056 | cc * tbl[idx - 1].y1); |
---|
1057 | else |
---|
1058 | slope += (1.0-invarfrac) * weightrat[i] * ( |
---|
1059 | ((tbl[idx-1].rat)/(1.0-tbl[idx-1].rat * lz/cvi)) |
---|
1060 | * tbl[idx - 1].z1zz * (bb - aa) + |
---|
1061 | ((tbl[idx-1].ratxv)/(1.0-tbl[idx-1].ratxv * lz/cvi)) |
---|
1062 | * tbl[idx - 1].z1 * (cc - bb)) / |
---|
1063 | (aa * ((1.0-invarfrac)*tbl[idx - 1].z1zz + invarfrac) |
---|
1064 | + bb * (1.0-invarfrac)*tbl[idx - 1].z1yy |
---|
1065 | + cc * (1.0-invarfrac)*tbl[idx - 1].y1); |
---|
1066 | } |
---|
1067 | } |
---|
1068 | if (slope < 0.0) |
---|
1069 | delta = fabs(delta) / -2.0; |
---|
1070 | else |
---|
1071 | delta = fabs(delta); |
---|
1072 | tt += delta; |
---|
1073 | it++; |
---|
1074 | } |
---|
1075 | if ((delta >= 0.1) && (!similarity)) { |
---|
1076 | printf("\nWARNING: DIFFERENCE BETWEEN SPECIES %3ld AND %3ld", m, n); |
---|
1077 | if (invar) |
---|
1078 | printf(" TOO LARGE FOR INVARIABLE SITES\n"); |
---|
1079 | else |
---|
1080 | printf(" TOO LARGE TO ESTIMATE DISTANCE\n"); |
---|
1081 | baddists = true; |
---|
1082 | } |
---|
1083 | vv = tt * fracchange; |
---|
1084 | free(prod); |
---|
1085 | free(prod2); |
---|
1086 | free(prod3); |
---|
1087 | } |
---|
1088 | if (logdetquick) { /* compute logdet when no ambiguous nucleotides */ |
---|
1089 | for (i = 0; i < 4; i++) { |
---|
1090 | basefreq1[i] = 0.0; |
---|
1091 | basefreq2[i] = 0.0; |
---|
1092 | for (j = 0; j < 4; j++) |
---|
1093 | basetable[i][j] = 0.0; |
---|
1094 | } |
---|
1095 | for (i = 0; i < endsite; i++) { |
---|
1096 | for (k = 0; p->x[i][0][k] == 0.0; k++); |
---|
1097 | basefreq1[k] += weight[i]; |
---|
1098 | for (l = 0; q->x[i][0][l] == 0.0; l++); |
---|
1099 | basefreq2[l] += weight[i]; |
---|
1100 | basetable[k][l] += weight[i]; |
---|
1101 | } |
---|
1102 | vv = lndet(basetable); |
---|
1103 | if (vv == 99.0) { |
---|
1104 | printf("\nNegative or zero determinant for distance between species"); |
---|
1105 | printf(" %ld and %ld\n", m, n); |
---|
1106 | baddists = true; |
---|
1107 | } |
---|
1108 | vv = -0.25*(vv - 0.5*(log(basefreq1[0])+log(basefreq1[1]) |
---|
1109 | +log(basefreq1[2])+log(basefreq1[3]) |
---|
1110 | +log(basefreq2[0])+log(basefreq2[1]) |
---|
1111 | +log(basefreq2[2])+log(basefreq2[3]))); |
---|
1112 | } |
---|
1113 | *v = vv; |
---|
1114 | } /* makev */ |
---|
1115 | |
---|
1116 | |
---|
1117 | void makedists() |
---|
1118 | { |
---|
1119 | /* compute distance matrix */ |
---|
1120 | long i, j; |
---|
1121 | double v; |
---|
1122 | |
---|
1123 | inittable(); |
---|
1124 | for (i = 0; i < endsite; i++) |
---|
1125 | weightrat[i] = weight[i] * rate[category[alias[i] - 1] - 1]; |
---|
1126 | if (progress) { |
---|
1127 | printf("Distances calculated for species\n"); |
---|
1128 | #ifdef WIN32 |
---|
1129 | phyFillScreenColor(); |
---|
1130 | #endif |
---|
1131 | } |
---|
1132 | for (i = 0; i < spp; i++) |
---|
1133 | if (similarity) |
---|
1134 | d[i][i] = 1.0; |
---|
1135 | else |
---|
1136 | d[i][i] = 0.0; |
---|
1137 | baddists = false; |
---|
1138 | for (i = 1; i < spp; i++) { |
---|
1139 | if (progress) { |
---|
1140 | printf(" "); |
---|
1141 | for (j = 0; j < nmlngth; j++) |
---|
1142 | putchar(nayme[i - 1][j]); |
---|
1143 | printf(" "); |
---|
1144 | } |
---|
1145 | for (j = i + 1; j <= spp; j++) { |
---|
1146 | makev(i, j, &v); |
---|
1147 | d[i - 1][j - 1] = v; |
---|
1148 | d[j - 1][i - 1] = v; |
---|
1149 | if (progress) { |
---|
1150 | putchar('.'); |
---|
1151 | fflush(stdout); |
---|
1152 | } |
---|
1153 | } |
---|
1154 | if (progress) { |
---|
1155 | putchar('\n'); |
---|
1156 | #ifdef WIN32 |
---|
1157 | phyFillScreenColor(); |
---|
1158 | #endif |
---|
1159 | } |
---|
1160 | } |
---|
1161 | if (baddists) |
---|
1162 | exxit(-1); |
---|
1163 | if (progress) { |
---|
1164 | printf(" "); |
---|
1165 | for (j = 0; j < nmlngth; j++) |
---|
1166 | putchar(nayme[spp - 1][j]); |
---|
1167 | putchar('\n'); |
---|
1168 | } |
---|
1169 | for (i = 0; i < spp; i++) { |
---|
1170 | for (j = 0; j < endsite; j++) |
---|
1171 | free(nodep[i]->x[j]); |
---|
1172 | free(nodep[i]->x); |
---|
1173 | } |
---|
1174 | } /* makedists */ |
---|
1175 | |
---|
1176 | |
---|
1177 | void writedists() |
---|
1178 | { |
---|
1179 | /* write out distances */ |
---|
1180 | long i, j, k, n; |
---|
1181 | |
---|
1182 | if (!printdata && !similarity) |
---|
1183 | fprintf(outfile, "%5ld\n", spp); |
---|
1184 | else |
---|
1185 | fprintf(outfile, "\n"); |
---|
1186 | if (!similarity) { |
---|
1187 | for (i = 0; i < spp; i++) { |
---|
1188 | for (j = 0; j < nmlngth; j++) |
---|
1189 | putc(nayme[i][j], outfile); |
---|
1190 | if (lower) |
---|
1191 | k = i; |
---|
1192 | else |
---|
1193 | k = spp; |
---|
1194 | for (j = 1; j <= k; j++) { |
---|
1195 | fprintf(outfile, "%8.4f", d[i][j - 1]); |
---|
1196 | if ((j + 1) % 9 == 0 && j < k) |
---|
1197 | putc('\n', outfile); |
---|
1198 | } |
---|
1199 | putc('\n', outfile); |
---|
1200 | } |
---|
1201 | } else { |
---|
1202 | for (i = 0; i < spp; i += 8) { |
---|
1203 | if ((i+8) < spp) |
---|
1204 | n = i+8; |
---|
1205 | else |
---|
1206 | n = spp; |
---|
1207 | fprintf(outfile, " "); |
---|
1208 | for (j = i; j < n ; j++) { |
---|
1209 | for (k = 0; k < (nmlngth-3); k++) |
---|
1210 | putc(nayme[j][k], outfile); |
---|
1211 | putc(' ', outfile); |
---|
1212 | } |
---|
1213 | putc('\n', outfile); |
---|
1214 | for (j = 0; j < spp; j++) { |
---|
1215 | for (k = 0; k < nmlngth; k++) |
---|
1216 | putc(nayme[j][k], outfile); |
---|
1217 | if ((i+8) < spp) |
---|
1218 | n = i+8; |
---|
1219 | else |
---|
1220 | n = spp; |
---|
1221 | for (k = i; k < n ; k++) |
---|
1222 | fprintf(outfile, "%8.4f", d[j][k]); |
---|
1223 | putc('\n', outfile); |
---|
1224 | } |
---|
1225 | putc('\n', outfile); |
---|
1226 | } |
---|
1227 | } |
---|
1228 | if (progress) |
---|
1229 | printf("\nDistances written to file \"%s\"\n\n", outfilename); |
---|
1230 | } /* writedists */ |
---|
1231 | |
---|
1232 | |
---|
1233 | int main(int argc, Char *argv[]) |
---|
1234 | { /* DNA Distances by Maximum Likelihood */ |
---|
1235 | #ifdef MAC |
---|
1236 | argc = 1; /* macsetup("Dnadist",""); */ |
---|
1237 | argv[0] = "Dnadist"; |
---|
1238 | #endif |
---|
1239 | init(argc, argv); |
---|
1240 | openfile(&infile,INFILE,"input file","r",argv[0],infilename); |
---|
1241 | openfile(&outfile,OUTFILE,"output file","w",argv[0],outfilename); |
---|
1242 | |
---|
1243 | ibmpc = IBMCRT; |
---|
1244 | ansi = ANSICRT; |
---|
1245 | mulsets = false; |
---|
1246 | datasets = 1; |
---|
1247 | firstset = true; |
---|
1248 | doinit(); |
---|
1249 | ttratio0 = ttratio; |
---|
1250 | if (ctgry) |
---|
1251 | openfile(&catfile,CATFILE,"categories file","r",argv[0],catfilename); |
---|
1252 | if (weights || justwts) |
---|
1253 | openfile(&weightfile,WEIGHTFILE,"weights file","r",argv[0],weightfilename); |
---|
1254 | for (ith = 1; ith <= datasets; ith++) { |
---|
1255 | ttratio = ttratio0; |
---|
1256 | getinput(); |
---|
1257 | if (ith == 1) |
---|
1258 | firstset = false; |
---|
1259 | if (datasets > 1 && progress) |
---|
1260 | printf("Data set # %ld:\n\n",ith); |
---|
1261 | makedists(); |
---|
1262 | writedists(); |
---|
1263 | } |
---|
1264 | FClose(infile); |
---|
1265 | FClose(outfile); |
---|
1266 | #ifdef MAC |
---|
1267 | fixmacfile(outfilename); |
---|
1268 | #endif |
---|
1269 | printf("Done.\n\n"); |
---|
1270 | #ifdef WIN32 |
---|
1271 | phyRestoreConsoleAttributes(); |
---|
1272 | #endif |
---|
1273 | return 0; |
---|
1274 | } /* DNA Distances by Maximum Likelihood */ |
---|
1275 | |
---|