1 | /* |
---|
2 | * ml3.c |
---|
3 | * |
---|
4 | * |
---|
5 | * Part of TREE-PUZZLE 5.0 (June 2000) |
---|
6 | * |
---|
7 | * (c) 1999-2000 by Heiko A. Schmidt, Korbinian Strimmer, |
---|
8 | * M. Vingron, and Arndt von Haeseler |
---|
9 | * (c) 1995-1999 by Korbinian Strimmer and Arndt von Haeseler |
---|
10 | * |
---|
11 | * All parts of the source except where indicated are distributed under |
---|
12 | * the GNU public licence. See http://www.opensource.org for details. |
---|
13 | */ |
---|
14 | |
---|
15 | |
---|
16 | #define EXTERN extern |
---|
17 | |
---|
18 | |
---|
19 | /* prototypes */ |
---|
20 | #include <stdio.h> |
---|
21 | #include <stdlib.h> |
---|
22 | #include <math.h> |
---|
23 | #include "util.h" |
---|
24 | #include "ml.h" |
---|
25 | #include "gamma.h" |
---|
26 | |
---|
27 | |
---|
28 | |
---|
29 | /******************************************************************************/ |
---|
30 | /* discrete Gamma-distribution and related stuff */ |
---|
31 | /******************************************************************************/ |
---|
32 | |
---|
33 | /* compare general base frequencies with frequencies of taxon i with chi square */ |
---|
34 | double homogentest(int taxon) |
---|
35 | { |
---|
36 | return chi2test(Freqtpm, Basecomp[taxon], gettpmradix(), &chi2fail); |
---|
37 | } |
---|
38 | |
---|
39 | |
---|
40 | /* discrete Gamma according to Yang 1994 (JME 39:306-314) */ |
---|
41 | void YangDiscreteGamma (double shape, int c, dvector x) |
---|
42 | { |
---|
43 | double twoc, mu; |
---|
44 | int i; |
---|
45 | |
---|
46 | twoc = 2.0*c; |
---|
47 | mu = 0.0; |
---|
48 | for (i = 0; i < c; i++) |
---|
49 | { |
---|
50 | /* corresponding rates */ |
---|
51 | x[i] = icdfGamma ( (2.0*i+1.0)/twoc, shape); |
---|
52 | mu += x[i]; |
---|
53 | } |
---|
54 | mu = mu/c; |
---|
55 | |
---|
56 | /* rescale for avarage rate of 1.0 */ |
---|
57 | for (i = 0; i < c; i++) |
---|
58 | { |
---|
59 | x[i] /= mu; |
---|
60 | } |
---|
61 | } |
---|
62 | |
---|
63 | /* compute rates of each category when rates are Gamma-distributed */ |
---|
64 | void updaterates() |
---|
65 | { |
---|
66 | int i; |
---|
67 | double alpha; |
---|
68 | |
---|
69 | if (numcats == 1) |
---|
70 | { |
---|
71 | Rates[0] = 1.0; |
---|
72 | return; |
---|
73 | } |
---|
74 | if (Geta == 0.0) |
---|
75 | { |
---|
76 | for (i = 0; i < numcats; i++) |
---|
77 | Rates[i] = 1.0; |
---|
78 | return; |
---|
79 | } |
---|
80 | alpha = (1.0 - Geta)/Geta; |
---|
81 | |
---|
82 | YangDiscreteGamma (alpha, numcats, Rates); |
---|
83 | |
---|
84 | /* if invariable sites are present */ |
---|
85 | for (i = 0; i < numcats; i++) |
---|
86 | Rates[i] = Rates[i]/(1.0-fracinv); |
---|
87 | |
---|
88 | /* check for very small rates */ |
---|
89 | for (i = 0; i < numcats; i++) |
---|
90 | if (Rates[i] < 0.000001) Rates[i] = 0.000001; |
---|
91 | } |
---|
92 | |
---|
93 | |
---|
94 | |
---|
95 | /******************************************************************************/ |
---|
96 | /* parameter estimation */ |
---|
97 | /******************************************************************************/ |
---|
98 | |
---|
99 | /* compute sample mean and standard deviation of sample mean */ |
---|
100 | void computestat(double *data, int n, double *mean, double *err) |
---|
101 | { |
---|
102 | int i; |
---|
103 | double sum; |
---|
104 | |
---|
105 | sum = 0; |
---|
106 | for (i = 0; i < n; i++) sum += data[i]; |
---|
107 | (*mean) = sum/(double) n; |
---|
108 | |
---|
109 | sum = 0; |
---|
110 | for (i = 0; i < n; i++) sum += (data[i] - (*mean))*(data[i] - (*mean)); |
---|
111 | if (n != 1) |
---|
112 | (*err) = sqrt(sum)/sqrt((double)(n-1)*n); /* unbiased estimator */ |
---|
113 | else |
---|
114 | (*err) = 0.0; /* if n == 1 */ |
---|
115 | } |
---|
116 | |
---|
117 | /* compute ML value of quartet (a,b,c,d) */ |
---|
118 | double quartetml(int a, int b, int c, int d) |
---|
119 | { |
---|
120 | double d1, d2, d3; |
---|
121 | |
---|
122 | /* compute ML for all topologies */ |
---|
123 | if (approxp_optn) { /* approximate parameter mode */ |
---|
124 | d1 = quartet_alklhd(a,b,c,d); /* (a,b)-(c,d) */ |
---|
125 | d2 = quartet_alklhd(a,c,b,d); /* (a,c)-(b,d) */ |
---|
126 | d3 = quartet_alklhd(a,d,b,c); /* (a,d)-(b,c) */ |
---|
127 | } else { |
---|
128 | d1 = quartet_lklhd(a,b,c,d); /* (a,b)-(c,d) */ |
---|
129 | d2 = quartet_lklhd(a,c,b,d); /* (a,c)-(b,d) */ |
---|
130 | d3 = quartet_lklhd(a,d,b,c); /* (a,d)-(b,c) */ |
---|
131 | } |
---|
132 | |
---|
133 | /* looking for max(d1, d2, d3) */ |
---|
134 | if (d1 < d2) { /* d2 > d1 */ |
---|
135 | if (d2 < d3) { /* d3 > d2 > d1 */ |
---|
136 | /* d3 maximum */ |
---|
137 | return d3; |
---|
138 | } else { /* d2 >= d3 > d1 */ |
---|
139 | /* d2 maximum */ |
---|
140 | return d2; |
---|
141 | } |
---|
142 | } else { /* d1 >= d2 */ |
---|
143 | if (d1 < d3) { /* d3 > d1 >= d2 */ |
---|
144 | /* d3 maximum */ |
---|
145 | return d3; |
---|
146 | } else { /* d1 >= d2 && d1 >= d3 */ |
---|
147 | /* d1 maximum */ |
---|
148 | return d1; |
---|
149 | } |
---|
150 | } |
---|
151 | } |
---|
152 | |
---|
153 | /* optimization function TSparam - quartets */ |
---|
154 | double opttsq(double x) |
---|
155 | { |
---|
156 | if (x < MINTS) TSparam = MINTS; |
---|
157 | else if (x > MAXTS) TSparam = MAXTS; |
---|
158 | else TSparam = x; |
---|
159 | tranprobmat(); |
---|
160 | distupdate(qca, qcb, qcc, qcd); |
---|
161 | return (-quartetml(qca, qcb, qcc, qcd)); |
---|
162 | } |
---|
163 | |
---|
164 | /* optimization function YRparam - quartets */ |
---|
165 | double optyrq(double x) |
---|
166 | { |
---|
167 | if (x < MINYR) YRparam = MINYR; |
---|
168 | else if (x > MAXYR) YRparam = MAXYR; |
---|
169 | else YRparam = x; |
---|
170 | tranprobmat(); |
---|
171 | distupdate(qca, qcb, qcc, qcd); |
---|
172 | return (-quartetml(qca, qcb, qcc, qcd)); |
---|
173 | } |
---|
174 | |
---|
175 | /* estimate substitution process parameters - random quartets */ |
---|
176 | void optimseqevolparamsq() |
---|
177 | { |
---|
178 | double tsmeanold, yrmeanold; |
---|
179 | dvector tslist, yrlist; |
---|
180 | int fin; |
---|
181 | ivector taxon; |
---|
182 | uli minqts, maxqts, n; |
---|
183 | |
---|
184 | |
---|
185 | taxon = new_ivector(4); |
---|
186 | |
---|
187 | /* number of quartets to be investigated */ |
---|
188 | minqts = (uli) floor(0.25 * MINPERTAXUM * Maxspc) + 1; |
---|
189 | maxqts = (uli) floor(0.25 * MAXPERTAXUM * Maxspc) + 1; |
---|
190 | if (Maxspc == 4) { |
---|
191 | minqts = (uli) 1; |
---|
192 | maxqts = (uli) 1; |
---|
193 | } |
---|
194 | |
---|
195 | tslist = new_dvector(maxqts); |
---|
196 | yrlist = new_dvector(maxqts); |
---|
197 | |
---|
198 | /* initialize averages */ |
---|
199 | tsmean = TSparam; |
---|
200 | yrmean = YRparam; |
---|
201 | |
---|
202 | fin = FALSE; |
---|
203 | |
---|
204 | /* investigate maxqts random quartets */ |
---|
205 | for (n = 0; n < maxqts; n++) { |
---|
206 | |
---|
207 | /* choose random quartet */ |
---|
208 | chooser(Maxspc, 4, taxon); |
---|
209 | |
---|
210 | /* |
---|
211 | * optimize parameters on this quartet |
---|
212 | */ |
---|
213 | |
---|
214 | qca = taxon[0]; |
---|
215 | qcb = taxon[1]; |
---|
216 | qcc = taxon[2]; |
---|
217 | qcd = taxon[3]; |
---|
218 | |
---|
219 | /* initialize start values with average value */ |
---|
220 | if ((SH_optn || nuc_optn) && optim_optn && (data_optn == 0)) TSparam = tsmean; |
---|
221 | if ((nuc_optn && TN_optn) && optim_optn && (data_optn == 0)) YRparam = yrmean; |
---|
222 | |
---|
223 | /* estimation */ |
---|
224 | twodimenmin(PEPS1, |
---|
225 | (SH_optn || nuc_optn) && optim_optn && (data_optn == 0), |
---|
226 | MINTS, &TSparam, MAXTS, opttsq, &tserr, |
---|
227 | (nuc_optn && TN_optn) && optim_optn && (data_optn == 0), |
---|
228 | MINYR, &YRparam, MAXYR, optyrq, &yrerr); |
---|
229 | |
---|
230 | |
---|
231 | tsmeanold = tsmean; |
---|
232 | yrmeanold = yrmean; |
---|
233 | tslist[n] = TSparam; |
---|
234 | yrlist[n] = YRparam; |
---|
235 | computestat(tslist, n+1 , &tsmean, &tserr); |
---|
236 | computestat(yrlist, n+1 , &yrmean, &yrerr); |
---|
237 | |
---|
238 | /* check whether the means are converging */ |
---|
239 | if (n > minqts-2) { |
---|
240 | if ((fabs(tsmean-tsmeanold) < TSDIFF) && |
---|
241 | (fabs(yrmean-yrmeanold) < YRDIFF)) |
---|
242 | fin = TRUE; |
---|
243 | } |
---|
244 | |
---|
245 | /* investigate at least minqts quartets */ |
---|
246 | if (n > minqts-2 && (fin || n > maxqts-2)) break; |
---|
247 | } |
---|
248 | |
---|
249 | /* round estimated numbers to 2 digits after the decimal point */ |
---|
250 | if (tserr != 0.0) tsmean = floor(100.0*tsmean+0.5)/100.0; |
---|
251 | if (yrerr != 0.0) yrmean = floor(100.0*yrmean+0.5)/100.0; |
---|
252 | |
---|
253 | /* update ML engine */ |
---|
254 | TSparam = tsmean; |
---|
255 | YRparam = yrmean; |
---|
256 | tranprobmat(); |
---|
257 | |
---|
258 | free_ivector(taxon); |
---|
259 | } |
---|
260 | |
---|
261 | /* optimization function TSparam - tree */ |
---|
262 | double opttst(double x) |
---|
263 | { |
---|
264 | double result; |
---|
265 | |
---|
266 | if (x < MINTS) TSparam = MINTS; |
---|
267 | else if (x > MAXTS) TSparam = MAXTS; |
---|
268 | else TSparam = x; |
---|
269 | tranprobmat(); |
---|
270 | computedistan(); |
---|
271 | if (approxp_optn) result = usertree_alklhd(); |
---|
272 | else result = usertree_lklhd(); |
---|
273 | |
---|
274 | return (-result); |
---|
275 | } |
---|
276 | |
---|
277 | /* optimization function YRparam - tree */ |
---|
278 | double optyrt(double x) |
---|
279 | { |
---|
280 | double result; |
---|
281 | |
---|
282 | if (x < MINYR) YRparam = MINYR; |
---|
283 | else if (x > MAXYR) YRparam = MAXYR; |
---|
284 | else YRparam = x; |
---|
285 | tranprobmat(); |
---|
286 | computedistan(); |
---|
287 | if (approxp_optn) result = usertree_alklhd(); |
---|
288 | else result = usertree_lklhd(); |
---|
289 | |
---|
290 | return (-result); |
---|
291 | } |
---|
292 | |
---|
293 | |
---|
294 | /* optimize substitution process parameters - tree */ |
---|
295 | void optimseqevolparamst() |
---|
296 | { |
---|
297 | twodimenmin(PEPS1, |
---|
298 | (SH_optn || nuc_optn) && optim_optn && (data_optn == 0), |
---|
299 | MINTS, &TSparam, MAXTS, opttst, &tserr, |
---|
300 | (nuc_optn && TN_optn) && optim_optn && (data_optn == 0), |
---|
301 | MINYR, &YRparam, MAXYR, optyrt, &yrerr); |
---|
302 | } |
---|
303 | |
---|
304 | |
---|
305 | /* optimization function fracinv */ |
---|
306 | double optfi(double x) |
---|
307 | { |
---|
308 | double result; |
---|
309 | |
---|
310 | if (x < MINFI) fracinv = MINFI; |
---|
311 | else if (x > MAXFI) fracinv = MAXFI; |
---|
312 | else fracinv = x; |
---|
313 | |
---|
314 | computedistan(); |
---|
315 | if (approxp_optn) result = usertree_alklhd(); |
---|
316 | else result = usertree_lklhd(); |
---|
317 | |
---|
318 | return (-result); |
---|
319 | } |
---|
320 | |
---|
321 | |
---|
322 | /* optimization function Geta */ |
---|
323 | double optge(double x) |
---|
324 | { |
---|
325 | double result; |
---|
326 | |
---|
327 | if (x < MINGE) Geta = MINGE; |
---|
328 | else if (x > MAXGE) Geta = MAXGE; |
---|
329 | else Geta = x; |
---|
330 | |
---|
331 | updaterates(); |
---|
332 | |
---|
333 | computedistan(); |
---|
334 | if (approxp_optn) result = usertree_alklhd(); |
---|
335 | else result = usertree_lklhd(); |
---|
336 | |
---|
337 | return (-result); |
---|
338 | } |
---|
339 | |
---|
340 | |
---|
341 | /* optimize rate heterogeneity parameters */ |
---|
342 | void optimrateparams() |
---|
343 | { |
---|
344 | twodimenmin(PEPS2, |
---|
345 | fracinv_optim, |
---|
346 | MINFI, &fracinv, fracconst, optfi, &fierr, |
---|
347 | grate_optim, |
---|
348 | MINGE, &Geta, MAXGE, optge, &geerr); |
---|
349 | |
---|
350 | } |
---|