| 1 | /* |
|---|
| 2 | * ml3.c |
|---|
| 3 | * |
|---|
| 4 | * |
|---|
| 5 | * Part of TREE-PUZZLE 5.0 (June 2000) |
|---|
| 6 | * |
|---|
| 7 | * (c) 1999-2000 by Heiko A. Schmidt, Korbinian Strimmer, |
|---|
| 8 | * M. Vingron, and Arndt von Haeseler |
|---|
| 9 | * (c) 1995-1999 by Korbinian Strimmer and Arndt von Haeseler |
|---|
| 10 | * |
|---|
| 11 | * All parts of the source except where indicated are distributed under |
|---|
| 12 | * the GNU public licence. See http://www.opensource.org for details. |
|---|
| 13 | */ |
|---|
| 14 | |
|---|
| 15 | |
|---|
| 16 | #define EXTERN extern |
|---|
| 17 | |
|---|
| 18 | |
|---|
| 19 | /* prototypes */ |
|---|
| 20 | #include <stdio.h> |
|---|
| 21 | #include <stdlib.h> |
|---|
| 22 | #include <math.h> |
|---|
| 23 | #include "util.h" |
|---|
| 24 | #include "ml.h" |
|---|
| 25 | #include "gamma.h" |
|---|
| 26 | |
|---|
| 27 | |
|---|
| 28 | |
|---|
| 29 | /******************************************************************************/ |
|---|
| 30 | /* discrete Gamma-distribution and related stuff */ |
|---|
| 31 | /******************************************************************************/ |
|---|
| 32 | |
|---|
| 33 | /* compare general base frequencies with frequencies of taxon i with chi square */ |
|---|
| 34 | double homogentest(int taxon) |
|---|
| 35 | { |
|---|
| 36 | return chi2test(Freqtpm, Basecomp[taxon], gettpmradix(), &chi2fail); |
|---|
| 37 | } |
|---|
| 38 | |
|---|
| 39 | |
|---|
| 40 | /* discrete Gamma according to Yang 1994 (JME 39:306-314) */ |
|---|
| 41 | void YangDiscreteGamma (double shape, int c, dvector x) |
|---|
| 42 | { |
|---|
| 43 | double twoc, mu; |
|---|
| 44 | int i; |
|---|
| 45 | |
|---|
| 46 | twoc = 2.0*c; |
|---|
| 47 | mu = 0.0; |
|---|
| 48 | for (i = 0; i < c; i++) |
|---|
| 49 | { |
|---|
| 50 | /* corresponding rates */ |
|---|
| 51 | x[i] = icdfGamma ( (2.0*i+1.0)/twoc, shape); |
|---|
| 52 | mu += x[i]; |
|---|
| 53 | } |
|---|
| 54 | mu = mu/c; |
|---|
| 55 | |
|---|
| 56 | /* rescale for avarage rate of 1.0 */ |
|---|
| 57 | for (i = 0; i < c; i++) |
|---|
| 58 | { |
|---|
| 59 | x[i] /= mu; |
|---|
| 60 | } |
|---|
| 61 | } |
|---|
| 62 | |
|---|
| 63 | /* compute rates of each category when rates are Gamma-distributed */ |
|---|
| 64 | void updaterates() |
|---|
| 65 | { |
|---|
| 66 | int i; |
|---|
| 67 | double alpha; |
|---|
| 68 | |
|---|
| 69 | if (numcats == 1) |
|---|
| 70 | { |
|---|
| 71 | Rates[0] = 1.0; |
|---|
| 72 | return; |
|---|
| 73 | } |
|---|
| 74 | if (Geta == 0.0) |
|---|
| 75 | { |
|---|
| 76 | for (i = 0; i < numcats; i++) |
|---|
| 77 | Rates[i] = 1.0; |
|---|
| 78 | return; |
|---|
| 79 | } |
|---|
| 80 | alpha = (1.0 - Geta)/Geta; |
|---|
| 81 | |
|---|
| 82 | YangDiscreteGamma (alpha, numcats, Rates); |
|---|
| 83 | |
|---|
| 84 | /* if invariable sites are present */ |
|---|
| 85 | for (i = 0; i < numcats; i++) |
|---|
| 86 | Rates[i] = Rates[i]/(1.0-fracinv); |
|---|
| 87 | |
|---|
| 88 | /* check for very small rates */ |
|---|
| 89 | for (i = 0; i < numcats; i++) |
|---|
| 90 | if (Rates[i] < 0.000001) Rates[i] = 0.000001; |
|---|
| 91 | } |
|---|
| 92 | |
|---|
| 93 | |
|---|
| 94 | |
|---|
| 95 | /******************************************************************************/ |
|---|
| 96 | /* parameter estimation */ |
|---|
| 97 | /******************************************************************************/ |
|---|
| 98 | |
|---|
| 99 | /* compute sample mean and standard deviation of sample mean */ |
|---|
| 100 | void computestat(double *data, int n, double *mean, double *err) |
|---|
| 101 | { |
|---|
| 102 | int i; |
|---|
| 103 | double sum; |
|---|
| 104 | |
|---|
| 105 | sum = 0; |
|---|
| 106 | for (i = 0; i < n; i++) sum += data[i]; |
|---|
| 107 | (*mean) = sum/(double) n; |
|---|
| 108 | |
|---|
| 109 | sum = 0; |
|---|
| 110 | for (i = 0; i < n; i++) sum += (data[i] - (*mean))*(data[i] - (*mean)); |
|---|
| 111 | if (n != 1) |
|---|
| 112 | (*err) = sqrt(sum)/sqrt((double)(n-1)*n); /* unbiased estimator */ |
|---|
| 113 | else |
|---|
| 114 | (*err) = 0.0; /* if n == 1 */ |
|---|
| 115 | } |
|---|
| 116 | |
|---|
| 117 | /* compute ML value of quartet (a,b,c,d) */ |
|---|
| 118 | double quartetml(int a, int b, int c, int d) |
|---|
| 119 | { |
|---|
| 120 | double d1, d2, d3; |
|---|
| 121 | |
|---|
| 122 | /* compute ML for all topologies */ |
|---|
| 123 | if (approxp_optn) { /* approximate parameter mode */ |
|---|
| 124 | d1 = quartet_alklhd(a,b,c,d); /* (a,b)-(c,d) */ |
|---|
| 125 | d2 = quartet_alklhd(a,c,b,d); /* (a,c)-(b,d) */ |
|---|
| 126 | d3 = quartet_alklhd(a,d,b,c); /* (a,d)-(b,c) */ |
|---|
| 127 | } else { |
|---|
| 128 | d1 = quartet_lklhd(a,b,c,d); /* (a,b)-(c,d) */ |
|---|
| 129 | d2 = quartet_lklhd(a,c,b,d); /* (a,c)-(b,d) */ |
|---|
| 130 | d3 = quartet_lklhd(a,d,b,c); /* (a,d)-(b,c) */ |
|---|
| 131 | } |
|---|
| 132 | |
|---|
| 133 | /* looking for max(d1, d2, d3) */ |
|---|
| 134 | if (d1 < d2) { /* d2 > d1 */ |
|---|
| 135 | if (d2 < d3) { /* d3 > d2 > d1 */ |
|---|
| 136 | /* d3 maximum */ |
|---|
| 137 | return d3; |
|---|
| 138 | } else { /* d2 >= d3 > d1 */ |
|---|
| 139 | /* d2 maximum */ |
|---|
| 140 | return d2; |
|---|
| 141 | } |
|---|
| 142 | } else { /* d1 >= d2 */ |
|---|
| 143 | if (d1 < d3) { /* d3 > d1 >= d2 */ |
|---|
| 144 | /* d3 maximum */ |
|---|
| 145 | return d3; |
|---|
| 146 | } else { /* d1 >= d2 && d1 >= d3 */ |
|---|
| 147 | /* d1 maximum */ |
|---|
| 148 | return d1; |
|---|
| 149 | } |
|---|
| 150 | } |
|---|
| 151 | } |
|---|
| 152 | |
|---|
| 153 | /* optimization function TSparam - quartets */ |
|---|
| 154 | double opttsq(double x) |
|---|
| 155 | { |
|---|
| 156 | if (x < MINTS) TSparam = MINTS; |
|---|
| 157 | else if (x > MAXTS) TSparam = MAXTS; |
|---|
| 158 | else TSparam = x; |
|---|
| 159 | tranprobmat(); |
|---|
| 160 | distupdate(qca, qcb, qcc, qcd); |
|---|
| 161 | return (-quartetml(qca, qcb, qcc, qcd)); |
|---|
| 162 | } |
|---|
| 163 | |
|---|
| 164 | /* optimization function YRparam - quartets */ |
|---|
| 165 | double optyrq(double x) |
|---|
| 166 | { |
|---|
| 167 | if (x < MINYR) YRparam = MINYR; |
|---|
| 168 | else if (x > MAXYR) YRparam = MAXYR; |
|---|
| 169 | else YRparam = x; |
|---|
| 170 | tranprobmat(); |
|---|
| 171 | distupdate(qca, qcb, qcc, qcd); |
|---|
| 172 | return (-quartetml(qca, qcb, qcc, qcd)); |
|---|
| 173 | } |
|---|
| 174 | |
|---|
| 175 | /* estimate substitution process parameters - random quartets */ |
|---|
| 176 | void optimseqevolparamsq() |
|---|
| 177 | { |
|---|
| 178 | double tsmeanold, yrmeanold; |
|---|
| 179 | dvector tslist, yrlist; |
|---|
| 180 | int fin; |
|---|
| 181 | ivector taxon; |
|---|
| 182 | uli minqts, maxqts, n; |
|---|
| 183 | |
|---|
| 184 | |
|---|
| 185 | taxon = new_ivector(4); |
|---|
| 186 | |
|---|
| 187 | /* number of quartets to be investigated */ |
|---|
| 188 | minqts = (uli) floor(0.25 * MINPERTAXUM * Maxspc) + 1; |
|---|
| 189 | maxqts = (uli) floor(0.25 * MAXPERTAXUM * Maxspc) + 1; |
|---|
| 190 | if (Maxspc == 4) { |
|---|
| 191 | minqts = (uli) 1; |
|---|
| 192 | maxqts = (uli) 1; |
|---|
| 193 | } |
|---|
| 194 | |
|---|
| 195 | tslist = new_dvector(maxqts); |
|---|
| 196 | yrlist = new_dvector(maxqts); |
|---|
| 197 | |
|---|
| 198 | /* initialize averages */ |
|---|
| 199 | tsmean = TSparam; |
|---|
| 200 | yrmean = YRparam; |
|---|
| 201 | |
|---|
| 202 | fin = FALSE; |
|---|
| 203 | |
|---|
| 204 | /* investigate maxqts random quartets */ |
|---|
| 205 | for (n = 0; n < maxqts; n++) { |
|---|
| 206 | |
|---|
| 207 | /* choose random quartet */ |
|---|
| 208 | chooser(Maxspc, 4, taxon); |
|---|
| 209 | |
|---|
| 210 | /* |
|---|
| 211 | * optimize parameters on this quartet |
|---|
| 212 | */ |
|---|
| 213 | |
|---|
| 214 | qca = taxon[0]; |
|---|
| 215 | qcb = taxon[1]; |
|---|
| 216 | qcc = taxon[2]; |
|---|
| 217 | qcd = taxon[3]; |
|---|
| 218 | |
|---|
| 219 | /* initialize start values with average value */ |
|---|
| 220 | if ((SH_optn || nuc_optn) && optim_optn && (data_optn == 0)) TSparam = tsmean; |
|---|
| 221 | if ((nuc_optn && TN_optn) && optim_optn && (data_optn == 0)) YRparam = yrmean; |
|---|
| 222 | |
|---|
| 223 | /* estimation */ |
|---|
| 224 | twodimenmin(PEPS1, |
|---|
| 225 | (SH_optn || nuc_optn) && optim_optn && (data_optn == 0), |
|---|
| 226 | MINTS, &TSparam, MAXTS, opttsq, &tserr, |
|---|
| 227 | (nuc_optn && TN_optn) && optim_optn && (data_optn == 0), |
|---|
| 228 | MINYR, &YRparam, MAXYR, optyrq, &yrerr); |
|---|
| 229 | |
|---|
| 230 | |
|---|
| 231 | tsmeanold = tsmean; |
|---|
| 232 | yrmeanold = yrmean; |
|---|
| 233 | tslist[n] = TSparam; |
|---|
| 234 | yrlist[n] = YRparam; |
|---|
| 235 | computestat(tslist, n+1 , &tsmean, &tserr); |
|---|
| 236 | computestat(yrlist, n+1 , &yrmean, &yrerr); |
|---|
| 237 | |
|---|
| 238 | /* check whether the means are converging */ |
|---|
| 239 | if (n > minqts-2) { |
|---|
| 240 | if ((fabs(tsmean-tsmeanold) < TSDIFF) && |
|---|
| 241 | (fabs(yrmean-yrmeanold) < YRDIFF)) |
|---|
| 242 | fin = TRUE; |
|---|
| 243 | } |
|---|
| 244 | |
|---|
| 245 | /* investigate at least minqts quartets */ |
|---|
| 246 | if (n > minqts-2 && (fin || n > maxqts-2)) break; |
|---|
| 247 | } |
|---|
| 248 | |
|---|
| 249 | /* round estimated numbers to 2 digits after the decimal point */ |
|---|
| 250 | if (tserr != 0.0) tsmean = floor(100.0*tsmean+0.5)/100.0; |
|---|
| 251 | if (yrerr != 0.0) yrmean = floor(100.0*yrmean+0.5)/100.0; |
|---|
| 252 | |
|---|
| 253 | /* update ML engine */ |
|---|
| 254 | TSparam = tsmean; |
|---|
| 255 | YRparam = yrmean; |
|---|
| 256 | tranprobmat(); |
|---|
| 257 | |
|---|
| 258 | free_ivector(taxon); |
|---|
| 259 | } |
|---|
| 260 | |
|---|
| 261 | /* optimization function TSparam - tree */ |
|---|
| 262 | double opttst(double x) |
|---|
| 263 | { |
|---|
| 264 | double result; |
|---|
| 265 | |
|---|
| 266 | if (x < MINTS) TSparam = MINTS; |
|---|
| 267 | else if (x > MAXTS) TSparam = MAXTS; |
|---|
| 268 | else TSparam = x; |
|---|
| 269 | tranprobmat(); |
|---|
| 270 | computedistan(); |
|---|
| 271 | if (approxp_optn) result = usertree_alklhd(); |
|---|
| 272 | else result = usertree_lklhd(); |
|---|
| 273 | |
|---|
| 274 | return (-result); |
|---|
| 275 | } |
|---|
| 276 | |
|---|
| 277 | /* optimization function YRparam - tree */ |
|---|
| 278 | double optyrt(double x) |
|---|
| 279 | { |
|---|
| 280 | double result; |
|---|
| 281 | |
|---|
| 282 | if (x < MINYR) YRparam = MINYR; |
|---|
| 283 | else if (x > MAXYR) YRparam = MAXYR; |
|---|
| 284 | else YRparam = x; |
|---|
| 285 | tranprobmat(); |
|---|
| 286 | computedistan(); |
|---|
| 287 | if (approxp_optn) result = usertree_alklhd(); |
|---|
| 288 | else result = usertree_lklhd(); |
|---|
| 289 | |
|---|
| 290 | return (-result); |
|---|
| 291 | } |
|---|
| 292 | |
|---|
| 293 | |
|---|
| 294 | /* optimize substitution process parameters - tree */ |
|---|
| 295 | void optimseqevolparamst() |
|---|
| 296 | { |
|---|
| 297 | twodimenmin(PEPS1, |
|---|
| 298 | (SH_optn || nuc_optn) && optim_optn && (data_optn == 0), |
|---|
| 299 | MINTS, &TSparam, MAXTS, opttst, &tserr, |
|---|
| 300 | (nuc_optn && TN_optn) && optim_optn && (data_optn == 0), |
|---|
| 301 | MINYR, &YRparam, MAXYR, optyrt, &yrerr); |
|---|
| 302 | } |
|---|
| 303 | |
|---|
| 304 | |
|---|
| 305 | /* optimization function fracinv */ |
|---|
| 306 | double optfi(double x) |
|---|
| 307 | { |
|---|
| 308 | double result; |
|---|
| 309 | |
|---|
| 310 | if (x < MINFI) fracinv = MINFI; |
|---|
| 311 | else if (x > MAXFI) fracinv = MAXFI; |
|---|
| 312 | else fracinv = x; |
|---|
| 313 | |
|---|
| 314 | computedistan(); |
|---|
| 315 | if (approxp_optn) result = usertree_alklhd(); |
|---|
| 316 | else result = usertree_lklhd(); |
|---|
| 317 | |
|---|
| 318 | return (-result); |
|---|
| 319 | } |
|---|
| 320 | |
|---|
| 321 | |
|---|
| 322 | /* optimization function Geta */ |
|---|
| 323 | double optge(double x) |
|---|
| 324 | { |
|---|
| 325 | double result; |
|---|
| 326 | |
|---|
| 327 | if (x < MINGE) Geta = MINGE; |
|---|
| 328 | else if (x > MAXGE) Geta = MAXGE; |
|---|
| 329 | else Geta = x; |
|---|
| 330 | |
|---|
| 331 | updaterates(); |
|---|
| 332 | |
|---|
| 333 | computedistan(); |
|---|
| 334 | if (approxp_optn) result = usertree_alklhd(); |
|---|
| 335 | else result = usertree_lklhd(); |
|---|
| 336 | |
|---|
| 337 | return (-result); |
|---|
| 338 | } |
|---|
| 339 | |
|---|
| 340 | |
|---|
| 341 | /* optimize rate heterogeneity parameters */ |
|---|
| 342 | void optimrateparams() |
|---|
| 343 | { |
|---|
| 344 | twodimenmin(PEPS2, |
|---|
| 345 | fracinv_optim, |
|---|
| 346 | MINFI, &fracinv, fracconst, optfi, &fierr, |
|---|
| 347 | grate_optim, |
|---|
| 348 | MINGE, &Geta, MAXGE, optge, &geerr); |
|---|
| 349 | |
|---|
| 350 | } |
|---|