/*  RAxML-VI-HPC (version 2.2) a program for sequential and parallel estimation of phylogenetic trees
 *  Copyright August 2006 by Alexandros Stamatakis
 *
 *  Partially derived from
 *  fastDNAml, a program for estimation of phylogenetic trees from sequences by Gary J. Olsen
 *
 *  and
 *
 *  Programs of the PHYLIP package by Joe Felsenstein.
 *
 *  This program is free software; you may redistribute it and/or modify it
 *  under the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  This program is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 *  for more details.
 *
 *
 *  For any other enquiries send an Email to Alexandros Stamatakis
 *  Alexandros.Stamatakis@epfl.ch
 *
 *  When publishing work that is based on the results from RAxML-VI-HPC please cite:
 *
 *  Alexandros Stamatakis:"RAxML-VI-HPC: maximum likelihood-based phylogenetic analyses with thousands
 *  of taxa and mixed models".
 *  Bioinformatics 2006; doi: 10.1093/bioinformatics/btl446
 */

#ifndef WIN32
#include <unistd.h>
#endif

#include <math.h>
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <assert.h>
#include "axml.h"

extern char run_id[128];
extern char workdir[1024];
extern double masterTime;

/*
  the function below is standard RAxML code: it inserts the node r (carrying the
  pruned tip s = r->back) into the branch between q and q->back, locally
  re-optimizes the adjacent branch lengths and returns the log likelihood
*/

static double testInsertThorough(tree *tr, nodeptr r, nodeptr q, boolean useVector)
{
  double
    result,
    qz[NUM_BRANCHES],
    z[NUM_BRANCHES];

  nodeptr
    x = q->back,
    s = r->back;

  int
    j;

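  /*
    note: the square root below splits the branch between q and q->back into two
    halves; RAxML stores a branch length t as a z-value (roughly
    z = exp(-t / fracchange)), so half the length corresponds to sqrt(z);
    the result is clamped to the admissible range [zmin, zmax]
  */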
  for(j = 0; j < tr->numBranches; j++)
    {
      qz[j] = q->z[j];
      z[j] = sqrt(qz[j]);

      if(z[j] < zmin)
        z[j] = zmin;

      if(z[j] > zmax)
        z[j] = zmax;
    }

  hookup(r->next, q, z, tr->numBranches);
  hookup(r->next->next, x, z, tr->numBranches);
  hookupDefault(r, s, tr->numBranches);
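  /*
    r->next and r->next->next now split the branch between q and x = q->back,
    while the pruned subtree at s is re-attached to r with a default branch
    length before the local branch length re-optimization below
  */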

  newviewGeneric(tr, r);

  localSmooth(tr, r, smoothings);

  if(useVector)
    result = evaluateGenericVector(tr, r);
  else
    result = evaluateGeneric(tr, r);

  hookup(q, x, qz, tr->numBranches);

  r->next->next->back = r->next->back = (nodeptr) NULL;

  return result;
}


/*
  structure to store likelihood and insertion node
  for each window position
*/

typedef struct
{
  double lh;
  nodeptr p;
} positionData;
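/*
  note: pd[] is indexed by the starting site of a sliding window, i.e., pd[i]
  refers to the window covering sites [i, i + windowSize - 1] and keeps track
  of the best-scoring (highest window log likelihood) insertion branch found
  so far for that window
*/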


/*
  traverse the tree recursively and insert the taxon into every branch,
  i.e., into each candidate position
*/

static void traverseBias(nodeptr p, nodeptr q, tree *tr, int *branchCounter, positionData *pd, int windowSize)
{
  double
    sum = 0.0,
    result = 0.0;

  int
    i;


  /*
    compute the likelihood of inserting the tip attached to
    p between q and q->back.

    With useVector == TRUE, testInsertThorough() also computes the per-site
    log likelihoods, which are stored in the array tr->perSiteLL[]
  */

  result = testInsertThorough(tr, p, q, TRUE);


  /*
    the check below can be removed at some point, it just serves as a sanity
    check: the per-site log likelihoods must add up to the overall log likelihood
  */

  for(i = 0; i < tr->cdta->endsite; i++)
    sum += tr->perSiteLL[i];

  assert(ABS(sum - result) < 0.001);

  /*************************************/

  /*
    for each window position just compute the likelihood over
    the window.

    If it is better than the current best one, store the likelihood
    and the insertion node in the data structure
  */

  for(i = 0; i < tr->cdta->endsite - windowSize; i++)
    {
      int
        j;

      for(j = i, sum = 0.0; j < i + windowSize; j++)
        sum += tr->perSiteLL[j];

      if(sum > pd[i].lh)
        {
          pd[i].lh = sum;
          pd[i].p = q;
        }
    }

  *branchCounter = *branchCounter + 1;


  /* and here comes the recursion */

  if(!isTip(q->number, tr->rdta->numsp))
    {
      traverseBias(p, q->next->back, tr, branchCounter, pd, windowSize);
      traverseBias(p, q->next->next->back, tr, branchCounter, pd, windowSize);
    }
}


/*
  functions to compute the node distances between inferred and true
  placement positions. I think that they are correct.
*/

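/*
  note on the recursion: findRec() searches the subtree below ref for the node
  best at which the taxon was placed; it returns the recursion depth at which
  best (or best->back) is encountered, and 0 if best does not lie in this
  subtree, so at most one of the two recursive calls can return a non-zero
  value (see the asserts below)
*/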
static int findRec(nodeptr ref, nodeptr best, int mxtips, int value)
{
  if(isTip(ref->number, mxtips))
    {
      if(ref == best || ref->back == best)
        return value;
      else
        return 0;
    }
  else
    {
      int
        d1,
        d2;

      if(ref == best || ref->back == best)
        return value;

      d1 = findRec(ref->next->back, best, mxtips, value + 1);
      d2 = findRec(ref->next->next->back, best, mxtips, value + 1);

      assert((d1 > 0 && d2 == 0) || (d2 > 0 && d1 == 0) || (d1 == 0 && d2 == 0));

      return (d1 + d2);
    }
}

static double getNodeDistance(nodeptr ref, nodeptr best, int mxtips)
{
  int
    d1 = findRec(ref, best, mxtips, 0),
    d2 = findRec(ref->back, best, mxtips, 0);

  assert((d1 > 0 && d2 == 0) || (d2 > 0 && d1 == 0) || (d1 == 0 && d2 == 0));

  return ((double)(d1 + d2));
}

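/*
  overview: for every taxon we prune it from the comprehensive tree, re-insert
  it into every branch of the remaining tree, and for every sliding window
  starting position record the insertion branch with the best window log
  likelihood. The node distance (ND) between that branch and the original
  placement is then accumulated per alignment site and finally averaged over
  all taxa and written to the output file RAxML_SiteSpecificPlacementBias.RUN_ID
*/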
void computePlacementBias(tree *tr, analdef *adef)
{
  int
    windowSize = adef->slidingWindowSize,
    k,
    i,
    tips,
    numTraversalBranches = (2 * (tr->mxtips - 1)) - 3; /* compute number of branches into which we need to insert once we have removed a taxon */
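  /*
    note: an unrooted binary tree with n tips has 2n - 3 branches, so after
    pruning one taxon the remaining tree with (tr->mxtips - 1) tips offers
    2 * (tr->mxtips - 1) - 3 candidate insertion branches
  */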

  char
    fileName[1024];

  FILE
    *outFile;

  /* data for each sliding window starting position */

  positionData
    *pd = (positionData *)rax_malloc(sizeof(positionData) * (tr->cdta->endsite - windowSize));

  double
    *nodeDistances = (double*)rax_calloc(tr->cdta->endsite - windowSize, sizeof(double)), /* array to store node distances (ND) for every sliding window position */
    *distances = (double*)rax_calloc(tr->cdta->endsite, sizeof(double)); /* array to store average distances for every site */

  strcpy(fileName, workdir);
  strcat(fileName, "RAxML_SiteSpecificPlacementBias.");
  strcat(fileName, run_id);

  outFile = myfopen(fileName, "w");

  printBothOpen("Likelihood of comprehensive tree %f\n\n", tr->likelihood);

  if(windowSize > tr->cdta->endsite)
    {
      printBothOpen("The size of your sliding window is %d while the number of sites in the alignment is %d\n\n", windowSize, tr->cdta->endsite);
      exit(-1);
    }

  if(windowSize >= (int)(0.9 * tr->cdta->endsite))
    printBothOpen("WARNING: your sliding window of size %d is only slightly smaller than your alignment that has %d sites\n\n", windowSize, tr->cdta->endsite);

  printBothOpen("Sliding window size: %d\n\n", windowSize);

  /* prune and re-insert one tip at a time into all branches of the remaining tree */

  for(tips = 1; tips <= tr->mxtips; tips++)
    {
      nodeptr
        myStart,
        p = tr->nodep[tips]->back, /* this is the node at which we are pruning */
        p1 = p->next->back,
        p2 = p->next->next->back;

      double
        pz[NUM_BRANCHES],
        p1z[NUM_BRANCHES],
        p2z[NUM_BRANCHES];

      int
        branchCounter = 0;

      /* reset array values for this tip */

      for(i = 0; i < tr->cdta->endsite - windowSize; i++)
        {
          pd[i].lh = unlikely;
          pd[i].p = (nodeptr)NULL;
        }

      /* store the three branch lengths adjacent to the position at which we prune */

      for(i = 0; i < tr->numBranches; i++)
        {
          p1z[i] = p1->z[i];
          p2z[i] = p2->z[i];
          pz[i] = p->z[i];
        }

      /* prune the taxon, optimizing the branch between p1 and p2 */

      removeNodeBIG(tr, p, tr->numBranches);

      printBothOpen("Pruning taxon Number %d [%s]\n", tips, tr->nameList[tips]);

      /* find any tip to start traversing the tree */

      myStart = findAnyTip(p1, tr->mxtips);

      /* insert taxon, compute likelihood and remove taxon again from all branches */

      traverseBias(p, myStart->back, tr, &branchCounter, pd, windowSize);

      assert(branchCounter == numTraversalBranches);

      /* for every sliding window position calc ND to the true/correct position at p */

      for(i = 0; i < tr->cdta->endsite - windowSize; i++)
        nodeDistances[i] = getNodeDistance(p1, pd[i].p, tr->mxtips);
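      /* a node distance of 0 means that the best-scoring window placement is the
         original branch between p1 and p2 from which the taxon was pruned */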

      /* now analyze */

      for(i = 0; i < tr->cdta->endsite; i++)
        {
          double
            d = 0.0;

          int
            s = 0;

          /*
            check the site position, i.e., do we have windowSize data points available
            or fewer because we are at the start or the end of the alignment
          */

          /*
            for each site just accumulate the node distances we have for all
            sliding windows that passed over this site
          */

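          /*
            small worked example for illustration: with windowSize = 3 and
            endsite = 10 the window starting positions are 0..6; site i = 1 is
            covered by the windows starting at 0 and 1, so s = 2, whereas a site
            in the middle, e.g. i = 5, is covered by the windows starting at
            3, 4 and 5, so s = windowSize
          */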
          if(i < windowSize)
            {
              for(k = 0; k <= i; k++, s++)
                d += nodeDistances[k];
            }
          else
            {
              if(i < tr->cdta->endsite - windowSize)
                {
                  for(k = i - windowSize + 1; k <= i; k++, s++)
                    d += nodeDistances[k];
                }
              else
                {
                  for(k = i - windowSize; k < (tr->cdta->endsite - windowSize); k++, s++)
                    d += nodeDistances[k];
                }
            }


          /*
            now just divide the accumulated ND by the number of distances we have
            for this position and then add it to the accumulated distances over all taxa.

            I just realized that in the version on which I did the tests
            I sent to Simon I used

            distances[i] = d / ((double)s);

            instead of

            distances[i] += d / ((double)s);

            damn it
          */

          distances[i] += (d / ((double)s));
        }


      /*
        re-connect taxon to its original position
      */

      hookup(p->next, p1, p1z, tr->numBranches);
      hookup(p->next->next, p2, p2z, tr->numBranches);
      hookup(p, p->back, pz, tr->numBranches);

      /*
        fix likelihood vectors
      */

      newviewGeneric(tr, p);
    }

  /*
    now just compute the average ND over all taxa
  */

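  /*
    note: the output file contains one line per alignment site in the format
    "site_index average_node_distance", where the average is taken over all
    tr->mxtips pruned taxa
  */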
  for(i = 0; i < tr->cdta->endsite; i++)
    {
      double
        avg = distances[i] / ((double)tr->mxtips);

      fprintf(outFile, "%d %f\n", i, avg);
    }

  printBothOpen("\nTime for EPA-based site-specific placement bias calculation: %f\n", gettime() - masterTime);
  printBothOpen("Site-specific placement bias statistics written to file %s\n", fileName);

  fclose(outFile);

  exit(0);
}
---|