source: trunk/GDE/RAxML/rogueEPA.c

Last change on this file was 13845, checked in by westram, 10 years ago
  • remove executable flag
File size: 10.2 KB
Line 
1/*  RAxML-VI-HPC (version 2.2) a program for sequential and parallel estimation of phylogenetic trees
2 *  Copyright August 2006 by Alexandros Stamatakis
3 *
4 *  Partially derived from
5 *  fastDNAml, a program for estimation of phylogenetic trees from sequences by Gary J. Olsen
6 * 
7 *  and
8 *
9 *  Programs of the PHYLIP package by Joe Felsenstein.
10 *
11 *  This program is free software; you may redistribute it and/or modify its
12 *  under the terms of the GNU General Public License as published by the Free
13 *  Software Foundation; either version 2 of the License, or (at your option)
14 *  any later version.
15 *
16 *  This program is distributed in the hope that it will be useful, but
17 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
18 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
19 *  for more details.
20 *
21 *
22 *  For any other enquiries send an Email to Alexandros Stamatakis
23 *  Alexandros.Stamatakis@epfl.ch
24 *
25 *  When publishing work that is based on the results from RAxML-VI-HPC please cite:
26 *
27 *  Alexandros Stamatakis:"RAxML-VI-HPC: maximum likelihood-based phylogenetic analyses with thousands
28 *  of taxa and mixed models".
29 *  Bioinformatics 2006; doi: 10.1093/bioinformatics/btl446
30 */
31
32#ifndef WIN32
33#include <unistd.h>
34#endif
35
36#include <math.h>
37#include <time.h>
38#include <stdlib.h>
39#include <stdio.h>
40#include <ctype.h>
41#include <string.h>
42#include "axml.h"
43
44extern char run_id[128];
45extern char workdir[1024];
46extern double masterTime;
47
48/*
49  function below not very interesting, standard RAxML stuff
50*/
51
52static double testInsertThorough(tree *tr, nodeptr r, nodeptr q, boolean useVector)
53{
54  double 
55    result,           
56    qz[NUM_BRANCHES],
57    z[NUM_BRANCHES];
58 
59  nodeptr 
60    x = q->back,
61    s = r->back;
62 
63  int     
64    j;   
65
66  for(j = 0; j < tr->numBranches; j++)   
67    {
68      qz[j] = q->z[j];
69      z[j] = sqrt(qz[j]); 
70
71      if(z[j] < zmin) 
72        z[j] = zmin;
73     
74      if(z[j] > zmax)
75        z[j] = zmax;
76    }                                     
77   
78  hookup(r->next,       q, z, tr->numBranches);
79  hookup(r->next->next, x, z, tr->numBranches);
80  hookupDefault(r, s, tr->numBranches);                     
81   
82  newviewGeneric(tr, r);             
83   
84  localSmooth(tr, r, smoothings);
85         
86  if(useVector)
87    result = evaluateGenericVector(tr, r);
88  else
89    result = evaluateGeneric(tr, r);                               
90
91  hookup(q, x, qz, tr->numBranches);
92     
93  r->next->next->back = r->next->back = (nodeptr) NULL; 
94
95  return result;
96}
97
98
99/*
100  structure to store likelihood and insertion node
101  for each window position
102*/
103
104typedef struct 
105{
106  double lh;
107  nodeptr p;
108} positionData;
109
110
111/*
112  traverse the tree recursively and insert taxon into each position
113*/
114
115static void traverseBias(nodeptr p, nodeptr q, tree *tr, int *branchCounter, positionData *pd, int windowSize)
116{
117  double 
118    sum = 0.0,
119    result = 0.0;
120
121  int 
122    i;
123
124
125  /*
126     compute the likelihood of inserting the tip attached to
127     p between q and q->back
128
129     Actually in testInsertThorough() we compute the per site
130     log likes which are then stored in an array tr->perSiteLL[i]
131  */
132 
133  result = testInsertThorough(tr, p, q, TRUE); 
134
135
136  /*
137     stuff below can be removed at some point
138     it just makes me feel better
139  */
140
141  for(i = 0; i < tr->cdta->endsite; i++)
142    sum += tr->perSiteLL[i];
143
144  assert(ABS(sum - result) < 0.001);
145
146  /*************************************/
147
148  /*
149     for each window position just compute the likelihood over
150     the window.
151
152     If its is better than the current best one, store the likelihood
153     and the insertion node in the data structure
154  */
155
156  for(i = 0; i < tr->cdta->endsite - windowSize; i++)
157    {     
158      int 
159        j;
160
161      for(j = i, sum = 0.0; j < i + windowSize; j++)
162        sum += tr->perSiteLL[j];     
163
164      if(sum > pd[i].lh)
165        {     
166          pd[i].lh = sum;
167          pd[i].p = q;
168        }               
169    }
170
171  *branchCounter = *branchCounter + 1;
172 
173
174  /* and here comes the recursion */
175 
176  if(!isTip(q->number, tr->rdta->numsp))
177    {   
178      traverseBias(p, q->next->back,       tr, branchCounter, pd, windowSize);
179      traverseBias(p, q->next->next->back, tr, branchCounter, pd, windowSize);   
180    }   
181}
182
183
184/*
185  functions to compute the node distances between inferred and true
186  placement positions. I think that they are correct.
187*/
188
189static int findRec(nodeptr ref, nodeptr best, int mxtips, int value)
190{
191  if(isTip(ref->number, mxtips))
192    {
193      if(ref == best || ref->back == best)
194        return value;
195      else
196        return 0;
197    }
198  else
199    {
200      int 
201        d1,
202        d2;
203
204      if(ref == best || ref->back == best)
205        return value;
206
207      d1 = findRec(ref->next->back, best, mxtips, value + 1);
208      d2 = findRec(ref->next->next->back, best, mxtips, value + 1);
209     
210      assert((d1 > 0 && d2 == 0) || (d2 > 0 && d1 == 0) || (d1 == 0 && d2 == 0)); 
211     
212      return (d1 + d2);
213    }
214}
215
216static double getNodeDistance(nodeptr ref, nodeptr best, int mxtips)
217{ 
218  int 
219    d1 = findRec(ref, best, mxtips, 0),
220    d2 = findRec(ref->back, best, mxtips, 0);
221
222  assert((d1 > 0 && d2 == 0) || (d2 > 0 && d1 == 0) || (d1 == 0 && d2 == 0));   
223
224  return ((double)(d1 + d2));
225}
226
227void computePlacementBias(tree *tr, analdef *adef)
228{
229  int 
230    windowSize = adef->slidingWindowSize,
231    k,   
232    i, 
233    tips,
234    numTraversalBranches = (2 * (tr->mxtips - 1)) - 3; /* compute number of branches into which we need to insert once we have removed a taxon */   
235   
236  char 
237    fileName[1024];
238
239  FILE 
240    *outFile;
241
242  /* data for each sliding window starting position */
243
244  positionData
245    *pd = (positionData *)rax_malloc(sizeof(positionData) * (tr->cdta->endsite - windowSize));
246
247  double 
248    *nodeDistances = (double*)rax_calloc(tr->cdta->endsite - windowSize, sizeof(double)), /* array to store node distnces ND for every sliding window position */
249    *distances = (double*)rax_calloc(tr->cdta->endsite, sizeof(double)); /* array to store avg distances for every site */
250
251  strcpy(fileName,         workdir);
252  strcat(fileName,         "RAxML_SiteSpecificPlacementBias.");
253  strcat(fileName,         run_id);
254
255  outFile = myfopen(fileName, "w");
256
257  printBothOpen("Likelihood of comprehensive tree %f\n\n", tr->likelihood);
258
259  if(windowSize > tr->cdta->endsite)
260    {
261      printBothOpen("The size of your sliding window is %d while the number of sites in the alignment is %d\n\n", windowSize, tr->cdta->endsite);
262      exit(-1);
263    }
264
265  if(windowSize >=  (int)(0.9 * tr->cdta->endsite))   
266    printBothOpen("WARNING: your sliding window of size %d is only slightly smaller than you alignment that has %d sites\n\n",  windowSize, tr->cdta->endsite);
267
268  printBothOpen("Sliding window size: %d\n\n", windowSize);
269
270  /* prune and re-insert on tip at a time into all branches of the remaining tree */
271
272  for(tips = 1; tips <= tr->mxtips; tips++)   
273    {
274      nodeptr
275        myStart,
276        p = tr->nodep[tips]->back, /* this is the node at which we are prunung */
277        p1 =  p->next->back,
278        p2 =  p->next->next->back;
279     
280      double
281        pz[NUM_BRANCHES],
282        p1z[NUM_BRANCHES], 
283        p2z[NUM_BRANCHES];
284
285      int 
286        branchCounter = 0;
287     
288      /* reset array values for this tip */
289
290      for(i = 0; i < tr->cdta->endsite; i++)
291        {
292          pd[i].lh = unlikely;
293          pd[i].p = (nodeptr)NULL;
294        }
295
296      /* store the three branch lengths adjacent to the position at which we prune */
297
298      for(i = 0; i < tr->numBranches; i++)
299        {
300          p1z[i] = p1->z[i];
301          p2z[i] = p2->z[i];               
302          pz[i] = p->z[i];
303        }           
304
305      /* prune the taxon, optimizing the branch between p1 and p2 */
306
307      removeNodeBIG(tr, p,  tr->numBranches); 
308
309      printBothOpen("Pruning taxon Number %d [%s]\n", tips, tr->nameList[tips]);
310     
311      /* find any tip to start traversing the tree */
312
313      myStart = findAnyTip(p1, tr->mxtips);     
314
315      /* insert taxon, compute likelihood and remove taxon again from all branches */
316
317      traverseBias(p, myStart->back, tr, &branchCounter, pd, windowSize);     
318
319      assert(branchCounter == numTraversalBranches);                   
320
321      /* for every sliding window position calc ND to the true/correct position at p */
322
323      for(i = 0; i < tr->cdta->endsite - windowSize; i++)       
324        nodeDistances[i] = getNodeDistance(p1, pd[i].p, tr->mxtips);     
325
326      /* now analyze */
327
328      for(i = 0; i < tr->cdta->endsite; i++)
329        {
330          double 
331            d = 0.0;
332
333          int 
334            s = 0;               
335         
336          /*
337             check site position, i.e., doe we have windowSize data points available
338             or fewer because we are at the start or the end of the alignment
339          */
340
341          /*
342            for each site just accumulate the node distances we have for all
343            sliding windows that passed over this site
344          */
345         
346          if(i < windowSize)
347            {
348              for(k = 0; k <= i; k++, s++)                     
349                d += nodeDistances[k];                         
350            }
351          else
352            {
353              if(i < tr->cdta->endsite - windowSize)
354                {
355                  for(k = i - windowSize + 1; k <= i; k++, s++)                     
356                    d += nodeDistances[k];                                 
357                }           
358              else
359                {                                 
360                  for(k = i - windowSize; k < (tr->cdta->endsite - windowSize); k++, s++)                                     
361                    d += nodeDistances[k];                                               
362                }
363            }                   
364
365         
366          /*
367             now just divide the accumultaed ND distance by
368             the number of distances we have for this position and then add it to the acc
369             distances over all taxa.
370             I just realized that the version on which I did the tests
371             I sent to Simon I used
372             
373             distances[i] = d / ((double)s);
374
375             instead of
376             
377             distances[i] += d / ((double)s);
378
379             gamo tin poutana mou
380          */
381             
382          distances[i] += (d / ((double)s));     
383        }
384
385     
386
387      /*
388         re-connect taxon to its original position
389      */
390
391      hookup(p->next,       p1,      p1z, tr->numBranches); 
392      hookup(p->next->next, p2,      p2z, tr->numBranches);
393      hookup(p,             p->back, pz, tr->numBranches);     
394
395      /*
396        fix likelihood vectors
397      */
398
399      newviewGeneric(tr, p);         
400    }
401
402  /*
403     now just compute the average ND over all taxa
404  */
405   
406
407  for(i = 0; i < tr->cdta->endsite; i++)
408    {
409      double 
410        avg = distances[i] / ((double)tr->mxtips);
411
412      fprintf(outFile, "%d %f\n", i, avg);
413    }
414
415  printBothOpen("\nTime for EPA-based site-specific placement bias calculation: %f\n", gettime() - masterTime); 
416  printBothOpen("Site-specific placement bias statistics written to file %s\n", fileName);
417
418  fclose(outFile);
419
420  exit(0);
421}
Note: See TracBrowser for help on using the repository browser.