| 1 | /* |
|---|
| 2 | |
|---|
| 3 | PHYML : a program that computes maximum likelihood phylogenies from |
|---|
| 4 | DNA or AA homologous sequences |
|---|
| 5 | |
|---|
| 6 | Copyright (C) Stephane Guindon. Oct 2003 onward |
|---|
| 7 | |
|---|
| 8 | All parts of the source except where indicated are distributed under |
|---|
| 9 | the GNU public licence. See http://www.opensource.org for details. |
|---|
| 10 | |
|---|
| 11 | */ |
|---|
| 12 | |
|---|
| 13 | #ifndef UTILITIES_H |
|---|
| 14 | #define UTILITIES_H |
|---|
| 15 | |
|---|
| 16 | |
|---|
| 17 | #include <stdio.h> |
|---|
| 18 | #include <stdlib.h> |
|---|
| 19 | #include <math.h> |
|---|
| 20 | #include <ctype.h> |
|---|
| 21 | #include <string.h> |
|---|
| 22 | #include <time.h> |
|---|
| 23 | |
|---|
| 24 | #define VERSION "v2.4.5" |
|---|
| 25 | |
|---|
| 26 | extern int NODE_DEG_MAX; |
|---|
| 27 | extern int BRENT_ITMAX; |
|---|
| 28 | extern double BRENT_CGOLD; |
|---|
| 29 | extern double BRENT_ZEPS; |
|---|
| 30 | extern double MNBRAK_GOLD; |
|---|
| 31 | extern double MNBRAK_GLIMIT; |
|---|
| 32 | extern double MNBRAK_TINY; |
|---|
| 33 | extern double ALPHA_MIN; |
|---|
| 34 | extern double ALPHA_MAX; |
|---|
| 35 | extern double BL_MIN; |
|---|
| 36 | extern double BL_START; |
|---|
| 37 | extern double BL_MAX; |
|---|
| 38 | extern double MIN_DIFF_LK; |
|---|
| 39 | extern double GOLDEN_R; |
|---|
| 40 | extern double GOLDEN_C; |
|---|
| 41 | extern int T_MAX_FILE; |
|---|
| 42 | extern int T_MAX_LINE; |
|---|
| 43 | extern int T_MAX_NAME; |
|---|
| 44 | extern int T_MAX_SEQ; |
|---|
| 45 | extern int N_MAX_INSERT; |
|---|
| 46 | extern int N_MAX_OTU; |
|---|
| 47 | extern double UNLIKELY; |
|---|
| 48 | extern double NJ_SEUIL; |
|---|
| 49 | extern int MAX_TOPO_DIST; |
|---|
| 50 | extern double ROUND_MAX; |
|---|
| 51 | extern double DIST_MAX; |
|---|
| 52 | extern int LIM_SCALE; |
|---|
| 53 | extern double LIM_SCALE_VAL; |
|---|
| 54 | extern double AROUND_LK; |
|---|
| 55 | extern double PROP_STEP; |
|---|
| 56 | extern int T_MAX_ALPHABET; |
|---|
| 57 | extern double MDBL_MAX; |
|---|
| 58 | extern double MDBL_MIN; |
|---|
| 59 | extern int POWELL_ITMAX; |
|---|
| 60 | extern double LINMIN_TOL; |
|---|
| 61 | |
|---|
| 62 | #define For(i,n) for(i=0; i<n; i++) |
|---|
| 63 | #define Fors(i,n,s) for(i=0; i<n; i+=s) |
|---|
| 64 | #define PointGamma(prob,alpha,beta) PointChi2(prob,2.0*(alpha))/(2.0*(beta)) |
|---|
| 65 | #define SHFT2(a,b,c) (a)=(b);(b)=(c); |
|---|
| 66 | #define SHFT3(a,b,c,d) (a)=(b);(b)=(c);(c)=(d); |
|---|
| 67 | #define MAX(a,b) ((a)>(b)?(a):(b)) |
|---|
| 68 | #define MIN(a,b) ((a)<(b)?(a):(b)) |
|---|
| 69 | #define SIGN(a,b) ((b) > 0.0 ? fabs(a) : -fabs(a)) |
|---|
| 70 | #define SHFT(a,b,c,d) (a)=(b);(b)=(c);(c)=(d); |
|---|
| 71 | |
|---|
| 72 | #define NT 0 /* nucleotides */ |
|---|
| 73 | #define AA 1 /* amino acids */ |
|---|
| 74 | |
|---|
| 75 | #define ACGT 0 /* A,G,G,T encoding */ |
|---|
| 76 | #define RY 1 /* R,Y encoding */ |
|---|
| 77 | |
|---|
| 78 | |
|---|
| 79 | #define NODE_DEG_MAX 50 |
|---|
| 80 | #define BRENT_ITMAX 100 |
|---|
| 81 | #define BRENT_CGOLD 0.3819660 |
|---|
| 82 | #define BRENT_ZEPS 1.e-10 |
|---|
| 83 | #define MNBRAK_GOLD 1.618034 |
|---|
| 84 | #define MNBRAK_GLIMIT 100.0 |
|---|
| 85 | #define MNBRAK_TINY 1.e-20 |
|---|
| 86 | #define ALPHA_MIN 0.04 |
|---|
| 87 | #define ALPHA_MAX 100 |
|---|
| 88 | #define BL_MIN 1.e-10 |
|---|
| 89 | #define BL_START 1.e-03 |
|---|
| 90 | #define BL_MAX 1.e+05 |
|---|
| 91 | #define MIN_DIFF_LK 1.e-06 |
|---|
| 92 | #define GOLDEN_R 0.61803399 |
|---|
| 93 | #define GOLDEN_C (1.0-GOLDEN_R) |
|---|
| 94 | #define T_MAX_FILE 200 |
|---|
| 95 | #define T_MAX_LINE 100000 |
|---|
| 96 | #define T_MAX_NAME 100 |
|---|
| 97 | #define T_MAX_SEQ 1000000 |
|---|
| 98 | #define N_MAX_INSERT 20 |
|---|
| 99 | #define N_MAX_OTU 4000 |
|---|
| 100 | #define UNLIKELY -1.e10 |
|---|
| 101 | #define NJ_SEUIL 0.1 |
|---|
| 102 | #define ROUND_MAX 100 |
|---|
| 103 | #define DIST_MAX 2.00 |
|---|
| 104 | #define AROUND_LK 50.0 |
|---|
| 105 | #define PROP_STEP 1.0 |
|---|
| 106 | #define T_MAX_ALPHABET 100 |
|---|
| 107 | #define MDBL_MIN 2.225074E-308 |
|---|
| 108 | #define MDBL_MAX 1.797693E+308 |
|---|
| 109 | #define POWELL_ITMAX 200 |
|---|
| 110 | #define LINMIN_TOL 2.0E-04 |
|---|
| 111 | #define LIM_SCALE 3 |
|---|
| 112 | #define LIM_SCALE_VAL 1.E-50 |
|---|
| 113 | /* LIM_SCALE = 300; */ |
|---|
| 114 | /* LIM_SCALE_VAL = 1.E-500; */ |
|---|
| 115 | |
|---|
| 116 | |
|---|
| 117 | /*********************************************************/ |
|---|
| 118 | |
|---|
| 119 | typedef struct __Node { |
|---|
| 120 | struct __Node **v; /* table of pointers to neighbor nodes. Dimension = 2 x n_otu - 3 */ |
|---|
| 121 | struct __Node ***bip_node; /* three lists of pointer to tip nodes. One list for each direction */ |
|---|
| 122 | struct __Edge **b; /* table of pointers to neighbor branches */ |
|---|
| 123 | char ***bip_name; /* three lists of tip node names. One list for each direction */ |
|---|
| 124 | int *bip_size; /* Size of each of the three lists from bip_node */ |
|---|
| 125 | double *l; /* lengths of the (three or one) branches connected to one internal node */ |
|---|
| 126 | int num; /* node number */ |
|---|
| 127 | char *name; /* taxon name (if exists) */ |
|---|
| 128 | int tax; /* tax = 1 -> external node, else -> internal node */ |
|---|
| 129 | int check_branch; /* check_branch=1 is the corresponding branch is labelled with '*' */ |
|---|
| 130 | double *score; /* score used in BIONJ to determine the best pair of nodes to agglomerate */ |
|---|
| 131 | }node; |
|---|
| 132 | |
|---|
| 133 | |
|---|
| 134 | /*********************************************************/ |
|---|
| 135 | |
|---|
| 136 | typedef struct __Edge { |
|---|
| 137 | /* |
|---|
| 138 | syntax : (node) [edge] |
|---|
| 139 | (left_1) . .(right_1) |
|---|
| 140 | \ (left) (right) / |
|---|
| 141 | \._____________./ |
|---|
| 142 | / [b_fcus] \ |
|---|
| 143 | / \ |
|---|
| 144 | (left_2) . .(right_2) |
|---|
| 145 | |
|---|
| 146 | */ |
|---|
| 147 | |
|---|
| 148 | struct __Node *left,*rght; /* node on the left/right side of the edge */ |
|---|
| 149 | int l_r,r_l,l_v1,l_v2,r_v1,r_v2; |
|---|
| 150 | /* these are directions (i.e., 0, 1 or 2): */ |
|---|
| 151 | /* l_r (left to right) -> left[b_fcus->l_r] = right */ |
|---|
| 152 | /* r_l (right to left) -> right[b_fcus->r_l] = left */ |
|---|
| 153 | /* l_v1 (left node to first node != from right) -> left[b_fcus->l_v1] = left_1 */ |
|---|
| 154 | /* l_v2 (left node to secnd node != from right) -> left[b_fcus->l_v2] = left_2 */ |
|---|
| 155 | /* r_v1 (right node to first node != from left) -> right[b_fcus->r_v1] = right_1 */ |
|---|
| 156 | /* r_v2 (right node to secnd node != from left) -> right[b_fcus->r_v2] = right_2 */ |
|---|
| 157 | |
|---|
| 158 | int num; /* branch number */ |
|---|
| 159 | double l; /* branch length */ |
|---|
| 160 | double best_l; /* best branch length found so far */ |
|---|
| 161 | double l_old; /* old branch length */ |
|---|
| 162 | |
|---|
| 163 | int bip_score; /* score of the bipartition generated by the corresponding edge |
|---|
| 164 | bip_score = 1 iif the branch is fond in both trees to be compared, |
|---|
| 165 | bip_score = 0 otherwise. */ |
|---|
| 166 | double nj_score; /* score of the agglomeration that generated that branch in BIONJ */ |
|---|
| 167 | double diff_lk; /* difference of likelihood between the current topological |
|---|
| 168 | configuration at this branch and the best alternative one */ |
|---|
| 169 | |
|---|
| 170 | int get_p_lk_left; /* 1 if the likelihood of the subtree on the left has to be computed */ |
|---|
| 171 | int get_p_lk_rght; /* 1 if the likelihood of the subtree on the right has to be computed */ |
|---|
| 172 | int ud_p_lk_left; /* 1 if the likelihood of the subtree on the left is up to date */ |
|---|
| 173 | int ud_p_lk_rght; /* 1 if the likelihood of the subtree on the right is up to date */ |
|---|
| 174 | double site_dlk; /* derivative of the likelihood (deprecated) */ |
|---|
| 175 | double site_d2lk; /* 2nd derivative of the likelihood (deprecated) */ |
|---|
| 176 | double *site_dlk_rr; /* derivative of the likelihood conditional on the current relative rate */ |
|---|
| 177 | double *site_d2lk_rr; /* 2nd derivative of the likelihood conditional on the current relative rate */ |
|---|
| 178 | double ***p_lk_left,***p_lk_rght; /* likelihoods of the subtree on the left and |
|---|
| 179 | right side (for each site and each relative rate category) */ |
|---|
| 180 | double **site_p_lk_rght, **site_p_lk_left; /* deprecated */ |
|---|
| 181 | double ***Pij_rr,***dPij_rr,***d2Pij_rr; /* matrix of change probabilities and its first and secnd derivates */ |
|---|
| 182 | |
|---|
| 183 | double *ql; /* ql[0], ql[1], ql[2] give the likelihood of the three topological |
|---|
| 184 | configurations around that branch */ |
|---|
| 185 | int best_conf; /* best topological configuration : |
|---|
| 186 | ((left_1,left_2),right_1,right_2) or |
|---|
| 187 | ((left_1,right_2),right_1,left_2) or |
|---|
| 188 | ((left_1,right_1),right_1,left_2) */ |
|---|
| 189 | |
|---|
| 190 | int num_st_left; /* number of the subtree on the left side */ |
|---|
| 191 | int num_st_rght; /* number of the subtree on the right side */ |
|---|
| 192 | |
|---|
| 193 | |
|---|
| 194 | /* Below are the likelihood scaling factors (used in functions |
|---|
| 195 | `Get_All_Partial_Lk_Scale' in lk.c */ |
|---|
| 196 | int scale_left; |
|---|
| 197 | int scale_rght; |
|---|
| 198 | double site_sum_scale_f_left; |
|---|
| 199 | double site_sum_scale_f_rght; |
|---|
| 200 | double site_scale_f_left; |
|---|
| 201 | double site_scale_f_rght; |
|---|
| 202 | double *sum_scale_f_left; |
|---|
| 203 | double *sum_scale_f_rght; |
|---|
| 204 | |
|---|
| 205 | |
|---|
| 206 | double bootval; /* bootstrap value (if exists) */ |
|---|
| 207 | }edge; |
|---|
| 208 | |
|---|
| 209 | /*********************************************************/ |
|---|
| 210 | |
|---|
| 211 | typedef struct __Arbre { |
|---|
| 212 | struct __Node *root; /* root node */ |
|---|
| 213 | struct __Node **noeud; /* array of nodes that defines the tree topology */ |
|---|
| 214 | struct __Edge **t_edges; /* array of edges */ |
|---|
| 215 | struct __Arbre *old_tree; /* old copy of the tree */ |
|---|
| 216 | struct __Arbre *best_tree; /* best tree found so far */ |
|---|
| 217 | struct __Model *mod; /* substitution model */ |
|---|
| 218 | struct __AllSeq *data; /* sequences */ |
|---|
| 219 | struct __Option *input; /* input parameters */ |
|---|
| 220 | struct __Matrix *mat; /* pairwise distance matrix */ |
|---|
| 221 | |
|---|
| 222 | int has_bip; /*if has_bip=1, then the structure to compare |
|---|
| 223 | tree topologies is allocated, has_bip=0 otherwise */ |
|---|
| 224 | double min_diff_lk; /* min_diff_lk is the minimum taken among the 2n-3 |
|---|
| 225 | diff_lk values */ |
|---|
| 226 | |
|---|
| 227 | int both_sides; /* both_sides=1 -> a pre-order and a post-order tree |
|---|
| 228 | traversals are required to compute the likelihood |
|---|
| 229 | of every subtree in the phylogeny*/ |
|---|
| 230 | int n_otu; /* number of taxa */ |
|---|
| 231 | |
|---|
| 232 | int curr_site; /* current site of the alignment to be processed */ |
|---|
| 233 | int curr_catg; /* current class of the discrete gamma rate distribution */ |
|---|
| 234 | double best_loglk; /* highest value of the loglikelihood found so far */ |
|---|
| 235 | |
|---|
| 236 | double tot_loglk; /* loglikelihood */ |
|---|
| 237 | double *tot_loglk_sorted; /* used to compute tot_loglk by adding sorted terms to minimize CPU errors */ |
|---|
| 238 | double *tot_dloglk; /* first derivative of the likelihood with respect to |
|---|
| 239 | branch lengths */ |
|---|
| 240 | double *tot_d2loglk; /* second derivative of the likelihood with respect to |
|---|
| 241 | branch lengths */ |
|---|
| 242 | double *site_lk; /* vector of likelihoods at individual sites */ |
|---|
| 243 | |
|---|
| 244 | double **log_site_lk_cat; /* loglikelihood at individual sites and for each class of rate*/ |
|---|
| 245 | |
|---|
| 246 | double unconstraint_lk; /* unconstrained (or multinomial) likelihood */ |
|---|
| 247 | |
|---|
| 248 | int n_swap; /* number of NNIs performed */ |
|---|
| 249 | int n_pattern; /* number of distinct site patterns */ |
|---|
| 250 | int has_branch_lengths; /* =1 iff input tree displays branch lengths */ |
|---|
| 251 | int print_boot_val; /* if print_boot_val=1, the bootstrap values are printed */ |
|---|
| 252 | }arbre; |
|---|
| 253 | |
|---|
| 254 | |
|---|
| 255 | /*********************************************************/ |
|---|
| 256 | |
|---|
| 257 | typedef struct __Seq { |
|---|
| 258 | char *name; /* sequence name */ |
|---|
| 259 | int len; /* sequence length */ |
|---|
| 260 | char *state; /* sequence itself */ |
|---|
| 261 | }seq; |
|---|
| 262 | |
|---|
| 263 | /*********************************************************/ |
|---|
| 264 | |
|---|
| 265 | |
|---|
| 266 | typedef struct __AllSeq { |
|---|
| 267 | seq **c_seq; /* compressed sequences */ |
|---|
| 268 | int *invar; /* 1 -> states are identical, 0 states vary */ |
|---|
| 269 | double *wght; /* # of each site in c_seq */ |
|---|
| 270 | int n_otu; /* number of taxa */ |
|---|
| 271 | int clean_len; /* uncrunched sequences lenghts without gaps */ |
|---|
| 272 | int crunch_len; /* crunched sequences lengths */ |
|---|
| 273 | double *b_frq; /* observed state frequencies */ |
|---|
| 274 | int init_len; /* length of the uncompressed sequences */ |
|---|
| 275 | int *ambigu; /* ambigu[i]=1 is one or more of the sequences at site |
|---|
| 276 | i display an ambiguous character */ |
|---|
| 277 | int *sitepatt; /* this array maps the position of the patterns in the |
|---|
| 278 | compressed alignment to the positions in the uncompressed |
|---|
| 279 | one */ |
|---|
| 280 | }allseq; |
|---|
| 281 | |
|---|
| 282 | /*********************************************************/ |
|---|
| 283 | |
|---|
| 284 | typedef struct __Matrix { /* mostly used in BIONJ */ |
|---|
| 285 | double **P,**Q,**dist; /* observed proportions of transition, transverion and distances |
|---|
| 286 | between pairs of sequences */ |
|---|
| 287 | arbre *tree; /* tree... */ |
|---|
| 288 | int *on_off; /* on_off[i]=1 if column/line i corresponds to a node that has not |
|---|
| 289 | been agglomerated yet */ |
|---|
| 290 | int n_otu; /* number of taxa */ |
|---|
| 291 | char **name; /* sequence names */ |
|---|
| 292 | int r; /* number of nodes that have not been agglomerated yet */ |
|---|
| 293 | struct __Node **tip_node; /* array of pointer to the leaves of the tree */ |
|---|
| 294 | int curr_int; /* used in the NJ/BIONJ algorithms */ |
|---|
| 295 | int method; /* if method=1->NJ method is used, BIONJ otherwise */ |
|---|
| 296 | }matrix; |
|---|
| 297 | |
|---|
| 298 | /*********************************************************/ |
|---|
| 299 | |
|---|
| 300 | typedef struct __Model { |
|---|
| 301 | int whichmodel; |
|---|
| 302 | /* |
|---|
| 303 | 1 => JC69 |
|---|
| 304 | 2 => K2P |
|---|
| 305 | 3 => F81 |
|---|
| 306 | 4 => HKY85 |
|---|
| 307 | 5 => F84 |
|---|
| 308 | 6 => TN93 |
|---|
| 309 | 7 => GTR |
|---|
| 310 | 11 => Dayhoff |
|---|
| 311 | 12 => JTT |
|---|
| 312 | 13 => MtREV |
|---|
| 313 | */ |
|---|
| 314 | int ns; /* number of states (4 for ADN, 20 for AA) */ |
|---|
| 315 | double *pi; /* states frequencies */ |
|---|
| 316 | int datatype; /* 0->DNA, 1->AA */ |
|---|
| 317 | |
|---|
| 318 | /* ADN parameters */ |
|---|
| 319 | double kappa; /* transition/transversion rate */ |
|---|
| 320 | double lambda; /* parameter used to define the ts/tv ratios in the F84 and TN93 models */ |
|---|
| 321 | double alpha; /* gamma shapa parameter */ |
|---|
| 322 | double *r_proba; /* probabilities of the substitution rates defined by the discrete gamma distribution */ |
|---|
| 323 | double *rr; /* substitution rates defined by the discrete gamma distribution */ |
|---|
| 324 | int n_catg; /* number of categories in the discrete gamma distribution */ |
|---|
| 325 | double pinvar; /* proportion of invariable sites */ |
|---|
| 326 | int invar; /* =1 iff the substitution model takes into account invariable sites */ |
|---|
| 327 | |
|---|
| 328 | /* Below are 'old' values of some substitution parameters (see the comments above) */ |
|---|
| 329 | double alpha_old; |
|---|
| 330 | double kappa_old; |
|---|
| 331 | double lambda_old; |
|---|
| 332 | double pinvar_old; |
|---|
| 333 | |
|---|
| 334 | char *custom_mod_string; /* string of characters used to define custom |
|---|
| 335 | models of substitution */ |
|---|
| 336 | double **rr_param; /* table of pointers to relative rate parameters of the GTR or custom model */ |
|---|
| 337 | double *rr_param_values; /* relative rate parameters of the GTR or custom model */ |
|---|
| 338 | int **rr_param_num; /* each line of this 2d table gives a serie of equal relative rate parameter number */ |
|---|
| 339 | /* A<->C : number 0 */ |
|---|
| 340 | /* A<->G : number 1 */ |
|---|
| 341 | /* A<->T : number 2 */ |
|---|
| 342 | /* C<->G : number 3 */ |
|---|
| 343 | /* C<->T : number 4 */ |
|---|
| 344 | /* G<->T : number 5 */ |
|---|
| 345 | /* For example, [0][2][3] |
|---|
| 346 | [1] |
|---|
| 347 | [4][5] |
|---|
| 348 | corresponds to the model 010022, i.e., |
|---|
| 349 | (A<->C = A<->T = C<->T) != (A<->G) != (C<->T = G<->T) |
|---|
| 350 | */ |
|---|
| 351 | int *n_rr_param_per_cat; /* [3][1][2] for the previous example */ |
|---|
| 352 | int n_diff_rr_param; /* number of different relative substitution rates in the custom model */ |
|---|
| 353 | |
|---|
| 354 | int update_eigen; /* update_eigen=1-> eigen values/vectors need to be updated */ |
|---|
| 355 | |
|---|
| 356 | double ***Pij_rr; /* matrix of change probabilities */ |
|---|
| 357 | double ***dPij_rr; /* first derivative of the change probabilities with respect to branch length */ |
|---|
| 358 | double ***d2Pij_rr; /* second derivative of the change probabilities with respect to branch length */ |
|---|
| 359 | |
|---|
| 360 | |
|---|
| 361 | int seq_len; /* sequence length */ |
|---|
| 362 | /* AA parameters */ |
|---|
| 363 | /* see PMat_Empirical in models.c for AA algorithm explanation */ |
|---|
| 364 | double *mat_Q; /* 20x20 amino-acids substitution rates matrix */ |
|---|
| 365 | double *mat_Vr; /* 20x20 right eigenvectors of mat_Q */ |
|---|
| 366 | double *mat_Vi; /* 20x20 inverse matrix of mat_Vr */ |
|---|
| 367 | double *vct_ev; /* eigen values */ |
|---|
| 368 | double mr; /* mean rate = branch length/time interval */ |
|---|
| 369 | /* mr = -sum(i)(vct_pi[i].mat_Q[ii]) */ |
|---|
| 370 | double *vct_eDmr; /* diagonal terms of a 20x20 diagonal matrix */ |
|---|
| 371 | /* term n = exp(nth eigenvalue of mat_Q / mr) */ |
|---|
| 372 | int stepsize; /* stepsize=1 for nucleotide models, 3 for codon models */ |
|---|
| 373 | int n_otu; /* number of taxa */ |
|---|
| 374 | struct __Optimiz *s_opt; /* pointer to parameters to optimize */ |
|---|
| 375 | int bootstrap; /* bootstrap values are computed if bootstrap > 0. |
|---|
| 376 | The value give the number of replicates */ |
|---|
| 377 | double *user_b_freq; /* user-defined nucleotide frequencies */ |
|---|
| 378 | |
|---|
| 379 | |
|---|
| 380 | }model; |
|---|
| 381 | |
|---|
| 382 | /*********************************************************/ |
|---|
| 383 | |
|---|
| 384 | typedef struct __Option { /* mostly used in 'options.c' */ |
|---|
| 385 | char *seqfile; /* sequence file name */ |
|---|
| 386 | char *modelname; /* name of the model */ |
|---|
| 387 | struct __Model *mod; /* substitution model */ |
|---|
| 388 | int interleaved; /* interleaved or sequential sequence file format ? */ |
|---|
| 389 | int inputtree; /* =1 iff a user input tree is used as input */ |
|---|
| 390 | struct __Arbre *tree; /* pointer to the current tree */ |
|---|
| 391 | char *inputtreefile; /* input tree file name */ |
|---|
| 392 | FILE *fp_seq; /* pointer to the sequence file */ |
|---|
| 393 | FILE *fp_input_tree; /* pointer to the input tree file */ |
|---|
| 394 | FILE *fp_boot_tree; /* pointer to the bootstrap tree file */ |
|---|
| 395 | FILE *fp_boot_stats; /* pointer to the statistics file */ |
|---|
| 396 | int print_boot_trees; /* =1 if the bootstrapped trees are printed in output */ |
|---|
| 397 | char *phyml_stat_file; /* name of the statistics file */ |
|---|
| 398 | char *phyml_tree_file; /* name of the tree file */ |
|---|
| 399 | char *phyml_lk_file; /* name of the file in which the likelihood of the model is written */ |
|---|
| 400 | int phyml_stat_file_open_mode; /* opening file mode for statistics file */ |
|---|
| 401 | int phyml_tree_file_open_mode; /* opening file mode for tree file */ |
|---|
| 402 | int n_data_sets; /* number of data sets to be analysed */ |
|---|
| 403 | int n_trees; /* number of trees */ |
|---|
| 404 | int seq_len; /* sequence length */ |
|---|
| 405 | int n_data_set_asked; /* number of bootstrap replicates */ |
|---|
| 406 | struct __Seq **data; /* pointer to the uncompressed sequences */ |
|---|
| 407 | struct __AllSeq *alldata; /* pointer to the compressed sequences */ |
|---|
| 408 | char *nt_or_cd; /* nucleotide or codon data ? (not used) */ |
|---|
| 409 | }option; |
|---|
| 410 | |
|---|
| 411 | /*********************************************************/ |
|---|
| 412 | |
|---|
| 413 | typedef struct __Optimiz { /* parameters to be optimised (mostly used in 'optimiz.c') */ |
|---|
| 414 | int print; /* =1 -> verbose mode */ |
|---|
| 415 | |
|---|
| 416 | int opt_alpha; /* =1 -> the gamma shape parameter is optimised */ |
|---|
| 417 | int opt_kappa; /* =1 -> the ts/tv ratio parameter is optimised */ |
|---|
| 418 | int opt_lambda; /* =1 -> the F84|TN93 model specific parameter is optimised */ |
|---|
| 419 | int opt_pinvar; /* =1 -> the proportion of invariants is optimised */ |
|---|
| 420 | int opt_bfreq; /* =1 -> the nucleotide frequencies are optimised */ |
|---|
| 421 | int opt_rr_param; /* =1 -> the relative rate parameters of the GTR or the customn model are optimised */ |
|---|
| 422 | int opt_free_param; /* if opt_topo=0 and opt_free_param=1 -> the numerical parameters of the |
|---|
| 423 | model are optimised. if opt_topo=0 and opt_free_param=0 -> no parameter is |
|---|
| 424 | optimised */ |
|---|
| 425 | int opt_bl; /* =1 -> the branch lengths are optimised */ |
|---|
| 426 | int opt_topo; /* =1 -> the tree topology is optimised */ |
|---|
| 427 | double init_lk; /* initial loglikelihood value */ |
|---|
| 428 | int n_it_max; /* maximum bnumber of iteration during an optimisation step */ |
|---|
| 429 | int last_opt; /* =1 -> the numerical parameters are optimised further while the |
|---|
| 430 | tree topology remains fixed */ |
|---|
| 431 | }optimiz; |
|---|
| 432 | |
|---|
| 433 | /*********************************************************/ |
|---|
| 434 | |
|---|
| 435 | typedef struct __Qmat{ |
|---|
| 436 | double **u_mat; /* right eigen vectors */ |
|---|
| 437 | double **v_mat; /* left eigen vectors = inv(u_mat) */ |
|---|
| 438 | double *root_vct; /* eigen values */ |
|---|
| 439 | double *q; /* instantaneous rate matrix */ |
|---|
| 440 | }qmat; |
|---|
| 441 | |
|---|
| 442 | /*********************************************************/ |
|---|
| 443 | |
|---|
| 444 | double bico(int n,int k); |
|---|
| 445 | double factln(int n); |
|---|
| 446 | double gammln(double xx); |
|---|
| 447 | double Pbinom(int N,int ni,double p); |
|---|
| 448 | void Plim_Binom(double pH0,int N,double *pinf,double *psup); |
|---|
| 449 | double LnGamma(double alpha); |
|---|
| 450 | double IncompleteGamma(double x,double alpha,double ln_gamma_alpha); |
|---|
| 451 | double PointChi2(double prob,double v); |
|---|
| 452 | double PointNormal(double prob); |
|---|
| 453 | int DiscreteGamma(double freqK[],double rK[],double alfa,double beta,int K,int median); |
|---|
| 454 | arbre *Read_Tree(char *s_tree); |
|---|
| 455 | void Make_All_Edges_Light(node *a,node *d); |
|---|
| 456 | void Make_All_Edges_Lk(node *a,node *d,arbre *tree); |
|---|
| 457 | void R_rtree(char *s_tree,node *pere,arbre *tree,int *n_int,int *n_ext); |
|---|
| 458 | void Clean_Multifurcation(char **subtrees,int current_deg,int end_deg); |
|---|
| 459 | char **Sub_Trees(char *tree,int *degree); |
|---|
| 460 | int Next_Par(char *s,int pos); |
|---|
| 461 | char *Write_Tree(arbre *tree); |
|---|
| 462 | void R_wtree(node *pere,node *fils,char *s_tree,arbre *tree); |
|---|
| 463 | void Init_Tree(arbre *tree); |
|---|
| 464 | void Make_Edge_Light(node *a,node *d); |
|---|
| 465 | void Init_Edge_Light(edge *b); |
|---|
| 466 | void Make_Edge_Dirs(edge *b,node *a,node *d); |
|---|
| 467 | void Make_Edge_Lk(node *a,node *d,arbre *tree); |
|---|
| 468 | void Make_Node_Light(node *n); |
|---|
| 469 | void Init_Node_Light(node *n); |
|---|
| 470 | void Make_Node_Lk(node *n); |
|---|
| 471 | seq **Get_Seq(option *input,int rw); |
|---|
| 472 | seq **Read_Seq_Sequential(FILE *in,int *n_otu); |
|---|
| 473 | seq **Read_Seq_Interleaved(FILE *in,int *n_otu); |
|---|
| 474 | int Read_One_Line_Seq(seq ***data,int num_otu,FILE *in); |
|---|
| 475 | void Uppercase(char *ch); |
|---|
| 476 | allseq *Compact_Seq(seq **data,option *input); |
|---|
| 477 | allseq *Compact_CSeq(allseq *data,model *mod); |
|---|
| 478 | void Get_Base_Freqs(allseq *data); |
|---|
| 479 | void Get_AA_Freqs(allseq *data); |
|---|
| 480 | arbre *Read_Tree_File(FILE *fp_input_tree); |
|---|
| 481 | void Init_Tree_Edges(node *a,node *d,arbre *tree,int *cur); |
|---|
| 482 | void Exit(char *message); |
|---|
| 483 | void *mCalloc(int nb,size_t size); |
|---|
| 484 | void *mRealloc(void *p,int nb,size_t size); |
|---|
| 485 | arbre *Make_Light_Tree_Struct(int n_otu); |
|---|
| 486 | int Sort_Double_Decrease(const void *a,const void *b); |
|---|
| 487 | void qksort(double *A,int ilo,int ihi); |
|---|
| 488 | void Print_Site(allseq *alldata,int num,int n_otu,char *sep,int stepsize); |
|---|
| 489 | void Print_Seq(seq **data,int n_otu); |
|---|
| 490 | void Print_CSeq(FILE *fp,allseq *alldata); |
|---|
| 491 | void Order_Tree_Seq(arbre *tree,seq **data); |
|---|
| 492 | void Order_Tree_CSeq(arbre *tree,allseq *data); |
|---|
| 493 | matrix *Make_Mat(int n_otu); |
|---|
| 494 | void Init_Mat(matrix *mat,allseq *data); |
|---|
| 495 | arbre *Make_Tree(allseq *data); |
|---|
| 496 | void Print_Dist(matrix *mat); |
|---|
| 497 | void Print_Node(node *a,node *d,arbre *tree); |
|---|
| 498 | void Share_Lk_Struct(arbre *t_full,arbre *t_empt); |
|---|
| 499 | void Init_Constant(); |
|---|
| 500 | void Print_Mat(matrix *mat); |
|---|
| 501 | int Sort_Edges_Diff_Lk(arbre *tree,edge **sorted_edges,int n_elem); |
|---|
| 502 | void NNI(arbre *tree,edge *b_fcus,int do_swap); |
|---|
| 503 | void Swap(node *a,node *b,node *c,node *d,arbre *tree); |
|---|
| 504 | void Update_All_Partial_Lk(edge *b_fcus,arbre *tree); |
|---|
| 505 | void Update_SubTree_Partial_Lk(edge *b_fcus,node *a,node *d,arbre *tree); |
|---|
| 506 | double Update_Lk_At_Given_Edge(edge *b_fcus,arbre *tree); |
|---|
| 507 | void Update_PMat_At_Given_Edge(edge *b_fcus,arbre *tree); |
|---|
| 508 | allseq *Make_Seq(int n_otu,int len,char **sp_names); |
|---|
| 509 | allseq *Copy_CData(allseq *ori,model *mod); |
|---|
| 510 | optimiz *Alloc_Optimiz(); |
|---|
| 511 | void Init_Optimiz(optimiz *s_opt); |
|---|
| 512 | int Filexists(char *filename); |
|---|
| 513 | FILE *Openfile(char *filename,int mode); |
|---|
| 514 | void Print_Fp_Out(FILE *fp_out,time_t t_beg,time_t t_end,arbre *tree,option *input,int n_data_set); |
|---|
| 515 | void Print_Fp_Out_Lines(FILE *fp_out,time_t t_beg,time_t t_end,arbre *tree,option *input,int n_data_set); |
|---|
| 516 | void Alloc_All_P_Lk(arbre *tree); |
|---|
| 517 | matrix *K2P_dist(allseq *data,double g_shape); |
|---|
| 518 | matrix *JC69_Dist(allseq *data,model *mod); |
|---|
| 519 | matrix *Hamming_Dist(allseq *data,model *mod); |
|---|
| 520 | int Is_Ambigu(char *state,int datatype,int stepsize); |
|---|
| 521 | void Check_Ambiguities(allseq *data,int datatype,int stepsize); |
|---|
| 522 | int Assign_State(char *c,int datatype,int stepsize); |
|---|
| 523 | void Bootstrap(arbre *tree); |
|---|
| 524 | void Update_BrLen_Invar(arbre *tree); |
|---|
| 525 | void Getstring_Stdin(char *file_name); |
|---|
| 526 | void Print_Freq(arbre *tree); |
|---|
| 527 | double Num_Derivatives_One_Param(double(*func)(arbre *tree),arbre *tree,double f0,double *param,double stepsize,double *err,int precise); |
|---|
| 528 | void Num_Derivative_Several_Param(arbre *tree,double *param,int n_param,double stepsize,double(*func)(arbre *tree),double *derivatives); |
|---|
| 529 | int Compare_Two_States(char *state1,char *state2,int state_size); |
|---|
| 530 | void Copy_One_State(char *from,char *to,int state_size); |
|---|
| 531 | model *Make_Model_Basic(); |
|---|
| 532 | void Make_Model_Complete(model *mod); |
|---|
| 533 | model *Copy_Model(model *ori); |
|---|
| 534 | void Set_Defaults_Input(option *input); |
|---|
| 535 | void Set_Defaults_Model(model *mod); |
|---|
| 536 | void Set_Defaults_Optimiz(optimiz *s_opt); |
|---|
| 537 | void Copy_Optimiz(optimiz *ori,optimiz *cpy); |
|---|
| 538 | void Get_Bip(node *a,node *d,arbre *tree); |
|---|
| 539 | void Alloc_Bip(arbre *tree); |
|---|
| 540 | int Sort_Double_Increase(const void *a,const void *b); |
|---|
| 541 | int Sort_String(const void *a,const void *b); |
|---|
| 542 | void Compare_Bip(arbre *tree1,arbre *tree2); |
|---|
| 543 | void Test_Multiple_Data_Set_Format(option *input); |
|---|
| 544 | int Are_Compatible(char *statea,char *stateb,int stepsize,int datatype); |
|---|
| 545 | void Hide_Ambiguities(allseq *data); |
|---|
| 546 | void Print_Site_Lk(arbre *tree, FILE *fp); |
|---|
| 547 | #endif |
|---|
| 548 | |
|---|
| 549 | |
|---|
| 550 | |
|---|
| 551 | |
|---|