1 | /* Phyle of filogenetic tree calculating functions for CLUSTAL V */ |
---|
2 | #include <stdio.h> |
---|
3 | #include <string.h> |
---|
4 | #include <stdlib.h> |
---|
5 | #include <math.h> |
---|
6 | #include "clustalv.h" |
---|
7 | |
---|
8 | /* |
---|
9 | * Prototypes |
---|
10 | */ |
---|
11 | |
---|
12 | extern void *ckalloc(size_t); |
---|
13 | extern int getint(const char *, int, int, int); |
---|
14 | extern void get_path(char *, char *); |
---|
15 | extern FILE * open_output_file(const char *, const char *, char *, const char *); |
---|
16 | |
---|
17 | |
---|
18 | void init_trees(void); |
---|
19 | void phylogenetic_tree(void); |
---|
20 | void bootstrap_tree(void); |
---|
21 | void compare_tree(char **, char **, int *, int); |
---|
22 | void tree_gap_delete(void); /*flag all positions in alignment that have a gap */ |
---|
23 | void dna_distance_matrix(FILE *); |
---|
24 | void prot_distance_matrix(FILE *); |
---|
25 | void nj_tree(char **, FILE *); |
---|
26 | void print_tree(char **, FILE *, int *); |
---|
27 | void root_tree(char **, int); |
---|
28 | |
---|
29 | Boolean transition(int,int); |
---|
30 | |
---|
31 | /* |
---|
32 | * Global variables |
---|
33 | */ |
---|
34 | |
---|
35 | |
---|
36 | extern double **tmat; /* general nxn array of reals; allocated from main */ |
---|
37 | /* this is used as a distance matrix */ |
---|
38 | extern double **smat; |
---|
39 | extern Boolean dnaflag; /* TRUE for DNA seqs; FALSE for proteins */ |
---|
40 | extern Boolean tossgaps; /* Ignore places in align. where ANY seq. has a gap*/ |
---|
41 | extern Boolean kimura; /* Use correction for multiple substitutions */ |
---|
42 | extern Boolean empty; /* any sequences in memory? */ |
---|
43 | extern Boolean usemenu; /* interactive (TRUE) or command line (FALSE) */ |
---|
44 | extern void error(const char *,...); /* error reporting */ |
---|
45 | extern int nseqs; /* total no. of seqs. */ |
---|
46 | extern int *seqlen_array; /* the lengths of the sequences */ |
---|
47 | extern char **seq_array; /* the sequences */ |
---|
48 | extern char **names; /* the seq. names */ |
---|
49 | extern char seqname[]; /* name of input file */ |
---|
50 | |
---|
51 | static double *av; |
---|
52 | static int *kill; |
---|
53 | static int ran_factor; |
---|
54 | int boot_ntrials; /* number of bootstrap trials */ |
---|
55 | unsigned int boot_ran_seed; /* random number generator seed */ |
---|
56 | static int *boot_positions; |
---|
57 | static int *boot_totals; |
---|
58 | static char **standard_tree; |
---|
59 | static char **sample_tree; |
---|
60 | static FILE *phy_tree_file; |
---|
61 | static char phy_tree_name[FILENAMELEN+1]; |
---|
62 | static Boolean verbose; |
---|
63 | static char *tree_gaps; /* array of weights; 1 for use this posn.; 0 don't */ |
---|
64 | |
---|
65 | |
---|
66 | void init_trees() |
---|
67 | { |
---|
68 | register int i; |
---|
69 | |
---|
70 | tree_gaps = (char *)ckalloc( (MAXLEN+1) * sizeof (char) ); |
---|
71 | |
---|
72 | boot_positions = (int *)ckalloc( (MAXLEN+1) * sizeof (int) ); |
---|
73 | boot_totals = (int *)ckalloc( (MAXN+1) * sizeof (int) ); |
---|
74 | |
---|
75 | kill = (int *) ckalloc( (MAXN+1) * sizeof (int) ); |
---|
76 | av = (double *) ckalloc( (MAXN+1) * sizeof (double) ); |
---|
77 | |
---|
78 | standard_tree = (char **) ckalloc( (MAXN+1) * sizeof (char *) ); |
---|
79 | for(i=0; i<MAXN+1; i++) |
---|
80 | standard_tree[i] = (char *) ckalloc( (MAXN+1) * sizeof(char) ); |
---|
81 | |
---|
82 | sample_tree = (char **) ckalloc( (MAXN+1) * sizeof (char *) ); |
---|
83 | for(i=0; i<MAXN+1; i++) |
---|
84 | sample_tree[i] = (char *) ckalloc( (MAXN+1) * sizeof(char) ); |
---|
85 | |
---|
86 | boot_ntrials = 1000; |
---|
87 | boot_ran_seed = 111; |
---|
88 | kimura = FALSE; |
---|
89 | tossgaps = FALSE; |
---|
90 | } |
---|
91 | |
---|
92 | |
---|
93 | |
---|
94 | void phylogenetic_tree() |
---|
95 | { char path[FILENAMELEN+1]; |
---|
96 | int j; |
---|
97 | |
---|
98 | if(empty) { |
---|
99 | error("You must load an alignment first"); |
---|
100 | return; |
---|
101 | } |
---|
102 | |
---|
103 | get_path(seqname,path); |
---|
104 | |
---|
105 | if((phy_tree_file = open_output_file( |
---|
106 | "\nEnter name for tree output file ",path, |
---|
107 | phy_tree_name,"nj")) == NULL) return; |
---|
108 | |
---|
109 | for(j=1; j<=seqlen_array[1]; ++j) |
---|
110 | boot_positions[j] = j; |
---|
111 | |
---|
112 | verbose = TRUE; /* Turn on screen output */ |
---|
113 | if(dnaflag) |
---|
114 | dna_distance_matrix(phy_tree_file); |
---|
115 | else |
---|
116 | prot_distance_matrix(phy_tree_file); |
---|
117 | |
---|
118 | verbose = TRUE; /* Turn on output */ |
---|
119 | nj_tree(standard_tree,phy_tree_file); |
---|
120 | fclose(phy_tree_file); |
---|
121 | /* |
---|
122 | print_tree(standard_tree,phy_tree_file); |
---|
123 | */ |
---|
124 | fprintf(stdout,"\nPhylogenetic tree file created: [%s]",phy_tree_name); |
---|
125 | } |
---|
126 | |
---|
127 | |
---|
128 | |
---|
129 | |
---|
130 | |
---|
131 | Boolean transition(int base1, int base2) /* TRUE if transition; else FALSE */ |
---|
132 | /* |
---|
133 | |
---|
134 | assumes that the bases of DNA sequences have been translated as |
---|
135 | a,A = 1; c,C = 2; g,G = 3; t,T,u,U = 4; X or N = 0; "-" < 0; |
---|
136 | |
---|
137 | A <--> G and T <--> C are transitions; all others are transversions. |
---|
138 | |
---|
139 | */ |
---|
140 | { |
---|
141 | if( ((base1 == 1) && (base2 == 3)) || ((base1 == 3) && (base2 == 1)) ) |
---|
142 | return TRUE; /* A <--> G */ |
---|
143 | if( ((base1 == 4) && (base2 == 2)) || ((base1 == 2) && (base2 == 4)) ) |
---|
144 | return TRUE; /* T <--> C */ |
---|
145 | return FALSE; |
---|
146 | } |
---|
147 | |
---|
148 | |
---|
149 | void tree_gap_delete() /* flag all positions in alignment that have a gap */ |
---|
150 | { /* in ANY sequence */ |
---|
151 | int seqn, posn; |
---|
152 | |
---|
153 | for(posn=1; posn<=seqlen_array[1]; ++posn) { |
---|
154 | tree_gaps[posn] = 0; |
---|
155 | for(seqn=1; seqn<=nseqs; ++seqn) { |
---|
156 | if(seq_array[seqn][posn] <= 0) { |
---|
157 | tree_gaps[posn] = 1; |
---|
158 | break; |
---|
159 | } |
---|
160 | } |
---|
161 | } |
---|
162 | } |
---|
163 | |
---|
164 | |
---|
165 | |
---|
166 | void nj_tree(char **tree_description, FILE *tree) |
---|
167 | { |
---|
168 | register int i; |
---|
169 | int l[4],nude,k; |
---|
170 | int nc,mini,minj,j,ii,jj; |
---|
171 | double fnseqs,fnseqs2,sumd; |
---|
172 | double diq,djq,dij,d2r,dr,dio,djo,da; |
---|
173 | double tmin,total,dmin; |
---|
174 | double bi,bj,b1,b2,b3,branch[4]; |
---|
175 | int typei,typej; /* 0 = node; 1 = OTU */ |
---|
176 | |
---|
177 | fnseqs = (double)nseqs; |
---|
178 | |
---|
179 | /*********************** First initialisation ***************************/ |
---|
180 | |
---|
181 | if(verbose) { |
---|
182 | fprintf(tree,"\n\n\t\t\tNeighbor-joining Method\n"); |
---|
183 | fprintf(tree,"\n Saitou, N. and Nei, M. (1987)"); |
---|
184 | fprintf(tree," The Neighbor-joining Method:"); |
---|
185 | fprintf(tree,"\n A New Method for Reconstructing Phylogenetic Trees."); |
---|
186 | fprintf(tree,"\n Mol. Biol. Evol., 4(4), 406-425\n"); |
---|
187 | fprintf(tree,"\n\n This is an UNROOTED tree\n"); |
---|
188 | fprintf(tree,"\n Numbers in parentheses are branch lengths\n\n"); |
---|
189 | } |
---|
190 | |
---|
191 | mini = minj = 0; |
---|
192 | |
---|
193 | for(i=1;i<=nseqs;++i) |
---|
194 | { |
---|
195 | tmat[i][i] = av[i] = 0.0; |
---|
196 | kill[i] = 0; |
---|
197 | } |
---|
198 | |
---|
199 | /*********************** Enter The Main Cycle ***************************/ |
---|
200 | |
---|
201 | /* for(nc=1; nc<=(nseqs-3); ++nc) { */ /**start main cycle**/ |
---|
202 | for(nc=1; nc<=(nseqs-3); ++nc) { |
---|
203 | sumd = 0.0; |
---|
204 | for(j=2; j<=nseqs; ++j) |
---|
205 | for(i=1; i<j; ++i) { |
---|
206 | tmat[j][i] = tmat[i][j]; |
---|
207 | sumd = sumd + tmat[i][j]; |
---|
208 | smat[i][j] = smat[j][i] = 0.0; |
---|
209 | } |
---|
210 | |
---|
211 | tmin = 99999.0; |
---|
212 | |
---|
213 | /*.................compute SMATij values and find the smallest one ........*/ |
---|
214 | |
---|
215 | for(jj=2; jj<=nseqs; ++jj) |
---|
216 | if(kill[jj] != 1) |
---|
217 | for(ii=1; ii<jj; ++ii) |
---|
218 | if(kill[ii] != 1) { |
---|
219 | diq = djq = 0.0; |
---|
220 | |
---|
221 | for(i=1; i<=nseqs; ++i) { |
---|
222 | diq = diq + tmat[i][ii]; |
---|
223 | djq = djq + tmat[i][jj]; |
---|
224 | } |
---|
225 | |
---|
226 | dij = tmat[ii][jj]; |
---|
227 | d2r = diq + djq - (2.0*dij); |
---|
228 | dr = sumd - dij -d2r; |
---|
229 | fnseqs2 = fnseqs - 2.0; |
---|
230 | total= d2r+ fnseqs2*dij +dr*2.0; |
---|
231 | total= total / (2.0*fnseqs2); |
---|
232 | smat[ii][jj] = total; |
---|
233 | |
---|
234 | if(total < tmin) { |
---|
235 | tmin = total; |
---|
236 | mini = ii; |
---|
237 | minj = jj; |
---|
238 | } |
---|
239 | } |
---|
240 | |
---|
241 | |
---|
242 | /*.................compute branch lengths and print the results ........*/ |
---|
243 | |
---|
244 | |
---|
245 | dio = djo = 0.0; |
---|
246 | for(i=1; i<=nseqs; ++i) { |
---|
247 | dio = dio + tmat[i][mini]; |
---|
248 | djo = djo + tmat[i][minj]; |
---|
249 | } |
---|
250 | |
---|
251 | dmin = tmat[mini][minj]; |
---|
252 | dio = (dio - dmin) / fnseqs2; |
---|
253 | djo = (djo - dmin) / fnseqs2; |
---|
254 | bi = (dmin + dio - djo) * 0.5; |
---|
255 | bj = dmin - bi; |
---|
256 | bi = bi - av[mini]; |
---|
257 | bj = bj - av[minj]; |
---|
258 | |
---|
259 | if( av[mini] > 0.0 ) |
---|
260 | typei = 0; |
---|
261 | else |
---|
262 | typei = 1; |
---|
263 | if( av[minj] > 0.0 ) |
---|
264 | typej = 0; |
---|
265 | else |
---|
266 | typej = 1; |
---|
267 | |
---|
268 | if(verbose) { |
---|
269 | fprintf(tree,"\n Cycle%4d = ",nc); |
---|
270 | if(typei == 0) |
---|
271 | fprintf(tree,"Node:%4d (%9.5f) joins ",mini,bi); |
---|
272 | else |
---|
273 | fprintf(tree," SEQ:%4d (%9.5f) joins ",mini,bi); |
---|
274 | if(typej == 0) |
---|
275 | fprintf(tree,"Node:%4d (%9.5f)",minj,bj); |
---|
276 | else |
---|
277 | fprintf(tree," SEQ:%4d (%9.5f)",minj,bj); |
---|
278 | fprintf(tree,"\n"); |
---|
279 | } |
---|
280 | |
---|
281 | for(i=1; i<=nseqs; i++) |
---|
282 | tree_description[nc][i] = 0; |
---|
283 | |
---|
284 | if(typei == 0) { |
---|
285 | for(i=nc-1; i>=1; i--) |
---|
286 | if(tree_description[i][mini] == 1) { |
---|
287 | for(j=1; j<=nseqs; j++) |
---|
288 | if(tree_description[i][j] == 1) |
---|
289 | tree_description[nc][j] = 1; |
---|
290 | break; |
---|
291 | } |
---|
292 | } |
---|
293 | else |
---|
294 | tree_description[nc][mini] = 1; |
---|
295 | |
---|
296 | if(typej == 0) { |
---|
297 | for(i=nc-1; i>=1; i--) |
---|
298 | if(tree_description[i][minj] == 1) { |
---|
299 | for(j=1; j<=nseqs; j++) |
---|
300 | if(tree_description[i][j] == 1) |
---|
301 | tree_description[nc][j] = 1; |
---|
302 | break; |
---|
303 | } |
---|
304 | } |
---|
305 | else |
---|
306 | tree_description[nc][minj] = 1; |
---|
307 | |
---|
308 | if(dmin <= 0.0) dmin = 0.0001; |
---|
309 | av[mini] = dmin * 0.5; |
---|
310 | |
---|
311 | /*........................Re-initialisation................................*/ |
---|
312 | |
---|
313 | fnseqs = fnseqs - 1.0; |
---|
314 | kill[minj] = 1; |
---|
315 | |
---|
316 | for(j=1; j<=nseqs; ++j) |
---|
317 | if( kill[j] != 1 ) { |
---|
318 | da = ( tmat[mini][j] + tmat[minj][j] ) * 0.5; |
---|
319 | if( (mini - j) < 0 ) |
---|
320 | tmat[mini][j] = da; |
---|
321 | if( (mini - j) > 0) |
---|
322 | tmat[j][mini] = da; |
---|
323 | } |
---|
324 | |
---|
325 | for(j=1; j<=nseqs; ++j) |
---|
326 | tmat[minj][j] = tmat[j][minj] = 0.0; |
---|
327 | |
---|
328 | |
---|
329 | /****/ } /**end main cycle**/ |
---|
330 | |
---|
331 | /******************************Last Cycle (3 Seqs. left)********************/ |
---|
332 | |
---|
333 | nude = 1; |
---|
334 | |
---|
335 | for(i=1; i<=nseqs; ++i) |
---|
336 | if( kill[i] != 1 ) { |
---|
337 | l[nude] = i; |
---|
338 | nude = nude + 1; |
---|
339 | } |
---|
340 | |
---|
341 | b1 = (tmat[l[1]][l[2]] + tmat[l[1]][l[3]] - tmat[l[2]][l[3]]) * 0.5; |
---|
342 | b2 = tmat[l[1]][l[2]] - b1; |
---|
343 | b3 = tmat[l[1]][l[3]] - b1; |
---|
344 | branch[1] = b1 - av[l[1]]; |
---|
345 | branch[2] = b2 - av[l[2]]; |
---|
346 | branch[3] = b3 - av[l[3]]; |
---|
347 | |
---|
348 | |
---|
349 | for(i=1; i<=nseqs; i++) |
---|
350 | tree_description[nseqs-2][i] = 0; |
---|
351 | |
---|
352 | if(verbose) |
---|
353 | fprintf(tree,"\n Cycle%4d (Last cycle, trichotomy):\n",nc); |
---|
354 | |
---|
355 | for(i=1; i<=3; ++i) { |
---|
356 | if( av[l[i]] > 0.0) { |
---|
357 | if(verbose) |
---|
358 | fprintf(tree,"\n\t\t Node:%4d (%9.5f) ",l[i],branch[i]); |
---|
359 | for(k=nseqs-3; k>=1; k--) |
---|
360 | if(tree_description[k][l[i]] == 1) { |
---|
361 | for(j=1; j<=nseqs; j++) |
---|
362 | if(tree_description[k][j] == 1) |
---|
363 | tree_description[nseqs-2][j] = i; |
---|
364 | break; |
---|
365 | } |
---|
366 | } |
---|
367 | else { |
---|
368 | if(verbose) |
---|
369 | fprintf(tree,"\n\t\t SEQ:%4d (%9.5f) ",l[i],branch[i]); |
---|
370 | tree_description[nseqs-2][l[i]] = i; |
---|
371 | } |
---|
372 | if(i < 3) { |
---|
373 | if(verbose) |
---|
374 | fprintf(tree,"joins"); |
---|
375 | } |
---|
376 | } |
---|
377 | |
---|
378 | if(verbose) |
---|
379 | fprintf(tree,"\n"); |
---|
380 | |
---|
381 | } |
---|
382 | |
---|
383 | |
---|
384 | |
---|
385 | |
---|
386 | void bootstrap_tree() |
---|
387 | { |
---|
388 | int i,j,ranno; |
---|
389 | char path[MAXLINE+1]; |
---|
390 | |
---|
391 | if(empty) { |
---|
392 | error("You must load an alignment first"); |
---|
393 | return; |
---|
394 | } |
---|
395 | |
---|
396 | get_path(seqname, path); |
---|
397 | |
---|
398 | if((phy_tree_file = open_output_file( |
---|
399 | "\nEnter name for bootstrap output file ",path, |
---|
400 | phy_tree_name,"njb")) == NULL) return; |
---|
401 | |
---|
402 | for(i=0;i<MAXN+1;i++) |
---|
403 | boot_totals[i]=0; |
---|
404 | |
---|
405 | for(j=1; j<=seqlen_array[1]; ++j) /* First select all positions for */ |
---|
406 | boot_positions[j] = j; /* the "standard" tree */ |
---|
407 | |
---|
408 | verbose = TRUE; /* Turn on screen output */ |
---|
409 | if(dnaflag) |
---|
410 | dna_distance_matrix(phy_tree_file); |
---|
411 | else |
---|
412 | prot_distance_matrix(phy_tree_file); |
---|
413 | |
---|
414 | verbose = TRUE; /* Turn on screen output */ |
---|
415 | nj_tree(standard_tree, phy_tree_file); /* compute the standard tree */ |
---|
416 | |
---|
417 | fprintf(phy_tree_file,"\n\n\t\t\tBootstrap Confidence Limits\n\n"); |
---|
418 | |
---|
419 | ran_factor = RAND_MAX / seqlen_array[1]; |
---|
420 | |
---|
421 | if(usemenu) |
---|
422 | boot_ran_seed = |
---|
423 | getint("\n\nEnter seed no. for random number generator ",1,1000,boot_ran_seed); |
---|
424 | |
---|
425 | srand(boot_ran_seed); |
---|
426 | fprintf(phy_tree_file,"\n Random number generator seed = %7u\n", |
---|
427 | boot_ran_seed); |
---|
428 | |
---|
429 | if(usemenu) |
---|
430 | boot_ntrials = |
---|
431 | getint("\n\nEnter number of bootstrap trials ",1,10000,boot_ntrials); |
---|
432 | |
---|
433 | fprintf(phy_tree_file,"\n Number of bootstrap trials = %7d\n", |
---|
434 | boot_ntrials); |
---|
435 | |
---|
436 | fprintf(phy_tree_file, |
---|
437 | "\n\n Diagrammatic representation of the above tree: \n"); |
---|
438 | fprintf(phy_tree_file,"\n Each row represents 1 tree cycle;"); |
---|
439 | fprintf(phy_tree_file," defining 2 groups.\n"); |
---|
440 | fprintf(phy_tree_file,"\n Each column is 1 sequence; "); |
---|
441 | fprintf(phy_tree_file,"the stars in each line show 1 group; "); |
---|
442 | fprintf(phy_tree_file,"\n the dots show the other\n"); |
---|
443 | fprintf(phy_tree_file,"\n Numbers show occurrences in bootstrap samples."); |
---|
444 | /* |
---|
445 | print_tree(standard_tree, phy_tree_file, boot_totals); |
---|
446 | */ |
---|
447 | verbose = FALSE; /* Turn OFF screen output */ |
---|
448 | |
---|
449 | fprintf(stdout,"\n\nEach dot represents 10 trials\n\n"); |
---|
450 | for(i=1; i<=boot_ntrials; ++i) { |
---|
451 | for(j=1; j<=seqlen_array[1]; ++j) { /* select alignment */ |
---|
452 | ranno = ( rand() / ran_factor ) + 1; /* positions for */ |
---|
453 | boot_positions[j] = ranno; /* bootstrap sample */ |
---|
454 | } |
---|
455 | if(dnaflag) |
---|
456 | dna_distance_matrix(phy_tree_file); |
---|
457 | else |
---|
458 | prot_distance_matrix(phy_tree_file); |
---|
459 | nj_tree(sample_tree, phy_tree_file); /* compute 1 sample tree */ |
---|
460 | compare_tree(standard_tree, sample_tree, boot_totals, nseqs); |
---|
461 | if(i % 10 == 0) fprintf(stdout,"."); |
---|
462 | if(i % 100 == 0) fprintf(stdout,"\n"); |
---|
463 | } |
---|
464 | |
---|
465 | /* |
---|
466 | fprintf(phy_tree_file,"\n\n Bootstrap totals for each group\n"); |
---|
467 | */ |
---|
468 | print_tree(standard_tree, phy_tree_file, boot_totals); |
---|
469 | |
---|
470 | fclose(phy_tree_file); |
---|
471 | |
---|
472 | fprintf(stdout,"\n\nBootstrap output file completed [%s]" |
---|
473 | ,phy_tree_name); |
---|
474 | } |
---|
475 | |
---|
476 | |
---|
477 | void compare_tree(char **tree1, char **tree2, int *hits, int n) |
---|
478 | { |
---|
479 | int i,j,k; |
---|
480 | int nhits1, nhits2; |
---|
481 | |
---|
482 | for(i=1; i<=n-3; i++) { |
---|
483 | for(j=1; j<=n-3; j++) { |
---|
484 | nhits1 = 0; |
---|
485 | nhits2 = 0; |
---|
486 | for(k=1; k<=n; k++) { |
---|
487 | if(tree1[i][k] == tree2[j][k]) nhits1++; |
---|
488 | if(tree1[i][k] != tree2[j][k]) nhits2++; |
---|
489 | } |
---|
490 | if((nhits1 == nseqs) || (nhits2 == nseqs)) hits[i]++; |
---|
491 | } |
---|
492 | } |
---|
493 | } |
---|
494 | |
---|
495 | |
---|
496 | |
---|
497 | |
---|
498 | void print_tree(char **tree_description, FILE *tree, int *totals) |
---|
499 | { |
---|
500 | int row,col; |
---|
501 | |
---|
502 | fprintf(tree,"\n"); |
---|
503 | |
---|
504 | for(row=1; row<=nseqs-3; row++) { |
---|
505 | fprintf(tree," \n"); |
---|
506 | for(col=1; col<=nseqs; col++) { |
---|
507 | if(tree_description[row][col] == 0) |
---|
508 | fprintf(tree,"*"); |
---|
509 | else |
---|
510 | fprintf(tree,"."); |
---|
511 | } |
---|
512 | if(totals[row] > 0) |
---|
513 | fprintf(tree,"%7d",totals[row]); |
---|
514 | } |
---|
515 | fprintf(tree," \n"); |
---|
516 | for(col=1; col<=nseqs; col++) |
---|
517 | fprintf(tree,"%1d",tree_description[nseqs-2][col]); |
---|
518 | fprintf(tree,"\n"); |
---|
519 | } |
---|
520 | |
---|
521 | |
---|
522 | |
---|
523 | void dna_distance_matrix(FILE *tree) |
---|
524 | { |
---|
525 | int m,n,j,i; |
---|
526 | int res1, res2; |
---|
527 | double p,q,e,a,b,k; |
---|
528 | |
---|
529 | tree_gap_delete(); /* flag positions with gaps (tree_gaps[i] = 1 ) */ |
---|
530 | |
---|
531 | if(verbose) { |
---|
532 | fprintf(tree,"\n"); |
---|
533 | fprintf(tree,"\n DIST = percentage divergence (/100)"); |
---|
534 | fprintf(tree,"\n p = rate of transition (A <-> G; C <-> T)"); |
---|
535 | fprintf(tree,"\n q = rate of transversion"); |
---|
536 | fprintf(tree,"\n Length = number of sites used in comparison"); |
---|
537 | fprintf(tree,"\n"); |
---|
538 | if(tossgaps) { |
---|
539 | fprintf(tree,"\n All sites with gaps (in any sequence) deleted!"); |
---|
540 | fprintf(tree,"\n"); |
---|
541 | } |
---|
542 | if(kimura) { |
---|
543 | fprintf(tree,"\n Distances corrected by Kimura's 2 parameter model:"); |
---|
544 | fprintf(tree,"\n\n Kimura, M. (1980)"); |
---|
545 | fprintf(tree," A simple method for estimating evolutionary "); |
---|
546 | fprintf(tree,"rates of base"); |
---|
547 | fprintf(tree,"\n substitutions through comparative studies of "); |
---|
548 | fprintf(tree,"nucleotide sequences."); |
---|
549 | fprintf(tree,"\n J. Mol. Evol., 16, 111-120."); |
---|
550 | fprintf(tree,"\n\n"); |
---|
551 | } |
---|
552 | } |
---|
553 | |
---|
554 | for(m=1; m<nseqs; ++m) /* for every pair of sequence */ |
---|
555 | for(n=m+1; n<=nseqs; ++n) { |
---|
556 | p = q = e = 0.0; |
---|
557 | tmat[m][n] = tmat[n][m] = 0.0; |
---|
558 | for(i=1; i<=seqlen_array[1]; ++i) { |
---|
559 | j = boot_positions[i]; |
---|
560 | if(tossgaps && (tree_gaps[j] > 0) ) |
---|
561 | goto skip; /* gap position */ |
---|
562 | res1 = seq_array[m][j]; |
---|
563 | res2 = seq_array[n][j]; |
---|
564 | if( (res1 < 1) || (res2 < 1) ) |
---|
565 | goto skip; /* gap in a seq*/ |
---|
566 | e = e + 1.0; |
---|
567 | if(res1 != res2) { |
---|
568 | if(transition(res1,res2)) |
---|
569 | p = p + 1.0; |
---|
570 | else |
---|
571 | q = q + 1.0; |
---|
572 | } |
---|
573 | skip:; |
---|
574 | } |
---|
575 | |
---|
576 | |
---|
577 | /* Kimura's 2 parameter correction for multiple substitutions */ |
---|
578 | |
---|
579 | if(!kimura) { |
---|
580 | k = (p+q)/e; |
---|
581 | if(p > 0.0) |
---|
582 | p = p/e; |
---|
583 | else |
---|
584 | p = 0.0; |
---|
585 | if(q > 0.0) |
---|
586 | q = q/e; |
---|
587 | else |
---|
588 | q = 0.0; |
---|
589 | tmat[m][n] = tmat[n][m] = k; |
---|
590 | if(verbose) /* if screen output */ |
---|
591 | fprintf(tree, |
---|
592 | "%4d vs.%4d: DIST = %7.4f; p = %6.4f; q = %6.4f; length = %6.0f\n" |
---|
593 | ,m,n,k,p,q,e); |
---|
594 | } |
---|
595 | else { |
---|
596 | if(p > 0.0) |
---|
597 | p = p/e; |
---|
598 | else |
---|
599 | p = 0.0; |
---|
600 | if(q > 0.0) |
---|
601 | q = q/e; |
---|
602 | else |
---|
603 | q = 0.0; |
---|
604 | a = 1.0/(1.0-2.0*p-q); |
---|
605 | b = 1.0/(1.0-2.0*q); |
---|
606 | k = 0.5*log(a) + 0.25*log(b); |
---|
607 | tmat[m][n] = tmat[n][m] = k; |
---|
608 | if(verbose) /* if screen output */ |
---|
609 | fprintf(tree, |
---|
610 | "%4d vs.%4d: DIST = %7.4f; p = %6.4f; q = %6.4f; length = %6.0f\n" |
---|
611 | ,m,n,k,p,q,e); |
---|
612 | |
---|
613 | } |
---|
614 | } |
---|
615 | } |
---|
616 | |
---|
617 | |
---|
618 | |
---|
619 | void prot_distance_matrix(FILE *tree) |
---|
620 | { |
---|
621 | int m,n,j,i; |
---|
622 | int res1, res2; |
---|
623 | double p,e,k; |
---|
624 | |
---|
625 | tree_gap_delete(); /* flag positions with gaps (tree_gaps[i] = 1 ) */ |
---|
626 | |
---|
627 | if(verbose) { |
---|
628 | fprintf(tree,"\n"); |
---|
629 | fprintf(tree,"\n DIST = percentage divergence (/100)"); |
---|
630 | fprintf(tree,"\n Length = number of sites used in comparison"); |
---|
631 | fprintf(tree,"\n\n"); |
---|
632 | if(tossgaps) { |
---|
633 | fprintf(tree,"\n All sites with gaps (in any sequence) deleted"); |
---|
634 | fprintf(tree,"\n"); |
---|
635 | } |
---|
636 | if(kimura) { |
---|
637 | fprintf(tree,"\n Distances corrected by Kimura's empirical method:"); |
---|
638 | fprintf(tree,"\n\n Kimura, M. (1983)"); |
---|
639 | fprintf(tree," The Neutral Theory of Molecular Evolution."); |
---|
640 | fprintf(tree,"\n Cambridge University Press, Cambridge, England."); |
---|
641 | fprintf(tree,"\n\n"); |
---|
642 | } |
---|
643 | } |
---|
644 | |
---|
645 | for(m=1; m<nseqs; ++m) /* for every pair of sequence */ |
---|
646 | for(n=m+1; n<=nseqs; ++n) { |
---|
647 | p = e = 0.0; |
---|
648 | tmat[m][n] = tmat[n][m] = 0.0; |
---|
649 | for(i=1; i<=seqlen_array[1]; ++i) { |
---|
650 | j = boot_positions[i]; |
---|
651 | if(tossgaps && (tree_gaps[j] > 0) ) goto skip; /* gap position */ |
---|
652 | res1 = seq_array[m][j]; |
---|
653 | res2 = seq_array[n][j]; |
---|
654 | if( (res1 < 1) || (res2 < 1) ) goto skip; /* gap in a seq*/ |
---|
655 | e = e + 1.0; |
---|
656 | if(res1 != res2) p = p + 1.0; |
---|
657 | skip:; |
---|
658 | } |
---|
659 | |
---|
660 | if(p <= 0.0) |
---|
661 | k = 0.0; |
---|
662 | else |
---|
663 | k = p/e; |
---|
664 | |
---|
665 | if(kimura) |
---|
666 | if(k > 0.0) k = - log(1.0 - k - (k * k/5.0) ); |
---|
667 | |
---|
668 | tmat[m][n] = tmat[n][m] = k; |
---|
669 | if(verbose) /* if screen output */ |
---|
670 | fprintf(tree, |
---|
671 | "%4d vs.%4d DIST = %6.4f; length = %6.0f\n",m,n,k,e); |
---|
672 | } |
---|
673 | } |
---|
674 | |
---|
675 | |
---|