Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

source: branches/nameserver/GDE/CLUSTALW/malign.c

Visit:

Last change on this file was 176, checked in by jobb, 24 years ago
* empty log message *
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 16.7 KB

Line
1	#include <stdio.h>
2	#include <string.h>
3	#include <ctype.h>
4	#include <stdlib.h>
5	#include "clustalw.h"
6
7
8	/*
9	* Prototypes
10	*/
11
12	/*
13	* Global Variables
14	*/
15
16	extern double **tmat;
17	extern Boolean no_weights;
18	extern sint debug;
19	extern sint max_aa;
20	extern sint nseqs;
21	extern sint profile1_nseqs;
22	extern sint nsets;
23	extern sint **sets;
24	extern sint divergence_cutoff;
25	extern sint *seq_weight;
26	extern sint output_order, *output_index;
27	extern Boolean distance_tree;
28	extern char seqname[];
29	extern sint *seqlen_array;
30	extern char **seq_array;
31
32	sint malign(sint istart,char phylip_name) / full progressive alignment*/
33	{
34	static sint *aligned;
35	static sint *group;
36	static sint ix;
37
38	sint *maxid, max, sum;
39	sint *tree_weight;
40	sint i,j,set,iseq=0;
41	sint status,entries;
42	lint score = 0;
43
44
45	info("Start of Multiple Alignment");
46
47	/* get the phylogenetic tree from .ph /
48
49	if (nseqs >= 2)
50	{
51	status = read_tree(phylip_name, (sint)0, nseqs);
52	if (status == 0) return((sint)0);
53	}
54
55	/* calculate sequence weights according to branch lengths of the tree -
56	weights in global variable seq_weight normalised to sum to 100 */
57
58	calc_seq_weights((sint)0, nseqs, seq_weight);
59
60	/* recalculate tmat matrix as percent similarity matrix */
61
62	status = calc_similarities(nseqs);
63	if (status == 0) return((sint)0);
64
65	/* for each sequence, find the most closely related sequence */
66
67	maxid = (sint )ckalloc( (nseqs+1) sizeof (sint));
68	for (i=1;i<=nseqs;i++)
69	{
70	maxid[i] = -1;
71	for (j=1;j<=nseqs;j++)
72	if (j!=i && maxid[i] < tmat[i][j]) maxid[i] = tmat[i][j];
73	}
74
75	/* group the sequences according to their relative divergence */
76
77	if (istart == 0)
78	{
79	sets = (sint *) ckalloc( (nseqs+1) sizeof (sint *) );
80	for(i=0;i<=nseqs;i++)
81	sets[i] = (sint )ckalloc( (nseqs+1) sizeof (sint) );
82
83	create_sets((sint)0,nseqs);
84	info("There are %d groups",(pint)nsets);
85
86	/* clear the memory used for the phylogenetic tree */
87
88	if (nseqs >= 2)
89	clear_tree(NULL);
90
91	/* start the multiple alignments......... */
92
93	info("Aligning...");
94
95	/* first pass, align closely related sequences first.... */
96
97	ix = 0;
98	aligned = (sint )ckalloc( (nseqs+1) sizeof (sint) );
99	for (i=0;i<=nseqs;i++) aligned[i] = 0;
100
101	for(set=1;set<=nsets;++set)
102	{
103	entries=0;
104	for (i=1;i<=nseqs;i++)
105	{
106	if ((sets[set][i] != 0) && (maxid[i] > divergence_cutoff))
107	{
108	entries++;
109	if (aligned[i] == 0)
110	{
111	if (output_order==INPUT)
112	{
113	++ix;
114	output_index[i] = i;
115	}
116	else output_index[++ix] = i;
117	aligned[i] = 1;
118	}
119	}
120	}
121
122	if(entries > 0) score = prfalign(sets[set], aligned);
123	else score=0.0;
124
125
126	/* negative score means fatal error... exit now! */
127
128	if (score < 0)
129	{
130	return(-1);
131	}
132	if ((entries > 0) && (score > 0))
133	info("Group %d: Sequences:%4d Score:%d",
134	(pint)set,(pint)entries,(pint)score);
135	else
136	info("Group %d: Delayed",
137	(pint)set);
138	}
139
140	for (i=0;i<=nseqs;i++)
141	sets[i]=ckfree((void *)sets[i]);
142	sets=ckfree(sets);
143	}
144	else
145	{
146	/* clear the memory used for the phylogenetic tree */
147
148	if (nseqs >= 2)
149	clear_tree(NULL);
150
151	aligned = (sint )ckalloc( (nseqs+1) sizeof (sint) );
152	ix = 0;
153	for (i=1;i<=istart+1;i++)
154	{
155	aligned[i] = 1;
156	++ix;
157	output_index[i] = i;
158	}
159	for (i=istart+2;i<=nseqs;i++) aligned[i] = 0;
160	}
161
162	/* second pass - align remaining, more divergent sequences..... */
163
164	/* if not all sequences were aligned, for each unaligned sequence,
165	find it's closest pair amongst the aligned sequences. */
166
167	group = (sint )ckalloc( (nseqs+1) sizeof (sint));
168	tree_weight = (sint ) ckalloc( (nseqs) sizeof(sint) );
169	for (i=0;i<nseqs;i++)
170	tree_weight[i] = seq_weight[i];
171
172	/* if we haven't aligned any sequences, in the first pass - align the
173	two most closely related sequences now */
174	if(ix==0)
175	{
176	max = -1;
177	iseq = 0;
178	for (i=1;i<=nseqs;i++)
179	{
180	for (j=i+1;j<=nseqs;j++)
181	{
182	if (max < tmat[i][j])
183	{
184	max = tmat[i][j];
185	iseq = i;
186	}
187	}
188	}
189	aligned[iseq]=1;
190	if (output_order == INPUT)
191	{
192	++ix;
193	output_index[iseq] = iseq;
194	}
195	else
196	output_index[++ix] = iseq;
197	}
198
199	while (ix < nseqs)
200	{
201	for (i=1;i<=nseqs;i++) {
202	if (aligned[i] == 0)
203	{
204	maxid[i] = -1;
205	for (j=1;j<=nseqs;j++)
206	if ((maxid[i] < tmat[i][j]) && (aligned[j] != 0))
207	maxid[i] = tmat[i][j];
208	}
209	}
210	/* find the most closely related sequence to those already aligned */
211
212	max = -1;
213	iseq = 0;
214	for (i=1;i<=nseqs;i++)
215	{
216	if ((aligned[i] == 0) && (maxid[i] > max))
217	{
218	max = maxid[i];
219	iseq = i;
220	}
221	}
222
223
224	/* align this sequence to the existing alignment */
225	/* weight sequences with percent identity with profile*/
226	/* OR...., multiply sequence weights from tree by percent identity with new sequence */
227	if(no_weights==FALSE) {
228	for (j=0;j<nseqs;j++)
229	if (aligned[j+1] != 0)
230	seq_weight[j] = tree_weight[j] * tmat[j+1][iseq];
231	/*
232	Normalise the weights, such that the sum of the weights = INT_SCALE_FACTOR
233	*/
234
235	sum = 0;
236	for (j=0;j<nseqs;j++)
237	if (aligned[j+1] != 0)
238	sum += seq_weight[j];
239	if (sum == 0)
240	{
241	for (j=0;j<nseqs;j++)
242	seq_weight[j] = 1;
243	sum = j;
244	}
245	for (j=0;j<nseqs;j++)
246	if (aligned[j+1] != 0)
247	{
248	seq_weight[j] = (seq_weight[j] * INT_SCALE_FACTOR) / sum;
249	if (seq_weight[j] < 1) seq_weight[j] = 1;
250	}
251	}
252
253	entries = 0;
254	for (j=1;j<=nseqs;j++)
255	if (aligned[j] != 0)
256	{
257	group[j] = 1;
258	entries++;
259	}
260	else if (iseq==j)
261	{
262	group[j] = 2;
263	entries++;
264	}
265	aligned[iseq] = 1;
266
267	score = prfalign(group, aligned);
268	info("Sequence:%d Score:%d",(pint)iseq,(pint)score);
269	if (output_order == INPUT)
270	{
271	++ix;
272	output_index[iseq] = iseq;
273	}
274	else
275	output_index[++ix] = iseq;
276	}
277
278	group=ckfree((void *)group);
279	aligned=ckfree((void *)aligned);
280	maxid=ckfree((void *)maxid);
281	tree_weight=ckfree((void *)tree_weight);
282
283	aln_score();
284
285	/* make the rest (output stuff) into routine clustal_out in file amenu.c */
286
287	return(nseqs);
288
289	}
290
291	sint seqalign(sint istart,char phylip_name) / sequence alignment to existing profile */
292	{
293	static sint aligned, tree_weight;
294	static sint *group;
295	static sint ix;
296
297	sint *maxid, max;
298	sint i,j,status,iseq;
299	sint sum,entries;
300	lint score = 0;
301
302
303	info("Start of Multiple Alignment");
304
305	/* get the phylogenetic tree from .ph /
306
307	if (nseqs >= 2)
308	{
309	status = read_tree(phylip_name, (sint)0, nseqs);
310	if (status == 0) return(0);
311	}
312
313	/* calculate sequence weights according to branch lengths of the tree -
314	weights in global variable seq_weight normalised to sum to 100 */
315
316	calc_seq_weights((sint)0, nseqs, seq_weight);
317
318	tree_weight = (sint ) ckalloc( (nseqs) sizeof(sint) );
319	for (i=0;i<nseqs;i++)
320	tree_weight[i] = seq_weight[i];
321
322	/* recalculate tmat matrix as percent similarity matrix */
323
324	status = calc_similarities(nseqs);
325	if (status == 0) return((sint)0);
326
327	/* for each sequence, find the most closely related sequence */
328
329	maxid = (sint )ckalloc( (nseqs+1) sizeof (sint));
330	for (i=1;i<=nseqs;i++)
331	{
332	maxid[i] = -1;
333	for (j=1;j<=nseqs;j++)
334	if (maxid[i] < tmat[i][j]) maxid[i] = tmat[i][j];
335	}
336
337	/* clear the memory used for the phylogenetic tree */
338
339	if (nseqs >= 2)
340	clear_tree(NULL);
341
342	aligned = (sint )ckalloc( (nseqs+1) sizeof (sint) );
343	ix = 0;
344	for (i=1;i<=istart+1;i++)
345	{
346	aligned[i] = 1;
347	++ix;
348	output_index[i] = i;
349	}
350	for (i=istart+2;i<=nseqs;i++) aligned[i] = 0;
351
352	/* for each unaligned sequence, find it's closest pair amongst the
353	aligned sequences. */
354
355	group = (sint )ckalloc( (nseqs+1) sizeof (sint));
356
357	while (ix < nseqs)
358	{
359	if (ix > 0)
360	{
361	for (i=1;i<=nseqs;i++) {
362	if (aligned[i] == 0)
363	{
364	maxid[i] = -1;
365	for (j=1;j<=nseqs;j++)
366	if ((maxid[i] < tmat[i][j]) && (aligned[j] != 0))
367	maxid[i] = tmat[i][j];
368	}
369	}
370	}
371
372	/* find the most closely related sequence to those already aligned */
373
374	max = -1;
375	for (i=1;i<=nseqs;i++)
376	{
377	if ((aligned[i] == 0) && (maxid[i] > max))
378	{
379	max = maxid[i];
380	iseq = i;
381	}
382	}
383
384	/* align this sequence to the existing alignment */
385
386	entries = 0;
387	for (j=1;j<=nseqs;j++)
388	if (aligned[j] != 0)
389	{
390	group[j] = 1;
391	entries++;
392	}
393	else if (iseq==j)
394	{
395	group[j] = 2;
396	entries++;
397	}
398	aligned[iseq] = 1;
399
400
401	/* EITHER....., set sequence weights equal to percent identity with new sequence */
402	/*
403	for (j=0;j<nseqs;j++)
404	seq_weight[j] = tmat[j+1][iseq];
405	*/
406	/* OR...., multiply sequence weights from tree by percent identity with new sequence */
407	for (j=0;j<nseqs;j++)
408	seq_weight[j] = tree_weight[j] * tmat[j+1][iseq];
409	if (debug>1)
410	for (j=0;j<nseqs;j++) if (group[j+1] == 1)fprintf (stdout,"sequence %d: %d\n", j+1,tree_weight[j]);
411	/*
412	Normalise the weights, such that the sum of the weights = INT_SCALE_FACTOR
413	*/
414
415	sum = 0;
416	for (j=0;j<nseqs;j++)
417	if (group[j+1] == 1) sum += seq_weight[j];
418	if (sum == 0)
419	{
420	for (j=0;j<nseqs;j++)
421	seq_weight[j] = 1;
422	sum = j;
423	}
424	for (j=0;j<nseqs;j++)
425	{
426	seq_weight[j] = (seq_weight[j] * INT_SCALE_FACTOR) / sum;
427	if (seq_weight[j] < 1) seq_weight[j] = 1;
428	}
429
430	if (debug > 1) {
431	fprintf(stdout,"new weights\n");
432	for (j=0;j<nseqs;j++) if (group[j+1] == 1)fprintf( stdout,"sequence %d: %d\n", j+1,seq_weight[j]);
433	}
434
435	score = prfalign(group, aligned);
436	info("Sequence:%d Score:%d",(pint)iseq,(pint)score);
437	if (output_order == INPUT)
438	{
439	++ix;
440	output_index[iseq] = iseq;
441	}
442	else
443	output_index[++ix] = iseq;
444	}
445
446	group=ckfree((void *)group);
447	aligned=ckfree((void *)aligned);
448	maxid=ckfree((void *)maxid);
449
450	aln_score();
451
452	/* make the rest (output stuff) into routine clustal_out in file amenu.c */
453
454	return(nseqs);
455
456	}
457
458
459	sint palign1(void) /* a profile alignment */
460	{
461	sint i,j,temp;
462	sint entries;
463	sint aligned, group;
464	float dscore;
465	lint score;
466
467	info("Start of Initial Alignment");
468
469	/* calculate sequence weights according to branch lengths of the tree -
470	weights in global variable seq_weight normalised to sum to INT_SCALE_FACTOR */
471
472	temp = INT_SCALE_FACTOR/nseqs;
473	for (i=0;i<nseqs;i++) seq_weight[i] = temp;
474
475	distance_tree = FALSE;
476
477	/* do the initial alignment......... */
478
479	group = (sint )ckalloc( (nseqs+1) sizeof (sint));
480
481	for(i=1; i<=profile1_nseqs; ++i)
482	group[i] = 1;
483	for(i=profile1_nseqs+1; i<=nseqs; ++i)
484	group[i] = 2;
485	entries = nseqs;
486
487	aligned = (sint )ckalloc( (nseqs+1) sizeof (sint) );
488	for (i=1;i<=nseqs;i++) aligned[i] = 1;
489
490	score = prfalign(group, aligned);
491	info("Sequences:%d Score:%d",(pint)entries,(pint)score);
492	group=ckfree((void *)group);
493	aligned=ckfree((void *)aligned);
494
495	for (i=1;i<=nseqs;i++) {
496	for (j=i+1;j<=nseqs;j++) {
497	dscore = countid(i,j);
498	tmat[i][j] = ((double)100.0 - (double)dscore)/(double)100.0;
499	tmat[j][i] = tmat[i][j];
500	}
501	}
502
503	return(nseqs);
504	}
505
506	float countid(sint s1, sint s2)
507	{
508	char c1,c2;
509	sint i;
510	sint count,total;
511	float score;
512
513	count = total = 0;
514	for (i=1;i<=seqlen_array[s1] && i<=seqlen_array[s2];i++) {
515	c1 = seq_array[s1][i];
516	c2 = seq_array[s2][i];
517	if ((c1>=0) && (c1<max_aa)) {
518	total++;
519	if (c1 == c2) count++;
520	}
521
522	}
523
524	if(total==0) score=0;
525	else
526	score = 100.0 * (float)count / (float)total;
527	return(score);
528
529	}
530
531	sint palign2(char p1_tree_name,char p2_tree_name) /* a profile alignment */
532	{
533	sint i,j,sum,entries,status;
534	lint score;
535	sint aligned, group;
536	sint maxid,p1_weight,*p2_weight;
537	sint dscore;
538
539	info("Start of Multiple Alignment");
540
541	/* get the phylogenetic trees from .ph /
542
543	if (profile1_nseqs >= 2)
544	{
545	status = read_tree(p1_tree_name, (sint)0, profile1_nseqs);
546	if (status == 0) return(0);
547	}
548
549	/* calculate sequence weights according to branch lengths of the tree -
550	weights in global variable seq_weight normalised to sum to 100 */
551
552	p1_weight = (sint ) ckalloc( (profile1_nseqs) sizeof(sint) );
553
554	calc_seq_weights((sint)0, profile1_nseqs, p1_weight);
555
556	/* clear the memory for the phylogenetic tree */
557
558	if (profile1_nseqs >= 2)
559	clear_tree(NULL);
560
561	if (nseqs-profile1_nseqs >= 2)
562	{
563	status = read_tree(p2_tree_name, profile1_nseqs, nseqs);
564	if (status == 0) return(0);
565	}
566
567	p2_weight = (sint ) ckalloc( (nseqs) sizeof(sint) );
568
569	calc_seq_weights(profile1_nseqs,nseqs, p2_weight);
570
571
572	/* clear the memory for the phylogenetic tree */
573
574	if (nseqs-profile1_nseqs >= 2)
575	clear_tree(NULL);
576
577	/* convert tmat distances to similarities */
578
579	for (i=1;i<nseqs;i++)
580	for (j=i+1;j<=nseqs;j++) {
581	tmat[i][j]=100.0-tmat[i][j]*100.0;
582	tmat[j][i]=tmat[i][j];
583	}
584
585
586	/* weight sequences with max percent identity with other profile*/
587
588	maxid = (sint )ckalloc( (nseqs+1) sizeof (sint));
589	for (i=0;i<profile1_nseqs;i++) {
590	maxid[i] = 0;
591	for (j=profile1_nseqs+1;j<=nseqs;j++)
592	if(maxid[i]<tmat[i+1][j]) maxid[i] = tmat[i+1][j];
593	seq_weight[i] = maxid[i]*p1_weight[i];
594	}
595
596	for (i=profile1_nseqs;i<nseqs;i++) {
597	maxid[i] = -1;
598	for (j=1;j<=profile1_nseqs;j++)
599	if(maxid[i]<tmat[i+1][j]) maxid[i] = tmat[i+1][j];
600	seq_weight[i] = maxid[i]*p2_weight[i];
601	}
602	/*
603	Normalise the weights, such that the sum of the weights = INT_SCALE_FACTOR
604	*/
605
606	sum = 0;
607	for (j=0;j<nseqs;j++)
608	sum += seq_weight[j];
609	if (sum == 0)
610	{
611	for (j=0;j<nseqs;j++)
612	seq_weight[j] = 1;
613	sum = j;
614	}
615	for (j=0;j<nseqs;j++)
616	{
617	seq_weight[j] = (seq_weight[j] * INT_SCALE_FACTOR) / sum;
618	if (seq_weight[j] < 1) seq_weight[j] = 1;
619	}
620	if (debug > 1) {
621	fprintf(stdout,"new weights\n");
622	for (j=0;j<nseqs;j++) fprintf( stdout,"sequence %d: %d\n", j+1,seq_weight[j]);
623	}
624
625
626	/* do the alignment......... */
627
628	info("Aligning...");
629
630	group = (sint )ckalloc( (nseqs+1) sizeof (sint));
631
632	for(i=1; i<=profile1_nseqs; ++i)
633	group[i] = 1;
634	for(i=profile1_nseqs+1; i<=nseqs; ++i)
635	group[i] = 2;
636	entries = nseqs;
637
638	aligned = (sint )ckalloc( (nseqs+1) sizeof (sint) );
639	for (i=1;i<=nseqs;i++) aligned[i] = 1;
640
641	score = prfalign(group, aligned);
642	info("Sequences:%d Score:%d",(pint)entries,(pint)score);
643	group=ckfree((void *)group);
644	p1_weight=ckfree((void *)p1_weight);
645	p2_weight=ckfree((void *)p2_weight);
646	aligned=ckfree((void *)aligned);
647	maxid=ckfree((void *)maxid);
648
649	/* DES output_index = (int )ckalloc( (nseqs+1) sizeof (int)); */
650	for (i=1;i<=nseqs;i++) output_index[i] = i;
651
652	return(nseqs);
653	}
654

Note: See TracBrowser for help on using the repository browser.

Download in other formats: