Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

sequence.c

Visit:

Last change on this file was 19480, checked in by westram, 2 years ago
reintegrates 'gde' into 'trunk' adds: log:branches/gde@19460:19479
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 7.5 KB

Line
1	/******* Sequence input routines for CLUSTALV *****************/
2	#include <stdio.h>
3	#include <string.h>
4	#include <ctype.h>
5	#include <stdlib.h>
6	#include "clustalv.h"
7
8
9	/*
10	* Prototypes
11	*/
12
13	extern Boolean linetype(const char ,const char );
14	extern void warning(const char *,...);
15	extern void error(const char *,...);
16	extern char * rtrim(char *);
17	extern void getstr(const char ,char , int);
18
19	void fill_chartab(void);
20	int readseqs(int);
21
22	static void get_seq(char ,char ,int ,char );
23	static void check_infile(int *);
24	static void p_encode(char , char , int);
25	static void n_encode(char , char , int);
26	static int res_index(const char *,char);
27	static Boolean check_dnaflag(char *, int);
28	/*
29	* Global variables
30	*/
31
32	extern FILE *fin;
33	extern Boolean usemenu, dnaflag, explicit_dnaflag;
34	extern char seqname[];
35	extern int nseqs;
36	extern int *seqlen_array;
37	extern char names,titles;
38	extern char **seq_array;
39	extern int profile1_empty, profile2_empty;
40
41	const char *amino_acid_codes = "ABCDEFGHIKLMNPQRSTUVWXYZ-";
42	const char *amino_acid_order = "XCSTPAGNDEQHRKMILVFYW";
43	const char *nucleic_acid_order = "NACGTU";
44	static int seqFormat;
45	static char chartab[256];
46	static const char *formatNames[] = {"unknown","EMBL/Swiss-Prot","PIR","Pearson"};
47
48	void fill_chartab() /* Create translation and check table */
49	{
50	/* static char valid[]="ABCDEFGHIKLMNPQRSTVWXYZ-"; */
51	register int i;
52	register char c;
53
54	for(i=0;i<256;chartab[i++]=0);
55	for(i=0;c=amino_acid_codes[i];i++)
56	chartab[c]=chartab[tolower(c)]=c;
57	}
58
59
60
61	static void get_seq(char sname,char seq,int len,char tit)
62	{
63	static char line[MAXLINE+1];
64	int i;
65	unsigned char c;
66
67	switch(seqFormat) {
68	case EMBLSWISS:
69	while( !linetype(line,"ID") )
70	fgets(line,MAXLINE+1,fin);
71
72	strncpy(sname,line+5,MAXNAMES); /* remember entryname */
73	sname[MAXNAMES]=EOS;
74	rtrim(sname);
75
76	/* while( !linetype(line,"DE") )
77	fgets(line,MAXLINE+1,fin);
78	strncpy(tit,line+5,MAXTITLES);
79	tit[MAXTITLES]=EOS;
80	i=strlen(tit);
81	if(tit[i-1]=='\n') tit[i-1]=EOS;
82	*/
83	while( !linetype(line,"SQ") )
84	fgets(line,MAXLINE+1,fin);
85
86	*len=0;
87	while(fgets(line,MAXLINE+1,fin)) {
88	for(i=0;*len < MAXLEN;i++) {
89	c=line[i];
90	if(c == '\n' \|\| c == EOS \|\| c == '/')
91	break; /* EOL */
92
93	if( (c=chartab[c]))
94	seq[++(*len)]=c;
95	}
96	if(*len == MAXLEN \|\| c == '/') break;
97	}
98	break;
99
100	case PIR:
101	while(*line != '>')
102	fgets(line,MAXLINE+1,fin);
103
104	strncpy(sname,line+4,MAXNAMES);
105	sname[MAXNAMES]=EOS;
106	for(i=MAXNAMES-1;i > 0;i--)
107	if(isspace(sname[i])) {
108	sname[i]=EOS;
109	break;
110	}
111
112	fgets(line,MAXLINE+1,fin);
113	strncpy(tit,line,MAXTITLES);
114	tit[MAXTITLES]=EOS;
115	i=strlen(tit);
116	if(tit[i-1]=='\n') tit[i-1]=EOS;
117
118	*len=0;
119	while(fgets(line,MAXLINE+1,fin)) {
120	for(i=0;*len < MAXLEN;i++) {
121	c=line[i];
122	if(c == '\n' \|\| c == EOS \|\| c == '*')
123	break; /* EOL */
124
125	if( (c=chartab[c]))
126	seq[++(*len)]=c;
127	}
128	if(len == MAXLEN \|\| c == '') break;
129	}
130	break;
131
132	case PEARSON:
133	while(*line != '>')
134	fgets(line,MAXLINE+1,fin);
135
136	strncpy(sname,line+1,MAXNAMES);
137	sname[MAXNAMES]=EOS;
138	for(i=MAXNAMES-1;i > 0;i--)
139	if(isspace(sname[i])) {
140	sname[i]=EOS;
141	break;
142	}
143	*tit=EOS;
144
145	*len=0;
146	while(fgets(line,MAXLINE+1,fin)) {
147	for(i=0;*len < MAXLEN;i++) {
148	c=line[i];
149	if(c == '\n' \|\| c == EOS \|\| c == '>')
150	break; /* EOL */
151
152	if( (c=chartab[c]))
153	seq[++(*len)]=c;
154	}
155	if(*len == MAXLEN \|\| c == '>') break;
156	}
157	break;
158	}
159
160	if(*len == MAXLEN)
161	warning("Sequence %s truncated to %d residues",
162	sname,MAXLEN);
163
164	seq[*len+1]=EOS;
165	}
166
167
168	int readseqs(int first_seq) /first_seq is the #no. of the first seq. to read /
169	{
170	char line[FILENAMELEN+1];
171	static char seq1[MAXLEN+2],sname1[MAXNAMES+1],title[MAXTITLES+1];
172	int i,no_seqs;
173	static int l1;
174	static Boolean dnaflag1;
175
176	if(usemenu)
177	getstr("Enter the name of the sequence file",line, FILENAMELEN+1);
178	else
179	strcpy(line,seqname);
180	if(*line == EOS) return -1;
181
182	if((fin=fopen(line,"r"))==NULL) {
183	error("Could not open sequence file %s",line);
184	return -1; /* DES -1 => file not found */
185	}
186	strcpy(seqname,line);
187	no_seqs=0;
188	check_infile(&no_seqs);
189	printf("\nSequence format is %s\n",formatNames[seqFormat]);
190	if(no_seqs == 0)
191	return 0; /* return the number of seqs. (zero here)*/
192
193	if((no_seqs + first_seq -1) > MAXN) {
194	error("Too many sequences. Maximum is %d",MAXN);
195	return 0; /* also return zero if too many */
196	}
197
198	for(i=first_seq;i<=first_seq+no_seqs-1;++i) { /* get the seqs now*/
199	get_seq(sname1,seq1,&l1,title);
200	seqlen_array[i]=l1; /* store the length */
201	strcpy(names[i],sname1); /* " " name */
202	strcpy(titles[i],title); /* " " title */
203
204	if(!explicit_dnaflag) {
205	dnaflag1 = check_dnaflag(seq1,l1); /* check DNA/Prot */
206	if(i == 1) dnaflag = dnaflag1;
207	} /* type decided by first seq*/
208	else
209	dnaflag1 = dnaflag;
210
211	if( (!explicit_dnaflag) && (dnaflag1 != dnaflag) )
212	warning(
213	"Sequence %d [%s] appears to be of different type to sequence 1",
214	i,sname1);
215
216	if(dnaflag)
217	n_encode(seq1,seq_array[i],l1); /* encode the sequence*/
218	else /* as ints */
219	p_encode(seq1,seq_array[i],l1);
220	}
221
222	fclose(fin);
223	return no_seqs; /* return the number of seqs. read in this call */
224	}
225
226
227	static Boolean check_dnaflag(char *seq, int slen)
228	/* check if DNA or Protein
229	The decision is based on counting all A,C,G,T,U or N.
230	If >= 85% of all characters (except -) are as above => DNA */
231	{
232	int i, c, nresidues, nbases;
233	float ratio;
234
235	nresidues = nbases = 0;
236	for(i=1; i <= slen; i++) {
237	if(seq[i] != '-') {
238	nresidues++;
239	if(seq[i] == 'N')
240	nbases++;
241	else {
242	c = res_index(nucleic_acid_order, seq[i]);
243	if(c > 0)
244	nbases++;
245	}
246	}
247	}
248	if( (nbases == 0) \|\| (nresidues == 0) ) return FALSE;
249	ratio = (float)nbases/(float)nresidues;
250	/*
251	fprintf(stdout,"\n nbases = %d, nresidues = %d, ratio = %f\n",
252	nbases,nresidues,ratio);
253	*/
254	if(ratio >= 0.85)
255	return TRUE;
256	else
257	return FALSE;
258	}
259
260
261
262	static void check_infile(int *local_nseqs)
263	{
264	char line[MAXLINE+1];
265
266	*local_nseqs=0;
267	fgets(line,MAXLINE+1,fin);
268	if( linetype(line,"ID") ) /* EMBL/Swiss-Prot format ? */
269	seqFormat=EMBLSWISS;
270	else if(line == '>') / no */
271	seqFormat=(line[3] == ';')?PIR:PEARSON; /* distinguish PIR and Pearson */
272	else {
273	seqFormat=UNKNOWN;
274	return;
275	}
276
277	(*local_nseqs)++;
278
279	while(fgets(line,MAXLINE+1,fin) != NULL) {
280	switch(seqFormat) {
281	case EMBLSWISS:
282	if( linetype(line,"ID") )
283	(*local_nseqs)++;
284	break;
285	case PIR:
286	case PEARSON:
287	if( *line == '>' )
288	(*local_nseqs)++;
289	break;
290	default:
291	break;
292	}
293	}
294	fseek(fin,0,0);
295	}
296
297	static void p_encode(char seq, char naseq, int l)
298	{ /* code seq as ints .. use -2 for gap */
299	register int i;
300	/* static char aacids="CSTPAGNDEQHRKMILVFYW";/
301
302	for(i=1;i<=l;i++)
303	if(seq[i] == '-')
304	naseq[i] = -2;
305	else if(seq[i] == 'X')
306	naseq[i] = 0;
307	else
308	naseq[i] = res_index(amino_acid_order,seq[i]);
309	}
310
311	static void n_encode(char seq,char naseq,int l)
312	{ /* code seq as ints .. use -2 for gap */
313	register int i,c;
314	/* static char nucs="ACGTU"; /
315
316	for(i=1;i<=l;i++) {
317	if(seq[i] == '-') /* if a gap character -> code = -2 */
318	naseq[i] = -2; /* this is the code for a gap in */
319	else { /* the input files */
320	c=res_index(nucleic_acid_order,seq[i]);
321	if (c == 5) c=4;
322	naseq[i]=c;
323	}
324	}
325	}
326
327	static int res_index(const char *t,char c)
328	{
329	register int i;
330
331	for(i=0;t[i] && t[i] != c;i++)
332	;
333	if(t[i]) return(i);
334	else return 0;
335	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: branches/lib/GDE/CLUSTAL/sequence.c

Download in other formats: