/********* Sequence input routines for CLUSTALV *******************/
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <stdlib.h>
#include "clustalv.h"	


/*
*	Prototypes
*/

extern Boolean linetype(char *,char *);
extern void warning(char *,...);
extern void error(char *,...);
extern char *	rtrim(char *);
extern void 	getstr(char *,char *);

void fill_chartab(void);
int readseqs(int);

static void get_seq(char *,char *,int *,char *);
static void check_infile(int *);
static void p_encode(char *, char *, int);
static void n_encode(char *, char *, int);
static int res_index(char *,char);
static Boolean check_dnaflag(char *, int);
/*
 *	Global variables
 */

extern FILE *fin;
extern Boolean usemenu, dnaflag, explicit_dnaflag;
extern char seqname[];
extern int nseqs;
extern int *seqlen_array;
extern char **names,**titles;
extern char **seq_array;
extern profile1_empty, profile2_empty;

char *amino_acid_codes   =    "ABCDEFGHIKLMNPQRSTUVWXYZ-"; 
char *amino_acid_order   =    "XCSTPAGNDEQHRKMILVFYW";
char *nucleic_acid_order = 	  "NACGTU";
static int seqFormat;
static char chartab[256];
static char *formatNames[] = {"unknown","EMBL/Swiss-Prot","PIR","Pearson"};

void fill_chartab()	/* Create translation and check table */
{
/*	static char valid[]="ABCDEFGHIKLMNPQRSTVWXYZ-";  */
	register int i;
	register char c;
	
	for(i=0;i<256;chartab[i++]=0);
	for(i=0;c=amino_acid_codes[i];i++)
		chartab[c]=chartab[tolower(c)]=c;
}


static void get_seq(char *sname,char *seq,int *len,char *tit)
{
	static char line[MAXLINE+1];
	int i;
        unsigned char c;

	switch(seqFormat) {
		case EMBLSWISS:
			while( !linetype(line,"ID") )
				fgets(line,MAXLINE+1,fin);
			
		strncpy(sname,line+5,MAXNAMES); /* remember entryname */
			sname[MAXNAMES]=EOS;
			rtrim(sname);

/*			while( !linetype(line,"DE") )
				fgets(line,MAXLINE+1,fin);
			strncpy(tit,line+5,MAXTITLES);
			tit[MAXTITLES]=EOS;
			i=strlen(tit);
			if(tit[i-1]=='\n') tit[i-1]=EOS;
*/
			while( !linetype(line,"SQ") )
				fgets(line,MAXLINE+1,fin);
			
			*len=0;
		while(fgets(line,MAXLINE+1,fin)) {
			for(i=0;*len < MAXLEN;i++) {
				c=line[i];
			if(c == '\n' || c == EOS || c == '/')
				break;			/* EOL */
		
			if( (c=chartab[c]))
				seq[++(*len)]=c;
		}
		if(*len == MAXLEN || c == '/') break;
		}
		break;
		
		case PIR:
			while(*line != '>')
				fgets(line,MAXLINE+1,fin);
			
			strncpy(sname,line+4,MAXNAMES);
			sname[MAXNAMES]=EOS;
			for(i=MAXNAMES-1;i > 0;i--) 
				if(isspace(sname[i])) {
					sname[i]=EOS;	
					break;
				}		

			fgets(line,MAXLINE+1,fin);
			strncpy(tit,line,MAXTITLES);
			tit[MAXTITLES]=EOS;
			i=strlen(tit);
			if(tit[i-1]=='\n') tit[i-1]=EOS;
			
			*len=0;
			while(fgets(line,MAXLINE+1,fin)) {
				for(i=0;*len < MAXLEN;i++) {
					c=line[i];
				if(c == '\n' || c == EOS || c == '*')
					break;			/* EOL */
			
				if( (c=chartab[c]))
					seq[++(*len)]=c;
			}
			if(*len == MAXLEN || c == '*') break;
			}
		break;

		case PEARSON:
			while(*line != '>')
				fgets(line,MAXLINE+1,fin);
			
			strncpy(sname,line+1,MAXNAMES);
			sname[MAXNAMES]=EOS;
			for(i=MAXNAMES-1;i > 0;i--) 
				if(isspace(sname[i])) {
					sname[i]=EOS;	
					break;
				}		
			*tit=EOS;
			
			*len=0;
			while(fgets(line,MAXLINE+1,fin)) {
				for(i=0;*len < MAXLEN;i++) {
					c=line[i];
				if(c == '\n' || c == EOS || c == '>')
					break;			/* EOL */
			
				if( (c=chartab[c]))
					seq[++(*len)]=c;
			}
			if(*len == MAXLEN || c == '>') break;
			}
		break;
	}
	
	if(*len == MAXLEN)
		warning("Sequence %s truncated to %d residues",
				sname,MAXLEN);
				
	seq[*len+1]=EOS;
}


int readseqs(int first_seq) /*first_seq is the #no. of the first seq. to read */
{
	char line[FILENAMELEN+1];
	static char seq1[MAXLEN+2],sname1[MAXNAMES+1],title[MAXTITLES+1];
	int i,j,no_seqs;
	static int l1;
	static Boolean dnaflag1;
	
	if(usemenu)
		getstr("Enter the name of the sequence file",line);
	else
		strcpy(line,seqname);
	if(*line == EOS) return -1;
	
	if((fin=fopen(line,"r"))==NULL) {
		error("Could not open sequence file %s",line);
		return -1;      /* DES -1 => file not found */
	}
	strcpy(seqname,line);
	no_seqs=0;
	check_infile(&no_seqs);
	printf("\nSequence format is %s\n",formatNames[seqFormat]);
	if(no_seqs == 0)
		return 0;       /* return the number of seqs. (zero here)*/

	if((no_seqs + first_seq -1) > MAXN) {
		error("Too many sequences. Maximum is %d",MAXN);
		return 0;       /* also return zero if too many */
	}

	for(i=first_seq;i<=first_seq+no_seqs-1;++i) {    /* get the seqs now*/
		get_seq(sname1,seq1,&l1,title);
		seqlen_array[i]=l1;                   /* store the length */
		strcpy(names[i],sname1);              /*    "   "  name   */
		strcpy(titles[i],title);              /*    "   "  title  */

		if(!explicit_dnaflag) {
			dnaflag1 = check_dnaflag(seq1,l1); /* check DNA/Prot */
		        if(i == 1) dnaflag = dnaflag1;
		}			/* type decided by first seq*/
		else
			dnaflag1 = dnaflag;

		if( (!explicit_dnaflag) && (dnaflag1 != dnaflag) )
			warning(
	"Sequence %d [%s] appears to be of different type to sequence 1",
			i,sname1);

		if(dnaflag)
			n_encode(seq1,seq_array[i],l1); /* encode the sequence*/
		else					/* as ints  */
			p_encode(seq1,seq_array[i],l1);
	}

	fclose(fin);
	return no_seqs;    /* return the number of seqs. read in this call */
}


static Boolean check_dnaflag(char *seq, int slen)
/* check if DNA or Protein
   The decision is based on counting all A,C,G,T,U or N. 
   If >= 85% of all characters (except -) are as above => DNA  */
{
	int i, c, nresidues, nbases;
	float ratio;
	
	nresidues = nbases = 0;	
	for(i=1; i <= slen; i++) {
		if(seq[i] != '-') {
			nresidues++;
			if(seq[i] == 'N')
				nbases++;
			else {
				c = res_index(nucleic_acid_order, seq[i]);
				if(c > 0)
					nbases++;
			}
		}
	}
	if( (nbases == 0) || (nresidues == 0) ) return FALSE;
	ratio = (float)nbases/(float)nresidues;
/*
	fprintf(stdout,"\n nbases = %d, nresidues = %d, ratio = %f\n",
		nbases,nresidues,ratio);
*/
	if(ratio >= 0.85) 
		return TRUE;
	else
		return FALSE;
}


static void check_infile(int *nseqs)
{
	char line[MAXLINE+1];
	
	*nseqs=0;
	fgets(line,MAXLINE+1,fin);
	if( linetype(line,"ID") )					/* EMBL/Swiss-Prot format ? */
		seqFormat=EMBLSWISS;
	else if(*line == '>')						/* no */
		seqFormat=(line[3] == ';')?PIR:PEARSON; /* distinguish PIR and Pearson */
	else {
		seqFormat=UNKNOWN;
		return;
	}

	(*nseqs)++;
	
	while(fgets(line,MAXLINE+1,fin) != NULL) {
		switch(seqFormat) {
			case EMBLSWISS:
				if( linetype(line,"ID") )
					(*nseqs)++;
				break;
			case PIR:
			case PEARSON:
				if( *line == '>' )
					(*nseqs)++;
				break;
			default:
				break;
		}
	}
	fseek(fin,0,0);
}

static void p_encode(char *seq, char *naseq, int l)
{					/* code seq as ints .. use -2 for gap */
	register int i;
/*	static char *aacids="CSTPAGNDEQHRKMILVFYW";*/
	
	for(i=1;i<=l;i++)
		if(seq[i] == '-')
			naseq[i] = -2;
		else if(seq[i] == 'X') 
			naseq[i] = 0;
		else
			naseq[i] = res_index(amino_acid_order,seq[i]);
}

static void n_encode(char *seq,char *naseq,int l)
{					/* code seq as ints .. use -2 for gap */
	register int i,c;
/*	static char *nucs="ACGTU";	*/
	
	for(i=1;i<=l;i++) {
    	if(seq[i] == '-')          	   /* if a gap character -> code = -2 */
			naseq[i] = -2;     /* this is the code for a gap in */
		else {                     /* the input files */
			c=res_index(nucleic_acid_order,seq[i]);
			if (c == 5) c=4;
			naseq[i]=c;
		}
	}
}

static int res_index(char *t,char c)
{
	register int i;
	
	for(i=0;t[i] && t[i] != c;i++)
		;
	if(t[i]) return(i);
	else return 0;
}