| 1 | /********* Sequence input routines for CLUSTAL W *******************/ |
|---|
| 2 | /* DES was here. FEB. 1994 */ |
|---|
| 3 | /* Now reads PILEUP/MSF and CLUSTAL alignment files */ |
|---|
| 4 | |
|---|
| 5 | #include <stdio.h> |
|---|
| 6 | #include <string.h> |
|---|
| 7 | #include <ctype.h> |
|---|
| 8 | #include <stdlib.h> |
|---|
| 9 | #include "clustalw.h" |
|---|
| 10 | |
|---|
| 11 | #define MIN(a,b) ((a)<(b)?(a):(b)) |
|---|
| 12 | |
|---|
| 13 | |
|---|
| 14 | |
|---|
| 15 | /* |
|---|
| 16 | * Prototypes |
|---|
| 17 | */ |
|---|
| 18 | |
|---|
| 19 | static char * get_seq(char *,sint *,char *); |
|---|
| 20 | static char * get_clustal_seq(char *,sint *,char *,sint); |
|---|
| 21 | static char * get_msf_seq(char *,sint *,char *,sint); |
|---|
| 22 | static void check_infile(sint *); |
|---|
| 23 | static void p_encode(char *, char *, sint); |
|---|
| 24 | static void n_encode(char *, char *, sint); |
|---|
| 25 | static sint res_index(char *,char); |
|---|
| 26 | static Boolean check_dnaflag(char *, sint); |
|---|
| 27 | static sint count_clustal_seqs(void); |
|---|
| 28 | static sint count_pir_seqs(void); |
|---|
| 29 | static sint count_msf_seqs(void); |
|---|
| 30 | static sint count_rsf_seqs(void); |
|---|
| 31 | static void get_swiss_feature(char *line); |
|---|
| 32 | static void get_rsf_feature(char *line); |
|---|
| 33 | static void get_swiss_mask(char *line); |
|---|
| 34 | static void get_clustal_ss(sint length); |
|---|
| 35 | static void get_embl_ss(sint length); |
|---|
| 36 | static void get_rsf_ss(sint length); |
|---|
| 37 | static void get_gde_ss(sint length); |
|---|
| 38 | static Boolean cl_blankline(char *line); |
|---|
| 39 | |
|---|
| 40 | /* |
|---|
| 41 | * Global variables |
|---|
| 42 | */ |
|---|
| 43 | extern sint max_names; |
|---|
| 44 | extern FILE *fin; |
|---|
| 45 | extern Boolean usemenu, dnaflag, explicit_dnaflag; |
|---|
| 46 | extern Boolean interactive; |
|---|
| 47 | extern char seqname[]; |
|---|
| 48 | extern sint nseqs; |
|---|
| 49 | extern sint *seqlen_array; |
|---|
| 50 | extern sint *output_index; |
|---|
| 51 | extern char **names,**titles; |
|---|
| 52 | extern char **seq_array; |
|---|
| 53 | extern Boolean profile1_empty, profile2_empty; |
|---|
| 54 | extern sint gap_pos2; |
|---|
| 55 | extern sint max_aln_length; |
|---|
| 56 | extern char *gap_penalty_mask, *sec_struct_mask; |
|---|
| 57 | extern sint struct_penalties; |
|---|
| 58 | extern char *ss_name; |
|---|
| 59 | extern sint profile_no; |
|---|
| 60 | extern sint debug; |
|---|
| 61 | |
|---|
| 62 | char *amino_acid_codes = "ABCDEFGHIKLMNPQRSTUVWXYZ-"; /* DES */ |
|---|
| 63 | static sint seqFormat; |
|---|
| 64 | static char chartab[128]; |
|---|
| 65 | static char *formatNames[] = {"unknown","EMBL/Swiss-Prot","PIR", |
|---|
| 66 | "Pearson","GDE","Clustal","Pileup/MSF","RSF","USER"}; |
|---|
| 67 | |
|---|
| 68 | void fill_chartab(void) /* Create translation and check table */ |
|---|
| 69 | { |
|---|
| 70 | register sint i; |
|---|
| 71 | register char c; |
|---|
| 72 | |
|---|
| 73 | for(i=0;i<128;chartab[i++]=0); |
|---|
| 74 | for(i=0;(c=amino_acid_codes[i]);i++) |
|---|
| 75 | chartab[(int)c]=chartab[tolower(c)]=c; |
|---|
| 76 | } |
|---|
| 77 | |
|---|
| 78 | static char * get_msf_seq(char *sname,sint *len,char *tit,sint seqno) |
|---|
| 79 | /* read the seqno_th. sequence from a PILEUP multiple alignment file */ |
|---|
| 80 | { |
|---|
| 81 | static char line[MAXLINE+1]; |
|---|
| 82 | char *seq = NULL; |
|---|
| 83 | sint i,j,k; |
|---|
| 84 | unsigned char c; |
|---|
| 85 | |
|---|
| 86 | fseek(fin,0,0); /* start at the beginning */ |
|---|
| 87 | |
|---|
| 88 | *len=0; /* initialise length to zero */ |
|---|
| 89 | for(i=0;;i++) { |
|---|
| 90 | if(fgets(line,MAXLINE+1,fin)==NULL) return NULL; /* read the title*/ |
|---|
| 91 | if(linetype(line,"//") ) break; /* lines...ignore*/ |
|---|
| 92 | } |
|---|
| 93 | |
|---|
| 94 | while (fgets(line,MAXLINE+1,fin) != NULL) { |
|---|
| 95 | if(!blankline(line)) { |
|---|
| 96 | |
|---|
| 97 | for(i=1;i<seqno;i++) fgets(line,MAXLINE+1,fin); |
|---|
| 98 | for(j=0;j<=strlen(line);j++) if(line[j] != ' ') break; |
|---|
| 99 | for(k=j;k<=strlen(line);k++) if(line[k] == ' ') break; |
|---|
| 100 | strncpy(sname,line+j,MIN(MAXNAMES,k-j)); |
|---|
| 101 | sname[MIN(MAXNAMES,k-j)]=EOS; |
|---|
| 102 | rtrim(sname); |
|---|
| 103 | blank_to_(sname); |
|---|
| 104 | |
|---|
| 105 | if(seq==NULL) |
|---|
| 106 | seq=(char *)ckalloc((MAXLINE+2)*sizeof(char)); |
|---|
| 107 | else |
|---|
| 108 | seq=(char *)ckrealloc(seq,((*len)+MAXLINE+2)*sizeof(char)); |
|---|
| 109 | for(i=k;i<=MAXLINE;i++) { |
|---|
| 110 | c=line[i]; |
|---|
| 111 | if(c == '.' || c == '~' ) c = '-'; |
|---|
| 112 | if(c == '*') c = 'X'; |
|---|
| 113 | if(c == '\n' || c == EOS) break; /* EOL */ |
|---|
| 114 | if( (c=chartab[c])) seq[++(*len)]=c; |
|---|
| 115 | } |
|---|
| 116 | |
|---|
| 117 | for(i=0;;i++) { |
|---|
| 118 | if(fgets(line,MAXLINE+1,fin)==NULL) return seq; |
|---|
| 119 | if(blankline(line)) break; |
|---|
| 120 | } |
|---|
| 121 | } |
|---|
| 122 | } |
|---|
| 123 | return seq; |
|---|
| 124 | } |
|---|
| 125 | |
|---|
| 126 | static Boolean cl_blankline(char *line) |
|---|
| 127 | { |
|---|
| 128 | int i; |
|---|
| 129 | |
|---|
| 130 | if (line[0] == '!') return TRUE; |
|---|
| 131 | |
|---|
| 132 | for(i=0;line[i]!='\n' && line[i]!=EOS;i++) { |
|---|
| 133 | if( isdigit(line[i]) || |
|---|
| 134 | isspace(line[i]) || |
|---|
| 135 | (line[i] == '*') || |
|---|
| 136 | (line[i] == ':') || |
|---|
| 137 | (line[i] == '.')) |
|---|
| 138 | ; |
|---|
| 139 | else |
|---|
| 140 | return FALSE; |
|---|
| 141 | } |
|---|
| 142 | return TRUE; |
|---|
| 143 | } |
|---|
| 144 | |
|---|
| 145 | static char * get_clustal_seq(char *sname,sint *len,char *tit,sint seqno) |
|---|
| 146 | /* read the seqno_th. sequence from a clustal multiple alignment file */ |
|---|
| 147 | { |
|---|
| 148 | static char line[MAXLINE+1]; |
|---|
| 149 | static char tseq[MAXLINE+1]; |
|---|
| 150 | char *seq = NULL; |
|---|
| 151 | sint i,j; |
|---|
| 152 | unsigned char c; |
|---|
| 153 | |
|---|
| 154 | fseek(fin,0,0); /* start at the beginning */ |
|---|
| 155 | |
|---|
| 156 | *len=0; /* initialise length to zero */ |
|---|
| 157 | fgets(line,MAXLINE+1,fin); /* read the title line...ignore it */ |
|---|
| 158 | |
|---|
| 159 | while (fgets(line,MAXLINE+1,fin) != NULL) { |
|---|
| 160 | if(!cl_blankline(line)) { |
|---|
| 161 | |
|---|
| 162 | for(i=1;i<seqno;i++) fgets(line,MAXLINE+1,fin); |
|---|
| 163 | for(j=0;j<=strlen(line);j++) if(line[j] != ' ') break; |
|---|
| 164 | |
|---|
| 165 | sscanf(line,"%s%s",sname,tseq); |
|---|
| 166 | for(j=0;j<MAXNAMES;j++) if(sname[j] == ' ') break; |
|---|
| 167 | sname[j]=EOS; |
|---|
| 168 | rtrim(sname); |
|---|
| 169 | blank_to_(sname); |
|---|
| 170 | |
|---|
| 171 | if(seq==NULL) |
|---|
| 172 | seq=(char *)ckalloc((MAXLINE+2)*sizeof(char)); |
|---|
| 173 | else |
|---|
| 174 | seq=(char *)ckrealloc(seq,((*len)+MAXLINE+2)*sizeof(char)); |
|---|
| 175 | for(i=0;i<=MAXLINE;i++) { |
|---|
| 176 | c=tseq[i]; |
|---|
| 177 | /*if(c == '\n' || c == EOS) break;*/ /* EOL */ |
|---|
| 178 | if(isspace(c) || c == EOS) break; /* EOL */ |
|---|
| 179 | if( (c=chartab[c])) seq[++(*len)]=c; |
|---|
| 180 | } |
|---|
| 181 | |
|---|
| 182 | for(i=0;;i++) { |
|---|
| 183 | if(fgets(line,MAXLINE+1,fin)==NULL) return seq; |
|---|
| 184 | if(cl_blankline(line)) break; |
|---|
| 185 | } |
|---|
| 186 | } |
|---|
| 187 | } |
|---|
| 188 | |
|---|
| 189 | return seq; |
|---|
| 190 | } |
|---|
| 191 | |
|---|
| 192 | static void get_clustal_ss(sint length) |
|---|
| 193 | /* read the structure data from a clustal multiple alignment file */ |
|---|
| 194 | { |
|---|
| 195 | static char title[MAXLINE+1]; |
|---|
| 196 | static char line[MAXLINE+1]; |
|---|
| 197 | static char lin2[MAXLINE+1]; |
|---|
| 198 | static char tseq[MAXLINE+1]; |
|---|
| 199 | static char sname[MAXNAMES+1]; |
|---|
| 200 | sint i,j,len,ix,struct_index=0; |
|---|
| 201 | char c; |
|---|
| 202 | |
|---|
| 203 | |
|---|
| 204 | fseek(fin,0,0); /* start at the beginning */ |
|---|
| 205 | |
|---|
| 206 | len=0; /* initialise length to zero */ |
|---|
| 207 | if (fgets(line,MAXLINE+1,fin) == NULL) return; /* read the title line...ignore it */ |
|---|
| 208 | |
|---|
| 209 | if (fgets(line,MAXLINE+1,fin) == NULL) return; /* read the next line... */ |
|---|
| 210 | /* skip any blank lines */ |
|---|
| 211 | for (;;) { |
|---|
| 212 | if(fgets(line,MAXLINE+1,fin)==NULL) return; |
|---|
| 213 | if(!blankline(line)) break; |
|---|
| 214 | } |
|---|
| 215 | |
|---|
| 216 | /* look for structure table lines */ |
|---|
| 217 | ix = -1; |
|---|
| 218 | for(;;) { |
|---|
| 219 | if(line[0] != '!') break; |
|---|
| 220 | if(strncmp(line,"!SS",3) == 0) { |
|---|
| 221 | ix++; |
|---|
| 222 | sscanf(line+4,"%s%s",sname,tseq); |
|---|
| 223 | for(j=0;j<MAXNAMES;j++) if(sname[j] == ' ') break; |
|---|
| 224 | sname[j]=EOS; |
|---|
| 225 | rtrim(sname); |
|---|
| 226 | blank_to_(sname); |
|---|
| 227 | if (interactive) { |
|---|
| 228 | strcpy(title,"Found secondary structure in alignment file: "); |
|---|
| 229 | strcat(title,sname); |
|---|
| 230 | (*lin2)=prompt_for_yes_no(title,"Use it to set local gap penalties "); |
|---|
| 231 | } |
|---|
| 232 | else (*lin2) = 'y'; |
|---|
| 233 | if ((*lin2 != 'n') && (*lin2 != 'N')) { |
|---|
| 234 | struct_penalties = SECST; |
|---|
| 235 | struct_index = ix; |
|---|
| 236 | for (i=0;i<length;i++) |
|---|
| 237 | { |
|---|
| 238 | sec_struct_mask[i] = '.'; |
|---|
| 239 | gap_penalty_mask[i] = '.'; |
|---|
| 240 | } |
|---|
| 241 | strcpy(ss_name,sname); |
|---|
| 242 | for(i=0;len < length;i++) { |
|---|
| 243 | c = tseq[i]; |
|---|
| 244 | if(c == '\n' || c == EOS) break; /* EOL */ |
|---|
| 245 | if (!isspace(c)) sec_struct_mask[len++] = c; |
|---|
| 246 | } |
|---|
| 247 | } |
|---|
| 248 | } |
|---|
| 249 | else if(strncmp(line,"!GM",3) == 0) { |
|---|
| 250 | ix++; |
|---|
| 251 | sscanf(line+4,"%s%s",sname,tseq); |
|---|
| 252 | for(j=0;j<MAXNAMES;j++) if(sname[j] == ' ') break; |
|---|
| 253 | sname[j]=EOS; |
|---|
| 254 | rtrim(sname); |
|---|
| 255 | blank_to_(sname); |
|---|
| 256 | if (interactive) { |
|---|
| 257 | strcpy(title,"Found gap penalty mask in alignment file: "); |
|---|
| 258 | strcat(title,sname); |
|---|
| 259 | (*lin2)=prompt_for_yes_no(title,"Use it to set local gap penalties "); |
|---|
| 260 | } |
|---|
| 261 | else (*lin2) = 'y'; |
|---|
| 262 | if ((*lin2 != 'n') && (*lin2 != 'N')) { |
|---|
| 263 | struct_penalties = GMASK; |
|---|
| 264 | struct_index = ix; |
|---|
| 265 | for (i=0;i<length;i++) |
|---|
| 266 | gap_penalty_mask[i] = '1'; |
|---|
| 267 | strcpy(ss_name,sname); |
|---|
| 268 | for(i=0;len < length;i++) { |
|---|
| 269 | c = tseq[i]; |
|---|
| 270 | if(c == '\n' || c == EOS) break; /* EOL */ |
|---|
| 271 | if (!isspace(c)) gap_penalty_mask[len++] = c; |
|---|
| 272 | } |
|---|
| 273 | } |
|---|
| 274 | } |
|---|
| 275 | if (struct_penalties != NONE) break; |
|---|
| 276 | if(fgets(line,MAXLINE+1,fin)==NULL) return; |
|---|
| 277 | } |
|---|
| 278 | |
|---|
| 279 | if (struct_penalties == NONE) return; |
|---|
| 280 | |
|---|
| 281 | /* skip any more comment lines */ |
|---|
| 282 | while (line[0] == '!') { |
|---|
| 283 | if(fgets(line,MAXLINE+1,fin)==NULL) return; |
|---|
| 284 | } |
|---|
| 285 | |
|---|
| 286 | /* skip the sequence lines and any comments after the alignment */ |
|---|
| 287 | for (;;) { |
|---|
| 288 | if(isspace(line[0])) break; |
|---|
| 289 | if(fgets(line,MAXLINE+1,fin)==NULL) return; |
|---|
| 290 | } |
|---|
| 291 | |
|---|
| 292 | |
|---|
| 293 | /* read the rest of the alignment */ |
|---|
| 294 | |
|---|
| 295 | for (;;) { |
|---|
| 296 | /* skip any blank lines */ |
|---|
| 297 | for (;;) { |
|---|
| 298 | if(!blankline(line)) break; |
|---|
| 299 | if(fgets(line,MAXLINE+1,fin)==NULL) return; |
|---|
| 300 | } |
|---|
| 301 | /* get structure table line */ |
|---|
| 302 | for(ix=0;ix<struct_index;ix++) { |
|---|
| 303 | if (line[0] != '!') { |
|---|
| 304 | if(struct_penalties == SECST) |
|---|
| 305 | error("bad secondary structure format"); |
|---|
| 306 | else |
|---|
| 307 | error("bad gap penalty mask format"); |
|---|
| 308 | struct_penalties = NONE; |
|---|
| 309 | return; |
|---|
| 310 | } |
|---|
| 311 | if(fgets(line,MAXLINE+1,fin)==NULL) return; |
|---|
| 312 | } |
|---|
| 313 | if(struct_penalties == SECST) { |
|---|
| 314 | if (strncmp(line,"!SS",3) != 0) { |
|---|
| 315 | error("bad secondary structure format"); |
|---|
| 316 | struct_penalties = NONE; |
|---|
| 317 | return; |
|---|
| 318 | } |
|---|
| 319 | sscanf(line+4,"%s%s",sname,tseq); |
|---|
| 320 | for(i=0;len < length;i++) { |
|---|
| 321 | c = tseq[i]; |
|---|
| 322 | if(c == '\n' || c == EOS) break; /* EOL */ |
|---|
| 323 | if (!isspace(c)) sec_struct_mask[len++] = c; |
|---|
| 324 | } |
|---|
| 325 | } |
|---|
| 326 | else if (struct_penalties == GMASK) { |
|---|
| 327 | if (strncmp(line,"!GM",3) != 0) { |
|---|
| 328 | error("bad gap penalty mask format"); |
|---|
| 329 | struct_penalties = NONE; |
|---|
| 330 | return; |
|---|
| 331 | } |
|---|
| 332 | sscanf(line+4,"%s%s",sname,tseq); |
|---|
| 333 | for(i=0;len < length;i++) { |
|---|
| 334 | c = tseq[i]; |
|---|
| 335 | if(c == '\n' || c == EOS) break; /* EOL */ |
|---|
| 336 | if (!isspace(c)) gap_penalty_mask[len++] = c; |
|---|
| 337 | } |
|---|
| 338 | } |
|---|
| 339 | |
|---|
| 340 | /* skip any more comment lines */ |
|---|
| 341 | while (line[0] == '!') { |
|---|
| 342 | if(fgets(line,MAXLINE+1,fin)==NULL) return; |
|---|
| 343 | } |
|---|
| 344 | |
|---|
| 345 | /* skip the sequence lines */ |
|---|
| 346 | for (;;) { |
|---|
| 347 | if(isspace(line[0])) break; |
|---|
| 348 | if(fgets(line,MAXLINE+1,fin)==NULL) return; |
|---|
| 349 | } |
|---|
| 350 | } |
|---|
| 351 | } |
|---|
| 352 | |
|---|
| 353 | static void get_embl_ss(sint length) |
|---|
| 354 | { |
|---|
| 355 | static char title[MAXLINE+1]; |
|---|
| 356 | static char line[MAXLINE+1]; |
|---|
| 357 | static char lin2[MAXLINE+1]; |
|---|
| 358 | static char sname[MAXNAMES+1]; |
|---|
| 359 | sint i; |
|---|
| 360 | |
|---|
| 361 | /* find the start of the sequence entry */ |
|---|
| 362 | for (;;) { |
|---|
| 363 | while( !linetype(line,"ID") ) |
|---|
| 364 | if (fgets(line,MAXLINE+1,fin) == NULL) return; |
|---|
| 365 | |
|---|
| 366 | for(i=5;i<=strlen(line);i++) /* DES */ |
|---|
| 367 | if(line[i] != ' ') break; |
|---|
| 368 | strncpy(sname,line+i,MAXNAMES); /* remember entryname */ |
|---|
| 369 | for(i=0;i<=strlen(sname);i++) |
|---|
| 370 | if(sname[i] == ' ') { |
|---|
| 371 | sname[i]=EOS; |
|---|
| 372 | break; |
|---|
| 373 | } |
|---|
| 374 | sname[MAXNAMES]=EOS; |
|---|
| 375 | rtrim(sname); |
|---|
| 376 | blank_to_(sname); |
|---|
| 377 | |
|---|
| 378 | /* look for secondary structure feature table / gap penalty mask */ |
|---|
| 379 | while(fgets(line,MAXLINE+1,fin) != NULL) { |
|---|
| 380 | if (linetype(line,"FT")) { |
|---|
| 381 | if (interactive) { |
|---|
| 382 | strcpy(title,"Found secondary structure in alignment file: "); |
|---|
| 383 | strcat(title,sname); |
|---|
| 384 | (*lin2)=prompt_for_yes_no(title,"Use it to set local gap penalties "); |
|---|
| 385 | } |
|---|
| 386 | else (*lin2) = 'y'; |
|---|
| 387 | if ((*lin2 != 'n') && (*lin2 != 'N')) { |
|---|
| 388 | struct_penalties = SECST; |
|---|
| 389 | for (i=0;i<length;i++) |
|---|
| 390 | sec_struct_mask[i] = '.'; |
|---|
| 391 | do { |
|---|
| 392 | get_swiss_feature(&line[2]); |
|---|
| 393 | fgets(line,MAXLINE+1,fin); |
|---|
| 394 | } while( linetype(line,"FT") ); |
|---|
| 395 | } |
|---|
| 396 | else { |
|---|
| 397 | do { |
|---|
| 398 | fgets(line,MAXLINE+1,fin); |
|---|
| 399 | } while( linetype(line,"FT") ); |
|---|
| 400 | } |
|---|
| 401 | strcpy(ss_name,sname); |
|---|
| 402 | } |
|---|
| 403 | else if (linetype(line,"GM")) { |
|---|
| 404 | if (interactive) { |
|---|
| 405 | strcpy(title,"Found gap penalty mask in alignment file: "); |
|---|
| 406 | strcat(title,sname); |
|---|
| 407 | (*lin2)=prompt_for_yes_no(title,"Use it to set local gap penalties "); |
|---|
| 408 | } |
|---|
| 409 | else (*lin2) = 'y'; |
|---|
| 410 | if ((*lin2 != 'n') && (*lin2 != 'N')) { |
|---|
| 411 | struct_penalties = GMASK; |
|---|
| 412 | for (i=0;i<length;i++) |
|---|
| 413 | gap_penalty_mask[i] = '1'; |
|---|
| 414 | do { |
|---|
| 415 | get_swiss_mask(&line[2]); |
|---|
| 416 | fgets(line,MAXLINE+1,fin); |
|---|
| 417 | } while( linetype(line,"GM") ); |
|---|
| 418 | } |
|---|
| 419 | else { |
|---|
| 420 | do { |
|---|
| 421 | fgets(line,MAXLINE+1,fin); |
|---|
| 422 | } while( linetype(line,"GM") ); |
|---|
| 423 | } |
|---|
| 424 | strcpy(ss_name,sname); |
|---|
| 425 | } |
|---|
| 426 | else if (linetype(line,"SQ")) |
|---|
| 427 | break; |
|---|
| 428 | |
|---|
| 429 | if (struct_penalties != NONE) break; |
|---|
| 430 | } |
|---|
| 431 | |
|---|
| 432 | } |
|---|
| 433 | |
|---|
| 434 | } |
|---|
| 435 | |
|---|
| 436 | static void get_rsf_ss(sint length) |
|---|
| 437 | { |
|---|
| 438 | static char title[MAXLINE+1]; |
|---|
| 439 | static char line[MAXLINE+1]; |
|---|
| 440 | static char lin2[MAXLINE+1]; |
|---|
| 441 | static char sname[MAXNAMES+1]; |
|---|
| 442 | sint i; |
|---|
| 443 | |
|---|
| 444 | /* skip the comments */ |
|---|
| 445 | while (fgets(line,MAXLINE+1,fin) != NULL) { |
|---|
| 446 | if(line[strlen(line)-2]=='.' && |
|---|
| 447 | line[strlen(line)-3]=='.') |
|---|
| 448 | break; |
|---|
| 449 | } |
|---|
| 450 | |
|---|
| 451 | /* find the start of the sequence entry */ |
|---|
| 452 | for (;;) { |
|---|
| 453 | while (fgets(line,MAXLINE+1,fin) != NULL) |
|---|
| 454 | if( *line == '{' ) break; |
|---|
| 455 | |
|---|
| 456 | while( !keyword(line,"name") ) |
|---|
| 457 | if (fgets(line,MAXLINE+1,fin) == NULL) return; |
|---|
| 458 | |
|---|
| 459 | for(i=5;i<=strlen(line);i++) /* DES */ |
|---|
| 460 | if(line[i] != ' ') break; |
|---|
| 461 | strncpy(sname,line+i,MAXNAMES); /* remember entryname */ |
|---|
| 462 | for(i=0;i<=strlen(sname);i++) |
|---|
| 463 | if(sname[i] == ' ') { |
|---|
| 464 | sname[i]=EOS; |
|---|
| 465 | break; |
|---|
| 466 | } |
|---|
| 467 | sname[MAXNAMES]=EOS; |
|---|
| 468 | rtrim(sname); |
|---|
| 469 | blank_to_(sname); |
|---|
| 470 | |
|---|
| 471 | /* look for secondary structure feature table / gap penalty mask */ |
|---|
| 472 | while(fgets(line,MAXLINE+1,fin) != NULL) { |
|---|
| 473 | if (keyword(line,"feature")) { |
|---|
| 474 | if (interactive) { |
|---|
| 475 | strcpy(title,"Found secondary structure in alignment file: "); |
|---|
| 476 | strcat(title,sname); |
|---|
| 477 | (*lin2)=prompt_for_yes_no(title,"Use it to set local gap penalties "); |
|---|
| 478 | } |
|---|
| 479 | else (*lin2) = 'y'; |
|---|
| 480 | if ((*lin2 != 'n') && (*lin2 != 'N')) { |
|---|
| 481 | struct_penalties = SECST; |
|---|
| 482 | for (i=0;i<length;i++) |
|---|
| 483 | sec_struct_mask[i] = '.'; |
|---|
| 484 | do { |
|---|
| 485 | if(keyword(line,"feature")) |
|---|
| 486 | get_rsf_feature(&line[7]); |
|---|
| 487 | fgets(line,MAXLINE+1,fin); |
|---|
| 488 | } while( !keyword(line,"sequence") ); |
|---|
| 489 | } |
|---|
| 490 | else { |
|---|
| 491 | do { |
|---|
| 492 | fgets(line,MAXLINE+1,fin); |
|---|
| 493 | } while( !keyword(line,"sequence") ); |
|---|
| 494 | } |
|---|
| 495 | strcpy(ss_name,sname); |
|---|
| 496 | } |
|---|
| 497 | else if (keyword(line,"sequence")) |
|---|
| 498 | break; |
|---|
| 499 | |
|---|
| 500 | if (struct_penalties != NONE) break; |
|---|
| 501 | } |
|---|
| 502 | |
|---|
| 503 | } |
|---|
| 504 | |
|---|
| 505 | } |
|---|
| 506 | |
|---|
| 507 | static void get_gde_ss(sint length) |
|---|
| 508 | { |
|---|
| 509 | static char title[MAXLINE+1]; |
|---|
| 510 | static char line[MAXLINE+1]; |
|---|
| 511 | static char lin2[MAXLINE+1]; |
|---|
| 512 | static char sname[MAXNAMES+1]; |
|---|
| 513 | sint i, len, offset = 0; |
|---|
| 514 | unsigned char c; |
|---|
| 515 | |
|---|
| 516 | for (;;) { |
|---|
| 517 | line[0] = '\0'; |
|---|
| 518 | /* search for the next comment line */ |
|---|
| 519 | while(*line != '"') |
|---|
| 520 | if (fgets(line,MAXLINE+1,fin) == NULL) return; |
|---|
| 521 | |
|---|
| 522 | /* is it a secondary structure entry? */ |
|---|
| 523 | if (strncmp(&line[1],"SS_",3) == 0) { |
|---|
| 524 | for (i=1;i<=MAXNAMES-3;i++) { |
|---|
| 525 | if (line[i+3] == '(' || line[i+3] == '\n') |
|---|
| 526 | break; |
|---|
| 527 | sname[i-1] = line[i+3]; |
|---|
| 528 | } |
|---|
| 529 | i--; |
|---|
| 530 | sname[i]=EOS; |
|---|
| 531 | if (sname[i-1] == '(') sscanf(&line[i+3],"%d",&offset); |
|---|
| 532 | else offset = 0; |
|---|
| 533 | for(i--;i > 0;i--) |
|---|
| 534 | if(isspace(sname[i])) { |
|---|
| 535 | sname[i]=EOS; |
|---|
| 536 | } |
|---|
| 537 | else break; |
|---|
| 538 | blank_to_(sname); |
|---|
| 539 | |
|---|
| 540 | if (interactive) { |
|---|
| 541 | strcpy(title,"Found secondary structure in alignment file: "); |
|---|
| 542 | strcat(title,sname); |
|---|
| 543 | (*lin2)=prompt_for_yes_no(title,"Use it to set local gap penalties "); |
|---|
| 544 | } |
|---|
| 545 | else (*lin2) = 'y'; |
|---|
| 546 | if ((*lin2 != 'n') && (*lin2 != 'N')) { |
|---|
| 547 | struct_penalties = SECST; |
|---|
| 548 | for (i=0;i<length;i++) |
|---|
| 549 | sec_struct_mask[i] = '.'; |
|---|
| 550 | len = 0; |
|---|
| 551 | while(fgets(line,MAXLINE+1,fin)) { |
|---|
| 552 | if(*line == '%' || *line == '#' || *line == '"') break; |
|---|
| 553 | for(i=offset;i < length;i++) { |
|---|
| 554 | c=line[i]; |
|---|
| 555 | if(c == '\n' || c == EOS) |
|---|
| 556 | break; /* EOL */ |
|---|
| 557 | sec_struct_mask[len++]=c; |
|---|
| 558 | } |
|---|
| 559 | if (len > length) break; |
|---|
| 560 | } |
|---|
| 561 | strcpy(ss_name,sname); |
|---|
| 562 | } |
|---|
| 563 | } |
|---|
| 564 | /* or is it a gap penalty mask entry? */ |
|---|
| 565 | else if (strncmp(&line[1],"GM_",3) == 0) { |
|---|
| 566 | for (i=1;i<=MAXNAMES-3;i++) { |
|---|
| 567 | if (line[i+3] == '(' || line[i+3] == '\n') |
|---|
| 568 | break; |
|---|
| 569 | sname[i-1] = line[i+3]; |
|---|
| 570 | } |
|---|
| 571 | i--; |
|---|
| 572 | sname[i]=EOS; |
|---|
| 573 | if (sname[i-1] == '(') sscanf(&line[i+3],"%d",&offset); |
|---|
| 574 | else offset = 0; |
|---|
| 575 | for(i--;i > 0;i--) |
|---|
| 576 | if(isspace(sname[i])) { |
|---|
| 577 | sname[i]=EOS; |
|---|
| 578 | } |
|---|
| 579 | else break; |
|---|
| 580 | blank_to_(sname); |
|---|
| 581 | |
|---|
| 582 | if (interactive) { |
|---|
| 583 | strcpy(title,"Found gap penalty mask in alignment file: "); |
|---|
| 584 | strcat(title,sname); |
|---|
| 585 | (*lin2)=prompt_for_yes_no(title,"Use it to set local gap penalties "); |
|---|
| 586 | } |
|---|
| 587 | else (*lin2) = 'y'; |
|---|
| 588 | if ((*lin2 != 'n') && (*lin2 != 'N')) { |
|---|
| 589 | struct_penalties = GMASK; |
|---|
| 590 | for (i=0;i<length;i++) |
|---|
| 591 | gap_penalty_mask[i] = '1'; |
|---|
| 592 | len = 0; |
|---|
| 593 | while(fgets(line,MAXLINE+1,fin)) { |
|---|
| 594 | if(*line == '%' || *line == '#' || *line == '"') break; |
|---|
| 595 | for(i=offset;i < length;i++) { |
|---|
| 596 | c=line[i]; |
|---|
| 597 | if(c == '\n' || c == EOS) |
|---|
| 598 | break; /* EOL */ |
|---|
| 599 | gap_penalty_mask[len++]=c; |
|---|
| 600 | } |
|---|
| 601 | if (len > length) break; |
|---|
| 602 | } |
|---|
| 603 | strcpy(ss_name,sname); |
|---|
| 604 | } |
|---|
| 605 | } |
|---|
| 606 | if (struct_penalties != NONE) break; |
|---|
| 607 | } |
|---|
| 608 | |
|---|
| 609 | } |
|---|
| 610 | |
|---|
| 611 | static void get_swiss_feature(char *line) |
|---|
| 612 | { |
|---|
| 613 | char c, s, feature[MAXLINE+1]; |
|---|
| 614 | int i, start_pos, end_pos; |
|---|
| 615 | |
|---|
| 616 | if (sscanf(line,"%s%d%d",feature,&start_pos,&end_pos) != 3) { |
|---|
| 617 | return; |
|---|
| 618 | } |
|---|
| 619 | |
|---|
| 620 | if (strcmp(feature,"HELIX") == 0) { |
|---|
| 621 | c = 'A'; |
|---|
| 622 | s = '$'; |
|---|
| 623 | } |
|---|
| 624 | else if (strcmp(feature,"STRAND") == 0) { |
|---|
| 625 | c = 'B'; |
|---|
| 626 | s = '%'; |
|---|
| 627 | } |
|---|
| 628 | else |
|---|
| 629 | return; |
|---|
| 630 | |
|---|
| 631 | sec_struct_mask[start_pos-1] = s; |
|---|
| 632 | for (i=start_pos;i<end_pos-1;i++) |
|---|
| 633 | sec_struct_mask[i] = c; |
|---|
| 634 | sec_struct_mask[end_pos-1] = s; |
|---|
| 635 | |
|---|
| 636 | } |
|---|
| 637 | |
|---|
| 638 | static void get_rsf_feature(char *line) |
|---|
| 639 | { |
|---|
| 640 | char c, s; |
|---|
| 641 | char str1[MAXLINE+1],str2[MAXLINE+1],feature[MAXLINE+1]; |
|---|
| 642 | int i, tmp,start_pos, end_pos; |
|---|
| 643 | |
|---|
| 644 | if (sscanf(line,"%d%d%d%s%s%s",&start_pos,&end_pos,&tmp,str1,str2,feature) != 6) { |
|---|
| 645 | return; |
|---|
| 646 | } |
|---|
| 647 | |
|---|
| 648 | if (strcmp(feature,"HELIX") == 0) { |
|---|
| 649 | c = 'A'; |
|---|
| 650 | s = '$'; |
|---|
| 651 | } |
|---|
| 652 | else if (strcmp(feature,"STRAND") == 0) { |
|---|
| 653 | c = 'B'; |
|---|
| 654 | s = '%'; |
|---|
| 655 | } |
|---|
| 656 | else |
|---|
| 657 | return; |
|---|
| 658 | |
|---|
| 659 | sec_struct_mask[start_pos-1] = s; |
|---|
| 660 | for (i=start_pos;i<end_pos-1;i++) |
|---|
| 661 | sec_struct_mask[i] = c; |
|---|
| 662 | sec_struct_mask[end_pos-1] = s; |
|---|
| 663 | |
|---|
| 664 | } |
|---|
| 665 | |
|---|
| 666 | static void get_swiss_mask(char *line) |
|---|
| 667 | { |
|---|
| 668 | int i, value, start_pos, end_pos; |
|---|
| 669 | |
|---|
| 670 | if (sscanf(line,"%d%d%d",&value,&start_pos,&end_pos) != 3) { |
|---|
| 671 | return; |
|---|
| 672 | } |
|---|
| 673 | |
|---|
| 674 | if (value < 1 || value > 9) return; |
|---|
| 675 | |
|---|
| 676 | for (i=start_pos-1;i<end_pos;i++) |
|---|
| 677 | gap_penalty_mask[i] = value+'0'; |
|---|
| 678 | |
|---|
| 679 | } |
|---|
| 680 | |
|---|
| 681 | static char * get_seq(char *sname,sint *len,char *tit) |
|---|
| 682 | { |
|---|
| 683 | static char line[MAXLINE+1]; |
|---|
| 684 | char *seq = NULL; |
|---|
| 685 | sint i, offset = 0; |
|---|
| 686 | unsigned char c=EOS; |
|---|
| 687 | |
|---|
| 688 | switch(seqFormat) { |
|---|
| 689 | |
|---|
| 690 | /************************************/ |
|---|
| 691 | case EMBLSWISS: |
|---|
| 692 | while( !linetype(line,"ID") ) |
|---|
| 693 | fgets(line,MAXLINE+1,fin); |
|---|
| 694 | |
|---|
| 695 | for(i=5;i<=strlen(line);i++) /* DES */ |
|---|
| 696 | if(line[i] != ' ') break; |
|---|
| 697 | strncpy(sname,line+i,MAXNAMES); /* remember entryname */ |
|---|
| 698 | for(i=0;i<=strlen(sname);i++) |
|---|
| 699 | if(sname[i] == ' ') { |
|---|
| 700 | sname[i]=EOS; |
|---|
| 701 | break; |
|---|
| 702 | } |
|---|
| 703 | |
|---|
| 704 | sname[MAXNAMES]=EOS; |
|---|
| 705 | rtrim(sname); |
|---|
| 706 | blank_to_(sname); |
|---|
| 707 | |
|---|
| 708 | |
|---|
| 709 | while( !linetype(line,"SQ") ) |
|---|
| 710 | fgets(line,MAXLINE+1,fin); |
|---|
| 711 | |
|---|
| 712 | *len=0; |
|---|
| 713 | while(fgets(line,MAXLINE+1,fin)) { |
|---|
| 714 | if( strlen(line) > 2 && line[strlen(line)-2]=='.' && line[strlen(line)-3]=='.' ) |
|---|
| 715 | continue; |
|---|
| 716 | if(seq==NULL) |
|---|
| 717 | seq=(char *)ckalloc((MAXLINE+2)*sizeof(char)); |
|---|
| 718 | else |
|---|
| 719 | seq=(char *)ckrealloc(seq,((*len)+MAXLINE+2)*sizeof(char)); |
|---|
| 720 | for(i=0;i<=MAXLINE;i++) { |
|---|
| 721 | c=line[i]; |
|---|
| 722 | if(c == '\n' || c == EOS || c == '/') |
|---|
| 723 | break; /* EOL */ |
|---|
| 724 | if( (c=chartab[c])) |
|---|
| 725 | seq[++(*len)]=c; |
|---|
| 726 | } |
|---|
| 727 | if(c == '/') break; |
|---|
| 728 | } |
|---|
| 729 | break; |
|---|
| 730 | |
|---|
| 731 | /************************************/ |
|---|
| 732 | case PIR: |
|---|
| 733 | while(*line != '>') |
|---|
| 734 | fgets(line,MAXLINE+1,fin); |
|---|
| 735 | for(i=4;i<=strlen(line);i++) /* DES */ |
|---|
| 736 | if(line[i] != ' ') break; |
|---|
| 737 | strncpy(sname,line+i,MAXNAMES); /* remember entryname */ |
|---|
| 738 | sname[MAXNAMES]=EOS; |
|---|
| 739 | rtrim(sname); |
|---|
| 740 | blank_to_(sname); |
|---|
| 741 | |
|---|
| 742 | fgets(line,MAXLINE+1,fin); |
|---|
| 743 | strncpy(tit,line,MAXTITLES); |
|---|
| 744 | tit[MAXTITLES]=EOS; |
|---|
| 745 | i=strlen(tit); |
|---|
| 746 | if(tit[i-1]=='\n') tit[i-1]=EOS; |
|---|
| 747 | |
|---|
| 748 | *len=0; |
|---|
| 749 | while(fgets(line,MAXLINE+1,fin)) { |
|---|
| 750 | if(seq==NULL) |
|---|
| 751 | seq=(char *)ckalloc((MAXLINE+2)*sizeof(char)); |
|---|
| 752 | else |
|---|
| 753 | seq=(char *)ckrealloc(seq,((*len)+MAXLINE+2)*sizeof(char)); |
|---|
| 754 | for(i=0;i<=MAXLINE;i++) { |
|---|
| 755 | c=line[i]; |
|---|
| 756 | if(c == '\n' || c == EOS || c == '*') |
|---|
| 757 | break; /* EOL */ |
|---|
| 758 | |
|---|
| 759 | if( (c=chartab[c])) |
|---|
| 760 | seq[++(*len)]=c; |
|---|
| 761 | } |
|---|
| 762 | if(c == '*') break; |
|---|
| 763 | } |
|---|
| 764 | break; |
|---|
| 765 | /***********************************************/ |
|---|
| 766 | case PEARSON: |
|---|
| 767 | while(*line != '>') |
|---|
| 768 | fgets(line,MAXLINE+1,fin); |
|---|
| 769 | |
|---|
| 770 | for(i=1;i<=strlen(line);i++) /* DES */ |
|---|
| 771 | if(line[i] != ' ') break; |
|---|
| 772 | strncpy(sname,line+i,MAXNAMES); /* remember entryname */ |
|---|
| 773 | for(i=1;i<=strlen(line);i++) /* DES */ |
|---|
| 774 | if(sname[i] == ' ') break; |
|---|
| 775 | sname[i]=EOS; |
|---|
| 776 | rtrim(sname); |
|---|
| 777 | blank_to_(sname); |
|---|
| 778 | |
|---|
| 779 | *tit=EOS; |
|---|
| 780 | |
|---|
| 781 | *len=0; |
|---|
| 782 | while(fgets(line,MAXLINE+1,fin)) { |
|---|
| 783 | if(seq==NULL) |
|---|
| 784 | seq=(char *)ckalloc((MAXLINE+2)*sizeof(char)); |
|---|
| 785 | else |
|---|
| 786 | seq=(char *)ckrealloc(seq,((*len)+MAXLINE+2)*sizeof(char)); |
|---|
| 787 | for(i=0;i<=MAXLINE;i++) { |
|---|
| 788 | c=line[i]; |
|---|
| 789 | if(c == '\n' || c == EOS || c == '>') |
|---|
| 790 | break; /* EOL */ |
|---|
| 791 | |
|---|
| 792 | if( (c=chartab[c])) |
|---|
| 793 | seq[++(*len)]=c; |
|---|
| 794 | } |
|---|
| 795 | if(c == '>') break; |
|---|
| 796 | } |
|---|
| 797 | break; |
|---|
| 798 | /**********************************************/ |
|---|
| 799 | case GDE: |
|---|
| 800 | if (dnaflag) { |
|---|
| 801 | while(*line != '#') |
|---|
| 802 | fgets(line,MAXLINE+1,fin); |
|---|
| 803 | } |
|---|
| 804 | else { |
|---|
| 805 | while(*line != '%') |
|---|
| 806 | fgets(line,MAXLINE+1,fin); |
|---|
| 807 | } |
|---|
| 808 | |
|---|
| 809 | for (i=1;i<=MAXNAMES;i++) { |
|---|
| 810 | if (line[i] == '(' || line[i] == '\n') |
|---|
| 811 | break; |
|---|
| 812 | sname[i-1] = line[i]; |
|---|
| 813 | } |
|---|
| 814 | i--; |
|---|
| 815 | sname[i]=EOS; |
|---|
| 816 | if (sname[i-1] == '(') sscanf(&line[i],"%d",&offset); |
|---|
| 817 | else offset = 0; |
|---|
| 818 | for(i--;i > 0;i--) |
|---|
| 819 | if(isspace(sname[i])) { |
|---|
| 820 | sname[i]=EOS; |
|---|
| 821 | } |
|---|
| 822 | else break; |
|---|
| 823 | blank_to_(sname); |
|---|
| 824 | |
|---|
| 825 | *tit=EOS; |
|---|
| 826 | |
|---|
| 827 | *len=0; |
|---|
| 828 | for (i=0;i<offset;i++) seq[++(*len)] = '-'; |
|---|
| 829 | while(fgets(line,MAXLINE+1,fin)) { |
|---|
| 830 | if(*line == '%' || *line == '#' || *line == '"') break; |
|---|
| 831 | if(seq==NULL) |
|---|
| 832 | seq=(char *)ckalloc((MAXLINE+2)*sizeof(char)); |
|---|
| 833 | else |
|---|
| 834 | seq=(char *)ckrealloc(seq,((*len)+MAXLINE+2)*sizeof(char)); |
|---|
| 835 | for(i=0;i<=MAXLINE;i++) { |
|---|
| 836 | c=line[i]; |
|---|
| 837 | if(c == '\n' || c == EOS) |
|---|
| 838 | break; /* EOL */ |
|---|
| 839 | |
|---|
| 840 | if( (c=chartab[c])) |
|---|
| 841 | seq[++(*len)]=c; |
|---|
| 842 | } |
|---|
| 843 | } |
|---|
| 844 | break; |
|---|
| 845 | /***********************************************/ |
|---|
| 846 | case RSF: |
|---|
| 847 | while(*line != '{') |
|---|
| 848 | if (fgets(line,MAXLINE+1,fin) == NULL) return NULL; |
|---|
| 849 | |
|---|
| 850 | while( !keyword(line,"name") ) |
|---|
| 851 | if (fgets(line,MAXLINE+1,fin) == NULL) return NULL; |
|---|
| 852 | |
|---|
| 853 | for(i=5;i<=strlen(line);i++) /* DES */ |
|---|
| 854 | if(line[i] != ' ') break; |
|---|
| 855 | strncpy(sname,line+i,MAXNAMES); /* remember entryname */ |
|---|
| 856 | for(i=0;i<=strlen(sname);i++) |
|---|
| 857 | if(sname[i] == ' ') { |
|---|
| 858 | sname[i]=EOS; |
|---|
| 859 | break; |
|---|
| 860 | } |
|---|
| 861 | |
|---|
| 862 | sname[MAXNAMES]=EOS; |
|---|
| 863 | rtrim(sname); |
|---|
| 864 | blank_to_(sname); |
|---|
| 865 | |
|---|
| 866 | |
|---|
| 867 | while( !keyword(line,"sequence") ) |
|---|
| 868 | if (fgets(line,MAXLINE+1,fin) == NULL) return NULL; |
|---|
| 869 | |
|---|
| 870 | *len=0; |
|---|
| 871 | while(fgets(line,MAXLINE+1,fin)) { |
|---|
| 872 | if(seq==NULL) |
|---|
| 873 | seq=(char *)ckalloc((MAXLINE+2)*sizeof(char)); |
|---|
| 874 | else |
|---|
| 875 | seq=(char *)ckrealloc(seq,((*len)+MAXLINE+2)*sizeof(char)); |
|---|
| 876 | for(i=0;i<=MAXLINE;i++) { |
|---|
| 877 | c=line[i]; |
|---|
| 878 | if(c == EOS || c == '}') |
|---|
| 879 | break; /* EOL */ |
|---|
| 880 | if( c=='.') |
|---|
| 881 | seq[++(*len)]='-'; |
|---|
| 882 | if( (c=chartab[c])) |
|---|
| 883 | seq[++(*len)]=c; |
|---|
| 884 | } |
|---|
| 885 | if(c == '}') break; |
|---|
| 886 | } |
|---|
| 887 | break; |
|---|
| 888 | /***********************************************/ |
|---|
| 889 | } |
|---|
| 890 | |
|---|
| 891 | seq[*len+1]=EOS; |
|---|
| 892 | |
|---|
| 893 | return seq; |
|---|
| 894 | } |
|---|
| 895 | |
|---|
| 896 | |
|---|
| 897 | sint readseqs(sint first_seq) /*first_seq is the #no. of the first seq. to read */ |
|---|
| 898 | { |
|---|
| 899 | char line[FILENAMELEN+1]; |
|---|
| 900 | static char *seq1,sname1[MAXNAMES+1],title[MAXTITLES+1]; |
|---|
| 901 | sint i,j; |
|---|
| 902 | sint no_seqs; |
|---|
| 903 | static sint l1; |
|---|
| 904 | static Boolean dnaflag1; |
|---|
| 905 | |
|---|
| 906 | if(usemenu) |
|---|
| 907 | getstr("Enter the name of the sequence file",line); |
|---|
| 908 | else |
|---|
| 909 | strcpy(line,seqname); |
|---|
| 910 | if(*line == EOS) return -1; |
|---|
| 911 | |
|---|
| 912 | if((fin=fopen(line,"r"))==NULL) { |
|---|
| 913 | error("Could not open sequence file %s",line); |
|---|
| 914 | return -1; /* DES -1 => file not found */ |
|---|
| 915 | } |
|---|
| 916 | strcpy(seqname,line); |
|---|
| 917 | no_seqs=0; |
|---|
| 918 | check_infile(&no_seqs); |
|---|
| 919 | info("Sequence format is %s",formatNames[seqFormat]); |
|---|
| 920 | |
|---|
| 921 | /* DES DEBUG |
|---|
| 922 | fprintf(stdout,"\n\n File name = %s\n\n",seqname); |
|---|
| 923 | */ |
|---|
| 924 | if(no_seqs == 0) |
|---|
| 925 | return 0; /* return the number of seqs. (zero here)*/ |
|---|
| 926 | |
|---|
| 927 | /* |
|---|
| 928 | if((no_seqs + first_seq -1) > MAXN) { |
|---|
| 929 | error("Too many sequences. Maximum is %d",(pint)MAXN); |
|---|
| 930 | return 0; |
|---|
| 931 | } |
|---|
| 932 | */ |
|---|
| 933 | |
|---|
| 934 | /* DES */ |
|---|
| 935 | /* if(seqFormat == CLUSTAL) { |
|---|
| 936 | info("no of sequences = %d",(pint)no_seqs); |
|---|
| 937 | return no_seqs; |
|---|
| 938 | } |
|---|
| 939 | */ |
|---|
| 940 | max_aln_length = 0; |
|---|
| 941 | |
|---|
| 942 | /* if this is a multiple alignment, or profile 1 - free any memory used |
|---|
| 943 | by previous alignments, then allocate memory for the new alignment */ |
|---|
| 944 | if(first_seq == 1) { |
|---|
| 945 | max_names = 0; |
|---|
| 946 | free_aln(nseqs); |
|---|
| 947 | alloc_aln(no_seqs); |
|---|
| 948 | } |
|---|
| 949 | /* otherwise, this is a profile 2, and we need to reallocate the arrays, |
|---|
| 950 | leaving the data for profile 1 intact */ |
|---|
| 951 | else realloc_aln(first_seq,no_seqs); |
|---|
| 952 | |
|---|
| 953 | for(i=1;i<first_seq;i++) |
|---|
| 954 | { |
|---|
| 955 | if(seqlen_array[i]>max_aln_length) |
|---|
| 956 | max_aln_length=seqlen_array[i]; |
|---|
| 957 | if(strlen(names[i])>max_names) |
|---|
| 958 | max_names=strlen(names[i]); |
|---|
| 959 | } |
|---|
| 960 | |
|---|
| 961 | for(i=first_seq;i<=first_seq+no_seqs-1;i++) { /* get the seqs now*/ |
|---|
| 962 | output_index[i] = i; /* default output order */ |
|---|
| 963 | if(seqFormat == CLUSTAL) |
|---|
| 964 | seq1=get_clustal_seq(sname1,&l1,title,i-first_seq+1); |
|---|
| 965 | else if(seqFormat == MSF) |
|---|
| 966 | seq1=get_msf_seq(sname1,&l1,title,i-first_seq+1); |
|---|
| 967 | else |
|---|
| 968 | seq1=get_seq(sname1,&l1,title); |
|---|
| 969 | /* JULIE */ |
|---|
| 970 | /* Set max length of dynamically allocated arrays in prfalign.c */ |
|---|
| 971 | if (l1 > max_aln_length) max_aln_length = l1; |
|---|
| 972 | seqlen_array[i]=l1; /* store the length */ |
|---|
| 973 | strcpy(names[i],sname1); /* " " name */ |
|---|
| 974 | strcpy(titles[i],title); /* " " title */ |
|---|
| 975 | |
|---|
| 976 | if(!explicit_dnaflag) { |
|---|
| 977 | dnaflag1 = check_dnaflag(seq1,l1); /* check DNA/Prot */ |
|---|
| 978 | if(i == 1) dnaflag = dnaflag1; |
|---|
| 979 | } /* type decided by first seq*/ |
|---|
| 980 | else |
|---|
| 981 | dnaflag1 = dnaflag; |
|---|
| 982 | |
|---|
| 983 | alloc_seq(i,l1); |
|---|
| 984 | |
|---|
| 985 | if(dnaflag) |
|---|
| 986 | n_encode(seq1,seq_array[i],l1); /* encode the sequence*/ |
|---|
| 987 | else /* as ints */ |
|---|
| 988 | p_encode(seq1,seq_array[i],l1); |
|---|
| 989 | if(seq1!=NULL) seq1=ckfree(seq1); |
|---|
| 990 | } |
|---|
| 991 | |
|---|
| 992 | |
|---|
| 993 | max_aln_length *= 2; |
|---|
| 994 | /* |
|---|
| 995 | JULIE |
|---|
| 996 | check sequence names are all different - otherwise phylip tree is |
|---|
| 997 | confused. |
|---|
| 998 | */ |
|---|
| 999 | for(i=1;i<=first_seq+no_seqs-1;i++) { |
|---|
| 1000 | for(j=i+1;j<=first_seq+no_seqs-1;j++) { |
|---|
| 1001 | if (strncmp(names[i],names[j],MAXNAMES) == 0) { |
|---|
| 1002 | error("Multiple sequences found with same name, %s (first %d chars are significant)", names[i],MAXNAMES); |
|---|
| 1003 | return -1; |
|---|
| 1004 | } |
|---|
| 1005 | } |
|---|
| 1006 | } |
|---|
| 1007 | for(i=first_seq;i<=first_seq+no_seqs-1;i++) |
|---|
| 1008 | { |
|---|
| 1009 | if(seqlen_array[i]>max_aln_length) |
|---|
| 1010 | max_aln_length=seqlen_array[i]; |
|---|
| 1011 | } |
|---|
| 1012 | |
|---|
| 1013 | /* look for a feature table / gap penalty mask (only if this is a profile) */ |
|---|
| 1014 | if (profile_no > 0) { |
|---|
| 1015 | rewind(fin); |
|---|
| 1016 | struct_penalties = NONE; |
|---|
| 1017 | gap_penalty_mask = (char *)ckalloc((max_aln_length+1) * sizeof (char)); |
|---|
| 1018 | sec_struct_mask = (char *)ckalloc((max_aln_length+1) * sizeof (char)); |
|---|
| 1019 | ss_name = (char *)ckalloc((MAXNAMES+1) * sizeof (char)); |
|---|
| 1020 | |
|---|
| 1021 | if (seqFormat == CLUSTAL) { |
|---|
| 1022 | get_clustal_ss(max_aln_length); |
|---|
| 1023 | } |
|---|
| 1024 | else if (seqFormat == GDE) { |
|---|
| 1025 | get_gde_ss(max_aln_length); |
|---|
| 1026 | } |
|---|
| 1027 | else if (seqFormat == EMBLSWISS) { |
|---|
| 1028 | get_embl_ss(max_aln_length); |
|---|
| 1029 | } |
|---|
| 1030 | else if (seqFormat == RSF) { |
|---|
| 1031 | get_rsf_ss(max_aln_length); |
|---|
| 1032 | } |
|---|
| 1033 | } |
|---|
| 1034 | |
|---|
| 1035 | for(i=first_seq;i<=first_seq+no_seqs-1;i++) |
|---|
| 1036 | { |
|---|
| 1037 | if(strlen(names[i])>max_names) |
|---|
| 1038 | max_names=strlen(names[i]); |
|---|
| 1039 | } |
|---|
| 1040 | |
|---|
| 1041 | if(max_names<10) max_names=10; |
|---|
| 1042 | |
|---|
| 1043 | fclose(fin); |
|---|
| 1044 | |
|---|
| 1045 | return no_seqs; /* return the number of seqs. read in this call */ |
|---|
| 1046 | } |
|---|
| 1047 | |
|---|
| 1048 | |
|---|
| 1049 | static Boolean check_dnaflag(char *seq, sint slen) |
|---|
| 1050 | /* check if DNA or Protein |
|---|
| 1051 | The decision is based on counting all A,C,G,T,U or N. |
|---|
| 1052 | If >= 85% of all characters (except -) are as above => DNA */ |
|---|
| 1053 | { |
|---|
| 1054 | sint i, c, nresidues, nbases; |
|---|
| 1055 | float ratio; |
|---|
| 1056 | char *dna_codes="ACGTUN"; |
|---|
| 1057 | |
|---|
| 1058 | nresidues = nbases = 0; |
|---|
| 1059 | for(i=1; i <= slen; i++) { |
|---|
| 1060 | if(seq[i] != '-') { |
|---|
| 1061 | nresidues++; |
|---|
| 1062 | if(seq[i] == 'N') |
|---|
| 1063 | nbases++; |
|---|
| 1064 | else { |
|---|
| 1065 | c = res_index(dna_codes, seq[i]); |
|---|
| 1066 | if(c >= 0) |
|---|
| 1067 | nbases++; |
|---|
| 1068 | } |
|---|
| 1069 | } |
|---|
| 1070 | } |
|---|
| 1071 | if( (nbases == 0) || (nresidues == 0) ) return FALSE; |
|---|
| 1072 | ratio = (float)nbases/(float)nresidues; |
|---|
| 1073 | /* DES fprintf(stdout,"\n nbases = %d, nresidues = %d, ratio = %f\n", |
|---|
| 1074 | (pint)nbases,(pint)nresidues,(pint)ratio); */ |
|---|
| 1075 | if(ratio >= 0.85) |
|---|
| 1076 | return TRUE; |
|---|
| 1077 | else |
|---|
| 1078 | return FALSE; |
|---|
| 1079 | } |
|---|
| 1080 | |
|---|
| 1081 | |
|---|
| 1082 | |
|---|
| 1083 | static void check_infile(sint *nseqs) |
|---|
| 1084 | { |
|---|
| 1085 | char line[MAXLINE+1]; |
|---|
| 1086 | sint i; |
|---|
| 1087 | |
|---|
| 1088 | *nseqs=0; |
|---|
| 1089 | while (fgets(line,MAXLINE+1,fin) != NULL) { |
|---|
| 1090 | if(!blankline(line)) |
|---|
| 1091 | break; |
|---|
| 1092 | } |
|---|
| 1093 | |
|---|
| 1094 | for(i=strlen(line)-1;i>=0;i--) |
|---|
| 1095 | if(isgraph(line[i])) break; |
|---|
| 1096 | line[i+1]=EOS; |
|---|
| 1097 | |
|---|
| 1098 | for(i=0;i<=6;i++) line[i] = toupper(line[i]); |
|---|
| 1099 | |
|---|
| 1100 | if( linetype(line,"ID") ) { /* EMBL/Swiss-Prot format ? */ |
|---|
| 1101 | seqFormat=EMBLSWISS; |
|---|
| 1102 | (*nseqs)++; |
|---|
| 1103 | } |
|---|
| 1104 | else if( linetype(line,"CLUSTAL") ) { |
|---|
| 1105 | seqFormat=CLUSTAL; |
|---|
| 1106 | } |
|---|
| 1107 | else if( linetype(line,"PILEUP") ) { |
|---|
| 1108 | seqFormat = MSF; |
|---|
| 1109 | } |
|---|
| 1110 | else if( linetype(line,"!!AA_MULTIPLE_ALIGNMENT") ) { |
|---|
| 1111 | seqFormat = MSF; |
|---|
| 1112 | dnaflag = FALSE; |
|---|
| 1113 | explicit_dnaflag = TRUE; |
|---|
| 1114 | } |
|---|
| 1115 | else if( linetype(line,"!!NA_MULTIPLE_ALIGNMENT") ) { |
|---|
| 1116 | seqFormat = MSF; |
|---|
| 1117 | dnaflag = TRUE; |
|---|
| 1118 | explicit_dnaflag = TRUE; |
|---|
| 1119 | } |
|---|
| 1120 | else if( strstr(line,"MSF") && line[strlen(line)-1]=='.' && |
|---|
| 1121 | line[strlen(line)-2]=='.' ) { |
|---|
| 1122 | seqFormat = MSF; |
|---|
| 1123 | } |
|---|
| 1124 | else if( linetype(line,"!!RICH_SEQUENCE") ) { |
|---|
| 1125 | seqFormat = RSF; |
|---|
| 1126 | } |
|---|
| 1127 | else if(*line == '>') { /* no */ |
|---|
| 1128 | seqFormat=(line[3] == ';')?PIR:PEARSON; /* distinguish PIR and Pearson */ |
|---|
| 1129 | (*nseqs)++; |
|---|
| 1130 | } |
|---|
| 1131 | else if((*line == '"') || (*line == '%') || (*line == '#')) { |
|---|
| 1132 | seqFormat=GDE; /* GDE format */ |
|---|
| 1133 | if (*line == '%') { |
|---|
| 1134 | (*nseqs)++; |
|---|
| 1135 | dnaflag = FALSE; |
|---|
| 1136 | explicit_dnaflag = TRUE; |
|---|
| 1137 | } |
|---|
| 1138 | else if (*line == '#') { |
|---|
| 1139 | (*nseqs)++; |
|---|
| 1140 | dnaflag = TRUE; |
|---|
| 1141 | explicit_dnaflag = TRUE; |
|---|
| 1142 | } |
|---|
| 1143 | } |
|---|
| 1144 | else { |
|---|
| 1145 | seqFormat=UNKNOWN; |
|---|
| 1146 | return; |
|---|
| 1147 | } |
|---|
| 1148 | |
|---|
| 1149 | while(fgets(line,MAXLINE+1,fin) != NULL) { |
|---|
| 1150 | switch(seqFormat) { |
|---|
| 1151 | case EMBLSWISS: |
|---|
| 1152 | if( linetype(line,"ID") ) |
|---|
| 1153 | (*nseqs)++; |
|---|
| 1154 | break; |
|---|
| 1155 | case PIR: |
|---|
| 1156 | *nseqs = count_pir_seqs(); |
|---|
| 1157 | fseek(fin,0,0); |
|---|
| 1158 | return; |
|---|
| 1159 | case PEARSON: |
|---|
| 1160 | if( *line == '>' ) |
|---|
| 1161 | (*nseqs)++; |
|---|
| 1162 | break; |
|---|
| 1163 | case GDE: |
|---|
| 1164 | if(( *line == '%' ) && ( dnaflag == FALSE)) |
|---|
| 1165 | (*nseqs)++; |
|---|
| 1166 | else if (( *line == '#') && ( dnaflag == TRUE)) |
|---|
| 1167 | (*nseqs)++; |
|---|
| 1168 | break; |
|---|
| 1169 | case CLUSTAL: |
|---|
| 1170 | *nseqs = count_clustal_seqs(); |
|---|
| 1171 | /* DES */ /* fprintf(stdout,"\nnseqs = %d\n",(pint)*nseqs); */ |
|---|
| 1172 | fseek(fin,0,0); |
|---|
| 1173 | return; |
|---|
| 1174 | case MSF: |
|---|
| 1175 | *nseqs = count_msf_seqs(); |
|---|
| 1176 | fseek(fin,0,0); |
|---|
| 1177 | return; |
|---|
| 1178 | case RSF: |
|---|
| 1179 | fseek(fin,0,0); |
|---|
| 1180 | *nseqs = count_rsf_seqs(); |
|---|
| 1181 | fseek(fin,0,0); |
|---|
| 1182 | return; |
|---|
| 1183 | case USER: |
|---|
| 1184 | default: |
|---|
| 1185 | break; |
|---|
| 1186 | } |
|---|
| 1187 | } |
|---|
| 1188 | fseek(fin,0,0); |
|---|
| 1189 | } |
|---|
| 1190 | |
|---|
| 1191 | |
|---|
| 1192 | static sint count_pir_seqs(void) |
|---|
| 1193 | /* count the number of sequences in a pir alignment file */ |
|---|
| 1194 | { |
|---|
| 1195 | char line[MAXLINE+1],c; |
|---|
| 1196 | sint nseqs, i; |
|---|
| 1197 | Boolean seq_ok; |
|---|
| 1198 | |
|---|
| 1199 | seq_ok = FALSE; |
|---|
| 1200 | while (fgets(line,MAXLINE+1,fin) != NULL) { /* Look for end of first seq */ |
|---|
| 1201 | if(*line == '>') break; |
|---|
| 1202 | for(i=0;seq_ok == FALSE;i++) { |
|---|
| 1203 | c=line[i]; |
|---|
| 1204 | if(c == '*') { |
|---|
| 1205 | seq_ok = TRUE; /* ok - end of sequence found */ |
|---|
| 1206 | break; |
|---|
| 1207 | } /* EOL */ |
|---|
| 1208 | if(c == '\n' || c == EOS) |
|---|
| 1209 | break; /* EOL */ |
|---|
| 1210 | } |
|---|
| 1211 | if (seq_ok == TRUE) |
|---|
| 1212 | break; |
|---|
| 1213 | } |
|---|
| 1214 | if (seq_ok == FALSE) { |
|---|
| 1215 | error("PIR format sequence end marker '*'\nmissing for one or more sequences."); |
|---|
| 1216 | return (sint)0; /* funny format*/ |
|---|
| 1217 | } |
|---|
| 1218 | |
|---|
| 1219 | |
|---|
| 1220 | nseqs = 1; |
|---|
| 1221 | |
|---|
| 1222 | while (fgets(line,MAXLINE+1,fin) != NULL) { |
|---|
| 1223 | if(*line == '>') { /* Look for start of next seq */ |
|---|
| 1224 | seq_ok = FALSE; |
|---|
| 1225 | while (fgets(line,MAXLINE+1,fin) != NULL) { /* Look for end of seq */ |
|---|
| 1226 | if(*line == '>') { |
|---|
| 1227 | error("PIR format sequence end marker '*' missing for one or more sequences."); |
|---|
| 1228 | return (sint)0; /* funny format*/ |
|---|
| 1229 | } |
|---|
| 1230 | for(i=0;seq_ok == FALSE;i++) { |
|---|
| 1231 | c=line[i]; |
|---|
| 1232 | if(c == '*') { |
|---|
| 1233 | seq_ok = TRUE; /* ok - sequence found */ |
|---|
| 1234 | break; |
|---|
| 1235 | } /* EOL */ |
|---|
| 1236 | if(c == '\n' || c == EOS) |
|---|
| 1237 | break; /* EOL */ |
|---|
| 1238 | } |
|---|
| 1239 | if (seq_ok == TRUE) { |
|---|
| 1240 | nseqs++; |
|---|
| 1241 | break; |
|---|
| 1242 | } |
|---|
| 1243 | } |
|---|
| 1244 | } |
|---|
| 1245 | } |
|---|
| 1246 | return (sint)nseqs; |
|---|
| 1247 | } |
|---|
| 1248 | |
|---|
| 1249 | |
|---|
| 1250 | static sint count_clustal_seqs(void) |
|---|
| 1251 | /* count the number of sequences in a clustal alignment file */ |
|---|
| 1252 | { |
|---|
| 1253 | char line[MAXLINE+1]; |
|---|
| 1254 | sint nseqs; |
|---|
| 1255 | |
|---|
| 1256 | while (fgets(line,MAXLINE+1,fin) != NULL) { |
|---|
| 1257 | if(!cl_blankline(line)) break; /* Look for next non- */ |
|---|
| 1258 | } /* blank line */ |
|---|
| 1259 | nseqs = 1; |
|---|
| 1260 | |
|---|
| 1261 | while (fgets(line,MAXLINE+1,fin) != NULL) { |
|---|
| 1262 | if(cl_blankline(line)) return nseqs; |
|---|
| 1263 | nseqs++; |
|---|
| 1264 | } |
|---|
| 1265 | |
|---|
| 1266 | return (sint)0; /* if you got to here-funny format/no seqs.*/ |
|---|
| 1267 | } |
|---|
| 1268 | |
|---|
| 1269 | static sint count_msf_seqs(void) |
|---|
| 1270 | { |
|---|
| 1271 | /* count the number of sequences in a PILEUP alignment file */ |
|---|
| 1272 | |
|---|
| 1273 | char line[MAXLINE+1]; |
|---|
| 1274 | sint nseqs; |
|---|
| 1275 | |
|---|
| 1276 | while (fgets(line,MAXLINE+1,fin) != NULL) { |
|---|
| 1277 | if(linetype(line,"//")) break; |
|---|
| 1278 | } |
|---|
| 1279 | |
|---|
| 1280 | while (fgets(line,MAXLINE+1,fin) != NULL) { |
|---|
| 1281 | if(!blankline(line)) break; /* Look for next non- */ |
|---|
| 1282 | } /* blank line */ |
|---|
| 1283 | nseqs = 1; |
|---|
| 1284 | |
|---|
| 1285 | while (fgets(line,MAXLINE+1,fin) != NULL) { |
|---|
| 1286 | if(blankline(line)) return nseqs; |
|---|
| 1287 | nseqs++; |
|---|
| 1288 | } |
|---|
| 1289 | |
|---|
| 1290 | return (sint)0; /* if you got to here-funny format/no seqs.*/ |
|---|
| 1291 | } |
|---|
| 1292 | |
|---|
| 1293 | static sint count_rsf_seqs(void) |
|---|
| 1294 | { |
|---|
| 1295 | /* count the number of sequences in a GCG RSF alignment file */ |
|---|
| 1296 | |
|---|
| 1297 | char line[MAXLINE+1]; |
|---|
| 1298 | sint nseqs; |
|---|
| 1299 | |
|---|
| 1300 | nseqs = 0; |
|---|
| 1301 | /* skip the comments */ |
|---|
| 1302 | while (fgets(line,MAXLINE+1,fin) != NULL) { |
|---|
| 1303 | if(line[strlen(line)-2]=='.' && |
|---|
| 1304 | line[strlen(line)-3]=='.') |
|---|
| 1305 | break; |
|---|
| 1306 | } |
|---|
| 1307 | |
|---|
| 1308 | while (fgets(line,MAXLINE+1,fin) != NULL) { |
|---|
| 1309 | if( *line == '{' ) |
|---|
| 1310 | nseqs++; |
|---|
| 1311 | } |
|---|
| 1312 | return (sint)nseqs; |
|---|
| 1313 | } |
|---|
| 1314 | |
|---|
| 1315 | static void p_encode(char *seq, char *naseq, sint l) |
|---|
| 1316 | { /* code seq as ints .. use gap_pos2 for gap */ |
|---|
| 1317 | register sint i; |
|---|
| 1318 | /* static char *aacids="CSTPAGNDEQHRKMILVFYW";*/ |
|---|
| 1319 | |
|---|
| 1320 | for(i=1;i<=l;i++) |
|---|
| 1321 | if(seq[i] == '-') |
|---|
| 1322 | naseq[i] = gap_pos2; |
|---|
| 1323 | else |
|---|
| 1324 | naseq[i] = res_index(amino_acid_codes,seq[i]); |
|---|
| 1325 | naseq[i] = -3; |
|---|
| 1326 | } |
|---|
| 1327 | |
|---|
| 1328 | static void n_encode(char *seq,char *naseq,sint l) |
|---|
| 1329 | { /* code seq as ints .. use gap_pos2 for gap */ |
|---|
| 1330 | register sint i; |
|---|
| 1331 | /* static char *nucs="ACGTU"; */ |
|---|
| 1332 | |
|---|
| 1333 | for(i=1;i<=l;i++) { |
|---|
| 1334 | if(seq[i] == '-') /* if a gap character -> code = gap_pos2 */ |
|---|
| 1335 | naseq[i] = gap_pos2; /* this is the code for a gap in */ |
|---|
| 1336 | else { /* the input files */ |
|---|
| 1337 | naseq[i]=res_index(amino_acid_codes,seq[i]); |
|---|
| 1338 | } |
|---|
| 1339 | } |
|---|
| 1340 | naseq[i] = -3; |
|---|
| 1341 | } |
|---|
| 1342 | |
|---|
| 1343 | static sint res_index(char *t,char c) |
|---|
| 1344 | { |
|---|
| 1345 | register sint i; |
|---|
| 1346 | |
|---|
| 1347 | for(i=0;t[i] && t[i] != c;i++) |
|---|
| 1348 | ; |
|---|
| 1349 | if(t[i]) return(i); |
|---|
| 1350 | else return -1; |
|---|
| 1351 | } |
|---|