| 1 | /********* Sequence input routines for CLUSTAL W *******************/ |
|---|
| 2 | /* DES was here. FEB. 1994 */ |
|---|
| 3 | /* Now reads PILEUP/MSF and CLUSTAL alignment files */ |
|---|
| 4 | |
|---|
| 5 | #include <stdio.h> |
|---|
| 6 | #include <string.h> |
|---|
| 7 | #include <ctype.h> |
|---|
| 8 | #include <stdlib.h> |
|---|
| 9 | #include "clustalw.h" |
|---|
| 10 | |
|---|
| 11 | #define MIN(a,b) ((a)<(b)?(a):(b)) |
|---|
| 12 | |
|---|
| 13 | |
|---|
| 14 | |
|---|
| 15 | /* |
|---|
| 16 | * Prototypes |
|---|
| 17 | */ |
|---|
| 18 | |
|---|
| 19 | static char * get_seq(char *,sint *,char *); |
|---|
| 20 | static void check_infile(sint *); |
|---|
| 21 | static void p_encode(char *, char *, sint); |
|---|
| 22 | static void n_encode(char *, char *, sint); |
|---|
| 23 | static sint res_index(const char *,char); |
|---|
| 24 | static Boolean check_dnaflag(char *, sint); |
|---|
| 25 | static sint count_clustal_seqs(void); |
|---|
| 26 | static sint count_pir_seqs(void); |
|---|
| 27 | static sint count_msf_seqs(void); |
|---|
| 28 | static sint count_rsf_seqs(void); |
|---|
| 29 | static void get_swiss_feature(char *line,sint len); |
|---|
| 30 | static void get_rsf_feature(char *line,sint len); |
|---|
| 31 | static void get_swiss_mask(char *line,sint len); |
|---|
| 32 | static void get_clustal_ss(sint length); |
|---|
| 33 | static void get_embl_ss(sint length); |
|---|
| 34 | static void get_rsf_ss(sint length); |
|---|
| 35 | static void get_gde_ss(sint length); |
|---|
| 36 | static Boolean cl_blankline(char *line); |
|---|
| 37 | |
|---|
| 38 | /* |
|---|
| 39 | * Global variables |
|---|
| 40 | */ |
|---|
| 41 | extern sint max_names; |
|---|
| 42 | FILE *fin; |
|---|
| 43 | extern Boolean usemenu, dnaflag, explicit_dnaflag; |
|---|
| 44 | extern Boolean interactive; |
|---|
| 45 | extern char seqname[]; |
|---|
| 46 | extern sint nseqs; |
|---|
| 47 | extern sint *seqlen_array; |
|---|
| 48 | extern sint *output_index; |
|---|
| 49 | extern char **names,**titles; |
|---|
| 50 | extern char **seq_array; |
|---|
| 51 | extern Boolean profile1_empty, profile2_empty; |
|---|
| 52 | extern sint gap_pos2; |
|---|
| 53 | extern sint max_aln_length; |
|---|
| 54 | extern char *gap_penalty_mask, *sec_struct_mask; |
|---|
| 55 | extern sint struct_penalties; |
|---|
| 56 | extern char *ss_name; |
|---|
| 57 | extern sint profile_no; |
|---|
| 58 | extern sint debug; |
|---|
| 59 | |
|---|
| 60 | const char *amino_acid_codes = "ABCDEFGHIKLMNPQRSTUVWXYZ-"; /* DES */ |
|---|
| 61 | static sint seqFormat; |
|---|
| 62 | static char chartab[128]; |
|---|
| 63 | static const char *formatNames[] = {"unknown","EMBL/Swiss-Prot","PIR", |
|---|
| 64 | "Pearson","GDE","Clustal","Pileup/MSF","RSF","USER","PHYLIP","NEXUS"}; |
|---|
| 65 | |
|---|
| 66 | void fill_chartab(void) /* Create translation and check table */ |
|---|
| 67 | { |
|---|
| 68 | register sint i; |
|---|
| 69 | register char c; |
|---|
| 70 | |
|---|
| 71 | for(i=0;i<128;chartab[i++]=0); |
|---|
| 72 | for(i=0;(c=amino_acid_codes[i]);i++) |
|---|
| 73 | chartab[(int)c]=chartab[tolower(c)]=c; |
|---|
| 74 | } |
|---|
| 75 | |
|---|
| 76 | static char * get_msf_seq(char *sname,sint *len,sint seqno) |
|---|
| 77 | /* read the seqno_th. sequence from a PILEUP multiple alignment file */ |
|---|
| 78 | { |
|---|
| 79 | static char line[MAXLINE+1]; |
|---|
| 80 | char *seq = NULL; |
|---|
| 81 | sint i,j,k; |
|---|
| 82 | unsigned char c; |
|---|
| 83 | |
|---|
| 84 | fseek(fin,0,0); /* start at the beginning */ |
|---|
| 85 | |
|---|
| 86 | *len=0; /* initialise length to zero */ |
|---|
| 87 | for(i=0;;i++) { |
|---|
| 88 | if(fgets(line,MAXLINE+1,fin)==NULL) return NULL; /* read the title*/ |
|---|
| 89 | if(linetype(line,"//") ) break; /* lines...ignore*/ |
|---|
| 90 | } |
|---|
| 91 | |
|---|
| 92 | while (fgets(line,MAXLINE+1,fin) != NULL) { |
|---|
| 93 | if(!blankline(line)) { |
|---|
| 94 | |
|---|
| 95 | for(i=1;i<seqno;i++) fgets(line,MAXLINE+1,fin); |
|---|
| 96 | { |
|---|
| 97 | int lineLen=strlen(line); |
|---|
| 98 | for(j=0;j<=lineLen;j++) if(line[j] != ' ') break; |
|---|
| 99 | for(k=j;k<=lineLen;k++) if(line[k] == ' ') break; |
|---|
| 100 | } |
|---|
| 101 | strncpy(sname,line+j,MIN(MAXNAMES,k-j)); |
|---|
| 102 | sname[MIN(MAXNAMES,k-j)]=EOS; |
|---|
| 103 | rtrim(sname); |
|---|
| 104 | blank_to_(sname); |
|---|
| 105 | |
|---|
| 106 | if(seq==NULL) |
|---|
| 107 | seq=(char *)ckalloc((MAXLINE+2)*sizeof(char)); |
|---|
| 108 | else |
|---|
| 109 | seq=(char *)ckrealloc(seq,((*len)+MAXLINE+2)*sizeof(char)); |
|---|
| 110 | for(i=k;i<=MAXLINE;i++) { |
|---|
| 111 | c=line[i]; |
|---|
| 112 | if(c == '.' || c == '~' ) c = '-'; |
|---|
| 113 | if(c == '*') c = 'X'; |
|---|
| 114 | if(c == '\n' || c == EOS) break; /* EOL */ |
|---|
| 115 | c=chartab[c]; |
|---|
| 116 | if(c) seq[++(*len)]=c; |
|---|
| 117 | } |
|---|
| 118 | |
|---|
| 119 | for(i=0;;i++) { |
|---|
| 120 | if(fgets(line,MAXLINE+1,fin)==NULL) return seq; |
|---|
| 121 | if(blankline(line)) break; |
|---|
| 122 | } |
|---|
| 123 | } |
|---|
| 124 | } |
|---|
| 125 | return seq; |
|---|
| 126 | } |
|---|
| 127 | |
|---|
| 128 | static Boolean cl_blankline(char *line) |
|---|
| 129 | { |
|---|
| 130 | int i; |
|---|
| 131 | |
|---|
| 132 | if (line[0] == '!') return TRUE; |
|---|
| 133 | |
|---|
| 134 | for(i=0;line[i]!='\n' && line[i]!=EOS;i++) { |
|---|
| 135 | if( isdigit(line[i]) || |
|---|
| 136 | isspace(line[i]) || |
|---|
| 137 | (line[i] == '*') || |
|---|
| 138 | (line[i] == ':') || |
|---|
| 139 | (line[i] == '.')) |
|---|
| 140 | ; |
|---|
| 141 | else |
|---|
| 142 | return FALSE; |
|---|
| 143 | } |
|---|
| 144 | return TRUE; |
|---|
| 145 | } |
|---|
| 146 | |
|---|
| 147 | static char * get_clustal_seq(char *sname,sint *len,sint seqno) |
|---|
| 148 | /* read the seqno_th. sequence from a clustal multiple alignment file */ |
|---|
| 149 | { |
|---|
| 150 | static char line[MAXLINE+1]; |
|---|
| 151 | static char tseq[MAXLINE+1]; |
|---|
| 152 | char *seq = NULL; |
|---|
| 153 | sint i,j; |
|---|
| 154 | unsigned char c; |
|---|
| 155 | |
|---|
| 156 | fseek(fin,0,0); /* start at the beginning */ |
|---|
| 157 | |
|---|
| 158 | *len=0; /* initialise length to zero */ |
|---|
| 159 | fgets(line,MAXLINE+1,fin); /* read the title line...ignore it */ |
|---|
| 160 | |
|---|
| 161 | while (fgets(line,MAXLINE+1,fin) != NULL) { |
|---|
| 162 | if(!cl_blankline(line)) { |
|---|
| 163 | |
|---|
| 164 | for(i=1;i<seqno;i++) fgets(line,MAXLINE+1,fin); |
|---|
| 165 | { |
|---|
| 166 | int lineLen=strlen(line); |
|---|
| 167 | for(j=0;j<=lineLen;j++) if(line[j] != ' ') break; |
|---|
| 168 | } |
|---|
| 169 | |
|---|
| 170 | sscanf(line,"%s%s",sname,tseq); |
|---|
| 171 | for(j=0;j<MAXNAMES;j++) if(sname[j] == ' ') break; |
|---|
| 172 | sname[j]=EOS; |
|---|
| 173 | rtrim(sname); |
|---|
| 174 | blank_to_(sname); |
|---|
| 175 | |
|---|
| 176 | if(seq==NULL) |
|---|
| 177 | seq=(char *)ckalloc((MAXLINE+2)*sizeof(char)); |
|---|
| 178 | else |
|---|
| 179 | seq=(char *)ckrealloc(seq,((*len)+MAXLINE+2)*sizeof(char)); |
|---|
| 180 | for(i=0;i<=MAXLINE;i++) { |
|---|
| 181 | c=tseq[i]; |
|---|
| 182 | /*if(c == '\n' || c == EOS) break;*/ /* EOL */ |
|---|
| 183 | if(isspace(c) || c == EOS) break; /* EOL */ |
|---|
| 184 | c=chartab[c]; |
|---|
| 185 | if(c) seq[++(*len)]=c; |
|---|
| 186 | } |
|---|
| 187 | |
|---|
| 188 | for(i=0;;i++) { |
|---|
| 189 | if(fgets(line,MAXLINE+1,fin)==NULL) return seq; |
|---|
| 190 | if(cl_blankline(line)) break; |
|---|
| 191 | } |
|---|
| 192 | } |
|---|
| 193 | } |
|---|
| 194 | |
|---|
| 195 | return seq; |
|---|
| 196 | } |
|---|
| 197 | |
|---|
| 198 | static void get_clustal_ss(sint length) |
|---|
| 199 | /* read the structure data from a clustal multiple alignment file */ |
|---|
| 200 | { |
|---|
| 201 | static char title[MAXLINE+1]; |
|---|
| 202 | static char line[MAXLINE+1]; |
|---|
| 203 | static char lin2[MAXLINE+1]; |
|---|
| 204 | static char tseq[MAXLINE+1]; |
|---|
| 205 | static char sname[MAXNAMES+1]; |
|---|
| 206 | sint i,j,len,ix,struct_index=0; |
|---|
| 207 | char c; |
|---|
| 208 | |
|---|
| 209 | |
|---|
| 210 | fseek(fin,0,0); /* start at the beginning */ |
|---|
| 211 | |
|---|
| 212 | len=0; /* initialise length to zero */ |
|---|
| 213 | if (fgets(line,MAXLINE+1,fin) == NULL) return; /* read the title line...ignore it */ |
|---|
| 214 | |
|---|
| 215 | if (fgets(line,MAXLINE+1,fin) == NULL) return; /* read the next line... */ |
|---|
| 216 | /* skip any blank lines */ |
|---|
| 217 | for (;;) { |
|---|
| 218 | if(fgets(line,MAXLINE+1,fin)==NULL) return; |
|---|
| 219 | if(!blankline(line)) break; |
|---|
| 220 | } |
|---|
| 221 | |
|---|
| 222 | /* look for structure table lines */ |
|---|
| 223 | ix = -1; |
|---|
| 224 | for(;;) { |
|---|
| 225 | if(line[0] != '!') break; |
|---|
| 226 | if(strncmp(line,"!SS",3) == 0) { |
|---|
| 227 | ix++; |
|---|
| 228 | sscanf(line+4,"%s%s",sname,tseq); |
|---|
| 229 | for(j=0;j<MAXNAMES;j++) if(sname[j] == ' ') break; |
|---|
| 230 | sname[j]=EOS; |
|---|
| 231 | rtrim(sname); |
|---|
| 232 | blank_to_(sname); |
|---|
| 233 | if (interactive) { |
|---|
| 234 | strcpy(title,"Found secondary structure in alignment file: "); |
|---|
| 235 | strcat(title,sname); |
|---|
| 236 | (*lin2)=prompt_for_yes_no(title,"Use it to set local gap penalties "); |
|---|
| 237 | } |
|---|
| 238 | else (*lin2) = 'y'; |
|---|
| 239 | if ((*lin2 != 'n') && (*lin2 != 'N')) { |
|---|
| 240 | struct_penalties = SECST; |
|---|
| 241 | struct_index = ix; |
|---|
| 242 | for (i=0;i<length;i++) |
|---|
| 243 | { |
|---|
| 244 | sec_struct_mask[i] = '.'; |
|---|
| 245 | gap_penalty_mask[i] = '.'; |
|---|
| 246 | } |
|---|
| 247 | strcpy(ss_name,sname); |
|---|
| 248 | for(i=0;len < length;i++) { |
|---|
| 249 | c = tseq[i]; |
|---|
| 250 | if(c == '\n' || c == EOS) break; /* EOL */ |
|---|
| 251 | if (!isspace(c)) sec_struct_mask[len++] = c; |
|---|
| 252 | } |
|---|
| 253 | } |
|---|
| 254 | } |
|---|
| 255 | else if(strncmp(line,"!GM",3) == 0) { |
|---|
| 256 | ix++; |
|---|
| 257 | sscanf(line+4,"%s%s",sname,tseq); |
|---|
| 258 | for(j=0;j<MAXNAMES;j++) if(sname[j] == ' ') break; |
|---|
| 259 | sname[j]=EOS; |
|---|
| 260 | rtrim(sname); |
|---|
| 261 | blank_to_(sname); |
|---|
| 262 | if (interactive) { |
|---|
| 263 | strcpy(title,"Found gap penalty mask in alignment file: "); |
|---|
| 264 | strcat(title,sname); |
|---|
| 265 | (*lin2)=prompt_for_yes_no(title,"Use it to set local gap penalties "); |
|---|
| 266 | } |
|---|
| 267 | else (*lin2) = 'y'; |
|---|
| 268 | if ((*lin2 != 'n') && (*lin2 != 'N')) { |
|---|
| 269 | struct_penalties = GMASK; |
|---|
| 270 | struct_index = ix; |
|---|
| 271 | for (i=0;i<length;i++) |
|---|
| 272 | gap_penalty_mask[i] = '1'; |
|---|
| 273 | strcpy(ss_name,sname); |
|---|
| 274 | for(i=0;len < length;i++) { |
|---|
| 275 | c = tseq[i]; |
|---|
| 276 | if(c == '\n' || c == EOS) break; /* EOL */ |
|---|
| 277 | if (!isspace(c)) gap_penalty_mask[len++] = c; |
|---|
| 278 | } |
|---|
| 279 | } |
|---|
| 280 | } |
|---|
| 281 | if (struct_penalties != NONE) break; |
|---|
| 282 | if(fgets(line,MAXLINE+1,fin)==NULL) return; |
|---|
| 283 | } |
|---|
| 284 | |
|---|
| 285 | if (struct_penalties == NONE) return; |
|---|
| 286 | |
|---|
| 287 | /* skip any more comment lines */ |
|---|
| 288 | while (line[0] == '!') { |
|---|
| 289 | if(fgets(line,MAXLINE+1,fin)==NULL) return; |
|---|
| 290 | } |
|---|
| 291 | |
|---|
| 292 | /* skip the sequence lines and any comments after the alignment */ |
|---|
| 293 | for (;;) { |
|---|
| 294 | if(isspace(line[0])) break; |
|---|
| 295 | if(fgets(line,MAXLINE+1,fin)==NULL) return; |
|---|
| 296 | } |
|---|
| 297 | |
|---|
| 298 | |
|---|
| 299 | /* read the rest of the alignment */ |
|---|
| 300 | |
|---|
| 301 | for (;;) { |
|---|
| 302 | /* skip any blank lines */ |
|---|
| 303 | for (;;) { |
|---|
| 304 | if(!blankline(line)) break; |
|---|
| 305 | if(fgets(line,MAXLINE+1,fin)==NULL) return; |
|---|
| 306 | } |
|---|
| 307 | /* get structure table line */ |
|---|
| 308 | for(ix=0;ix<struct_index;ix++) { |
|---|
| 309 | if (line[0] != '!') { |
|---|
| 310 | if(struct_penalties == SECST) |
|---|
| 311 | error("bad secondary structure format"); |
|---|
| 312 | else |
|---|
| 313 | error("bad gap penalty mask format"); |
|---|
| 314 | struct_penalties = NONE; |
|---|
| 315 | return; |
|---|
| 316 | } |
|---|
| 317 | if(fgets(line,MAXLINE+1,fin)==NULL) return; |
|---|
| 318 | } |
|---|
| 319 | if(struct_penalties == SECST) { |
|---|
| 320 | if (strncmp(line,"!SS",3) != 0) { |
|---|
| 321 | error("bad secondary structure format"); |
|---|
| 322 | struct_penalties = NONE; |
|---|
| 323 | return; |
|---|
| 324 | } |
|---|
| 325 | sscanf(line+4,"%s%s",sname,tseq); |
|---|
| 326 | for(i=0;len < length;i++) { |
|---|
| 327 | c = tseq[i]; |
|---|
| 328 | if(c == '\n' || c == EOS) break; /* EOL */ |
|---|
| 329 | if (!isspace(c)) sec_struct_mask[len++] = c; |
|---|
| 330 | } |
|---|
| 331 | } |
|---|
| 332 | else if (struct_penalties == GMASK) { |
|---|
| 333 | if (strncmp(line,"!GM",3) != 0) { |
|---|
| 334 | error("bad gap penalty mask format"); |
|---|
| 335 | struct_penalties = NONE; |
|---|
| 336 | return; |
|---|
| 337 | } |
|---|
| 338 | sscanf(line+4,"%s%s",sname,tseq); |
|---|
| 339 | for(i=0;len < length;i++) { |
|---|
| 340 | c = tseq[i]; |
|---|
| 341 | if(c == '\n' || c == EOS) break; /* EOL */ |
|---|
| 342 | if (!isspace(c)) gap_penalty_mask[len++] = c; |
|---|
| 343 | } |
|---|
| 344 | } |
|---|
| 345 | |
|---|
| 346 | /* skip any more comment lines */ |
|---|
| 347 | while (line[0] == '!') { |
|---|
| 348 | if(fgets(line,MAXLINE+1,fin)==NULL) return; |
|---|
| 349 | } |
|---|
| 350 | |
|---|
| 351 | /* skip the sequence lines */ |
|---|
| 352 | for (;;) { |
|---|
| 353 | if(isspace(line[0])) break; |
|---|
| 354 | if(fgets(line,MAXLINE+1,fin)==NULL) return; |
|---|
| 355 | } |
|---|
| 356 | } |
|---|
| 357 | } |
|---|
| 358 | |
|---|
| 359 | static void get_embl_ss(sint length) |
|---|
| 360 | { |
|---|
| 361 | static char title[MAXLINE+1]; |
|---|
| 362 | static char line[MAXLINE+1]; |
|---|
| 363 | static char lin2[MAXLINE+1]; |
|---|
| 364 | static char sname[MAXNAMES+1]; |
|---|
| 365 | char feature[MAXLINE+1]; |
|---|
| 366 | sint i; |
|---|
| 367 | |
|---|
| 368 | /* find the start of the sequence entry */ |
|---|
| 369 | for (;;) { |
|---|
| 370 | while( !linetype(line,"ID") ) |
|---|
| 371 | if (fgets(line,MAXLINE+1,fin) == NULL) return; |
|---|
| 372 | |
|---|
| 373 | { |
|---|
| 374 | int lineLen=strlen(line); |
|---|
| 375 | for(i=5;i<=lineLen;i++) /* DES */ |
|---|
| 376 | if(line[i] != ' ') break; |
|---|
| 377 | } |
|---|
| 378 | strncpy(sname,line+i,MAXNAMES); /* remember entryname */ |
|---|
| 379 | { |
|---|
| 380 | int snameLen = strlen(sname); |
|---|
| 381 | for(i=0;i<=snameLen;i++) |
|---|
| 382 | if(sname[i] == ' ') { |
|---|
| 383 | sname[i]=EOS; |
|---|
| 384 | break; |
|---|
| 385 | } |
|---|
| 386 | } |
|---|
| 387 | sname[MAXNAMES]=EOS; |
|---|
| 388 | rtrim(sname); |
|---|
| 389 | blank_to_(sname); |
|---|
| 390 | |
|---|
| 391 | /* look for secondary structure feature table / gap penalty mask */ |
|---|
| 392 | while(fgets(line,MAXLINE+1,fin) != NULL) { |
|---|
| 393 | if (linetype(line,"FT")) { |
|---|
| 394 | sscanf(line+2,"%s",feature); |
|---|
| 395 | if (strcmp(feature,"HELIX") == 0 || |
|---|
| 396 | strcmp(feature,"STRAND") == 0) |
|---|
| 397 | { |
|---|
| 398 | |
|---|
| 399 | if (interactive) { |
|---|
| 400 | strcpy(title,"Found secondary structure in alignment file: "); |
|---|
| 401 | strcat(title,sname); |
|---|
| 402 | (*lin2)=prompt_for_yes_no(title,"Use it to set local gap penalties "); |
|---|
| 403 | } |
|---|
| 404 | else (*lin2) = 'y'; |
|---|
| 405 | if ((*lin2 != 'n') && (*lin2 != 'N')) { |
|---|
| 406 | struct_penalties = SECST; |
|---|
| 407 | for (i=0;i<length;i++) |
|---|
| 408 | sec_struct_mask[i] = '.'; |
|---|
| 409 | do { |
|---|
| 410 | get_swiss_feature(&line[2],length); |
|---|
| 411 | fgets(line,MAXLINE+1,fin); |
|---|
| 412 | } while( linetype(line,"FT") ); |
|---|
| 413 | } |
|---|
| 414 | else { |
|---|
| 415 | do { |
|---|
| 416 | fgets(line,MAXLINE+1,fin); |
|---|
| 417 | } while( linetype(line,"FT") ); |
|---|
| 418 | } |
|---|
| 419 | strcpy(ss_name,sname); |
|---|
| 420 | } |
|---|
| 421 | } |
|---|
| 422 | else if (linetype(line,"GM")) { |
|---|
| 423 | if (interactive) { |
|---|
| 424 | strcpy(title,"Found gap penalty mask in alignment file: "); |
|---|
| 425 | strcat(title,sname); |
|---|
| 426 | (*lin2)=prompt_for_yes_no(title,"Use it to set local gap penalties "); |
|---|
| 427 | } |
|---|
| 428 | else (*lin2) = 'y'; |
|---|
| 429 | if ((*lin2 != 'n') && (*lin2 != 'N')) { |
|---|
| 430 | struct_penalties = GMASK; |
|---|
| 431 | for (i=0;i<length;i++) |
|---|
| 432 | gap_penalty_mask[i] = '1'; |
|---|
| 433 | do { |
|---|
| 434 | get_swiss_mask(&line[2],length); |
|---|
| 435 | fgets(line,MAXLINE+1,fin); |
|---|
| 436 | } while( linetype(line,"GM") ); |
|---|
| 437 | } |
|---|
| 438 | else { |
|---|
| 439 | do { |
|---|
| 440 | fgets(line,MAXLINE+1,fin); |
|---|
| 441 | } while( linetype(line,"GM") ); |
|---|
| 442 | } |
|---|
| 443 | strcpy(ss_name,sname); |
|---|
| 444 | } |
|---|
| 445 | if (linetype(line,"SQ")) |
|---|
| 446 | break; |
|---|
| 447 | |
|---|
| 448 | if (struct_penalties != NONE) break; |
|---|
| 449 | } |
|---|
| 450 | |
|---|
| 451 | } |
|---|
| 452 | |
|---|
| 453 | } |
|---|
| 454 | |
|---|
| 455 | static void get_rsf_ss(sint length) |
|---|
| 456 | { |
|---|
| 457 | static char title[MAXLINE+1]; |
|---|
| 458 | static char line[MAXLINE+1]; |
|---|
| 459 | static char lin2[MAXLINE+1]; |
|---|
| 460 | static char sname[MAXNAMES+1]; |
|---|
| 461 | sint i; |
|---|
| 462 | |
|---|
| 463 | /* skip the comments */ |
|---|
| 464 | while (fgets(line,MAXLINE+1,fin) != NULL) { |
|---|
| 465 | if(line[strlen(line)-2]=='.' && |
|---|
| 466 | line[strlen(line)-3]=='.') |
|---|
| 467 | break; |
|---|
| 468 | } |
|---|
| 469 | |
|---|
| 470 | /* find the start of the sequence entry */ |
|---|
| 471 | for (;;) { |
|---|
| 472 | while (fgets(line,MAXLINE+1,fin) != NULL) |
|---|
| 473 | if( *line == '{' ) break; |
|---|
| 474 | |
|---|
| 475 | while( !keyword(line,"name") ) |
|---|
| 476 | if (fgets(line,MAXLINE+1,fin) == NULL) return; |
|---|
| 477 | |
|---|
| 478 | { |
|---|
| 479 | int lineLen=strlen(line); |
|---|
| 480 | for(i=5;i<=lineLen;i++) /* DES */ |
|---|
| 481 | if(line[i] != ' ') break; |
|---|
| 482 | } |
|---|
| 483 | strncpy(sname,line+i,MAXNAMES); /* remember entryname */ |
|---|
| 484 | { |
|---|
| 485 | int snameLen = strlen(sname); |
|---|
| 486 | for(i=0;i<=snameLen;i++) |
|---|
| 487 | if(sname[i] == ' ') { |
|---|
| 488 | sname[i]=EOS; |
|---|
| 489 | break; |
|---|
| 490 | } |
|---|
| 491 | } |
|---|
| 492 | sname[MAXNAMES]=EOS; |
|---|
| 493 | rtrim(sname); |
|---|
| 494 | blank_to_(sname); |
|---|
| 495 | |
|---|
| 496 | /* look for secondary structure feature table / gap penalty mask */ |
|---|
| 497 | while(fgets(line,MAXLINE+1,fin) != NULL) { |
|---|
| 498 | if (keyword(line,"feature")) { |
|---|
| 499 | if (interactive) { |
|---|
| 500 | strcpy(title,"Found secondary structure in alignment file: "); |
|---|
| 501 | strcat(title,sname); |
|---|
| 502 | (*lin2)=prompt_for_yes_no(title,"Use it to set local gap penalties "); |
|---|
| 503 | } |
|---|
| 504 | else (*lin2) = 'y'; |
|---|
| 505 | if ((*lin2 != 'n') && (*lin2 != 'N')) { |
|---|
| 506 | struct_penalties = SECST; |
|---|
| 507 | for (i=0;i<length;i++) |
|---|
| 508 | sec_struct_mask[i] = '.'; |
|---|
| 509 | do { |
|---|
| 510 | if(keyword(line,"feature")) |
|---|
| 511 | get_rsf_feature(&line[7],length); |
|---|
| 512 | fgets(line,MAXLINE+1,fin); |
|---|
| 513 | } while( !keyword(line,"sequence") ); |
|---|
| 514 | } |
|---|
| 515 | else { |
|---|
| 516 | do { |
|---|
| 517 | fgets(line,MAXLINE+1,fin); |
|---|
| 518 | } while( !keyword(line,"sequence") ); |
|---|
| 519 | } |
|---|
| 520 | strcpy(ss_name,sname); |
|---|
| 521 | } |
|---|
| 522 | else if (keyword(line,"sequence")) |
|---|
| 523 | break; |
|---|
| 524 | |
|---|
| 525 | if (struct_penalties != NONE) break; |
|---|
| 526 | } |
|---|
| 527 | |
|---|
| 528 | } |
|---|
| 529 | |
|---|
| 530 | } |
|---|
| 531 | |
|---|
| 532 | static void get_gde_ss(sint length) |
|---|
| 533 | { |
|---|
| 534 | static char title[MAXLINE+1]; |
|---|
| 535 | static char line[MAXLINE+1]; |
|---|
| 536 | static char lin2[MAXLINE+1]; |
|---|
| 537 | static char sname[MAXNAMES+1]; |
|---|
| 538 | sint i, len, offset = 0; |
|---|
| 539 | unsigned char c; |
|---|
| 540 | |
|---|
| 541 | for (;;) { |
|---|
| 542 | line[0] = '\0'; |
|---|
| 543 | /* search for the next comment line */ |
|---|
| 544 | while(*line != '"') |
|---|
| 545 | if (fgets(line,MAXLINE+1,fin) == NULL) return; |
|---|
| 546 | |
|---|
| 547 | /* is it a secondary structure entry? */ |
|---|
| 548 | if (strncmp(&line[1],"SS_",3) == 0) { |
|---|
| 549 | for (i=1;i<=MAXNAMES-3;i++) { |
|---|
| 550 | if (line[i+3] == '(' || line[i+3] == '\n') |
|---|
| 551 | break; |
|---|
| 552 | sname[i-1] = line[i+3]; |
|---|
| 553 | } |
|---|
| 554 | i--; |
|---|
| 555 | sname[i]=EOS; |
|---|
| 556 | if (sname[i-1] == '(') sscanf(&line[i+3],"%d",&offset); |
|---|
| 557 | else offset = 0; |
|---|
| 558 | for(i--;i > 0;i--) |
|---|
| 559 | if(isspace(sname[i])) { |
|---|
| 560 | sname[i]=EOS; |
|---|
| 561 | } |
|---|
| 562 | else break; |
|---|
| 563 | blank_to_(sname); |
|---|
| 564 | |
|---|
| 565 | if (interactive) { |
|---|
| 566 | strcpy(title,"Found secondary structure in alignment file: "); |
|---|
| 567 | strcat(title,sname); |
|---|
| 568 | (*lin2)=prompt_for_yes_no(title,"Use it to set local gap penalties "); |
|---|
| 569 | } |
|---|
| 570 | else (*lin2) = 'y'; |
|---|
| 571 | if ((*lin2 != 'n') && (*lin2 != 'N')) { |
|---|
| 572 | struct_penalties = SECST; |
|---|
| 573 | for (i=0;i<length;i++) |
|---|
| 574 | sec_struct_mask[i] = '.'; |
|---|
| 575 | len = 0; |
|---|
| 576 | while(fgets(line,MAXLINE+1,fin)) { |
|---|
| 577 | if(*line == '%' || *line == '#' || *line == '"') break; |
|---|
| 578 | for(i=offset;i < length;i++) { |
|---|
| 579 | c=line[i]; |
|---|
| 580 | if(c == '\n' || c == EOS) |
|---|
| 581 | break; /* EOL */ |
|---|
| 582 | sec_struct_mask[len++]=c; |
|---|
| 583 | } |
|---|
| 584 | if (len > length) break; |
|---|
| 585 | } |
|---|
| 586 | strcpy(ss_name,sname); |
|---|
| 587 | } |
|---|
| 588 | } |
|---|
| 589 | /* or is it a gap penalty mask entry? */ |
|---|
| 590 | else if (strncmp(&line[1],"GM_",3) == 0) { |
|---|
| 591 | for (i=1;i<=MAXNAMES-3;i++) { |
|---|
| 592 | if (line[i+3] == '(' || line[i+3] == '\n') |
|---|
| 593 | break; |
|---|
| 594 | sname[i-1] = line[i+3]; |
|---|
| 595 | } |
|---|
| 596 | i--; |
|---|
| 597 | sname[i]=EOS; |
|---|
| 598 | if (sname[i-1] == '(') sscanf(&line[i+3],"%d",&offset); |
|---|
| 599 | else offset = 0; |
|---|
| 600 | for(i--;i > 0;i--) |
|---|
| 601 | if(isspace(sname[i])) { |
|---|
| 602 | sname[i]=EOS; |
|---|
| 603 | } |
|---|
| 604 | else break; |
|---|
| 605 | blank_to_(sname); |
|---|
| 606 | |
|---|
| 607 | if (interactive) { |
|---|
| 608 | strcpy(title,"Found gap penalty mask in alignment file: "); |
|---|
| 609 | strcat(title,sname); |
|---|
| 610 | (*lin2)=prompt_for_yes_no(title,"Use it to set local gap penalties "); |
|---|
| 611 | } |
|---|
| 612 | else (*lin2) = 'y'; |
|---|
| 613 | if ((*lin2 != 'n') && (*lin2 != 'N')) { |
|---|
| 614 | struct_penalties = GMASK; |
|---|
| 615 | for (i=0;i<length;i++) |
|---|
| 616 | gap_penalty_mask[i] = '1'; |
|---|
| 617 | len = 0; |
|---|
| 618 | while(fgets(line,MAXLINE+1,fin)) { |
|---|
| 619 | if(*line == '%' || *line == '#' || *line == '"') break; |
|---|
| 620 | for(i=offset;i < length;i++) { |
|---|
| 621 | c=line[i]; |
|---|
| 622 | if(c == '\n' || c == EOS) |
|---|
| 623 | break; /* EOL */ |
|---|
| 624 | gap_penalty_mask[len++]=c; |
|---|
| 625 | } |
|---|
| 626 | if (len > length) break; |
|---|
| 627 | } |
|---|
| 628 | strcpy(ss_name,sname); |
|---|
| 629 | } |
|---|
| 630 | } |
|---|
| 631 | if (struct_penalties != NONE) break; |
|---|
| 632 | } |
|---|
| 633 | |
|---|
| 634 | } |
|---|
| 635 | |
|---|
| 636 | static void get_swiss_feature(char *line, sint len) |
|---|
| 637 | { |
|---|
| 638 | char c, s, feature[MAXLINE+1]; |
|---|
| 639 | int i, start_pos, end_pos; |
|---|
| 640 | |
|---|
| 641 | if (sscanf(line,"%s%d%d",feature,&start_pos,&end_pos) != 3) { |
|---|
| 642 | return; |
|---|
| 643 | } |
|---|
| 644 | |
|---|
| 645 | if (strcmp(feature,"HELIX") == 0) { |
|---|
| 646 | c = 'A'; |
|---|
| 647 | s = '$'; |
|---|
| 648 | } |
|---|
| 649 | else if (strcmp(feature,"STRAND") == 0) { |
|---|
| 650 | c = 'B'; |
|---|
| 651 | s = '%'; |
|---|
| 652 | } |
|---|
| 653 | else |
|---|
| 654 | return; |
|---|
| 655 | |
|---|
| 656 | if(start_pos >=len || end_pos>=len) return; |
|---|
| 657 | |
|---|
| 658 | sec_struct_mask[start_pos-1] = s; |
|---|
| 659 | for (i=start_pos;i<end_pos-1;i++) |
|---|
| 660 | sec_struct_mask[i] = c; |
|---|
| 661 | sec_struct_mask[end_pos-1] = s; |
|---|
| 662 | |
|---|
| 663 | } |
|---|
| 664 | |
|---|
| 665 | static void get_rsf_feature(char *line, sint len) |
|---|
| 666 | { |
|---|
| 667 | char c, s; |
|---|
| 668 | char str1[MAXLINE+1],str2[MAXLINE+1],feature[MAXLINE+1]; |
|---|
| 669 | int i, tmp,start_pos, end_pos; |
|---|
| 670 | |
|---|
| 671 | if (sscanf(line,"%d%d%d%s%s%s",&start_pos,&end_pos,&tmp,str1,str2,feature) != 6) { |
|---|
| 672 | return; |
|---|
| 673 | } |
|---|
| 674 | |
|---|
| 675 | if (strcmp(feature,"HELIX") == 0) { |
|---|
| 676 | c = 'A'; |
|---|
| 677 | s = '$'; |
|---|
| 678 | } |
|---|
| 679 | else if (strcmp(feature,"STRAND") == 0) { |
|---|
| 680 | c = 'B'; |
|---|
| 681 | s = '%'; |
|---|
| 682 | } |
|---|
| 683 | else |
|---|
| 684 | return; |
|---|
| 685 | |
|---|
| 686 | if(start_pos>=len || end_pos >= len) return; |
|---|
| 687 | sec_struct_mask[start_pos-1] = s; |
|---|
| 688 | for (i=start_pos;i<end_pos-1;i++) |
|---|
| 689 | sec_struct_mask[i] = c; |
|---|
| 690 | sec_struct_mask[end_pos-1] = s; |
|---|
| 691 | |
|---|
| 692 | } |
|---|
| 693 | |
|---|
| 694 | static void get_swiss_mask(char *line, sint len) |
|---|
| 695 | { |
|---|
| 696 | int i, value, start_pos, end_pos; |
|---|
| 697 | |
|---|
| 698 | if (sscanf(line,"%d%d%d",&value,&start_pos,&end_pos) != 3) { |
|---|
| 699 | return; |
|---|
| 700 | } |
|---|
| 701 | |
|---|
| 702 | if (value < 1 || value > 9) return; |
|---|
| 703 | |
|---|
| 704 | if(start_pos>=len || end_pos >= len) return; |
|---|
| 705 | for (i=start_pos-1;i<end_pos;i++) |
|---|
| 706 | gap_penalty_mask[i] = value+'0'; |
|---|
| 707 | |
|---|
| 708 | } |
|---|
| 709 | |
|---|
| 710 | static char * get_seq(char *sname,sint *len,char *tit) |
|---|
| 711 | { |
|---|
| 712 | static char line[MAXLINE+1]; |
|---|
| 713 | char *seq = NULL; |
|---|
| 714 | sint i, offset = 0; |
|---|
| 715 | unsigned char c=EOS; |
|---|
| 716 | Boolean got_seq=FALSE; |
|---|
| 717 | |
|---|
| 718 | switch(seqFormat) { |
|---|
| 719 | |
|---|
| 720 | /************************************/ |
|---|
| 721 | case EMBLSWISS: |
|---|
| 722 | while( !linetype(line,"ID") ) |
|---|
| 723 | if (fgets(line,MAXLINE+1,fin) == NULL) return NULL; |
|---|
| 724 | |
|---|
| 725 | { |
|---|
| 726 | int lineLen=strlen(line); |
|---|
| 727 | for(i=5;i<=lineLen;i++) /* DES */ |
|---|
| 728 | if(line[i] != ' ') break; |
|---|
| 729 | } |
|---|
| 730 | strncpy(sname,line+i,MAXNAMES); /* remember entryname */ |
|---|
| 731 | { |
|---|
| 732 | int snameLen = strlen(sname); |
|---|
| 733 | for(i=0;i<=snameLen;i++) |
|---|
| 734 | if(sname[i] == ' ') { |
|---|
| 735 | sname[i]=EOS; |
|---|
| 736 | break; |
|---|
| 737 | } |
|---|
| 738 | } |
|---|
| 739 | |
|---|
| 740 | sname[MAXNAMES]=EOS; |
|---|
| 741 | rtrim(sname); |
|---|
| 742 | blank_to_(sname); |
|---|
| 743 | |
|---|
| 744 | |
|---|
| 745 | while( !linetype(line,"SQ") ) |
|---|
| 746 | fgets(line,MAXLINE+1,fin); |
|---|
| 747 | |
|---|
| 748 | *len=0; |
|---|
| 749 | while(fgets(line,MAXLINE+1,fin)) { |
|---|
| 750 | if(got_seq && blankline(line)) break; |
|---|
| 751 | if( strlen(line) > 2 && line[strlen(line)-2]=='.' && line[strlen(line)-3]=='.' ) |
|---|
| 752 | continue; |
|---|
| 753 | if(seq==NULL) |
|---|
| 754 | seq=(char *)ckalloc((MAXLINE+2)*sizeof(char)); |
|---|
| 755 | else |
|---|
| 756 | seq=(char *)ckrealloc(seq,((*len)+MAXLINE+2)*sizeof(char)); |
|---|
| 757 | for(i=0;i<=MAXLINE;i++) { |
|---|
| 758 | c=line[i]; |
|---|
| 759 | if(c == '\n' || c == EOS || c == '/') |
|---|
| 760 | break; /* EOL */ |
|---|
| 761 | c=chartab[c]; |
|---|
| 762 | if(c) { |
|---|
| 763 | got_seq=TRUE; |
|---|
| 764 | seq[++(*len)]=c; |
|---|
| 765 | } |
|---|
| 766 | } |
|---|
| 767 | if(c == '/') break; |
|---|
| 768 | } |
|---|
| 769 | break; |
|---|
| 770 | |
|---|
| 771 | /************************************/ |
|---|
| 772 | case PIR: |
|---|
| 773 | while(*line != '>') |
|---|
| 774 | fgets(line,MAXLINE+1,fin); |
|---|
| 775 | { |
|---|
| 776 | int lineLen=strlen(line); |
|---|
| 777 | for(i=4;i<=lineLen;i++) /* DES */ |
|---|
| 778 | if(line[i] != ' ') break; |
|---|
| 779 | } |
|---|
| 780 | strncpy(sname,line+i,MAXNAMES); /* remember entryname */ |
|---|
| 781 | sname[MAXNAMES]=EOS; |
|---|
| 782 | rtrim(sname); |
|---|
| 783 | blank_to_(sname); |
|---|
| 784 | |
|---|
| 785 | fgets(line,MAXLINE+1,fin); |
|---|
| 786 | strncpy(tit,line,MAXTITLES); |
|---|
| 787 | tit[MAXTITLES]=EOS; |
|---|
| 788 | i=strlen(tit); |
|---|
| 789 | if(tit[i-1]=='\n') tit[i-1]=EOS; |
|---|
| 790 | |
|---|
| 791 | *len=0; |
|---|
| 792 | while(fgets(line,MAXLINE+1,fin)) { |
|---|
| 793 | if(seq==NULL) |
|---|
| 794 | seq=(char *)ckalloc((MAXLINE+2)*sizeof(char)); |
|---|
| 795 | else |
|---|
| 796 | seq=(char *)ckrealloc(seq,((*len)+MAXLINE+2)*sizeof(char)); |
|---|
| 797 | for(i=0;i<=MAXLINE;i++) { |
|---|
| 798 | c=line[i]; |
|---|
| 799 | if(c == '\n' || c == EOS || c == '*') |
|---|
| 800 | break; /* EOL */ |
|---|
| 801 | |
|---|
| 802 | c=chartab[c]; |
|---|
| 803 | if(c) seq[++(*len)]=c; |
|---|
| 804 | } |
|---|
| 805 | if(c == '*') break; |
|---|
| 806 | } |
|---|
| 807 | break; |
|---|
| 808 | /***********************************************/ |
|---|
| 809 | case PEARSON: |
|---|
| 810 | while(*line != '>') |
|---|
| 811 | fgets(line,MAXLINE+1,fin); |
|---|
| 812 | |
|---|
| 813 | { |
|---|
| 814 | int lineLen=strlen(line); |
|---|
| 815 | for(i=1;i<=lineLen;i++) /* DES */ |
|---|
| 816 | if(line[i] != ' ') break; |
|---|
| 817 | } |
|---|
| 818 | strncpy(sname,line+i,MAXNAMES); /* remember entryname */ |
|---|
| 819 | { |
|---|
| 820 | int snameLen=strlen(sname); |
|---|
| 821 | for(i=1;i<=snameLen;i++) /* DES */ |
|---|
| 822 | if(sname[i] == ' ') break; |
|---|
| 823 | } |
|---|
| 824 | sname[i]=EOS; |
|---|
| 825 | rtrim(sname); |
|---|
| 826 | blank_to_(sname); |
|---|
| 827 | |
|---|
| 828 | *tit=EOS; |
|---|
| 829 | |
|---|
| 830 | *len=0; |
|---|
| 831 | while(fgets(line,MAXLINE+1,fin)) { |
|---|
| 832 | if(seq==NULL) |
|---|
| 833 | seq=(char *)ckalloc((MAXLINE+2)*sizeof(char)); |
|---|
| 834 | else |
|---|
| 835 | seq=(char *)ckrealloc(seq,((*len)+MAXLINE+2)*sizeof(char)); |
|---|
| 836 | for(i=0;i<=MAXLINE;i++) { |
|---|
| 837 | c=line[i]; |
|---|
| 838 | if(c == '\n' || c == EOS || c == '>') |
|---|
| 839 | break; /* EOL */ |
|---|
| 840 | |
|---|
| 841 | c=chartab[c]; |
|---|
| 842 | if(c) seq[++(*len)]=c; |
|---|
| 843 | } |
|---|
| 844 | if(c == '>') break; |
|---|
| 845 | } |
|---|
| 846 | break; |
|---|
| 847 | /**********************************************/ |
|---|
| 848 | case GDE: |
|---|
| 849 | if (dnaflag) { |
|---|
| 850 | while(*line != '#') |
|---|
| 851 | fgets(line,MAXLINE+1,fin); |
|---|
| 852 | } |
|---|
| 853 | else { |
|---|
| 854 | while(*line != '%') |
|---|
| 855 | fgets(line,MAXLINE+1,fin); |
|---|
| 856 | } |
|---|
| 857 | |
|---|
| 858 | for (i=1;i<=MAXNAMES;i++) { |
|---|
| 859 | if (line[i] == '(' || line[i] == '\n') |
|---|
| 860 | break; |
|---|
| 861 | sname[i-1] = line[i]; |
|---|
| 862 | } |
|---|
| 863 | i--; |
|---|
| 864 | sname[i]=EOS; |
|---|
| 865 | if (sname[i-1] == '(') sscanf(&line[i],"%d",&offset); |
|---|
| 866 | else offset = 0; |
|---|
| 867 | for(i--;i > 0;i--) |
|---|
| 868 | if(isspace(sname[i])) { |
|---|
| 869 | sname[i]=EOS; |
|---|
| 870 | } |
|---|
| 871 | else break; |
|---|
| 872 | blank_to_(sname); |
|---|
| 873 | |
|---|
| 874 | *tit=EOS; |
|---|
| 875 | |
|---|
| 876 | *len=0; |
|---|
| 877 | for (i=0;i<offset;i++) seq[++(*len)] = '-'; |
|---|
| 878 | while(fgets(line,MAXLINE+1,fin)) { |
|---|
| 879 | if(*line == '%' || *line == '#' || *line == '"') break; |
|---|
| 880 | if(seq==NULL) |
|---|
| 881 | seq=(char *)ckalloc((MAXLINE+2)*sizeof(char)); |
|---|
| 882 | else |
|---|
| 883 | seq=(char *)ckrealloc(seq,((*len)+MAXLINE+2)*sizeof(char)); |
|---|
| 884 | for(i=0;i<=MAXLINE;i++) { |
|---|
| 885 | c=line[i]; |
|---|
| 886 | if(c == '\n' || c == EOS) |
|---|
| 887 | break; /* EOL */ |
|---|
| 888 | |
|---|
| 889 | c=chartab[c]; |
|---|
| 890 | if(c) seq[++(*len)]=c; |
|---|
| 891 | } |
|---|
| 892 | } |
|---|
| 893 | break; |
|---|
| 894 | /***********************************************/ |
|---|
| 895 | case RSF: |
|---|
| 896 | while(*line != '{') |
|---|
| 897 | if (fgets(line,MAXLINE+1,fin) == NULL) return NULL; |
|---|
| 898 | |
|---|
| 899 | while( !keyword(line,"name") ) |
|---|
| 900 | if (fgets(line,MAXLINE+1,fin) == NULL) return NULL; |
|---|
| 901 | |
|---|
| 902 | { |
|---|
| 903 | int lineLen=strlen(line); |
|---|
| 904 | for(i=5;i<=lineLen;i++) /* DES */ |
|---|
| 905 | if(line[i] != ' ') break; |
|---|
| 906 | } |
|---|
| 907 | strncpy(sname,line+i,MAXNAMES); /* remember entryname */ |
|---|
| 908 | { |
|---|
| 909 | int snameLen=strlen(sname); |
|---|
| 910 | for(i=0;i<=snameLen;i++) |
|---|
| 911 | if(sname[i] == ' ') { |
|---|
| 912 | sname[i]=EOS; |
|---|
| 913 | break; |
|---|
| 914 | } |
|---|
| 915 | } |
|---|
| 916 | |
|---|
| 917 | sname[MAXNAMES]=EOS; |
|---|
| 918 | rtrim(sname); |
|---|
| 919 | blank_to_(sname); |
|---|
| 920 | |
|---|
| 921 | |
|---|
| 922 | while( !keyword(line,"sequence") ) |
|---|
| 923 | if (fgets(line,MAXLINE+1,fin) == NULL) return NULL; |
|---|
| 924 | |
|---|
| 925 | *len=0; |
|---|
| 926 | while(fgets(line,MAXLINE+1,fin)) { |
|---|
| 927 | if(seq==NULL) |
|---|
| 928 | seq=(char *)ckalloc((MAXLINE+2)*sizeof(char)); |
|---|
| 929 | else |
|---|
| 930 | seq=(char *)ckrealloc(seq,((*len)+MAXLINE+2)*sizeof(char)); |
|---|
| 931 | for(i=0;i<=MAXLINE;i++) { |
|---|
| 932 | c=line[i]; |
|---|
| 933 | if(c == EOS || c == '}') |
|---|
| 934 | break; /* EOL */ |
|---|
| 935 | if( c=='.') |
|---|
| 936 | seq[++(*len)]='-'; |
|---|
| 937 | c=chartab[c]; |
|---|
| 938 | if(c) seq[++(*len)]=c; |
|---|
| 939 | } |
|---|
| 940 | if(c == '}') break; |
|---|
| 941 | } |
|---|
| 942 | break; |
|---|
| 943 | /***********************************************/ |
|---|
| 944 | } |
|---|
| 945 | |
|---|
| 946 | seq[*len+1]=EOS; |
|---|
| 947 | |
|---|
| 948 | return seq; |
|---|
| 949 | } |
|---|
| 950 | |
|---|
| 951 | |
|---|
| 952 | sint readseqs(sint first_seq) /*first_seq is the #no. of the first seq. to read */ |
|---|
| 953 | { |
|---|
| 954 | char line[FILENAMELEN+1]; |
|---|
| 955 | char fileName[FILENAMELEN+1]; |
|---|
| 956 | |
|---|
| 957 | static char *seq1,sname1[MAXNAMES+1],title[MAXTITLES+1]; |
|---|
| 958 | sint i,j; |
|---|
| 959 | sint no_seqs; |
|---|
| 960 | static sint l1; |
|---|
| 961 | static Boolean dnaflag1; |
|---|
| 962 | |
|---|
| 963 | if(usemenu) |
|---|
| 964 | getstr("Enter the name of the sequence file",line, FILENAMELEN+1); |
|---|
| 965 | else |
|---|
| 966 | strcpy(line,seqname); |
|---|
| 967 | if(*line == EOS) return -1; |
|---|
| 968 | |
|---|
| 969 | if ((sscanf(line,"file://%s",fileName) == 1 )) { |
|---|
| 970 | strcpy(line,fileName); |
|---|
| 971 | } |
|---|
| 972 | |
|---|
| 973 | if((fin=fopen(line,"r"))==NULL) { |
|---|
| 974 | error("Could not open sequence file (%s) ",line); |
|---|
| 975 | return -1; /* DES -1 => file not found */ |
|---|
| 976 | } |
|---|
| 977 | strcpy(seqname,line); |
|---|
| 978 | no_seqs=0; |
|---|
| 979 | check_infile(&no_seqs); |
|---|
| 980 | info("Sequence format is %s",formatNames[seqFormat]); |
|---|
| 981 | if(seqFormat==NEXUS) |
|---|
| 982 | error("Cannot read nexus format"); |
|---|
| 983 | |
|---|
| 984 | /* DES DEBUG |
|---|
| 985 | fprintf(stdout,"\n\n File name = %s\n\n",seqname); |
|---|
| 986 | */ |
|---|
| 987 | if(no_seqs == 0) |
|---|
| 988 | return 0; /* return the number of seqs. (zero here)*/ |
|---|
| 989 | |
|---|
| 990 | /* |
|---|
| 991 | if((no_seqs + first_seq -1) > MAXN) { |
|---|
| 992 | error("Too many sequences. Maximum is %d",(pint)MAXN); |
|---|
| 993 | return 0; |
|---|
| 994 | } |
|---|
| 995 | */ |
|---|
| 996 | |
|---|
| 997 | /* DES */ |
|---|
| 998 | /* if(seqFormat == CLUSTAL) { |
|---|
| 999 | info("no of sequences = %d",(pint)no_seqs); |
|---|
| 1000 | return no_seqs; |
|---|
| 1001 | } |
|---|
| 1002 | */ |
|---|
| 1003 | max_aln_length = 0; |
|---|
| 1004 | |
|---|
| 1005 | /* if this is a multiple alignment, or profile 1 - free any memory used |
|---|
| 1006 | by previous alignments, then allocate memory for the new alignment */ |
|---|
| 1007 | if(first_seq == 1) { |
|---|
| 1008 | max_names = 0; |
|---|
| 1009 | free_aln(nseqs); |
|---|
| 1010 | alloc_aln(no_seqs); |
|---|
| 1011 | } |
|---|
| 1012 | /* otherwise, this is a profile 2, and we need to reallocate the arrays, |
|---|
| 1013 | leaving the data for profile 1 intact */ |
|---|
| 1014 | else realloc_aln(first_seq,no_seqs); |
|---|
| 1015 | |
|---|
| 1016 | for(i=1;i<first_seq;i++) |
|---|
| 1017 | { |
|---|
| 1018 | if(seqlen_array[i]>max_aln_length) |
|---|
| 1019 | max_aln_length=seqlen_array[i]; |
|---|
| 1020 | if((int)strlen(names[i])>max_names) |
|---|
| 1021 | max_names=strlen(names[i]); |
|---|
| 1022 | } |
|---|
| 1023 | |
|---|
| 1024 | for(i=first_seq;i<=first_seq+no_seqs-1;i++) { /* get the seqs now*/ |
|---|
| 1025 | output_index[i] = i; /* default output order */ |
|---|
| 1026 | if(seqFormat == CLUSTAL) |
|---|
| 1027 | seq1=get_clustal_seq(sname1,&l1,i-first_seq+1); |
|---|
| 1028 | else if(seqFormat == MSF) |
|---|
| 1029 | seq1=get_msf_seq(sname1,&l1,i-first_seq+1); |
|---|
| 1030 | else |
|---|
| 1031 | seq1=get_seq(sname1,&l1,title); |
|---|
| 1032 | |
|---|
| 1033 | if(seq1==NULL) break; |
|---|
| 1034 | /* JULIE */ |
|---|
| 1035 | /* Set max length of dynamically allocated arrays in prfalign.c */ |
|---|
| 1036 | if (l1 > max_aln_length) max_aln_length = l1; |
|---|
| 1037 | seqlen_array[i]=l1; /* store the length */ |
|---|
| 1038 | strcpy(names[i],sname1); /* " " name */ |
|---|
| 1039 | strcpy(titles[i],title); /* " " title */ |
|---|
| 1040 | |
|---|
| 1041 | if(!explicit_dnaflag) { |
|---|
| 1042 | dnaflag1 = check_dnaflag(seq1,l1); /* check DNA/Prot */ |
|---|
| 1043 | if(i == 1) dnaflag = dnaflag1; |
|---|
| 1044 | } /* type decided by first seq*/ |
|---|
| 1045 | else |
|---|
| 1046 | dnaflag1 = dnaflag; |
|---|
| 1047 | |
|---|
| 1048 | alloc_seq(i,l1); |
|---|
| 1049 | |
|---|
| 1050 | if(dnaflag) |
|---|
| 1051 | n_encode(seq1,seq_array[i],l1); /* encode the sequence*/ |
|---|
| 1052 | else /* as ints */ |
|---|
| 1053 | p_encode(seq1,seq_array[i],l1); |
|---|
| 1054 | if(seq1!=NULL) seq1=ckfree(seq1); |
|---|
| 1055 | } |
|---|
| 1056 | |
|---|
| 1057 | |
|---|
| 1058 | max_aln_length *= 2; |
|---|
| 1059 | /* |
|---|
| 1060 | JULIE |
|---|
| 1061 | check sequence names are all different - otherwise phylip tree is |
|---|
| 1062 | confused. |
|---|
| 1063 | */ |
|---|
| 1064 | for(i=1;i<=first_seq+no_seqs-1;i++) { |
|---|
| 1065 | for(j=i+1;j<=first_seq+no_seqs-1;j++) { |
|---|
| 1066 | if (strncmp(names[i],names[j],MAXNAMES) == 0) { |
|---|
| 1067 | error("Multiple sequences found with same name, %s (first %d chars are significant)", names[i],MAXNAMES); |
|---|
| 1068 | return 0; |
|---|
| 1069 | } |
|---|
| 1070 | } |
|---|
| 1071 | } |
|---|
| 1072 | for(i=first_seq;i<=first_seq+no_seqs-1;i++) |
|---|
| 1073 | { |
|---|
| 1074 | if(seqlen_array[i]>max_aln_length) |
|---|
| 1075 | max_aln_length=seqlen_array[i]; |
|---|
| 1076 | } |
|---|
| 1077 | |
|---|
| 1078 | /* look for a feature table / gap penalty mask (only if this is a profile) */ |
|---|
| 1079 | if (profile_no > 0) { |
|---|
| 1080 | rewind(fin); |
|---|
| 1081 | struct_penalties = NONE; |
|---|
| 1082 | gap_penalty_mask = (char *)ckalloc((max_aln_length+1) * sizeof (char)); |
|---|
| 1083 | sec_struct_mask = (char *)ckalloc((max_aln_length+1) * sizeof (char)); |
|---|
| 1084 | ss_name = (char *)ckalloc((MAXNAMES+1) * sizeof (char)); |
|---|
| 1085 | |
|---|
| 1086 | if (seqFormat == CLUSTAL) { |
|---|
| 1087 | get_clustal_ss(max_aln_length); |
|---|
| 1088 | } |
|---|
| 1089 | else if (seqFormat == GDE) { |
|---|
| 1090 | get_gde_ss(max_aln_length); |
|---|
| 1091 | } |
|---|
| 1092 | else if (seqFormat == EMBLSWISS) { |
|---|
| 1093 | get_embl_ss(max_aln_length); |
|---|
| 1094 | } |
|---|
| 1095 | else if (seqFormat == RSF) { |
|---|
| 1096 | get_rsf_ss(max_aln_length); |
|---|
| 1097 | } |
|---|
| 1098 | } |
|---|
| 1099 | |
|---|
| 1100 | for(i=first_seq;i<=first_seq+no_seqs-1;i++) |
|---|
| 1101 | { |
|---|
| 1102 | if ((int)strlen(names[i])>max_names) |
|---|
| 1103 | max_names=strlen(names[i]); |
|---|
| 1104 | } |
|---|
| 1105 | |
|---|
| 1106 | if(max_names<10) max_names=10; |
|---|
| 1107 | |
|---|
| 1108 | fclose(fin); |
|---|
| 1109 | |
|---|
| 1110 | return no_seqs; /* return the number of seqs. read in this call */ |
|---|
| 1111 | } |
|---|
| 1112 | |
|---|
| 1113 | |
|---|
| 1114 | static Boolean check_dnaflag(char *seq, sint slen) |
|---|
| 1115 | /* check if DNA or Protein |
|---|
| 1116 | The decision is based on counting all A,C,G,T,U or N. |
|---|
| 1117 | If >= 85% of all characters (except -) are as above => DNA */ |
|---|
| 1118 | { |
|---|
| 1119 | sint i, c, nresidues, nbases; |
|---|
| 1120 | float ratio; |
|---|
| 1121 | const char *dna_codes="ACGTUN"; |
|---|
| 1122 | |
|---|
| 1123 | nresidues = nbases = 0; |
|---|
| 1124 | for(i=1; i <= slen; i++) { |
|---|
| 1125 | if(seq[i] != '-') { |
|---|
| 1126 | nresidues++; |
|---|
| 1127 | if(seq[i] == 'N') |
|---|
| 1128 | nbases++; |
|---|
| 1129 | else { |
|---|
| 1130 | c = res_index(dna_codes, seq[i]); |
|---|
| 1131 | if(c >= 0) |
|---|
| 1132 | nbases++; |
|---|
| 1133 | } |
|---|
| 1134 | } |
|---|
| 1135 | } |
|---|
| 1136 | if( (nbases == 0) || (nresidues == 0) ) return FALSE; |
|---|
| 1137 | ratio = (float)nbases/(float)nresidues; |
|---|
| 1138 | /* DES fprintf(stdout,"\n nbases = %d, nresidues = %d, ratio = %f\n", |
|---|
| 1139 | (pint)nbases,(pint)nresidues,(pint)ratio); */ |
|---|
| 1140 | if(ratio >= 0.85) |
|---|
| 1141 | return TRUE; |
|---|
| 1142 | else |
|---|
| 1143 | return FALSE; |
|---|
| 1144 | } |
|---|
| 1145 | |
|---|
| 1146 | |
|---|
| 1147 | |
|---|
| 1148 | static void check_infile(sint *local_nseqs) |
|---|
| 1149 | { |
|---|
| 1150 | char line[MAXLINE+1]; |
|---|
| 1151 | sint i; |
|---|
| 1152 | |
|---|
| 1153 | *local_nseqs=0; |
|---|
| 1154 | while (fgets(line,MAXLINE+1,fin) != NULL) { |
|---|
| 1155 | if(!blankline(line)) |
|---|
| 1156 | break; |
|---|
| 1157 | } |
|---|
| 1158 | |
|---|
| 1159 | for(i=strlen(line)-1;i>=0;i--) |
|---|
| 1160 | if(isgraph(line[i])) break; |
|---|
| 1161 | line[i+1]=EOS; |
|---|
| 1162 | |
|---|
| 1163 | for(i=0;i<=6;i++) line[i] = toupper(line[i]); |
|---|
| 1164 | |
|---|
| 1165 | if( linetype(line,"ID") ) { /* EMBL/Swiss-Prot format ? */ |
|---|
| 1166 | seqFormat=EMBLSWISS; |
|---|
| 1167 | (*local_nseqs)++; |
|---|
| 1168 | } |
|---|
| 1169 | else if( linetype(line,"CLUSTAL") ) { |
|---|
| 1170 | seqFormat=CLUSTAL; |
|---|
| 1171 | } |
|---|
| 1172 | else if( linetype(line,"PILEUP") ) { |
|---|
| 1173 | seqFormat = MSF; |
|---|
| 1174 | } |
|---|
| 1175 | else if( linetype(line,"!!AA_MULTIPLE_ALIGNMENT") ) { |
|---|
| 1176 | seqFormat = MSF; |
|---|
| 1177 | dnaflag = FALSE; |
|---|
| 1178 | } |
|---|
| 1179 | else if( linetype(line,"!!NA_MULTIPLE_ALIGNMENT") ) { |
|---|
| 1180 | seqFormat = MSF; |
|---|
| 1181 | dnaflag = TRUE; |
|---|
| 1182 | } |
|---|
| 1183 | else if( strstr(line,"MSF") && line[strlen(line)-1]=='.' && |
|---|
| 1184 | line[strlen(line)-2]=='.' ) { |
|---|
| 1185 | seqFormat = MSF; |
|---|
| 1186 | } |
|---|
| 1187 | else if( linetype(line,"!!RICH_SEQUENCE") ) { |
|---|
| 1188 | seqFormat = RSF; |
|---|
| 1189 | } |
|---|
| 1190 | else if( linetype(line,"#NEXUS") ) { |
|---|
| 1191 | seqFormat=NEXUS; |
|---|
| 1192 | return; |
|---|
| 1193 | } |
|---|
| 1194 | else if(*line == '>') { /* no */ |
|---|
| 1195 | seqFormat=(line[3] == ';')?PIR:PEARSON; /* distinguish PIR and Pearson */ |
|---|
| 1196 | (*local_nseqs)++; |
|---|
| 1197 | } |
|---|
| 1198 | else if((*line == '"') || (*line == '%') || (*line == '#')) { |
|---|
| 1199 | seqFormat=GDE; /* GDE format */ |
|---|
| 1200 | if (*line == '%') { |
|---|
| 1201 | (*local_nseqs)++; |
|---|
| 1202 | dnaflag = FALSE; |
|---|
| 1203 | } |
|---|
| 1204 | else if (*line == '#') { |
|---|
| 1205 | (*local_nseqs)++; |
|---|
| 1206 | dnaflag = TRUE; |
|---|
| 1207 | } |
|---|
| 1208 | } |
|---|
| 1209 | else { |
|---|
| 1210 | seqFormat=UNKNOWN; |
|---|
| 1211 | return; |
|---|
| 1212 | } |
|---|
| 1213 | |
|---|
| 1214 | while(fgets(line,MAXLINE+1,fin) != NULL) { |
|---|
| 1215 | switch(seqFormat) { |
|---|
| 1216 | case EMBLSWISS: |
|---|
| 1217 | if( linetype(line,"ID") ) |
|---|
| 1218 | (*local_nseqs)++; |
|---|
| 1219 | break; |
|---|
| 1220 | case PIR: |
|---|
| 1221 | *local_nseqs = count_pir_seqs(); |
|---|
| 1222 | fseek(fin,0,0); |
|---|
| 1223 | return; |
|---|
| 1224 | case PEARSON: |
|---|
| 1225 | if( *line == '>' ) |
|---|
| 1226 | (*local_nseqs)++; |
|---|
| 1227 | break; |
|---|
| 1228 | case GDE: |
|---|
| 1229 | if(( *line == '%' ) && ( dnaflag == FALSE)) |
|---|
| 1230 | (*local_nseqs)++; |
|---|
| 1231 | else if (( *line == '#') && ( dnaflag == TRUE)) |
|---|
| 1232 | (*local_nseqs)++; |
|---|
| 1233 | break; |
|---|
| 1234 | case CLUSTAL: |
|---|
| 1235 | *local_nseqs = count_clustal_seqs(); |
|---|
| 1236 | /* DES */ /* fprintf(stdout,"\nnseqs = %d\n",(pint)*local_nseqs); */ |
|---|
| 1237 | fseek(fin,0,0); |
|---|
| 1238 | return; |
|---|
| 1239 | case MSF: |
|---|
| 1240 | *local_nseqs = count_msf_seqs(); |
|---|
| 1241 | fseek(fin,0,0); |
|---|
| 1242 | return; |
|---|
| 1243 | case RSF: |
|---|
| 1244 | fseek(fin,0,0); |
|---|
| 1245 | *local_nseqs = count_rsf_seqs(); |
|---|
| 1246 | fseek(fin,0,0); |
|---|
| 1247 | return; |
|---|
| 1248 | case USER: |
|---|
| 1249 | default: |
|---|
| 1250 | break; |
|---|
| 1251 | } |
|---|
| 1252 | } |
|---|
| 1253 | fseek(fin,0,0); |
|---|
| 1254 | } |
|---|
| 1255 | |
|---|
| 1256 | |
|---|
| 1257 | static sint count_pir_seqs(void) |
|---|
| 1258 | /* count the number of sequences in a pir alignment file */ |
|---|
| 1259 | { |
|---|
| 1260 | char line[MAXLINE+1],c; |
|---|
| 1261 | sint local_nseqs, i; |
|---|
| 1262 | Boolean seq_ok; |
|---|
| 1263 | |
|---|
| 1264 | seq_ok = FALSE; |
|---|
| 1265 | while (fgets(line,MAXLINE+1,fin) != NULL) { /* Look for end of first seq */ |
|---|
| 1266 | if(*line == '>') break; |
|---|
| 1267 | for(i=0;seq_ok == FALSE;i++) { |
|---|
| 1268 | c=line[i]; |
|---|
| 1269 | if(c == '*') { |
|---|
| 1270 | seq_ok = TRUE; /* ok - end of sequence found */ |
|---|
| 1271 | break; |
|---|
| 1272 | } /* EOL */ |
|---|
| 1273 | if(c == '\n' || c == EOS) |
|---|
| 1274 | break; /* EOL */ |
|---|
| 1275 | } |
|---|
| 1276 | if (seq_ok == TRUE) |
|---|
| 1277 | break; |
|---|
| 1278 | } |
|---|
| 1279 | if (seq_ok == FALSE) { |
|---|
| 1280 | error("PIR format sequence end marker '*'\nmissing for one or more sequences."); |
|---|
| 1281 | return (sint)0; /* funny format*/ |
|---|
| 1282 | } |
|---|
| 1283 | |
|---|
| 1284 | |
|---|
| 1285 | local_nseqs = 1; |
|---|
| 1286 | |
|---|
| 1287 | while (fgets(line,MAXLINE+1,fin) != NULL) { |
|---|
| 1288 | if(*line == '>') { /* Look for start of next seq */ |
|---|
| 1289 | seq_ok = FALSE; |
|---|
| 1290 | while (fgets(line,MAXLINE+1,fin) != NULL) { /* Look for end of seq */ |
|---|
| 1291 | if(*line == '>') { |
|---|
| 1292 | error("PIR format sequence end marker '*' missing for one or more sequences."); |
|---|
| 1293 | return (sint)0; /* funny format*/ |
|---|
| 1294 | } |
|---|
| 1295 | for(i=0;seq_ok == FALSE;i++) { |
|---|
| 1296 | c=line[i]; |
|---|
| 1297 | if(c == '*') { |
|---|
| 1298 | seq_ok = TRUE; /* ok - sequence found */ |
|---|
| 1299 | break; |
|---|
| 1300 | } /* EOL */ |
|---|
| 1301 | if(c == '\n' || c == EOS) |
|---|
| 1302 | break; /* EOL */ |
|---|
| 1303 | } |
|---|
| 1304 | if (seq_ok == TRUE) { |
|---|
| 1305 | local_nseqs++; |
|---|
| 1306 | break; |
|---|
| 1307 | } |
|---|
| 1308 | } |
|---|
| 1309 | } |
|---|
| 1310 | } |
|---|
| 1311 | return (sint)local_nseqs; |
|---|
| 1312 | } |
|---|
| 1313 | |
|---|
| 1314 | |
|---|
| 1315 | static sint count_clustal_seqs(void) |
|---|
| 1316 | /* count the number of sequences in a clustal alignment file */ |
|---|
| 1317 | { |
|---|
| 1318 | char line[MAXLINE+1]; |
|---|
| 1319 | sint local_nseqs; |
|---|
| 1320 | |
|---|
| 1321 | while (fgets(line,MAXLINE+1,fin) != NULL) { |
|---|
| 1322 | if(!cl_blankline(line)) break; /* Look for next non- */ |
|---|
| 1323 | } /* blank line */ |
|---|
| 1324 | local_nseqs = 1; |
|---|
| 1325 | |
|---|
| 1326 | while (fgets(line,MAXLINE+1,fin) != NULL) { |
|---|
| 1327 | if(cl_blankline(line)) return local_nseqs; |
|---|
| 1328 | local_nseqs++; |
|---|
| 1329 | } |
|---|
| 1330 | |
|---|
| 1331 | return (sint)0; /* if you got to here-funny format/no seqs.*/ |
|---|
| 1332 | } |
|---|
| 1333 | |
|---|
| 1334 | static sint count_msf_seqs(void) |
|---|
| 1335 | { |
|---|
| 1336 | /* count the number of sequences in a PILEUP alignment file */ |
|---|
| 1337 | |
|---|
| 1338 | char line[MAXLINE+1]; |
|---|
| 1339 | sint local_nseqs; |
|---|
| 1340 | |
|---|
| 1341 | while (fgets(line,MAXLINE+1,fin) != NULL) { |
|---|
| 1342 | if(linetype(line,"//")) break; |
|---|
| 1343 | } |
|---|
| 1344 | |
|---|
| 1345 | while (fgets(line,MAXLINE+1,fin) != NULL) { |
|---|
| 1346 | if(!blankline(line)) break; /* Look for next non- */ |
|---|
| 1347 | } /* blank line */ |
|---|
| 1348 | local_nseqs = 1; |
|---|
| 1349 | |
|---|
| 1350 | while (fgets(line,MAXLINE+1,fin) != NULL) { |
|---|
| 1351 | if(blankline(line)) return local_nseqs; |
|---|
| 1352 | local_nseqs++; |
|---|
| 1353 | } |
|---|
| 1354 | |
|---|
| 1355 | return (sint)0; /* if you got to here-funny format/no seqs.*/ |
|---|
| 1356 | } |
|---|
| 1357 | |
|---|
| 1358 | static sint count_rsf_seqs(void) |
|---|
| 1359 | { |
|---|
| 1360 | /* count the number of sequences in a GCG RSF alignment file */ |
|---|
| 1361 | |
|---|
| 1362 | char line[MAXLINE+1]; |
|---|
| 1363 | sint local_nseqs; |
|---|
| 1364 | |
|---|
| 1365 | local_nseqs = 0; |
|---|
| 1366 | /* skip the comments */ |
|---|
| 1367 | while (fgets(line,MAXLINE+1,fin) != NULL) { |
|---|
| 1368 | if(line[strlen(line)-2]=='.' && |
|---|
| 1369 | line[strlen(line)-3]=='.') |
|---|
| 1370 | break; |
|---|
| 1371 | } |
|---|
| 1372 | |
|---|
| 1373 | while (fgets(line,MAXLINE+1,fin) != NULL) { |
|---|
| 1374 | if( *line == '{' ) |
|---|
| 1375 | local_nseqs++; |
|---|
| 1376 | } |
|---|
| 1377 | return (sint)local_nseqs; |
|---|
| 1378 | } |
|---|
| 1379 | |
|---|
| 1380 | static void p_encode(char *seq, char *naseq, sint l) |
|---|
| 1381 | { /* code seq as ints .. use gap_pos2 for gap */ |
|---|
| 1382 | register sint i; |
|---|
| 1383 | /* static char *aacids="CSTPAGNDEQHRKMILVFYW";*/ |
|---|
| 1384 | |
|---|
| 1385 | for(i=1;i<=l;i++) |
|---|
| 1386 | if(seq[i] == '-') |
|---|
| 1387 | naseq[i] = gap_pos2; |
|---|
| 1388 | else |
|---|
| 1389 | naseq[i] = res_index(amino_acid_codes,seq[i]); |
|---|
| 1390 | naseq[i] = -3; |
|---|
| 1391 | } |
|---|
| 1392 | |
|---|
| 1393 | static void n_encode(char *seq,char *naseq,sint l) |
|---|
| 1394 | { /* code seq as ints .. use gap_pos2 for gap */ |
|---|
| 1395 | register sint i; |
|---|
| 1396 | /* static char *nucs="ACGTU"; */ |
|---|
| 1397 | |
|---|
| 1398 | for(i=1;i<=l;i++) { |
|---|
| 1399 | if(seq[i] == '-') /* if a gap character -> code = gap_pos2 */ |
|---|
| 1400 | naseq[i] = gap_pos2; /* this is the code for a gap in */ |
|---|
| 1401 | else { /* the input files */ |
|---|
| 1402 | naseq[i]=res_index(amino_acid_codes,seq[i]); |
|---|
| 1403 | } |
|---|
| 1404 | } |
|---|
| 1405 | naseq[i] = -3; |
|---|
| 1406 | } |
|---|
| 1407 | |
|---|
| 1408 | static sint res_index(const char *t,char c) |
|---|
| 1409 | { |
|---|
| 1410 | register sint i; |
|---|
| 1411 | |
|---|
| 1412 | for(i=0;t[i] && t[i] != c;i++) |
|---|
| 1413 | ; |
|---|
| 1414 | if(t[i]) return(i); |
|---|
| 1415 | else return -1; |
|---|
| 1416 | } |
|---|