1 | /* File: ureadseq.c |
---|
2 | * |
---|
3 | * Reads and writes nucleic/protein sequence in various |
---|
4 | * formats. Data files may have multiple sequences. |
---|
5 | * |
---|
6 | * Copyright 1990 by d.g.gilbert |
---|
7 | * biology dept., indiana university, bloomington, in 47405 |
---|
8 | * e-mail: gilbertd@bio.indiana.edu |
---|
9 | * |
---|
10 | * This program may be freely copied and used by anyone. |
---|
11 | * Developers are encourged to incorporate parts in their |
---|
12 | * programs, rather than devise their own private sequence |
---|
13 | * format. |
---|
14 | * |
---|
15 | * This should compile and run with any ANSI C compiler. |
---|
16 | * |
---|
17 | * ------------------------------------------------------------ |
---|
18 | * |
---|
19 | * Some slight modifications were done by ARB developers |
---|
20 | * |
---|
21 | */ |
---|
22 | |
---|
23 | |
---|
24 | #include <stdio.h> |
---|
25 | #include <stdlib.h> |
---|
26 | #include <string.h> |
---|
27 | #define __NO_CTYPE |
---|
28 | #include <ctype.h> |
---|
29 | |
---|
30 | #include "ureadseq.h" |
---|
31 | prettyopts gPretty; |
---|
32 | |
---|
33 | #pragma segment ureadseq |
---|
34 | |
---|
35 | |
---|
36 | int Strcasecmp(const char *a, const char *b) /* from Nlm_StrICmp */ |
---|
37 | { |
---|
38 | int diff, done; |
---|
39 | if (a == b) return 0; |
---|
40 | done = 0; |
---|
41 | while (! done) { |
---|
42 | diff = to_upper(*a) - to_upper(*b); |
---|
43 | if (diff) return diff; |
---|
44 | if (*a == '\0') done = 1; |
---|
45 | else { a++; b++; } |
---|
46 | } |
---|
47 | return 0; |
---|
48 | } |
---|
49 | |
---|
50 | int Strncasecmp(const char *a, const char *b, long maxn) /* from Nlm_StrNICmp */ |
---|
51 | { |
---|
52 | int diff, done; |
---|
53 | if (a == b) return 0; |
---|
54 | done = 0; |
---|
55 | while (! done) { |
---|
56 | diff = to_upper(*a) - to_upper(*b); |
---|
57 | if (diff) return diff; |
---|
58 | if (*a == '\0') done = 1; |
---|
59 | else { |
---|
60 | a++; b++; maxn--; |
---|
61 | if (! maxn) done = 1; |
---|
62 | } |
---|
63 | } |
---|
64 | return 0; |
---|
65 | } |
---|
66 | |
---|
67 | |
---|
68 | |
---|
69 | |
---|
70 | |
---|
71 | #ifndef Local |
---|
72 | # define Local static /* local functions */ |
---|
73 | #endif |
---|
74 | |
---|
75 | #define kStartLength 500000 /* 20Apr93 temp. bug fix */ |
---|
76 | |
---|
77 | const char *aminos = "ABCDEFGHIKLMNPQRSTVWXYZ*"; |
---|
78 | const char *primenuc = "ACGTU"; |
---|
79 | const char *protonly = "EFIPQZ"; |
---|
80 | |
---|
81 | const char kNocountsymbols[5] = "_.-?"; |
---|
82 | const char stdsymbols[6] = "_.-*?"; |
---|
83 | const char allsymbols[32] = "_.-*?<>{}[]()!@#$%^&=+;:'/|`~\"\\"; |
---|
84 | static const char *seqsymbols = allsymbols; |
---|
85 | |
---|
86 | const char nummask[11] = "0123456789"; |
---|
87 | const char nonummask[11] = "~!@#$%^&*("; |
---|
88 | |
---|
89 | /* |
---|
90 | use general form of isseqchar -- all chars + symbols. |
---|
91 | no formats except nbrf (?) use symbols in data area as |
---|
92 | anything other than sequence chars. |
---|
93 | */ |
---|
94 | |
---|
95 | |
---|
96 | |
---|
97 | /* Local variables for readSeq: */ |
---|
98 | struct ReadSeqVars { |
---|
99 | short choice, err, nseq; |
---|
100 | long seqlen, maxseq, seqlencount; |
---|
101 | short topnseq; |
---|
102 | long topseqlen; |
---|
103 | const char *fname; |
---|
104 | char *seq, *seqid, matchchar; |
---|
105 | boolean allDone, done, filestart, addit; |
---|
106 | FILE *f; |
---|
107 | long linestart; |
---|
108 | char s[256], *sp; |
---|
109 | |
---|
110 | int (*isseqcharfirst8)(); /* Patch by o. strunk (ARB) to allow numbers in genbank sequences*/ |
---|
111 | int (*isseqchar)(); |
---|
112 | /* int (*isseqchar)(int c); << sgi cc hates (int c) */ |
---|
113 | }; |
---|
114 | |
---|
115 | |
---|
116 | |
---|
117 | int isSeqChar(int c) |
---|
118 | { |
---|
119 | return (isalpha(c) || strchr(seqsymbols,c)); |
---|
120 | } |
---|
121 | |
---|
122 | int isSeqNumChar(int c) |
---|
123 | { |
---|
124 | return (isalnum(c) || strchr(seqsymbols,c)); |
---|
125 | } |
---|
126 | |
---|
127 | |
---|
128 | int isAnyChar(int c) |
---|
129 | { |
---|
130 | return isascii(c); /* wrap in case isascii is macro */ |
---|
131 | } |
---|
132 | |
---|
133 | Local void readline(FILE *f, char *s, long *linestart) |
---|
134 | { |
---|
135 | char *cp; |
---|
136 | |
---|
137 | *linestart= ftell(f); |
---|
138 | if (NULL == fgets(s, 256, f)) |
---|
139 | *s = 0; |
---|
140 | else { |
---|
141 | cp = strchr(s, '\n'); |
---|
142 | if (cp != NULL) *cp = 0; |
---|
143 | } |
---|
144 | } |
---|
145 | |
---|
146 | Local void GetLine(struct ReadSeqVars *V) |
---|
147 | { |
---|
148 | readline(V->f, V->s, &V->linestart); |
---|
149 | } |
---|
150 | |
---|
151 | Local void unGetLine(struct ReadSeqVars *V) |
---|
152 | { |
---|
153 | fseek(V->f, V->linestart, 0); |
---|
154 | } |
---|
155 | |
---|
156 | |
---|
157 | Local void addseq(char *s, struct ReadSeqVars * V) |
---|
158 | { |
---|
159 | char *ptr; |
---|
160 | int count = 0; |
---|
161 | if (V->addit) { |
---|
162 | for (;*s != 0;s++,count++) { |
---|
163 | if (count < 9 && V->isseqcharfirst8) { |
---|
164 | if (!(V->isseqcharfirst8) (*s)) continue; |
---|
165 | } |
---|
166 | else { |
---|
167 | if (!(V->isseqchar) (*s)) continue; |
---|
168 | } |
---|
169 | if (V->seqlen >= V->maxseq) { |
---|
170 | V->maxseq += kStartLength; |
---|
171 | ptr = (char *) realloc(V->seq, V->maxseq + 1); |
---|
172 | if (ptr == NULL) { |
---|
173 | V->err = eMemFull; |
---|
174 | return; |
---|
175 | } else |
---|
176 | V->seq = ptr; |
---|
177 | } |
---|
178 | V->seq[(V->seqlen)++] = *s; |
---|
179 | } |
---|
180 | } |
---|
181 | } |
---|
182 | |
---|
183 | Local void countseq(char *s, struct ReadSeqVars *V) |
---|
184 | /* this must count all valid seq chars, for some formats (paup-sequential) even |
---|
185 | if we are skipping seq... */ |
---|
186 | { |
---|
187 | while (*s != 0) { |
---|
188 | if ((V->isseqchar)(*s)) { |
---|
189 | (V->seqlencount)++; |
---|
190 | } |
---|
191 | s++; |
---|
192 | } |
---|
193 | } |
---|
194 | |
---|
195 | |
---|
196 | Local void addinfo(char *s, struct ReadSeqVars *V) |
---|
197 | { |
---|
198 | char s2[256], *si; |
---|
199 | boolean saveadd; |
---|
200 | |
---|
201 | si = s2; |
---|
202 | while (*s == ' ') s++; |
---|
203 | sprintf(si, " %d) %s\n", V->nseq, s); |
---|
204 | |
---|
205 | saveadd = V->addit; |
---|
206 | V->addit = true; |
---|
207 | V->isseqchar = isAnyChar; |
---|
208 | addseq( si, V); |
---|
209 | V->addit = saveadd; |
---|
210 | V->isseqchar = isSeqChar; |
---|
211 | } |
---|
212 | |
---|
213 | |
---|
214 | |
---|
215 | |
---|
216 | Local void readLoop(short margin, boolean addfirst, |
---|
217 | boolean (*endTest)(boolean *addend, boolean *ungetend, struct ReadSeqVars *V), |
---|
218 | struct ReadSeqVars *V) |
---|
219 | { |
---|
220 | boolean addend = false; |
---|
221 | boolean ungetend = false; |
---|
222 | |
---|
223 | V->nseq++; |
---|
224 | if (V->choice == kListSequences) V->addit = false; |
---|
225 | else V->addit = (V->nseq == V->choice); |
---|
226 | if (V->addit) V->seqlen = 0; |
---|
227 | |
---|
228 | if (addfirst) addseq(V->s, V); |
---|
229 | do { |
---|
230 | GetLine(V); |
---|
231 | V->done = feof(V->f); |
---|
232 | V->done |= (*endTest)( &addend, &ungetend, V); |
---|
233 | if (V->addit && (addend || !V->done) && (strlen(V->s) > (unsigned)margin)) { |
---|
234 | addseq( (V->s)+margin, V); |
---|
235 | } |
---|
236 | } while (!V->done); |
---|
237 | |
---|
238 | if (V->choice == kListSequences) addinfo(V->seqid, V); |
---|
239 | else { |
---|
240 | V->allDone = (V->nseq >= V->choice); |
---|
241 | if (V->allDone && ungetend) unGetLine(V); |
---|
242 | } |
---|
243 | } |
---|
244 | |
---|
245 | |
---|
246 | |
---|
247 | Local boolean endIG( boolean *addend, boolean *ungetend, struct ReadSeqVars *V) |
---|
248 | { |
---|
249 | *addend = true; /* 1 or 2 occur in line w/ bases */ |
---|
250 | *ungetend= false; |
---|
251 | return((strchr(V->s,'1')!=NULL) || (strchr(V->s,'2')!=NULL)); |
---|
252 | } |
---|
253 | |
---|
254 | Local void readIG(struct ReadSeqVars *V) |
---|
255 | { |
---|
256 | /* 18Aug92: new IG format -- ^L between sequences in place of ";" */ |
---|
257 | char *si; |
---|
258 | |
---|
259 | while (!V->allDone) { |
---|
260 | do { |
---|
261 | GetLine(V); |
---|
262 | for (si= V->s; *si != 0 && *si < ' '; si++) *si= ' '; /* drop controls */ |
---|
263 | if (*si == 0) *V->s= 0; /* chop line to empty */ |
---|
264 | } while (! (feof(V->f) || ((*V->s != 0) && (*V->s != ';') ) )); |
---|
265 | if (feof(V->f)) |
---|
266 | V->allDone = true; |
---|
267 | else { |
---|
268 | strcpy(V->seqid, V->s); |
---|
269 | readLoop(0, false, endIG, V); |
---|
270 | } |
---|
271 | } |
---|
272 | } |
---|
273 | |
---|
274 | |
---|
275 | |
---|
276 | Local boolean endStrider( boolean *addend, boolean *ungetend, struct ReadSeqVars *V) |
---|
277 | { |
---|
278 | *addend = false; |
---|
279 | *ungetend= false; |
---|
280 | return (strstr( V->s, "//") != NULL); |
---|
281 | } |
---|
282 | |
---|
283 | Local void readStrider(struct ReadSeqVars *V) |
---|
284 | { /* ? only 1 seq/file ? */ |
---|
285 | |
---|
286 | while (!V->allDone) { |
---|
287 | GetLine(V); |
---|
288 | if (strstr(V->s,"; DNA sequence ") == V->s) |
---|
289 | strcpy(V->seqid, (V->s)+16); |
---|
290 | else |
---|
291 | strcpy(V->seqid, (V->s)+1); |
---|
292 | while ((!feof(V->f)) && (*V->s == ';')) { |
---|
293 | GetLine(V); |
---|
294 | } |
---|
295 | if (feof(V->f)) V->allDone = true; |
---|
296 | else readLoop(0, true, endStrider, V); |
---|
297 | } |
---|
298 | } |
---|
299 | |
---|
300 | |
---|
301 | Local boolean endPIR( boolean *addend, boolean *ungetend, struct ReadSeqVars *V) |
---|
302 | { |
---|
303 | *addend = false; |
---|
304 | *ungetend= (strstr(V->s,"ENTRY") == V->s); |
---|
305 | return ((strstr(V->s,"///") != NULL) || *ungetend); |
---|
306 | } |
---|
307 | |
---|
308 | Local void readPIR(struct ReadSeqVars *V) |
---|
309 | { /*PIR -- many seqs/file */ |
---|
310 | |
---|
311 | while (!V->allDone) { |
---|
312 | while (! (feof(V->f) || strstr(V->s,"ENTRY") || strstr(V->s,"SEQUENCE")) ) |
---|
313 | GetLine(V); |
---|
314 | strcpy(V->seqid, (V->s)+16); |
---|
315 | while (! (feof(V->f) || strstr(V->s,"SEQUENCE") == V->s)) |
---|
316 | GetLine(V); |
---|
317 | readLoop(0, false, endPIR, V); |
---|
318 | |
---|
319 | if (!V->allDone) { |
---|
320 | while (! (feof(V->f) || ((*V->s != 0) |
---|
321 | && (strstr( V->s,"ENTRY") == V->s)))) |
---|
322 | GetLine(V); |
---|
323 | } |
---|
324 | if (feof(V->f)) V->allDone = true; |
---|
325 | } |
---|
326 | } |
---|
327 | |
---|
328 | |
---|
329 | Local boolean endGB( boolean *addend, boolean *ungetend, struct ReadSeqVars *V) |
---|
330 | { |
---|
331 | *addend = false; |
---|
332 | *ungetend= (strstr(V->s,"LOCUS") == V->s); |
---|
333 | return ((strstr(V->s,"//") != NULL) || *ungetend); |
---|
334 | } |
---|
335 | |
---|
336 | Local void readGenBank(struct ReadSeqVars * V) |
---|
337 | { /* GenBank -- many seqs/file */ |
---|
338 | |
---|
339 | V->isseqchar = isSeqNumChar; |
---|
340 | V->isseqcharfirst8 = isSeqChar; |
---|
341 | |
---|
342 | while (!V->allDone) { |
---|
343 | strcpy(V->seqid, (V->s)+12); |
---|
344 | while (! (feof(V->f) || strstr(V->s,"ORIGIN") == V->s)) |
---|
345 | GetLine(V); |
---|
346 | readLoop(0, false, endGB, V); |
---|
347 | |
---|
348 | if (!V->allDone) { |
---|
349 | while (! (feof(V->f) || ((*V->s != 0) |
---|
350 | && (strstr( V->s, "LOCUS") == V->s)))) |
---|
351 | GetLine(V); |
---|
352 | } |
---|
353 | if (feof(V->f)) V->allDone = true; |
---|
354 | } |
---|
355 | |
---|
356 | V->isseqchar = isSeqChar; |
---|
357 | V->isseqcharfirst8 = 0; |
---|
358 | } |
---|
359 | |
---|
360 | |
---|
361 | Local boolean endNBRF( boolean *addend, boolean *ungetend, struct ReadSeqVars *V) |
---|
362 | { |
---|
363 | char *a; |
---|
364 | |
---|
365 | if ((a = strchr(V->s, '*')) != NULL) { /* end of 1st seq */ |
---|
366 | /* "*" can be valid base symbol, drop it here */ |
---|
367 | *a = 0; |
---|
368 | *addend = true; |
---|
369 | *ungetend= false; |
---|
370 | return(true); |
---|
371 | } |
---|
372 | else if (*V->s == '>') { /* start of next seq */ |
---|
373 | *addend = false; |
---|
374 | *ungetend= true; |
---|
375 | return(true); |
---|
376 | } |
---|
377 | else |
---|
378 | return(false); |
---|
379 | } |
---|
380 | |
---|
381 | Local void readNBRF(struct ReadSeqVars *V) |
---|
382 | { |
---|
383 | while (!V->allDone) { |
---|
384 | strcpy(V->seqid, (V->s)+4); |
---|
385 | GetLine(V); /*skip title-junk line*/ |
---|
386 | readLoop(0, false, endNBRF, V); |
---|
387 | if (!V->allDone) { |
---|
388 | while (!(feof(V->f) || (*V->s != 0 && *V->s == '>'))) |
---|
389 | GetLine(V); |
---|
390 | } |
---|
391 | if (feof(V->f)) V->allDone = true; |
---|
392 | } |
---|
393 | } |
---|
394 | |
---|
395 | |
---|
396 | |
---|
397 | Local boolean endPearson( boolean *addend, boolean *ungetend, struct ReadSeqVars *V) |
---|
398 | { |
---|
399 | *addend = false; |
---|
400 | *ungetend= true; |
---|
401 | return(*V->s == '>'); |
---|
402 | } |
---|
403 | |
---|
404 | Local void readPearson(struct ReadSeqVars *V) |
---|
405 | { |
---|
406 | while (!V->allDone) { |
---|
407 | strcpy(V->seqid, (V->s)+1); |
---|
408 | readLoop(0, false, endPearson, V); |
---|
409 | if (!V->allDone) { |
---|
410 | while (!(feof(V->f) || ((*V->s != 0) && (*V->s == '>')))) |
---|
411 | GetLine(V); |
---|
412 | } |
---|
413 | if (feof(V->f)) V->allDone = true; |
---|
414 | } |
---|
415 | } |
---|
416 | |
---|
417 | |
---|
418 | |
---|
419 | Local boolean endEMBL( boolean *addend, boolean *ungetend, struct ReadSeqVars *V) |
---|
420 | { |
---|
421 | *addend = false; |
---|
422 | *ungetend= (strstr(V->s,"ID ") == V->s); |
---|
423 | return ((strstr(V->s,"//") != NULL) || *ungetend); |
---|
424 | } |
---|
425 | |
---|
426 | Local void readEMBL(struct ReadSeqVars *V) |
---|
427 | { |
---|
428 | while (!V->allDone) { |
---|
429 | strcpy(V->seqid, (V->s)+5); |
---|
430 | do { |
---|
431 | GetLine(V); |
---|
432 | } while (!(feof(V->f) | (strstr(V->s,"SQ ") == V->s))); |
---|
433 | |
---|
434 | readLoop(0, false, endEMBL, V); |
---|
435 | if (!V->allDone) { |
---|
436 | while (!(feof(V->f) | |
---|
437 | ((*V->s != '\0') & (strstr(V->s,"ID ") == V->s)))) |
---|
438 | GetLine(V); |
---|
439 | } |
---|
440 | if (feof(V->f)) V->allDone = true; |
---|
441 | } |
---|
442 | } |
---|
443 | |
---|
444 | |
---|
445 | |
---|
446 | Local boolean endZuker( boolean *addend, boolean *ungetend, struct ReadSeqVars *V) |
---|
447 | { |
---|
448 | *addend = false; |
---|
449 | *ungetend= true; |
---|
450 | return( *V->s == '(' ); |
---|
451 | } |
---|
452 | |
---|
453 | Local void readZuker(struct ReadSeqVars *V) |
---|
454 | { |
---|
455 | /*! 1st string is Zuker's Fortran format */ |
---|
456 | |
---|
457 | while (!V->allDone) { |
---|
458 | GetLine(V); /*s == "seqLen seqid string..."*/ |
---|
459 | strcpy(V->seqid, (V->s)+6); |
---|
460 | readLoop(0, false, endZuker, V); |
---|
461 | if (!V->allDone) { |
---|
462 | while (!(feof(V->f) | |
---|
463 | ((*V->s != '\0') & (*V->s == '(')))) |
---|
464 | GetLine(V); |
---|
465 | } |
---|
466 | if (feof(V->f)) V->allDone = true; |
---|
467 | } |
---|
468 | } |
---|
469 | |
---|
470 | |
---|
471 | |
---|
472 | Local boolean endFitch( boolean *addend, boolean *ungetend, struct ReadSeqVars *V) |
---|
473 | { |
---|
474 | /* this is a somewhat shaky end, |
---|
475 | 1st char of line is non-blank for seq. title |
---|
476 | */ |
---|
477 | *addend = false; |
---|
478 | *ungetend= true; |
---|
479 | return( *V->s != ' ' ); |
---|
480 | } |
---|
481 | |
---|
482 | Local void readFitch(struct ReadSeqVars *V) |
---|
483 | { |
---|
484 | boolean first; |
---|
485 | |
---|
486 | first = true; |
---|
487 | while (!V->allDone) { |
---|
488 | if (!first) strcpy(V->seqid, V->s); |
---|
489 | readLoop(0, first, endFitch, V); |
---|
490 | if (feof(V->f)) V->allDone = true; |
---|
491 | first = false; |
---|
492 | } |
---|
493 | } |
---|
494 | |
---|
495 | |
---|
496 | Local void readPlain(struct ReadSeqVars *V) |
---|
497 | { |
---|
498 | V->nseq++; |
---|
499 | V->addit = (V->choice > 0); |
---|
500 | if (V->addit) V->seqlen = 0; |
---|
501 | addseq(V->seqid, V); /*from above..*/ |
---|
502 | if (V->fname!=NULL) sprintf(V->seqid, "%s [Unknown form]", V->fname); |
---|
503 | else sprintf(V->seqid, " [Unknown form]"); |
---|
504 | do { |
---|
505 | addseq(V->s, V); |
---|
506 | V->done = feof(V->f); |
---|
507 | GetLine(V); |
---|
508 | } while (!V->done); |
---|
509 | if (V->choice == kListSequences) addinfo(V->seqid, V); |
---|
510 | V->allDone = true; |
---|
511 | } |
---|
512 | |
---|
513 | |
---|
514 | Local void readUWGCG(struct ReadSeqVars *V) |
---|
515 | { |
---|
516 | /* |
---|
517 | 10nov91: Reading GCG files casued duplication of last line when |
---|
518 | EOF followed that line !!! |
---|
519 | fix: GetLine now sets *V->s = 0 |
---|
520 | */ |
---|
521 | char *si; |
---|
522 | |
---|
523 | V->nseq++; |
---|
524 | V->addit = (V->choice > 0); |
---|
525 | if (V->addit) V->seqlen = 0; |
---|
526 | strcpy(V->seqid, V->s); |
---|
527 | /*writeseq: " %s Length: %d (today) Check: %d ..\n" */ |
---|
528 | /*drop above or ".." from id*/ |
---|
529 | if ((si = strstr(V->seqid," Length: "))) *si = 0; |
---|
530 | else if ((si = strstr(V->seqid,".."))) *si = 0; |
---|
531 | do { |
---|
532 | V->done = feof(V->f); |
---|
533 | GetLine(V); |
---|
534 | if (!V->done) addseq((V->s), V); |
---|
535 | } while (!V->done); |
---|
536 | if (V->choice == kListSequences) addinfo(V->seqid, V); |
---|
537 | V->allDone = true; |
---|
538 | } |
---|
539 | |
---|
540 | |
---|
541 | Local void readOlsen(struct ReadSeqVars *V) |
---|
542 | { /* G. Olsen /print output from multiple sequence editor */ |
---|
543 | |
---|
544 | char *si, *sj, *sk, *sm, sid[40], snum[20]; |
---|
545 | boolean indata = false; |
---|
546 | int snumlen; |
---|
547 | |
---|
548 | V->addit = (V->choice > 0); |
---|
549 | if (V->addit) V->seqlen = 0; |
---|
550 | rewind(V->f); V->nseq= 0; |
---|
551 | do { |
---|
552 | GetLine(V); |
---|
553 | V->done = feof(V->f); |
---|
554 | |
---|
555 | if (V->done && !(*V->s)) break; |
---|
556 | else if (indata) { |
---|
557 | if ( (si= strstr(V->s, sid)) |
---|
558 | /* && (strstr(V->s, snum) == si - snumlen - 1) ) { */ |
---|
559 | && (sm= strstr(V->s, snum)) && (sm < si - snumlen) ) { |
---|
560 | |
---|
561 | /* Spaces are valid alignment data !! */ |
---|
562 | /* 17Oct91: Error, the left margin is 21 not 22! */ |
---|
563 | /* dropped some nucs up to now -- my example file was right shifted ! */ |
---|
564 | /* variable right id margin, drop id-2 spaces at end */ |
---|
565 | /* |
---|
566 | VMS CC COMPILER (VAXC031) mess up: |
---|
567 | -- Index of 21 is chopping 1st nuc on VMS systems Only! |
---|
568 | Byte-for-byte same ame rnasep.olsen sequence file ! |
---|
569 | */ |
---|
570 | |
---|
571 | /* si = (V->s)+21; < was this before VMS CC wasted my time */ |
---|
572 | si += 10; /* use strstr index plus offset to outfox VMS CC bug */ |
---|
573 | |
---|
574 | if ((sk = strstr(si, sid))) *(sk-2) = 0; |
---|
575 | for (sk = si; *sk != 0; sk++) { |
---|
576 | if (*sk == ' ') *sk = '.'; |
---|
577 | /* 18aug92: !! some olsen masks are NUMBERS !! which addseq eats */ |
---|
578 | else if (isdigit(*sk)) *sk= nonummask[*sk - '0']; |
---|
579 | } |
---|
580 | |
---|
581 | addseq(si, V); |
---|
582 | } |
---|
583 | } |
---|
584 | |
---|
585 | else if ((sk = strstr(V->s, "): "))) { /* seq info header line */ |
---|
586 | /* 18aug92: correct for diff seqs w/ same name -- use number, e.g. */ |
---|
587 | /* 3 (Agr.tume): agrobacterium.prna 18-JUN-1987 16:12 */ |
---|
588 | /* 328 (Agr.tume): agrobacterium.prna XYZ 19-DEC-1992 */ |
---|
589 | (V->nseq)++; |
---|
590 | si = 1 + strchr(V->s,'('); |
---|
591 | *sk = ' '; |
---|
592 | if (V->choice == kListSequences) addinfo( si, V); |
---|
593 | else if (V->nseq == V->choice) { |
---|
594 | strcpy(V->seqid, si); |
---|
595 | sj = strchr(V->seqid, ':'); |
---|
596 | while (*(--sj) == ' ') ; |
---|
597 | while (--sj != V->seqid) { if (*sj == ' ') *sj = '_'; } |
---|
598 | |
---|
599 | *sk = 0; |
---|
600 | while (*(--sk) == ' ') *sk = 0; |
---|
601 | strcpy(sid, si); |
---|
602 | |
---|
603 | si= V->s; |
---|
604 | while ((*si <= ' ') && (*si != 0)) si++; |
---|
605 | snumlen=0; |
---|
606 | while (si[snumlen] > ' ' && snumlen<20) |
---|
607 | { snum[snumlen]= si[snumlen]; snumlen++; } |
---|
608 | snum[snumlen]= 0; |
---|
609 | } |
---|
610 | |
---|
611 | } |
---|
612 | |
---|
613 | else if (strstr(V->s,"identity: Data:")) { |
---|
614 | indata = true; |
---|
615 | if (V->choice == kListSequences) V->done = true; |
---|
616 | } |
---|
617 | |
---|
618 | } while (!V->done); |
---|
619 | |
---|
620 | V->allDone = true; |
---|
621 | } /*readOlsen*/ |
---|
622 | |
---|
623 | |
---|
624 | Local void readMSF(struct ReadSeqVars *V) |
---|
625 | { /* gcg's MSF, mult. sequence format, interleaved ! */ |
---|
626 | |
---|
627 | char *si, *sj, sid[128]; |
---|
628 | boolean indata = false; |
---|
629 | int iline= 0; |
---|
630 | |
---|
631 | V->addit = (V->choice > 0); |
---|
632 | if (V->addit) V->seqlen = 0; |
---|
633 | rewind(V->f); V->nseq= 0; |
---|
634 | do { |
---|
635 | GetLine(V); |
---|
636 | V->done = feof(V->f); |
---|
637 | |
---|
638 | if (V->done && !(*V->s)) break; |
---|
639 | else if (indata) { |
---|
640 | /*somename ...gpvedai .......t.. aaigr..vad tvgtgptnse aipaltaaet */ |
---|
641 | /* E gvenae.kgv tentna.tad fvaqpvylpe .nqt...... kv.affynrs */ |
---|
642 | |
---|
643 | si= V->s; |
---|
644 | skipwhitespace(si); |
---|
645 | /* for (sj= si; isalnum(*sj); sj++) ; bug -- cdelwiche uses "-", "_" and others in names*/ |
---|
646 | for (sj= si; *sj > ' '; sj++) ; |
---|
647 | *sj= 0; |
---|
648 | if ( *si ) { |
---|
649 | if ( (0==strcmp(si, sid)) ) { |
---|
650 | addseq(sj+1, V); |
---|
651 | } |
---|
652 | iline++; |
---|
653 | } |
---|
654 | } |
---|
655 | |
---|
656 | else if (NULL != (si = strstr(V->s, "Name: "))) { /* seq info header line */ |
---|
657 | /* Name: somename Len: 100 Check: 7009 Weight: 1.00 */ |
---|
658 | |
---|
659 | (V->nseq)++; |
---|
660 | si += 6; |
---|
661 | if (V->choice == kListSequences) addinfo( si, V); |
---|
662 | else if (V->nseq == V->choice) { |
---|
663 | strcpy(V->seqid, si); |
---|
664 | si = V->seqid; |
---|
665 | skipwhitespace(si); |
---|
666 | /* for (sj= si; isalnum(*sj); sj++) ; -- bug */ |
---|
667 | for (sj= si; *sj > ' '; sj++) ; |
---|
668 | *sj= 0; |
---|
669 | strcpy(sid, si); |
---|
670 | } |
---|
671 | } |
---|
672 | |
---|
673 | else if ( strstr(V->s,"//") /*== V->s*/ ) { |
---|
674 | indata = true; |
---|
675 | iline= 0; |
---|
676 | if (V->choice == kListSequences) V->done = true; |
---|
677 | } |
---|
678 | |
---|
679 | } while (!V->done); |
---|
680 | |
---|
681 | |
---|
682 | V->allDone = true; |
---|
683 | } /*readMSF*/ |
---|
684 | |
---|
685 | |
---|
686 | |
---|
687 | Local void readPAUPinterleaved(struct ReadSeqVars *V) |
---|
688 | { /* PAUP mult. sequence format, interleaved or sequential! */ |
---|
689 | |
---|
690 | char *si, *sj, *send, sid[40], sid1[40], saveseq[255]; |
---|
691 | boolean first = true, indata = false, domatch; |
---|
692 | int iline= 0, ifmc, saveseqlen=0; |
---|
693 | |
---|
694 | #define fixmatchchar(s) { \ |
---|
695 | for (ifmc=0; ifmc<saveseqlen; ifmc++) \ |
---|
696 | if (s[ifmc] == V->matchchar) s[ifmc]= saveseq[ifmc]; } |
---|
697 | |
---|
698 | V->addit = (V->choice > 0); |
---|
699 | V->seqlencount = 0; |
---|
700 | if (V->addit) V->seqlen = 0; |
---|
701 | /* rewind(V->f); V->nseq= 0; << do in caller !*/ |
---|
702 | indata= true; /* call here after we find "matrix" */ |
---|
703 | domatch= (V->matchchar > 0); |
---|
704 | |
---|
705 | do { |
---|
706 | GetLine(V); |
---|
707 | V->done = feof(V->f); |
---|
708 | |
---|
709 | if (V->done && !(*V->s)) break; |
---|
710 | else if (indata) { |
---|
711 | /* [ 1 1 1 ]*/ |
---|
712 | /* human aagcttcaccggcgcagtca ttctcataatcgcccacggR cttacatcct*/ |
---|
713 | /* chimp ................a.t. .c.................a ..........*/ |
---|
714 | /* !! need to correct for V->matchchar */ |
---|
715 | si= V->s; |
---|
716 | skipwhitespace(si); |
---|
717 | if (strchr(si,';')) indata= false; |
---|
718 | |
---|
719 | if (isalnum(*si)) { |
---|
720 | /* valid data line starts w/ a left-justified seq name in columns [0..8] */ |
---|
721 | if (first) { |
---|
722 | (V->nseq)++; |
---|
723 | if (V->nseq >= V->topnseq) first= false; |
---|
724 | for (sj = si; isalnum(*sj); sj++) ; |
---|
725 | send= sj; |
---|
726 | skipwhitespace(sj); |
---|
727 | if (V->choice == kListSequences) { |
---|
728 | *send= 0; |
---|
729 | addinfo( si, V); |
---|
730 | } |
---|
731 | else if (V->nseq == V->choice) { |
---|
732 | if (domatch) { |
---|
733 | if (V->nseq == 1) { strcpy( saveseq, sj); saveseqlen= strlen(saveseq); } |
---|
734 | else fixmatchchar( sj); |
---|
735 | } |
---|
736 | addseq(sj, V); |
---|
737 | *send= 0; |
---|
738 | strcpy(V->seqid, si); |
---|
739 | strcpy(sid, si); |
---|
740 | if (V->nseq == 1) strcpy(sid1, sid); |
---|
741 | } |
---|
742 | } |
---|
743 | |
---|
744 | else if ( (strstr(si, sid) == si) ){ |
---|
745 | while (isalnum(*si)) si++; |
---|
746 | skipwhitespace(si); |
---|
747 | if (domatch) { |
---|
748 | if (V->nseq == 1) { strcpy( saveseq, si); saveseqlen= strlen(saveseq); } |
---|
749 | else fixmatchchar( si); |
---|
750 | } |
---|
751 | addseq(si, V); |
---|
752 | } |
---|
753 | |
---|
754 | else if (domatch && (strstr(si, sid1) == si)) { |
---|
755 | strcpy( saveseq, si); |
---|
756 | saveseqlen= strlen(saveseq); |
---|
757 | } |
---|
758 | |
---|
759 | iline++; |
---|
760 | } |
---|
761 | } |
---|
762 | |
---|
763 | else if ( strstr(V->s,"matrix") ) { |
---|
764 | indata = true; |
---|
765 | iline= 0; |
---|
766 | if (V->choice == kListSequences) V->done = true; |
---|
767 | } |
---|
768 | |
---|
769 | } while (!V->done); |
---|
770 | |
---|
771 | V->allDone = true; |
---|
772 | } /*readPAUPinterleaved*/ |
---|
773 | |
---|
774 | |
---|
775 | |
---|
776 | Local void readPAUPsequential(struct ReadSeqVars *V) |
---|
777 | { /* PAUP mult. sequence format, interleaved or sequential! */ |
---|
778 | char *si, *sj; |
---|
779 | boolean atname = true, indata = false; |
---|
780 | |
---|
781 | V->addit = (V->choice > 0); |
---|
782 | if (V->addit) V->seqlen = 0; |
---|
783 | V->seqlencount = 0; |
---|
784 | /* rewind(V->f); V->nseq= 0; << do in caller !*/ |
---|
785 | indata= true; /* call here after we find "matrix" */ |
---|
786 | do { |
---|
787 | GetLine(V); |
---|
788 | V->done = feof(V->f); |
---|
789 | |
---|
790 | if (V->done && !(*V->s)) break; |
---|
791 | else if (indata) { |
---|
792 | /* [ 1 1 1 ]*/ |
---|
793 | /* human aagcttcaccggcgcagtca ttctcataatcgcccacggR cttacatcct*/ |
---|
794 | /* aagcttcaccggcgcagtca ttctcataatcgcccacggR cttacatcct*/ |
---|
795 | /* chimp ................a.t. .c.................a ..........*/ |
---|
796 | /* ................a.t. .c.................a ..........*/ |
---|
797 | |
---|
798 | si= V->s; |
---|
799 | skipwhitespace(si); |
---|
800 | if (strchr(si,';')) indata= false; |
---|
801 | if (isalnum(*si)) { |
---|
802 | /* valid data line starts w/ a left-justified seq name in columns [0..8] */ |
---|
803 | if (atname) { |
---|
804 | (V->nseq)++; |
---|
805 | V->seqlencount = 0; |
---|
806 | atname= false; |
---|
807 | sj= si+1; |
---|
808 | while (isalnum(*sj)) sj++; |
---|
809 | if (V->choice == kListSequences) { |
---|
810 | /* !! we must count bases to know when topseqlen is reached ! */ |
---|
811 | countseq(sj, V); |
---|
812 | if (V->seqlencount >= V->topseqlen) atname= true; |
---|
813 | *sj= 0; |
---|
814 | addinfo( si, V); |
---|
815 | } |
---|
816 | else if (V->nseq == V->choice) { |
---|
817 | addseq(sj, V); |
---|
818 | V->seqlencount= V->seqlen; |
---|
819 | if (V->seqlencount >= V->topseqlen) atname= true; |
---|
820 | *sj= 0; |
---|
821 | strcpy(V->seqid, si); |
---|
822 | } |
---|
823 | else { |
---|
824 | countseq(sj, V); |
---|
825 | if (V->seqlencount >= V->topseqlen) atname= true; |
---|
826 | } |
---|
827 | } |
---|
828 | |
---|
829 | else if (V->nseq == V->choice) { |
---|
830 | addseq(V->s, V); |
---|
831 | V->seqlencount= V->seqlen; |
---|
832 | if (V->seqlencount >= V->topseqlen) atname= true; |
---|
833 | } |
---|
834 | else { |
---|
835 | countseq(V->s, V); |
---|
836 | if (V->seqlencount >= V->topseqlen) atname= true; |
---|
837 | } |
---|
838 | } |
---|
839 | } |
---|
840 | |
---|
841 | else if ( strstr(V->s,"matrix") ) { |
---|
842 | indata = true; |
---|
843 | atname= true; |
---|
844 | if (V->choice == kListSequences) V->done = true; |
---|
845 | } |
---|
846 | |
---|
847 | } while (!V->done); |
---|
848 | |
---|
849 | V->allDone = true; |
---|
850 | } /*readPAUPsequential*/ |
---|
851 | |
---|
852 | |
---|
853 | Local void readPhylipInterleaved(struct ReadSeqVars *V) |
---|
854 | { |
---|
855 | char *si, *sj; |
---|
856 | boolean first = true; |
---|
857 | int iline= 0; |
---|
858 | |
---|
859 | V->addit = (V->choice > 0); |
---|
860 | if (V->addit) V->seqlen = 0; |
---|
861 | V->seqlencount = 0; |
---|
862 | /* sscanf( V->s, "%d%d", &V->topnseq, &V->topseqlen); << topnseq == 0 !!! bad scan !! */ |
---|
863 | si= V->s; |
---|
864 | skipwhitespace(si); |
---|
865 | V->topnseq= atoi(si); |
---|
866 | while (isdigit(*si)) si++; |
---|
867 | skipwhitespace(si); |
---|
868 | V->topseqlen= atol(si); |
---|
869 | /* fprintf(stderr,"Phylip-ileaf: topnseq=%d topseqlen=%d\n",V->topnseq, V->topseqlen); */ |
---|
870 | |
---|
871 | do { |
---|
872 | GetLine(V); |
---|
873 | V->done = feof(V->f); |
---|
874 | |
---|
875 | if (V->done && !(*V->s)) break; |
---|
876 | si= V->s; |
---|
877 | skipwhitespace(si); |
---|
878 | if (*si != 0) { |
---|
879 | |
---|
880 | if (first) { /* collect seq names + seq, as fprintf(outf,"%-10s ",seqname); */ |
---|
881 | (V->nseq)++; |
---|
882 | if (V->nseq >= V->topnseq) first= false; |
---|
883 | sj= V->s+10; /* past name, start of data */ |
---|
884 | if (V->choice == kListSequences) { |
---|
885 | *sj= 0; |
---|
886 | addinfo( si, V); |
---|
887 | } |
---|
888 | else if (V->nseq == V->choice) { |
---|
889 | addseq(sj, V); |
---|
890 | *sj= 0; |
---|
891 | strcpy(V->seqid, si); |
---|
892 | } |
---|
893 | } |
---|
894 | else if ( iline % V->nseq == V->choice -1 ) { |
---|
895 | addseq(si, V); |
---|
896 | } |
---|
897 | iline++; |
---|
898 | } |
---|
899 | } while (!V->done); |
---|
900 | |
---|
901 | V->allDone = true; |
---|
902 | } /*readPhylipInterleaved*/ |
---|
903 | |
---|
904 | |
---|
905 | |
---|
906 | Local boolean endPhylipSequential( boolean *addend, boolean *ungetend, struct ReadSeqVars *V) |
---|
907 | { |
---|
908 | *addend = false; |
---|
909 | *ungetend= false; |
---|
910 | countseq( V->s, V); |
---|
911 | return V->seqlencount >= V->topseqlen; |
---|
912 | } |
---|
913 | |
---|
914 | Local void readPhylipSequential(struct ReadSeqVars *V) |
---|
915 | { |
---|
916 | short i; |
---|
917 | char *si; |
---|
918 | /* sscanf( V->s, "%d%d", &V->topnseq, &V->topseqlen); < ? bad sscan ? */ |
---|
919 | si= V->s; |
---|
920 | skipwhitespace(si); |
---|
921 | V->topnseq= atoi(si); |
---|
922 | while (isdigit(*si)) si++; |
---|
923 | skipwhitespace(si); |
---|
924 | V->topseqlen= atol(si); |
---|
925 | GetLine(V); |
---|
926 | while (!V->allDone) { |
---|
927 | V->seqlencount= 0; |
---|
928 | strncpy(V->seqid, (V->s), 10); |
---|
929 | V->seqid[10]= 0; |
---|
930 | for (i=0; i<10 && V->s[i]; i++) V->s[i]= ' '; |
---|
931 | readLoop(0, true, endPhylipSequential, V); |
---|
932 | if (feof(V->f)) V->allDone = true; |
---|
933 | } |
---|
934 | } |
---|
935 | |
---|
936 | |
---|
937 | |
---|
938 | |
---|
939 | Local void readSeqMain( |
---|
940 | struct ReadSeqVars *V, |
---|
941 | const long skiplines_, |
---|
942 | const short format_) |
---|
943 | { |
---|
944 | #define tolowerstr(s) { long Itlwr, Ntlwr= strlen(s); \ |
---|
945 | for (Itlwr=0; Itlwr<Ntlwr; Itlwr++) s[Itlwr]= to_lower(s[Itlwr]); } |
---|
946 | |
---|
947 | boolean gotuw; |
---|
948 | long l; |
---|
949 | |
---|
950 | V->linestart= 0; |
---|
951 | V->matchchar= 0; |
---|
952 | if (V->f == NULL) |
---|
953 | V->err = eFileNotFound; |
---|
954 | else { |
---|
955 | |
---|
956 | for (l = skiplines_; l > 0; l--) GetLine( V); |
---|
957 | |
---|
958 | do { |
---|
959 | GetLine( V); |
---|
960 | for (l= strlen(V->s); (l > 0) && (V->s[l] == ' '); l--) ; |
---|
961 | } while ((l == 0) && !feof(V->f)); |
---|
962 | |
---|
963 | if (feof(V->f)) V->err = eNoData; |
---|
964 | |
---|
965 | else switch (format_) { |
---|
966 | case kPlain : readPlain(V); break; |
---|
967 | case kIG : readIG(V); break; |
---|
968 | case kStrider: readStrider(V); break; |
---|
969 | case kGenBank: readGenBank(V); break; |
---|
970 | case kPIR : readPIR(V); break; |
---|
971 | case kNBRF : readNBRF(V); break; |
---|
972 | case kPearson: readPearson(V); break; |
---|
973 | case kEMBL : readEMBL(V); break; |
---|
974 | case kZuker : readZuker(V); break; |
---|
975 | case kOlsen : readOlsen(V); break; |
---|
976 | case kMSF : readMSF(V); break; |
---|
977 | |
---|
978 | case kPAUP : { |
---|
979 | boolean done= false; |
---|
980 | boolean interleaved= false; |
---|
981 | char *cp; |
---|
982 | /* rewind(V->f); V->nseq= 0; ?? assume it is at top ?? skiplines ... */ |
---|
983 | while (!done) { |
---|
984 | GetLine( V); |
---|
985 | tolowerstr( V->s); |
---|
986 | if (strstr( V->s, "matrix")) done= true; |
---|
987 | if (strstr( V->s, "interleav")) interleaved= true; |
---|
988 | if (NULL != (cp=strstr( V->s, "ntax=")) ) V->topnseq= atoi(cp+5); |
---|
989 | if (NULL != (cp=strstr( V->s, "nchar=")) ) V->topseqlen= atoi(cp+6); |
---|
990 | if (NULL != (cp=strstr( V->s, "matchchar=")) ) { |
---|
991 | cp += 10; |
---|
992 | if (*cp=='\'') cp++; |
---|
993 | else if (*cp=='"') cp++; |
---|
994 | V->matchchar= *cp; |
---|
995 | } |
---|
996 | } |
---|
997 | if (interleaved) readPAUPinterleaved(V); |
---|
998 | else readPAUPsequential(V); |
---|
999 | } |
---|
1000 | break; |
---|
1001 | |
---|
1002 | /* kPhylip: ! can't determine in middle of file which type it is...*/ |
---|
1003 | /* test for interleave or sequential and use Phylip4(ileave) or Phylip2 */ |
---|
1004 | case kPhylip2: |
---|
1005 | readPhylipSequential(V); |
---|
1006 | break; |
---|
1007 | case kPhylip4: /* == kPhylip3 */ |
---|
1008 | readPhylipInterleaved(V); |
---|
1009 | break; |
---|
1010 | |
---|
1011 | default: |
---|
1012 | V->err = eUnknownFormat; |
---|
1013 | break; |
---|
1014 | |
---|
1015 | case kFitch : |
---|
1016 | strcpy(V->seqid, V->s); GetLine(V); |
---|
1017 | readFitch(V); |
---|
1018 | break; |
---|
1019 | |
---|
1020 | case kGCG: |
---|
1021 | do { |
---|
1022 | gotuw = (strstr(V->s,"..") != NULL); |
---|
1023 | if (gotuw) readUWGCG(V); |
---|
1024 | GetLine(V); |
---|
1025 | } while (!(feof(V->f) || V->allDone)); |
---|
1026 | break; |
---|
1027 | } |
---|
1028 | } |
---|
1029 | |
---|
1030 | V->filestart= false; |
---|
1031 | V->seq[V->seqlen] = 0; /* stick a string terminator on it */ |
---|
1032 | } |
---|
1033 | |
---|
1034 | |
---|
1035 | char *readSeqFp( |
---|
1036 | const short whichEntry_, /* index to sequence in file */ |
---|
1037 | FILE *fp_, /* pointer to open seq file */ |
---|
1038 | const long skiplines_, |
---|
1039 | const short format_, /* sequence file format */ |
---|
1040 | long *seqlen_, /* return seq size */ |
---|
1041 | short *nseq_, /* number of seqs in file, for listSeqs() */ |
---|
1042 | short *error_, /* return error */ |
---|
1043 | char *seqid_) /* return seq name/info */ |
---|
1044 | { |
---|
1045 | struct ReadSeqVars V; |
---|
1046 | |
---|
1047 | if (format_ < kMinFormat || format_ > kMaxFormat) { |
---|
1048 | *error_ = eUnknownFormat; |
---|
1049 | *seqlen_ = 0; |
---|
1050 | return NULL; |
---|
1051 | } |
---|
1052 | |
---|
1053 | V.choice = whichEntry_; |
---|
1054 | V.fname = NULL; /* don't know */ |
---|
1055 | V.seq = (char*) calloc(1, kStartLength+1); |
---|
1056 | V.maxseq = kStartLength; |
---|
1057 | V.seqlen = 0; |
---|
1058 | V.seqid = seqid_; |
---|
1059 | |
---|
1060 | V.f = fp_; |
---|
1061 | V.filestart= (ftell( fp_) == 0); |
---|
1062 | /* !! in sequential read, must remove current seq position from choice/whichEntry_ counter !! ... */ |
---|
1063 | if (V.filestart) V.nseq = 0; |
---|
1064 | else V.nseq= *nseq_; /* track where we are in file...*/ |
---|
1065 | |
---|
1066 | *V.seqid = '\0'; |
---|
1067 | V.err = 0; |
---|
1068 | V.nseq = 0; |
---|
1069 | V.isseqchar = isSeqChar; |
---|
1070 | V.isseqcharfirst8 = 0; |
---|
1071 | if (V.choice == kListSequences) ; /* leave as is */ |
---|
1072 | else if (V.choice <= 0) V.choice = 1; /* default ?? */ |
---|
1073 | V.addit = (V.choice > 0); |
---|
1074 | V.allDone = false; |
---|
1075 | |
---|
1076 | readSeqMain(&V, skiplines_, format_); |
---|
1077 | |
---|
1078 | *error_ = V.err; |
---|
1079 | *seqlen_ = V.seqlen; |
---|
1080 | *nseq_ = V.nseq; |
---|
1081 | return V.seq; |
---|
1082 | } |
---|
1083 | |
---|
1084 | char *readSeq( |
---|
1085 | const short whichEntry_, /* index to sequence in file */ |
---|
1086 | const char *filename_, /* file name */ |
---|
1087 | const long skiplines_, |
---|
1088 | const short format_, /* sequence file format */ |
---|
1089 | long *seqlen_, /* return seq size */ |
---|
1090 | short *nseq_, /* number of seqs in file, for listSeqs() */ |
---|
1091 | short *error_, /* return error */ |
---|
1092 | char *seqid_) /* return seq name/info */ |
---|
1093 | { |
---|
1094 | struct ReadSeqVars V; |
---|
1095 | |
---|
1096 | if (format_ < kMinFormat || format_ > kMaxFormat) { |
---|
1097 | *error_ = eUnknownFormat; |
---|
1098 | *seqlen_ = 0; |
---|
1099 | return NULL; |
---|
1100 | } |
---|
1101 | |
---|
1102 | V.choice = whichEntry_; |
---|
1103 | V.fname = filename_; /* don't need to copy string, just ptr to it */ |
---|
1104 | V.seq = (char*) calloc(1, kStartLength+1); |
---|
1105 | V.maxseq = kStartLength; |
---|
1106 | V.seqlen = 0; |
---|
1107 | V.seqid = seqid_; |
---|
1108 | |
---|
1109 | V.f = NULL; |
---|
1110 | *V.seqid = '\0'; |
---|
1111 | V.err = 0; |
---|
1112 | V.nseq = 0; |
---|
1113 | V.isseqchar = isSeqChar; |
---|
1114 | V.isseqcharfirst8 = 0; |
---|
1115 | if (V.choice == kListSequences) ; /* leave as is */ |
---|
1116 | else if (V.choice <= 0) V.choice = 1; /* default ?? */ |
---|
1117 | V.addit = (V.choice > 0); |
---|
1118 | V.allDone = false; |
---|
1119 | |
---|
1120 | V.f = fopen(V.fname, "r"); |
---|
1121 | V.filestart= true; |
---|
1122 | |
---|
1123 | readSeqMain(&V, skiplines_, format_); |
---|
1124 | |
---|
1125 | if (V.f != NULL) fclose(V.f); |
---|
1126 | *error_ = V.err; |
---|
1127 | *seqlen_ = V.seqlen; |
---|
1128 | *nseq_ = V.nseq; |
---|
1129 | return V.seq; |
---|
1130 | } |
---|
1131 | |
---|
1132 | |
---|
1133 | |
---|
1134 | |
---|
1135 | |
---|
1136 | char *listSeqs( |
---|
1137 | const char *filename_, /* file name */ |
---|
1138 | const long skiplines_, |
---|
1139 | const short format_, /* sequence file format */ |
---|
1140 | short *nseq_, /* number of seqs in file, for listSeqs() */ |
---|
1141 | short *error_) /* return error */ |
---|
1142 | { |
---|
1143 | char seqid[256]; |
---|
1144 | long seqlen; |
---|
1145 | |
---|
1146 | return readSeq( kListSequences, filename_, skiplines_, format_, |
---|
1147 | &seqlen, nseq_, error_, seqid); |
---|
1148 | } |
---|
1149 | |
---|
1150 | |
---|
1151 | |
---|
1152 | |
---|
1153 | short seqFileFormat( /* return sequence format number, see ureadseq.h */ |
---|
1154 | const char *filename, |
---|
1155 | long *skiplines, /* return #lines to skip any junk like mail header */ |
---|
1156 | short *error) /* return any error value or 0 */ |
---|
1157 | { |
---|
1158 | FILE *fseq; |
---|
1159 | short format; |
---|
1160 | |
---|
1161 | fseq = fopen(filename, "r"); |
---|
1162 | format= seqFileFormatFp( fseq, skiplines, error); |
---|
1163 | if (fseq!=NULL) fclose(fseq); |
---|
1164 | return format; |
---|
1165 | } |
---|
1166 | |
---|
1167 | short seqFileFormatFp( |
---|
1168 | FILE *fseq, |
---|
1169 | long *skiplines, /* return #lines to skip any junk like mail header */ |
---|
1170 | short *error) /* return any error value or 0 */ |
---|
1171 | { |
---|
1172 | boolean foundIG= false, foundStrider= false, |
---|
1173 | foundGB= false, foundPIR= false, foundEMBL= false, foundNBRF= false, |
---|
1174 | foundPearson= false, foundFitch= false, foundPhylip= false, foundZuker= false, |
---|
1175 | gotolsen= false, gotpaup = false, gotasn1 = false, gotuw= false, gotMSF= false, |
---|
1176 | isfitch= false, isphylip= false, done= false; |
---|
1177 | short format= kUnknown; |
---|
1178 | int nlines= 0, k, splen= 0, otherlines= 0, aminolines= 0, dnalines= 0; |
---|
1179 | char sp[256]; |
---|
1180 | long linestart=0; |
---|
1181 | int maxlines2check=500; |
---|
1182 | |
---|
1183 | #define ReadOneLine(sp) \ |
---|
1184 | { done |= (feof(fseq)); \ |
---|
1185 | readline( fseq, sp, &linestart); \ |
---|
1186 | if (!done) { splen = strlen(sp); ++nlines; } } |
---|
1187 | |
---|
1188 | *skiplines = 0; |
---|
1189 | *error = 0; |
---|
1190 | if (fseq == NULL) { *error = eFileNotFound; return kNoformat; } |
---|
1191 | |
---|
1192 | while ( !done ) { |
---|
1193 | ReadOneLine(sp); |
---|
1194 | |
---|
1195 | /* check for mailer head & skip past if found */ |
---|
1196 | if (nlines < 4 && !done) { |
---|
1197 | if ((strstr(sp,"From ") == sp) || (strstr(sp,"Received:") == sp)) { |
---|
1198 | do { |
---|
1199 | /* skip all lines until find one blank line */ |
---|
1200 | ReadOneLine(sp); |
---|
1201 | if (!done) for (k=0; (k<splen) && (sp[k]==' '); k++) ; |
---|
1202 | } while ((!done) && (k < splen)); |
---|
1203 | *skiplines = nlines; /* !? do we want #lines or #bytes ?? */ |
---|
1204 | } |
---|
1205 | } |
---|
1206 | |
---|
1207 | if (sp==NULL || *sp==0) |
---|
1208 | ; /* nada */ |
---|
1209 | |
---|
1210 | /* high probability identities: */ |
---|
1211 | |
---|
1212 | else if ( strstr(sp,"MSF:") && strstr(sp,"Type:") && strstr(sp,"Check:") ) |
---|
1213 | gotMSF= true; |
---|
1214 | |
---|
1215 | else if ((strstr(sp,"..") != NULL) && (strstr(sp,"Check:") != NULL)) |
---|
1216 | gotuw= true; |
---|
1217 | |
---|
1218 | else if (strstr(sp,"identity: Data:") != NULL) |
---|
1219 | gotolsen= true; |
---|
1220 | |
---|
1221 | else if ( strstr(sp,"::=") && |
---|
1222 | (strstr(sp,"Bioseq") || /* Bioseq or Bioseq-set */ |
---|
1223 | strstr(sp,"Seq-entry") || |
---|
1224 | strstr(sp,"Seq-submit") ) ) /* can we read submit format? */ |
---|
1225 | gotasn1= true; |
---|
1226 | |
---|
1227 | else if ( strstr(sp,"#NEXUS") == sp ) |
---|
1228 | gotpaup= true; |
---|
1229 | |
---|
1230 | /* uncertain identities: */ |
---|
1231 | |
---|
1232 | else if (*sp ==';') { |
---|
1233 | if (strstr(sp,"Strider") !=NULL) foundStrider= true; |
---|
1234 | else foundIG= true; |
---|
1235 | } |
---|
1236 | |
---|
1237 | else if (strstr(sp,"LOCUS") == sp) |
---|
1238 | foundGB= true; |
---|
1239 | else if (strstr(sp,"ORIGIN") == sp) |
---|
1240 | foundGB= true; |
---|
1241 | |
---|
1242 | else if (strstr(sp,"ENTRY ") == sp) /* ? also (strcmp(sp,"\\\\\\")==0) */ |
---|
1243 | foundPIR= true; |
---|
1244 | else if (strstr(sp,"SEQUENCE") == sp) |
---|
1245 | foundPIR= true; |
---|
1246 | |
---|
1247 | else if (*sp == '>') { |
---|
1248 | if (sp[3] == ';') foundNBRF= true; |
---|
1249 | else foundPearson= true; |
---|
1250 | } |
---|
1251 | |
---|
1252 | else if (strstr(sp,"ID ") == sp) |
---|
1253 | foundEMBL= true; |
---|
1254 | else if (strstr(sp,"SQ ") == sp) |
---|
1255 | foundEMBL= true; |
---|
1256 | |
---|
1257 | else if (*sp == '(') |
---|
1258 | foundZuker= true; |
---|
1259 | |
---|
1260 | else { |
---|
1261 | if (nlines - *skiplines == 1) { |
---|
1262 | int ispp= 0, ilen= 0; |
---|
1263 | sscanf( sp, "%d%d", &ispp, &ilen); |
---|
1264 | if (ispp > 0 && ilen > 0) isphylip= true; |
---|
1265 | } |
---|
1266 | else if (isphylip && nlines - *skiplines == 2) { |
---|
1267 | int tseq; |
---|
1268 | tseq= getseqtype(sp+10, strlen(sp+10)); |
---|
1269 | if ( isalpha(*sp) /* 1st letter in 2nd line must be of a name */ |
---|
1270 | && (tseq != kOtherSeq)) /* sequence section must be okay */ |
---|
1271 | foundPhylip= true; |
---|
1272 | } |
---|
1273 | |
---|
1274 | for (k=0, isfitch= true; isfitch & (k < splen); k++) { |
---|
1275 | if (k % 4 == 0) isfitch &= (sp[k] == ' '); |
---|
1276 | else isfitch &= (sp[k] != ' '); |
---|
1277 | } |
---|
1278 | if (isfitch & (splen > 20)) foundFitch= true; |
---|
1279 | |
---|
1280 | /* kRNA && kDNA are fairly certain...*/ |
---|
1281 | switch (getseqtype( sp, splen)) { |
---|
1282 | case kOtherSeq: otherlines++; break; |
---|
1283 | case kAmino : if (splen>20) aminolines++; break; |
---|
1284 | case kDNA : |
---|
1285 | case kRNA : if (splen>20) dnalines++; break; |
---|
1286 | case kNucleic : break; /* not much info ? */ |
---|
1287 | } |
---|
1288 | |
---|
1289 | } |
---|
1290 | |
---|
1291 | /* pretty certain */ |
---|
1292 | if (gotolsen) { |
---|
1293 | format= kOlsen; |
---|
1294 | done= true; |
---|
1295 | } |
---|
1296 | else if (gotMSF) { |
---|
1297 | format= kMSF; |
---|
1298 | done= true; |
---|
1299 | } |
---|
1300 | else if (gotasn1) { |
---|
1301 | /* !! we need to look further and return kASNseqentry | kASNseqset */ |
---|
1302 | /* |
---|
1303 | seqentry key is Seq-entry ::= |
---|
1304 | seqset key is Bioseq-set ::= |
---|
1305 | ?? can't read these yet w/ ncbi tools ?? |
---|
1306 | Seq-submit ::= |
---|
1307 | Bioseq ::= << fails both bioseq-seq and seq-entry parsers ! |
---|
1308 | */ |
---|
1309 | if (strstr(sp,"Bioseq-set")) format= kASNseqset; |
---|
1310 | else if (strstr(sp,"Seq-entry")) format= kASNseqentry; |
---|
1311 | else format= kASN1; /* other form, we can't yet read... */ |
---|
1312 | done= true; |
---|
1313 | } |
---|
1314 | else if (gotpaup) { |
---|
1315 | format= kPAUP; |
---|
1316 | done= true; |
---|
1317 | } |
---|
1318 | |
---|
1319 | else if (gotuw) { |
---|
1320 | if (foundIG) format= kIG; /* a TOIG file from GCG for certain */ |
---|
1321 | else format= kGCG; |
---|
1322 | done= true; |
---|
1323 | } |
---|
1324 | |
---|
1325 | else if ((dnalines > 1) || done || (nlines > maxlines2check)) { |
---|
1326 | /* decide on most likely format */ |
---|
1327 | /* multichar idents: */ |
---|
1328 | if (foundStrider) format= kStrider; |
---|
1329 | else if (foundGB) format= kGenBank; |
---|
1330 | else if (foundPIR) format= kPIR; |
---|
1331 | else if (foundEMBL) format= kEMBL; |
---|
1332 | else if (foundNBRF) format= kNBRF; |
---|
1333 | /* single char idents: */ |
---|
1334 | else if (foundIG) format= kIG; |
---|
1335 | else if (foundPearson) format= kPearson; |
---|
1336 | else if (foundZuker) format= kZuker; |
---|
1337 | /* digit ident: */ |
---|
1338 | else if (foundPhylip) format= kPhylip; |
---|
1339 | /* spacing ident: */ |
---|
1340 | else if (foundFitch) format= kFitch; |
---|
1341 | /* no format chars: */ |
---|
1342 | else if (otherlines > 0) format= kUnknown; |
---|
1343 | else if (dnalines > 1) format= kPlain; |
---|
1344 | else if (aminolines > 1) format= kPlain; |
---|
1345 | else format= kUnknown; |
---|
1346 | |
---|
1347 | done= true; |
---|
1348 | } |
---|
1349 | |
---|
1350 | /* need this for possible long header in olsen format */ |
---|
1351 | else if (strstr(sp,"): ") != NULL) |
---|
1352 | maxlines2check++; |
---|
1353 | } |
---|
1354 | |
---|
1355 | if (format == kPhylip) { |
---|
1356 | /* check for interleaved or sequential -- really messy */ |
---|
1357 | int tname, tseq; |
---|
1358 | long i, j, nspp= 0, nlen= 0, ilen, leaf= 0, seq= 0; |
---|
1359 | char *ps; |
---|
1360 | |
---|
1361 | rewind(fseq); |
---|
1362 | for (i=0; i < *skiplines; i++) ReadOneLine(sp); |
---|
1363 | nlines= 0; |
---|
1364 | ReadOneLine(sp); |
---|
1365 | sscanf( sp, "%ld%ld", &nspp, &nlen); |
---|
1366 | ReadOneLine(sp); /* 1st seq line */ |
---|
1367 | for (ps= sp+10, ilen=0; *ps!=0; ps++) if (isprint(*ps)) ilen++; |
---|
1368 | |
---|
1369 | for (i= 1; i<nspp; i++) { |
---|
1370 | ReadOneLine(sp); |
---|
1371 | |
---|
1372 | tseq= getseqtype(sp+10, strlen(sp+10)); |
---|
1373 | tname= getseqtype(sp, 10); |
---|
1374 | for (j=0, ps= sp; isspace(*ps) && j<10; ps++, j++); |
---|
1375 | for (ps= sp; *ps!=0; ps++) if (isprint(*ps)) ilen++; |
---|
1376 | |
---|
1377 | /* find probable interleaf or sequential ... */ |
---|
1378 | if (j>=9) seq += 10; /* pretty certain not ileaf */ |
---|
1379 | else { |
---|
1380 | if (tseq != tname) leaf++; else seq++; |
---|
1381 | if (tname == kDNA || tname == kRNA) seq++; else leaf++; |
---|
1382 | } |
---|
1383 | |
---|
1384 | if (ilen <= nlen && j<9) { |
---|
1385 | if (tname == kOtherSeq) leaf += 10; |
---|
1386 | else if (tname == kAmino || tname == kDNA || tname == kRNA) seq++; else leaf++; |
---|
1387 | } |
---|
1388 | else if (ilen > nlen) { |
---|
1389 | ilen= 0; |
---|
1390 | } |
---|
1391 | } |
---|
1392 | for ( nspp *= 2 ; i<nspp; i++) { /* this should be only bases if interleaf */ |
---|
1393 | ReadOneLine(sp); |
---|
1394 | |
---|
1395 | tseq= getseqtype(sp+10, strlen(sp+10)); |
---|
1396 | tname= getseqtype(sp, 10); |
---|
1397 | for (ps= sp; *ps!=0; ps++) if (isprint(*ps)) ilen++; |
---|
1398 | for (j=0, ps= sp; isspace(*ps) && j<10; ps++, j++); |
---|
1399 | if (j<9) { |
---|
1400 | if (tname == kOtherSeq) seq += 10; |
---|
1401 | if (tseq != tname) seq++; else leaf++; |
---|
1402 | if (tname == kDNA || tname == kRNA) leaf++; else seq++; |
---|
1403 | } |
---|
1404 | if (ilen > nlen) { |
---|
1405 | if (j>9) leaf += 10; /* must be a name here for sequent */ |
---|
1406 | else if (tname == kOtherSeq) seq += 10; |
---|
1407 | ilen= 0; |
---|
1408 | } |
---|
1409 | } |
---|
1410 | |
---|
1411 | if (leaf > seq) format= kPhylip4; |
---|
1412 | else format= kPhylip2; |
---|
1413 | } |
---|
1414 | |
---|
1415 | return(format); |
---|
1416 | #undef ReadOneLine |
---|
1417 | } /* SeqFileFormat */ |
---|
1418 | |
---|
1419 | |
---|
1420 | |
---|
1421 | |
---|
1422 | unsigned long GCGchecksum( const char *seq, const long seqlen, unsigned long *checktotal) |
---|
1423 | /* GCGchecksum */ |
---|
1424 | { |
---|
1425 | long i, check = 0, count = 0; |
---|
1426 | |
---|
1427 | for (i = 0; i < seqlen; i++) { |
---|
1428 | count++; |
---|
1429 | check += count * to_upper(seq[i]); |
---|
1430 | if (count == 57) count = 0; |
---|
1431 | } |
---|
1432 | check %= 10000; |
---|
1433 | *checktotal += check; |
---|
1434 | *checktotal %= 10000; |
---|
1435 | return check; |
---|
1436 | } |
---|
1437 | |
---|
1438 | /* Table of CRC-32's of all single byte values (made by makecrc.c of ZIP source) */ |
---|
1439 | const unsigned long crctab[] = { |
---|
1440 | 0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L, |
---|
1441 | 0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L, |
---|
1442 | 0xe0d5e91eL, 0x97d2d988L, 0x09b64c2bL, 0x7eb17cbdL, 0xe7b82d07L, |
---|
1443 | 0x90bf1d91L, 0x1db71064L, 0x6ab020f2L, 0xf3b97148L, 0x84be41deL, |
---|
1444 | 0x1adad47dL, 0x6ddde4ebL, 0xf4d4b551L, 0x83d385c7L, 0x136c9856L, |
---|
1445 | 0x646ba8c0L, 0xfd62f97aL, 0x8a65c9ecL, 0x14015c4fL, 0x63066cd9L, |
---|
1446 | 0xfa0f3d63L, 0x8d080df5L, 0x3b6e20c8L, 0x4c69105eL, 0xd56041e4L, |
---|
1447 | 0xa2677172L, 0x3c03e4d1L, 0x4b04d447L, 0xd20d85fdL, 0xa50ab56bL, |
---|
1448 | 0x35b5a8faL, 0x42b2986cL, 0xdbbbc9d6L, 0xacbcf940L, 0x32d86ce3L, |
---|
1449 | 0x45df5c75L, 0xdcd60dcfL, 0xabd13d59L, 0x26d930acL, 0x51de003aL, |
---|
1450 | 0xc8d75180L, 0xbfd06116L, 0x21b4f4b5L, 0x56b3c423L, 0xcfba9599L, |
---|
1451 | 0xb8bda50fL, 0x2802b89eL, 0x5f058808L, 0xc60cd9b2L, 0xb10be924L, |
---|
1452 | 0x2f6f7c87L, 0x58684c11L, 0xc1611dabL, 0xb6662d3dL, 0x76dc4190L, |
---|
1453 | 0x01db7106L, 0x98d220bcL, 0xefd5102aL, 0x71b18589L, 0x06b6b51fL, |
---|
1454 | 0x9fbfe4a5L, 0xe8b8d433L, 0x7807c9a2L, 0x0f00f934L, 0x9609a88eL, |
---|
1455 | 0xe10e9818L, 0x7f6a0dbbL, 0x086d3d2dL, 0x91646c97L, 0xe6635c01L, |
---|
1456 | 0x6b6b51f4L, 0x1c6c6162L, 0x856530d8L, 0xf262004eL, 0x6c0695edL, |
---|
1457 | 0x1b01a57bL, 0x8208f4c1L, 0xf50fc457L, 0x65b0d9c6L, 0x12b7e950L, |
---|
1458 | 0x8bbeb8eaL, 0xfcb9887cL, 0x62dd1ddfL, 0x15da2d49L, 0x8cd37cf3L, |
---|
1459 | 0xfbd44c65L, 0x4db26158L, 0x3ab551ceL, 0xa3bc0074L, 0xd4bb30e2L, |
---|
1460 | 0x4adfa541L, 0x3dd895d7L, 0xa4d1c46dL, 0xd3d6f4fbL, 0x4369e96aL, |
---|
1461 | 0x346ed9fcL, 0xad678846L, 0xda60b8d0L, 0x44042d73L, 0x33031de5L, |
---|
1462 | 0xaa0a4c5fL, 0xdd0d7cc9L, 0x5005713cL, 0x270241aaL, 0xbe0b1010L, |
---|
1463 | 0xc90c2086L, 0x5768b525L, 0x206f85b3L, 0xb966d409L, 0xce61e49fL, |
---|
1464 | 0x5edef90eL, 0x29d9c998L, 0xb0d09822L, 0xc7d7a8b4L, 0x59b33d17L, |
---|
1465 | 0x2eb40d81L, 0xb7bd5c3bL, 0xc0ba6cadL, 0xedb88320L, 0x9abfb3b6L, |
---|
1466 | 0x03b6e20cL, 0x74b1d29aL, 0xead54739L, 0x9dd277afL, 0x04db2615L, |
---|
1467 | 0x73dc1683L, 0xe3630b12L, 0x94643b84L, 0x0d6d6a3eL, 0x7a6a5aa8L, |
---|
1468 | 0xe40ecf0bL, 0x9309ff9dL, 0x0a00ae27L, 0x7d079eb1L, 0xf00f9344L, |
---|
1469 | 0x8708a3d2L, 0x1e01f268L, 0x6906c2feL, 0xf762575dL, 0x806567cbL, |
---|
1470 | 0x196c3671L, 0x6e6b06e7L, 0xfed41b76L, 0x89d32be0L, 0x10da7a5aL, |
---|
1471 | 0x67dd4accL, 0xf9b9df6fL, 0x8ebeeff9L, 0x17b7be43L, 0x60b08ed5L, |
---|
1472 | 0xd6d6a3e8L, 0xa1d1937eL, 0x38d8c2c4L, 0x4fdff252L, 0xd1bb67f1L, |
---|
1473 | 0xa6bc5767L, 0x3fb506ddL, 0x48b2364bL, 0xd80d2bdaL, 0xaf0a1b4cL, |
---|
1474 | 0x36034af6L, 0x41047a60L, 0xdf60efc3L, 0xa867df55L, 0x316e8eefL, |
---|
1475 | 0x4669be79L, 0xcb61b38cL, 0xbc66831aL, 0x256fd2a0L, 0x5268e236L, |
---|
1476 | 0xcc0c7795L, 0xbb0b4703L, 0x220216b9L, 0x5505262fL, 0xc5ba3bbeL, |
---|
1477 | 0xb2bd0b28L, 0x2bb45a92L, 0x5cb36a04L, 0xc2d7ffa7L, 0xb5d0cf31L, |
---|
1478 | 0x2cd99e8bL, 0x5bdeae1dL, 0x9b64c2b0L, 0xec63f226L, 0x756aa39cL, |
---|
1479 | 0x026d930aL, 0x9c0906a9L, 0xeb0e363fL, 0x72076785L, 0x05005713L, |
---|
1480 | 0x95bf4a82L, 0xe2b87a14L, 0x7bb12baeL, 0x0cb61b38L, 0x92d28e9bL, |
---|
1481 | 0xe5d5be0dL, 0x7cdcefb7L, 0x0bdbdf21L, 0x86d3d2d4L, 0xf1d4e242L, |
---|
1482 | 0x68ddb3f8L, 0x1fda836eL, 0x81be16cdL, 0xf6b9265bL, 0x6fb077e1L, |
---|
1483 | 0x18b74777L, 0x88085ae6L, 0xff0f6a70L, 0x66063bcaL, 0x11010b5cL, |
---|
1484 | 0x8f659effL, 0xf862ae69L, 0x616bffd3L, 0x166ccf45L, 0xa00ae278L, |
---|
1485 | 0xd70dd2eeL, 0x4e048354L, 0x3903b3c2L, 0xa7672661L, 0xd06016f7L, |
---|
1486 | 0x4969474dL, 0x3e6e77dbL, 0xaed16a4aL, 0xd9d65adcL, 0x40df0b66L, |
---|
1487 | 0x37d83bf0L, 0xa9bcae53L, 0xdebb9ec5L, 0x47b2cf7fL, 0x30b5ffe9L, |
---|
1488 | 0xbdbdf21cL, 0xcabac28aL, 0x53b39330L, 0x24b4a3a6L, 0xbad03605L, |
---|
1489 | 0xcdd70693L, 0x54de5729L, 0x23d967bfL, 0xb3667a2eL, 0xc4614ab8L, |
---|
1490 | 0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL, |
---|
1491 | 0x2d02ef8dL |
---|
1492 | }; |
---|
1493 | |
---|
1494 | unsigned long CRC32checksum(const char *seq, const long seqlen, unsigned long *checktotal) |
---|
1495 | /*CRC32checksum: modified from CRC-32 algorithm found in ZIP compression source */ |
---|
1496 | { |
---|
1497 | unsigned long c = 0xffffffffL; |
---|
1498 | long n = seqlen; |
---|
1499 | |
---|
1500 | if (!seq) return 0; |
---|
1501 | while (n--) { |
---|
1502 | c = crctab[((int)c ^ (to_upper(*seq))) & 0xff] ^ (c >> 8); |
---|
1503 | seq++; /* fixed aug'98 finally */ |
---|
1504 | } |
---|
1505 | c= c ^ 0xffffffffL; |
---|
1506 | *checktotal += c; |
---|
1507 | return c; |
---|
1508 | } |
---|
1509 | |
---|
1510 | |
---|
1511 | |
---|
1512 | |
---|
1513 | short getseqtype( const char *seq, const long seqlen) |
---|
1514 | { /* return sequence kind: kDNA, kRNA, kProtein, kOtherSeq, ??? */ |
---|
1515 | char c; |
---|
1516 | short i, maxtest; |
---|
1517 | short na = 0, aa = 0, po = 0, nt = 0, nu = 0, ns = 0, no = 0; |
---|
1518 | |
---|
1519 | maxtest = min(300, seqlen); |
---|
1520 | for (i = 0; i < maxtest; i++) { |
---|
1521 | c = to_upper(seq[i]); |
---|
1522 | if (strchr(protonly, c)) po++; |
---|
1523 | else if (strchr(primenuc,c)) { |
---|
1524 | na++; |
---|
1525 | if (c == 'T') nt++; |
---|
1526 | else if (c == 'U') nu++; |
---|
1527 | } |
---|
1528 | else if (strchr(aminos,c)) aa++; |
---|
1529 | else if (strchr(seqsymbols,c)) ns++; |
---|
1530 | else if (isalpha(c)) no++; |
---|
1531 | } |
---|
1532 | |
---|
1533 | if ((no > 0) || (po+aa+na == 0)) return kOtherSeq; |
---|
1534 | /* ?? test for probability of kOtherSeq ?, e.g., |
---|
1535 | else if (po+aa+na / maxtest < 0.70) return kOtherSeq; |
---|
1536 | */ |
---|
1537 | else if (po > 0) return kAmino; |
---|
1538 | else if (aa == 0) { |
---|
1539 | if (nu > nt) return kRNA; |
---|
1540 | else return kDNA; |
---|
1541 | } |
---|
1542 | else if (na > aa) return kNucleic; |
---|
1543 | else return kAmino; |
---|
1544 | } /* getseqtype */ |
---|
1545 | |
---|
1546 | |
---|
1547 | char* compressSeq( const char gapc, const char *seq, const long seqlen, long *newlen) |
---|
1548 | { |
---|
1549 | char *a, *b; |
---|
1550 | long i; |
---|
1551 | char *newseq; |
---|
1552 | |
---|
1553 | *newlen= 0; |
---|
1554 | if (!seq) return NULL; |
---|
1555 | newseq = (char*) malloc(seqlen+1); |
---|
1556 | if (!newseq) return NULL; |
---|
1557 | for (a= (char*)seq, b=newseq, i=0; *a!=0; a++) |
---|
1558 | if (*a != gapc) { |
---|
1559 | *b++= *a; |
---|
1560 | i++; |
---|
1561 | } |
---|
1562 | *b= '\0'; |
---|
1563 | newseq = (char*) realloc(newseq, i+1); |
---|
1564 | *newlen= i; |
---|
1565 | return newseq; |
---|
1566 | } |
---|
1567 | |
---|
1568 | |
---|
1569 | |
---|
1570 | /*** |
---|
1571 | char *rtfhead = "{\\rtf1\\defformat\\mac\\deff2 \ |
---|
1572 | {\\fonttbl\ |
---|
1573 | {\\f1\\fmodern Courier;}{\\f2\\fmodern Monaco;}\ |
---|
1574 | {\\f3\\fswiss Helvetica;}{\\f4\\fswiss Geneva;}\ |
---|
1575 | {\\f5\\froman Times;}{\\f6\\froman Palatino;}\ |
---|
1576 | {\\f7\\froman New Century Schlbk;}{\\f8\\ftech Symbol;}}\ |
---|
1577 | {\\stylesheet\ |
---|
1578 | {\\s1 \\f5\\fs20 \\sbasedon0\\snext1 name;}\ |
---|
1579 | {\\s2 \\f3\\fs20 \\sbasedon0\\snext2 num;}\ |
---|
1580 | {\\s3 \\f1\\f21 \\sbasedon0\\snext3 seq;}}"; |
---|
1581 | |
---|
1582 | char *rtftail = "}"; |
---|
1583 | ****/ |
---|
1584 | |
---|
1585 | short writeSeq(FILE *outf, const char *seq, const long seqlen, |
---|
1586 | const short outform, const char *seqid) |
---|
1587 | /* dump sequence to standard output */ |
---|
1588 | { |
---|
1589 | const short kSpaceAll = -9; |
---|
1590 | #define kMaxseqwidth 250 |
---|
1591 | |
---|
1592 | boolean baseonlynum= false; /* nocountsymbols -- only count true bases, not "-" */ |
---|
1593 | short numline = 0; /* only true if we are writing seq number line (for interleave) */ |
---|
1594 | boolean numright = false, numleft = false; |
---|
1595 | boolean nameright = false, nameleft = false; |
---|
1596 | short namewidth = 8, numwidth = 8; |
---|
1597 | short spacer = 0, width = 50, tab = 0; |
---|
1598 | /* new parameters: width, spacer, those above... */ |
---|
1599 | |
---|
1600 | short linesout = 0, seqtype = kNucleic; |
---|
1601 | long i, j, l, l1, ibase; |
---|
1602 | char idword[31], endstr[10]; |
---|
1603 | char seqnamestore[128], *seqname = seqnamestore; |
---|
1604 | char s[kMaxseqwidth], *cp; |
---|
1605 | char nameform[10], numform[10], nocountsymbols[10]; |
---|
1606 | unsigned long checksum = 0, checktotal = 0; |
---|
1607 | |
---|
1608 | gPretty.atseq++; |
---|
1609 | skipwhitespace(seqid); |
---|
1610 | l = min(128, strlen(seqid)); |
---|
1611 | strncpy( seqnamestore, seqid, l); |
---|
1612 | seqname[l] = 0; |
---|
1613 | |
---|
1614 | sscanf( seqname, "%30s", idword); |
---|
1615 | sprintf(numform, "%ld", seqlen); |
---|
1616 | numwidth= strlen(numform)+1; |
---|
1617 | nameform[0]= '\0'; |
---|
1618 | |
---|
1619 | if (strstr(seqname,"checksum") != NULL) { |
---|
1620 | cp = strstr(seqname,"bases"); |
---|
1621 | if (cp!=NULL) { |
---|
1622 | for ( ; (cp!=seqname) && (*cp!=','); cp--) ; |
---|
1623 | if (cp!=seqname) *cp=0; |
---|
1624 | } |
---|
1625 | } |
---|
1626 | |
---|
1627 | strcpy( endstr,""); |
---|
1628 | l1 = 0; |
---|
1629 | |
---|
1630 | if (outform == kGCG || outform == kMSF) |
---|
1631 | checksum = GCGchecksum(seq, seqlen, &checktotal); |
---|
1632 | else |
---|
1633 | checksum = seqchecksum(seq, seqlen, &checktotal); |
---|
1634 | |
---|
1635 | switch (outform) { |
---|
1636 | |
---|
1637 | case kPlain: |
---|
1638 | case kUnknown: /* no header, just sequence */ |
---|
1639 | strcpy(endstr,"\n"); /* end w/ extra blank line */ |
---|
1640 | break; |
---|
1641 | |
---|
1642 | case kOlsen: /* Olsen seq. editor takes plain nucs OR Genbank */ |
---|
1643 | case kGenBank: |
---|
1644 | fprintf(outf,"LOCUS %s %ld bp\n", idword, seqlen); |
---|
1645 | fprintf(outf,"DEFINITION %s, %ld bases, %lX checksum.\n", seqname, seqlen, checksum); |
---|
1646 | /* fprintf(outf,"ACCESSION %s\n", accnum); */ |
---|
1647 | fprintf(outf,"ORIGIN \n"); |
---|
1648 | spacer = 11; |
---|
1649 | numleft = true; |
---|
1650 | numwidth = 8; /* dgg. 1Feb93, patch for GDE fail to read short numwidth */ |
---|
1651 | strcpy(endstr, "\n//"); |
---|
1652 | linesout += 4; |
---|
1653 | break; |
---|
1654 | |
---|
1655 | case kPIR: |
---|
1656 | /* somewhat like genbank... \\\*/ |
---|
1657 | /* fprintf(outf,"\\\\\\\n"); << only at top of file, not each entry... */ |
---|
1658 | fprintf(outf,"ENTRY %s \n", idword); |
---|
1659 | fprintf(outf,"TITLE %s, %ld bases, %lX checksum.\n", seqname, seqlen, checksum); |
---|
1660 | /* fprintf(outf,"ACCESSION %s\n", accnum); */ |
---|
1661 | fprintf(outf,"SEQUENCE \n"); |
---|
1662 | numwidth = 7; |
---|
1663 | width= 30; |
---|
1664 | spacer = kSpaceAll; |
---|
1665 | numleft = true; |
---|
1666 | strcpy(endstr, "\n///"); |
---|
1667 | /* run a top number line for PIR */ |
---|
1668 | for (j=0; j<numwidth; j++) fputc(' ',outf); |
---|
1669 | for (j= 5; j<=width; j += 5) fprintf(outf,"%10ld",j); |
---|
1670 | fputc('\n',outf); |
---|
1671 | linesout += 5; |
---|
1672 | break; |
---|
1673 | |
---|
1674 | case kNBRF: |
---|
1675 | if (getseqtype(seq, seqlen) == kAmino) |
---|
1676 | fprintf(outf,">P1;%s\n", idword); |
---|
1677 | else |
---|
1678 | fprintf(outf,">DL;%s\n", idword); |
---|
1679 | fprintf(outf,"%s, %ld bases, %lX checksum.\n", seqname, seqlen, checksum); |
---|
1680 | spacer = 11; |
---|
1681 | strcpy(endstr,"*\n"); |
---|
1682 | linesout += 3; |
---|
1683 | break; |
---|
1684 | |
---|
1685 | case kEMBL: |
---|
1686 | fprintf(outf,"ID %s\n", idword); |
---|
1687 | /* fprintf(outf,"AC %s\n", accnum); */ |
---|
1688 | fprintf(outf,"DE %s, %ld bases, %lX checksum.\n", seqname, seqlen, checksum); |
---|
1689 | fprintf(outf,"SQ %ld BP\n", seqlen); |
---|
1690 | strcpy(endstr, "\n//"); /* 11Oct90: bug fix*/ |
---|
1691 | tab = 4; /** added 31jan91 */ |
---|
1692 | spacer = 11; /** added 31jan91 */ |
---|
1693 | width = 60; |
---|
1694 | linesout += 4; |
---|
1695 | break; |
---|
1696 | |
---|
1697 | case kGCG: |
---|
1698 | fprintf(outf,"%s\n", seqname); |
---|
1699 | /* fprintf(outf,"ACCESSION %s\n", accnum); */ |
---|
1700 | fprintf(outf," %s Length: %ld (today) Check: %ld ..\n", idword, seqlen, checksum); |
---|
1701 | spacer = 11; |
---|
1702 | numleft = true; |
---|
1703 | strcpy(endstr, "\n"); /* this is insurance to help prevent misreads at eof */ |
---|
1704 | linesout += 3; |
---|
1705 | break; |
---|
1706 | |
---|
1707 | case kStrider: /* ?? map ?*/ |
---|
1708 | fprintf(outf,"; ### from DNA Strider ;-)\n"); |
---|
1709 | fprintf(outf,"; DNA sequence %s, %ld bases, %lX checksum.\n;\n", seqname, seqlen, checksum); |
---|
1710 | strcpy(endstr, "\n//"); |
---|
1711 | linesout += 3; |
---|
1712 | break; |
---|
1713 | |
---|
1714 | case kFitch: |
---|
1715 | fprintf(outf,"%s, %ld bases, %lX checksum.\n", seqname, seqlen, checksum); |
---|
1716 | spacer = 4; |
---|
1717 | width = 60; |
---|
1718 | linesout += 1; |
---|
1719 | break; |
---|
1720 | |
---|
1721 | case kPhylip2: |
---|
1722 | case kPhylip4: |
---|
1723 | /* this is version 3.2/3.4 -- simplest way to write |
---|
1724 | version 3.3 is to write as version 3.2, then |
---|
1725 | re-read file and interleave the species lines */ |
---|
1726 | if (strlen(idword)>10) idword[10] = 0; |
---|
1727 | fprintf(outf,"%-10s ",idword); |
---|
1728 | l1 = -1; |
---|
1729 | tab = 12; |
---|
1730 | spacer = 11; |
---|
1731 | break; |
---|
1732 | |
---|
1733 | case kASN1: |
---|
1734 | seqtype= getseqtype(seq, seqlen); |
---|
1735 | switch (seqtype) { |
---|
1736 | case kDNA : cp= (char*)"dna"; break; |
---|
1737 | case kRNA : cp= (char*)"rna"; break; |
---|
1738 | case kNucleic : cp= (char*)"na"; break; |
---|
1739 | case kAmino : cp= (char*)"aa"; break; |
---|
1740 | case kOtherSeq: cp= (char*)"not-set"; break; |
---|
1741 | } |
---|
1742 | fprintf(outf," seq {\n"); |
---|
1743 | fprintf(outf," id { local id %d },\n", gPretty.atseq); |
---|
1744 | fprintf(outf," descr { title \"%s\" },\n", seqid); |
---|
1745 | fprintf(outf," inst {\n"); |
---|
1746 | fprintf(outf," repr raw, mol %s, length %ld, topology linear,\n", cp, seqlen); |
---|
1747 | fprintf(outf," seq-data\n"); |
---|
1748 | if (seqtype == kAmino) |
---|
1749 | fprintf(outf," iupacaa \""); |
---|
1750 | else |
---|
1751 | fprintf(outf," iupacna \""); |
---|
1752 | l1 = 17; |
---|
1753 | spacer = 0; |
---|
1754 | width = 78; |
---|
1755 | tab = 0; |
---|
1756 | strcpy(endstr,"\"\n } } ,"); |
---|
1757 | linesout += 7; |
---|
1758 | break; |
---|
1759 | |
---|
1760 | case kPAUP: |
---|
1761 | nameleft= true; |
---|
1762 | namewidth = 9; |
---|
1763 | spacer = 21; |
---|
1764 | width = 100; |
---|
1765 | tab = 0; /* 1; */ |
---|
1766 | /* strcpy(endstr,";\nend;"); << this is end of all seqs.. */ |
---|
1767 | /* do a header comment line for paup */ |
---|
1768 | fprintf(outf,"[Name: %-16s Len:%6ld Check: %8lX]\n", idword, seqlen, checksum); |
---|
1769 | linesout += 1; |
---|
1770 | break; |
---|
1771 | |
---|
1772 | case kPretty: |
---|
1773 | numline= gPretty.numline; |
---|
1774 | baseonlynum= gPretty.baseonlynum; |
---|
1775 | namewidth = gPretty.namewidth; |
---|
1776 | numright = gPretty.numright; |
---|
1777 | numleft = gPretty.numleft; |
---|
1778 | nameright = gPretty.nameright; |
---|
1779 | nameleft = gPretty.nameleft; |
---|
1780 | spacer = gPretty.spacer + 1; |
---|
1781 | width = gPretty.seqwidth; |
---|
1782 | tab = gPretty.tab; |
---|
1783 | /* also add rtf formatting w/ font, size, style */ |
---|
1784 | if (gPretty.nametop) { |
---|
1785 | fprintf(outf,"Name: %-16s Len:%6ld Check: %8lX\n", idword, seqlen, checksum); |
---|
1786 | linesout++; |
---|
1787 | } |
---|
1788 | break; |
---|
1789 | |
---|
1790 | case kMSF: |
---|
1791 | fprintf(outf," Name: %-16s Len:%6ld Check: %5ld Weight: 1.00\n", |
---|
1792 | idword, seqlen, checksum); |
---|
1793 | linesout++; |
---|
1794 | nameleft= true; |
---|
1795 | namewidth= 15; /* need MAX namewidth here... */ |
---|
1796 | sprintf(nameform, "%%+%ds ",namewidth); |
---|
1797 | spacer = 11; |
---|
1798 | width = 50; |
---|
1799 | tab = 0; /* 1; */ |
---|
1800 | break; |
---|
1801 | |
---|
1802 | case kIG: |
---|
1803 | fprintf(outf,";%s, %ld bases, %lX checksum.\n", seqname, seqlen, checksum); |
---|
1804 | fprintf(outf,"%s\n", idword); |
---|
1805 | strcpy(endstr,"1"); /* == linear dna */ |
---|
1806 | linesout += 2; |
---|
1807 | break; |
---|
1808 | |
---|
1809 | default : |
---|
1810 | case kZuker: /* don't attempt Zuker's ftn format */ |
---|
1811 | case kPearson: |
---|
1812 | fprintf(outf,">%s, %ld bases, %lX checksum.\n", seqname, seqlen, checksum); |
---|
1813 | linesout += 1; |
---|
1814 | break; |
---|
1815 | } |
---|
1816 | |
---|
1817 | if (*nameform==0) sprintf(nameform, "%%%d.%ds ",namewidth,namewidth); |
---|
1818 | if (numline) sprintf(numform, "%%%ds ",numwidth); |
---|
1819 | else sprintf(numform, "%%%dd ",numwidth); |
---|
1820 | strcpy( nocountsymbols, kNocountsymbols); |
---|
1821 | if (baseonlynum) { |
---|
1822 | if (strchr(nocountsymbols,gPretty.gapchar)==NULL) { |
---|
1823 | strcat(nocountsymbols," "); |
---|
1824 | nocountsymbols[strlen(nocountsymbols)-1]= gPretty.gapchar; |
---|
1825 | } |
---|
1826 | if (gPretty.domatch && (cp=strchr(nocountsymbols,gPretty.matchchar))!=NULL) { |
---|
1827 | *cp= ' '; |
---|
1828 | } |
---|
1829 | } |
---|
1830 | |
---|
1831 | if (numline) { |
---|
1832 | *idword= 0; |
---|
1833 | } |
---|
1834 | |
---|
1835 | width = min(width,kMaxseqwidth); |
---|
1836 | for (i=0, l=0, ibase = 1; i < seqlen; ) { |
---|
1837 | |
---|
1838 | if (l1 < 0) l1 = 0; |
---|
1839 | else if (l1 == 0) { |
---|
1840 | if (nameleft) fprintf(outf, nameform, idword); |
---|
1841 | if (numleft) { if (numline) fprintf(outf, numform, ""); |
---|
1842 | else fprintf(outf, numform, ibase);} |
---|
1843 | for (j=0; j<tab; j++) fputc(' ',outf); |
---|
1844 | } |
---|
1845 | |
---|
1846 | l1++; /* don't count spaces for width*/ |
---|
1847 | if (numline) { |
---|
1848 | if (spacer==kSpaceAll || (spacer != 0 && (l+1) % spacer == 1)) { |
---|
1849 | if (numline==1) fputc(' ',outf); |
---|
1850 | s[l++] = ' '; |
---|
1851 | } |
---|
1852 | if (l1 % 10 == 1 || l1 == width) { |
---|
1853 | if (numline==1) fprintf(outf,"%-9ld ",i+1); |
---|
1854 | s[l++]= '|'; /* == put a number here */ |
---|
1855 | } |
---|
1856 | else s[l++]= ' '; |
---|
1857 | i++; |
---|
1858 | } |
---|
1859 | |
---|
1860 | else { |
---|
1861 | if (spacer==kSpaceAll || (spacer != 0 && (l+1) % spacer == 1)) |
---|
1862 | s[l++] = ' '; |
---|
1863 | if (!baseonlynum) ibase++; |
---|
1864 | else if (0==strchr(nocountsymbols,seq[i])) ibase++; |
---|
1865 | s[l++] = seq[i++]; |
---|
1866 | } |
---|
1867 | |
---|
1868 | if (l1 == width || i == seqlen) { |
---|
1869 | if (outform==kPretty) for ( ; l1<width; l1++) { |
---|
1870 | if (spacer==kSpaceAll || (spacer != 0 && (l+1) % spacer == 1)) |
---|
1871 | s[l++] = ' '; |
---|
1872 | s[l++]=' '; /* pad w/ blanks */ |
---|
1873 | } |
---|
1874 | s[l] = '\0'; |
---|
1875 | l = 0; l1 = 0; |
---|
1876 | |
---|
1877 | if (numline) { |
---|
1878 | if (numline==2) fprintf(outf,"%s",s); /* finish numberline ! and | */ |
---|
1879 | } |
---|
1880 | else { |
---|
1881 | if (i == seqlen) fprintf(outf,"%s%s",s,endstr); |
---|
1882 | else fprintf(outf,"%s",s); |
---|
1883 | if (numright || nameright) fputc(' ',outf); |
---|
1884 | if (numright) fprintf(outf,numform, ibase-1); |
---|
1885 | if (nameright) fprintf(outf, nameform,idword); |
---|
1886 | } |
---|
1887 | fputc('\n',outf); |
---|
1888 | linesout++; |
---|
1889 | } |
---|
1890 | } |
---|
1891 | return linesout; |
---|
1892 | } /*writeSeq*/ |
---|
1893 | |
---|
1894 | |
---|
1895 | |
---|
1896 | /* End file: ureadseq.c */ |
---|