1 | /* File: ureadseq.c |
---|
2 | * |
---|
3 | * Reads and writes nucleic/protein sequence in various |
---|
4 | * formats. Data files may have multiple sequences. |
---|
5 | * |
---|
6 | * Copyright 1990 by d.g.gilbert |
---|
7 | * biology dept., indiana university, bloomington, in 47405 |
---|
8 | * e-mail: gilbertd@bio.indiana.edu |
---|
9 | * |
---|
10 | * This program may be freely copied and used by anyone. |
---|
11 | * Developers are encourged to incorporate parts in their |
---|
12 | * programs, rather than devise their own private sequence |
---|
13 | * format. |
---|
14 | * |
---|
15 | * This should compile and run with any ANSI C compiler. |
---|
16 | * |
---|
17 | * ------------------------------------------------------------ |
---|
18 | * |
---|
19 | * Some slight modifications were done by ARB developers |
---|
20 | * |
---|
21 | */ |
---|
22 | |
---|
23 | |
---|
24 | #include <stdio.h> |
---|
25 | #include <stdlib.h> |
---|
26 | #include <string.h> |
---|
27 | #define __NO_CTYPE |
---|
28 | #include <ctype.h> |
---|
29 | |
---|
30 | #include "ureadseq.h" |
---|
31 | prettyopts gPretty; |
---|
32 | |
---|
33 | |
---|
34 | int Strcasecmp(const char *a, const char *b) /* from Nlm_StrICmp */ |
---|
35 | { |
---|
36 | int diff, done; |
---|
37 | if (a == b) return 0; |
---|
38 | done = 0; |
---|
39 | while (! done) { |
---|
40 | diff = to_upper(*a) - to_upper(*b); |
---|
41 | if (diff) return diff; |
---|
42 | if (*a == '\0') done = 1; |
---|
43 | else { a++; b++; } |
---|
44 | } |
---|
45 | return 0; |
---|
46 | } |
---|
47 | |
---|
48 | int Strncasecmp(const char *a, const char *b, long maxn) /* from Nlm_StrNICmp */ |
---|
49 | { |
---|
50 | int diff, done; |
---|
51 | if (a == b) return 0; |
---|
52 | done = 0; |
---|
53 | while (! done) { |
---|
54 | diff = to_upper(*a) - to_upper(*b); |
---|
55 | if (diff) return diff; |
---|
56 | if (*a == '\0') done = 1; |
---|
57 | else { |
---|
58 | a++; b++; maxn--; |
---|
59 | if (! maxn) done = 1; |
---|
60 | } |
---|
61 | } |
---|
62 | return 0; |
---|
63 | } |
---|
64 | |
---|
65 | |
---|
66 | |
---|
67 | |
---|
68 | |
---|
69 | #ifndef Local |
---|
70 | # define Local static /* local functions */ |
---|
71 | #endif |
---|
72 | |
---|
73 | #define kStartLength 500000 /* 20Apr93 temp. bug fix */ |
---|
74 | |
---|
75 | const char *aminos = "ABCDEFGHIKLMNPQRSTVWXYZ*"; |
---|
76 | const char *primenuc = "ACGTU"; |
---|
77 | const char *protonly = "EFIPQZ"; |
---|
78 | |
---|
79 | const char kNocountsymbols[5] = "_.-?"; |
---|
80 | const char stdsymbols[6] = "_.-*?"; |
---|
81 | const char allsymbols[32] = "_.-*?<>{}[]()!@#$%^&=+;:'/|`~\"\\"; |
---|
82 | static const char *seqsymbols = allsymbols; |
---|
83 | |
---|
84 | const char nummask[11] = "0123456789"; |
---|
85 | const char nonummask[11] = "~!@#$%^&*("; |
---|
86 | |
---|
87 | /* |
---|
88 | use general form of isseqchar -- all chars + symbols. |
---|
89 | no formats except nbrf (?) use symbols in data area as |
---|
90 | anything other than sequence chars. |
---|
91 | */ |
---|
92 | |
---|
93 | |
---|
94 | |
---|
95 | /* Local variables for readSeq: */ |
---|
96 | struct ReadSeqVars { |
---|
97 | short choice, err, nseq; |
---|
98 | long seqlen, maxseq, seqlencount; |
---|
99 | short topnseq; |
---|
100 | long topseqlen; |
---|
101 | const char *fname; |
---|
102 | char *seq, *seqid, matchchar; |
---|
103 | boolean allDone, done, filestart, addit; |
---|
104 | FILE *f; |
---|
105 | long linestart; |
---|
106 | char s[256], *sp; |
---|
107 | |
---|
108 | int (*isseqcharfirst8)(); /* Patch by o. strunk (ARB) to allow numbers in genbank sequences*/ |
---|
109 | int (*isseqchar)(); |
---|
110 | /* int (*isseqchar)(int c); << sgi cc hates (int c) */ |
---|
111 | }; |
---|
112 | |
---|
113 | |
---|
114 | |
---|
115 | int isSeqChar(int c) |
---|
116 | { |
---|
117 | return (isalpha(c) || strchr(seqsymbols,c)); |
---|
118 | } |
---|
119 | |
---|
120 | int isSeqNumChar(int c) |
---|
121 | { |
---|
122 | return (isalnum(c) || strchr(seqsymbols,c)); |
---|
123 | } |
---|
124 | |
---|
125 | |
---|
126 | int isAnyChar(int c) |
---|
127 | { |
---|
128 | return isascii(c); /* wrap in case isascii is macro */ |
---|
129 | } |
---|
130 | |
---|
131 | Local void readline(FILE *f, char *s, long *linestart) |
---|
132 | { |
---|
133 | char *cp; |
---|
134 | |
---|
135 | *linestart= ftell(f); |
---|
136 | if (NULL == fgets(s, 256, f)) |
---|
137 | *s = 0; |
---|
138 | else { |
---|
139 | cp = strchr(s, '\n'); |
---|
140 | if (cp != NULL) *cp = 0; |
---|
141 | } |
---|
142 | } |
---|
143 | |
---|
144 | Local void GetLine(struct ReadSeqVars *V) |
---|
145 | { |
---|
146 | readline(V->f, V->s, &V->linestart); |
---|
147 | } |
---|
148 | |
---|
149 | Local void unGetLine(struct ReadSeqVars *V) |
---|
150 | { |
---|
151 | fseek(V->f, V->linestart, 0); |
---|
152 | } |
---|
153 | |
---|
154 | |
---|
155 | Local void addseq(char *s, struct ReadSeqVars * V) |
---|
156 | { |
---|
157 | char *ptr; |
---|
158 | int count = 0; |
---|
159 | if (V->addit) { |
---|
160 | for (;*s != 0;s++,count++) { |
---|
161 | if (count < 9 && V->isseqcharfirst8) { |
---|
162 | if (!(V->isseqcharfirst8) (*s)) continue; |
---|
163 | } |
---|
164 | else { |
---|
165 | if (!(V->isseqchar) (*s)) continue; |
---|
166 | } |
---|
167 | if (V->seqlen >= V->maxseq) { |
---|
168 | V->maxseq += kStartLength; |
---|
169 | ptr = (char *) realloc(V->seq, V->maxseq + 1); |
---|
170 | if (ptr == NULL) { |
---|
171 | V->err = eMemFull; |
---|
172 | return; |
---|
173 | } else |
---|
174 | V->seq = ptr; |
---|
175 | } |
---|
176 | V->seq[(V->seqlen)++] = *s; |
---|
177 | } |
---|
178 | } |
---|
179 | } |
---|
180 | |
---|
181 | Local void countseq(char *s, struct ReadSeqVars *V) |
---|
182 | /* this must count all valid seq chars, for some formats (paup-sequential) even |
---|
183 | if we are skipping seq... */ |
---|
184 | { |
---|
185 | while (*s != 0) { |
---|
186 | if ((V->isseqchar)(*s)) { |
---|
187 | (V->seqlencount)++; |
---|
188 | } |
---|
189 | s++; |
---|
190 | } |
---|
191 | } |
---|
192 | |
---|
193 | |
---|
194 | Local void addinfo(char *s, struct ReadSeqVars *V) |
---|
195 | { |
---|
196 | char s2[256], *si; |
---|
197 | boolean saveadd; |
---|
198 | |
---|
199 | si = s2; |
---|
200 | while (*s == ' ') s++; |
---|
201 | sprintf(si, " %d) %s\n", V->nseq, s); |
---|
202 | |
---|
203 | saveadd = V->addit; |
---|
204 | V->addit = true; |
---|
205 | V->isseqchar = isAnyChar; |
---|
206 | addseq( si, V); |
---|
207 | V->addit = saveadd; |
---|
208 | V->isseqchar = isSeqChar; |
---|
209 | } |
---|
210 | |
---|
211 | |
---|
212 | |
---|
213 | |
---|
214 | Local void readLoop(short margin, boolean addfirst, |
---|
215 | boolean (*endTest)(boolean *addend, boolean *ungetend, struct ReadSeqVars *V), |
---|
216 | struct ReadSeqVars *V) |
---|
217 | { |
---|
218 | boolean addend = false; |
---|
219 | boolean ungetend = false; |
---|
220 | |
---|
221 | V->nseq++; |
---|
222 | if (V->choice == kListSequences) V->addit = false; |
---|
223 | else V->addit = (V->nseq == V->choice); |
---|
224 | if (V->addit) V->seqlen = 0; |
---|
225 | |
---|
226 | if (addfirst) addseq(V->s, V); |
---|
227 | do { |
---|
228 | GetLine(V); |
---|
229 | V->done = feof(V->f); |
---|
230 | V->done |= (*endTest)( &addend, &ungetend, V); |
---|
231 | if (V->addit && (addend || !V->done) && (strlen(V->s) > (unsigned)margin)) { |
---|
232 | addseq( (V->s)+margin, V); |
---|
233 | } |
---|
234 | } while (!V->done); |
---|
235 | |
---|
236 | if (V->choice == kListSequences) addinfo(V->seqid, V); |
---|
237 | else { |
---|
238 | V->allDone = (V->nseq >= V->choice); |
---|
239 | if (V->allDone && ungetend) unGetLine(V); |
---|
240 | } |
---|
241 | } |
---|
242 | |
---|
243 | |
---|
244 | |
---|
245 | Local boolean endIG( boolean *addend, boolean *ungetend, struct ReadSeqVars *V) |
---|
246 | { |
---|
247 | *addend = true; /* 1 or 2 occur in line w/ bases */ |
---|
248 | *ungetend= false; |
---|
249 | return((strchr(V->s,'1')!=NULL) || (strchr(V->s,'2')!=NULL)); |
---|
250 | } |
---|
251 | |
---|
252 | Local void readIG(struct ReadSeqVars *V) |
---|
253 | { |
---|
254 | /* 18Aug92: new IG format -- ^L between sequences in place of ";" */ |
---|
255 | char *si; |
---|
256 | |
---|
257 | while (!V->allDone) { |
---|
258 | do { |
---|
259 | GetLine(V); |
---|
260 | for (si= V->s; *si != 0 && *si < ' '; si++) *si= ' '; /* drop controls */ |
---|
261 | if (*si == 0) *V->s= 0; /* chop line to empty */ |
---|
262 | } while (! (feof(V->f) || ((*V->s != 0) && (*V->s != ';') ) )); |
---|
263 | if (feof(V->f)) |
---|
264 | V->allDone = true; |
---|
265 | else { |
---|
266 | strcpy(V->seqid, V->s); |
---|
267 | readLoop(0, false, endIG, V); |
---|
268 | } |
---|
269 | } |
---|
270 | } |
---|
271 | |
---|
272 | |
---|
273 | |
---|
274 | Local boolean endStrider( boolean *addend, boolean *ungetend, struct ReadSeqVars *V) |
---|
275 | { |
---|
276 | *addend = false; |
---|
277 | *ungetend= false; |
---|
278 | return (strstr( V->s, "//") != NULL); |
---|
279 | } |
---|
280 | |
---|
281 | Local void readStrider(struct ReadSeqVars *V) |
---|
282 | { /* ? only 1 seq/file ? */ |
---|
283 | |
---|
284 | while (!V->allDone) { |
---|
285 | GetLine(V); |
---|
286 | if (strstr(V->s,"; DNA sequence ") == V->s) |
---|
287 | strcpy(V->seqid, (V->s)+16); |
---|
288 | else |
---|
289 | strcpy(V->seqid, (V->s)+1); |
---|
290 | while ((!feof(V->f)) && (*V->s == ';')) { |
---|
291 | GetLine(V); |
---|
292 | } |
---|
293 | if (feof(V->f)) V->allDone = true; |
---|
294 | else readLoop(0, true, endStrider, V); |
---|
295 | } |
---|
296 | } |
---|
297 | |
---|
298 | |
---|
299 | Local boolean endPIR( boolean *addend, boolean *ungetend, struct ReadSeqVars *V) |
---|
300 | { |
---|
301 | *addend = false; |
---|
302 | *ungetend= (strstr(V->s,"ENTRY") == V->s); |
---|
303 | return ((strstr(V->s,"///") != NULL) || *ungetend); |
---|
304 | } |
---|
305 | |
---|
306 | Local void readPIR(struct ReadSeqVars *V) |
---|
307 | { /*PIR -- many seqs/file */ |
---|
308 | |
---|
309 | while (!V->allDone) { |
---|
310 | while (! (feof(V->f) || strstr(V->s,"ENTRY") || strstr(V->s,"SEQUENCE")) ) |
---|
311 | GetLine(V); |
---|
312 | strcpy(V->seqid, (V->s)+16); |
---|
313 | while (! (feof(V->f) || strstr(V->s,"SEQUENCE") == V->s)) |
---|
314 | GetLine(V); |
---|
315 | readLoop(0, false, endPIR, V); |
---|
316 | |
---|
317 | if (!V->allDone) { |
---|
318 | while (! (feof(V->f) || ((*V->s != 0) |
---|
319 | && (strstr( V->s,"ENTRY") == V->s)))) |
---|
320 | GetLine(V); |
---|
321 | } |
---|
322 | if (feof(V->f)) V->allDone = true; |
---|
323 | } |
---|
324 | } |
---|
325 | |
---|
326 | |
---|
327 | Local boolean endGB( boolean *addend, boolean *ungetend, struct ReadSeqVars *V) |
---|
328 | { |
---|
329 | *addend = false; |
---|
330 | *ungetend= (strstr(V->s,"LOCUS") == V->s); |
---|
331 | return ((strstr(V->s,"//") != NULL) || *ungetend); |
---|
332 | } |
---|
333 | |
---|
334 | Local void readGenBank(struct ReadSeqVars * V) |
---|
335 | { /* GenBank -- many seqs/file */ |
---|
336 | |
---|
337 | V->isseqchar = isSeqNumChar; |
---|
338 | V->isseqcharfirst8 = isSeqChar; |
---|
339 | |
---|
340 | while (!V->allDone) { |
---|
341 | strcpy(V->seqid, (V->s)+12); |
---|
342 | while (! (feof(V->f) || strstr(V->s,"ORIGIN") == V->s)) |
---|
343 | GetLine(V); |
---|
344 | readLoop(0, false, endGB, V); |
---|
345 | |
---|
346 | if (!V->allDone) { |
---|
347 | while (! (feof(V->f) || ((*V->s != 0) |
---|
348 | && (strstr( V->s, "LOCUS") == V->s)))) |
---|
349 | GetLine(V); |
---|
350 | } |
---|
351 | if (feof(V->f)) V->allDone = true; |
---|
352 | } |
---|
353 | |
---|
354 | V->isseqchar = isSeqChar; |
---|
355 | V->isseqcharfirst8 = 0; |
---|
356 | } |
---|
357 | |
---|
358 | |
---|
359 | Local boolean endNBRF( boolean *addend, boolean *ungetend, struct ReadSeqVars *V) |
---|
360 | { |
---|
361 | char *a; |
---|
362 | |
---|
363 | if ((a = strchr(V->s, '*')) != NULL) { /* end of 1st seq */ |
---|
364 | /* "*" can be valid base symbol, drop it here */ |
---|
365 | *a = 0; |
---|
366 | *addend = true; |
---|
367 | *ungetend= false; |
---|
368 | return(true); |
---|
369 | } |
---|
370 | else if (*V->s == '>') { /* start of next seq */ |
---|
371 | *addend = false; |
---|
372 | *ungetend= true; |
---|
373 | return(true); |
---|
374 | } |
---|
375 | else |
---|
376 | return(false); |
---|
377 | } |
---|
378 | |
---|
379 | Local void readNBRF(struct ReadSeqVars *V) |
---|
380 | { |
---|
381 | while (!V->allDone) { |
---|
382 | strcpy(V->seqid, (V->s)+4); |
---|
383 | GetLine(V); /*skip title-junk line*/ |
---|
384 | readLoop(0, false, endNBRF, V); |
---|
385 | if (!V->allDone) { |
---|
386 | while (!(feof(V->f) || (*V->s != 0 && *V->s == '>'))) |
---|
387 | GetLine(V); |
---|
388 | } |
---|
389 | if (feof(V->f)) V->allDone = true; |
---|
390 | } |
---|
391 | } |
---|
392 | |
---|
393 | |
---|
394 | |
---|
395 | Local boolean endPearson( boolean *addend, boolean *ungetend, struct ReadSeqVars *V) |
---|
396 | { |
---|
397 | *addend = false; |
---|
398 | *ungetend= true; |
---|
399 | return(*V->s == '>'); |
---|
400 | } |
---|
401 | |
---|
402 | Local void readPearson(struct ReadSeqVars *V) |
---|
403 | { |
---|
404 | while (!V->allDone) { |
---|
405 | strcpy(V->seqid, (V->s)+1); |
---|
406 | readLoop(0, false, endPearson, V); |
---|
407 | if (!V->allDone) { |
---|
408 | while (!(feof(V->f) || ((*V->s != 0) && (*V->s == '>')))) |
---|
409 | GetLine(V); |
---|
410 | } |
---|
411 | if (feof(V->f)) V->allDone = true; |
---|
412 | } |
---|
413 | } |
---|
414 | |
---|
415 | |
---|
416 | |
---|
417 | Local boolean endEMBL( boolean *addend, boolean *ungetend, struct ReadSeqVars *V) |
---|
418 | { |
---|
419 | *addend = false; |
---|
420 | *ungetend= (strstr(V->s,"ID ") == V->s); |
---|
421 | return ((strstr(V->s,"//") != NULL) || *ungetend); |
---|
422 | } |
---|
423 | |
---|
424 | Local void readEMBL(struct ReadSeqVars *V) |
---|
425 | { |
---|
426 | while (!V->allDone) { |
---|
427 | strcpy(V->seqid, (V->s)+5); |
---|
428 | do { |
---|
429 | GetLine(V); |
---|
430 | } while (!(feof(V->f) | (strstr(V->s,"SQ ") == V->s))); |
---|
431 | |
---|
432 | readLoop(0, false, endEMBL, V); |
---|
433 | if (!V->allDone) { |
---|
434 | while (!(feof(V->f) | |
---|
435 | ((*V->s != '\0') & (strstr(V->s,"ID ") == V->s)))) |
---|
436 | GetLine(V); |
---|
437 | } |
---|
438 | if (feof(V->f)) V->allDone = true; |
---|
439 | } |
---|
440 | } |
---|
441 | |
---|
442 | |
---|
443 | |
---|
444 | Local boolean endZuker( boolean *addend, boolean *ungetend, struct ReadSeqVars *V) |
---|
445 | { |
---|
446 | *addend = false; |
---|
447 | *ungetend= true; |
---|
448 | return( *V->s == '(' ); |
---|
449 | } |
---|
450 | |
---|
451 | Local void readZuker(struct ReadSeqVars *V) |
---|
452 | { |
---|
453 | /*! 1st string is Zuker's Fortran format */ |
---|
454 | |
---|
455 | while (!V->allDone) { |
---|
456 | GetLine(V); /*s == "seqLen seqid string..."*/ |
---|
457 | strcpy(V->seqid, (V->s)+6); |
---|
458 | readLoop(0, false, endZuker, V); |
---|
459 | if (!V->allDone) { |
---|
460 | while (!(feof(V->f) | |
---|
461 | ((*V->s != '\0') & (*V->s == '(')))) |
---|
462 | GetLine(V); |
---|
463 | } |
---|
464 | if (feof(V->f)) V->allDone = true; |
---|
465 | } |
---|
466 | } |
---|
467 | |
---|
468 | |
---|
469 | |
---|
470 | Local boolean endFitch( boolean *addend, boolean *ungetend, struct ReadSeqVars *V) |
---|
471 | { |
---|
472 | /* this is a somewhat shaky end, |
---|
473 | 1st char of line is non-blank for seq. title |
---|
474 | */ |
---|
475 | *addend = false; |
---|
476 | *ungetend= true; |
---|
477 | return( *V->s != ' ' ); |
---|
478 | } |
---|
479 | |
---|
480 | Local void readFitch(struct ReadSeqVars *V) |
---|
481 | { |
---|
482 | boolean first; |
---|
483 | |
---|
484 | first = true; |
---|
485 | while (!V->allDone) { |
---|
486 | if (!first) strcpy(V->seqid, V->s); |
---|
487 | readLoop(0, first, endFitch, V); |
---|
488 | if (feof(V->f)) V->allDone = true; |
---|
489 | first = false; |
---|
490 | } |
---|
491 | } |
---|
492 | |
---|
493 | |
---|
494 | Local void readPlain(struct ReadSeqVars *V) |
---|
495 | { |
---|
496 | V->nseq++; |
---|
497 | V->addit = (V->choice > 0); |
---|
498 | if (V->addit) V->seqlen = 0; |
---|
499 | addseq(V->seqid, V); /*from above..*/ |
---|
500 | if (V->fname!=NULL) sprintf(V->seqid, "%s [Unknown form]", V->fname); |
---|
501 | else sprintf(V->seqid, " [Unknown form]"); |
---|
502 | do { |
---|
503 | addseq(V->s, V); |
---|
504 | V->done = feof(V->f); |
---|
505 | GetLine(V); |
---|
506 | } while (!V->done); |
---|
507 | if (V->choice == kListSequences) addinfo(V->seqid, V); |
---|
508 | V->allDone = true; |
---|
509 | } |
---|
510 | |
---|
511 | |
---|
512 | Local void readUWGCG(struct ReadSeqVars *V) |
---|
513 | { |
---|
514 | /* |
---|
515 | 10nov91: Reading GCG files casued duplication of last line when |
---|
516 | EOF followed that line !!! |
---|
517 | fix: GetLine now sets *V->s = 0 |
---|
518 | */ |
---|
519 | char *si; |
---|
520 | |
---|
521 | V->nseq++; |
---|
522 | V->addit = (V->choice > 0); |
---|
523 | if (V->addit) V->seqlen = 0; |
---|
524 | strcpy(V->seqid, V->s); |
---|
525 | /*writeseq: " %s Length: %d (today) Check: %d ..\n" */ |
---|
526 | /*drop above or ".." from id*/ |
---|
527 | if ((si = strstr(V->seqid," Length: "))) *si = 0; |
---|
528 | else if ((si = strstr(V->seqid,".."))) *si = 0; |
---|
529 | do { |
---|
530 | V->done = feof(V->f); |
---|
531 | GetLine(V); |
---|
532 | if (!V->done) addseq((V->s), V); |
---|
533 | } while (!V->done); |
---|
534 | if (V->choice == kListSequences) addinfo(V->seqid, V); |
---|
535 | V->allDone = true; |
---|
536 | } |
---|
537 | |
---|
538 | |
---|
539 | Local void readOlsen(struct ReadSeqVars *V) |
---|
540 | { /* G. Olsen /print output from multiple sequence editor */ |
---|
541 | |
---|
542 | char *si, *sj, *sk, *sm, sid[40], snum[20]; |
---|
543 | boolean indata = false; |
---|
544 | int snumlen; |
---|
545 | |
---|
546 | V->addit = (V->choice > 0); |
---|
547 | if (V->addit) V->seqlen = 0; |
---|
548 | rewind(V->f); V->nseq= 0; |
---|
549 | do { |
---|
550 | GetLine(V); |
---|
551 | V->done = feof(V->f); |
---|
552 | |
---|
553 | if (V->done && !(*V->s)) break; |
---|
554 | else if (indata) { |
---|
555 | if ( (si= strstr(V->s, sid)) |
---|
556 | /* && (strstr(V->s, snum) == si - snumlen - 1) ) { */ |
---|
557 | && (sm= strstr(V->s, snum)) && (sm < si - snumlen) ) { |
---|
558 | |
---|
559 | /* Spaces are valid alignment data !! */ |
---|
560 | /* 17Oct91: Error, the left margin is 21 not 22! */ |
---|
561 | /* dropped some nucs up to now -- my example file was right shifted ! */ |
---|
562 | /* variable right id margin, drop id-2 spaces at end */ |
---|
563 | /* |
---|
564 | VMS CC COMPILER (VAXC031) mess up: |
---|
565 | -- Index of 21 is chopping 1st nuc on VMS systems Only! |
---|
566 | Byte-for-byte same ame rnasep.olsen sequence file ! |
---|
567 | */ |
---|
568 | |
---|
569 | /* si = (V->s)+21; < was this before VMS CC wasted my time */ |
---|
570 | si += 10; /* use strstr index plus offset to outfox VMS CC bug */ |
---|
571 | |
---|
572 | if ((sk = strstr(si, sid))) *(sk-2) = 0; |
---|
573 | for (sk = si; *sk != 0; sk++) { |
---|
574 | if (*sk == ' ') *sk = '.'; |
---|
575 | /* 18aug92: !! some olsen masks are NUMBERS !! which addseq eats */ |
---|
576 | else if (isdigit(*sk)) *sk= nonummask[*sk - '0']; |
---|
577 | } |
---|
578 | |
---|
579 | addseq(si, V); |
---|
580 | } |
---|
581 | } |
---|
582 | |
---|
583 | else if ((sk = strstr(V->s, "): "))) { /* seq info header line */ |
---|
584 | /* 18aug92: correct for diff seqs w/ same name -- use number, e.g. */ |
---|
585 | /* 3 (Agr.tume): agrobacterium.prna 18-JUN-1987 16:12 */ |
---|
586 | /* 328 (Agr.tume): agrobacterium.prna XYZ 19-DEC-1992 */ |
---|
587 | (V->nseq)++; |
---|
588 | si = 1 + strchr(V->s,'('); |
---|
589 | *sk = ' '; |
---|
590 | if (V->choice == kListSequences) addinfo( si, V); |
---|
591 | else if (V->nseq == V->choice) { |
---|
592 | strcpy(V->seqid, si); |
---|
593 | sj = strchr(V->seqid, ':'); |
---|
594 | while (*(--sj) == ' ') ; |
---|
595 | while (--sj != V->seqid) { if (*sj == ' ') *sj = '_'; } |
---|
596 | |
---|
597 | *sk = 0; |
---|
598 | while (*(--sk) == ' ') *sk = 0; |
---|
599 | strcpy(sid, si); |
---|
600 | |
---|
601 | si= V->s; |
---|
602 | while ((*si <= ' ') && (*si != 0)) si++; |
---|
603 | snumlen=0; |
---|
604 | while (si[snumlen] > ' ' && snumlen<20) |
---|
605 | { snum[snumlen]= si[snumlen]; snumlen++; } |
---|
606 | snum[snumlen]= 0; |
---|
607 | } |
---|
608 | |
---|
609 | } |
---|
610 | |
---|
611 | else if (strstr(V->s,"identity: Data:")) { |
---|
612 | indata = true; |
---|
613 | if (V->choice == kListSequences) V->done = true; |
---|
614 | } |
---|
615 | |
---|
616 | } while (!V->done); |
---|
617 | |
---|
618 | V->allDone = true; |
---|
619 | } /*readOlsen*/ |
---|
620 | |
---|
621 | |
---|
622 | Local void readMSF(struct ReadSeqVars *V) |
---|
623 | { /* gcg's MSF, mult. sequence format, interleaved ! */ |
---|
624 | |
---|
625 | char *si, *sj, sid[128]; |
---|
626 | boolean indata = false; |
---|
627 | int iline= 0; |
---|
628 | |
---|
629 | V->addit = (V->choice > 0); |
---|
630 | if (V->addit) V->seqlen = 0; |
---|
631 | rewind(V->f); V->nseq= 0; |
---|
632 | do { |
---|
633 | GetLine(V); |
---|
634 | V->done = feof(V->f); |
---|
635 | |
---|
636 | if (V->done && !(*V->s)) break; |
---|
637 | else if (indata) { |
---|
638 | /*somename ...gpvedai .......t.. aaigr..vad tvgtgptnse aipaltaaet */ |
---|
639 | /* E gvenae.kgv tentna.tad fvaqpvylpe .nqt...... kv.affynrs */ |
---|
640 | |
---|
641 | si= V->s; |
---|
642 | skipwhitespace(si); |
---|
643 | /* for (sj= si; isalnum(*sj); sj++) ; bug -- cdelwiche uses "-", "_" and others in names*/ |
---|
644 | for (sj= si; *sj > ' '; sj++) ; |
---|
645 | *sj= 0; |
---|
646 | if ( *si ) { |
---|
647 | if ( (0==strcmp(si, sid)) ) { |
---|
648 | addseq(sj+1, V); |
---|
649 | } |
---|
650 | iline++; |
---|
651 | } |
---|
652 | } |
---|
653 | |
---|
654 | else if (NULL != (si = strstr(V->s, "Name: "))) { /* seq info header line */ |
---|
655 | /* Name: somename Len: 100 Check: 7009 Weight: 1.00 */ |
---|
656 | |
---|
657 | (V->nseq)++; |
---|
658 | si += 6; |
---|
659 | if (V->choice == kListSequences) addinfo( si, V); |
---|
660 | else if (V->nseq == V->choice) { |
---|
661 | strcpy(V->seqid, si); |
---|
662 | si = V->seqid; |
---|
663 | skipwhitespace(si); |
---|
664 | /* for (sj= si; isalnum(*sj); sj++) ; -- bug */ |
---|
665 | for (sj= si; *sj > ' '; sj++) ; |
---|
666 | *sj= 0; |
---|
667 | strcpy(sid, si); |
---|
668 | } |
---|
669 | } |
---|
670 | |
---|
671 | else if ( strstr(V->s,"//") /*== V->s*/ ) { |
---|
672 | indata = true; |
---|
673 | iline= 0; |
---|
674 | if (V->choice == kListSequences) V->done = true; |
---|
675 | } |
---|
676 | |
---|
677 | } while (!V->done); |
---|
678 | |
---|
679 | |
---|
680 | V->allDone = true; |
---|
681 | } /*readMSF*/ |
---|
682 | |
---|
683 | |
---|
684 | |
---|
685 | Local void readPAUPinterleaved(struct ReadSeqVars *V) |
---|
686 | { /* PAUP mult. sequence format, interleaved or sequential! */ |
---|
687 | |
---|
688 | char *si, *sj, *send, sid[40], sid1[40], saveseq[255]; |
---|
689 | boolean first = true, indata = false, domatch; |
---|
690 | int iline= 0, ifmc, saveseqlen=0; |
---|
691 | |
---|
692 | #define fixmatchchar(s) { \ |
---|
693 | for (ifmc=0; ifmc<saveseqlen; ifmc++) \ |
---|
694 | if (s[ifmc] == V->matchchar) s[ifmc]= saveseq[ifmc]; } |
---|
695 | |
---|
696 | V->addit = (V->choice > 0); |
---|
697 | V->seqlencount = 0; |
---|
698 | if (V->addit) V->seqlen = 0; |
---|
699 | /* rewind(V->f); V->nseq= 0; << do in caller !*/ |
---|
700 | indata= true; /* call here after we find "matrix" */ |
---|
701 | domatch= (V->matchchar > 0); |
---|
702 | |
---|
703 | do { |
---|
704 | GetLine(V); |
---|
705 | V->done = feof(V->f); |
---|
706 | |
---|
707 | if (V->done && !(*V->s)) break; |
---|
708 | else if (indata) { |
---|
709 | /* [ 1 1 1 ]*/ |
---|
710 | /* human aagcttcaccggcgcagtca ttctcataatcgcccacggR cttacatcct*/ |
---|
711 | /* chimp ................a.t. .c.................a ..........*/ |
---|
712 | /* !! need to correct for V->matchchar */ |
---|
713 | si= V->s; |
---|
714 | skipwhitespace(si); |
---|
715 | if (strchr(si,';')) indata= false; |
---|
716 | |
---|
717 | if (isalnum(*si)) { |
---|
718 | /* valid data line starts w/ a left-justified seq name in columns [0..8] */ |
---|
719 | if (first) { |
---|
720 | (V->nseq)++; |
---|
721 | if (V->nseq >= V->topnseq) first= false; |
---|
722 | for (sj = si; isalnum(*sj); sj++) ; |
---|
723 | send= sj; |
---|
724 | skipwhitespace(sj); |
---|
725 | if (V->choice == kListSequences) { |
---|
726 | *send= 0; |
---|
727 | addinfo( si, V); |
---|
728 | } |
---|
729 | else if (V->nseq == V->choice) { |
---|
730 | if (domatch) { |
---|
731 | if (V->nseq == 1) { strcpy( saveseq, sj); saveseqlen= strlen(saveseq); } |
---|
732 | else fixmatchchar( sj); |
---|
733 | } |
---|
734 | addseq(sj, V); |
---|
735 | *send= 0; |
---|
736 | strcpy(V->seqid, si); |
---|
737 | strcpy(sid, si); |
---|
738 | if (V->nseq == 1) strcpy(sid1, sid); |
---|
739 | } |
---|
740 | } |
---|
741 | |
---|
742 | else if ( (strstr(si, sid) == si) ){ |
---|
743 | while (isalnum(*si)) si++; |
---|
744 | skipwhitespace(si); |
---|
745 | if (domatch) { |
---|
746 | if (V->nseq == 1) { strcpy( saveseq, si); saveseqlen= strlen(saveseq); } |
---|
747 | else fixmatchchar( si); |
---|
748 | } |
---|
749 | addseq(si, V); |
---|
750 | } |
---|
751 | |
---|
752 | else if (domatch && (strstr(si, sid1) == si)) { |
---|
753 | strcpy( saveseq, si); |
---|
754 | saveseqlen= strlen(saveseq); |
---|
755 | } |
---|
756 | |
---|
757 | iline++; |
---|
758 | } |
---|
759 | } |
---|
760 | |
---|
761 | else if ( strstr(V->s,"matrix") ) { |
---|
762 | indata = true; |
---|
763 | iline= 0; |
---|
764 | if (V->choice == kListSequences) V->done = true; |
---|
765 | } |
---|
766 | |
---|
767 | } while (!V->done); |
---|
768 | |
---|
769 | V->allDone = true; |
---|
770 | } /*readPAUPinterleaved*/ |
---|
771 | |
---|
772 | |
---|
773 | |
---|
774 | Local void readPAUPsequential(struct ReadSeqVars *V) |
---|
775 | { /* PAUP mult. sequence format, interleaved or sequential! */ |
---|
776 | char *si, *sj; |
---|
777 | boolean atname = true, indata = false; |
---|
778 | |
---|
779 | V->addit = (V->choice > 0); |
---|
780 | if (V->addit) V->seqlen = 0; |
---|
781 | V->seqlencount = 0; |
---|
782 | /* rewind(V->f); V->nseq= 0; << do in caller !*/ |
---|
783 | indata= true; /* call here after we find "matrix" */ |
---|
784 | do { |
---|
785 | GetLine(V); |
---|
786 | V->done = feof(V->f); |
---|
787 | |
---|
788 | if (V->done && !(*V->s)) break; |
---|
789 | else if (indata) { |
---|
790 | /* [ 1 1 1 ]*/ |
---|
791 | /* human aagcttcaccggcgcagtca ttctcataatcgcccacggR cttacatcct*/ |
---|
792 | /* aagcttcaccggcgcagtca ttctcataatcgcccacggR cttacatcct*/ |
---|
793 | /* chimp ................a.t. .c.................a ..........*/ |
---|
794 | /* ................a.t. .c.................a ..........*/ |
---|
795 | |
---|
796 | si= V->s; |
---|
797 | skipwhitespace(si); |
---|
798 | if (strchr(si,';')) indata= false; |
---|
799 | if (isalnum(*si)) { |
---|
800 | /* valid data line starts w/ a left-justified seq name in columns [0..8] */ |
---|
801 | if (atname) { |
---|
802 | (V->nseq)++; |
---|
803 | V->seqlencount = 0; |
---|
804 | atname= false; |
---|
805 | sj= si+1; |
---|
806 | while (isalnum(*sj)) sj++; |
---|
807 | if (V->choice == kListSequences) { |
---|
808 | /* !! we must count bases to know when topseqlen is reached ! */ |
---|
809 | countseq(sj, V); |
---|
810 | if (V->seqlencount >= V->topseqlen) atname= true; |
---|
811 | *sj= 0; |
---|
812 | addinfo( si, V); |
---|
813 | } |
---|
814 | else if (V->nseq == V->choice) { |
---|
815 | addseq(sj, V); |
---|
816 | V->seqlencount= V->seqlen; |
---|
817 | if (V->seqlencount >= V->topseqlen) atname= true; |
---|
818 | *sj= 0; |
---|
819 | strcpy(V->seqid, si); |
---|
820 | } |
---|
821 | else { |
---|
822 | countseq(sj, V); |
---|
823 | if (V->seqlencount >= V->topseqlen) atname= true; |
---|
824 | } |
---|
825 | } |
---|
826 | |
---|
827 | else if (V->nseq == V->choice) { |
---|
828 | addseq(V->s, V); |
---|
829 | V->seqlencount= V->seqlen; |
---|
830 | if (V->seqlencount >= V->topseqlen) atname= true; |
---|
831 | } |
---|
832 | else { |
---|
833 | countseq(V->s, V); |
---|
834 | if (V->seqlencount >= V->topseqlen) atname= true; |
---|
835 | } |
---|
836 | } |
---|
837 | } |
---|
838 | |
---|
839 | else if ( strstr(V->s,"matrix") ) { |
---|
840 | indata = true; |
---|
841 | atname= true; |
---|
842 | if (V->choice == kListSequences) V->done = true; |
---|
843 | } |
---|
844 | |
---|
845 | } while (!V->done); |
---|
846 | |
---|
847 | V->allDone = true; |
---|
848 | } /*readPAUPsequential*/ |
---|
849 | |
---|
850 | |
---|
851 | Local void readPhylipInterleaved(struct ReadSeqVars *V) |
---|
852 | { |
---|
853 | char *si, *sj; |
---|
854 | boolean first = true; |
---|
855 | int iline= 0; |
---|
856 | |
---|
857 | V->addit = (V->choice > 0); |
---|
858 | if (V->addit) V->seqlen = 0; |
---|
859 | V->seqlencount = 0; |
---|
860 | /* sscanf( V->s, "%d%d", &V->topnseq, &V->topseqlen); << topnseq == 0 !!! bad scan !! */ |
---|
861 | si= V->s; |
---|
862 | skipwhitespace(si); |
---|
863 | V->topnseq= atoi(si); |
---|
864 | while (isdigit(*si)) si++; |
---|
865 | skipwhitespace(si); |
---|
866 | V->topseqlen= atol(si); |
---|
867 | /* fprintf(stderr,"Phylip-ileaf: topnseq=%d topseqlen=%d\n",V->topnseq, V->topseqlen); */ |
---|
868 | |
---|
869 | do { |
---|
870 | GetLine(V); |
---|
871 | V->done = feof(V->f); |
---|
872 | |
---|
873 | if (V->done && !(*V->s)) break; |
---|
874 | si= V->s; |
---|
875 | skipwhitespace(si); |
---|
876 | if (*si != 0) { |
---|
877 | |
---|
878 | if (first) { /* collect seq names + seq, as fprintf(outf,"%-10s ",seqname); */ |
---|
879 | (V->nseq)++; |
---|
880 | if (V->nseq >= V->topnseq) first= false; |
---|
881 | sj= V->s+10; /* past name, start of data */ |
---|
882 | if (V->choice == kListSequences) { |
---|
883 | *sj= 0; |
---|
884 | addinfo( si, V); |
---|
885 | } |
---|
886 | else if (V->nseq == V->choice) { |
---|
887 | addseq(sj, V); |
---|
888 | *sj= 0; |
---|
889 | strcpy(V->seqid, si); |
---|
890 | } |
---|
891 | } |
---|
892 | else if ( iline % V->nseq == V->choice -1 ) { |
---|
893 | addseq(si, V); |
---|
894 | } |
---|
895 | iline++; |
---|
896 | } |
---|
897 | } while (!V->done); |
---|
898 | |
---|
899 | V->allDone = true; |
---|
900 | } /*readPhylipInterleaved*/ |
---|
901 | |
---|
902 | |
---|
903 | |
---|
904 | Local boolean endPhylipSequential( boolean *addend, boolean *ungetend, struct ReadSeqVars *V) |
---|
905 | { |
---|
906 | *addend = false; |
---|
907 | *ungetend= false; |
---|
908 | countseq( V->s, V); |
---|
909 | return V->seqlencount >= V->topseqlen; |
---|
910 | } |
---|
911 | |
---|
912 | Local void readPhylipSequential(struct ReadSeqVars *V) |
---|
913 | { |
---|
914 | short i; |
---|
915 | char *si; |
---|
916 | /* sscanf( V->s, "%d%d", &V->topnseq, &V->topseqlen); < ? bad sscan ? */ |
---|
917 | si= V->s; |
---|
918 | skipwhitespace(si); |
---|
919 | V->topnseq= atoi(si); |
---|
920 | while (isdigit(*si)) si++; |
---|
921 | skipwhitespace(si); |
---|
922 | V->topseqlen= atol(si); |
---|
923 | GetLine(V); |
---|
924 | while (!V->allDone) { |
---|
925 | V->seqlencount= 0; |
---|
926 | strncpy(V->seqid, (V->s), 10); |
---|
927 | V->seqid[10]= 0; |
---|
928 | for (i=0; i<10 && V->s[i]; i++) V->s[i]= ' '; |
---|
929 | readLoop(0, true, endPhylipSequential, V); |
---|
930 | if (feof(V->f)) V->allDone = true; |
---|
931 | } |
---|
932 | } |
---|
933 | |
---|
934 | |
---|
935 | |
---|
936 | |
---|
937 | Local void readSeqMain( |
---|
938 | struct ReadSeqVars *V, |
---|
939 | const long skiplines_, |
---|
940 | const short format_) |
---|
941 | { |
---|
942 | #define tolowerstr(s) { long Itlwr, Ntlwr= strlen(s); \ |
---|
943 | for (Itlwr=0; Itlwr<Ntlwr; Itlwr++) s[Itlwr]= to_lower(s[Itlwr]); } |
---|
944 | |
---|
945 | boolean gotuw; |
---|
946 | long l; |
---|
947 | |
---|
948 | V->linestart= 0; |
---|
949 | V->matchchar= 0; |
---|
950 | if (V->f == NULL) |
---|
951 | V->err = eFileNotFound; |
---|
952 | else { |
---|
953 | |
---|
954 | for (l = skiplines_; l > 0; l--) GetLine( V); |
---|
955 | |
---|
956 | do { |
---|
957 | GetLine( V); |
---|
958 | for (l= strlen(V->s); (l > 0) && (V->s[l] == ' '); l--) ; |
---|
959 | } while ((l == 0) && !feof(V->f)); |
---|
960 | |
---|
961 | if (feof(V->f)) V->err = eNoData; |
---|
962 | |
---|
963 | else switch (format_) { |
---|
964 | case kPlain : readPlain(V); break; |
---|
965 | case kIG : readIG(V); break; |
---|
966 | case kStrider: readStrider(V); break; |
---|
967 | case kGenBank: readGenBank(V); break; |
---|
968 | case kPIR : readPIR(V); break; |
---|
969 | case kNBRF : readNBRF(V); break; |
---|
970 | case kPearson: readPearson(V); break; |
---|
971 | case kEMBL : readEMBL(V); break; |
---|
972 | case kZuker : readZuker(V); break; |
---|
973 | case kOlsen : readOlsen(V); break; |
---|
974 | case kMSF : readMSF(V); break; |
---|
975 | |
---|
976 | case kPAUP : { |
---|
977 | boolean done= false; |
---|
978 | boolean interleaved= false; |
---|
979 | char *cp; |
---|
980 | /* rewind(V->f); V->nseq= 0; ?? assume it is at top ?? skiplines ... */ |
---|
981 | while (!done) { |
---|
982 | GetLine( V); |
---|
983 | tolowerstr( V->s); |
---|
984 | if (strstr( V->s, "matrix")) done= true; |
---|
985 | if (strstr( V->s, "interleav")) interleaved= true; |
---|
986 | if (NULL != (cp=strstr( V->s, "ntax=")) ) V->topnseq= atoi(cp+5); |
---|
987 | if (NULL != (cp=strstr( V->s, "nchar=")) ) V->topseqlen= atoi(cp+6); |
---|
988 | if (NULL != (cp=strstr( V->s, "matchchar=")) ) { |
---|
989 | cp += 10; |
---|
990 | if (*cp=='\'') cp++; |
---|
991 | else if (*cp=='"') cp++; |
---|
992 | V->matchchar= *cp; |
---|
993 | } |
---|
994 | } |
---|
995 | if (interleaved) readPAUPinterleaved(V); |
---|
996 | else readPAUPsequential(V); |
---|
997 | } |
---|
998 | break; |
---|
999 | |
---|
1000 | /* kPhylip: ! can't determine in middle of file which type it is...*/ |
---|
1001 | /* test for interleave or sequential and use Phylip4(ileave) or Phylip2 */ |
---|
1002 | case kPhylip2: |
---|
1003 | readPhylipSequential(V); |
---|
1004 | break; |
---|
1005 | case kPhylip4: /* == kPhylip3 */ |
---|
1006 | readPhylipInterleaved(V); |
---|
1007 | break; |
---|
1008 | |
---|
1009 | default: |
---|
1010 | V->err = eUnknownFormat; |
---|
1011 | break; |
---|
1012 | |
---|
1013 | case kFitch : |
---|
1014 | strcpy(V->seqid, V->s); GetLine(V); |
---|
1015 | readFitch(V); |
---|
1016 | break; |
---|
1017 | |
---|
1018 | case kGCG: |
---|
1019 | do { |
---|
1020 | gotuw = (strstr(V->s,"..") != NULL); |
---|
1021 | if (gotuw) readUWGCG(V); |
---|
1022 | GetLine(V); |
---|
1023 | } while (!(feof(V->f) || V->allDone)); |
---|
1024 | break; |
---|
1025 | } |
---|
1026 | } |
---|
1027 | |
---|
1028 | V->filestart= false; |
---|
1029 | V->seq[V->seqlen] = 0; /* stick a string terminator on it */ |
---|
1030 | } |
---|
1031 | |
---|
1032 | |
---|
1033 | char *readSeqFp( |
---|
1034 | const short whichEntry_, /* index to sequence in file */ |
---|
1035 | FILE *fp_, /* pointer to open seq file */ |
---|
1036 | const long skiplines_, |
---|
1037 | const short format_, /* sequence file format */ |
---|
1038 | long *seqlen_, /* return seq size */ |
---|
1039 | short *nseq_, /* number of seqs in file, for listSeqs() */ |
---|
1040 | short *error_, /* return error */ |
---|
1041 | char *seqid_) /* return seq name/info */ |
---|
1042 | { |
---|
1043 | struct ReadSeqVars V; |
---|
1044 | |
---|
1045 | if (format_ < kMinFormat || format_ > kMaxFormat) { |
---|
1046 | *error_ = eUnknownFormat; |
---|
1047 | *seqlen_ = 0; |
---|
1048 | return NULL; |
---|
1049 | } |
---|
1050 | |
---|
1051 | V.choice = whichEntry_; |
---|
1052 | V.fname = NULL; /* don't know */ |
---|
1053 | V.seq = (char*) calloc(1, kStartLength+1); |
---|
1054 | V.maxseq = kStartLength; |
---|
1055 | V.seqlen = 0; |
---|
1056 | V.seqid = seqid_; |
---|
1057 | |
---|
1058 | V.f = fp_; |
---|
1059 | V.filestart= (ftell( fp_) == 0); |
---|
1060 | /* !! in sequential read, must remove current seq position from choice/whichEntry_ counter !! ... */ |
---|
1061 | if (V.filestart) V.nseq = 0; |
---|
1062 | else V.nseq= *nseq_; /* track where we are in file...*/ |
---|
1063 | |
---|
1064 | *V.seqid = '\0'; |
---|
1065 | V.err = 0; |
---|
1066 | V.nseq = 0; |
---|
1067 | V.isseqchar = isSeqChar; |
---|
1068 | V.isseqcharfirst8 = 0; |
---|
1069 | if (V.choice == kListSequences) ; /* leave as is */ |
---|
1070 | else if (V.choice <= 0) V.choice = 1; /* default ?? */ |
---|
1071 | V.addit = (V.choice > 0); |
---|
1072 | V.allDone = false; |
---|
1073 | |
---|
1074 | readSeqMain(&V, skiplines_, format_); |
---|
1075 | |
---|
1076 | *error_ = V.err; |
---|
1077 | *seqlen_ = V.seqlen; |
---|
1078 | *nseq_ = V.nseq; |
---|
1079 | return V.seq; |
---|
1080 | } |
---|
1081 | |
---|
1082 | char *readSeq( |
---|
1083 | const short whichEntry_, /* index to sequence in file */ |
---|
1084 | const char *filename_, /* file name */ |
---|
1085 | const long skiplines_, |
---|
1086 | const short format_, /* sequence file format */ |
---|
1087 | long *seqlen_, /* return seq size */ |
---|
1088 | short *nseq_, /* number of seqs in file, for listSeqs() */ |
---|
1089 | short *error_, /* return error */ |
---|
1090 | char *seqid_) /* return seq name/info */ |
---|
1091 | { |
---|
1092 | struct ReadSeqVars V; |
---|
1093 | |
---|
1094 | if (format_ < kMinFormat || format_ > kMaxFormat) { |
---|
1095 | *error_ = eUnknownFormat; |
---|
1096 | *seqlen_ = 0; |
---|
1097 | return NULL; |
---|
1098 | } |
---|
1099 | |
---|
1100 | V.choice = whichEntry_; |
---|
1101 | V.fname = filename_; /* don't need to copy string, just ptr to it */ |
---|
1102 | V.seq = (char*) calloc(1, kStartLength+1); |
---|
1103 | V.maxseq = kStartLength; |
---|
1104 | V.seqlen = 0; |
---|
1105 | V.seqid = seqid_; |
---|
1106 | |
---|
1107 | V.f = NULL; |
---|
1108 | *V.seqid = '\0'; |
---|
1109 | V.err = 0; |
---|
1110 | V.nseq = 0; |
---|
1111 | V.isseqchar = isSeqChar; |
---|
1112 | V.isseqcharfirst8 = 0; |
---|
1113 | if (V.choice == kListSequences) ; /* leave as is */ |
---|
1114 | else if (V.choice <= 0) V.choice = 1; /* default ?? */ |
---|
1115 | V.addit = (V.choice > 0); |
---|
1116 | V.allDone = false; |
---|
1117 | |
---|
1118 | V.f = fopen(V.fname, "r"); |
---|
1119 | V.filestart= true; |
---|
1120 | |
---|
1121 | readSeqMain(&V, skiplines_, format_); |
---|
1122 | |
---|
1123 | if (V.f != NULL) fclose(V.f); |
---|
1124 | *error_ = V.err; |
---|
1125 | *seqlen_ = V.seqlen; |
---|
1126 | *nseq_ = V.nseq; |
---|
1127 | return V.seq; |
---|
1128 | } |
---|
1129 | |
---|
1130 | |
---|
1131 | |
---|
1132 | |
---|
1133 | |
---|
1134 | char *listSeqs( |
---|
1135 | const char *filename_, /* file name */ |
---|
1136 | const long skiplines_, |
---|
1137 | const short format_, /* sequence file format */ |
---|
1138 | short *nseq_, /* number of seqs in file, for listSeqs() */ |
---|
1139 | short *error_) /* return error */ |
---|
1140 | { |
---|
1141 | char seqid[256]; |
---|
1142 | long seqlen; |
---|
1143 | |
---|
1144 | return readSeq( kListSequences, filename_, skiplines_, format_, |
---|
1145 | &seqlen, nseq_, error_, seqid); |
---|
1146 | } |
---|
1147 | |
---|
1148 | |
---|
1149 | |
---|
1150 | |
---|
1151 | short seqFileFormat( /* return sequence format number, see ureadseq.h */ |
---|
1152 | const char *filename, |
---|
1153 | long *skiplines, /* return #lines to skip any junk like mail header */ |
---|
1154 | short *error) /* return any error value or 0 */ |
---|
1155 | { |
---|
1156 | FILE *fseq; |
---|
1157 | short format; |
---|
1158 | |
---|
1159 | fseq = fopen(filename, "r"); |
---|
1160 | format= seqFileFormatFp( fseq, skiplines, error); |
---|
1161 | if (fseq!=NULL) fclose(fseq); |
---|
1162 | return format; |
---|
1163 | } |
---|
1164 | |
---|
1165 | short seqFileFormatFp( |
---|
1166 | FILE *fseq, |
---|
1167 | long *skiplines, /* return #lines to skip any junk like mail header */ |
---|
1168 | short *error) /* return any error value or 0 */ |
---|
1169 | { |
---|
1170 | boolean foundIG= false, foundStrider= false, |
---|
1171 | foundGB= false, foundPIR= false, foundEMBL= false, foundNBRF= false, |
---|
1172 | foundPearson= false, foundFitch= false, foundPhylip= false, foundZuker= false, |
---|
1173 | gotolsen= false, gotpaup = false, gotasn1 = false, gotuw= false, gotMSF= false, |
---|
1174 | isfitch= false, isphylip= false, done= false; |
---|
1175 | short format= kUnknown; |
---|
1176 | int nlines= 0, k, splen= 0, otherlines= 0, aminolines= 0, dnalines= 0; |
---|
1177 | char sp[256]; |
---|
1178 | long linestart=0; |
---|
1179 | int maxlines2check=500; |
---|
1180 | |
---|
1181 | #define ReadOneLine(sp) \ |
---|
1182 | { done |= (feof(fseq)); \ |
---|
1183 | readline( fseq, sp, &linestart); \ |
---|
1184 | if (!done) { splen = strlen(sp); ++nlines; } } |
---|
1185 | |
---|
1186 | *skiplines = 0; |
---|
1187 | *error = 0; |
---|
1188 | if (fseq == NULL) { *error = eFileNotFound; return kNoformat; } |
---|
1189 | |
---|
1190 | while ( !done ) { |
---|
1191 | ReadOneLine(sp); |
---|
1192 | |
---|
1193 | /* check for mailer head & skip past if found */ |
---|
1194 | if (nlines < 4 && !done) { |
---|
1195 | if ((strstr(sp,"From ") == sp) || (strstr(sp,"Received:") == sp)) { |
---|
1196 | do { |
---|
1197 | /* skip all lines until find one blank line */ |
---|
1198 | ReadOneLine(sp); |
---|
1199 | if (!done) for (k=0; (k<splen) && (sp[k]==' '); k++) ; |
---|
1200 | } while ((!done) && (k < splen)); |
---|
1201 | *skiplines = nlines; /* !? do we want #lines or #bytes ?? */ |
---|
1202 | } |
---|
1203 | } |
---|
1204 | |
---|
1205 | if (sp==NULL || *sp==0) |
---|
1206 | ; /* nada */ |
---|
1207 | |
---|
1208 | /* high probability identities: */ |
---|
1209 | |
---|
1210 | else if ( strstr(sp,"MSF:") && strstr(sp,"Type:") && strstr(sp,"Check:") ) |
---|
1211 | gotMSF= true; |
---|
1212 | |
---|
1213 | else if ((strstr(sp,"..") != NULL) && (strstr(sp,"Check:") != NULL)) |
---|
1214 | gotuw= true; |
---|
1215 | |
---|
1216 | else if (strstr(sp,"identity: Data:") != NULL) |
---|
1217 | gotolsen= true; |
---|
1218 | |
---|
1219 | else if ( strstr(sp,"::=") && |
---|
1220 | (strstr(sp,"Bioseq") || /* Bioseq or Bioseq-set */ |
---|
1221 | strstr(sp,"Seq-entry") || |
---|
1222 | strstr(sp,"Seq-submit") ) ) /* can we read submit format? */ |
---|
1223 | gotasn1= true; |
---|
1224 | |
---|
1225 | else if ( strstr(sp,"#NEXUS") == sp ) |
---|
1226 | gotpaup= true; |
---|
1227 | |
---|
1228 | /* uncertain identities: */ |
---|
1229 | |
---|
1230 | else if (*sp ==';') { |
---|
1231 | if (strstr(sp,"Strider") !=NULL) foundStrider= true; |
---|
1232 | else foundIG= true; |
---|
1233 | } |
---|
1234 | |
---|
1235 | else if (strstr(sp,"LOCUS") == sp) |
---|
1236 | foundGB= true; |
---|
1237 | else if (strstr(sp,"ORIGIN") == sp) |
---|
1238 | foundGB= true; |
---|
1239 | |
---|
1240 | else if (strstr(sp,"ENTRY ") == sp) /* ? also (strcmp(sp,"\\\\\\")==0) */ |
---|
1241 | foundPIR= true; |
---|
1242 | else if (strstr(sp,"SEQUENCE") == sp) |
---|
1243 | foundPIR= true; |
---|
1244 | |
---|
1245 | else if (*sp == '>') { |
---|
1246 | if (sp[3] == ';') foundNBRF= true; |
---|
1247 | else foundPearson= true; |
---|
1248 | } |
---|
1249 | |
---|
1250 | else if (strstr(sp,"ID ") == sp) |
---|
1251 | foundEMBL= true; |
---|
1252 | else if (strstr(sp,"SQ ") == sp) |
---|
1253 | foundEMBL= true; |
---|
1254 | |
---|
1255 | else if (*sp == '(') |
---|
1256 | foundZuker= true; |
---|
1257 | |
---|
1258 | else { |
---|
1259 | if (nlines - *skiplines == 1) { |
---|
1260 | int ispp= 0, ilen= 0; |
---|
1261 | sscanf( sp, "%d%d", &ispp, &ilen); |
---|
1262 | if (ispp > 0 && ilen > 0) isphylip= true; |
---|
1263 | } |
---|
1264 | else if (isphylip && nlines - *skiplines == 2) { |
---|
1265 | int tseq; |
---|
1266 | tseq= getseqtype(sp+10, strlen(sp+10)); |
---|
1267 | if ( isalpha(*sp) /* 1st letter in 2nd line must be of a name */ |
---|
1268 | && (tseq != kOtherSeq)) /* sequence section must be okay */ |
---|
1269 | foundPhylip= true; |
---|
1270 | } |
---|
1271 | |
---|
1272 | for (k=0, isfitch= true; isfitch & (k < splen); k++) { |
---|
1273 | if (k % 4 == 0) isfitch &= (sp[k] == ' '); |
---|
1274 | else isfitch &= (sp[k] != ' '); |
---|
1275 | } |
---|
1276 | if (isfitch & (splen > 20)) foundFitch= true; |
---|
1277 | |
---|
1278 | /* kRNA && kDNA are fairly certain...*/ |
---|
1279 | switch (getseqtype( sp, splen)) { |
---|
1280 | case kOtherSeq: otherlines++; break; |
---|
1281 | case kAmino : if (splen>20) aminolines++; break; |
---|
1282 | case kDNA : |
---|
1283 | case kRNA : if (splen>20) dnalines++; break; |
---|
1284 | case kNucleic : break; /* not much info ? */ |
---|
1285 | } |
---|
1286 | |
---|
1287 | } |
---|
1288 | |
---|
1289 | /* pretty certain */ |
---|
1290 | if (gotolsen) { |
---|
1291 | format= kOlsen; |
---|
1292 | done= true; |
---|
1293 | } |
---|
1294 | else if (gotMSF) { |
---|
1295 | format= kMSF; |
---|
1296 | done= true; |
---|
1297 | } |
---|
1298 | else if (gotasn1) { |
---|
1299 | /* !! we need to look further and return kASNseqentry | kASNseqset */ |
---|
1300 | /* |
---|
1301 | seqentry key is Seq-entry ::= |
---|
1302 | seqset key is Bioseq-set ::= |
---|
1303 | ?? can't read these yet w/ ncbi tools ?? |
---|
1304 | Seq-submit ::= |
---|
1305 | Bioseq ::= << fails both bioseq-seq and seq-entry parsers ! |
---|
1306 | */ |
---|
1307 | if (strstr(sp,"Bioseq-set")) format= kASNseqset; |
---|
1308 | else if (strstr(sp,"Seq-entry")) format= kASNseqentry; |
---|
1309 | else format= kASN1; /* other form, we can't yet read... */ |
---|
1310 | done= true; |
---|
1311 | } |
---|
1312 | else if (gotpaup) { |
---|
1313 | format= kPAUP; |
---|
1314 | done= true; |
---|
1315 | } |
---|
1316 | |
---|
1317 | else if (gotuw) { |
---|
1318 | if (foundIG) format= kIG; /* a TOIG file from GCG for certain */ |
---|
1319 | else format= kGCG; |
---|
1320 | done= true; |
---|
1321 | } |
---|
1322 | |
---|
1323 | else if ((dnalines > 1) || done || (nlines > maxlines2check)) { |
---|
1324 | /* decide on most likely format */ |
---|
1325 | /* multichar idents: */ |
---|
1326 | if (foundStrider) format= kStrider; |
---|
1327 | else if (foundGB) format= kGenBank; |
---|
1328 | else if (foundPIR) format= kPIR; |
---|
1329 | else if (foundEMBL) format= kEMBL; |
---|
1330 | else if (foundNBRF) format= kNBRF; |
---|
1331 | /* single char idents: */ |
---|
1332 | else if (foundIG) format= kIG; |
---|
1333 | else if (foundPearson) format= kPearson; |
---|
1334 | else if (foundZuker) format= kZuker; |
---|
1335 | /* digit ident: */ |
---|
1336 | else if (foundPhylip) format= kPhylip; |
---|
1337 | /* spacing ident: */ |
---|
1338 | else if (foundFitch) format= kFitch; |
---|
1339 | /* no format chars: */ |
---|
1340 | else if (otherlines > 0) format= kUnknown; |
---|
1341 | else if (dnalines > 1) format= kPlain; |
---|
1342 | else if (aminolines > 1) format= kPlain; |
---|
1343 | else format= kUnknown; |
---|
1344 | |
---|
1345 | done= true; |
---|
1346 | } |
---|
1347 | |
---|
1348 | /* need this for possible long header in olsen format */ |
---|
1349 | else if (strstr(sp,"): ") != NULL) |
---|
1350 | maxlines2check++; |
---|
1351 | } |
---|
1352 | |
---|
1353 | if (format == kPhylip) { |
---|
1354 | /* check for interleaved or sequential -- really messy */ |
---|
1355 | int tname, tseq; |
---|
1356 | long i, j, nspp= 0, nlen= 0, ilen, leaf= 0, seq= 0; |
---|
1357 | char *ps; |
---|
1358 | |
---|
1359 | rewind(fseq); |
---|
1360 | for (i=0; i < *skiplines; i++) ReadOneLine(sp); |
---|
1361 | nlines= 0; |
---|
1362 | ReadOneLine(sp); |
---|
1363 | sscanf( sp, "%ld%ld", &nspp, &nlen); |
---|
1364 | ReadOneLine(sp); /* 1st seq line */ |
---|
1365 | for (ps= sp+10, ilen=0; *ps!=0; ps++) if (isprint(*ps)) ilen++; |
---|
1366 | |
---|
1367 | for (i= 1; i<nspp; i++) { |
---|
1368 | ReadOneLine(sp); |
---|
1369 | |
---|
1370 | tseq= getseqtype(sp+10, strlen(sp+10)); |
---|
1371 | tname= getseqtype(sp, 10); |
---|
1372 | for (j=0, ps= sp; isspace(*ps) && j<10; ps++, j++); |
---|
1373 | for (ps= sp; *ps!=0; ps++) if (isprint(*ps)) ilen++; |
---|
1374 | |
---|
1375 | /* find probable interleaf or sequential ... */ |
---|
1376 | if (j>=9) seq += 10; /* pretty certain not ileaf */ |
---|
1377 | else { |
---|
1378 | if (tseq != tname) leaf++; else seq++; |
---|
1379 | if (tname == kDNA || tname == kRNA) seq++; else leaf++; |
---|
1380 | } |
---|
1381 | |
---|
1382 | if (ilen <= nlen && j<9) { |
---|
1383 | if (tname == kOtherSeq) leaf += 10; |
---|
1384 | else if (tname == kAmino || tname == kDNA || tname == kRNA) seq++; else leaf++; |
---|
1385 | } |
---|
1386 | else if (ilen > nlen) { |
---|
1387 | ilen= 0; |
---|
1388 | } |
---|
1389 | } |
---|
1390 | for ( nspp *= 2 ; i<nspp; i++) { /* this should be only bases if interleaf */ |
---|
1391 | ReadOneLine(sp); |
---|
1392 | |
---|
1393 | tseq= getseqtype(sp+10, strlen(sp+10)); |
---|
1394 | tname= getseqtype(sp, 10); |
---|
1395 | for (ps= sp; *ps!=0; ps++) if (isprint(*ps)) ilen++; |
---|
1396 | for (j=0, ps= sp; isspace(*ps) && j<10; ps++, j++); |
---|
1397 | if (j<9) { |
---|
1398 | if (tname == kOtherSeq) seq += 10; |
---|
1399 | if (tseq != tname) seq++; else leaf++; |
---|
1400 | if (tname == kDNA || tname == kRNA) leaf++; else seq++; |
---|
1401 | } |
---|
1402 | if (ilen > nlen) { |
---|
1403 | if (j>9) leaf += 10; /* must be a name here for sequent */ |
---|
1404 | else if (tname == kOtherSeq) seq += 10; |
---|
1405 | ilen= 0; |
---|
1406 | } |
---|
1407 | } |
---|
1408 | |
---|
1409 | if (leaf > seq) format= kPhylip4; |
---|
1410 | else format= kPhylip2; |
---|
1411 | } |
---|
1412 | |
---|
1413 | return(format); |
---|
1414 | #undef ReadOneLine |
---|
1415 | } /* SeqFileFormat */ |
---|
1416 | |
---|
1417 | |
---|
1418 | |
---|
1419 | |
---|
1420 | unsigned long GCGchecksum( const char *seq, const long seqlen, unsigned long *checktotal) |
---|
1421 | /* GCGchecksum */ |
---|
1422 | { |
---|
1423 | long i, check = 0, count = 0; |
---|
1424 | |
---|
1425 | for (i = 0; i < seqlen; i++) { |
---|
1426 | count++; |
---|
1427 | check += count * to_upper(seq[i]); |
---|
1428 | if (count == 57) count = 0; |
---|
1429 | } |
---|
1430 | check %= 10000; |
---|
1431 | *checktotal += check; |
---|
1432 | *checktotal %= 10000; |
---|
1433 | return check; |
---|
1434 | } |
---|
1435 | |
---|
1436 | /* Table of CRC-32's of all single byte values (made by makecrc.c of ZIP source) */ |
---|
1437 | const unsigned long crctab[] = { |
---|
1438 | 0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L, |
---|
1439 | 0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L, |
---|
1440 | 0xe0d5e91eL, 0x97d2d988L, 0x09b64c2bL, 0x7eb17cbdL, 0xe7b82d07L, |
---|
1441 | 0x90bf1d91L, 0x1db71064L, 0x6ab020f2L, 0xf3b97148L, 0x84be41deL, |
---|
1442 | 0x1adad47dL, 0x6ddde4ebL, 0xf4d4b551L, 0x83d385c7L, 0x136c9856L, |
---|
1443 | 0x646ba8c0L, 0xfd62f97aL, 0x8a65c9ecL, 0x14015c4fL, 0x63066cd9L, |
---|
1444 | 0xfa0f3d63L, 0x8d080df5L, 0x3b6e20c8L, 0x4c69105eL, 0xd56041e4L, |
---|
1445 | 0xa2677172L, 0x3c03e4d1L, 0x4b04d447L, 0xd20d85fdL, 0xa50ab56bL, |
---|
1446 | 0x35b5a8faL, 0x42b2986cL, 0xdbbbc9d6L, 0xacbcf940L, 0x32d86ce3L, |
---|
1447 | 0x45df5c75L, 0xdcd60dcfL, 0xabd13d59L, 0x26d930acL, 0x51de003aL, |
---|
1448 | 0xc8d75180L, 0xbfd06116L, 0x21b4f4b5L, 0x56b3c423L, 0xcfba9599L, |
---|
1449 | 0xb8bda50fL, 0x2802b89eL, 0x5f058808L, 0xc60cd9b2L, 0xb10be924L, |
---|
1450 | 0x2f6f7c87L, 0x58684c11L, 0xc1611dabL, 0xb6662d3dL, 0x76dc4190L, |
---|
1451 | 0x01db7106L, 0x98d220bcL, 0xefd5102aL, 0x71b18589L, 0x06b6b51fL, |
---|
1452 | 0x9fbfe4a5L, 0xe8b8d433L, 0x7807c9a2L, 0x0f00f934L, 0x9609a88eL, |
---|
1453 | 0xe10e9818L, 0x7f6a0dbbL, 0x086d3d2dL, 0x91646c97L, 0xe6635c01L, |
---|
1454 | 0x6b6b51f4L, 0x1c6c6162L, 0x856530d8L, 0xf262004eL, 0x6c0695edL, |
---|
1455 | 0x1b01a57bL, 0x8208f4c1L, 0xf50fc457L, 0x65b0d9c6L, 0x12b7e950L, |
---|
1456 | 0x8bbeb8eaL, 0xfcb9887cL, 0x62dd1ddfL, 0x15da2d49L, 0x8cd37cf3L, |
---|
1457 | 0xfbd44c65L, 0x4db26158L, 0x3ab551ceL, 0xa3bc0074L, 0xd4bb30e2L, |
---|
1458 | 0x4adfa541L, 0x3dd895d7L, 0xa4d1c46dL, 0xd3d6f4fbL, 0x4369e96aL, |
---|
1459 | 0x346ed9fcL, 0xad678846L, 0xda60b8d0L, 0x44042d73L, 0x33031de5L, |
---|
1460 | 0xaa0a4c5fL, 0xdd0d7cc9L, 0x5005713cL, 0x270241aaL, 0xbe0b1010L, |
---|
1461 | 0xc90c2086L, 0x5768b525L, 0x206f85b3L, 0xb966d409L, 0xce61e49fL, |
---|
1462 | 0x5edef90eL, 0x29d9c998L, 0xb0d09822L, 0xc7d7a8b4L, 0x59b33d17L, |
---|
1463 | 0x2eb40d81L, 0xb7bd5c3bL, 0xc0ba6cadL, 0xedb88320L, 0x9abfb3b6L, |
---|
1464 | 0x03b6e20cL, 0x74b1d29aL, 0xead54739L, 0x9dd277afL, 0x04db2615L, |
---|
1465 | 0x73dc1683L, 0xe3630b12L, 0x94643b84L, 0x0d6d6a3eL, 0x7a6a5aa8L, |
---|
1466 | 0xe40ecf0bL, 0x9309ff9dL, 0x0a00ae27L, 0x7d079eb1L, 0xf00f9344L, |
---|
1467 | 0x8708a3d2L, 0x1e01f268L, 0x6906c2feL, 0xf762575dL, 0x806567cbL, |
---|
1468 | 0x196c3671L, 0x6e6b06e7L, 0xfed41b76L, 0x89d32be0L, 0x10da7a5aL, |
---|
1469 | 0x67dd4accL, 0xf9b9df6fL, 0x8ebeeff9L, 0x17b7be43L, 0x60b08ed5L, |
---|
1470 | 0xd6d6a3e8L, 0xa1d1937eL, 0x38d8c2c4L, 0x4fdff252L, 0xd1bb67f1L, |
---|
1471 | 0xa6bc5767L, 0x3fb506ddL, 0x48b2364bL, 0xd80d2bdaL, 0xaf0a1b4cL, |
---|
1472 | 0x36034af6L, 0x41047a60L, 0xdf60efc3L, 0xa867df55L, 0x316e8eefL, |
---|
1473 | 0x4669be79L, 0xcb61b38cL, 0xbc66831aL, 0x256fd2a0L, 0x5268e236L, |
---|
1474 | 0xcc0c7795L, 0xbb0b4703L, 0x220216b9L, 0x5505262fL, 0xc5ba3bbeL, |
---|
1475 | 0xb2bd0b28L, 0x2bb45a92L, 0x5cb36a04L, 0xc2d7ffa7L, 0xb5d0cf31L, |
---|
1476 | 0x2cd99e8bL, 0x5bdeae1dL, 0x9b64c2b0L, 0xec63f226L, 0x756aa39cL, |
---|
1477 | 0x026d930aL, 0x9c0906a9L, 0xeb0e363fL, 0x72076785L, 0x05005713L, |
---|
1478 | 0x95bf4a82L, 0xe2b87a14L, 0x7bb12baeL, 0x0cb61b38L, 0x92d28e9bL, |
---|
1479 | 0xe5d5be0dL, 0x7cdcefb7L, 0x0bdbdf21L, 0x86d3d2d4L, 0xf1d4e242L, |
---|
1480 | 0x68ddb3f8L, 0x1fda836eL, 0x81be16cdL, 0xf6b9265bL, 0x6fb077e1L, |
---|
1481 | 0x18b74777L, 0x88085ae6L, 0xff0f6a70L, 0x66063bcaL, 0x11010b5cL, |
---|
1482 | 0x8f659effL, 0xf862ae69L, 0x616bffd3L, 0x166ccf45L, 0xa00ae278L, |
---|
1483 | 0xd70dd2eeL, 0x4e048354L, 0x3903b3c2L, 0xa7672661L, 0xd06016f7L, |
---|
1484 | 0x4969474dL, 0x3e6e77dbL, 0xaed16a4aL, 0xd9d65adcL, 0x40df0b66L, |
---|
1485 | 0x37d83bf0L, 0xa9bcae53L, 0xdebb9ec5L, 0x47b2cf7fL, 0x30b5ffe9L, |
---|
1486 | 0xbdbdf21cL, 0xcabac28aL, 0x53b39330L, 0x24b4a3a6L, 0xbad03605L, |
---|
1487 | 0xcdd70693L, 0x54de5729L, 0x23d967bfL, 0xb3667a2eL, 0xc4614ab8L, |
---|
1488 | 0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL, |
---|
1489 | 0x2d02ef8dL |
---|
1490 | }; |
---|
1491 | |
---|
1492 | unsigned long CRC32checksum(const char *seq, const long seqlen, unsigned long *checktotal) |
---|
1493 | /*CRC32checksum: modified from CRC-32 algorithm found in ZIP compression source */ |
---|
1494 | { |
---|
1495 | unsigned long c = 0xffffffffL; |
---|
1496 | long n = seqlen; |
---|
1497 | |
---|
1498 | if (!seq) return 0; |
---|
1499 | while (n--) { |
---|
1500 | c = crctab[((int)c ^ (to_upper(*seq))) & 0xff] ^ (c >> 8); |
---|
1501 | seq++; /* fixed aug'98 finally */ |
---|
1502 | } |
---|
1503 | c= c ^ 0xffffffffL; |
---|
1504 | *checktotal += c; |
---|
1505 | return c; |
---|
1506 | } |
---|
1507 | |
---|
1508 | |
---|
1509 | |
---|
1510 | |
---|
1511 | short getseqtype( const char *seq, const long seqlen) |
---|
1512 | { /* return sequence kind: kDNA, kRNA, kProtein, kOtherSeq, ??? */ |
---|
1513 | char c; |
---|
1514 | short i, maxtest; |
---|
1515 | short na = 0, aa = 0, po = 0, nt = 0, nu = 0, ns = 0, no = 0; |
---|
1516 | |
---|
1517 | maxtest = min(300, seqlen); |
---|
1518 | for (i = 0; i < maxtest; i++) { |
---|
1519 | c = to_upper(seq[i]); |
---|
1520 | if (strchr(protonly, c)) po++; |
---|
1521 | else if (strchr(primenuc,c)) { |
---|
1522 | na++; |
---|
1523 | if (c == 'T') nt++; |
---|
1524 | else if (c == 'U') nu++; |
---|
1525 | } |
---|
1526 | else if (strchr(aminos,c)) aa++; |
---|
1527 | else if (strchr(seqsymbols,c)) ns++; |
---|
1528 | else if (isalpha(c)) no++; |
---|
1529 | } |
---|
1530 | |
---|
1531 | if ((no > 0) || (po+aa+na == 0)) return kOtherSeq; |
---|
1532 | /* ?? test for probability of kOtherSeq ?, e.g., |
---|
1533 | else if (po+aa+na / maxtest < 0.70) return kOtherSeq; |
---|
1534 | */ |
---|
1535 | else if (po > 0) return kAmino; |
---|
1536 | else if (aa == 0) { |
---|
1537 | if (nu > nt) return kRNA; |
---|
1538 | else return kDNA; |
---|
1539 | } |
---|
1540 | else if (na > aa) return kNucleic; |
---|
1541 | else return kAmino; |
---|
1542 | } /* getseqtype */ |
---|
1543 | |
---|
1544 | |
---|
1545 | char* compressSeq( const char gapc, const char *seq, const long seqlen, long *newlen) |
---|
1546 | { |
---|
1547 | char *a, *b; |
---|
1548 | long i; |
---|
1549 | char *newseq; |
---|
1550 | |
---|
1551 | *newlen= 0; |
---|
1552 | if (!seq) return NULL; |
---|
1553 | newseq = (char*) malloc(seqlen+1); |
---|
1554 | if (!newseq) return NULL; |
---|
1555 | for (a= (char*)seq, b=newseq, i=0; *a!=0; a++) |
---|
1556 | if (*a != gapc) { |
---|
1557 | *b++= *a; |
---|
1558 | i++; |
---|
1559 | } |
---|
1560 | *b= '\0'; |
---|
1561 | newseq = (char*) realloc(newseq, i+1); |
---|
1562 | *newlen= i; |
---|
1563 | return newseq; |
---|
1564 | } |
---|
1565 | |
---|
1566 | |
---|
1567 | |
---|
1568 | /*** |
---|
1569 | char *rtfhead = "{\\rtf1\\defformat\\mac\\deff2 \ |
---|
1570 | {\\fonttbl\ |
---|
1571 | {\\f1\\fmodern Courier;}{\\f2\\fmodern Monaco;}\ |
---|
1572 | {\\f3\\fswiss Helvetica;}{\\f4\\fswiss Geneva;}\ |
---|
1573 | {\\f5\\froman Times;}{\\f6\\froman Palatino;}\ |
---|
1574 | {\\f7\\froman New Century Schlbk;}{\\f8\\ftech Symbol;}}\ |
---|
1575 | {\\stylesheet\ |
---|
1576 | {\\s1 \\f5\\fs20 \\sbasedon0\\snext1 name;}\ |
---|
1577 | {\\s2 \\f3\\fs20 \\sbasedon0\\snext2 num;}\ |
---|
1578 | {\\s3 \\f1\\f21 \\sbasedon0\\snext3 seq;}}"; |
---|
1579 | |
---|
1580 | char *rtftail = "}"; |
---|
1581 | ****/ |
---|
1582 | |
---|
1583 | short writeSeq(FILE *outf, const char *seq, const long seqlen, |
---|
1584 | const short outform, const char *seqid) |
---|
1585 | /* dump sequence to standard output */ |
---|
1586 | { |
---|
1587 | const short kSpaceAll = -9; |
---|
1588 | #define kMaxseqwidth 250 |
---|
1589 | |
---|
1590 | boolean baseonlynum= false; /* nocountsymbols -- only count true bases, not "-" */ |
---|
1591 | short numline = 0; /* only true if we are writing seq number line (for interleave) */ |
---|
1592 | boolean numright = false, numleft = false; |
---|
1593 | boolean nameright = false, nameleft = false; |
---|
1594 | short namewidth = 8, numwidth = 8; |
---|
1595 | short spacer = 0, width = 50, tab = 0; |
---|
1596 | /* new parameters: width, spacer, those above... */ |
---|
1597 | |
---|
1598 | short linesout = 0, seqtype = kNucleic; |
---|
1599 | long i, j, l, l1, ibase; |
---|
1600 | char idword[31], endstr[14]; |
---|
1601 | char seqnamestore[128], *seqname = seqnamestore; |
---|
1602 | char s[kMaxseqwidth], *cp; |
---|
1603 | char nameform[10], numform[10], nocountsymbols[10]; |
---|
1604 | unsigned long checksum = 0, checktotal = 0; |
---|
1605 | |
---|
1606 | gPretty.atseq++; |
---|
1607 | skipwhitespace(seqid); |
---|
1608 | l = min(128, strlen(seqid)); |
---|
1609 | strncpy( seqnamestore, seqid, l); |
---|
1610 | seqname[l] = 0; |
---|
1611 | |
---|
1612 | sscanf( seqname, "%30s", idword); |
---|
1613 | sprintf(numform, "%ld", seqlen); |
---|
1614 | numwidth= strlen(numform)+1; |
---|
1615 | nameform[0]= '\0'; |
---|
1616 | |
---|
1617 | if (strstr(seqname,"checksum") != NULL) { |
---|
1618 | cp = strstr(seqname,"bases"); |
---|
1619 | if (cp!=NULL) { |
---|
1620 | for ( ; (cp!=seqname) && (*cp!=','); cp--) ; |
---|
1621 | if (cp!=seqname) *cp=0; |
---|
1622 | } |
---|
1623 | } |
---|
1624 | |
---|
1625 | strcpy( endstr,""); |
---|
1626 | l1 = 0; |
---|
1627 | |
---|
1628 | if (outform == kGCG || outform == kMSF) |
---|
1629 | checksum = GCGchecksum(seq, seqlen, &checktotal); |
---|
1630 | else |
---|
1631 | checksum = seqchecksum(seq, seqlen, &checktotal); |
---|
1632 | |
---|
1633 | switch (outform) { |
---|
1634 | |
---|
1635 | case kPlain: |
---|
1636 | case kUnknown: /* no header, just sequence */ |
---|
1637 | strcpy(endstr,"\n"); /* end w/ extra blank line */ |
---|
1638 | break; |
---|
1639 | |
---|
1640 | case kOlsen: /* Olsen seq. editor takes plain nucs OR Genbank */ |
---|
1641 | case kGenBank: |
---|
1642 | fprintf(outf,"LOCUS %s %ld bp\n", idword, seqlen); |
---|
1643 | fprintf(outf,"DEFINITION %s, %ld bases, %lX checksum.\n", seqname, seqlen, checksum); |
---|
1644 | /* fprintf(outf,"ACCESSION %s\n", accnum); */ |
---|
1645 | fprintf(outf,"ORIGIN \n"); |
---|
1646 | spacer = 11; |
---|
1647 | numleft = true; |
---|
1648 | numwidth = 8; /* dgg. 1Feb93, patch for GDE fail to read short numwidth */ |
---|
1649 | strcpy(endstr, "\n//"); |
---|
1650 | linesout += 4; |
---|
1651 | break; |
---|
1652 | |
---|
1653 | case kPIR: |
---|
1654 | /* somewhat like genbank... \\\*/ |
---|
1655 | /* fprintf(outf,"\\\\\\\n"); << only at top of file, not each entry... */ |
---|
1656 | fprintf(outf,"ENTRY %s \n", idword); |
---|
1657 | fprintf(outf,"TITLE %s, %ld bases, %lX checksum.\n", seqname, seqlen, checksum); |
---|
1658 | /* fprintf(outf,"ACCESSION %s\n", accnum); */ |
---|
1659 | fprintf(outf,"SEQUENCE \n"); |
---|
1660 | numwidth = 7; |
---|
1661 | width= 30; |
---|
1662 | spacer = kSpaceAll; |
---|
1663 | numleft = true; |
---|
1664 | strcpy(endstr, "\n///"); |
---|
1665 | /* run a top number line for PIR */ |
---|
1666 | for (j=0; j<numwidth; j++) fputc(' ',outf); |
---|
1667 | for (j= 5; j<=width; j += 5) fprintf(outf,"%10ld",j); |
---|
1668 | fputc('\n',outf); |
---|
1669 | linesout += 5; |
---|
1670 | break; |
---|
1671 | |
---|
1672 | case kNBRF: |
---|
1673 | if (getseqtype(seq, seqlen) == kAmino) |
---|
1674 | fprintf(outf,">P1;%s\n", idword); |
---|
1675 | else |
---|
1676 | fprintf(outf,">DL;%s\n", idword); |
---|
1677 | fprintf(outf,"%s, %ld bases, %lX checksum.\n", seqname, seqlen, checksum); |
---|
1678 | spacer = 11; |
---|
1679 | strcpy(endstr,"*\n"); |
---|
1680 | linesout += 3; |
---|
1681 | break; |
---|
1682 | |
---|
1683 | case kEMBL: |
---|
1684 | fprintf(outf,"ID %s\n", idword); |
---|
1685 | /* fprintf(outf,"AC %s\n", accnum); */ |
---|
1686 | fprintf(outf,"DE %s, %ld bases, %lX checksum.\n", seqname, seqlen, checksum); |
---|
1687 | fprintf(outf,"SQ %ld BP\n", seqlen); |
---|
1688 | strcpy(endstr, "\n//"); /* 11Oct90: bug fix*/ |
---|
1689 | tab = 4; /** added 31jan91 */ |
---|
1690 | spacer = 11; /** added 31jan91 */ |
---|
1691 | width = 60; |
---|
1692 | linesout += 4; |
---|
1693 | break; |
---|
1694 | |
---|
1695 | case kGCG: |
---|
1696 | fprintf(outf,"%s\n", seqname); |
---|
1697 | /* fprintf(outf,"ACCESSION %s\n", accnum); */ |
---|
1698 | fprintf(outf," %s Length: %ld (today) Check: %ld ..\n", idword, seqlen, checksum); |
---|
1699 | spacer = 11; |
---|
1700 | numleft = true; |
---|
1701 | strcpy(endstr, "\n"); /* this is insurance to help prevent misreads at eof */ |
---|
1702 | linesout += 3; |
---|
1703 | break; |
---|
1704 | |
---|
1705 | case kStrider: /* ?? map ?*/ |
---|
1706 | fprintf(outf,"; ### from DNA Strider ;-)\n"); |
---|
1707 | fprintf(outf,"; DNA sequence %s, %ld bases, %lX checksum.\n;\n", seqname, seqlen, checksum); |
---|
1708 | strcpy(endstr, "\n//"); |
---|
1709 | linesout += 3; |
---|
1710 | break; |
---|
1711 | |
---|
1712 | case kFitch: |
---|
1713 | fprintf(outf,"%s, %ld bases, %lX checksum.\n", seqname, seqlen, checksum); |
---|
1714 | spacer = 4; |
---|
1715 | width = 60; |
---|
1716 | linesout += 1; |
---|
1717 | break; |
---|
1718 | |
---|
1719 | case kPhylip2: |
---|
1720 | case kPhylip4: |
---|
1721 | /* this is version 3.2/3.4 -- simplest way to write |
---|
1722 | version 3.3 is to write as version 3.2, then |
---|
1723 | re-read file and interleave the species lines */ |
---|
1724 | if (strlen(idword)>10) idword[10] = 0; |
---|
1725 | fprintf(outf,"%-10s ",idword); |
---|
1726 | l1 = -1; |
---|
1727 | tab = 12; |
---|
1728 | spacer = 11; |
---|
1729 | break; |
---|
1730 | |
---|
1731 | case kASN1: |
---|
1732 | seqtype= getseqtype(seq, seqlen); |
---|
1733 | switch (seqtype) { |
---|
1734 | case kDNA : cp= (char*)"dna"; break; |
---|
1735 | case kRNA : cp= (char*)"rna"; break; |
---|
1736 | case kNucleic : cp= (char*)"na"; break; |
---|
1737 | case kAmino : cp= (char*)"aa"; break; |
---|
1738 | case kOtherSeq: cp= (char*)"not-set"; break; |
---|
1739 | } |
---|
1740 | fprintf(outf," seq {\n"); |
---|
1741 | fprintf(outf," id { local id %d },\n", gPretty.atseq); |
---|
1742 | fprintf(outf," descr { title \"%s\" },\n", seqid); |
---|
1743 | fprintf(outf," inst {\n"); |
---|
1744 | fprintf(outf," repr raw, mol %s, length %ld, topology linear,\n", cp, seqlen); |
---|
1745 | fprintf(outf," seq-data\n"); |
---|
1746 | if (seqtype == kAmino) |
---|
1747 | fprintf(outf," iupacaa \""); |
---|
1748 | else |
---|
1749 | fprintf(outf," iupacna \""); |
---|
1750 | l1 = 17; |
---|
1751 | spacer = 0; |
---|
1752 | width = 78; |
---|
1753 | tab = 0; |
---|
1754 | strcpy(endstr,"\"\n } } ,"); |
---|
1755 | linesout += 7; |
---|
1756 | break; |
---|
1757 | |
---|
1758 | case kPAUP: |
---|
1759 | nameleft= true; |
---|
1760 | namewidth = 9; |
---|
1761 | spacer = 21; |
---|
1762 | width = 100; |
---|
1763 | tab = 0; /* 1; */ |
---|
1764 | /* strcpy(endstr,";\nend;"); << this is end of all seqs.. */ |
---|
1765 | /* do a header comment line for paup */ |
---|
1766 | fprintf(outf,"[Name: %-16s Len:%6ld Check: %8lX]\n", idword, seqlen, checksum); |
---|
1767 | linesout += 1; |
---|
1768 | break; |
---|
1769 | |
---|
1770 | case kPretty: |
---|
1771 | numline= gPretty.numline; |
---|
1772 | baseonlynum= gPretty.baseonlynum; |
---|
1773 | namewidth = gPretty.namewidth; |
---|
1774 | numright = gPretty.numright; |
---|
1775 | numleft = gPretty.numleft; |
---|
1776 | nameright = gPretty.nameright; |
---|
1777 | nameleft = gPretty.nameleft; |
---|
1778 | spacer = gPretty.spacer + 1; |
---|
1779 | width = gPretty.seqwidth; |
---|
1780 | tab = gPretty.tab; |
---|
1781 | /* also add rtf formatting w/ font, size, style */ |
---|
1782 | if (gPretty.nametop) { |
---|
1783 | fprintf(outf,"Name: %-16s Len:%6ld Check: %8lX\n", idword, seqlen, checksum); |
---|
1784 | linesout++; |
---|
1785 | } |
---|
1786 | break; |
---|
1787 | |
---|
1788 | case kMSF: |
---|
1789 | fprintf(outf," Name: %-16s Len:%6ld Check: %5ld Weight: 1.00\n", |
---|
1790 | idword, seqlen, checksum); |
---|
1791 | linesout++; |
---|
1792 | nameleft= true; |
---|
1793 | namewidth= 15; /* need MAX namewidth here... */ |
---|
1794 | sprintf(nameform, "%%+%ds ",namewidth); |
---|
1795 | spacer = 11; |
---|
1796 | width = 50; |
---|
1797 | tab = 0; /* 1; */ |
---|
1798 | break; |
---|
1799 | |
---|
1800 | case kIG: |
---|
1801 | fprintf(outf,";%s, %ld bases, %lX checksum.\n", seqname, seqlen, checksum); |
---|
1802 | fprintf(outf,"%s\n", idword); |
---|
1803 | strcpy(endstr,"1"); /* == linear dna */ |
---|
1804 | linesout += 2; |
---|
1805 | break; |
---|
1806 | |
---|
1807 | default : |
---|
1808 | case kZuker: /* don't attempt Zuker's ftn format */ |
---|
1809 | case kPearson: |
---|
1810 | fprintf(outf,">%s, %ld bases, %lX checksum.\n", seqname, seqlen, checksum); |
---|
1811 | linesout += 1; |
---|
1812 | break; |
---|
1813 | } |
---|
1814 | |
---|
1815 | if (*nameform==0) sprintf(nameform, "%%%d.%ds ",namewidth,namewidth); |
---|
1816 | if (numline) sprintf(numform, "%%%ds ",numwidth); |
---|
1817 | else sprintf(numform, "%%%dd ",numwidth); |
---|
1818 | strcpy( nocountsymbols, kNocountsymbols); |
---|
1819 | if (baseonlynum) { |
---|
1820 | if (strchr(nocountsymbols,gPretty.gapchar)==NULL) { |
---|
1821 | strcat(nocountsymbols," "); |
---|
1822 | nocountsymbols[strlen(nocountsymbols)-1]= gPretty.gapchar; |
---|
1823 | } |
---|
1824 | if (gPretty.domatch && (cp=strchr(nocountsymbols,gPretty.matchchar))!=NULL) { |
---|
1825 | *cp= ' '; |
---|
1826 | } |
---|
1827 | } |
---|
1828 | |
---|
1829 | if (numline) { |
---|
1830 | *idword= 0; |
---|
1831 | } |
---|
1832 | |
---|
1833 | width = min(width,kMaxseqwidth); |
---|
1834 | for (i=0, l=0, ibase = 1; i < seqlen; ) { |
---|
1835 | |
---|
1836 | if (l1 < 0) l1 = 0; |
---|
1837 | else if (l1 == 0) { |
---|
1838 | if (nameleft) fprintf(outf, nameform, idword); |
---|
1839 | if (numleft) { if (numline) fprintf(outf, numform, ""); |
---|
1840 | else fprintf(outf, numform, ibase);} |
---|
1841 | for (j=0; j<tab; j++) fputc(' ',outf); |
---|
1842 | } |
---|
1843 | |
---|
1844 | l1++; /* don't count spaces for width*/ |
---|
1845 | if (numline) { |
---|
1846 | if (spacer==kSpaceAll || (spacer != 0 && (l+1) % spacer == 1)) { |
---|
1847 | if (numline==1) fputc(' ',outf); |
---|
1848 | s[l++] = ' '; |
---|
1849 | } |
---|
1850 | if (l1 % 10 == 1 || l1 == width) { |
---|
1851 | if (numline==1) fprintf(outf,"%-9ld ",i+1); |
---|
1852 | s[l++]= '|'; /* == put a number here */ |
---|
1853 | } |
---|
1854 | else s[l++]= ' '; |
---|
1855 | i++; |
---|
1856 | } |
---|
1857 | |
---|
1858 | else { |
---|
1859 | if (spacer==kSpaceAll || (spacer != 0 && (l+1) % spacer == 1)) |
---|
1860 | s[l++] = ' '; |
---|
1861 | if (!baseonlynum) ibase++; |
---|
1862 | else if (0==strchr(nocountsymbols,seq[i])) ibase++; |
---|
1863 | s[l++] = seq[i++]; |
---|
1864 | } |
---|
1865 | |
---|
1866 | if (l1 == width || i == seqlen) { |
---|
1867 | if (outform==kPretty) for ( ; l1<width; l1++) { |
---|
1868 | if (spacer==kSpaceAll || (spacer != 0 && (l+1) % spacer == 1)) |
---|
1869 | s[l++] = ' '; |
---|
1870 | s[l++]=' '; /* pad w/ blanks */ |
---|
1871 | } |
---|
1872 | s[l] = '\0'; |
---|
1873 | l = 0; l1 = 0; |
---|
1874 | |
---|
1875 | if (numline) { |
---|
1876 | if (numline==2) fprintf(outf,"%s",s); /* finish numberline ! and | */ |
---|
1877 | } |
---|
1878 | else { |
---|
1879 | if (i == seqlen) fprintf(outf,"%s%s",s,endstr); |
---|
1880 | else fprintf(outf,"%s",s); |
---|
1881 | if (numright || nameright) fputc(' ',outf); |
---|
1882 | if (numright) fprintf(outf,numform, ibase-1); |
---|
1883 | if (nameright) fprintf(outf, nameform,idword); |
---|
1884 | } |
---|
1885 | fputc('\n',outf); |
---|
1886 | linesout++; |
---|
1887 | } |
---|
1888 | } |
---|
1889 | return linesout; |
---|
1890 | } /*writeSeq*/ |
---|
1891 | |
---|
1892 | |
---|
1893 | |
---|
1894 | /* End file: ureadseq.c */ |
---|