| 1 | /* File: ureadseq.c |
|---|
| 2 | * |
|---|
| 3 | * Reads and writes nucleic/protein sequence in various |
|---|
| 4 | * formats. Data files may have multiple sequences. |
|---|
| 5 | * |
|---|
| 6 | * Copyright 1990 by d.g.gilbert |
|---|
| 7 | * biology dept., indiana university, bloomington, in 47405 |
|---|
| 8 | * e-mail: gilbertd@bio.indiana.edu |
|---|
| 9 | * |
|---|
| 10 | * This program may be freely copied and used by anyone. |
|---|
| 11 | * Developers are encourged to incorporate parts in their |
|---|
| 12 | * programs, rather than devise their own private sequence |
|---|
| 13 | * format. |
|---|
| 14 | * |
|---|
| 15 | * This should compile and run with any ANSI C compiler. |
|---|
| 16 | * |
|---|
| 17 | * ------------------------------------------------------------ |
|---|
| 18 | * |
|---|
| 19 | * Some slight modifications were done by ARB developers |
|---|
| 20 | * |
|---|
| 21 | */ |
|---|
| 22 | |
|---|
| 23 | |
|---|
| 24 | #include <stdio.h> |
|---|
| 25 | #include <stdlib.h> |
|---|
| 26 | #include <string.h> |
|---|
| 27 | #define __NO_CTYPE |
|---|
| 28 | #include <ctype.h> |
|---|
| 29 | |
|---|
| 30 | #include "ureadseq.h" |
|---|
| 31 | prettyopts gPretty; |
|---|
| 32 | |
|---|
| 33 | |
|---|
| 34 | int Strcasecmp(const char *a, const char *b) /* from Nlm_StrICmp */ |
|---|
| 35 | { |
|---|
| 36 | int diff, done; |
|---|
| 37 | if (a == b) return 0; |
|---|
| 38 | done = 0; |
|---|
| 39 | while (! done) { |
|---|
| 40 | diff = to_upper(*a) - to_upper(*b); |
|---|
| 41 | if (diff) return diff; |
|---|
| 42 | if (*a == '\0') done = 1; |
|---|
| 43 | else { a++; b++; } |
|---|
| 44 | } |
|---|
| 45 | return 0; |
|---|
| 46 | } |
|---|
| 47 | |
|---|
| 48 | int Strncasecmp(const char *a, const char *b, long maxn) /* from Nlm_StrNICmp */ |
|---|
| 49 | { |
|---|
| 50 | int diff, done; |
|---|
| 51 | if (a == b) return 0; |
|---|
| 52 | done = 0; |
|---|
| 53 | while (! done) { |
|---|
| 54 | diff = to_upper(*a) - to_upper(*b); |
|---|
| 55 | if (diff) return diff; |
|---|
| 56 | if (*a == '\0') done = 1; |
|---|
| 57 | else { |
|---|
| 58 | a++; b++; maxn--; |
|---|
| 59 | if (! maxn) done = 1; |
|---|
| 60 | } |
|---|
| 61 | } |
|---|
| 62 | return 0; |
|---|
| 63 | } |
|---|
| 64 | |
|---|
| 65 | |
|---|
| 66 | |
|---|
| 67 | |
|---|
| 68 | |
|---|
| 69 | #ifndef Local |
|---|
| 70 | # define Local static /* local functions */ |
|---|
| 71 | #endif |
|---|
| 72 | |
|---|
| 73 | #define kStartLength 500000 /* 20Apr93 temp. bug fix */ |
|---|
| 74 | |
|---|
| 75 | const char *aminos = "ABCDEFGHIKLMNPQRSTVWXYZ*"; |
|---|
| 76 | const char *primenuc = "ACGTU"; |
|---|
| 77 | const char *protonly = "EFIPQZ"; |
|---|
| 78 | |
|---|
| 79 | const char kNocountsymbols[5] = "_.-?"; |
|---|
| 80 | const char stdsymbols[6] = "_.-*?"; |
|---|
| 81 | const char allsymbols[32] = "_.-*?<>{}[]()!@#$%^&=+;:'/|`~\"\\"; |
|---|
| 82 | static const char *seqsymbols = allsymbols; |
|---|
| 83 | |
|---|
| 84 | const char nummask[11] = "0123456789"; |
|---|
| 85 | const char nonummask[11] = "~!@#$%^&*("; |
|---|
| 86 | |
|---|
| 87 | /* |
|---|
| 88 | use general form of isseqchar -- all chars + symbols. |
|---|
| 89 | no formats except nbrf (?) use symbols in data area as |
|---|
| 90 | anything other than sequence chars. |
|---|
| 91 | */ |
|---|
| 92 | |
|---|
| 93 | |
|---|
| 94 | |
|---|
| 95 | /* Local variables for readSeq: */ |
|---|
| 96 | struct ReadSeqVars { |
|---|
| 97 | short choice, err, nseq; |
|---|
| 98 | long seqlen, maxseq, seqlencount; |
|---|
| 99 | short topnseq; |
|---|
| 100 | long topseqlen; |
|---|
| 101 | const char *fname; |
|---|
| 102 | char *seq, *seqid, matchchar; |
|---|
| 103 | boolean allDone, done, filestart, addit; |
|---|
| 104 | FILE *f; |
|---|
| 105 | long linestart; |
|---|
| 106 | char s[256], *sp; |
|---|
| 107 | |
|---|
| 108 | int (*isseqcharfirst8)(); /* Patch by o. strunk (ARB) to allow numbers in genbank sequences*/ |
|---|
| 109 | int (*isseqchar)(); |
|---|
| 110 | /* int (*isseqchar)(int c); << sgi cc hates (int c) */ |
|---|
| 111 | }; |
|---|
| 112 | |
|---|
| 113 | |
|---|
| 114 | |
|---|
| 115 | int isSeqChar(int c) |
|---|
| 116 | { |
|---|
| 117 | return (isalpha(c) || strchr(seqsymbols,c)); |
|---|
| 118 | } |
|---|
| 119 | |
|---|
| 120 | int isSeqNumChar(int c) |
|---|
| 121 | { |
|---|
| 122 | return (isalnum(c) || strchr(seqsymbols,c)); |
|---|
| 123 | } |
|---|
| 124 | |
|---|
| 125 | |
|---|
| 126 | int isAnyChar(int c) |
|---|
| 127 | { |
|---|
| 128 | return isascii(c); /* wrap in case isascii is macro */ |
|---|
| 129 | } |
|---|
| 130 | |
|---|
| 131 | Local void readline(FILE *f, char *s, long *linestart) |
|---|
| 132 | { |
|---|
| 133 | char *cp; |
|---|
| 134 | |
|---|
| 135 | *linestart= ftell(f); |
|---|
| 136 | if (NULL == fgets(s, 256, f)) |
|---|
| 137 | *s = 0; |
|---|
| 138 | else { |
|---|
| 139 | cp = strchr(s, '\n'); |
|---|
| 140 | if (cp != NULL) *cp = 0; |
|---|
| 141 | } |
|---|
| 142 | } |
|---|
| 143 | |
|---|
| 144 | Local void GetLine(struct ReadSeqVars *V) |
|---|
| 145 | { |
|---|
| 146 | readline(V->f, V->s, &V->linestart); |
|---|
| 147 | } |
|---|
| 148 | |
|---|
| 149 | Local void unGetLine(struct ReadSeqVars *V) |
|---|
| 150 | { |
|---|
| 151 | fseek(V->f, V->linestart, 0); |
|---|
| 152 | } |
|---|
| 153 | |
|---|
| 154 | |
|---|
| 155 | Local void addseq(char *s, struct ReadSeqVars * V) |
|---|
| 156 | { |
|---|
| 157 | char *ptr; |
|---|
| 158 | int count = 0; |
|---|
| 159 | if (V->addit) { |
|---|
| 160 | for (;*s != 0;s++,count++) { |
|---|
| 161 | if (count < 9 && V->isseqcharfirst8) { |
|---|
| 162 | if (!(V->isseqcharfirst8) (*s)) continue; |
|---|
| 163 | } |
|---|
| 164 | else { |
|---|
| 165 | if (!(V->isseqchar) (*s)) continue; |
|---|
| 166 | } |
|---|
| 167 | if (V->seqlen >= V->maxseq) { |
|---|
| 168 | V->maxseq += kStartLength; |
|---|
| 169 | ptr = (char *) realloc(V->seq, V->maxseq + 1); |
|---|
| 170 | if (ptr == NULL) { |
|---|
| 171 | V->err = eMemFull; |
|---|
| 172 | return; |
|---|
| 173 | } else |
|---|
| 174 | V->seq = ptr; |
|---|
| 175 | } |
|---|
| 176 | V->seq[(V->seqlen)++] = *s; |
|---|
| 177 | } |
|---|
| 178 | } |
|---|
| 179 | } |
|---|
| 180 | |
|---|
| 181 | Local void countseq(char *s, struct ReadSeqVars *V) |
|---|
| 182 | /* this must count all valid seq chars, for some formats (paup-sequential) even |
|---|
| 183 | if we are skipping seq... */ |
|---|
| 184 | { |
|---|
| 185 | while (*s != 0) { |
|---|
| 186 | if ((V->isseqchar)(*s)) { |
|---|
| 187 | (V->seqlencount)++; |
|---|
| 188 | } |
|---|
| 189 | s++; |
|---|
| 190 | } |
|---|
| 191 | } |
|---|
| 192 | |
|---|
| 193 | |
|---|
| 194 | Local void addinfo(char *s, struct ReadSeqVars *V) |
|---|
| 195 | { |
|---|
| 196 | char s2[256], *si; |
|---|
| 197 | boolean saveadd; |
|---|
| 198 | |
|---|
| 199 | si = s2; |
|---|
| 200 | while (*s == ' ') s++; |
|---|
| 201 | sprintf(si, " %d) %s\n", V->nseq, s); |
|---|
| 202 | |
|---|
| 203 | saveadd = V->addit; |
|---|
| 204 | V->addit = true; |
|---|
| 205 | V->isseqchar = isAnyChar; |
|---|
| 206 | addseq( si, V); |
|---|
| 207 | V->addit = saveadd; |
|---|
| 208 | V->isseqchar = isSeqChar; |
|---|
| 209 | } |
|---|
| 210 | |
|---|
| 211 | |
|---|
| 212 | |
|---|
| 213 | |
|---|
| 214 | Local void readLoop(short margin, boolean addfirst, |
|---|
| 215 | boolean (*endTest)(boolean *addend, boolean *ungetend, struct ReadSeqVars *V), |
|---|
| 216 | struct ReadSeqVars *V) |
|---|
| 217 | { |
|---|
| 218 | boolean addend = false; |
|---|
| 219 | boolean ungetend = false; |
|---|
| 220 | |
|---|
| 221 | V->nseq++; |
|---|
| 222 | if (V->choice == kListSequences) V->addit = false; |
|---|
| 223 | else V->addit = (V->nseq == V->choice); |
|---|
| 224 | if (V->addit) V->seqlen = 0; |
|---|
| 225 | |
|---|
| 226 | if (addfirst) addseq(V->s, V); |
|---|
| 227 | do { |
|---|
| 228 | GetLine(V); |
|---|
| 229 | V->done = feof(V->f); |
|---|
| 230 | V->done |= (*endTest)( &addend, &ungetend, V); |
|---|
| 231 | if (V->addit && (addend || !V->done) && (strlen(V->s) > (unsigned)margin)) { |
|---|
| 232 | addseq( (V->s)+margin, V); |
|---|
| 233 | } |
|---|
| 234 | } while (!V->done); |
|---|
| 235 | |
|---|
| 236 | if (V->choice == kListSequences) addinfo(V->seqid, V); |
|---|
| 237 | else { |
|---|
| 238 | V->allDone = (V->nseq >= V->choice); |
|---|
| 239 | if (V->allDone && ungetend) unGetLine(V); |
|---|
| 240 | } |
|---|
| 241 | } |
|---|
| 242 | |
|---|
| 243 | |
|---|
| 244 | |
|---|
| 245 | Local boolean endIG( boolean *addend, boolean *ungetend, struct ReadSeqVars *V) |
|---|
| 246 | { |
|---|
| 247 | *addend = true; /* 1 or 2 occur in line w/ bases */ |
|---|
| 248 | *ungetend= false; |
|---|
| 249 | return((strchr(V->s,'1')!=NULL) || (strchr(V->s,'2')!=NULL)); |
|---|
| 250 | } |
|---|
| 251 | |
|---|
| 252 | Local void readIG(struct ReadSeqVars *V) |
|---|
| 253 | { |
|---|
| 254 | /* 18Aug92: new IG format -- ^L between sequences in place of ";" */ |
|---|
| 255 | char *si; |
|---|
| 256 | |
|---|
| 257 | while (!V->allDone) { |
|---|
| 258 | do { |
|---|
| 259 | GetLine(V); |
|---|
| 260 | for (si= V->s; *si != 0 && *si < ' '; si++) *si= ' '; /* drop controls */ |
|---|
| 261 | if (*si == 0) *V->s= 0; /* chop line to empty */ |
|---|
| 262 | } while (! (feof(V->f) || ((*V->s != 0) && (*V->s != ';') ) )); |
|---|
| 263 | if (feof(V->f)) |
|---|
| 264 | V->allDone = true; |
|---|
| 265 | else { |
|---|
| 266 | strcpy(V->seqid, V->s); |
|---|
| 267 | readLoop(0, false, endIG, V); |
|---|
| 268 | } |
|---|
| 269 | } |
|---|
| 270 | } |
|---|
| 271 | |
|---|
| 272 | |
|---|
| 273 | |
|---|
| 274 | Local boolean endStrider( boolean *addend, boolean *ungetend, struct ReadSeqVars *V) |
|---|
| 275 | { |
|---|
| 276 | *addend = false; |
|---|
| 277 | *ungetend= false; |
|---|
| 278 | return (strstr( V->s, "//") != NULL); |
|---|
| 279 | } |
|---|
| 280 | |
|---|
| 281 | Local void readStrider(struct ReadSeqVars *V) |
|---|
| 282 | { /* ? only 1 seq/file ? */ |
|---|
| 283 | |
|---|
| 284 | while (!V->allDone) { |
|---|
| 285 | GetLine(V); |
|---|
| 286 | if (strstr(V->s,"; DNA sequence ") == V->s) |
|---|
| 287 | strcpy(V->seqid, (V->s)+16); |
|---|
| 288 | else |
|---|
| 289 | strcpy(V->seqid, (V->s)+1); |
|---|
| 290 | while ((!feof(V->f)) && (*V->s == ';')) { |
|---|
| 291 | GetLine(V); |
|---|
| 292 | } |
|---|
| 293 | if (feof(V->f)) V->allDone = true; |
|---|
| 294 | else readLoop(0, true, endStrider, V); |
|---|
| 295 | } |
|---|
| 296 | } |
|---|
| 297 | |
|---|
| 298 | |
|---|
| 299 | Local boolean endPIR( boolean *addend, boolean *ungetend, struct ReadSeqVars *V) |
|---|
| 300 | { |
|---|
| 301 | *addend = false; |
|---|
| 302 | *ungetend= (strstr(V->s,"ENTRY") == V->s); |
|---|
| 303 | return ((strstr(V->s,"///") != NULL) || *ungetend); |
|---|
| 304 | } |
|---|
| 305 | |
|---|
| 306 | Local void readPIR(struct ReadSeqVars *V) |
|---|
| 307 | { /*PIR -- many seqs/file */ |
|---|
| 308 | |
|---|
| 309 | while (!V->allDone) { |
|---|
| 310 | while (! (feof(V->f) || strstr(V->s,"ENTRY") || strstr(V->s,"SEQUENCE")) ) |
|---|
| 311 | GetLine(V); |
|---|
| 312 | strcpy(V->seqid, (V->s)+16); |
|---|
| 313 | while (! (feof(V->f) || strstr(V->s,"SEQUENCE") == V->s)) |
|---|
| 314 | GetLine(V); |
|---|
| 315 | readLoop(0, false, endPIR, V); |
|---|
| 316 | |
|---|
| 317 | if (!V->allDone) { |
|---|
| 318 | while (! (feof(V->f) || ((*V->s != 0) |
|---|
| 319 | && (strstr( V->s,"ENTRY") == V->s)))) |
|---|
| 320 | GetLine(V); |
|---|
| 321 | } |
|---|
| 322 | if (feof(V->f)) V->allDone = true; |
|---|
| 323 | } |
|---|
| 324 | } |
|---|
| 325 | |
|---|
| 326 | |
|---|
| 327 | Local boolean endGB( boolean *addend, boolean *ungetend, struct ReadSeqVars *V) |
|---|
| 328 | { |
|---|
| 329 | *addend = false; |
|---|
| 330 | *ungetend= (strstr(V->s,"LOCUS") == V->s); |
|---|
| 331 | return ((strstr(V->s,"//") != NULL) || *ungetend); |
|---|
| 332 | } |
|---|
| 333 | |
|---|
| 334 | Local void readGenBank(struct ReadSeqVars * V) |
|---|
| 335 | { /* GenBank -- many seqs/file */ |
|---|
| 336 | |
|---|
| 337 | V->isseqchar = isSeqNumChar; |
|---|
| 338 | V->isseqcharfirst8 = isSeqChar; |
|---|
| 339 | |
|---|
| 340 | while (!V->allDone) { |
|---|
| 341 | strcpy(V->seqid, (V->s)+12); |
|---|
| 342 | while (! (feof(V->f) || strstr(V->s,"ORIGIN") == V->s)) |
|---|
| 343 | GetLine(V); |
|---|
| 344 | readLoop(0, false, endGB, V); |
|---|
| 345 | |
|---|
| 346 | if (!V->allDone) { |
|---|
| 347 | while (! (feof(V->f) || ((*V->s != 0) |
|---|
| 348 | && (strstr( V->s, "LOCUS") == V->s)))) |
|---|
| 349 | GetLine(V); |
|---|
| 350 | } |
|---|
| 351 | if (feof(V->f)) V->allDone = true; |
|---|
| 352 | } |
|---|
| 353 | |
|---|
| 354 | V->isseqchar = isSeqChar; |
|---|
| 355 | V->isseqcharfirst8 = 0; |
|---|
| 356 | } |
|---|
| 357 | |
|---|
| 358 | |
|---|
| 359 | Local boolean endNBRF( boolean *addend, boolean *ungetend, struct ReadSeqVars *V) |
|---|
| 360 | { |
|---|
| 361 | char *a; |
|---|
| 362 | |
|---|
| 363 | if ((a = strchr(V->s, '*')) != NULL) { /* end of 1st seq */ |
|---|
| 364 | /* "*" can be valid base symbol, drop it here */ |
|---|
| 365 | *a = 0; |
|---|
| 366 | *addend = true; |
|---|
| 367 | *ungetend= false; |
|---|
| 368 | return(true); |
|---|
| 369 | } |
|---|
| 370 | else if (*V->s == '>') { /* start of next seq */ |
|---|
| 371 | *addend = false; |
|---|
| 372 | *ungetend= true; |
|---|
| 373 | return(true); |
|---|
| 374 | } |
|---|
| 375 | else |
|---|
| 376 | return(false); |
|---|
| 377 | } |
|---|
| 378 | |
|---|
| 379 | Local void readNBRF(struct ReadSeqVars *V) |
|---|
| 380 | { |
|---|
| 381 | while (!V->allDone) { |
|---|
| 382 | strcpy(V->seqid, (V->s)+4); |
|---|
| 383 | GetLine(V); /*skip title-junk line*/ |
|---|
| 384 | readLoop(0, false, endNBRF, V); |
|---|
| 385 | if (!V->allDone) { |
|---|
| 386 | while (!(feof(V->f) || (*V->s != 0 && *V->s == '>'))) |
|---|
| 387 | GetLine(V); |
|---|
| 388 | } |
|---|
| 389 | if (feof(V->f)) V->allDone = true; |
|---|
| 390 | } |
|---|
| 391 | } |
|---|
| 392 | |
|---|
| 393 | |
|---|
| 394 | |
|---|
| 395 | Local boolean endPearson( boolean *addend, boolean *ungetend, struct ReadSeqVars *V) |
|---|
| 396 | { |
|---|
| 397 | *addend = false; |
|---|
| 398 | *ungetend= true; |
|---|
| 399 | return(*V->s == '>'); |
|---|
| 400 | } |
|---|
| 401 | |
|---|
| 402 | Local void readPearson(struct ReadSeqVars *V) |
|---|
| 403 | { |
|---|
| 404 | while (!V->allDone) { |
|---|
| 405 | strcpy(V->seqid, (V->s)+1); |
|---|
| 406 | readLoop(0, false, endPearson, V); |
|---|
| 407 | if (!V->allDone) { |
|---|
| 408 | while (!(feof(V->f) || ((*V->s != 0) && (*V->s == '>')))) |
|---|
| 409 | GetLine(V); |
|---|
| 410 | } |
|---|
| 411 | if (feof(V->f)) V->allDone = true; |
|---|
| 412 | } |
|---|
| 413 | } |
|---|
| 414 | |
|---|
| 415 | |
|---|
| 416 | |
|---|
| 417 | Local boolean endEMBL( boolean *addend, boolean *ungetend, struct ReadSeqVars *V) |
|---|
| 418 | { |
|---|
| 419 | *addend = false; |
|---|
| 420 | *ungetend= (strstr(V->s,"ID ") == V->s); |
|---|
| 421 | return ((strstr(V->s,"//") != NULL) || *ungetend); |
|---|
| 422 | } |
|---|
| 423 | |
|---|
| 424 | Local void readEMBL(struct ReadSeqVars *V) |
|---|
| 425 | { |
|---|
| 426 | while (!V->allDone) { |
|---|
| 427 | strcpy(V->seqid, (V->s)+5); |
|---|
| 428 | do { |
|---|
| 429 | GetLine(V); |
|---|
| 430 | } while (!(feof(V->f) | (strstr(V->s,"SQ ") == V->s))); |
|---|
| 431 | |
|---|
| 432 | readLoop(0, false, endEMBL, V); |
|---|
| 433 | if (!V->allDone) { |
|---|
| 434 | while (!(feof(V->f) | |
|---|
| 435 | ((*V->s != '\0') & (strstr(V->s,"ID ") == V->s)))) |
|---|
| 436 | GetLine(V); |
|---|
| 437 | } |
|---|
| 438 | if (feof(V->f)) V->allDone = true; |
|---|
| 439 | } |
|---|
| 440 | } |
|---|
| 441 | |
|---|
| 442 | |
|---|
| 443 | |
|---|
| 444 | Local boolean endZuker( boolean *addend, boolean *ungetend, struct ReadSeqVars *V) |
|---|
| 445 | { |
|---|
| 446 | *addend = false; |
|---|
| 447 | *ungetend= true; |
|---|
| 448 | return( *V->s == '(' ); |
|---|
| 449 | } |
|---|
| 450 | |
|---|
| 451 | Local void readZuker(struct ReadSeqVars *V) |
|---|
| 452 | { |
|---|
| 453 | /*! 1st string is Zuker's Fortran format */ |
|---|
| 454 | |
|---|
| 455 | while (!V->allDone) { |
|---|
| 456 | GetLine(V); /*s == "seqLen seqid string..."*/ |
|---|
| 457 | strcpy(V->seqid, (V->s)+6); |
|---|
| 458 | readLoop(0, false, endZuker, V); |
|---|
| 459 | if (!V->allDone) { |
|---|
| 460 | while (!(feof(V->f) | |
|---|
| 461 | ((*V->s != '\0') & (*V->s == '(')))) |
|---|
| 462 | GetLine(V); |
|---|
| 463 | } |
|---|
| 464 | if (feof(V->f)) V->allDone = true; |
|---|
| 465 | } |
|---|
| 466 | } |
|---|
| 467 | |
|---|
| 468 | |
|---|
| 469 | |
|---|
| 470 | Local boolean endFitch( boolean *addend, boolean *ungetend, struct ReadSeqVars *V) |
|---|
| 471 | { |
|---|
| 472 | /* this is a somewhat shaky end, |
|---|
| 473 | 1st char of line is non-blank for seq. title |
|---|
| 474 | */ |
|---|
| 475 | *addend = false; |
|---|
| 476 | *ungetend= true; |
|---|
| 477 | return( *V->s != ' ' ); |
|---|
| 478 | } |
|---|
| 479 | |
|---|
| 480 | Local void readFitch(struct ReadSeqVars *V) |
|---|
| 481 | { |
|---|
| 482 | boolean first; |
|---|
| 483 | |
|---|
| 484 | first = true; |
|---|
| 485 | while (!V->allDone) { |
|---|
| 486 | if (!first) strcpy(V->seqid, V->s); |
|---|
| 487 | readLoop(0, first, endFitch, V); |
|---|
| 488 | if (feof(V->f)) V->allDone = true; |
|---|
| 489 | first = false; |
|---|
| 490 | } |
|---|
| 491 | } |
|---|
| 492 | |
|---|
| 493 | |
|---|
| 494 | Local void readPlain(struct ReadSeqVars *V) |
|---|
| 495 | { |
|---|
| 496 | V->nseq++; |
|---|
| 497 | V->addit = (V->choice > 0); |
|---|
| 498 | if (V->addit) V->seqlen = 0; |
|---|
| 499 | addseq(V->seqid, V); /*from above..*/ |
|---|
| 500 | if (V->fname!=NULL) sprintf(V->seqid, "%s [Unknown form]", V->fname); |
|---|
| 501 | else sprintf(V->seqid, " [Unknown form]"); |
|---|
| 502 | do { |
|---|
| 503 | addseq(V->s, V); |
|---|
| 504 | V->done = feof(V->f); |
|---|
| 505 | GetLine(V); |
|---|
| 506 | } while (!V->done); |
|---|
| 507 | if (V->choice == kListSequences) addinfo(V->seqid, V); |
|---|
| 508 | V->allDone = true; |
|---|
| 509 | } |
|---|
| 510 | |
|---|
| 511 | |
|---|
| 512 | Local void readUWGCG(struct ReadSeqVars *V) |
|---|
| 513 | { |
|---|
| 514 | /* |
|---|
| 515 | 10nov91: Reading GCG files casued duplication of last line when |
|---|
| 516 | EOF followed that line !!! |
|---|
| 517 | fix: GetLine now sets *V->s = 0 |
|---|
| 518 | */ |
|---|
| 519 | char *si; |
|---|
| 520 | |
|---|
| 521 | V->nseq++; |
|---|
| 522 | V->addit = (V->choice > 0); |
|---|
| 523 | if (V->addit) V->seqlen = 0; |
|---|
| 524 | strcpy(V->seqid, V->s); |
|---|
| 525 | /*writeseq: " %s Length: %d (today) Check: %d ..\n" */ |
|---|
| 526 | /*drop above or ".." from id*/ |
|---|
| 527 | if ((si = strstr(V->seqid," Length: "))) *si = 0; |
|---|
| 528 | else if ((si = strstr(V->seqid,".."))) *si = 0; |
|---|
| 529 | do { |
|---|
| 530 | V->done = feof(V->f); |
|---|
| 531 | GetLine(V); |
|---|
| 532 | if (!V->done) addseq((V->s), V); |
|---|
| 533 | } while (!V->done); |
|---|
| 534 | if (V->choice == kListSequences) addinfo(V->seqid, V); |
|---|
| 535 | V->allDone = true; |
|---|
| 536 | } |
|---|
| 537 | |
|---|
| 538 | |
|---|
| 539 | Local void readOlsen(struct ReadSeqVars *V) |
|---|
| 540 | { /* G. Olsen /print output from multiple sequence editor */ |
|---|
| 541 | |
|---|
| 542 | char *si, *sj, *sk, *sm, sid[40], snum[20]; |
|---|
| 543 | boolean indata = false; |
|---|
| 544 | int snumlen; |
|---|
| 545 | |
|---|
| 546 | V->addit = (V->choice > 0); |
|---|
| 547 | if (V->addit) V->seqlen = 0; |
|---|
| 548 | rewind(V->f); V->nseq= 0; |
|---|
| 549 | do { |
|---|
| 550 | GetLine(V); |
|---|
| 551 | V->done = feof(V->f); |
|---|
| 552 | |
|---|
| 553 | if (V->done && !(*V->s)) break; |
|---|
| 554 | else if (indata) { |
|---|
| 555 | if ( (si= strstr(V->s, sid)) |
|---|
| 556 | /* && (strstr(V->s, snum) == si - snumlen - 1) ) { */ |
|---|
| 557 | && (sm= strstr(V->s, snum)) && (sm < si - snumlen) ) { |
|---|
| 558 | |
|---|
| 559 | /* Spaces are valid alignment data !! */ |
|---|
| 560 | /* 17Oct91: Error, the left margin is 21 not 22! */ |
|---|
| 561 | /* dropped some nucs up to now -- my example file was right shifted ! */ |
|---|
| 562 | /* variable right id margin, drop id-2 spaces at end */ |
|---|
| 563 | /* |
|---|
| 564 | VMS CC COMPILER (VAXC031) mess up: |
|---|
| 565 | -- Index of 21 is chopping 1st nuc on VMS systems Only! |
|---|
| 566 | Byte-for-byte same ame rnasep.olsen sequence file ! |
|---|
| 567 | */ |
|---|
| 568 | |
|---|
| 569 | /* si = (V->s)+21; < was this before VMS CC wasted my time */ |
|---|
| 570 | si += 10; /* use strstr index plus offset to outfox VMS CC bug */ |
|---|
| 571 | |
|---|
| 572 | if ((sk = strstr(si, sid))) *(sk-2) = 0; |
|---|
| 573 | for (sk = si; *sk != 0; sk++) { |
|---|
| 574 | if (*sk == ' ') *sk = '.'; |
|---|
| 575 | /* 18aug92: !! some olsen masks are NUMBERS !! which addseq eats */ |
|---|
| 576 | else if (isdigit(*sk)) *sk= nonummask[*sk - '0']; |
|---|
| 577 | } |
|---|
| 578 | |
|---|
| 579 | addseq(si, V); |
|---|
| 580 | } |
|---|
| 581 | } |
|---|
| 582 | |
|---|
| 583 | else if ((sk = strstr(V->s, "): "))) { /* seq info header line */ |
|---|
| 584 | /* 18aug92: correct for diff seqs w/ same name -- use number, e.g. */ |
|---|
| 585 | /* 3 (Agr.tume): agrobacterium.prna 18-JUN-1987 16:12 */ |
|---|
| 586 | /* 328 (Agr.tume): agrobacterium.prna XYZ 19-DEC-1992 */ |
|---|
| 587 | (V->nseq)++; |
|---|
| 588 | si = 1 + strchr(V->s,'('); |
|---|
| 589 | *sk = ' '; |
|---|
| 590 | if (V->choice == kListSequences) addinfo( si, V); |
|---|
| 591 | else if (V->nseq == V->choice) { |
|---|
| 592 | strcpy(V->seqid, si); |
|---|
| 593 | sj = strchr(V->seqid, ':'); |
|---|
| 594 | while (*(--sj) == ' ') ; |
|---|
| 595 | while (--sj != V->seqid) { if (*sj == ' ') *sj = '_'; } |
|---|
| 596 | |
|---|
| 597 | *sk = 0; |
|---|
| 598 | while (*(--sk) == ' ') *sk = 0; |
|---|
| 599 | strcpy(sid, si); |
|---|
| 600 | |
|---|
| 601 | si= V->s; |
|---|
| 602 | while ((*si <= ' ') && (*si != 0)) si++; |
|---|
| 603 | snumlen=0; |
|---|
| 604 | while (si[snumlen] > ' ' && snumlen<20) |
|---|
| 605 | { snum[snumlen]= si[snumlen]; snumlen++; } |
|---|
| 606 | snum[snumlen]= 0; |
|---|
| 607 | } |
|---|
| 608 | |
|---|
| 609 | } |
|---|
| 610 | |
|---|
| 611 | else if (strstr(V->s,"identity: Data:")) { |
|---|
| 612 | indata = true; |
|---|
| 613 | if (V->choice == kListSequences) V->done = true; |
|---|
| 614 | } |
|---|
| 615 | |
|---|
| 616 | } while (!V->done); |
|---|
| 617 | |
|---|
| 618 | V->allDone = true; |
|---|
| 619 | } /*readOlsen*/ |
|---|
| 620 | |
|---|
| 621 | |
|---|
| 622 | Local void readMSF(struct ReadSeqVars *V) |
|---|
| 623 | { /* gcg's MSF, mult. sequence format, interleaved ! */ |
|---|
| 624 | |
|---|
| 625 | char *si, *sj, sid[128]; |
|---|
| 626 | boolean indata = false; |
|---|
| 627 | int iline= 0; |
|---|
| 628 | |
|---|
| 629 | V->addit = (V->choice > 0); |
|---|
| 630 | if (V->addit) V->seqlen = 0; |
|---|
| 631 | rewind(V->f); V->nseq= 0; |
|---|
| 632 | do { |
|---|
| 633 | GetLine(V); |
|---|
| 634 | V->done = feof(V->f); |
|---|
| 635 | |
|---|
| 636 | if (V->done && !(*V->s)) break; |
|---|
| 637 | else if (indata) { |
|---|
| 638 | /*somename ...gpvedai .......t.. aaigr..vad tvgtgptnse aipaltaaet */ |
|---|
| 639 | /* E gvenae.kgv tentna.tad fvaqpvylpe .nqt...... kv.affynrs */ |
|---|
| 640 | |
|---|
| 641 | si= V->s; |
|---|
| 642 | skipwhitespace(si); |
|---|
| 643 | /* for (sj= si; isalnum(*sj); sj++) ; bug -- cdelwiche uses "-", "_" and others in names*/ |
|---|
| 644 | for (sj= si; *sj > ' '; sj++) ; |
|---|
| 645 | *sj= 0; |
|---|
| 646 | if ( *si ) { |
|---|
| 647 | if ( (0==strcmp(si, sid)) ) { |
|---|
| 648 | addseq(sj+1, V); |
|---|
| 649 | } |
|---|
| 650 | iline++; |
|---|
| 651 | } |
|---|
| 652 | } |
|---|
| 653 | |
|---|
| 654 | else if (NULL != (si = strstr(V->s, "Name: "))) { /* seq info header line */ |
|---|
| 655 | /* Name: somename Len: 100 Check: 7009 Weight: 1.00 */ |
|---|
| 656 | |
|---|
| 657 | (V->nseq)++; |
|---|
| 658 | si += 6; |
|---|
| 659 | if (V->choice == kListSequences) addinfo( si, V); |
|---|
| 660 | else if (V->nseq == V->choice) { |
|---|
| 661 | strcpy(V->seqid, si); |
|---|
| 662 | si = V->seqid; |
|---|
| 663 | skipwhitespace(si); |
|---|
| 664 | /* for (sj= si; isalnum(*sj); sj++) ; -- bug */ |
|---|
| 665 | for (sj= si; *sj > ' '; sj++) ; |
|---|
| 666 | *sj= 0; |
|---|
| 667 | strcpy(sid, si); |
|---|
| 668 | } |
|---|
| 669 | } |
|---|
| 670 | |
|---|
| 671 | else if ( strstr(V->s,"//") /*== V->s*/ ) { |
|---|
| 672 | indata = true; |
|---|
| 673 | iline= 0; |
|---|
| 674 | if (V->choice == kListSequences) V->done = true; |
|---|
| 675 | } |
|---|
| 676 | |
|---|
| 677 | } while (!V->done); |
|---|
| 678 | |
|---|
| 679 | |
|---|
| 680 | V->allDone = true; |
|---|
| 681 | } /*readMSF*/ |
|---|
| 682 | |
|---|
| 683 | |
|---|
| 684 | |
|---|
| 685 | Local void readPAUPinterleaved(struct ReadSeqVars *V) |
|---|
| 686 | { /* PAUP mult. sequence format, interleaved or sequential! */ |
|---|
| 687 | |
|---|
| 688 | char *si, *sj, *send, sid[40], sid1[40], saveseq[255]; |
|---|
| 689 | boolean first = true, indata = false, domatch; |
|---|
| 690 | int iline= 0, ifmc, saveseqlen=0; |
|---|
| 691 | |
|---|
| 692 | #define fixmatchchar(s) { \ |
|---|
| 693 | for (ifmc=0; ifmc<saveseqlen; ifmc++) \ |
|---|
| 694 | if (s[ifmc] == V->matchchar) s[ifmc]= saveseq[ifmc]; } |
|---|
| 695 | |
|---|
| 696 | V->addit = (V->choice > 0); |
|---|
| 697 | V->seqlencount = 0; |
|---|
| 698 | if (V->addit) V->seqlen = 0; |
|---|
| 699 | /* rewind(V->f); V->nseq= 0; << do in caller !*/ |
|---|
| 700 | indata= true; /* call here after we find "matrix" */ |
|---|
| 701 | domatch= (V->matchchar > 0); |
|---|
| 702 | |
|---|
| 703 | do { |
|---|
| 704 | GetLine(V); |
|---|
| 705 | V->done = feof(V->f); |
|---|
| 706 | |
|---|
| 707 | if (V->done && !(*V->s)) break; |
|---|
| 708 | else if (indata) { |
|---|
| 709 | /* [ 1 1 1 ]*/ |
|---|
| 710 | /* human aagcttcaccggcgcagtca ttctcataatcgcccacggR cttacatcct*/ |
|---|
| 711 | /* chimp ................a.t. .c.................a ..........*/ |
|---|
| 712 | /* !! need to correct for V->matchchar */ |
|---|
| 713 | si= V->s; |
|---|
| 714 | skipwhitespace(si); |
|---|
| 715 | if (strchr(si,';')) indata= false; |
|---|
| 716 | |
|---|
| 717 | if (isalnum(*si)) { |
|---|
| 718 | /* valid data line starts w/ a left-justified seq name in columns [0..8] */ |
|---|
| 719 | if (first) { |
|---|
| 720 | (V->nseq)++; |
|---|
| 721 | if (V->nseq >= V->topnseq) first= false; |
|---|
| 722 | for (sj = si; isalnum(*sj); sj++) ; |
|---|
| 723 | send= sj; |
|---|
| 724 | skipwhitespace(sj); |
|---|
| 725 | if (V->choice == kListSequences) { |
|---|
| 726 | *send= 0; |
|---|
| 727 | addinfo( si, V); |
|---|
| 728 | } |
|---|
| 729 | else if (V->nseq == V->choice) { |
|---|
| 730 | if (domatch) { |
|---|
| 731 | if (V->nseq == 1) { strcpy( saveseq, sj); saveseqlen= strlen(saveseq); } |
|---|
| 732 | else fixmatchchar( sj); |
|---|
| 733 | } |
|---|
| 734 | addseq(sj, V); |
|---|
| 735 | *send= 0; |
|---|
| 736 | strcpy(V->seqid, si); |
|---|
| 737 | strcpy(sid, si); |
|---|
| 738 | if (V->nseq == 1) strcpy(sid1, sid); |
|---|
| 739 | } |
|---|
| 740 | } |
|---|
| 741 | |
|---|
| 742 | else if ( (strstr(si, sid) == si) ){ |
|---|
| 743 | while (isalnum(*si)) si++; |
|---|
| 744 | skipwhitespace(si); |
|---|
| 745 | if (domatch) { |
|---|
| 746 | if (V->nseq == 1) { strcpy( saveseq, si); saveseqlen= strlen(saveseq); } |
|---|
| 747 | else fixmatchchar( si); |
|---|
| 748 | } |
|---|
| 749 | addseq(si, V); |
|---|
| 750 | } |
|---|
| 751 | |
|---|
| 752 | else if (domatch && (strstr(si, sid1) == si)) { |
|---|
| 753 | strcpy( saveseq, si); |
|---|
| 754 | saveseqlen= strlen(saveseq); |
|---|
| 755 | } |
|---|
| 756 | |
|---|
| 757 | iline++; |
|---|
| 758 | } |
|---|
| 759 | } |
|---|
| 760 | |
|---|
| 761 | else if ( strstr(V->s,"matrix") ) { |
|---|
| 762 | indata = true; |
|---|
| 763 | iline= 0; |
|---|
| 764 | if (V->choice == kListSequences) V->done = true; |
|---|
| 765 | } |
|---|
| 766 | |
|---|
| 767 | } while (!V->done); |
|---|
| 768 | |
|---|
| 769 | V->allDone = true; |
|---|
| 770 | } /*readPAUPinterleaved*/ |
|---|
| 771 | |
|---|
| 772 | |
|---|
| 773 | |
|---|
| 774 | Local void readPAUPsequential(struct ReadSeqVars *V) |
|---|
| 775 | { /* PAUP mult. sequence format, interleaved or sequential! */ |
|---|
| 776 | char *si, *sj; |
|---|
| 777 | boolean atname = true, indata = false; |
|---|
| 778 | |
|---|
| 779 | V->addit = (V->choice > 0); |
|---|
| 780 | if (V->addit) V->seqlen = 0; |
|---|
| 781 | V->seqlencount = 0; |
|---|
| 782 | /* rewind(V->f); V->nseq= 0; << do in caller !*/ |
|---|
| 783 | indata= true; /* call here after we find "matrix" */ |
|---|
| 784 | do { |
|---|
| 785 | GetLine(V); |
|---|
| 786 | V->done = feof(V->f); |
|---|
| 787 | |
|---|
| 788 | if (V->done && !(*V->s)) break; |
|---|
| 789 | else if (indata) { |
|---|
| 790 | /* [ 1 1 1 ]*/ |
|---|
| 791 | /* human aagcttcaccggcgcagtca ttctcataatcgcccacggR cttacatcct*/ |
|---|
| 792 | /* aagcttcaccggcgcagtca ttctcataatcgcccacggR cttacatcct*/ |
|---|
| 793 | /* chimp ................a.t. .c.................a ..........*/ |
|---|
| 794 | /* ................a.t. .c.................a ..........*/ |
|---|
| 795 | |
|---|
| 796 | si= V->s; |
|---|
| 797 | skipwhitespace(si); |
|---|
| 798 | if (strchr(si,';')) indata= false; |
|---|
| 799 | if (isalnum(*si)) { |
|---|
| 800 | /* valid data line starts w/ a left-justified seq name in columns [0..8] */ |
|---|
| 801 | if (atname) { |
|---|
| 802 | (V->nseq)++; |
|---|
| 803 | V->seqlencount = 0; |
|---|
| 804 | atname= false; |
|---|
| 805 | sj= si+1; |
|---|
| 806 | while (isalnum(*sj)) sj++; |
|---|
| 807 | if (V->choice == kListSequences) { |
|---|
| 808 | /* !! we must count bases to know when topseqlen is reached ! */ |
|---|
| 809 | countseq(sj, V); |
|---|
| 810 | if (V->seqlencount >= V->topseqlen) atname= true; |
|---|
| 811 | *sj= 0; |
|---|
| 812 | addinfo( si, V); |
|---|
| 813 | } |
|---|
| 814 | else if (V->nseq == V->choice) { |
|---|
| 815 | addseq(sj, V); |
|---|
| 816 | V->seqlencount= V->seqlen; |
|---|
| 817 | if (V->seqlencount >= V->topseqlen) atname= true; |
|---|
| 818 | *sj= 0; |
|---|
| 819 | strcpy(V->seqid, si); |
|---|
| 820 | } |
|---|
| 821 | else { |
|---|
| 822 | countseq(sj, V); |
|---|
| 823 | if (V->seqlencount >= V->topseqlen) atname= true; |
|---|
| 824 | } |
|---|
| 825 | } |
|---|
| 826 | |
|---|
| 827 | else if (V->nseq == V->choice) { |
|---|
| 828 | addseq(V->s, V); |
|---|
| 829 | V->seqlencount= V->seqlen; |
|---|
| 830 | if (V->seqlencount >= V->topseqlen) atname= true; |
|---|
| 831 | } |
|---|
| 832 | else { |
|---|
| 833 | countseq(V->s, V); |
|---|
| 834 | if (V->seqlencount >= V->topseqlen) atname= true; |
|---|
| 835 | } |
|---|
| 836 | } |
|---|
| 837 | } |
|---|
| 838 | |
|---|
| 839 | else if ( strstr(V->s,"matrix") ) { |
|---|
| 840 | indata = true; |
|---|
| 841 | atname= true; |
|---|
| 842 | if (V->choice == kListSequences) V->done = true; |
|---|
| 843 | } |
|---|
| 844 | |
|---|
| 845 | } while (!V->done); |
|---|
| 846 | |
|---|
| 847 | V->allDone = true; |
|---|
| 848 | } /*readPAUPsequential*/ |
|---|
| 849 | |
|---|
| 850 | |
|---|
| 851 | Local void readPhylipInterleaved(struct ReadSeqVars *V) |
|---|
| 852 | { |
|---|
| 853 | char *si, *sj; |
|---|
| 854 | boolean first = true; |
|---|
| 855 | int iline= 0; |
|---|
| 856 | |
|---|
| 857 | V->addit = (V->choice > 0); |
|---|
| 858 | if (V->addit) V->seqlen = 0; |
|---|
| 859 | V->seqlencount = 0; |
|---|
| 860 | /* sscanf( V->s, "%d%d", &V->topnseq, &V->topseqlen); << topnseq == 0 !!! bad scan !! */ |
|---|
| 861 | si= V->s; |
|---|
| 862 | skipwhitespace(si); |
|---|
| 863 | V->topnseq= atoi(si); |
|---|
| 864 | while (isdigit(*si)) si++; |
|---|
| 865 | skipwhitespace(si); |
|---|
| 866 | V->topseqlen= atol(si); |
|---|
| 867 | /* fprintf(stderr,"Phylip-ileaf: topnseq=%d topseqlen=%d\n",V->topnseq, V->topseqlen); */ |
|---|
| 868 | |
|---|
| 869 | do { |
|---|
| 870 | GetLine(V); |
|---|
| 871 | V->done = feof(V->f); |
|---|
| 872 | |
|---|
| 873 | if (V->done && !(*V->s)) break; |
|---|
| 874 | si= V->s; |
|---|
| 875 | skipwhitespace(si); |
|---|
| 876 | if (*si != 0) { |
|---|
| 877 | |
|---|
| 878 | if (first) { /* collect seq names + seq, as fprintf(outf,"%-10s ",seqname); */ |
|---|
| 879 | (V->nseq)++; |
|---|
| 880 | if (V->nseq >= V->topnseq) first= false; |
|---|
| 881 | sj= V->s+10; /* past name, start of data */ |
|---|
| 882 | if (V->choice == kListSequences) { |
|---|
| 883 | *sj= 0; |
|---|
| 884 | addinfo( si, V); |
|---|
| 885 | } |
|---|
| 886 | else if (V->nseq == V->choice) { |
|---|
| 887 | addseq(sj, V); |
|---|
| 888 | *sj= 0; |
|---|
| 889 | strcpy(V->seqid, si); |
|---|
| 890 | } |
|---|
| 891 | } |
|---|
| 892 | else if ( iline % V->nseq == V->choice -1 ) { |
|---|
| 893 | addseq(si, V); |
|---|
| 894 | } |
|---|
| 895 | iline++; |
|---|
| 896 | } |
|---|
| 897 | } while (!V->done); |
|---|
| 898 | |
|---|
| 899 | V->allDone = true; |
|---|
| 900 | } /*readPhylipInterleaved*/ |
|---|
| 901 | |
|---|
| 902 | |
|---|
| 903 | |
|---|
| 904 | Local boolean endPhylipSequential( boolean *addend, boolean *ungetend, struct ReadSeqVars *V) |
|---|
| 905 | { |
|---|
| 906 | *addend = false; |
|---|
| 907 | *ungetend= false; |
|---|
| 908 | countseq( V->s, V); |
|---|
| 909 | return V->seqlencount >= V->topseqlen; |
|---|
| 910 | } |
|---|
| 911 | |
|---|
| 912 | Local void readPhylipSequential(struct ReadSeqVars *V) |
|---|
| 913 | { |
|---|
| 914 | short i; |
|---|
| 915 | char *si; |
|---|
| 916 | /* sscanf( V->s, "%d%d", &V->topnseq, &V->topseqlen); < ? bad sscan ? */ |
|---|
| 917 | si= V->s; |
|---|
| 918 | skipwhitespace(si); |
|---|
| 919 | V->topnseq= atoi(si); |
|---|
| 920 | while (isdigit(*si)) si++; |
|---|
| 921 | skipwhitespace(si); |
|---|
| 922 | V->topseqlen= atol(si); |
|---|
| 923 | GetLine(V); |
|---|
| 924 | while (!V->allDone) { |
|---|
| 925 | V->seqlencount= 0; |
|---|
| 926 | strncpy(V->seqid, (V->s), 10); |
|---|
| 927 | V->seqid[10]= 0; |
|---|
| 928 | for (i=0; i<10 && V->s[i]; i++) V->s[i]= ' '; |
|---|
| 929 | readLoop(0, true, endPhylipSequential, V); |
|---|
| 930 | if (feof(V->f)) V->allDone = true; |
|---|
| 931 | } |
|---|
| 932 | } |
|---|
| 933 | |
|---|
| 934 | |
|---|
| 935 | |
|---|
| 936 | |
|---|
| 937 | Local void readSeqMain( |
|---|
| 938 | struct ReadSeqVars *V, |
|---|
| 939 | const long skiplines_, |
|---|
| 940 | const short format_) |
|---|
| 941 | { |
|---|
| 942 | #define tolowerstr(s) { long Itlwr, Ntlwr= strlen(s); \ |
|---|
| 943 | for (Itlwr=0; Itlwr<Ntlwr; Itlwr++) s[Itlwr]= to_lower(s[Itlwr]); } |
|---|
| 944 | |
|---|
| 945 | boolean gotuw; |
|---|
| 946 | long l; |
|---|
| 947 | |
|---|
| 948 | V->linestart= 0; |
|---|
| 949 | V->matchchar= 0; |
|---|
| 950 | if (V->f == NULL) |
|---|
| 951 | V->err = eFileNotFound; |
|---|
| 952 | else { |
|---|
| 953 | |
|---|
| 954 | for (l = skiplines_; l > 0; l--) GetLine( V); |
|---|
| 955 | |
|---|
| 956 | do { |
|---|
| 957 | GetLine( V); |
|---|
| 958 | for (l= strlen(V->s); (l > 0) && (V->s[l] == ' '); l--) ; |
|---|
| 959 | } while ((l == 0) && !feof(V->f)); |
|---|
| 960 | |
|---|
| 961 | if (feof(V->f)) V->err = eNoData; |
|---|
| 962 | |
|---|
| 963 | else switch (format_) { |
|---|
| 964 | case kPlain : readPlain(V); break; |
|---|
| 965 | case kIG : readIG(V); break; |
|---|
| 966 | case kStrider: readStrider(V); break; |
|---|
| 967 | case kGenBank: readGenBank(V); break; |
|---|
| 968 | case kPIR : readPIR(V); break; |
|---|
| 969 | case kNBRF : readNBRF(V); break; |
|---|
| 970 | case kPearson: readPearson(V); break; |
|---|
| 971 | case kEMBL : readEMBL(V); break; |
|---|
| 972 | case kZuker : readZuker(V); break; |
|---|
| 973 | case kOlsen : readOlsen(V); break; |
|---|
| 974 | case kMSF : readMSF(V); break; |
|---|
| 975 | |
|---|
| 976 | case kPAUP : { |
|---|
| 977 | boolean done= false; |
|---|
| 978 | boolean interleaved= false; |
|---|
| 979 | char *cp; |
|---|
| 980 | /* rewind(V->f); V->nseq= 0; ?? assume it is at top ?? skiplines ... */ |
|---|
| 981 | while (!done) { |
|---|
| 982 | GetLine( V); |
|---|
| 983 | tolowerstr( V->s); |
|---|
| 984 | if (strstr( V->s, "matrix")) done= true; |
|---|
| 985 | if (strstr( V->s, "interleav")) interleaved= true; |
|---|
| 986 | if (NULL != (cp=strstr( V->s, "ntax=")) ) V->topnseq= atoi(cp+5); |
|---|
| 987 | if (NULL != (cp=strstr( V->s, "nchar=")) ) V->topseqlen= atoi(cp+6); |
|---|
| 988 | if (NULL != (cp=strstr( V->s, "matchchar=")) ) { |
|---|
| 989 | cp += 10; |
|---|
| 990 | if (*cp=='\'') cp++; |
|---|
| 991 | else if (*cp=='"') cp++; |
|---|
| 992 | V->matchchar= *cp; |
|---|
| 993 | } |
|---|
| 994 | } |
|---|
| 995 | if (interleaved) readPAUPinterleaved(V); |
|---|
| 996 | else readPAUPsequential(V); |
|---|
| 997 | } |
|---|
| 998 | break; |
|---|
| 999 | |
|---|
| 1000 | /* kPhylip: ! can't determine in middle of file which type it is...*/ |
|---|
| 1001 | /* test for interleave or sequential and use Phylip4(ileave) or Phylip2 */ |
|---|
| 1002 | case kPhylip2: |
|---|
| 1003 | readPhylipSequential(V); |
|---|
| 1004 | break; |
|---|
| 1005 | case kPhylip4: /* == kPhylip3 */ |
|---|
| 1006 | readPhylipInterleaved(V); |
|---|
| 1007 | break; |
|---|
| 1008 | |
|---|
| 1009 | default: |
|---|
| 1010 | V->err = eUnknownFormat; |
|---|
| 1011 | break; |
|---|
| 1012 | |
|---|
| 1013 | case kFitch : |
|---|
| 1014 | strcpy(V->seqid, V->s); GetLine(V); |
|---|
| 1015 | readFitch(V); |
|---|
| 1016 | break; |
|---|
| 1017 | |
|---|
| 1018 | case kGCG: |
|---|
| 1019 | do { |
|---|
| 1020 | gotuw = (strstr(V->s,"..") != NULL); |
|---|
| 1021 | if (gotuw) readUWGCG(V); |
|---|
| 1022 | GetLine(V); |
|---|
| 1023 | } while (!(feof(V->f) || V->allDone)); |
|---|
| 1024 | break; |
|---|
| 1025 | } |
|---|
| 1026 | } |
|---|
| 1027 | |
|---|
| 1028 | V->filestart= false; |
|---|
| 1029 | V->seq[V->seqlen] = 0; /* stick a string terminator on it */ |
|---|
| 1030 | } |
|---|
| 1031 | |
|---|
| 1032 | |
|---|
| 1033 | char *readSeqFp( |
|---|
| 1034 | const short whichEntry_, /* index to sequence in file */ |
|---|
| 1035 | FILE *fp_, /* pointer to open seq file */ |
|---|
| 1036 | const long skiplines_, |
|---|
| 1037 | const short format_, /* sequence file format */ |
|---|
| 1038 | long *seqlen_, /* return seq size */ |
|---|
| 1039 | short *nseq_, /* number of seqs in file, for listSeqs() */ |
|---|
| 1040 | short *error_, /* return error */ |
|---|
| 1041 | char *seqid_) /* return seq name/info */ |
|---|
| 1042 | { |
|---|
| 1043 | struct ReadSeqVars V; |
|---|
| 1044 | |
|---|
| 1045 | if (format_ < kMinFormat || format_ > kMaxFormat) { |
|---|
| 1046 | *error_ = eUnknownFormat; |
|---|
| 1047 | *seqlen_ = 0; |
|---|
| 1048 | return NULL; |
|---|
| 1049 | } |
|---|
| 1050 | |
|---|
| 1051 | V.choice = whichEntry_; |
|---|
| 1052 | V.fname = NULL; /* don't know */ |
|---|
| 1053 | V.seq = (char*) calloc(1, kStartLength+1); |
|---|
| 1054 | V.maxseq = kStartLength; |
|---|
| 1055 | V.seqlen = 0; |
|---|
| 1056 | V.seqid = seqid_; |
|---|
| 1057 | |
|---|
| 1058 | V.f = fp_; |
|---|
| 1059 | V.filestart= (ftell( fp_) == 0); |
|---|
| 1060 | /* !! in sequential read, must remove current seq position from choice/whichEntry_ counter !! ... */ |
|---|
| 1061 | if (V.filestart) V.nseq = 0; |
|---|
| 1062 | else V.nseq= *nseq_; /* track where we are in file...*/ |
|---|
| 1063 | |
|---|
| 1064 | *V.seqid = '\0'; |
|---|
| 1065 | V.err = 0; |
|---|
| 1066 | V.nseq = 0; |
|---|
| 1067 | V.isseqchar = isSeqChar; |
|---|
| 1068 | V.isseqcharfirst8 = 0; |
|---|
| 1069 | if (V.choice == kListSequences) ; /* leave as is */ |
|---|
| 1070 | else if (V.choice <= 0) V.choice = 1; /* default ?? */ |
|---|
| 1071 | V.addit = (V.choice > 0); |
|---|
| 1072 | V.allDone = false; |
|---|
| 1073 | |
|---|
| 1074 | readSeqMain(&V, skiplines_, format_); |
|---|
| 1075 | |
|---|
| 1076 | *error_ = V.err; |
|---|
| 1077 | *seqlen_ = V.seqlen; |
|---|
| 1078 | *nseq_ = V.nseq; |
|---|
| 1079 | return V.seq; |
|---|
| 1080 | } |
|---|
| 1081 | |
|---|
| 1082 | char *readSeq( |
|---|
| 1083 | const short whichEntry_, /* index to sequence in file */ |
|---|
| 1084 | const char *filename_, /* file name */ |
|---|
| 1085 | const long skiplines_, |
|---|
| 1086 | const short format_, /* sequence file format */ |
|---|
| 1087 | long *seqlen_, /* return seq size */ |
|---|
| 1088 | short *nseq_, /* number of seqs in file, for listSeqs() */ |
|---|
| 1089 | short *error_, /* return error */ |
|---|
| 1090 | char *seqid_) /* return seq name/info */ |
|---|
| 1091 | { |
|---|
| 1092 | struct ReadSeqVars V; |
|---|
| 1093 | |
|---|
| 1094 | if (format_ < kMinFormat || format_ > kMaxFormat) { |
|---|
| 1095 | *error_ = eUnknownFormat; |
|---|
| 1096 | *seqlen_ = 0; |
|---|
| 1097 | return NULL; |
|---|
| 1098 | } |
|---|
| 1099 | |
|---|
| 1100 | V.choice = whichEntry_; |
|---|
| 1101 | V.fname = filename_; /* don't need to copy string, just ptr to it */ |
|---|
| 1102 | V.seq = (char*) calloc(1, kStartLength+1); |
|---|
| 1103 | V.maxseq = kStartLength; |
|---|
| 1104 | V.seqlen = 0; |
|---|
| 1105 | V.seqid = seqid_; |
|---|
| 1106 | |
|---|
| 1107 | V.f = NULL; |
|---|
| 1108 | *V.seqid = '\0'; |
|---|
| 1109 | V.err = 0; |
|---|
| 1110 | V.nseq = 0; |
|---|
| 1111 | V.isseqchar = isSeqChar; |
|---|
| 1112 | V.isseqcharfirst8 = 0; |
|---|
| 1113 | if (V.choice == kListSequences) ; /* leave as is */ |
|---|
| 1114 | else if (V.choice <= 0) V.choice = 1; /* default ?? */ |
|---|
| 1115 | V.addit = (V.choice > 0); |
|---|
| 1116 | V.allDone = false; |
|---|
| 1117 | |
|---|
| 1118 | V.f = fopen(V.fname, "r"); |
|---|
| 1119 | V.filestart= true; |
|---|
| 1120 | |
|---|
| 1121 | readSeqMain(&V, skiplines_, format_); |
|---|
| 1122 | |
|---|
| 1123 | if (V.f != NULL) fclose(V.f); |
|---|
| 1124 | *error_ = V.err; |
|---|
| 1125 | *seqlen_ = V.seqlen; |
|---|
| 1126 | *nseq_ = V.nseq; |
|---|
| 1127 | return V.seq; |
|---|
| 1128 | } |
|---|
| 1129 | |
|---|
| 1130 | |
|---|
| 1131 | |
|---|
| 1132 | |
|---|
| 1133 | |
|---|
| 1134 | char *listSeqs( |
|---|
| 1135 | const char *filename_, /* file name */ |
|---|
| 1136 | const long skiplines_, |
|---|
| 1137 | const short format_, /* sequence file format */ |
|---|
| 1138 | short *nseq_, /* number of seqs in file, for listSeqs() */ |
|---|
| 1139 | short *error_) /* return error */ |
|---|
| 1140 | { |
|---|
| 1141 | char seqid[256]; |
|---|
| 1142 | long seqlen; |
|---|
| 1143 | |
|---|
| 1144 | return readSeq( kListSequences, filename_, skiplines_, format_, |
|---|
| 1145 | &seqlen, nseq_, error_, seqid); |
|---|
| 1146 | } |
|---|
| 1147 | |
|---|
| 1148 | |
|---|
| 1149 | |
|---|
| 1150 | |
|---|
| 1151 | short seqFileFormat( /* return sequence format number, see ureadseq.h */ |
|---|
| 1152 | const char *filename, |
|---|
| 1153 | long *skiplines, /* return #lines to skip any junk like mail header */ |
|---|
| 1154 | short *error) /* return any error value or 0 */ |
|---|
| 1155 | { |
|---|
| 1156 | FILE *fseq; |
|---|
| 1157 | short format; |
|---|
| 1158 | |
|---|
| 1159 | fseq = fopen(filename, "r"); |
|---|
| 1160 | format= seqFileFormatFp( fseq, skiplines, error); |
|---|
| 1161 | if (fseq!=NULL) fclose(fseq); |
|---|
| 1162 | return format; |
|---|
| 1163 | } |
|---|
| 1164 | |
|---|
| 1165 | short seqFileFormatFp( |
|---|
| 1166 | FILE *fseq, |
|---|
| 1167 | long *skiplines, /* return #lines to skip any junk like mail header */ |
|---|
| 1168 | short *error) /* return any error value or 0 */ |
|---|
| 1169 | { |
|---|
| 1170 | boolean foundIG= false, foundStrider= false, |
|---|
| 1171 | foundGB= false, foundPIR= false, foundEMBL= false, foundNBRF= false, |
|---|
| 1172 | foundPearson= false, foundFitch= false, foundPhylip= false, foundZuker= false, |
|---|
| 1173 | gotolsen= false, gotpaup = false, gotasn1 = false, gotuw= false, gotMSF= false, |
|---|
| 1174 | isfitch= false, isphylip= false, done= false; |
|---|
| 1175 | short format= kUnknown; |
|---|
| 1176 | int nlines= 0, k, splen= 0, otherlines= 0, aminolines= 0, dnalines= 0; |
|---|
| 1177 | char sp[256]; |
|---|
| 1178 | long linestart=0; |
|---|
| 1179 | int maxlines2check=500; |
|---|
| 1180 | |
|---|
| 1181 | #define ReadOneLine(sp) \ |
|---|
| 1182 | { done |= (feof(fseq)); \ |
|---|
| 1183 | readline( fseq, sp, &linestart); \ |
|---|
| 1184 | if (!done) { splen = strlen(sp); ++nlines; } } |
|---|
| 1185 | |
|---|
| 1186 | *skiplines = 0; |
|---|
| 1187 | *error = 0; |
|---|
| 1188 | if (fseq == NULL) { *error = eFileNotFound; return kNoformat; } |
|---|
| 1189 | |
|---|
| 1190 | while ( !done ) { |
|---|
| 1191 | ReadOneLine(sp); |
|---|
| 1192 | |
|---|
| 1193 | /* check for mailer head & skip past if found */ |
|---|
| 1194 | if (nlines < 4 && !done) { |
|---|
| 1195 | if ((strstr(sp,"From ") == sp) || (strstr(sp,"Received:") == sp)) { |
|---|
| 1196 | do { |
|---|
| 1197 | /* skip all lines until find one blank line */ |
|---|
| 1198 | ReadOneLine(sp); |
|---|
| 1199 | if (!done) for (k=0; (k<splen) && (sp[k]==' '); k++) ; |
|---|
| 1200 | } while ((!done) && (k < splen)); |
|---|
| 1201 | *skiplines = nlines; /* !? do we want #lines or #bytes ?? */ |
|---|
| 1202 | } |
|---|
| 1203 | } |
|---|
| 1204 | |
|---|
| 1205 | if (sp==NULL || *sp==0) |
|---|
| 1206 | ; /* nada */ |
|---|
| 1207 | |
|---|
| 1208 | /* high probability identities: */ |
|---|
| 1209 | |
|---|
| 1210 | else if ( strstr(sp,"MSF:") && strstr(sp,"Type:") && strstr(sp,"Check:") ) |
|---|
| 1211 | gotMSF= true; |
|---|
| 1212 | |
|---|
| 1213 | else if ((strstr(sp,"..") != NULL) && (strstr(sp,"Check:") != NULL)) |
|---|
| 1214 | gotuw= true; |
|---|
| 1215 | |
|---|
| 1216 | else if (strstr(sp,"identity: Data:") != NULL) |
|---|
| 1217 | gotolsen= true; |
|---|
| 1218 | |
|---|
| 1219 | else if ( strstr(sp,"::=") && |
|---|
| 1220 | (strstr(sp,"Bioseq") || /* Bioseq or Bioseq-set */ |
|---|
| 1221 | strstr(sp,"Seq-entry") || |
|---|
| 1222 | strstr(sp,"Seq-submit") ) ) /* can we read submit format? */ |
|---|
| 1223 | gotasn1= true; |
|---|
| 1224 | |
|---|
| 1225 | else if ( strstr(sp,"#NEXUS") == sp ) |
|---|
| 1226 | gotpaup= true; |
|---|
| 1227 | |
|---|
| 1228 | /* uncertain identities: */ |
|---|
| 1229 | |
|---|
| 1230 | else if (*sp ==';') { |
|---|
| 1231 | if (strstr(sp,"Strider") !=NULL) foundStrider= true; |
|---|
| 1232 | else foundIG= true; |
|---|
| 1233 | } |
|---|
| 1234 | |
|---|
| 1235 | else if (strstr(sp,"LOCUS") == sp) |
|---|
| 1236 | foundGB= true; |
|---|
| 1237 | else if (strstr(sp,"ORIGIN") == sp) |
|---|
| 1238 | foundGB= true; |
|---|
| 1239 | |
|---|
| 1240 | else if (strstr(sp,"ENTRY ") == sp) /* ? also (strcmp(sp,"\\\\\\")==0) */ |
|---|
| 1241 | foundPIR= true; |
|---|
| 1242 | else if (strstr(sp,"SEQUENCE") == sp) |
|---|
| 1243 | foundPIR= true; |
|---|
| 1244 | |
|---|
| 1245 | else if (*sp == '>') { |
|---|
| 1246 | if (sp[3] == ';') foundNBRF= true; |
|---|
| 1247 | else foundPearson= true; |
|---|
| 1248 | } |
|---|
| 1249 | |
|---|
| 1250 | else if (strstr(sp,"ID ") == sp) |
|---|
| 1251 | foundEMBL= true; |
|---|
| 1252 | else if (strstr(sp,"SQ ") == sp) |
|---|
| 1253 | foundEMBL= true; |
|---|
| 1254 | |
|---|
| 1255 | else if (*sp == '(') |
|---|
| 1256 | foundZuker= true; |
|---|
| 1257 | |
|---|
| 1258 | else { |
|---|
| 1259 | if (nlines - *skiplines == 1) { |
|---|
| 1260 | int ispp= 0, ilen= 0; |
|---|
| 1261 | sscanf( sp, "%d%d", &ispp, &ilen); |
|---|
| 1262 | if (ispp > 0 && ilen > 0) isphylip= true; |
|---|
| 1263 | } |
|---|
| 1264 | else if (isphylip && nlines - *skiplines == 2) { |
|---|
| 1265 | int tseq; |
|---|
| 1266 | tseq= getseqtype(sp+10, strlen(sp+10)); |
|---|
| 1267 | if ( isalpha(*sp) /* 1st letter in 2nd line must be of a name */ |
|---|
| 1268 | && (tseq != kOtherSeq)) /* sequence section must be okay */ |
|---|
| 1269 | foundPhylip= true; |
|---|
| 1270 | } |
|---|
| 1271 | |
|---|
| 1272 | for (k=0, isfitch= true; isfitch & (k < splen); k++) { |
|---|
| 1273 | if (k % 4 == 0) isfitch &= (sp[k] == ' '); |
|---|
| 1274 | else isfitch &= (sp[k] != ' '); |
|---|
| 1275 | } |
|---|
| 1276 | if (isfitch & (splen > 20)) foundFitch= true; |
|---|
| 1277 | |
|---|
| 1278 | /* kRNA && kDNA are fairly certain...*/ |
|---|
| 1279 | switch (getseqtype( sp, splen)) { |
|---|
| 1280 | case kOtherSeq: otherlines++; break; |
|---|
| 1281 | case kAmino : if (splen>20) aminolines++; break; |
|---|
| 1282 | case kDNA : |
|---|
| 1283 | case kRNA : if (splen>20) dnalines++; break; |
|---|
| 1284 | case kNucleic : break; /* not much info ? */ |
|---|
| 1285 | } |
|---|
| 1286 | |
|---|
| 1287 | } |
|---|
| 1288 | |
|---|
| 1289 | /* pretty certain */ |
|---|
| 1290 | if (gotolsen) { |
|---|
| 1291 | format= kOlsen; |
|---|
| 1292 | done= true; |
|---|
| 1293 | } |
|---|
| 1294 | else if (gotMSF) { |
|---|
| 1295 | format= kMSF; |
|---|
| 1296 | done= true; |
|---|
| 1297 | } |
|---|
| 1298 | else if (gotasn1) { |
|---|
| 1299 | /* !! we need to look further and return kASNseqentry | kASNseqset */ |
|---|
| 1300 | /* |
|---|
| 1301 | seqentry key is Seq-entry ::= |
|---|
| 1302 | seqset key is Bioseq-set ::= |
|---|
| 1303 | ?? can't read these yet w/ ncbi tools ?? |
|---|
| 1304 | Seq-submit ::= |
|---|
| 1305 | Bioseq ::= << fails both bioseq-seq and seq-entry parsers ! |
|---|
| 1306 | */ |
|---|
| 1307 | if (strstr(sp,"Bioseq-set")) format= kASNseqset; |
|---|
| 1308 | else if (strstr(sp,"Seq-entry")) format= kASNseqentry; |
|---|
| 1309 | else format= kASN1; /* other form, we can't yet read... */ |
|---|
| 1310 | done= true; |
|---|
| 1311 | } |
|---|
| 1312 | else if (gotpaup) { |
|---|
| 1313 | format= kPAUP; |
|---|
| 1314 | done= true; |
|---|
| 1315 | } |
|---|
| 1316 | |
|---|
| 1317 | else if (gotuw) { |
|---|
| 1318 | if (foundIG) format= kIG; /* a TOIG file from GCG for certain */ |
|---|
| 1319 | else format= kGCG; |
|---|
| 1320 | done= true; |
|---|
| 1321 | } |
|---|
| 1322 | |
|---|
| 1323 | else if ((dnalines > 1) || done || (nlines > maxlines2check)) { |
|---|
| 1324 | /* decide on most likely format */ |
|---|
| 1325 | /* multichar idents: */ |
|---|
| 1326 | if (foundStrider) format= kStrider; |
|---|
| 1327 | else if (foundGB) format= kGenBank; |
|---|
| 1328 | else if (foundPIR) format= kPIR; |
|---|
| 1329 | else if (foundEMBL) format= kEMBL; |
|---|
| 1330 | else if (foundNBRF) format= kNBRF; |
|---|
| 1331 | /* single char idents: */ |
|---|
| 1332 | else if (foundIG) format= kIG; |
|---|
| 1333 | else if (foundPearson) format= kPearson; |
|---|
| 1334 | else if (foundZuker) format= kZuker; |
|---|
| 1335 | /* digit ident: */ |
|---|
| 1336 | else if (foundPhylip) format= kPhylip; |
|---|
| 1337 | /* spacing ident: */ |
|---|
| 1338 | else if (foundFitch) format= kFitch; |
|---|
| 1339 | /* no format chars: */ |
|---|
| 1340 | else if (otherlines > 0) format= kUnknown; |
|---|
| 1341 | else if (dnalines > 1) format= kPlain; |
|---|
| 1342 | else if (aminolines > 1) format= kPlain; |
|---|
| 1343 | else format= kUnknown; |
|---|
| 1344 | |
|---|
| 1345 | done= true; |
|---|
| 1346 | } |
|---|
| 1347 | |
|---|
| 1348 | /* need this for possible long header in olsen format */ |
|---|
| 1349 | else if (strstr(sp,"): ") != NULL) |
|---|
| 1350 | maxlines2check++; |
|---|
| 1351 | } |
|---|
| 1352 | |
|---|
| 1353 | if (format == kPhylip) { |
|---|
| 1354 | /* check for interleaved or sequential -- really messy */ |
|---|
| 1355 | int tname, tseq; |
|---|
| 1356 | long i, j, nspp= 0, nlen= 0, ilen, leaf= 0, seq= 0; |
|---|
| 1357 | char *ps; |
|---|
| 1358 | |
|---|
| 1359 | rewind(fseq); |
|---|
| 1360 | for (i=0; i < *skiplines; i++) ReadOneLine(sp); |
|---|
| 1361 | nlines= 0; |
|---|
| 1362 | ReadOneLine(sp); |
|---|
| 1363 | sscanf( sp, "%ld%ld", &nspp, &nlen); |
|---|
| 1364 | ReadOneLine(sp); /* 1st seq line */ |
|---|
| 1365 | for (ps= sp+10, ilen=0; *ps!=0; ps++) if (isprint(*ps)) ilen++; |
|---|
| 1366 | |
|---|
| 1367 | for (i= 1; i<nspp; i++) { |
|---|
| 1368 | ReadOneLine(sp); |
|---|
| 1369 | |
|---|
| 1370 | tseq= getseqtype(sp+10, strlen(sp+10)); |
|---|
| 1371 | tname= getseqtype(sp, 10); |
|---|
| 1372 | for (j=0, ps= sp; isspace(*ps) && j<10; ps++, j++); |
|---|
| 1373 | for (ps= sp; *ps!=0; ps++) if (isprint(*ps)) ilen++; |
|---|
| 1374 | |
|---|
| 1375 | /* find probable interleaf or sequential ... */ |
|---|
| 1376 | if (j>=9) seq += 10; /* pretty certain not ileaf */ |
|---|
| 1377 | else { |
|---|
| 1378 | if (tseq != tname) leaf++; else seq++; |
|---|
| 1379 | if (tname == kDNA || tname == kRNA) seq++; else leaf++; |
|---|
| 1380 | } |
|---|
| 1381 | |
|---|
| 1382 | if (ilen <= nlen && j<9) { |
|---|
| 1383 | if (tname == kOtherSeq) leaf += 10; |
|---|
| 1384 | else if (tname == kAmino || tname == kDNA || tname == kRNA) seq++; else leaf++; |
|---|
| 1385 | } |
|---|
| 1386 | else if (ilen > nlen) { |
|---|
| 1387 | ilen= 0; |
|---|
| 1388 | } |
|---|
| 1389 | } |
|---|
| 1390 | for ( nspp *= 2 ; i<nspp; i++) { /* this should be only bases if interleaf */ |
|---|
| 1391 | ReadOneLine(sp); |
|---|
| 1392 | |
|---|
| 1393 | tseq= getseqtype(sp+10, strlen(sp+10)); |
|---|
| 1394 | tname= getseqtype(sp, 10); |
|---|
| 1395 | for (ps= sp; *ps!=0; ps++) if (isprint(*ps)) ilen++; |
|---|
| 1396 | for (j=0, ps= sp; isspace(*ps) && j<10; ps++, j++); |
|---|
| 1397 | if (j<9) { |
|---|
| 1398 | if (tname == kOtherSeq) seq += 10; |
|---|
| 1399 | if (tseq != tname) seq++; else leaf++; |
|---|
| 1400 | if (tname == kDNA || tname == kRNA) leaf++; else seq++; |
|---|
| 1401 | } |
|---|
| 1402 | if (ilen > nlen) { |
|---|
| 1403 | if (j>9) leaf += 10; /* must be a name here for sequent */ |
|---|
| 1404 | else if (tname == kOtherSeq) seq += 10; |
|---|
| 1405 | ilen= 0; |
|---|
| 1406 | } |
|---|
| 1407 | } |
|---|
| 1408 | |
|---|
| 1409 | if (leaf > seq) format= kPhylip4; |
|---|
| 1410 | else format= kPhylip2; |
|---|
| 1411 | } |
|---|
| 1412 | |
|---|
| 1413 | return(format); |
|---|
| 1414 | #undef ReadOneLine |
|---|
| 1415 | } /* SeqFileFormat */ |
|---|
| 1416 | |
|---|
| 1417 | |
|---|
| 1418 | |
|---|
| 1419 | |
|---|
| 1420 | unsigned long GCGchecksum( const char *seq, const long seqlen, unsigned long *checktotal) |
|---|
| 1421 | /* GCGchecksum */ |
|---|
| 1422 | { |
|---|
| 1423 | long i, check = 0, count = 0; |
|---|
| 1424 | |
|---|
| 1425 | for (i = 0; i < seqlen; i++) { |
|---|
| 1426 | count++; |
|---|
| 1427 | check += count * to_upper(seq[i]); |
|---|
| 1428 | if (count == 57) count = 0; |
|---|
| 1429 | } |
|---|
| 1430 | check %= 10000; |
|---|
| 1431 | *checktotal += check; |
|---|
| 1432 | *checktotal %= 10000; |
|---|
| 1433 | return check; |
|---|
| 1434 | } |
|---|
| 1435 | |
|---|
| 1436 | /* Table of CRC-32's of all single byte values (made by makecrc.c of ZIP source) */ |
|---|
| 1437 | const unsigned long crctab[] = { |
|---|
| 1438 | 0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L, |
|---|
| 1439 | 0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L, |
|---|
| 1440 | 0xe0d5e91eL, 0x97d2d988L, 0x09b64c2bL, 0x7eb17cbdL, 0xe7b82d07L, |
|---|
| 1441 | 0x90bf1d91L, 0x1db71064L, 0x6ab020f2L, 0xf3b97148L, 0x84be41deL, |
|---|
| 1442 | 0x1adad47dL, 0x6ddde4ebL, 0xf4d4b551L, 0x83d385c7L, 0x136c9856L, |
|---|
| 1443 | 0x646ba8c0L, 0xfd62f97aL, 0x8a65c9ecL, 0x14015c4fL, 0x63066cd9L, |
|---|
| 1444 | 0xfa0f3d63L, 0x8d080df5L, 0x3b6e20c8L, 0x4c69105eL, 0xd56041e4L, |
|---|
| 1445 | 0xa2677172L, 0x3c03e4d1L, 0x4b04d447L, 0xd20d85fdL, 0xa50ab56bL, |
|---|
| 1446 | 0x35b5a8faL, 0x42b2986cL, 0xdbbbc9d6L, 0xacbcf940L, 0x32d86ce3L, |
|---|
| 1447 | 0x45df5c75L, 0xdcd60dcfL, 0xabd13d59L, 0x26d930acL, 0x51de003aL, |
|---|
| 1448 | 0xc8d75180L, 0xbfd06116L, 0x21b4f4b5L, 0x56b3c423L, 0xcfba9599L, |
|---|
| 1449 | 0xb8bda50fL, 0x2802b89eL, 0x5f058808L, 0xc60cd9b2L, 0xb10be924L, |
|---|
| 1450 | 0x2f6f7c87L, 0x58684c11L, 0xc1611dabL, 0xb6662d3dL, 0x76dc4190L, |
|---|
| 1451 | 0x01db7106L, 0x98d220bcL, 0xefd5102aL, 0x71b18589L, 0x06b6b51fL, |
|---|
| 1452 | 0x9fbfe4a5L, 0xe8b8d433L, 0x7807c9a2L, 0x0f00f934L, 0x9609a88eL, |
|---|
| 1453 | 0xe10e9818L, 0x7f6a0dbbL, 0x086d3d2dL, 0x91646c97L, 0xe6635c01L, |
|---|
| 1454 | 0x6b6b51f4L, 0x1c6c6162L, 0x856530d8L, 0xf262004eL, 0x6c0695edL, |
|---|
| 1455 | 0x1b01a57bL, 0x8208f4c1L, 0xf50fc457L, 0x65b0d9c6L, 0x12b7e950L, |
|---|
| 1456 | 0x8bbeb8eaL, 0xfcb9887cL, 0x62dd1ddfL, 0x15da2d49L, 0x8cd37cf3L, |
|---|
| 1457 | 0xfbd44c65L, 0x4db26158L, 0x3ab551ceL, 0xa3bc0074L, 0xd4bb30e2L, |
|---|
| 1458 | 0x4adfa541L, 0x3dd895d7L, 0xa4d1c46dL, 0xd3d6f4fbL, 0x4369e96aL, |
|---|
| 1459 | 0x346ed9fcL, 0xad678846L, 0xda60b8d0L, 0x44042d73L, 0x33031de5L, |
|---|
| 1460 | 0xaa0a4c5fL, 0xdd0d7cc9L, 0x5005713cL, 0x270241aaL, 0xbe0b1010L, |
|---|
| 1461 | 0xc90c2086L, 0x5768b525L, 0x206f85b3L, 0xb966d409L, 0xce61e49fL, |
|---|
| 1462 | 0x5edef90eL, 0x29d9c998L, 0xb0d09822L, 0xc7d7a8b4L, 0x59b33d17L, |
|---|
| 1463 | 0x2eb40d81L, 0xb7bd5c3bL, 0xc0ba6cadL, 0xedb88320L, 0x9abfb3b6L, |
|---|
| 1464 | 0x03b6e20cL, 0x74b1d29aL, 0xead54739L, 0x9dd277afL, 0x04db2615L, |
|---|
| 1465 | 0x73dc1683L, 0xe3630b12L, 0x94643b84L, 0x0d6d6a3eL, 0x7a6a5aa8L, |
|---|
| 1466 | 0xe40ecf0bL, 0x9309ff9dL, 0x0a00ae27L, 0x7d079eb1L, 0xf00f9344L, |
|---|
| 1467 | 0x8708a3d2L, 0x1e01f268L, 0x6906c2feL, 0xf762575dL, 0x806567cbL, |
|---|
| 1468 | 0x196c3671L, 0x6e6b06e7L, 0xfed41b76L, 0x89d32be0L, 0x10da7a5aL, |
|---|
| 1469 | 0x67dd4accL, 0xf9b9df6fL, 0x8ebeeff9L, 0x17b7be43L, 0x60b08ed5L, |
|---|
| 1470 | 0xd6d6a3e8L, 0xa1d1937eL, 0x38d8c2c4L, 0x4fdff252L, 0xd1bb67f1L, |
|---|
| 1471 | 0xa6bc5767L, 0x3fb506ddL, 0x48b2364bL, 0xd80d2bdaL, 0xaf0a1b4cL, |
|---|
| 1472 | 0x36034af6L, 0x41047a60L, 0xdf60efc3L, 0xa867df55L, 0x316e8eefL, |
|---|
| 1473 | 0x4669be79L, 0xcb61b38cL, 0xbc66831aL, 0x256fd2a0L, 0x5268e236L, |
|---|
| 1474 | 0xcc0c7795L, 0xbb0b4703L, 0x220216b9L, 0x5505262fL, 0xc5ba3bbeL, |
|---|
| 1475 | 0xb2bd0b28L, 0x2bb45a92L, 0x5cb36a04L, 0xc2d7ffa7L, 0xb5d0cf31L, |
|---|
| 1476 | 0x2cd99e8bL, 0x5bdeae1dL, 0x9b64c2b0L, 0xec63f226L, 0x756aa39cL, |
|---|
| 1477 | 0x026d930aL, 0x9c0906a9L, 0xeb0e363fL, 0x72076785L, 0x05005713L, |
|---|
| 1478 | 0x95bf4a82L, 0xe2b87a14L, 0x7bb12baeL, 0x0cb61b38L, 0x92d28e9bL, |
|---|
| 1479 | 0xe5d5be0dL, 0x7cdcefb7L, 0x0bdbdf21L, 0x86d3d2d4L, 0xf1d4e242L, |
|---|
| 1480 | 0x68ddb3f8L, 0x1fda836eL, 0x81be16cdL, 0xf6b9265bL, 0x6fb077e1L, |
|---|
| 1481 | 0x18b74777L, 0x88085ae6L, 0xff0f6a70L, 0x66063bcaL, 0x11010b5cL, |
|---|
| 1482 | 0x8f659effL, 0xf862ae69L, 0x616bffd3L, 0x166ccf45L, 0xa00ae278L, |
|---|
| 1483 | 0xd70dd2eeL, 0x4e048354L, 0x3903b3c2L, 0xa7672661L, 0xd06016f7L, |
|---|
| 1484 | 0x4969474dL, 0x3e6e77dbL, 0xaed16a4aL, 0xd9d65adcL, 0x40df0b66L, |
|---|
| 1485 | 0x37d83bf0L, 0xa9bcae53L, 0xdebb9ec5L, 0x47b2cf7fL, 0x30b5ffe9L, |
|---|
| 1486 | 0xbdbdf21cL, 0xcabac28aL, 0x53b39330L, 0x24b4a3a6L, 0xbad03605L, |
|---|
| 1487 | 0xcdd70693L, 0x54de5729L, 0x23d967bfL, 0xb3667a2eL, 0xc4614ab8L, |
|---|
| 1488 | 0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL, |
|---|
| 1489 | 0x2d02ef8dL |
|---|
| 1490 | }; |
|---|
| 1491 | |
|---|
| 1492 | unsigned long CRC32checksum(const char *seq, const long seqlen, unsigned long *checktotal) |
|---|
| 1493 | /*CRC32checksum: modified from CRC-32 algorithm found in ZIP compression source */ |
|---|
| 1494 | { |
|---|
| 1495 | unsigned long c = 0xffffffffL; |
|---|
| 1496 | long n = seqlen; |
|---|
| 1497 | |
|---|
| 1498 | if (!seq) return 0; |
|---|
| 1499 | while (n--) { |
|---|
| 1500 | c = crctab[((int)c ^ (to_upper(*seq))) & 0xff] ^ (c >> 8); |
|---|
| 1501 | seq++; /* fixed aug'98 finally */ |
|---|
| 1502 | } |
|---|
| 1503 | c= c ^ 0xffffffffL; |
|---|
| 1504 | *checktotal += c; |
|---|
| 1505 | return c; |
|---|
| 1506 | } |
|---|
| 1507 | |
|---|
| 1508 | |
|---|
| 1509 | |
|---|
| 1510 | |
|---|
| 1511 | short getseqtype( const char *seq, const long seqlen) |
|---|
| 1512 | { /* return sequence kind: kDNA, kRNA, kProtein, kOtherSeq, ??? */ |
|---|
| 1513 | char c; |
|---|
| 1514 | short i, maxtest; |
|---|
| 1515 | short na = 0, aa = 0, po = 0, nt = 0, nu = 0, ns = 0, no = 0; |
|---|
| 1516 | |
|---|
| 1517 | maxtest = min(300, seqlen); |
|---|
| 1518 | for (i = 0; i < maxtest; i++) { |
|---|
| 1519 | c = to_upper(seq[i]); |
|---|
| 1520 | if (strchr(protonly, c)) po++; |
|---|
| 1521 | else if (strchr(primenuc,c)) { |
|---|
| 1522 | na++; |
|---|
| 1523 | if (c == 'T') nt++; |
|---|
| 1524 | else if (c == 'U') nu++; |
|---|
| 1525 | } |
|---|
| 1526 | else if (strchr(aminos,c)) aa++; |
|---|
| 1527 | else if (strchr(seqsymbols,c)) ns++; |
|---|
| 1528 | else if (isalpha(c)) no++; |
|---|
| 1529 | } |
|---|
| 1530 | |
|---|
| 1531 | if ((no > 0) || (po+aa+na == 0)) return kOtherSeq; |
|---|
| 1532 | /* ?? test for probability of kOtherSeq ?, e.g., |
|---|
| 1533 | else if (po+aa+na / maxtest < 0.70) return kOtherSeq; |
|---|
| 1534 | */ |
|---|
| 1535 | else if (po > 0) return kAmino; |
|---|
| 1536 | else if (aa == 0) { |
|---|
| 1537 | if (nu > nt) return kRNA; |
|---|
| 1538 | else return kDNA; |
|---|
| 1539 | } |
|---|
| 1540 | else if (na > aa) return kNucleic; |
|---|
| 1541 | else return kAmino; |
|---|
| 1542 | } /* getseqtype */ |
|---|
| 1543 | |
|---|
| 1544 | |
|---|
| 1545 | char* compressSeq( const char gapc, const char *seq, const long seqlen, long *newlen) |
|---|
| 1546 | { |
|---|
| 1547 | char *a, *b; |
|---|
| 1548 | long i; |
|---|
| 1549 | char *newseq; |
|---|
| 1550 | |
|---|
| 1551 | *newlen= 0; |
|---|
| 1552 | if (!seq) return NULL; |
|---|
| 1553 | newseq = (char*) malloc(seqlen+1); |
|---|
| 1554 | if (!newseq) return NULL; |
|---|
| 1555 | for (a= (char*)seq, b=newseq, i=0; *a!=0; a++) |
|---|
| 1556 | if (*a != gapc) { |
|---|
| 1557 | *b++= *a; |
|---|
| 1558 | i++; |
|---|
| 1559 | } |
|---|
| 1560 | *b= '\0'; |
|---|
| 1561 | newseq = (char*) realloc(newseq, i+1); |
|---|
| 1562 | *newlen= i; |
|---|
| 1563 | return newseq; |
|---|
| 1564 | } |
|---|
| 1565 | |
|---|
| 1566 | |
|---|
| 1567 | |
|---|
| 1568 | /*** |
|---|
| 1569 | char *rtfhead = "{\\rtf1\\defformat\\mac\\deff2 \ |
|---|
| 1570 | {\\fonttbl\ |
|---|
| 1571 | {\\f1\\fmodern Courier;}{\\f2\\fmodern Monaco;}\ |
|---|
| 1572 | {\\f3\\fswiss Helvetica;}{\\f4\\fswiss Geneva;}\ |
|---|
| 1573 | {\\f5\\froman Times;}{\\f6\\froman Palatino;}\ |
|---|
| 1574 | {\\f7\\froman New Century Schlbk;}{\\f8\\ftech Symbol;}}\ |
|---|
| 1575 | {\\stylesheet\ |
|---|
| 1576 | {\\s1 \\f5\\fs20 \\sbasedon0\\snext1 name;}\ |
|---|
| 1577 | {\\s2 \\f3\\fs20 \\sbasedon0\\snext2 num;}\ |
|---|
| 1578 | {\\s3 \\f1\\f21 \\sbasedon0\\snext3 seq;}}"; |
|---|
| 1579 | |
|---|
| 1580 | char *rtftail = "}"; |
|---|
| 1581 | ****/ |
|---|
| 1582 | |
|---|
| 1583 | short writeSeq(FILE *outf, const char *seq, const long seqlen, |
|---|
| 1584 | const short outform, const char *seqid) |
|---|
| 1585 | /* dump sequence to standard output */ |
|---|
| 1586 | { |
|---|
| 1587 | const short kSpaceAll = -9; |
|---|
| 1588 | #define kMaxseqwidth 250 |
|---|
| 1589 | |
|---|
| 1590 | boolean baseonlynum= false; /* nocountsymbols -- only count true bases, not "-" */ |
|---|
| 1591 | short numline = 0; /* only true if we are writing seq number line (for interleave) */ |
|---|
| 1592 | boolean numright = false, numleft = false; |
|---|
| 1593 | boolean nameright = false, nameleft = false; |
|---|
| 1594 | short namewidth = 8, numwidth = 8; |
|---|
| 1595 | short spacer = 0, width = 50, tab = 0; |
|---|
| 1596 | /* new parameters: width, spacer, those above... */ |
|---|
| 1597 | |
|---|
| 1598 | short linesout = 0, seqtype = kNucleic; |
|---|
| 1599 | long i, j, l, l1, ibase; |
|---|
| 1600 | char idword[31], endstr[14]; |
|---|
| 1601 | char seqnamestore[128], *seqname = seqnamestore; |
|---|
| 1602 | char s[kMaxseqwidth], *cp; |
|---|
| 1603 | char nameform[10], numform[10], nocountsymbols[10]; |
|---|
| 1604 | unsigned long checksum = 0, checktotal = 0; |
|---|
| 1605 | |
|---|
| 1606 | gPretty.atseq++; |
|---|
| 1607 | skipwhitespace(seqid); |
|---|
| 1608 | l = min(128, strlen(seqid)); |
|---|
| 1609 | strncpy( seqnamestore, seqid, l); |
|---|
| 1610 | seqname[l] = 0; |
|---|
| 1611 | |
|---|
| 1612 | sscanf( seqname, "%30s", idword); |
|---|
| 1613 | sprintf(numform, "%ld", seqlen); |
|---|
| 1614 | numwidth= strlen(numform)+1; |
|---|
| 1615 | nameform[0]= '\0'; |
|---|
| 1616 | |
|---|
| 1617 | if (strstr(seqname,"checksum") != NULL) { |
|---|
| 1618 | cp = strstr(seqname,"bases"); |
|---|
| 1619 | if (cp!=NULL) { |
|---|
| 1620 | for ( ; (cp!=seqname) && (*cp!=','); cp--) ; |
|---|
| 1621 | if (cp!=seqname) *cp=0; |
|---|
| 1622 | } |
|---|
| 1623 | } |
|---|
| 1624 | |
|---|
| 1625 | strcpy( endstr,""); |
|---|
| 1626 | l1 = 0; |
|---|
| 1627 | |
|---|
| 1628 | if (outform == kGCG || outform == kMSF) |
|---|
| 1629 | checksum = GCGchecksum(seq, seqlen, &checktotal); |
|---|
| 1630 | else |
|---|
| 1631 | checksum = seqchecksum(seq, seqlen, &checktotal); |
|---|
| 1632 | |
|---|
| 1633 | switch (outform) { |
|---|
| 1634 | |
|---|
| 1635 | case kPlain: |
|---|
| 1636 | case kUnknown: /* no header, just sequence */ |
|---|
| 1637 | strcpy(endstr,"\n"); /* end w/ extra blank line */ |
|---|
| 1638 | break; |
|---|
| 1639 | |
|---|
| 1640 | case kOlsen: /* Olsen seq. editor takes plain nucs OR Genbank */ |
|---|
| 1641 | case kGenBank: |
|---|
| 1642 | fprintf(outf,"LOCUS %s %ld bp\n", idword, seqlen); |
|---|
| 1643 | fprintf(outf,"DEFINITION %s, %ld bases, %lX checksum.\n", seqname, seqlen, checksum); |
|---|
| 1644 | /* fprintf(outf,"ACCESSION %s\n", accnum); */ |
|---|
| 1645 | fprintf(outf,"ORIGIN \n"); |
|---|
| 1646 | spacer = 11; |
|---|
| 1647 | numleft = true; |
|---|
| 1648 | numwidth = 8; /* dgg. 1Feb93, patch for GDE fail to read short numwidth */ |
|---|
| 1649 | strcpy(endstr, "\n//"); |
|---|
| 1650 | linesout += 4; |
|---|
| 1651 | break; |
|---|
| 1652 | |
|---|
| 1653 | case kPIR: |
|---|
| 1654 | /* somewhat like genbank... \\\*/ |
|---|
| 1655 | /* fprintf(outf,"\\\\\\\n"); << only at top of file, not each entry... */ |
|---|
| 1656 | fprintf(outf,"ENTRY %s \n", idword); |
|---|
| 1657 | fprintf(outf,"TITLE %s, %ld bases, %lX checksum.\n", seqname, seqlen, checksum); |
|---|
| 1658 | /* fprintf(outf,"ACCESSION %s\n", accnum); */ |
|---|
| 1659 | fprintf(outf,"SEQUENCE \n"); |
|---|
| 1660 | numwidth = 7; |
|---|
| 1661 | width= 30; |
|---|
| 1662 | spacer = kSpaceAll; |
|---|
| 1663 | numleft = true; |
|---|
| 1664 | strcpy(endstr, "\n///"); |
|---|
| 1665 | /* run a top number line for PIR */ |
|---|
| 1666 | for (j=0; j<numwidth; j++) fputc(' ',outf); |
|---|
| 1667 | for (j= 5; j<=width; j += 5) fprintf(outf,"%10ld",j); |
|---|
| 1668 | fputc('\n',outf); |
|---|
| 1669 | linesout += 5; |
|---|
| 1670 | break; |
|---|
| 1671 | |
|---|
| 1672 | case kNBRF: |
|---|
| 1673 | if (getseqtype(seq, seqlen) == kAmino) |
|---|
| 1674 | fprintf(outf,">P1;%s\n", idword); |
|---|
| 1675 | else |
|---|
| 1676 | fprintf(outf,">DL;%s\n", idword); |
|---|
| 1677 | fprintf(outf,"%s, %ld bases, %lX checksum.\n", seqname, seqlen, checksum); |
|---|
| 1678 | spacer = 11; |
|---|
| 1679 | strcpy(endstr,"*\n"); |
|---|
| 1680 | linesout += 3; |
|---|
| 1681 | break; |
|---|
| 1682 | |
|---|
| 1683 | case kEMBL: |
|---|
| 1684 | fprintf(outf,"ID %s\n", idword); |
|---|
| 1685 | /* fprintf(outf,"AC %s\n", accnum); */ |
|---|
| 1686 | fprintf(outf,"DE %s, %ld bases, %lX checksum.\n", seqname, seqlen, checksum); |
|---|
| 1687 | fprintf(outf,"SQ %ld BP\n", seqlen); |
|---|
| 1688 | strcpy(endstr, "\n//"); /* 11Oct90: bug fix*/ |
|---|
| 1689 | tab = 4; /** added 31jan91 */ |
|---|
| 1690 | spacer = 11; /** added 31jan91 */ |
|---|
| 1691 | width = 60; |
|---|
| 1692 | linesout += 4; |
|---|
| 1693 | break; |
|---|
| 1694 | |
|---|
| 1695 | case kGCG: |
|---|
| 1696 | fprintf(outf,"%s\n", seqname); |
|---|
| 1697 | /* fprintf(outf,"ACCESSION %s\n", accnum); */ |
|---|
| 1698 | fprintf(outf," %s Length: %ld (today) Check: %ld ..\n", idword, seqlen, checksum); |
|---|
| 1699 | spacer = 11; |
|---|
| 1700 | numleft = true; |
|---|
| 1701 | strcpy(endstr, "\n"); /* this is insurance to help prevent misreads at eof */ |
|---|
| 1702 | linesout += 3; |
|---|
| 1703 | break; |
|---|
| 1704 | |
|---|
| 1705 | case kStrider: /* ?? map ?*/ |
|---|
| 1706 | fprintf(outf,"; ### from DNA Strider ;-)\n"); |
|---|
| 1707 | fprintf(outf,"; DNA sequence %s, %ld bases, %lX checksum.\n;\n", seqname, seqlen, checksum); |
|---|
| 1708 | strcpy(endstr, "\n//"); |
|---|
| 1709 | linesout += 3; |
|---|
| 1710 | break; |
|---|
| 1711 | |
|---|
| 1712 | case kFitch: |
|---|
| 1713 | fprintf(outf,"%s, %ld bases, %lX checksum.\n", seqname, seqlen, checksum); |
|---|
| 1714 | spacer = 4; |
|---|
| 1715 | width = 60; |
|---|
| 1716 | linesout += 1; |
|---|
| 1717 | break; |
|---|
| 1718 | |
|---|
| 1719 | case kPhylip2: |
|---|
| 1720 | case kPhylip4: |
|---|
| 1721 | /* this is version 3.2/3.4 -- simplest way to write |
|---|
| 1722 | version 3.3 is to write as version 3.2, then |
|---|
| 1723 | re-read file and interleave the species lines */ |
|---|
| 1724 | if (strlen(idword)>10) idword[10] = 0; |
|---|
| 1725 | fprintf(outf,"%-10s ",idword); |
|---|
| 1726 | l1 = -1; |
|---|
| 1727 | tab = 12; |
|---|
| 1728 | spacer = 11; |
|---|
| 1729 | break; |
|---|
| 1730 | |
|---|
| 1731 | case kASN1: |
|---|
| 1732 | seqtype= getseqtype(seq, seqlen); |
|---|
| 1733 | switch (seqtype) { |
|---|
| 1734 | case kDNA : cp= (char*)"dna"; break; |
|---|
| 1735 | case kRNA : cp= (char*)"rna"; break; |
|---|
| 1736 | case kNucleic : cp= (char*)"na"; break; |
|---|
| 1737 | case kAmino : cp= (char*)"aa"; break; |
|---|
| 1738 | case kOtherSeq: cp= (char*)"not-set"; break; |
|---|
| 1739 | } |
|---|
| 1740 | fprintf(outf," seq {\n"); |
|---|
| 1741 | fprintf(outf," id { local id %d },\n", gPretty.atseq); |
|---|
| 1742 | fprintf(outf," descr { title \"%s\" },\n", seqid); |
|---|
| 1743 | fprintf(outf," inst {\n"); |
|---|
| 1744 | fprintf(outf," repr raw, mol %s, length %ld, topology linear,\n", cp, seqlen); |
|---|
| 1745 | fprintf(outf," seq-data\n"); |
|---|
| 1746 | if (seqtype == kAmino) |
|---|
| 1747 | fprintf(outf," iupacaa \""); |
|---|
| 1748 | else |
|---|
| 1749 | fprintf(outf," iupacna \""); |
|---|
| 1750 | l1 = 17; |
|---|
| 1751 | spacer = 0; |
|---|
| 1752 | width = 78; |
|---|
| 1753 | tab = 0; |
|---|
| 1754 | strcpy(endstr,"\"\n } } ,"); |
|---|
| 1755 | linesout += 7; |
|---|
| 1756 | break; |
|---|
| 1757 | |
|---|
| 1758 | case kPAUP: |
|---|
| 1759 | nameleft= true; |
|---|
| 1760 | namewidth = 9; |
|---|
| 1761 | spacer = 21; |
|---|
| 1762 | width = 100; |
|---|
| 1763 | tab = 0; /* 1; */ |
|---|
| 1764 | /* strcpy(endstr,";\nend;"); << this is end of all seqs.. */ |
|---|
| 1765 | /* do a header comment line for paup */ |
|---|
| 1766 | fprintf(outf,"[Name: %-16s Len:%6ld Check: %8lX]\n", idword, seqlen, checksum); |
|---|
| 1767 | linesout += 1; |
|---|
| 1768 | break; |
|---|
| 1769 | |
|---|
| 1770 | case kPretty: |
|---|
| 1771 | numline= gPretty.numline; |
|---|
| 1772 | baseonlynum= gPretty.baseonlynum; |
|---|
| 1773 | namewidth = gPretty.namewidth; |
|---|
| 1774 | numright = gPretty.numright; |
|---|
| 1775 | numleft = gPretty.numleft; |
|---|
| 1776 | nameright = gPretty.nameright; |
|---|
| 1777 | nameleft = gPretty.nameleft; |
|---|
| 1778 | spacer = gPretty.spacer + 1; |
|---|
| 1779 | width = gPretty.seqwidth; |
|---|
| 1780 | tab = gPretty.tab; |
|---|
| 1781 | /* also add rtf formatting w/ font, size, style */ |
|---|
| 1782 | if (gPretty.nametop) { |
|---|
| 1783 | fprintf(outf,"Name: %-16s Len:%6ld Check: %8lX\n", idword, seqlen, checksum); |
|---|
| 1784 | linesout++; |
|---|
| 1785 | } |
|---|
| 1786 | break; |
|---|
| 1787 | |
|---|
| 1788 | case kMSF: |
|---|
| 1789 | fprintf(outf," Name: %-16s Len:%6ld Check: %5ld Weight: 1.00\n", |
|---|
| 1790 | idword, seqlen, checksum); |
|---|
| 1791 | linesout++; |
|---|
| 1792 | nameleft= true; |
|---|
| 1793 | namewidth= 15; /* need MAX namewidth here... */ |
|---|
| 1794 | sprintf(nameform, "%%+%ds ",namewidth); |
|---|
| 1795 | spacer = 11; |
|---|
| 1796 | width = 50; |
|---|
| 1797 | tab = 0; /* 1; */ |
|---|
| 1798 | break; |
|---|
| 1799 | |
|---|
| 1800 | case kIG: |
|---|
| 1801 | fprintf(outf,";%s, %ld bases, %lX checksum.\n", seqname, seqlen, checksum); |
|---|
| 1802 | fprintf(outf,"%s\n", idword); |
|---|
| 1803 | strcpy(endstr,"1"); /* == linear dna */ |
|---|
| 1804 | linesout += 2; |
|---|
| 1805 | break; |
|---|
| 1806 | |
|---|
| 1807 | default : |
|---|
| 1808 | case kZuker: /* don't attempt Zuker's ftn format */ |
|---|
| 1809 | case kPearson: |
|---|
| 1810 | fprintf(outf,">%s, %ld bases, %lX checksum.\n", seqname, seqlen, checksum); |
|---|
| 1811 | linesout += 1; |
|---|
| 1812 | break; |
|---|
| 1813 | } |
|---|
| 1814 | |
|---|
| 1815 | if (*nameform==0) sprintf(nameform, "%%%d.%ds ",namewidth,namewidth); |
|---|
| 1816 | if (numline) sprintf(numform, "%%%ds ",numwidth); |
|---|
| 1817 | else sprintf(numform, "%%%dd ",numwidth); |
|---|
| 1818 | strcpy( nocountsymbols, kNocountsymbols); |
|---|
| 1819 | if (baseonlynum) { |
|---|
| 1820 | if (strchr(nocountsymbols,gPretty.gapchar)==NULL) { |
|---|
| 1821 | strcat(nocountsymbols," "); |
|---|
| 1822 | nocountsymbols[strlen(nocountsymbols)-1]= gPretty.gapchar; |
|---|
| 1823 | } |
|---|
| 1824 | if (gPretty.domatch && (cp=strchr(nocountsymbols,gPretty.matchchar))!=NULL) { |
|---|
| 1825 | *cp= ' '; |
|---|
| 1826 | } |
|---|
| 1827 | } |
|---|
| 1828 | |
|---|
| 1829 | if (numline) { |
|---|
| 1830 | *idword= 0; |
|---|
| 1831 | } |
|---|
| 1832 | |
|---|
| 1833 | width = min(width,kMaxseqwidth); |
|---|
| 1834 | for (i=0, l=0, ibase = 1; i < seqlen; ) { |
|---|
| 1835 | |
|---|
| 1836 | if (l1 < 0) l1 = 0; |
|---|
| 1837 | else if (l1 == 0) { |
|---|
| 1838 | if (nameleft) fprintf(outf, nameform, idword); |
|---|
| 1839 | if (numleft) { if (numline) fprintf(outf, numform, ""); |
|---|
| 1840 | else fprintf(outf, numform, ibase);} |
|---|
| 1841 | for (j=0; j<tab; j++) fputc(' ',outf); |
|---|
| 1842 | } |
|---|
| 1843 | |
|---|
| 1844 | l1++; /* don't count spaces for width*/ |
|---|
| 1845 | if (numline) { |
|---|
| 1846 | if (spacer==kSpaceAll || (spacer != 0 && (l+1) % spacer == 1)) { |
|---|
| 1847 | if (numline==1) fputc(' ',outf); |
|---|
| 1848 | s[l++] = ' '; |
|---|
| 1849 | } |
|---|
| 1850 | if (l1 % 10 == 1 || l1 == width) { |
|---|
| 1851 | if (numline==1) fprintf(outf,"%-9ld ",i+1); |
|---|
| 1852 | s[l++]= '|'; /* == put a number here */ |
|---|
| 1853 | } |
|---|
| 1854 | else s[l++]= ' '; |
|---|
| 1855 | i++; |
|---|
| 1856 | } |
|---|
| 1857 | |
|---|
| 1858 | else { |
|---|
| 1859 | if (spacer==kSpaceAll || (spacer != 0 && (l+1) % spacer == 1)) |
|---|
| 1860 | s[l++] = ' '; |
|---|
| 1861 | if (!baseonlynum) ibase++; |
|---|
| 1862 | else if (0==strchr(nocountsymbols,seq[i])) ibase++; |
|---|
| 1863 | s[l++] = seq[i++]; |
|---|
| 1864 | } |
|---|
| 1865 | |
|---|
| 1866 | if (l1 == width || i == seqlen) { |
|---|
| 1867 | if (outform==kPretty) for ( ; l1<width; l1++) { |
|---|
| 1868 | if (spacer==kSpaceAll || (spacer != 0 && (l+1) % spacer == 1)) |
|---|
| 1869 | s[l++] = ' '; |
|---|
| 1870 | s[l++]=' '; /* pad w/ blanks */ |
|---|
| 1871 | } |
|---|
| 1872 | s[l] = '\0'; |
|---|
| 1873 | l = 0; l1 = 0; |
|---|
| 1874 | |
|---|
| 1875 | if (numline) { |
|---|
| 1876 | if (numline==2) fprintf(outf,"%s",s); /* finish numberline ! and | */ |
|---|
| 1877 | } |
|---|
| 1878 | else { |
|---|
| 1879 | if (i == seqlen) fprintf(outf,"%s%s",s,endstr); |
|---|
| 1880 | else fprintf(outf,"%s",s); |
|---|
| 1881 | if (numright || nameright) fputc(' ',outf); |
|---|
| 1882 | if (numright) fprintf(outf,numform, ibase-1); |
|---|
| 1883 | if (nameright) fprintf(outf, nameform,idword); |
|---|
| 1884 | } |
|---|
| 1885 | fputc('\n',outf); |
|---|
| 1886 | linesout++; |
|---|
| 1887 | } |
|---|
| 1888 | } |
|---|
| 1889 | return linesout; |
|---|
| 1890 | } /*writeSeq*/ |
|---|
| 1891 | |
|---|
| 1892 | |
|---|
| 1893 | |
|---|
| 1894 | /* End file: ureadseq.c */ |
|---|