| 1 | #include "muscle.h" |
|---|
| 2 | #include <stdio.h> |
|---|
| 3 | #include <errno.h> |
|---|
| 4 | |
|---|
// Line-terminator characters recognised while parsing.
const int NL = '\n';
const int CR = '\r';

// Number of bytes by which a read buffer grows each time it fills up.
// (An alternative value of 16*1024 was left commented out here.)
const int BUFFER_BYTES = 128;
|---|
| 9 | |
|---|
// Append one character to a growable character buffer.
//
// The macro operates on three variables that must be in scope at the point
// of use: Buffer (char *, may be null while nothing has been allocated),
// BufferLength (number of bytes currently allocated) and Pos (index of the
// next byte to write). When the buffer is full it is reallocated with
// BUFFER_BYTES additional bytes and the old contents are copied across.
//
// The expansion is a brace-enclosed block, so it is invoked without a
// trailing semicolon, e.g. "ADD(c)".
#define ADD(c) \
    { \
    if (Pos >= BufferLength) \
        { \
        const int NewBufferLength = BufferLength + BUFFER_BYTES; \
        char *NewBuffer = new char[NewBufferLength]; \
        /* Buffer is null before the first allocation; memcpy must */ \
        /* never be handed a null pointer, even for a zero length. */ \
        if (0 != Buffer) \
            memcpy(NewBuffer, Buffer, BufferLength); \
        delete[] Buffer; \
        Buffer = NewBuffer; \
        BufferLength = NewBufferLength; \
        } \
    Buffer[Pos++] = (char) (c); \
    }
|---|
| 23 | |
|---|
| 24 | // Get next sequence from file. |
|---|
| 25 | char *GetFastaSeq(FILE *f, unsigned *ptrSeqLength, char **ptrLabel, bool DeleteGaps) |
|---|
| 26 | { |
|---|
| 27 | unsigned BufferLength = 0; |
|---|
| 28 | unsigned Pos = 0; |
|---|
| 29 | char *Buffer = 0; |
|---|
| 30 | |
|---|
| 31 | int c = fgetc(f); |
|---|
| 32 | if (EOF == c) |
|---|
| 33 | return 0; |
|---|
| 34 | if ('>' != c) |
|---|
| 35 | Quit("Invalid file format, expected '>' to start FASTA label"); |
|---|
| 36 | |
|---|
| 37 | for (;;) |
|---|
| 38 | { |
|---|
| 39 | int c = fgetc(f); |
|---|
| 40 | if (EOF == c) |
|---|
| 41 | Quit("End-of-file or input error in FASTA label"); |
|---|
| 42 | |
|---|
| 43 | // NL or CR terminates label |
|---|
| 44 | if (NL == c || CR == c) |
|---|
| 45 | break; |
|---|
| 46 | |
|---|
| 47 | // All other characters added to label |
|---|
| 48 | ADD(c) |
|---|
| 49 | } |
|---|
| 50 | |
|---|
| 51 | // Nul-terminate label |
|---|
| 52 | ADD(0) |
|---|
| 53 | *ptrLabel = Buffer; |
|---|
| 54 | |
|---|
| 55 | BufferLength = 0; |
|---|
| 56 | Pos = 0; |
|---|
| 57 | Buffer = 0; |
|---|
| 58 | int PreviousChar = NL; |
|---|
| 59 | for (;;) |
|---|
| 60 | { |
|---|
| 61 | int c = fgetc(f); |
|---|
| 62 | if (EOF == c) |
|---|
| 63 | { |
|---|
| 64 | if (feof(f)) |
|---|
| 65 | break; |
|---|
| 66 | else if (ferror(f)) |
|---|
| 67 | Quit("Error reading FASTA file, ferror=TRUE feof=FALSE errno=%d %s", |
|---|
| 68 | errno, strerror(errno)); |
|---|
| 69 | else |
|---|
| 70 | Quit("Error reading FASTA file, fgetc=EOF feof=FALSE ferror=FALSE errno=%d %s", |
|---|
| 71 | errno, strerror(errno)); |
|---|
| 72 | } |
|---|
| 73 | |
|---|
| 74 | if ('>' == c) |
|---|
| 75 | { |
|---|
| 76 | if (NL == PreviousChar || CR == PreviousChar) |
|---|
| 77 | { |
|---|
| 78 | ungetc(c, f); |
|---|
| 79 | break; |
|---|
| 80 | } |
|---|
| 81 | else |
|---|
| 82 | Quit("Unexpected '>' in FASTA sequence data"); |
|---|
| 83 | } |
|---|
| 84 | else if (isspace(c)) |
|---|
| 85 | ; |
|---|
| 86 | else if (IsGapChar(c)) |
|---|
| 87 | { |
|---|
| 88 | if (!DeleteGaps) |
|---|
| 89 | ADD(c) |
|---|
| 90 | } |
|---|
| 91 | else if (isalpha(c)) |
|---|
| 92 | { |
|---|
| 93 | c = toupper(c); |
|---|
| 94 | ADD(c) |
|---|
| 95 | } |
|---|
| 96 | else if (isprint(c)) |
|---|
| 97 | { |
|---|
| 98 | Warning("Invalid character '%c' in FASTA sequence data, ignored", c); |
|---|
| 99 | continue; |
|---|
| 100 | } |
|---|
| 101 | else |
|---|
| 102 | { |
|---|
| 103 | Warning("Invalid byte hex %02x in FASTA sequence data, ignored", (unsigned char) c); |
|---|
| 104 | continue; |
|---|
| 105 | } |
|---|
| 106 | PreviousChar = c; |
|---|
| 107 | } |
|---|
| 108 | |
|---|
| 109 | if (0 == Pos) |
|---|
| 110 | return GetFastaSeq(f, ptrSeqLength, ptrLabel, DeleteGaps); |
|---|
| 111 | |
|---|
| 112 | *ptrSeqLength = Pos; |
|---|
| 113 | return Buffer; |
|---|
| 114 | } |
|---|