| 1 | [file has been slightly modified to fit into arb help] | 
|---|
| 2 |  | 
|---|
| 3 | ||||||||||| ReadSeq supported formats   (revised 30Dec92) | 
|---|
| 4 | -------------------------------------------------------- | 
|---|
| 5 |  | 
|---|
| 6 | -f[ormat=]Name Format name for output: | 
|---|
| 7 | |  1. IG/Stanford           10. Olsen (in-only) | 
|---|
| 8 | |  2. GenBank/GB            11. Phylip3.2 | 
|---|
| 9 | |  3. NBRF                  12. Phylip | 
|---|
| 10 | |  4. EMBL                  13. Plain/Raw | 
|---|
| 11 | |  5. GCG                   14. PIR/CODATA | 
|---|
| 12 | |  6. DNAStrider            15. MSF | 
|---|
| 13 | |  7. Fitch                 16. ASN.1 | 
|---|
| 14 | |  8. Pearson/Fasta         17. PAUP | 
|---|
| 15 | |  9. Zuker (in-only)       18. Pretty (out-only) | 
|---|
| 16 |  | 
|---|
| 17 | In general, output supports only minimal subsets of each format | 
|---|
| 18 | needed for sequence data exchanges.  Features, descriptions | 
|---|
| 19 | and other format-unique information are discarded. | 
|---|
| 20 |  | 
|---|
| 21 | Users of Olsen multi sequence editor (VMS).  The Olsen format | 
|---|
| 22 | here is produced with the print command: | 
|---|
| 23 | print/out=some.file | 
|---|
| 24 | Use Genbank output from readseq to produce a format that this | 
|---|
| 25 | editor can read, and use the command | 
|---|
| 26 | load/genbank some.file | 
|---|
| 27 | Dan Davison has a VMS program that will convert to/from the | 
|---|
| 28 | Olsen native binary data format.  E-mail davison@uh.edu | 
|---|
| 29 |  | 
|---|
| 30 | Warning: Phylip format input is now supported (30Dec92), however the | 
|---|
| 31 | auto-detection of Phylip format is very probabilistic and messy, | 
|---|
| 32 | especially distinguishing sequential from interleaved versions. It | 
|---|
| 33 | is not recommended that one use readseq to convert files from Phylip | 
|---|
| 34 | format to others unless essential. | 
|---|
| 35 |  | 
|---|
| 36 |  | 
|---|
| 37 |  | 
|---|
| 38 | ||||||||||| ReadSeq usage             (revised 11Nov91) | 
|---|
| 39 | -------------------------------------------------------- | 
|---|
| 40 |  | 
|---|
| 41 | A. determine file format: | 
|---|
| 42 |  | 
|---|
| 43 | short skiplines;  /* result: number of header lines to skip (or 0) */ | 
|---|
| 44 | short error;      /* error result or 0 */ | 
|---|
| 45 | short format;     /* resulting format code, see ureadseq.h */ | 
|---|
| 46 | char  *filename   = "Mysequence.file"; | 
|---|
| 47 |  | 
|---|
| 48 | format = seqFileFormat( filename, &skiplines, &error); | 
|---|
| 49 | if (error!=0) fail; | 
|---|
| 50 |  | 
|---|
| 51 | B. read number and list of sequences (optional) | 
|---|
| 52 |  | 
|---|
| 53 | short numseqs;    /* resulting number of sequences found in file */ | 
|---|
| 54 | char  *seqlist;   /* list of sequence names, newline separated, 0 terminated */ | 
|---|
| 55 |  | 
|---|
| 56 | seqlist = listSeqs( filename, skiplines, format, &numseqs, &error); | 
|---|
| 57 | if (error==0)  display (seqlist); | 
|---|
| 58 | free( seqlist); | 
|---|
| 59 |  | 
|---|
| 60 | C.  read individual sequences as desired | 
|---|
| 61 |  | 
|---|
| 62 | short seqIndex;   /* sequence index #, or == kListSeqs for listSeqs equivalent */ | 
|---|
| 63 | long  seqlen;     /* length of seq */ | 
|---|
| 64 | char  seqid[256]; /* sequence name */ | 
|---|
| 65 | char  *seq;       /* sequence, 0 terminated, free when done */ | 
|---|
| 66 |  | 
|---|
| 67 | seq = readSeq( seqIndex, filename, skiplines, format, | 
|---|
| 68 | &seqlen, &numseqs, &error, seqid); | 
|---|
| 69 | if (error==0) manipulate(seq); | 
|---|
| 70 | free(seq); | 
|---|
| 71 |  | 
|---|
| 72 | D. write sequences as desired | 
|---|
| 73 |  | 
|---|
| 74 | int nlines;     /* number of lines of sequence written */ | 
|---|
| 75 | FILE* fout;     /* open file pointer (stdout or other) */ | 
|---|
| 76 | short outform;  /* output format, see ureadseq.h */ | 
|---|
| 77 |  | 
|---|
| 78 | nlines = writeSeq( fout, seq, seqlen, format, outform, seqid); | 
|---|
| 79 |  | 
|---|
| 80 |  | 
|---|
| 81 | Note (30Dec92): There is various processing done by the main program (in readseq.c), | 
|---|
| 82 | rather than just in the subroutines (in ureadseq.c).  Especially for interleaved | 
|---|
| 83 | output formats, the writeSeq subroutine does not handle interleaving, nor some of | 
|---|
| 84 | the formatting at the top and end of output files.  While seqFileFormat, listSeqs, | 
|---|
| 85 | and readSeq subroutines are fairly self-contained, the writeSeq depends a lot on | 
|---|
| 86 | auxiliary processing.  At some point, this may be revised so writeSeq is | 
|---|
| 87 | self-contained. | 
|---|
| 88 |  | 
|---|
| 89 | Note 2: The NCBI toolkit (ftp from ncbi.nlm.nih.gov) is needed for the ASN.1 format | 
|---|
| 90 | reading (see ureadasn.c).  A bastard (but workable I hope) ASN.1 format is written | 
|---|
| 91 | by writeSeq alone. | 
|---|
| 92 |  | 
|---|
| 93 |  | 
|---|
| 94 |  | 
|---|
| 95 | |||||||||||  sequence formats.... | 
|---|
| 96 | --------------------------------------------------- | 
|---|
| 97 |  | 
|---|
| 98 | stanford/IG | 
|---|
| 99 | ;comments | 
|---|
| 100 | ;... | 
|---|
| 101 | seq1 info | 
|---|
| 102 | abcd... | 
|---|
| 103 | efgh1 (or 2 = terminator) | 
|---|
| 104 | ;another seq | 
|---|
| 105 | ;.... | 
|---|
| 106 | seq2 info | 
|---|
| 107 | abcd...1 | 
|---|
| 108 | --- for e.g. ---- | 
|---|
| 109 | ;     Dro5s-T.Seq  Length: 120  April 6, 1989  21:22  Check: 9487  .. | 
|---|
| 110 | dro5stseq | 
|---|
| 111 | GCCAACGACCAUACCACGCUGAAUACAUCGGUUCUCGUCCGAUCACCGAAAUUAAGCAGCGUCGCGGGCG | 
|---|
| 112 | GUUAGUACUUAGAUGGGGGACCGCUUGGGAACACCGCGUGUUGUUGGCCU1 | 
|---|
| 113 |  | 
|---|
| 114 | ;  TOIG of: Dro5srna.Seq  check: 9487  from: 1  to: 120 | 
|---|
| 115 | --------------------------------------------------- | 
|---|
| 116 |  | 
|---|
| 117 | Genbank: | 
|---|
| 118 | LOCUS    seq1 ID.. | 
|---|
| 119 | ... | 
|---|
| 120 | ORIGIN ... | 
|---|
| 121 | 123456789abcdefg....(1st 9 columns are formatting) | 
|---|
| 122 | hijkl... | 
|---|
| 123 | //         (end of sequence) | 
|---|
| 124 | LOCUS     seq2 ID .. | 
|---|
| 125 | ... | 
|---|
| 126 | ORIGIN | 
|---|
| 127 | abcd... | 
|---|
| 128 | // | 
|---|
| 129 | --------------------------------------------------- | 
|---|
| 130 |  | 
|---|
| 131 | NBRF format: (from uwgcg ToNBRF) | 
|---|
| 132 | >DL;DRO5SRNA | 
|---|
| 133 | Iubio$Dua0:[Gilbertd.Gcg]Dro5srna.Seq;2 => DRO5SRNA | 
|---|
| 134 |  | 
|---|
| 135 | 51  AAUUAAGCAG CGUCGCGGGC GGUUAGUACU UAGAUGGGGG ACCGCUUGGG | 
|---|
| 136 | 101  AACACCGCGU GUUGUUGGCC U | 
|---|
| 137 |  | 
|---|
| 138 | --------------------------------------------------- | 
|---|
| 139 |  | 
|---|
| 140 | EMBL format | 
|---|
| 141 | ID345 seq1 id   (the 345 are spaces) | 
|---|
| 142 | ... other info | 
|---|
| 143 | SQ345Sequence   (the 3,4,5 are spaces) | 
|---|
| 144 | abcd... | 
|---|
| 145 | hijk... | 
|---|
| 146 | //              (! this is proper end string: 12Oct90) | 
|---|
| 147 | ID    seq2 id | 
|---|
| 148 | ... | 
|---|
| 149 | SQ   Sequence | 
|---|
| 150 | abcd... | 
|---|
| 151 | ... | 
|---|
| 152 | // | 
|---|
| 153 | --------------------------------------------------- | 
|---|
| 154 |  | 
|---|
| 155 | UW GCG Format: | 
|---|
| 156 | comments of any form, up to ".." signal | 
|---|
| 157 | signal line has seq id, and " Check: ####   .." | 
|---|
| 158 | only 1 seq/file | 
|---|
| 159 |  | 
|---|
| 160 | -- e.g. --- (GCG from GenBank) | 
|---|
| 161 | LOCUS       DROEST6      1819 bp ss-mRNA            INV       31-AUG-1987 | 
|---|
| 162 | ... much more ... | 
|---|
| 163 | ORIGIN      1 bp upstream of EcoRI site; chromosome BK9 region 69A1. | 
|---|
| 164 |  | 
|---|
| 165 | INVERTEBRATE:DROEST6  Length: 1819  January 9, 1989  16:48  Check: 8008  .. | 
|---|
| 166 |  | 
|---|
| 167 | 1  GAATTCGCCG GAGTGAGGAG CAACATGAAC TACGTGGGAC TGGGACTTAT | 
|---|
| 168 |  | 
|---|
| 169 | 51  CATTGTGCTG AGCTGCCTTT GGCTCGGTTC GAACGCGAGT GATACAGATG | 
|---|
| 170 |  | 
|---|
| 171 |  | 
|---|
| 172 | --------------------------------------------------- | 
|---|
| 173 |  | 
|---|
| 174 | DNAStrider (Mac) = modified Stanford: | 
|---|
| 175 | ; ### from DNA Strider  Friday, April 7, 1989   11:04:24 PM | 
|---|
| 176 | ; DNA sequence  pBR322   4363  b.p. complete sequence | 
|---|
| 177 | ; | 
|---|
| 178 | abcd... | 
|---|
| 179 | efgh | 
|---|
| 180 | //  (end of sequence) | 
|---|
| 181 | --------------------------------------------------- | 
|---|
| 182 |  | 
|---|
| 183 | Fitch format: | 
|---|
| 184 | Dro5srna.Seq | 
|---|
| 185 | GCC AAC GAC CAU ACC ACG CUG AAU ACA UCG GUU CUC GUC CGA UCA CCG AAA UUA AGC AGC | 
|---|
| 186 | GUC GCG GGC GGU UAG UAC UUA GAU GGG GGA CCG CUU GGG AAC ACC GCG UGU UGU UGG CCU | 
|---|
| 187 | Droest6.Seq | 
|---|
| 188 | GAA TTC GCC GGA GTG AGG AGC AAC ATG AAC TAC GTG GGA CTG GGA CTT ATC ATT GTG CTG | 
|---|
| 189 | AGC TGC CTT TGG CTC GGT TCG AAC GCG AGT GAT ACA GAT GAC CCT CTG TTG GTG CAG CTG | 
|---|
| 190 | --------------------------------------------------- | 
|---|
| 191 |  | 
|---|
| 192 | W.Pearson/Fasta format: | 
|---|
| 193 | >BOVPRL GenBank entry BOVPRL from omam file.  907 nucleotides. | 
|---|
| 194 | TGCTTGGCTGAGGAGCCATAGGACGAGAGCTTCCTGGTGAAGTGTGTTTCTTGAAATCAT | 
|---|
| 195 |  | 
|---|
| 196 | --------------------------------------------------- | 
|---|
| 197 | Phylip version 3.2 format (e.g., DNAML): | 
|---|
| 198 |  | 
|---|
| 199 | 5   13 YF                (# seqs, #bases, YF) | 
|---|
| 200 | Alpha     AACGTGGCCAAAT | 
|---|
| 201 | aaaagggccc...  (continued sp. alpha) | 
|---|
| 202 | Beta      AAGGTCGCCAAAC | 
|---|
| 203 | aaaagggccc...  (continued sp. beta) | 
|---|
| 204 | Gamma     CATTTCGTCACAA | 
|---|
| 205 | aaaagggccc...  (continued sp. Gamma) | 
|---|
| 206 | 1234567890^-- bases must start in col 11, and run 'til #bases | 
|---|
| 207 | (spaces & newlines are okay) | 
|---|
| 208 | --------------------------------------------------- | 
|---|
| 209 | Phylip version 3.3 format (e.g., DNAML): | 
|---|
| 210 |  | 
|---|
| 211 | 5    42  YF             (# seqs, #bases, YF) | 
|---|
| 212 | Turkey    AAGCTNGGGC ATTTCAGGGT | 
|---|
| 213 | Salmo gairAAGCCTTGGC AGTGCAGGGT | 
|---|
| 214 | H. SapiensACCGGTTGGC CGTTCAGGGT | 
|---|
| 215 | Chimp     AAACCCTTGC CGTTACGCTT | 
|---|
| 216 | Gorilla   AAACCCTTGC CGGTACGCTT | 
|---|
| 217 | 1234567890^-- bases must start in col 11 | 
|---|
| 218 | !! this version interleaves the species -- contrary to | 
|---|
| 219 | all other output formats. | 
|---|
| 220 |  | 
|---|
| 221 | GAGCCCGGGC AATACAGGGT AT | 
|---|
| 222 | GAGCCGTGGC CGGGCACGGT AT | 
|---|
| 223 | ACAGGTTGGC CGTTCAGGGT AA | 
|---|
| 224 | AAACCGAGGC CGGGACACTC AT | 
|---|
| 225 | AAACCATTGC CGGTACGCTT AA | 
|---|
| 226 |  | 
|---|
| 227 | --------------------------------------------------- | 
|---|
| 228 | Phylip version 3.4 format (e.g., DNAML) | 
|---|
| 229 | -- Both Interleaved and sequential are permitted | 
|---|
| 230 |  | 
|---|
| 231 | 5   13                (# seqs, #bases) | 
|---|
| 232 | Alpha     AACGTGGCCAAAT | 
|---|
| 233 | aaaagggccc...  (continued sp. alpha) | 
|---|
| 234 | Beta      AAGGTCGCCAAAC | 
|---|
| 235 | aaaagggccc...  (continued sp. beta) | 
|---|
| 236 | Gamma     CATTTCGTCACAA | 
|---|
| 237 | aaaagggccc...  (continued sp. Gamma) | 
|---|
| 238 | 1234567890^-- bases must start in col 11, and run 'til #bases | 
|---|
| 239 | (spaces, newlines and numbers are ignored) | 
|---|
| 240 |  | 
|---|
| 241 | --------------------------------------------------- | 
|---|
| 242 | Gary Olsen (multiple) sequence editor /print format: | 
|---|
| 243 |  | 
|---|
| 244 | !--------------------- | 
|---|
| 245 | !17Oct91 -- error in original copy of olsen /print format, shifted right 1 space | 
|---|
| 246 | ! here is correct copy: | 
|---|
| 247 | 301  40 Tb.thiop  CGCAGCGAAA----------GCUNUGCUAAUACCGCAUA-CGnCCUG-----------------------------------------------------  Tb.thiop | 
|---|
| 248 | 123456789012345678901 | 
|---|
| 249 | 301  42 Rhc.purp  CGUAGCGAAA----------GUUACGCUAAUACCGCAUA-UUCUGUG-----------------------------------------------------  Rhc.purp | 
|---|
| 250 |  | 
|---|
| 251 | 301  44 Rhc.gela  nnngnCGAAA----------GCCGGAUUAAUACCGCAUA-CGACCUA-----------------------------------------------------  Rhc.gela | 
|---|
| 252 | !--------------------- | 
|---|
| 253 |  | 
|---|
| 254 | RNase P RNA components.  on 20-FEB-90 17:23:58 | 
|---|
| 255 |  | 
|---|
| 256 | 1 (E.c. pr ):  Base pairing in Escherichia coli RNase P RNA. | 
|---|
| 257 | 2 (chrom   ):  Chromatium | 
|---|
| 258 | : | 
|---|
| 259 | 12 (B.brevis):  Bacillus brevis RNase P RNA, B. James. | 
|---|
| 260 | 13 ( 90% con):   90% conserved | 
|---|
| 261 | 14 (100% con):  100% conserved | 
|---|
| 262 | 15 (gram+ pr):  pairing | 
|---|
| 263 |  | 
|---|
| 264 | 1 | 
|---|
| 265 | RNase P RNA components.  on 20-FEB-90 17:23:58 | 
|---|
| 266 |  | 
|---|
| 267 | Posi-   Sequence | 
|---|
| 268 | tion:   identity:   Data: | 
|---|
| 269 |  | 
|---|
| 270 | 1   1 E.c. pr      <<<<<<<<<< {{{{{{{{<<:<<<<<<<<<<^<<<<<<====>>>>  E.c. pr | 
|---|
| 271 | 1   2 chrom        GGAGUCGGCCAGACAGUCGCUUCCGUCCU------------------  chrom | 
|---|
| 272 | : | 
|---|
| 273 | 1  12 B.brevis  AUGCAGGAAAUGCGGGUAGCCGCUGCCGCAAUCGUCU-------------  B.brevis | 
|---|
| 274 | 1234567890123456789012 <! this should be 21 not 22, | 
|---|
| 275 | ! this example must be inset on left by 1 space from olsen /print files ! | 
|---|
| 276 | 1  13  90% con           G  C G  A  CGC GC               -    -      90% con | 
|---|
| 277 | 1  14 100% con                G  A  CGC                             100% con | 
|---|
| 278 | 1  15 gram+ pr     <<<<<<<<<< {{{{{{{{<<<<<<<<<<<<<===============  gram+ pr | 
|---|
| 279 |  | 
|---|
| 280 | 60   1 E.c. pr   >>>>>>^>>^>>>>:>>    <<<^<<<< {{{{{                 E.c. pr | 
|---|
| 281 | 60   2 chrom     -----GGUG-ACGGGGGAGGAAAGUCCGG-GCUCCAU-------------  chrom | 
|---|
| 282 | :       : | 
|---|
| 283 | 60  10 B.stearo  ----UU-CG-GCCGUAGAGGAAAGUCCAUGCUCGCACGGUGCUGAGAUGC  B.stearo | 
|---|
| 284 |  | 
|---|
| 285 |  | 
|---|
| 286 | --------------------------------------------------- | 
|---|
| 287 | GCG MSF format | 
|---|
| 288 | Title line | 
|---|
| 289 |  | 
|---|
| 290 | picorna.msf  MSF: 100  Type: P  January 17, 1991  17:53  Check: 541 | 
|---|
| 291 | .. | 
|---|
| 292 | Name: Cb3              Len:   100  Check: 7009  Weight:  1.00 | 
|---|
| 293 | Name: E                Len:   100  Check:   60  Weight:  1.00 | 
|---|
| 294 |  | 
|---|
| 295 | // | 
|---|
| 296 |  | 
|---|
| 297 | 1                                                   50 | 
|---|
| 298 | Cb3  ...gpvedai .......t.. aaigr..vad tvgtgptnse aipaltaaet | 
|---|
| 299 | E  gvenae.kgv tentna.tad fvaqpvylpe .nqt...... kv.affynrs | 
|---|
| 300 |  | 
|---|
| 301 | 51                                                 100 | 
|---|
| 302 |  | 
|---|
| 303 | Cb3  ghtsqvvpgd tmqtrhvkny hsrsestien flcrsacvyf teykn..... | 
|---|
| 304 | E  ...spi.gaf tvks...... gs.lesgfap .fsngtc.pn sviltpgpqf | 
|---|
| 305 |  | 
|---|
| 306 | --------------------------------------------------- | 
|---|
| 307 | PIR format | 
|---|
| 308 | This is NBRF-PIR MAILSERVER version 1.45 | 
|---|
| 309 | Command-> get PIR3:A31391 | 
|---|
| 310 | \\\ | 
|---|
| 311 | ENTRY           A31391       #Type Protein | 
|---|
| 312 | TITLE           *Esterase-6 - Fruit fly (Drosophila melanogaster) | 
|---|
| 313 |  | 
|---|
| 314 | DATE            03-Aug-1992 #Sequence 03-Aug-1992 #Text 03-Aug-1992 | 
|---|
| 315 | PLACEMENT          0.0    0.0    0.0    0.0    0.0 | 
|---|
| 316 | COMMENT         *This entry is not verified. | 
|---|
| 317 | SOURCE          Drosophila melanogaster | 
|---|
| 318 |  | 
|---|
| 319 | REFERENCE | 
|---|
| 320 | #Authors     Cooke P.H., Oakeshott J.G. | 
|---|
| 321 | #Citation    submitted to GenBank, April 1989 | 
|---|
| 322 | #Reference-number A31391 | 
|---|
| 323 | #Accession   A31391 | 
|---|
| 324 | #Cross-reference GB:J04167 | 
|---|
| 325 |  | 
|---|
| 326 | SUMMARY       #Molecular-weight 61125  #Length 544  #Checksum  1679 | 
|---|
| 327 | SEQUENCE | 
|---|
| 328 | 5        10        15        20        25        30 | 
|---|
| 329 | 1 M N Y V G L G L I I V L S C L W L G S N A S D T D D P L L V | 
|---|
| 330 | 31 Q L P Q G K L R G R D N G S Y Y S Y E S I P Y A E P P T G D | 
|---|
| 331 | 61 L R F E A P E P Y K Q K W S D I F D A T K T P V A C L Q W D | 
|---|
| 332 | 91 Q F T P G A N K L V G E E D C L T V S V Y K P K N S K R N S | 
|---|
| 333 | 121 F P V V A H I H G G A F M F G A A W Q N G H E N V M R E G K | 
|---|
| 334 | 151 F I L V K I S Y R L G P L G F V S T G D R D L P G N Y G L K | 
|---|
| 335 | 181 D Q R L A L K W I K Q N I A S F G G E P Q N V L L V G H S A | 
|---|
| 336 | 211 G G A S V H L Q M L R E D F G Q L A R A A F S F S G N A L D | 
|---|
| 337 | 241 P W V I Q K G A R G R A F E L G R N V G C E S A E D S T S L | 
|---|
| 338 | 271 K K C L K S K P A S E L V T A V R K F L I F S Y V P F A P F | 
|---|
| 339 | 301 S P V L E P S D A P D A I I T Q D P R D V I K S G K F G Q V | 
|---|
| 340 | 331 P W A V S Y V T E D G G Y N A A L L L K E R K S G I V I D D | 
|---|
| 341 | 361 L N E R W L E L A P Y L L F Y R D T K T K K D M D D Y S R K | 
|---|
| 342 | 391 I K Q E Y I G N Q R F D I E S Y S E L Q R L F T D I L F K N | 
|---|
| 343 | 421 S T Q E S L D L H R K Y G K S P A Y A Y V Y D N P A E K G I | 
|---|
| 344 | 451 A Q V L A N R T D Y D F G T V H G D D Y F L I F E N F V R D | 
|---|
| 345 | 481 V E M R P D E Q I I S R N F I N M L A D F A S S D N G S L K | 
|---|
| 346 | 511 Y G E C D F K D N V G S E K F Q L L A I Y I D G C Q N R Q H | 
|---|
| 347 | 541 V E F P | 
|---|
| 348 | /// | 
|---|
| 349 | \\\ | 
|---|
| 350 | --------------------------------------------------- | 
|---|
| 351 | PAUP format: | 
|---|
| 352 | The NEXUS Format | 
|---|
| 353 |  | 
|---|
| 354 | Every block starts with "BEGIN blockname;" and ends with "END;". | 
|---|
| 355 | Each block is composed of one or more statements, each | 
|---|
| 356 | terminated by a semicolon (;). | 
|---|
| 357 |  | 
|---|
| 358 | Comments may be included in NEXUS files by enclosing them within | 
|---|
| 359 | square brackets, as in "[This is a comment]." | 
|---|
| 360 |  | 
|---|
| 361 | NEXUS-conforming files are identified by a "#NEXUS" directive at | 
|---|
| 362 | the very beginning of the file (line 1, column 1).  If the | 
|---|
| 363 | #NEXUS is omitted PAUP issues a warning but continues | 
|---|
| 364 | processing. | 
|---|
| 365 |  | 
|---|
| 366 | NEXUS files are entirely free-format.  Blanks, tabs, and | 
|---|
| 367 | newlines may be placed anywhere in the file.  Unless RESPECTCASE | 
|---|
| 368 | is requested, commands and data may be entered in upper case, | 
|---|
| 369 | lower case, or a mixture of upper and lower case. | 
|---|
| 370 |  | 
|---|
| 371 | The following conventions are used in the syntax descriptions of | 
|---|
| 372 | the various blocks.  Upper-case items are entered exactly as | 
|---|
| 373 | shown.  Lower-case items inside of angle brackets -- e.g., <x> | 
|---|
| 374 | -- represent items to be substituted by the user.  Items inside | 
|---|
| 375 | of square brackets -- e.g., [X] -- are optional.  Items inside | 
|---|
| 376 | of curly braces and separated by vertical bars -- e.g.,  { X | Y | 
|---|
| 377 | | Z } -- are mutually exclusive options. | 
|---|
| 378 |  | 
|---|
| 379 |  | 
|---|
| 380 | The DATA Block | 
|---|
| 381 |  | 
|---|
| 382 | The DATA block contains the data matrix and other associated | 
|---|
| 383 | information.  Its syntax is: | 
|---|
| 384 |  | 
|---|
| 385 | BEGIN DATA; | 
|---|
| 386 | DIMENSIONS NTAX=<number of taxa> NCHAR=<number of characters>; | 
|---|
| 387 | [ FORMAT  [ MISSING=<missing-symbol> ] | 
|---|
| 388 | [ LABELPOS={ LEFT | RIGHT } ] | 
|---|
| 389 | [ SYMBOLS="<symbols-list>" ] | 
|---|
| 390 | [ INTERLEAVE ] | 
|---|
| 391 | [ MATCHCHAR=<match-symbol> ] | 
|---|
| 392 | [ EQUATE="<symbol>=<expansion> [<symbol>=<expansion>...]" ] | 
|---|
| 393 | [ TRANSPOSE ] | 
|---|
| 394 | [ RESPECTCASE ] | 
|---|
| 395 | [ DATATYPE = { STANDARD | DNA | RNA | PROTEIN } ]; ] | 
|---|
| 396 | [ OPTIONS [ IGNORE={ INVAR | UNINFORM } ] | 
|---|
| 397 | [ MSTAXA = { UNCERTAIN | POLYMORPH | VARIABLE } ] | 
|---|
| 398 | [ ZAP = "<list of zapped characters>" ] ; ] | 
|---|
| 399 | [ CHARLABELS <label_1> <label_2> <label_NCHAR> ; ] | 
|---|
| 400 | [ TAXLABELS <label1_1> <label1_2> <label1_NTAX> ; ] | 
|---|
| 401 | [ STATELABELS <currently ignored by PAUP> ; ] | 
|---|
| 402 | MATRIX <data-matrix> ; | 
|---|
| 403 | END; | 
|---|
| 404 |  | 
|---|
| 405 | --- example PAUP file | 
|---|
| 406 |  | 
|---|
| 407 | #NEXUS | 
|---|
| 408 |  | 
|---|
| 409 | [!Brown et al. (1982) primate mitochondrial DNA] | 
|---|
| 410 |  | 
|---|
| 411 | begin data; | 
|---|
| 412 | dimensions ntax=5 nchar=896; | 
|---|
| 413 | format datatype=dna matchchar=. interleave missing='-'; | 
|---|
| 414 | matrix | 
|---|
| 415 | [                              2                    4                    6            8                    ] | 
|---|
| 416 | [         1                    1                    1                    1            1                    ] | 
|---|
| 417 | human     aagcttcaccggcgcagtca ttctcataatcgcccacggR cttacatcctcattactatt ctgcctagcaaactcaaact acgaacgcactcacagtcgc | 
|---|
| 418 | chimp     ................a.t. .c.................a ...............t.... ..................t. .t........c......... | 
|---|
| 419 | gorilla   ..................tg ....t.....t........a ........a......t.... .................... .......a..c.....c... | 
|---|
| 420 | orang     ................ac.. cc.....g..t.....t..a ..c........cc....g.. .................... .......a..c.....c... | 
|---|
| 421 | gibbon    ......t..a..t...ac.g .c.................a ..a..c..t..cc.g..... ......t............. .......a........c... | 
|---|
| 422 |  | 
|---|
| 423 | [         8                    8                    8                    8            8              8     ] | 
|---|
| 424 | [         0                    2                    4                    6            8              9     ] | 
|---|
| 425 | [         1                    1                    1                    1            1              6     ] | 
|---|
| 426 | human     cttccccacaacaatattca tgtgcctagaccaagaagtt attatctcgaactgacactg agccacaacccaaacaaccc agctctccctaagctt | 
|---|
| 427 | chimp     t................... .a................c. ........a.....g..... ...a................ ................ | 
|---|
| 428 | gorilla   ..................tc .a................c. ........a.g......... ...a.............tt. .a.............. | 
|---|
| 429 | orang     ta....a...........t. .c.......ga......acc ..cg..a.a......tg... .a.a..c.....g...cta. .a.....a........ | 
|---|
| 430 | gibbon    a..t.......t........ ....ac...........acc .....t..a........... .a.tg..........gctag .a.............. | 
|---|
| 431 | ; | 
|---|
| 432 | end; | 
|---|
| 433 | --------------------------------------------------- | 
|---|
| 434 |  | 
|---|
| 435 |  | 
|---|
| 436 |  | 
|---|
| 437 |  | 
|---|
| 438 |  | 
|---|
| 439 |  | 
|---|
| 440 | |||||||||||  Sample SMTP mail header | 
|---|
| 441 | --------------------------------------------------- | 
|---|
| 442 |  | 
|---|
| 443 | - - - - - - - - - | 
|---|
| 444 | From GenBank-Retrieval-System@genbank.bio.net Sun Nov 10 17:28:56 1991 | 
|---|
| 445 | Received: from genbank.bio.net by sunflower.bio.indiana.edu | 
|---|
| 446 | (4.1/9.5jsm) id AA19328; Sun, 10 Nov 91 17:28:55 EST | 
|---|
| 447 | Received: by genbank.bio.net (5.65/IG-2.0) | 
|---|
| 448 | id AA14458; Sun, 10 Nov 91 14:30:03 -0800 | 
|---|
| 449 | Date: Sun, 10 Nov 91 14:30:03 -0800 | 
|---|
| 450 | Message-Id: <9111102230.AA14458@genbank.bio.net> | 
|---|
| 451 | From: Database Server <GenBank-Retrieval-System@genbank.bio.net> | 
|---|
| 452 | To: gilbertd@sunflower.bio.indiana.edu | 
|---|
| 453 | Subject: Results of Query for drorna | 
|---|
| 454 | Status: R | 
|---|
| 455 |  | 
|---|
| 456 | No matches on drorna. | 
|---|
| 457 | - - - - - - | 
|---|
| 458 | From GenBank-Retrieval-System@genbank.bio.net Sun Nov 10 17:28:49 1991 | 
|---|
| 459 | Received: from genbank.bio.net by sunflower.bio.indiana.edu | 
|---|
| 460 | (4.1/9.5jsm) id AA19323; Sun, 10 Nov 91 17:28:47 EST | 
|---|
| 461 | Received: by genbank.bio.net (5.65/IG-2.0) | 
|---|
| 462 | id AA14461; Sun, 10 Nov 91 14:30:03 -0800 | 
|---|
| 463 | Date: Sun, 10 Nov 91 14:30:03 -0800 | 
|---|
| 464 | Message-Id: <9111102230.AA14461@genbank.bio.net> | 
|---|
| 465 | From: Database Server <GenBank-Retrieval-System@genbank.bio.net> | 
|---|
| 466 | To: gilbertd@sunflower.bio.indiana.edu | 
|---|
| 467 | Subject: Results of Query for droest6 | 
|---|
| 468 | Status: R | 
|---|
| 469 |  | 
|---|
| 470 | LOCUS       DROEST6      1819 bp ss-mRNA            INV       31-AUG-1987 | 
|---|
| 471 | DEFINITION  D.melanogaster esterase-6 mRNA, complete cds. | 
|---|
| 472 | ACCESSION   M15961 | 
|---|
| 473 |  | 
|---|
| 474 |  | 
|---|
| 475 |  | 
|---|
| 476 |  | 
|---|
| 477 |  | 
|---|
| 478 |  | 
|---|
| 479 |  | 
|---|
| 480 |  | 
|---|
| 481 |  | 
|---|
| 482 |  | 
|---|
| 483 |  | 
|---|
| 484 |  | 
|---|
| 485 | |||||||||||  GCG manual discussion of sequence symbols: | 
|---|
| 486 | --------------------------------------------------- | 
|---|
| 487 |  | 
|---|
| 488 | III_SEQUENCE_SYMBOLS | 
|---|
| 489 |  | 
|---|
| 490 |  | 
|---|
| 491 | GCG programs allow all upper and lower  case  letters,  periods  (.), | 
|---|
| 492 | asterisks  (*),  pluses  (+),  ampersands  (&),  and ats (@) as symbols in | 
|---|
| 493 | biological sequences.  Nucleotide  symbols,  their  complements,  and  the | 
|---|
| 494 | standard  one-letter amino acid symbols are shown below in separate lists. | 
|---|
| 495 | The meanings of the symbols +, &, and @ have not  been  assigned  at  this | 
|---|
| 496 | writing (March, 1989). | 
|---|
| 497 |  | 
|---|
| 498 | GCG uses the  letter  codes  for  amino  acid  codes  and  nucleotide | 
|---|
| 499 | ambiguity    proposed    by    IUB    (Nomenclature    Committee,    1985, | 
|---|
| 500 | Eur. J. Biochem. 150; 1-5).  These codes are  compatible  with  the  codes | 
|---|
| 501 | used by the EMBL, GenBank, and NBRF data libraries. | 
|---|
| 502 |  | 
|---|
| 503 |  | 
|---|
| 504 | NUCLEOTIDES | 
|---|
| 505 |  | 
|---|
| 506 | The meaning of each symbol, its complement,  and  the  Cambridge  and | 
|---|
| 507 | Stanford  equivalents  are  shown below.  Cambridge files can be converted | 
|---|
| 508 | into GCG files and vice versa with the programs FROMSTADEN  and  TOSTADEN. | 
|---|
| 509 | IntelliGenetics  sequence  files  can  be interconverted with the programs | 
|---|
| 510 | FROMIG and TOIG. | 
|---|
| 511 |  | 
|---|
| 512 | IUB/GCG      Meaning     Complement   Staden/Sanger  Stanford | 
|---|
| 513 |  | 
|---|
| 514 | A             A             T             A            A | 
|---|
| 515 | C             C             G             C            C | 
|---|
| 516 | G             G             C             G            G | 
|---|
| 517 | T/U            T             A             T           T/U | 
|---|
| 518 | M           A or C          K             5            J | 
|---|
| 519 | R           A or G          Y             R            R | 
|---|
| 520 | W           A or T          W             7            L | 
|---|
| 521 | S           C or G          S             8            M | 
|---|
| 522 | Y           C or T          R             Y            Y | 
|---|
| 523 | K           G or T          M             6            K | 
|---|
| 524 | V        A or C or G        B       not supported      N | 
|---|
| 525 | H        A or C or T        D       not supported      N | 
|---|
| 526 | D        A or G or T        H       not supported      N | 
|---|
| 527 | B        C or G or T        V       not supported      N | 
|---|
| 528 | X/N     G or A or T or C     X            -/X           N | 
|---|
| 529 | .    not G or A or T or C   .       not supported      ? | 
|---|
| 530 |  | 
|---|
| 531 |  | 
|---|
| 532 | The frame ambiguity codes used by Staden are not  supported  by  GCG | 
|---|
| 533 | and   are  translated  by  FROMSTADEN  as  the  lower  case  single  base | 
|---|
| 534 | equivalent. | 
|---|
| 535 |  | 
|---|
| 536 | Staden Code          Meaning              GCG | 
|---|
| 537 |  | 
|---|
| 538 | D                C or CC                c | 
|---|
| 539 | V                T or TT                t | 
|---|
| 540 | B                A or AA                a | 
|---|
| 541 | H                G or GG                g | 
|---|
| 542 | K                C or CX                c | 
|---|
| 543 | L                T or TX                t | 
|---|
| 544 | M                A or AX                a | 
|---|
| 545 | N                G or GX                g | 
|---|
| 546 |  | 
|---|
| 547 |  | 
|---|
| 548 | AMINO ACIDS | 
|---|
| 549 |  | 
|---|
| 550 | Here is a list of the standard one-letter amino acid codes and their | 
|---|
| 551 | three-letter  equivalents.   The synonymous codons and their depiction in | 
|---|
| 552 | the IUB codes are shown.  You should recognize that the codons  following | 
|---|
| 553 | semicolons  (;)  are  not  sufficiently specific to define a single amino | 
|---|
| 554 | acid even though they represent the best possible back  translation  into | 
|---|
| 555 | the IUB codes!  All of the relationships in this list can be redefined by | 
|---|
| 556 | the user in a local data file described below. | 
|---|
| 557 |  | 
|---|
| 558 | IUB | 
|---|
| 559 | Symbol 3-letter  Meaning      Codons                Depiction | 
|---|
| 560 | A    Ala       Alanine      GCT,GCC,GCA,GCG         !GCX | 
|---|
| 561 | B    Asp,Asn   Aspartic, | 
|---|
| 562 | Asparagine   GAT,GAC,AAT,AAC         !RAY | 
|---|
| 563 | C    Cys       Cysteine     TGT,TGC                 !TGY | 
|---|
| 564 | D    Asp       Aspartic     GAT,GAC                 !GAY | 
|---|
| 565 | E    Glu       Glutamic     GAA,GAG                 !GAR | 
|---|
| 566 | F    Phe     Phenylalanine  TTT,TTC                 !TTY | 
|---|
| 567 | G    Gly       Glycine      GGT,GGC,GGA,GGG         !GGX | 
|---|
| 568 | H    His       Histidine    CAT,CAC                 !CAY | 
|---|
| 569 | I    Ile       Isoleucine   ATT,ATC,ATA             !ATH | 
|---|
| 570 | K    Lys       Lysine       AAA,AAG                 !AAR | 
|---|
| 571 | L    Leu       Leucine      TTG,TTA,CTT,CTC,CTA,CTG | 
|---|
| 572 | !TTR,CTX,YTR;YTX | 
|---|
| 573 | M    Met       Methionine   ATG                     !ATG | 
|---|
| 574 | N    Asn       Asparagine   AAT,AAC                 !AAY | 
|---|
| 575 | P    Pro       Proline      CCT,CCC,CCA,CCG         !CCX | 
|---|
| 576 | Q    Gln       Glutamine    CAA,CAG                 !CAR | 
|---|
| 577 | R    Arg       Arginine     CGT,CGC,CGA,CGG,AGA,AGG | 
|---|
| 578 | !CGX,AGR,MGR;MGX | 
|---|
| 579 | S    Ser       Serine       TCT,TCC,TCA,TCG,AGT,AGC !TCX,AGY;WSX | 
|---|
| 580 | T    Thr       Threonine    ACT,ACC,ACA,ACG         !ACX | 
|---|
| 581 | V    Val       Valine       GTT,GTC,GTA,GTG         !GTX | 
|---|
| 582 | W    Trp       Tryptophan   TGG                     !TGG | 
|---|
| 583 | X    Xxx       Unknown                              !XXX | 
|---|
| 584 | Y    Tyr       Tyrosine     TAT, TAC                !TAY | 
|---|
| 585 | Z    Glu,Gln   Glutamic, | 
|---|
| 586 | Glutamine    GAA,GAG,CAA,CAG         !SAR | 
|---|
| 587 | *    End       Terminator   TAA, TAG, TGA           !TAR,TRA;TRR | 
|---|
| 588 |  | 
|---|
| 589 |  | 
|---|
| 590 |  | 
|---|
| 591 |  | 
|---|
| 592 |  | 
|---|
| 593 |  | 
|---|
| 594 |  | 
|---|
| 595 |  | 
|---|
| 596 | |||||||||||  docs from PSC on sequence formats: | 
|---|
| 597 | --------------------------------------------------- | 
|---|
| 598 |  | 
|---|
| 599 |  | 
|---|
| 600 | Nucleic Acid and Protein Sequence File Formats | 
|---|
| 601 |  | 
|---|
| 602 |  | 
|---|
| 603 | It will probably save you some time if you have your data in a usable | 
|---|
| 604 | format before you send it to us.  However, we do have the University of | 
|---|
| 605 | Wisconsin Genetics Computing Group programs running on our VAXen and | 
|---|
| 606 | this package includes several reformatting utilities.  Our programs | 
|---|
| 607 | usually recognize any of several standard formats, including GenBank, | 
|---|
| 608 | EMBL, NBRF, and MolGen/Stanford.  For the purposes of annotating an | 
|---|
| 609 | analysis we find the GenBank and EMBL formats most useful, particularly | 
|---|
| 610 | if you have already received an accession number from one of these | 
|---|
| 611 | organizations for your sequence. | 
|---|
| 612 |  | 
|---|
| 613 | Our programs do not require that all of the line types available in | 
|---|
| 614 | GenBank, EMBL, or NBRF file formats be present for the file format to | 
|---|
| 615 | be recognized and processed.  The following pages outline the essential | 
|---|
| 616 | details required for correct processing of files by our programs. | 
|---|
| 617 | Additional information may be present but will generally be ignored. | 
|---|
| 618 |  | 
|---|
| 619 |  | 
|---|
| 620 | GenBank File Format | 
|---|
| 621 |  | 
|---|
| 622 | File Header | 
|---|
| 623 |  | 
|---|
| 624 | 1.  The first line in the file must have "GENETIC SEQUENCE DATA BANK" | 
|---|
| 625 | in spaces 20 through 46 (see LINE  1, below). | 
|---|
| 626 | 2.  The next 8 lines may contain arbitrary text.  They are ignored but | 
|---|
| 627 | are required to maintain the GenBank format (see LINE 2 - LINE 9). | 
|---|
| 628 |  | 
|---|
| 629 | Sequence Data Entries | 
|---|
| 630 |  | 
|---|
| 631 | 3.  Each sequence entry in the file should have the following format. | 
|---|
| 632 |  | 
|---|
| 633 | a) first line: | 
|---|
| 634 |  | 
|---|
| 635 | Must have LOCUS in the first 5 spaces. The | 
|---|
| 636 | genetic locus name or identifier must be in spaces | 
|---|
| 637 | 13 - 22. The length of the sequence is right | 
|---|
| 638 | justified in spaces 23 through 29 (see LINE 10). | 
|---|
| 639 |  | 
|---|
| 640 | b) second line: | 
|---|
| 641 |  | 
|---|
| 642 | Must have DEFINITION in the first 10 spaces. | 
|---|
| 643 | Spaces 13 - 80 are free form text to identify the | 
|---|
| 644 | sequence (see LINE 11). | 
|---|
| 645 |  | 
|---|
| 646 | c) third line: | 
|---|
| 647 |  | 
|---|
| 648 | Must have ACCESSION in the first 9 spaces. Spaces | 
|---|
| 649 | 13 - 18 must hold the primary accession number | 
|---|
| 650 | (see LINE 12). | 
|---|
| 651 |  | 
|---|
| 652 | d) fourth line: | 
|---|
| 653 |  | 
|---|
| 654 | Must have ORIGIN in the first 6 spaces. Nothing | 
|---|
| 655 | else is required on this line, it indicates that | 
|---|
| 656 | the nucleic acid sequence begins on the next line | 
|---|
| 657 | (see LINE 13). | 
|---|
| 658 |  | 
|---|
| 659 | e) fifth line: | 
|---|
| 660 |  | 
|---|
| 661 | Begins the nucleotide sequence. The first 9 | 
|---|
| 662 | spaces of each sequence line may either be blank | 
|---|
| 663 | or may contain the position in the sequence of the | 
|---|
| 664 | first nucleotide on the line. The next 66 spaces | 
|---|
| 665 | hold the nucleotide sequence in six blocks of ten | 
|---|
| 666 | nucleotides. Each of the six blocks begins with a | 
|---|
| 667 | blank space followed by ten nucleotides. Thus the | 
|---|
| 668 | first nucleotide is in space eleven of the line while | 
|---|
| 669 | the last is in space 75 (see LINE 14, LINE 15). | 
|---|
| 670 |  | 
|---|
| 671 | f) last line: | 
|---|
| 672 |  | 
|---|
| 673 | Must have // in the first 2 spaces to indicate | 
|---|
| 674 | termination of the sequence (see LINE 16). | 
|---|
| 675 |  | 
|---|
| 676 | NOTE:  Multiple sequences may appear in each file.  To begin another | 
|---|
| 677 | sequence go back to a) and start again. | 
|---|
| 678 |  | 
|---|
| 679 |  | 
|---|
| 680 | Example GenBank file | 
|---|
| 681 |  | 
|---|
| 682 |  | 
|---|
| 683 | LINE  1  :                   GENETIC SEQUENCE DATA BANK | 
|---|
| 684 | LINE  2  : | 
|---|
| 685 | LINE  3  : | 
|---|
| 686 | LINE  4  : | 
|---|
| 687 | LINE  5  : | 
|---|
| 688 | LINE  6  : | 
|---|
| 689 | LINE  7  : | 
|---|
| 690 | LINE  8  : | 
|---|
| 691 | LINE  9  : | 
|---|
| 692 | LINE 10  :LOCUS       L_Name     Length BP | 
|---|
| 693 | LINE 11  :DEFINITION  Describe the sequence any way you want | 
|---|
| 694 | LINE 12  :ACCESSION   Accession Number | 
|---|
| 695 | LINE 13  :ORIGIN | 
|---|
| 696 | LINE 14  :        1 acgtacgtac gtacgtacgt acgtacgtac gtacgtacgt a... | 
|---|
| 697 | LINE 15  :       61 acgt... | 
|---|
| 698 | LINE 16  :// | 
|---|
| 699 |  | 
|---|
| 700 |  | 
|---|
| 701 |  | 
|---|
| 702 | EMBL File Format | 
|---|
| 703 |  | 
|---|
| 704 | Unlike the GenBank file format the EMBL file format does not require | 
|---|
| 705 | a series of header lines.  Thus the first line in the file begins | 
|---|
| 706 | the first sequence entry of the file. | 
|---|
| 707 |  | 
|---|
| 708 | 1.  The first line of each sequence entry contains the two letters ID | 
|---|
| 709 | in the first two spaces.  This is followed by the EMBL identifier | 
|---|
| 710 | in spaces 6 through 14.  (See LINE  1). | 
|---|
| 711 |  | 
|---|
| 712 | 2.  The second line of each sequence entry has the two letters AC in | 
|---|
| 713 | the first two spaces.  This is followed by the accession number in | 
|---|
| 714 | spaces 6 through 11.  (See LINE  2). | 
|---|
| 715 |  | 
|---|
| 716 | 3.  The third line of each sequence entry has the two letters DE in the | 
|---|
| 717 | first two spaces.  This is followed by a free form text definition | 
|---|
| 718 | in spaces 6 through 72.  (See LINE  3). | 
|---|
| 719 |  | 
|---|
| 720 | 4.  The fourth line in each sequence entry has the two letters SQ in | 
|---|
| 721 | the first two spaces.  This is followed by the length of the | 
|---|
| 722 | sequence beginning at or after space 13.  After the sequence length | 
|---|
| 723 | there is a blank space and the two letters BP.  (See LINE  4). | 
|---|
| 724 |  | 
|---|
| 725 | 5.  The nucleotide sequence begins on the fifth line of the sequence | 
|---|
| 726 | entry.  Each line of sequence begins with four blank spaces. The | 
|---|
| 727 | next 66 spaces hold the nucleotide sequence in six blocks of ten | 
|---|
| 728 | nucleotides.  Each of the six blocks begins with a blank space | 
|---|
| 729 | followed by ten nucleotides.  Thus the first nucleotide is in space | 
|---|
| 730 | 6 of the line while the last is in space 70.  (See LINE  5 - | 
|---|
| 731 | LINE  6). | 
|---|
| 732 |  | 
|---|
| 733 | 6.  The last line of each sequence entry in the file is a terminator | 
|---|
| 734 | line which has the two characters // in the first two spaces. | 
|---|
| 735 | (See LINE  7). | 
|---|
| 736 |  | 
|---|
| 737 | 7.  Multiple sequences may appear in each file.  To begin another | 
|---|
| 738 | sequence go back to item 1 and start again. | 
|---|
| 739 |  | 
|---|
| 740 |  | 
|---|
| 741 | Example EMBL file | 
|---|
| 742 |  | 
|---|
| 743 | LINE  1  :ID   ID_name | 
|---|
| 744 | LINE  2  :AC   Accession number | 
|---|
| 745 | LINE  3  :DE   Describe the sequence any way you want | 
|---|
| 746 | LINE  4  :SQ          Length BP | 
|---|
| 747 | LINE  5  :     ACGTACGTAC GTACGTACGT ACGTACGTAC GTACGTA... | 
|---|
| 748 | LINE  6  :     ACGT... | 
|---|
| 749 | LINE  7  :// | 
|---|
| 750 |  | 
|---|
| 751 |  | 
|---|
| 752 |  | 
|---|
| 753 | NBRF (protein or nucleic acid) File Format | 
|---|
| 754 |  | 
|---|
| 755 | 1.  The first line of each sequence entry begins with a greater than | 
|---|
| 756 | symbol, >.  This is immediately followed by the two character | 
|---|
| 757 | sequence type specifier.  Space four must contain a semi-colon. | 
|---|
| 758 | Beginning in space five is the sequence name or identification code | 
|---|
| 759 | for the NBRF database.  The code is from four to six letters and | 
|---|
| 760 | numbers.  (See LINE  1). | 
|---|
| 761 |  | 
|---|
| 762 | !!!! >> add these to readseq | 
|---|
| 763 | Specifier             Sequence type | 
|---|
| 764 |  | 
|---|
| 765 | P1                protein, complete | 
|---|
| 766 | F1                protein, fragment | 
|---|
| 767 | DL                DNA, linear | 
|---|
| 768 | DC                DNA, circular | 
|---|
| 769 | RL                RNA, linear | 
|---|
| 770 | RC                RNA, circular | 
|---|
| 771 | N1                functional RNA, other than tRNA | 
|---|
| 772 | N3                tRNA | 
|---|
| 773 |  | 
|---|
| 774 | 2.  The second line of each sequence entry contains two kinds of | 
|---|
| 775 | information.  First is the sequence name which is separated from | 
|---|
| 776 | the organism or organelle name by the three character sequence | 
|---|
| 777 | blank space, dash, blank space, " - ".  There is no special | 
|---|
| 778 | character marking the beginning of this line.  (See LINE  2). | 
|---|
| 779 |  | 
|---|
| 780 | 3.  Either the amino acid or nucleic acid sequence begins on line three | 
|---|
| 781 | and can begin in any space, including the first.  The sequence is | 
|---|
| 782 | free format and may be interrupted by blanks for ease of reading. | 
|---|
| 783 | Protein sequences may contain special punctuation to indicate | 
|---|
| 784 | various indeterminacies in the sequence.  In the NBRF data files | 
|---|
| 785 | all lines may be up to 500 characters long.  However some PSC | 
|---|
| 786 | programs currently have a limit of 130 characters per line | 
|---|
| 787 | (including blanks), and BitNet will not accept lines of over eighty | 
|---|
| 788 | characters.  (See LINE  3, LINE  4, and LINE  5). | 
|---|
| 789 |  | 
|---|
| 790 | The last character in the sequence must be an asterisk, *. | 
|---|
| 791 |  | 
|---|
| 792 | Example NBRF file | 
|---|
| 793 |  | 
|---|
| 794 | LINE  1  :>P1;CBRT | 
|---|
| 795 | LINE  2  :Cytochrome b - Rat mitochondrion (SGC1) | 
|---|
| 796 | LINE  3  :M T N I R K S H P L F K I I N H S F I D L P A P S | 
|---|
| 797 | LINE  4  : VTHICRDVN Y GWL IRY | 
|---|
| 798 | LINE  5  :TWIGGQPVEHPFIIIGQLASISYFSIILILMPISGIVEDKMLKWN* | 
|---|
| 799 |  | 
|---|
| 800 |  | 
|---|
| 801 |  | 
|---|
| 802 | MolGen/Stanford File Format | 
|---|
| 803 |  | 
|---|
| 804 | 1.  The first line in a sequence file is a comment line.  This line | 
|---|
| 805 | begins with a semi-colon in the first space.  This line need | 
|---|
| 806 | not be present.  If it is present it holds descriptive text. | 
|---|
| 807 | There may be as many comment lines as desired at the start of | 
|---|
| 808 | the sequence file.  (See LINE  1). | 
|---|
| 809 |  | 
|---|
| 810 | 2.  The second line must be present and contains an identifier or | 
|---|
| 811 | name for the sequence in the first ten spaces.  (See LINE  2). | 
|---|
| 812 |  | 
|---|
| 813 | 3.  The sequence begins on the third line and occupies up to eighty | 
|---|
| 814 | spaces.  Spaces may be included in the sequence for ease of | 
|---|
| 815 | reading.  The sequence continues for as many lines as needed | 
|---|
| 816 | and is terminated with a 1 or 2.  1 indicates a linear sequence | 
|---|
| 817 | while 2 marks a circular sequence.  (See LINE  3 and LINE  4). | 
|---|
| 818 |  | 
|---|
| 819 | Example MolGen/Stanford file | 
|---|
| 820 |  | 
|---|
| 821 | LINE  1  :;  Describe the sequence any way you want | 
|---|
| 822 | LINE  2  :ECTRNAGLY2 | 
|---|
| 823 | LINE  3  :ACGCACGTAC ACGTACGTAC   A C G T C C G T ACG TAC GTA CGT | 
|---|
| 824 | LINE  4  :  GCTTA   GG G C T A1 | 
|---|
| 825 |  | 
|---|
| 826 |  | 
|---|
| 827 |  | 
|---|
| 828 |  | 
|---|
| 829 | |||||||||||  Phylip file format | 
|---|
| 830 | --------------------------------------------------- | 
|---|
| 831 |  | 
|---|
| 832 | Phylip 3.3 File Format (DNA sequences) | 
|---|
| 833 |  | 
|---|
| 834 |  | 
|---|
| 835 | The input and output formats for PROTPARS and for RESTML are described  in | 
|---|
| 836 | their  document  files.   In  general  their input formats are similar to those | 
|---|
| 837 | described here, except that the one-letter codes for data are specific to those | 
|---|
| 838 | programs  and  are  described in those document files.  Since the input formats | 
|---|
| 839 | for the eight DNA sequence programs apply to  all  eight,  they  are  described | 
|---|
| 840 | here.   Their  input  formats are standard: the data have A's, G's, C's and T's | 
|---|
| 841 | (or U's).  The first line of the input file contains the number of species  and | 
|---|
| 842 | the  number  of  sites.   As  with  the other programs, options information may | 
|---|
| 843 | follow this.  In the case of DNAML, DNAMLK,  and  DNADIST  an  additional  line | 
|---|
| 844 | (described  in  the  document file for these programs) may follow the first one. | 
|---|
| 845 | Following this, each species starts on a new line.  The first 10 characters  of | 
|---|
| 846 | that  line  are the species name.  There then follows the base sequence of that | 
|---|
| 847 | species, each character being one of the letters A, B, C, D, G, H, K, M, N,  O, | 
|---|
| 848 | R, S, T, U, V, W, X, Y, ?, or - (a period was also previously allowed but it is | 
|---|
| 849 | no longer allowed, because it sometimes is used in aligned sequences to mean | 
|---|
| 850 | "the  same  as  the  sequence  above").   Blanks  will  be ignored, and so will | 
|---|
| 851 | numerical digits.  This allows GENBANK and EMBL sequence  entries  to  be  read | 
|---|
| 852 | with minimum editing. | 
|---|
| 853 |  | 
|---|
| 854 | These characters can be  either  upper  or  lower  case.   The  algorithms | 
|---|
| 855 | convert  all  input  characters  to upper case (which is how they are treated). | 
|---|
| 856 | The characters constitute the IUPAC (IUB) nucleic acid code  plus  some  slight | 
|---|
| 857 | extensions.  They enable input of nucleic acid sequences taking full account of | 
|---|
| 858 | any ambiguities in the sequence. | 
|---|
| 859 |  | 
|---|
| 860 | The sequences can continue over multiple lines; when this is done the sequences | 
|---|
| 861 | must  be  either  in  "interleaved"  format, similar to the output of alignment | 
|---|
| 862 | programs, or "sequential" format.  These are described  in  the  main  document | 
|---|
| 863 | file.   In sequential format all of one sequence is given, possibly on multiple | 
|---|
| 864 | lines, before the next starts.  In interleaved format the  first  part  of  the | 
|---|
| 865 | file  should  contain  the first part of each of the sequences, then possibly a | 
|---|
| 866 | line containing nothing but a carriage-return character, then the  second  part | 
|---|
| 867 | of  each  sequence, and so on.  Only the first parts of the sequences should be | 
|---|
| 868 | preceded by names.  Here is a hypothetical example of interleaved format: | 
|---|
| 869 |  | 
|---|
| 870 | 5    42 | 
|---|
| 871 | Turkey    AAGCTNGGGC ATTTCAGGGT | 
|---|
| 872 | Salmo gairAAGCCTTGGC AGTGCAGGGT | 
|---|
| 873 | H. SapiensACCGGTTGGC CGTTCAGGGT | 
|---|
| 874 | Chimp     AAACCCTTGC CGTTACGCTT | 
|---|
| 875 | Gorilla   AAACCCTTGC CGGTACGCTT | 
|---|
| 876 |  | 
|---|
| 877 | GAGCCCGGGC AATACAGGGT AT | 
|---|
| 878 | GAGCCGTGGC CGGGCACGGT AT | 
|---|
| 879 | ACAGGTTGGC CGTTCAGGGT AA | 
|---|
| 880 | AAACCGAGGC CGGGACACTC AT | 
|---|
| 881 | AAACCATTGC CGGTACGCTT AA | 
|---|
| 882 |  | 
|---|
| 883 | while in sequential format the same sequences would be: | 
|---|
| 884 |  | 
|---|
| 885 | 5    42 | 
|---|
| 886 | Turkey    AAGCTNGGGC ATTTCAGGGT | 
|---|
| 887 | GAGCCCGGGC AATACAGGGT AT | 
|---|
| 888 | Salmo gairAAGCCTTGGC AGTGCAGGGT | 
|---|
| 889 | GAGCCGTGGC CGGGCACGGT AT | 
|---|
| 890 | H. SapiensACCGGTTGGC CGTTCAGGGT | 
|---|
| 891 | ACAGGTTGGC CGTTCAGGGT AA | 
|---|
| 892 | Chimp     AAACCCTTGC CGTTACGCTT | 
|---|
| 893 | AAACCGAGGC CGGGACACTC AT | 
|---|
| 894 | Gorilla   AAACCCTTGC CGGTACGCTT | 
|---|
| 895 | AAACCATTGC CGGTACGCTT AA | 
|---|
| 896 |  | 
|---|
| 897 |  | 
|---|
| 898 | Note, of course, that a portion of a sequence like this: | 
|---|
| 899 |  | 
|---|
| 900 | 300   AAGCGTGAAC GTTGTACTAA TRCAG | 
|---|
| 901 |  | 
|---|
| 902 | is perfectly legal, assuming that the species name  has  gone  before,  and  is | 
|---|
| 903 | filled  out  to  full  length  by  blanks.  The above digits and blanks will be | 
|---|
| 904 | ignored, the sequence being taken as starting at the first base symbol (in this | 
|---|
| 905 | case an A). | 
|---|
| 906 |  | 
|---|
| 907 | The present versions of the programs may sometimes have difficulties  with | 
|---|
| 908 | the  blank  lines  between  groups of lines, and if so you might want to retype | 
|---|
| 909 | those lines, making sure that they have only a  carriage-return  and  no  blank | 
|---|
| 910 | characters on them, or you may perhaps have to eliminate them.  The symptoms of | 
|---|
| 911 | this problem are that the programs complain that the sequences are not properly | 
|---|
| 912 | aligned, and you can find no other cause for this complaint. | 
|---|
| 913 |  | 
|---|
| 914 | ------------------------------------------------ | 
|---|
| 915 |  | 
|---|
| 916 |  | 
|---|
| 917 | |||||||||||  ASN.1 file format | 
|---|
| 918 | --------------------------------------------------- | 
|---|
| 919 |  | 
|---|
| 920 |  | 
|---|
| 921 | ASN.1 -- see NCBI toolkit docs, source and examples (ncbi.nlm.nih.gov) | 
|---|
| 922 |  | 
|---|
| 923 | Example asn.1 sequence file---- | 
|---|
| 924 |  | 
|---|
| 925 | Bioseq-set ::= { | 
|---|
| 926 | seq-set { | 
|---|
| 927 | seq { | 
|---|
| 928 | id { local id 1 } ,                 -- id essential | 
|---|
| 929 | descr {  title "Dummy sequence data from nowhere"  } ,  -- optional | 
|---|
| 930 | inst {                              -- inst essential | 
|---|
| 931 | repr raw , | 
|---|
| 932 | mol dna , | 
|---|
| 933 | length 156 , | 
|---|
| 934 | topology linear , | 
|---|
| 935 | seq-data | 
|---|
| 936 | iupacna "GAATTCATTTTTGAAACAAATCGACCTGACGACGGAATGGTACTCGAATTA | 
|---|
| 937 | TGGGCCAAAGGGTTTTATGGGACAAATTAATAGGTGTTCATTATATGCCACTTTCGGAGATTAGATACAGCAATGCAG | 
|---|
| 938 | TGGATTCAAAGCAATAGAGTTGTTCTT" | 
|---|
| 939 | } } , | 
|---|
| 940 |  | 
|---|
| 941 | seq { | 
|---|
| 942 | id { local id 2 } , | 
|---|
| 943 | descr {  title "Dummy sequence 2 data from somewhere else"  } , | 
|---|
| 944 | inst { | 
|---|
| 945 | repr raw , | 
|---|
| 946 | mol dna , | 
|---|
| 947 | length 150 , | 
|---|
| 948 | topology linear , | 
|---|
| 949 | seq-data | 
|---|
| 950 | iupacna "TTTTTTTTTTTTGAAACAAATCGACCTGACGACGGAATGGTACTCGAATTA | 
|---|
| 951 | TGGGCCAAAGGGTTTTATGGGACAAATTAATAGGTGTTCATTATATGCCACTTTCGGAGATTAGATACAGCAATGCAG | 
|---|
| 952 | TGGATTCAAAGCAATAGAGTT" | 
|---|
| 953 | } | 
|---|
| 954 | } | 
|---|
| 955 | } | 
|---|
| 956 | } | 
|---|
| 957 |  | 
|---|
| 958 |  | 
|---|
| 959 | partial ASN.1 description from toolkit | 
|---|
| 960 |  | 
|---|
| 961 | Bioseq ::= SEQUENCE { | 
|---|
| 962 | id SET OF Seq-id ,            -- equivalent identifiers | 
|---|
| 963 | descr Seq-descr OPTIONAL , -- descriptors | 
|---|
| 964 | inst Seq-inst ,            -- the sequence data | 
|---|
| 965 | annot SET OF Seq-annot OPTIONAL } | 
|---|
| 966 |  | 
|---|
| 967 | Seq-inst ::= SEQUENCE {            -- the sequence data itself | 
|---|
| 968 | repr ENUMERATED {              -- representation class | 
|---|
| 969 | not-set (0) ,              -- empty | 
|---|
| 970 | virtual (1) ,              -- no seq data | 
|---|
| 971 | raw (2) ,                  -- continuous sequence | 
|---|
| 972 | seg (3) ,                  -- segmented sequence | 
|---|
| 973 | const (4) ,                -- constructed sequence | 
|---|
| 974 | ref (5) ,                  -- reference to another sequence | 
|---|
| 975 | consen (6) ,               -- consensus sequence or pattern | 
|---|
| 976 | map (7) ,                  -- ordered map (genetic, restriction) | 
|---|
| 977 | other (255) } , | 
|---|
| 978 | mol ENUMERATED {               -- molecule class in living organism | 
|---|
| 979 | not-set (0) ,              --   > cdna = rna | 
|---|
| 980 | dna (1) , | 
|---|
| 981 | rna (2) , | 
|---|
| 982 | aa (3) , | 
|---|
| 983 | na (4) ,                   -- just a nucleic acid | 
|---|
| 984 | other (255) } , | 
|---|
| 985 | length INTEGER OPTIONAL ,      -- length of sequence in residues | 
|---|
| 986 | fuzz Int-fuzz OPTIONAL ,       -- length uncertainty | 
|---|
| 987 | topology ENUMERATED {          -- topology of molecule | 
|---|
| 988 | not-set (0) , | 
|---|
| 989 | linear (1) , | 
|---|
| 990 | circular (2) , | 
|---|
| 991 | tandem (3) ,               -- some part of tandem repeat | 
|---|
| 992 | other (255) } DEFAULT linear , | 
|---|
| 993 | strand ENUMERATED {            -- strandedness in living organism | 
|---|
| 994 | not-set (0) , | 
|---|
| 995 | ss (1) ,                   -- single strand | 
|---|
| 996 | ds (2) ,                   -- double strand | 
|---|
| 997 | mixed (3) , | 
|---|
| 998 | other (255) } OPTIONAL ,   -- default ds for DNA, ss for RNA, pept | 
|---|
| 999 | seq-data Seq-data OPTIONAL ,   -- the sequence | 
|---|
| 1000 | ext Seq-ext OPTIONAL ,         -- extensions for special types | 
|---|
| 1001 | hist Seq-hist OPTIONAL }       -- sequence history | 
|---|
| 1002 |  | 
|---|
| 1003 | ------------------------------------------------ | 
|---|