| 1 | [file has been slightly modified to fit into arb help] |
|---|
| 2 | |
|---|
| 3 | ||||||||||| ReadSeq supported formats (revised 30Dec92) |
|---|
| 4 | -------------------------------------------------------- |
|---|
| 5 | |
|---|
| 6 | -f[ormat=]Name Format name for output: |
|---|
| 7 | | 1. IG/Stanford 10. Olsen (in-only) |
|---|
| 8 | | 2. GenBank/GB 11. Phylip3.2 |
|---|
| 9 | | 3. NBRF 12. Phylip |
|---|
| 10 | | 4. EMBL 13. Plain/Raw |
|---|
| 11 | | 5. GCG 14. PIR/CODATA |
|---|
| 12 | | 6. DNAStrider 15. MSF |
|---|
| 13 | | 7. Fitch 16. ASN.1 |
|---|
| 14 | | 8. Pearson/Fasta 17. PAUP |
|---|
| 15 | | 9. Zuker (in-only) 18. Pretty (out-only) |
|---|
| 16 | |
|---|
| 17 | In general, output supports only minimal subsets of each format |
|---|
| 18 | needed for sequence data exchanges. Features, descriptions |
|---|
| 19 | and other format-unique information are discarded. |
|---|
| 20 | |
|---|
| 21 | For users of the Olsen multi-sequence editor (VMS): the Olsen format |
|---|
| 22 | here is produced with the print command: |
|---|
| 23 | print/out=some.file |
|---|
| 24 | Use Genbank output from readseq to produce a format that this |
|---|
| 25 | editor can read, and use the command |
|---|
| 26 | load/genbank some.file |
|---|
| 27 | Dan Davison has a VMS program that will convert to/from the |
|---|
| 28 | Olsen native binary data format. E-mail davison@uh.edu |
|---|
| 29 | |
|---|
| 30 | Warning: Phylip format input is now supported (30Dec92); however, the |
|---|
| 31 | auto-detection of Phylip format is very probabilistic and messy, |
|---|
| 32 | especially distinguishing sequential from interleaved versions. It |
|---|
| 33 | is not recommended that one use readseq to convert files from Phylip |
|---|
| 34 | format to others unless essential. |
|---|
| 35 | |
|---|
| 36 | |
|---|
| 37 | |
|---|
| 38 | ||||||||||| ReadSeq usage (revised 11Nov91) |
|---|
| 39 | -------------------------------------------------------- |
|---|
| 40 | |
|---|
| 41 | A. determine file format: |
|---|
| 42 | |
|---|
| 43 | short skiplines; /* result: number of header lines to skip (or 0) */ |
|---|
| 44 | short error; /* error result or 0 */ |
|---|
| 45 | short format; /* resulting format code, see ureadseq.h */ |
|---|
| 46 | char *filename = "Mysequence.file"; |
|---|
| 47 | |
|---|
| 48 | format = seqFileFormat( filename, &skiplines, &error); |
|---|
| 49 | if (error!=0) fail; |
|---|
| 50 | |
|---|
| 51 | B. read number and list of sequences (optional) |
|---|
| 52 | |
|---|
| 53 | short numseqs; /* resulting number of sequences found in file */ |
|---|
| 54 | char *seqlist; /* list of sequence names, newline separated, 0 terminated */ |
|---|
| 55 | |
|---|
| 56 | seqlist = listSeqs( filename, skiplines, format, &numseqs, &error); |
|---|
| 57 | if (error==0) display (seqlist); |
|---|
| 58 | free( seqlist); |
|---|
| 59 | |
|---|
| 60 | C. read individual sequences as desired |
|---|
| 61 | |
|---|
| 62 | short seqIndex; /* sequence index #, or == kListSeqs for listSeqs equivalent */ |
|---|
| 63 | long seqlen; /* length of seq */ |
|---|
| 64 | char seqid[256]; /* sequence name */ |
|---|
| 65 | char *seq; /* sequence, 0 terminated, free when done */ |
|---|
| 66 | |
|---|
| 67 | seq = readSeq( seqIndex, filename, skiplines, format, |
|---|
| 68 | &seqlen, &numseqs, &error, seqid); |
|---|
| 69 | if (error==0) manipulate(seq); |
|---|
| 70 | free(seq); |
|---|
| 71 | |
|---|
| 72 | D. write sequences as desired |
|---|
| 73 | |
|---|
| 74 | int nlines; /* number of lines of sequence written */ |
|---|
| 75 | FILE* fout; /* open file pointer (stdout or other) */ |
|---|
| 76 | short outform; /* output format, see ureadseq.h */ |
|---|
| 77 | |
|---|
| 78 | nlines = writeSeq( fout, seq, seqlen, format, outform, seqid); |
|---|
| 79 | |
|---|
| 80 | |
|---|
| 81 | Note (30Dec92): There is various processing done by the main program (in readseq.c), |
|---|
| 82 | rather than just in the subroutines (in ureadseq.c). Especially for interleaved |
|---|
| 83 | output formats, the writeSeq subroutine does not handle interleaving, nor some of |
|---|
| 84 | the formatting at the top and end of output files. While seqFileFormat, listSeqs, |
|---|
| 85 | and readSeq subroutines are fairly self-contained, the writeSeq depends a lot on |
|---|
| 86 | auxiliary processing. At some point, this may be revised so writeSeq is |
|---|
| 87 | self-contained. |
|---|
| 88 | |
|---|
| 89 | Note 2: The NCBI toolkit (ftp from ncbi.nlm.nih.gov) is needed for the ASN.1 format |
|---|
| 90 | reading (see ureadasn.c). A bastard (but workable I hope) ASN.1 format is written |
|---|
| 91 | by writeSeq alone. |
|---|
| 92 | |
|---|
| 93 | |
|---|
| 94 | |
|---|
| 95 | ||||||||||| sequence formats.... |
|---|
| 96 | --------------------------------------------------- |
|---|
| 97 | |
|---|
| 98 | stanford/IG |
|---|
| 99 | ;comments |
|---|
| 100 | ;... |
|---|
| 101 | seq1 info |
|---|
| 102 | abcd... |
|---|
| 103 | efgh1 (or 2 = terminator) |
|---|
| 104 | ;another seq |
|---|
| 105 | ;.... |
|---|
| 106 | seq2 info |
|---|
| 107 | abcd...1 |
|---|
| 108 | --- for e.g. ---- |
|---|
| 109 | ; Dro5s-T.Seq Length: 120 April 6, 1989 21:22 Check: 9487 .. |
|---|
| 110 | dro5stseq |
|---|
| 111 | GCCAACGACCAUACCACGCUGAAUACAUCGGUUCUCGUCCGAUCACCGAAAUUAAGCAGCGUCGCGGGCG |
|---|
| 112 | GUUAGUACUUAGAUGGGGGACCGCUUGGGAACACCGCGUGUUGUUGGCCU1 |
|---|
| 113 | |
|---|
| 114 | ; TOIG of: Dro5srna.Seq check: 9487 from: 1 to: 120 |
|---|
| 115 | --------------------------------------------------- |
|---|
| 116 | |
|---|
| 117 | Genbank: |
|---|
| 118 | LOCUS seq1 ID.. |
|---|
| 119 | ... |
|---|
| 120 | ORIGIN ... |
|---|
| 121 | 123456789abcdefg....(1st 9 columns are formatting) |
|---|
| 122 | hijkl... |
|---|
| 123 | // (end of sequence) |
|---|
| 124 | LOCUS seq2 ID .. |
|---|
| 125 | ... |
|---|
| 126 | ORIGIN |
|---|
| 127 | abcd... |
|---|
| 128 | // |
|---|
| 129 | --------------------------------------------------- |
|---|
| 130 | |
|---|
| 131 | NBRF format: (from uwgcg ToNBRF) |
|---|
| 132 | >DL;DRO5SRNA |
|---|
| 133 | Iubio$Dua0:[Gilbertd.Gcg]Dro5srna.Seq;2 => DRO5SRNA |
|---|
| 134 | |
|---|
| 135 | 51 AAUUAAGCAG CGUCGCGGGC GGUUAGUACU UAGAUGGGGG ACCGCUUGGG |
|---|
| 136 | 101 AACACCGCGU GUUGUUGGCC U |
|---|
| 137 | |
|---|
| 138 | --------------------------------------------------- |
|---|
| 139 | |
|---|
| 140 | EMBL format |
|---|
| 141 | ID345 seq1 id (the 345 are spaces) |
|---|
| 142 | ... other info |
|---|
| 143 | SQ345Sequence (the 3,4,5 are spaces) |
|---|
| 144 | abcd... |
|---|
| 145 | hijk... |
|---|
| 146 | // (! this is proper end string: 12Oct90) |
|---|
| 147 | ID seq2 id |
|---|
| 148 | ... |
|---|
| 149 | SQ Sequence |
|---|
| 150 | abcd... |
|---|
| 151 | ... |
|---|
| 152 | // |
|---|
| 153 | --------------------------------------------------- |
|---|
| 154 | |
|---|
| 155 | UW GCG Format: |
|---|
| 156 | comments of any form, up to ".." signal |
|---|
| 157 | signal line has seq id, and " Check: #### .." |
|---|
| 158 | only 1 seq/file |
|---|
| 159 | |
|---|
| 160 | -- e.g. --- (GCG from GenBank) |
|---|
| 161 | LOCUS DROEST6 1819 bp ss-mRNA INV 31-AUG-1987 |
|---|
| 162 | ... much more ... |
|---|
| 163 | ORIGIN 1 bp upstream of EcoRI site; chromosome BK9 region 69A1. |
|---|
| 164 | |
|---|
| 165 | INVERTEBRATE:DROEST6 Length: 1819 January 9, 1989 16:48 Check: 8008 .. |
|---|
| 166 | |
|---|
| 167 | 1 GAATTCGCCG GAGTGAGGAG CAACATGAAC TACGTGGGAC TGGGACTTAT |
|---|
| 168 | |
|---|
| 169 | 51 CATTGTGCTG AGCTGCCTTT GGCTCGGTTC GAACGCGAGT GATACAGATG |
|---|
| 170 | |
|---|
| 171 | |
|---|
| 172 | --------------------------------------------------- |
|---|
| 173 | |
|---|
| 174 | DNAStrider (Mac) = modified Stanford: |
|---|
| 175 | ; ### from DNA Strider Friday, April 7, 1989 11:04:24 PM |
|---|
| 176 | ; DNA sequence pBR322 4363 b.p. complete sequence |
|---|
| 177 | ; |
|---|
| 178 | abcd... |
|---|
| 179 | efgh |
|---|
| 180 | // (end of sequence) |
|---|
| 181 | --------------------------------------------------- |
|---|
| 182 | |
|---|
| 183 | Fitch format: |
|---|
| 184 | Dro5srna.Seq |
|---|
| 185 | GCC AAC GAC CAU ACC ACG CUG AAU ACA UCG GUU CUC GUC CGA UCA CCG AAA UUA AGC AGC |
|---|
| 186 | GUC GCG GGC GGU UAG UAC UUA GAU GGG GGA CCG CUU GGG AAC ACC GCG UGU UGU UGG CCU |
|---|
| 187 | Droest6.Seq |
|---|
| 188 | GAA TTC GCC GGA GTG AGG AGC AAC ATG AAC TAC GTG GGA CTG GGA CTT ATC ATT GTG CTG |
|---|
| 189 | AGC TGC CTT TGG CTC GGT TCG AAC GCG AGT GAT ACA GAT GAC CCT CTG TTG GTG CAG CTG |
|---|
| 190 | --------------------------------------------------- |
|---|
| 191 | |
|---|
| 192 | W.Pearson/Fasta format: |
|---|
| 193 | >BOVPRL GenBank entry BOVPRL from omam file. 907 nucleotides. |
|---|
| 194 | TGCTTGGCTGAGGAGCCATAGGACGAGAGCTTCCTGGTGAAGTGTGTTTCTTGAAATCAT |
|---|
| 195 | |
|---|
| 196 | --------------------------------------------------- |
|---|
| 197 | Phylip version 3.2 format (e.g., DNAML): |
|---|
| 198 | |
|---|
| 199 | 5 13 YF (# seqs, #bases, YF) |
|---|
| 200 | Alpha AACGTGGCCAAAT |
|---|
| 201 | aaaagggccc... (continued sp. alpha) |
|---|
| 202 | Beta AAGGTCGCCAAAC |
|---|
| 203 | aaaagggccc... (continued sp. beta) |
|---|
| 204 | Gamma CATTTCGTCACAA |
|---|
| 205 | aaaagggccc... (continued sp. Gamma) |
|---|
| 206 | 1234567890^-- bases must start in col 11, and run 'til #bases |
|---|
| 207 | (spaces & newlines are okay) |
|---|
| 208 | --------------------------------------------------- |
|---|
| 209 | Phylip version 3.3 format (e.g., DNAML): |
|---|
| 210 | |
|---|
| 211 | 5 42 YF (# seqs, #bases, YF) |
|---|
| 212 | Turkey AAGCTNGGGC ATTTCAGGGT |
|---|
| 213 | Salmo gairAAGCCTTGGC AGTGCAGGGT |
|---|
| 214 | H. SapiensACCGGTTGGC CGTTCAGGGT |
|---|
| 215 | Chimp AAACCCTTGC CGTTACGCTT |
|---|
| 216 | Gorilla AAACCCTTGC CGGTACGCTT |
|---|
| 217 | 1234567890^-- bases must start in col 11 |
|---|
| 218 | !! this version interleaves the species -- contrary to |
|---|
| 219 | all other output formats. |
|---|
| 220 | |
|---|
| 221 | GAGCCCGGGC AATACAGGGT AT |
|---|
| 222 | GAGCCGTGGC CGGGCACGGT AT |
|---|
| 223 | ACAGGTTGGC CGTTCAGGGT AA |
|---|
| 224 | AAACCGAGGC CGGGACACTC AT |
|---|
| 225 | AAACCATTGC CGGTACGCTT AA |
|---|
| 226 | |
|---|
| 227 | --------------------------------------------------- |
|---|
| 228 | Phylip version 3.4 format (e.g., DNAML) |
|---|
| 229 | -- Both Interleaved and sequential are permitted |
|---|
| 230 | |
|---|
| 231 | 5 13 (# seqs, #bases) |
|---|
| 232 | Alpha AACGTGGCCAAAT |
|---|
| 233 | aaaagggccc... (continued sp. alpha) |
|---|
| 234 | Beta AAGGTCGCCAAAC |
|---|
| 235 | aaaagggccc... (continued sp. beta) |
|---|
| 236 | Gamma CATTTCGTCACAA |
|---|
| 237 | aaaagggccc... (continued sp. Gamma) |
|---|
| 238 | 1234567890^-- bases must start in col 11, and run 'til #bases |
|---|
| 239 | (spaces, newlines and numbers are ignored) |
|---|
| 240 | |
|---|
| 241 | --------------------------------------------------- |
|---|
| 242 | Gary Olsen (multiple) sequence editor /print format: |
|---|
| 243 | |
|---|
| 244 | !--------------------- |
|---|
| 245 | !17Oct91 -- error in original copy of olsen /print format, shifted right 1 space |
|---|
| 246 | ! here is correct copy: |
|---|
| 247 | 301 40 Tb.thiop CGCAGCGAAA----------GCUNUGCUAAUACCGCAUA-CGnCCUG----------------------------------------------------- Tb.thiop |
|---|
| 248 | 123456789012345678901 |
|---|
| 249 | 301 42 Rhc.purp CGUAGCGAAA----------GUUACGCUAAUACCGCAUA-UUCUGUG----------------------------------------------------- Rhc.purp |
|---|
| 250 | |
|---|
| 251 | 301 44 Rhc.gela nnngnCGAAA----------GCCGGAUUAAUACCGCAUA-CGACCUA----------------------------------------------------- Rhc.gela |
|---|
| 252 | !--------------------- |
|---|
| 253 | |
|---|
| 254 | RNase P RNA components. on 20-FEB-90 17:23:58 |
|---|
| 255 | |
|---|
| 256 | 1 (E.c. pr ): Base pairing in Escherichia coli RNase P RNA. |
|---|
| 257 | 2 (chrom ): Chromatium |
|---|
| 258 | : |
|---|
| 259 | 12 (B.brevis): Bacillus brevis RNase P RNA, B. James. |
|---|
| 260 | 13 ( 90% con): 90% conserved |
|---|
| 261 | 14 (100% con): 100% conserved |
|---|
| 262 | 15 (gram+ pr): pairing |
|---|
| 263 | |
|---|
| 264 | 1 |
|---|
| 265 | RNase P RNA components. on 20-FEB-90 17:23:58 |
|---|
| 266 | |
|---|
| 267 | Posi- Sequence |
|---|
| 268 | tion: identity: Data: |
|---|
| 269 | |
|---|
| 270 | 1 1 E.c. pr <<<<<<<<<< {{{{{{{{<<:<<<<<<<<<<^<<<<<<====>>>> E.c. pr |
|---|
| 271 | 1 2 chrom GGAGUCGGCCAGACAGUCGCUUCCGUCCU------------------ chrom |
|---|
| 272 | : |
|---|
| 273 | 1 12 B.brevis AUGCAGGAAAUGCGGGUAGCCGCUGCCGCAAUCGUCU------------- B.brevis |
|---|
| 274 | 1234567890123456789012 <! this should be 21 not 22, |
|---|
| 275 | ! this example must be inset on left by 1 space from olsen /print files ! |
|---|
| 276 | 1 13 90% con G C G A CGC GC - - 90% con |
|---|
| 277 | 1 14 100% con G A CGC 100% con |
|---|
| 278 | 1 15 gram+ pr <<<<<<<<<< {{{{{{{{<<<<<<<<<<<<<=============== gram+ pr |
|---|
| 279 | |
|---|
| 280 | 60 1 E.c. pr >>>>>>^>>^>>>>:>> <<<^<<<< {{{{{ E.c. pr |
|---|
| 281 | 60 2 chrom -----GGUG-ACGGGGGAGGAAAGUCCGG-GCUCCAU------------- chrom |
|---|
| 282 | : : |
|---|
| 283 | 60 10 B.stearo ----UU-CG-GCCGUAGAGGAAAGUCCAUGCUCGCACGGUGCUGAGAUGC B.stearo |
|---|
| 284 | |
|---|
| 285 | |
|---|
| 286 | --------------------------------------------------- |
|---|
| 287 | GCG MSF format |
|---|
| 288 | Title line |
|---|
| 289 | |
|---|
| 290 | picorna.msf MSF: 100 Type: P January 17, 1991 17:53 Check: 541 |
|---|
| 291 | .. |
|---|
| 292 | Name: Cb3 Len: 100 Check: 7009 Weight: 1.00 |
|---|
| 293 | Name: E Len: 100 Check: 60 Weight: 1.00 |
|---|
| 294 | |
|---|
| 295 | // |
|---|
| 296 | |
|---|
| 297 | 1 50 |
|---|
| 298 | Cb3 ...gpvedai .......t.. aaigr..vad tvgtgptnse aipaltaaet |
|---|
| 299 | E gvenae.kgv tentna.tad fvaqpvylpe .nqt...... kv.affynrs |
|---|
| 300 | |
|---|
| 301 | 51 100 |
|---|
| 302 | |
|---|
| 303 | Cb3 ghtsqvvpgd tmqtrhvkny hsrsestien flcrsacvyf teykn..... |
|---|
| 304 | E ...spi.gaf tvks...... gs.lesgfap .fsngtc.pn sviltpgpqf |
|---|
| 305 | |
|---|
| 306 | --------------------------------------------------- |
|---|
| 307 | PIR format |
|---|
| 308 | This is NBRF-PIR MAILSERVER version 1.45 |
|---|
| 309 | Command-> get PIR3:A31391 |
|---|
| 310 | \\\ |
|---|
| 311 | ENTRY A31391 #Type Protein |
|---|
| 312 | TITLE *Esterase-6 - Fruit fly (Drosophila melanogaster) |
|---|
| 313 | |
|---|
| 314 | DATE 03-Aug-1992 #Sequence 03-Aug-1992 #Text 03-Aug-1992 |
|---|
| 315 | PLACEMENT 0.0 0.0 0.0 0.0 0.0 |
|---|
| 316 | COMMENT *This entry is not verified. |
|---|
| 317 | SOURCE Drosophila melanogaster |
|---|
| 318 | |
|---|
| 319 | REFERENCE |
|---|
| 320 | #Authors Cooke P.H., Oakeshott J.G. |
|---|
| 321 | #Citation submitted to GenBank, April 1989 |
|---|
| 322 | #Reference-number A31391 |
|---|
| 323 | #Accession A31391 |
|---|
| 324 | #Cross-reference GB:J04167 |
|---|
| 325 | |
|---|
| 326 | SUMMARY #Molecular-weight 61125 #Length 544 #Checksum 1679 |
|---|
| 327 | SEQUENCE |
|---|
| 328 | 5 10 15 20 25 30 |
|---|
| 329 | 1 M N Y V G L G L I I V L S C L W L G S N A S D T D D P L L V |
|---|
| 330 | 31 Q L P Q G K L R G R D N G S Y Y S Y E S I P Y A E P P T G D |
|---|
| 331 | 61 L R F E A P E P Y K Q K W S D I F D A T K T P V A C L Q W D |
|---|
| 332 | 91 Q F T P G A N K L V G E E D C L T V S V Y K P K N S K R N S |
|---|
| 333 | 121 F P V V A H I H G G A F M F G A A W Q N G H E N V M R E G K |
|---|
| 334 | 151 F I L V K I S Y R L G P L G F V S T G D R D L P G N Y G L K |
|---|
| 335 | 181 D Q R L A L K W I K Q N I A S F G G E P Q N V L L V G H S A |
|---|
| 336 | 211 G G A S V H L Q M L R E D F G Q L A R A A F S F S G N A L D |
|---|
| 337 | 241 P W V I Q K G A R G R A F E L G R N V G C E S A E D S T S L |
|---|
| 338 | 271 K K C L K S K P A S E L V T A V R K F L I F S Y V P F A P F |
|---|
| 339 | 301 S P V L E P S D A P D A I I T Q D P R D V I K S G K F G Q V |
|---|
| 340 | 331 P W A V S Y V T E D G G Y N A A L L L K E R K S G I V I D D |
|---|
| 341 | 361 L N E R W L E L A P Y L L F Y R D T K T K K D M D D Y S R K |
|---|
| 342 | 391 I K Q E Y I G N Q R F D I E S Y S E L Q R L F T D I L F K N |
|---|
| 343 | 421 S T Q E S L D L H R K Y G K S P A Y A Y V Y D N P A E K G I |
|---|
| 344 | 451 A Q V L A N R T D Y D F G T V H G D D Y F L I F E N F V R D |
|---|
| 345 | 481 V E M R P D E Q I I S R N F I N M L A D F A S S D N G S L K |
|---|
| 346 | 511 Y G E C D F K D N V G S E K F Q L L A I Y I D G C Q N R Q H |
|---|
| 347 | 541 V E F P |
|---|
| 348 | /// |
|---|
| 349 | \\\ |
|---|
| 350 | --------------------------------------------------- |
|---|
| 351 | PAUP format: |
|---|
| 352 | The NEXUS Format |
|---|
| 353 | |
|---|
| 354 | Every block starts with "BEGIN blockname;" and ends with "END;". |
|---|
| 355 | Each block is composed of one or more statements, each |
|---|
| 356 | terminated by a semicolon (;). |
|---|
| 357 | |
|---|
| 358 | Comments may be included in NEXUS files by enclosing them within |
|---|
| 359 | square brackets, as in "[This is a comment]." |
|---|
| 360 | |
|---|
| 361 | NEXUS-conforming files are identified by a "#NEXUS" directive at |
|---|
| 362 | the very beginning of the file (line 1, column 1). If the |
|---|
| 363 | #NEXUS is omitted PAUP issues a warning but continues |
|---|
| 364 | processing. |
|---|
| 365 | |
|---|
| 366 | NEXUS files are entirely free-format. Blanks, tabs, and |
|---|
| 367 | newlines may be placed anywhere in the file. Unless RESPECTCASE |
|---|
| 368 | is requested, commands and data may be entered in upper case, |
|---|
| 369 | lower case, or a mixture of upper and lower case. |
|---|
| 370 | |
|---|
| 371 | The following conventions are used in the syntax descriptions of |
|---|
| 372 | the various blocks. Upper-case items are entered exactly as |
|---|
| 373 | shown. Lower-case items inside of angle brackets -- e.g., <x> |
|---|
| 374 | -- represent items to be substituted by the user. Items inside |
|---|
| 375 | of square brackets -- e.g., [X] -- are optional. Items inside |
|---|
| 376 | of curly braces and separated by vertical bars -- e.g., { X | Y |
|---|
| 377 | | Z } -- are mutually exclusive options. |
|---|
| 378 | |
|---|
| 379 | |
|---|
| 380 | The DATA Block |
|---|
| 381 | |
|---|
| 382 | The DATA block contains the data matrix and other associated |
|---|
| 383 | information. Its syntax is: |
|---|
| 384 | |
|---|
| 385 | BEGIN DATA; |
|---|
| 386 | DIMENSIONS NTAX=<number of taxa> NCHAR=<number of characters>; |
|---|
| 387 | [ FORMAT [ MISSING=<missing-symbol> ] |
|---|
| 388 | [ LABELPOS={ LEFT | RIGHT } ] |
|---|
| 389 | [ SYMBOLS="<symbols-list>" ] |
|---|
| 390 | [ INTERLEAVE ] |
|---|
| 391 | [ MATCHCHAR=<match-symbol> ] |
|---|
| 392 | [ EQUATE="<symbol>=<expansion> [<symbol>=<expansion>...]" ] |
|---|
| 393 | [ TRANSPOSE ] |
|---|
| 394 | [ RESPECTCASE ] |
|---|
| 395 | [ DATATYPE = { STANDARD | DNA | RNA | PROTEIN } ]; ] |
|---|
| 396 | [ OPTIONS [ IGNORE={ INVAR | UNINFORM } ] |
|---|
| 397 | [ MSTAXA = { UNCERTAIN | POLYMORPH | VARIABLE } ] |
|---|
| 398 | [ ZAP = "<list of zapped characters>" ] ; ] |
|---|
| 399 | [ CHARLABELS <label_1> <label_2> <label_NCHAR> ; ] |
|---|
| 400 | [ TAXLABELS <label1_1> <label1_2> <label1_NTAX> ; ] |
|---|
| 401 | [ STATELABELS <currently ignored by PAUP> ; ] |
|---|
| 402 | MATRIX <data-matrix> ; |
|---|
| 403 | END; |
|---|
| 404 | |
|---|
| 405 | --- example PAUP file |
|---|
| 406 | |
|---|
| 407 | #NEXUS |
|---|
| 408 | |
|---|
| 409 | [!Brown et al. (1982) primate mitochondrial DNA] |
|---|
| 410 | |
|---|
| 411 | begin data; |
|---|
| 412 | dimensions ntax=5 nchar=896; |
|---|
| 413 | format datatype=dna matchchar=. interleave missing='-'; |
|---|
| 414 | matrix |
|---|
| 415 | [ 2 4 6 8 ] |
|---|
| 416 | [ 1 1 1 1 1 ] |
|---|
| 417 | human aagcttcaccggcgcagtca ttctcataatcgcccacggR cttacatcctcattactatt ctgcctagcaaactcaaact acgaacgcactcacagtcgc |
|---|
| 418 | chimp ................a.t. .c.................a ...............t.... ..................t. .t........c......... |
|---|
| 419 | gorilla ..................tg ....t.....t........a ........a......t.... .................... .......a..c.....c... |
|---|
| 420 | orang ................ac.. cc.....g..t.....t..a ..c........cc....g.. .................... .......a..c.....c... |
|---|
| 421 | gibbon ......t..a..t...ac.g .c.................a ..a..c..t..cc.g..... ......t............. .......a........c... |
|---|
| 422 | |
|---|
| 423 | [ 8 8 8 8 8 8 ] |
|---|
| 424 | [ 0 2 4 6 8 9 ] |
|---|
| 425 | [ 1 1 1 1 1 6 ] |
|---|
| 426 | human cttccccacaacaatattca tgtgcctagaccaagaagtt attatctcgaactgacactg agccacaacccaaacaaccc agctctccctaagctt |
|---|
| 427 | chimp t................... .a................c. ........a.....g..... ...a................ ................ |
|---|
| 428 | gorilla ..................tc .a................c. ........a.g......... ...a.............tt. .a.............. |
|---|
| 429 | orang ta....a...........t. .c.......ga......acc ..cg..a.a......tg... .a.a..c.....g...cta. .a.....a........ |
|---|
| 430 | gibbon a..t.......t........ ....ac...........acc .....t..a........... .a.tg..........gctag .a.............. |
|---|
| 431 | ; |
|---|
| 432 | end; |
|---|
| 433 | --------------------------------------------------- |
|---|
| 434 | |
|---|
| 435 | |
|---|
| 436 | |
|---|
| 437 | |
|---|
| 438 | |
|---|
| 439 | |
|---|
| 440 | ||||||||||| Sample SMTP mail header |
|---|
| 441 | --------------------------------------------------- |
|---|
| 442 | |
|---|
| 443 | - - - - - - - - - |
|---|
| 444 | From GenBank-Retrieval-System@genbank.bio.net Sun Nov 10 17:28:56 1991 |
|---|
| 445 | Received: from genbank.bio.net by sunflower.bio.indiana.edu |
|---|
| 446 | (4.1/9.5jsm) id AA19328; Sun, 10 Nov 91 17:28:55 EST |
|---|
| 447 | Received: by genbank.bio.net (5.65/IG-2.0) |
|---|
| 448 | id AA14458; Sun, 10 Nov 91 14:30:03 -0800 |
|---|
| 449 | Date: Sun, 10 Nov 91 14:30:03 -0800 |
|---|
| 450 | Message-Id: <9111102230.AA14458@genbank.bio.net> |
|---|
| 451 | From: Database Server <GenBank-Retrieval-System@genbank.bio.net> |
|---|
| 452 | To: gilbertd@sunflower.bio.indiana.edu |
|---|
| 453 | Subject: Results of Query for drorna |
|---|
| 454 | Status: R |
|---|
| 455 | |
|---|
| 456 | No matches on drorna. |
|---|
| 457 | - - - - - - |
|---|
| 458 | From GenBank-Retrieval-System@genbank.bio.net Sun Nov 10 17:28:49 1991 |
|---|
| 459 | Received: from genbank.bio.net by sunflower.bio.indiana.edu |
|---|
| 460 | (4.1/9.5jsm) id AA19323; Sun, 10 Nov 91 17:28:47 EST |
|---|
| 461 | Received: by genbank.bio.net (5.65/IG-2.0) |
|---|
| 462 | id AA14461; Sun, 10 Nov 91 14:30:03 -0800 |
|---|
| 463 | Date: Sun, 10 Nov 91 14:30:03 -0800 |
|---|
| 464 | Message-Id: <9111102230.AA14461@genbank.bio.net> |
|---|
| 465 | From: Database Server <GenBank-Retrieval-System@genbank.bio.net> |
|---|
| 466 | To: gilbertd@sunflower.bio.indiana.edu |
|---|
| 467 | Subject: Results of Query for droest6 |
|---|
| 468 | Status: R |
|---|
| 469 | |
|---|
| 470 | LOCUS DROEST6 1819 bp ss-mRNA INV 31-AUG-1987 |
|---|
| 471 | DEFINITION D.melanogaster esterase-6 mRNA, complete cds. |
|---|
| 472 | ACCESSION M15961 |
|---|
| 473 | |
|---|
| 474 | |
|---|
| 475 | |
|---|
| 476 | |
|---|
| 477 | |
|---|
| 478 | |
|---|
| 479 | |
|---|
| 480 | |
|---|
| 481 | |
|---|
| 482 | |
|---|
| 483 | |
|---|
| 484 | |
|---|
| 485 | ||||||||||| GCG manual discussion of sequence symbols: |
|---|
| 486 | --------------------------------------------------- |
|---|
| 487 | |
|---|
| 488 | III_SEQUENCE_SYMBOLS |
|---|
| 489 | |
|---|
| 490 | |
|---|
| 491 | GCG programs allow all upper and lower case letters, periods (.), |
|---|
| 492 | asterisks (*), pluses (+), ampersands (&), and ats (@) as symbols in |
|---|
| 493 | biological sequences. Nucleotide symbols, their complements, and the |
|---|
| 494 | standard one-letter amino acid symbols are shown below in separate lists. |
|---|
| 495 | The meanings of the symbols +, &, and @ have not been assigned at this |
|---|
| 496 | writing (March, 1989). |
|---|
| 497 | |
|---|
| 498 | GCG uses the letter codes for amino acid codes and nucleotide |
|---|
| 499 | ambiguity proposed by IUB (Nomenclature Committee, 1985, |
|---|
| 500 | Eur. J. Biochem. 150; 1-5). These codes are compatible with the codes |
|---|
| 501 | used by the EMBL, GenBank, and NBRF data libraries. |
|---|
| 502 | |
|---|
| 503 | |
|---|
| 504 | NUCLEOTIDES |
|---|
| 505 | |
|---|
| 506 | The meaning of each symbol, its complement, and the Cambridge and |
|---|
| 507 | Stanford equivalents are shown below. Cambridge files can be converted |
|---|
| 508 | into GCG files and vice versa with the programs FROMSTADEN and TOSTADEN. |
|---|
| 509 | IntelliGenetics sequence files can be interconverted with the programs |
|---|
| 510 | FROMIG and TOIG. |
|---|
| 511 | |
|---|
| 512 | IUB/GCG Meaning Complement Staden/Sanger Stanford |
|---|
| 513 | |
|---|
| 514 | A A T A A |
|---|
| 515 | C C G C C |
|---|
| 516 | G G C G G |
|---|
| 517 | T/U T A T T/U |
|---|
| 518 | M A or C K 5 J |
|---|
| 519 | R A or G Y R R |
|---|
| 520 | W A or T W 7 L |
|---|
| 521 | S C or G S 8 M |
|---|
| 522 | Y C or T R Y Y |
|---|
| 523 | K G or T M 6 K |
|---|
| 524 | V A or C or G B not supported N |
|---|
| 525 | H A or C or T D not supported N |
|---|
| 526 | D A or G or T H not supported N |
|---|
| 527 | B C or G or T V not supported N |
|---|
| 528 | X/N G or A or T or C X -/X N |
|---|
| 529 | . not G or A or T or C . not supported ? |
|---|
| 530 | |
|---|
| 531 | |
|---|
| 532 | The frame ambiguity codes used by Staden are not supported by GCG |
|---|
| 533 | and are translated by FROMSTADEN as the lower case single base |
|---|
| 534 | equivalent. |
|---|
| 535 | |
|---|
| 536 | Staden Code Meaning GCG |
|---|
| 537 | |
|---|
| 538 | D C or CC c |
|---|
| 539 | V T or TT t |
|---|
| 540 | B A or AA a |
|---|
| 541 | H G or GG g |
|---|
| 542 | K C or CX c |
|---|
| 543 | L T or TX t |
|---|
| 544 | M A or AX a |
|---|
| 545 | N G or GX g |
|---|
| 546 | |
|---|
| 547 | |
|---|
| 548 | AMINO ACIDS |
|---|
| 549 | |
|---|
| 550 | Here is a list of the standard one-letter amino acid codes and their |
|---|
| 551 | three-letter equivalents. The synonymous codons and their depiction in |
|---|
| 552 | the IUB codes are shown. You should recognize that the codons following |
|---|
| 553 | semicolons (;) are not sufficiently specific to define a single amino |
|---|
| 554 | acid even though they represent the best possible back translation into |
|---|
| 555 | the IUB codes! All of the relationships in this list can be redefined by |
|---|
| 556 | the user in a local data file described below. |
|---|
| 557 | |
|---|
| 558 | IUB |
|---|
| 559 | Symbol 3-letter Meaning Codons Depiction |
|---|
| 560 | A Ala Alanine GCT,GCC,GCA,GCG !GCX |
|---|
| 561 | B Asp,Asn Aspartic, |
|---|
| 562 | Asparagine GAT,GAC,AAT,AAC !RAY |
|---|
| 563 | C Cys Cysteine TGT,TGC !TGY |
|---|
| 564 | D Asp Aspartic GAT,GAC !GAY |
|---|
| 565 | E Glu Glutamic GAA,GAG !GAR |
|---|
| 566 | F Phe Phenylalanine TTT,TTC !TTY |
|---|
| 567 | G Gly Glycine GGT,GGC,GGA,GGG !GGX |
|---|
| 568 | H His Histidine CAT,CAC !CAY |
|---|
| 569 | I Ile Isoleucine ATT,ATC,ATA !ATH |
|---|
| 570 | K Lys Lysine AAA,AAG !AAR |
|---|
| 571 | L Leu Leucine TTG,TTA,CTT,CTC,CTA,CTG |
|---|
| 572 | !TTR,CTX,YTR;YTX |
|---|
| 573 | M Met Methionine ATG !ATG |
|---|
| 574 | N Asn Asparagine AAT,AAC !AAY |
|---|
| 575 | P Pro Proline CCT,CCC,CCA,CCG !CCX |
|---|
| 576 | Q Gln Glutamine CAA,CAG !CAR |
|---|
| 577 | R Arg Arginine CGT,CGC,CGA,CGG,AGA,AGG |
|---|
| 578 | !CGX,AGR,MGR;MGX |
|---|
| 579 | S Ser Serine TCT,TCC,TCA,TCG,AGT,AGC !TCX,AGY;WSX |
|---|
| 580 | T Thr Threonine ACT,ACC,ACA,ACG !ACX |
|---|
| 581 | V Val Valine GTT,GTC,GTA,GTG !GTX |
|---|
| 582 | W Trp Tryptophan TGG !TGG |
|---|
| 583 | X Xxx Unknown !XXX |
|---|
| 584 | Y Tyr Tyrosine TAT, TAC !TAY |
|---|
| 585 | Z Glu,Gln Glutamic, |
|---|
| 586 | Glutamine GAA,GAG,CAA,CAG !SAR |
|---|
| 587 | * End Terminator TAA, TAG, TGA !TAR,TRA;TRR |
|---|
| 588 | |
|---|
| 589 | |
|---|
| 590 | |
|---|
| 591 | |
|---|
| 592 | |
|---|
| 593 | |
|---|
| 594 | |
|---|
| 595 | |
|---|
| 596 | ||||||||||| docs from PSC on sequence formats: |
|---|
| 597 | --------------------------------------------------- |
|---|
| 598 | |
|---|
| 599 | |
|---|
| 600 | Nucleic Acid and Protein Sequence File Formats |
|---|
| 601 | |
|---|
| 602 | |
|---|
| 603 | It will probably save you some time if you have your data in a usable |
|---|
| 604 | format before you send it to us. However, we do have the University of |
|---|
| 605 | Wisconsin Genetics Computing Group programs running on our VAXen and |
|---|
| 606 | this package includes several reformatting utilities. Our programs |
|---|
| 607 | usually recognize any of several standard formats, including GenBank, |
|---|
| 608 | EMBL, NBRF, and MolGen/Stanford. For the purposes of annotating an |
|---|
| 609 | analysis we find the GenBank and EMBL formats most useful, particularly |
|---|
| 610 | if you have already received an accession number from one of these |
|---|
| 611 | organizations for your sequence. |
|---|
| 612 | |
|---|
| 613 | Our programs do not require that all of the line types available in |
|---|
| 614 | GenBank, EMBL, or NBRF file formats be present for the file format to |
|---|
| 615 | be recognized and processed. The following pages outline the essential |
|---|
| 616 | details required for correct processing of files by our programs. |
|---|
| 617 | Additional information may be present but will generally be ignored. |
|---|
| 618 | |
|---|
| 619 | |
|---|
| 620 | GenBank File Format |
|---|
| 621 | |
|---|
| 622 | File Header |
|---|
| 623 | |
|---|
| 624 | 1. The first line in the file must have "GENETIC SEQUENCE DATA BANK" |
|---|
| 625 | in spaces 20 through 46 (see LINE 1, below). |
|---|
| 626 | 2. The next 8 lines may contain arbitrary text. They are ignored but |
|---|
| 627 | are required to maintain the GenBank format (see LINE 2 - LINE 9). |
|---|
| 628 | |
|---|
| 629 | Sequence Data Entries |
|---|
| 630 | |
|---|
| 631 | 3. Each sequence entry in the file should have the following format. |
|---|
| 632 | |
|---|
| 633 | a) first line: |
|---|
| 634 | |
|---|
| 635 | Must have LOCUS in the first 5 spaces. The |
|---|
| 636 | genetic locus name or identifier must be in spaces |
|---|
| 637 | 13 - 22. The length of the sequence is right |
|---|
| 638 | justified in spaces 23 through 29 (see LINE 10). |
|---|
| 639 | |
|---|
| 640 | b) second line: |
|---|
| 641 | |
|---|
| 642 | Must have DEFINITION in the first 10 spaces. |
|---|
| 643 | Spaces 13 - 80 are free form text to identify the |
|---|
| 644 | sequence (see LINE 11). |
|---|
| 645 | |
|---|
| 646 | c) third line: |
|---|
| 647 | |
|---|
| 648 | Must have ACCESSION in the first 9 spaces. Spaces |
|---|
| 649 | 13 - 18 must hold the primary accession number |
|---|
| 650 | (see LINE 12). |
|---|
| 651 | |
|---|
| 652 | d) fourth line: |
|---|
| 653 | |
|---|
| 654 | Must have ORIGIN in the first 6 spaces. Nothing |
|---|
| 655 | else is required on this line, it indicates that |
|---|
| 656 | the nucleic acid sequence begins on the next line |
|---|
| 657 | (see LINE 13). |
|---|
| 658 | |
|---|
| 659 | e) fifth line: |
|---|
| 660 | |
|---|
| 661 | Begins the nucleotide sequence. The first 9 |
|---|
| 662 | spaces of each sequence line may either be blank |
|---|
| 663 | or may contain the position in the sequence of the |
|---|
| 664 | first nucleotide on the line. The next 66 spaces |
|---|
| 665 | hold the nucleotide sequence in six blocks of ten |
|---|
| 666 | nucleotides. Each of the six blocks begins with a |
|---|
| 667 | blank space followed by ten nucleotides. Thus the |
|---|
| 668 | first nucleotide is in space eleven of the line while |
|---|
| 669 | the last is in space 75 (see LINE 14, LINE 15). |
|---|
| 670 | |
|---|
| 671 | f) last line: |
|---|
| 672 | |
|---|
| 673 | Must have // in the first 2 spaces to indicate |
|---|
| 674 | termination of the sequence (see LINE 16). |
|---|
| 675 | |
|---|
| 676 | NOTE: Multiple sequences may appear in each file. To begin another |
|---|
| 677 | sequence go back to a) and start again. |
|---|
| 678 | |
|---|
| 679 | |
|---|
| 680 | Example GenBank file |
|---|
| 681 | |
|---|
| 682 | |
|---|
| 683 | LINE 1 : GENETIC SEQUENCE DATA BANK |
|---|
| 684 | LINE 2 : |
|---|
| 685 | LINE 3 : |
|---|
| 686 | LINE 4 : |
|---|
| 687 | LINE 5 : |
|---|
| 688 | LINE 6 : |
|---|
| 689 | LINE 7 : |
|---|
| 690 | LINE 8 : |
|---|
| 691 | LINE 9 : |
|---|
| 692 | LINE 10 :LOCUS L_Name Length BP |
|---|
| 693 | LINE 11 :DEFINITION Describe the sequence any way you want |
|---|
| 694 | LINE 12 :ACCESSION Accession Number |
|---|
| 695 | LINE 13 :ORIGIN |
|---|
| 696 | LINE 14 : 1 acgtacgtac gtacgtacgt acgtacgtac gtacgtacgt a... |
|---|
| 697 | LINE 15 : 61 acgt... |
|---|
| 698 | LINE 16 :// |
|---|
| 699 | |
|---|
| 700 | |
|---|
| 701 | |
|---|
| 702 | EMBL File Format |
|---|
| 703 | |
|---|
| 704 | Unlike the GenBank file format the EMBL file format does not require |
|---|
| 705 | a series of header lines. Thus the first line in the file begins |
|---|
| 706 | the first sequence entry of the file. |
|---|
| 707 | |
|---|
| 708 | 1. The first line of each sequence entry contains the two letters ID |
|---|
| 709 | in the first two spaces. This is followed by the EMBL identifier |
|---|
| 710 | in spaces 6 through 14. (See LINE 1). |
|---|
| 711 | |
|---|
| 712 | 2. The second line of each sequence entry has the two letters AC in |
|---|
| 713 | the first two spaces. This is followed by the accession number in |
|---|
| 714 | spaces 6 through 11. (See LINE 2). |
|---|
| 715 | |
|---|
| 716 | 3. The third line of each sequence entry has the two letters DE in the |
|---|
| 717 | first two spaces. This is followed by a free form text definition |
|---|
| 718 | in spaces 6 through 72. (See LINE 3). |
|---|
| 719 | |
|---|
| 720 | 4. The fourth line in each sequence entry has the two letters SQ in |
|---|
| 721 | the first two spaces. This is followed by the length of the |
|---|
| 722 | sequence beginning at or after space 13. After the sequence length |
|---|
| 723 | there is a blank space and the two letters BP. (See LINE 4). |
|---|
| 724 | |
|---|
| 725 | 5. The nucleotide sequence begins on the fifth line of the sequence |
|---|
| 726 | entry. Each line of sequence begins with four blank spaces. The |
|---|
| 727 | next 66 spaces hold the nucleotide sequence in six blocks of ten |
|---|
| 728 | nucleotides. Each of the six blocks begins with a blank space |
|---|
| 729 | followed by ten nucleotides. Thus the first nucleotide is in space |
|---|
| 730 | 6 of the line while the last is in space 70. (See LINE 5 - |
|---|
| 731 | LINE 6). |
|---|
| 732 | |
|---|
| 733 | 6. The last line of each sequence entry in the file is a terminator |
|---|
| 734 | line which has the two characters // in the first two spaces. |
|---|
| 735 | (See LINE 7). |
|---|
| 736 | |
|---|
| 737 | 7. Multiple sequences may appear in each file. To begin another |
|---|
| 738 | sequence go back to item 1 and start again. |
|---|
| 739 | |
|---|
| 740 | |
|---|
| 741 | Example EMBL file |
|---|
| 742 | |
|---|
| 743 | LINE 1 :ID ID_name |
|---|
| 744 | LINE 2 :AC Accession number |
|---|
| 745 | LINE 3 :DE Describe the sequence any way you want |
|---|
| 746 | LINE 4 :SQ Length BP |
|---|
| 747 | LINE 5 : ACGTACGTAC GTACGTACGT ACGTACGTAC GTACGTA... |
|---|
| 748 | LINE 6 : ACGT... |
|---|
| 749 | LINE 7 :// |
|---|
| 750 | |
|---|
| 751 | |
|---|
| 752 | |
|---|
| 753 | NBRF (protein or nucleic acid) File Format |
|---|
| 754 | |
|---|
| 755 | 1. The first line of each sequence entry begins with a greater than |
|---|
| 756 | symbol, >. This is immediately followed by the two character |
|---|
| 757 | sequence type specifier. Space four must contain a semi-colon. |
|---|
| 758 | Beginning in space five is the sequence name or identification code |
|---|
| 759 | for the NBRF database. The code is from four to six letters and |
|---|
| 760 | numbers. (See LINE 1). |
|---|
| 761 | |
|---|
| 762 | !!!! >> add these to readseq |
|---|
| 763 | Specifier Sequence type |
|---|
| 764 | |
|---|
| 765 | P1 protein, complete |
|---|
| 766 | F1 protein, fragment |
|---|
| 767 | DL DNA, linear |
|---|
| 768 | DC DNA, circular |
|---|
| 769 | RL RNA, linear |
|---|
| 770 | RC RNA, circular |
|---|
| 771 | N1 functional RNA, other than tRNA |
|---|
| 772 | N3 tRNA |
|---|
| 773 | |
|---|
| 774 | 2. The second line of each sequence entry contains two kinds of |
|---|
| 775 | information. First is the sequence name which is separated from |
|---|
| 776 | the organism or organelle name by the three character sequence |
|---|
| 777 | blank space, dash, blank space, " - ". There is no special |
|---|
| 778 | character marking the beginning of this line. (See LINE 2). |
|---|
| 779 | |
|---|
| 780 | 3. Either the amino acid or nucleic acid sequence begins on line three |
|---|
| 781 | and can begin in any space, including the first. The sequence is |
|---|
| 782 | free format and may be interrupted by blanks for ease of reading. |
|---|
| 783 | Protein sequences may contain special punctuation to indicate |
|---|
| 784 | various indeterminacies in the sequence. In the NBRF data files |
|---|
| 785 | all lines may be up to 500 characters long. However some PSC |
|---|
| 786 | programs currently have a limit of 130 characters per line |
|---|
| 787 | (including blanks), and BitNet will not accept lines of over eighty |
|---|
| 788 | characters. (See LINE 3, LINE 4, and LINE 5). |
|---|
| 789 | |
|---|
| 790 | The last character in the sequence must be an asterisk, *. |
|---|
| 791 | |
|---|
| 792 | Example NBRF file |
|---|
| 793 | |
|---|
| 794 | LINE 1 :>P1;CBRT |
|---|
| 795 | LINE 2 :Cytochrome b - Rat mitochondrion (SGC1) |
|---|
| 796 | LINE 3 :M T N I R K S H P L F K I I N H S F I D L P A P S |
|---|
| 797 | LINE 4 : VTHICRDVN Y GWL IRY |
|---|
| 798 | LINE 5 :TWIGGQPVEHPFIIIGQLASISYFSIILILMPISGIVEDKMLKWN* |
|---|
| 799 | |
|---|
| 800 | |
|---|
| 801 | |
|---|
| 802 | MolGen/Stanford File Format |
|---|
| 803 | |
|---|
| 804 | 1. The first line in a sequence file is a comment line. This line |
|---|
| 805 | begins with a semi-colon in the first space. This line need |
|---|
| 806 | not be present. If it is present it holds descriptive text. |
|---|
| 807 | There may be as many comment lines as desired at the start of |
|---|
| 808 | the sequence file. (See LINE 1). |
|---|
| 809 | |
|---|
| 810 | 2. The second line must be present and contains an identifier or |
|---|
| 811 | name for the sequence in the first ten spaces. (See LINE 2). |
|---|
| 812 | |
|---|
| 813 | 3. The sequence begins on the third line and occupies up to eighty |
|---|
| 814 | spaces. Spaces may be included in the sequence for ease of |
|---|
| 815 | reading. The sequence continues for as many lines as needed |
|---|
| 816 | and is terminated with a 1 or 2. 1 indicates a linear sequence |
|---|
| 817 | while 2 marks a circular sequence. (See LINE 3 and LINE 4). |
|---|
| 818 | |
|---|
| 819 | Example MolGen/Stanford file |
|---|
| 820 | |
|---|
| 821 | LINE 1 :; Describe the sequence any way you want |
|---|
| 822 | LINE 2 :ECTRNAGLY2 |
|---|
| 823 | LINE 3 :ACGCACGTAC ACGTACGTAC A C G T C C G T ACG TAC GTA CGT |
|---|
| 824 | LINE 4 : GCTTA GG G C T A1 |
|---|
| 825 | |
|---|
| 826 | |
|---|
| 827 | |
|---|
| 828 | |
|---|
| 829 | ||||||||||| Phylip file format |
|---|
| 830 | --------------------------------------------------- |
|---|
| 831 | |
|---|
| 832 | Phylip 3.3 File Format (DNA sequences) |
|---|
| 833 | |
|---|
| 834 | |
|---|
| 835 | The input and output formats for PROTPARS and for RESTML are described in |
|---|
| 836 | their document files. In general their input formats are similar to those |
|---|
| 837 | described here, except that the one-letter codes for data are specific to those |
|---|
| 838 | programs and are described in those document files. Since the input formats |
|---|
| 839 | for the eight DNA sequence programs apply to all eight, they are described |
|---|
| 840 | here. Their input formats are standard: the data have A's, G's, C's and T's |
|---|
| 841 | (or U's). The first line of the input file contains the number of species and |
|---|
| 842 | the number of sites. As with the other programs, options information may |
|---|
| 843 | follow this. In the case of DNAML, DNAMLK, and DNADIST an additional line |
|---|
| 844 | (described in the document file for these programs) may follow the first one. |
|---|
| 845 | Following this, each species starts on a new line. The first 10 characters of |
|---|
| 846 | that line are the species name. There then follows the base sequence of that |
|---|
| 847 | species, each character being one of the letters A, B, C, D, G, H, K, M, N, O, |
|---|
| 848 | R, S, T, U, V, W, X, Y, ?, or - (a period was also previously allowed but it is |
|---|
| 849 | no longer allowed, because it is sometimes used in aligned sequences to mean |
|---|
| 850 | "the same as the sequence above"). Blanks will be ignored, and so will |
|---|
| 851 | numerical digits. This allows GENBANK and EMBL sequence entries to be read |
|---|
| 852 | with minimum editing. |
|---|
| 853 | |
|---|
| 854 | These characters can be either upper or lower case. The algorithms |
|---|
| 855 | convert all input characters to upper case (which is how they are treated). |
|---|
| 856 | The characters constitute the IUPAC (IUB) nucleic acid code plus some slight |
|---|
| 857 | extensions. They enable input of nucleic acid sequences taking full account of |
|---|
| 858 | any ambiguities in the sequence. |
|---|
| 859 | |
|---|
| 860 | The sequences can continue over multiple lines; when this is done the sequences |
|---|
| 861 | must be either in "interleaved" format, similar to the output of alignment |
|---|
| 862 | programs, or "sequential" format. These are described in the main document |
|---|
| 863 | file. In sequential format all of one sequence is given, possibly on multiple |
|---|
| 864 | lines, before the next starts. In interleaved format the first part of the |
|---|
| 865 | file should contain the first part of each of the sequences, then possibly a |
|---|
| 866 | line containing nothing but a carriage-return character, then the second part |
|---|
| 867 | of each sequence, and so on. Only the first parts of the sequences should be |
|---|
| 868 | preceded by names. Here is a hypothetical example of interleaved format: |
|---|
| 869 | |
|---|
| 870 | 5 42 |
|---|
| 871 | Turkey AAGCTNGGGC ATTTCAGGGT |
|---|
| 872 | Salmo gairAAGCCTTGGC AGTGCAGGGT |
|---|
| 873 | H. SapiensACCGGTTGGC CGTTCAGGGT |
|---|
| 874 | Chimp AAACCCTTGC CGTTACGCTT |
|---|
| 875 | Gorilla AAACCCTTGC CGGTACGCTT |
|---|
| 876 | |
|---|
| 877 | GAGCCCGGGC AATACAGGGT AT |
|---|
| 878 | GAGCCGTGGC CGGGCACGGT AT |
|---|
| 879 | ACAGGTTGGC CGTTCAGGGT AA |
|---|
| 880 | AAACCGAGGC CGGGACACTC AT |
|---|
| 881 | AAACCATTGC CGGTACGCTT AA |
|---|
| 882 | |
|---|
| 883 | while in sequential format the same sequences would be: |
|---|
| 884 | |
|---|
| 885 | 5 42 |
|---|
| 886 | Turkey AAGCTNGGGC ATTTCAGGGT |
|---|
| 887 | GAGCCCGGGC AATACAGGGT AT |
|---|
| 888 | Salmo gairAAGCCTTGGC AGTGCAGGGT |
|---|
| 889 | GAGCCGTGGC CGGGCACGGT AT |
|---|
| 890 | H. SapiensACCGGTTGGC CGTTCAGGGT |
|---|
| 891 | ACAGGTTGGC CGTTCAGGGT AA |
|---|
| 892 | Chimp AAACCCTTGC CGTTACGCTT |
|---|
| 893 | AAACCGAGGC CGGGACACTC AT |
|---|
| 894 | Gorilla AAACCCTTGC CGGTACGCTT |
|---|
| 895 | AAACCATTGC CGGTACGCTT AA |
|---|
| 896 | |
|---|
| 897 | |
|---|
| 898 | Note, of course, that a portion of a sequence like this: |
|---|
| 899 | |
|---|
| 900 | 300 AAGCGTGAAC GTTGTACTAA TRCAG |
|---|
| 901 | |
|---|
| 902 | is perfectly legal, assuming that the species name has gone before, and is |
|---|
| 903 | filled out to full length by blanks. The above digits and blanks will be |
|---|
| 904 | ignored, the sequence being taken as starting at the first base symbol (in this |
|---|
| 905 | case an A). |
|---|
| 906 | |
|---|
| 907 | The present versions of the programs may sometimes have difficulties with |
|---|
| 908 | the blank lines between groups of lines, and if so you might want to retype |
|---|
| 909 | those lines, making sure that they have only a carriage-return and no blank |
|---|
| 910 | characters on them, or you may perhaps have to eliminate them. The symptoms of |
|---|
| 911 | this problem are that the programs complain that the sequences are not properly |
|---|
| 912 | aligned, and you can find no other cause for this complaint. |
|---|
| 913 | |
|---|
| 914 | ------------------------------------------------ |
|---|
| 915 | |
|---|
| 916 | |
|---|
| 917 | ||||||||||| ASN.1 file format |
|---|
| 918 | --------------------------------------------------- |
|---|
| 919 | |
|---|
| 920 | |
|---|
| 921 | ASN.1 -- see NCBI toolkit docs, source and examples (ncbi.nlm.nih.gov) |
|---|
| 922 | |
|---|
| 923 | Example asn.1 sequence file---- |
|---|
| 924 | |
|---|
| 925 | Bioseq-set ::= { |
|---|
| 926 | seq-set { |
|---|
| 927 | seq { |
|---|
| 928 | id { local id 1 } , -- id essential |
|---|
| 929 | descr { title "Dummy sequence data from nowhere" } , -- optional |
|---|
| 930 | inst { -- inst essential |
|---|
| 931 | repr raw , |
|---|
| 932 | mol dna , |
|---|
| 933 | length 156 , |
|---|
| 934 | topology linear , |
|---|
| 935 | seq-data |
|---|
| 936 | iupacna "GAATTCATTTTTGAAACAAATCGACCTGACGACGGAATGGTACTCGAATTA |
|---|
| 937 | TGGGCCAAAGGGTTTTATGGGACAAATTAATAGGTGTTCATTATATGCCACTTTCGGAGATTAGATACAGCAATGCAG |
|---|
| 938 | TGGATTCAAAGCAATAGAGTTGTTCTT" |
|---|
| 939 | } } , |
|---|
| 940 | |
|---|
| 941 | seq { |
|---|
| 942 | id { local id 2 } , |
|---|
| 943 | descr { title "Dummy sequence 2 data from somewhere else" } , |
|---|
| 944 | inst { |
|---|
| 945 | repr raw , |
|---|
| 946 | mol dna , |
|---|
| 947 | length 150 , |
|---|
| 948 | topology linear , |
|---|
| 949 | seq-data |
|---|
| 950 | iupacna "TTTTTTTTTTTTGAAACAAATCGACCTGACGACGGAATGGTACTCGAATTA |
|---|
| 951 | TGGGCCAAAGGGTTTTATGGGACAAATTAATAGGTGTTCATTATATGCCACTTTCGGAGATTAGATACAGCAATGCAG |
|---|
| 952 | TGGATTCAAAGCAATAGAGTT" |
|---|
| 953 | } |
|---|
| 954 | } |
|---|
| 955 | } |
|---|
| 956 | } |
|---|
| 957 | |
|---|
| 958 | |
|---|
| 959 | partial ASN.1 description from toolkit |
|---|
| 960 | |
|---|
| 961 | Bioseq ::= SEQUENCE { |
|---|
| 962 | id SET OF Seq-id , -- equivalent identifiers |
|---|
| 963 | descr Seq-descr OPTIONAL , -- descriptors |
|---|
| 964 | inst Seq-inst , -- the sequence data |
|---|
| 965 | annot SET OF Seq-annot OPTIONAL } |
|---|
| 966 | |
|---|
| 967 | Seq-inst ::= SEQUENCE { -- the sequence data itself |
|---|
| 968 | repr ENUMERATED { -- representation class |
|---|
| 969 | not-set (0) , -- empty |
|---|
| 970 | virtual (1) , -- no seq data |
|---|
| 971 | raw (2) , -- continuous sequence |
|---|
| 972 | seg (3) , -- segmented sequence |
|---|
| 973 | const (4) , -- constructed sequence |
|---|
| 974 | ref (5) , -- reference to another sequence |
|---|
| 975 | consen (6) , -- consensus sequence or pattern |
|---|
| 976 | map (7) , -- ordered map (genetic, restriction) |
|---|
| 977 | other (255) } , |
|---|
| 978 | mol ENUMERATED { -- molecule class in living organism |
|---|
| 979 | not-set (0) , -- > cdna = rna |
|---|
| 980 | dna (1) , |
|---|
| 981 | rna (2) , |
|---|
| 982 | aa (3) , |
|---|
| 983 | na (4) , -- just a nucleic acid |
|---|
| 984 | other (255) } , |
|---|
| 985 | length INTEGER OPTIONAL , -- length of sequence in residues |
|---|
| 986 | fuzz Int-fuzz OPTIONAL , -- length uncertainty |
|---|
| 987 | topology ENUMERATED { -- topology of molecule |
|---|
| 988 | not-set (0) , |
|---|
| 989 | linear (1) , |
|---|
| 990 | circular (2) , |
|---|
| 991 | tandem (3) , -- some part of tandem repeat |
|---|
| 992 | other (255) } DEFAULT linear , |
|---|
| 993 | strand ENUMERATED { -- strandedness in living organism |
|---|
| 994 | not-set (0) , |
|---|
| 995 | ss (1) , -- single strand |
|---|
| 996 | ds (2) , -- double strand |
|---|
| 997 | mixed (3) , |
|---|
| 998 | other (255) } OPTIONAL , -- default ds for DNA, ss for RNA, pept |
|---|
| 999 | seq-data Seq-data OPTIONAL , -- the sequence |
|---|
| 1000 | ext Seq-ext OPTIONAL , -- extensions for special types |
|---|
| 1001 | hist Seq-hist OPTIONAL } -- sequence history |
|---|
| 1002 | |
|---|
| 1003 | ------------------------------------------------ |
|---|