source: tags/ms_r16q3/READSEQ/Formats

Last change on this file was 10842, checked in by westram, 10 years ago
  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 38.3 KB
Line 
1[file has been slightly modified to fit into arb help]
2
3||||||||||| ReadSeq supported formats   (revised 30Dec92)
4--------------------------------------------------------
5
6    -f[ormat=]Name Format name for output:
7       |  1. IG/Stanford           10. Olsen (in-only)
8       |  2. GenBank/GB            11. Phylip3.2
9       |  3. NBRF                  12. Phylip
10       |  4. EMBL                  13. Plain/Raw
11       |  5. GCG                   14. PIR/CODATA
12       |  6. DNAStrider            15. MSF
13       |  7. Fitch                 16. ASN.1
14       |  8. Pearson/Fasta         17. PAUP
15       |  9. Zuker (in-only)       18. Pretty (out-only)
16
17In general, output supports only minimal subsets of each format
18needed for sequence data exchanges.  Features, descriptions
19and other format-unique information are discarded.
20
21For users of the Olsen multi-sequence editor (VMS):  The Olsen format
22here is produced with the print command:
23  print/out=some.file
24Use Genbank output from readseq to produce a format that this
25editor can read, and use the command
26  load/genbank some.file
27Dan Davison has a VMS program that will convert to/from the
28Olsen native binary data format.  E-mail davison@uh.edu
29
30Warning: Phylip format input is now supported (30Dec92), however the
31auto-detection of Phylip format is very probabilistic and messy,
32especially distinguishing sequential from interleaved versions. It
33is not recommended that one use readseq to convert files from Phylip
34format to others unless essential.
35
36
37
38||||||||||| ReadSeq usage             (revised 11Nov91)
39--------------------------------------------------------
40
41A. determine file format:
42
43        short skiplines;  /* result: number of header lines to skip (or 0) */
44        short error;      /* error result or 0 */
45        short format;     /* resulting format code, see ureadseq.h */
46        char  *filename   = "Mysequence.file";
47
48        format = seqFileFormat( filename, &skiplines, &error);
49        if (error!=0) fail;
50
51B. read number and list of sequences (optional)
52
53        short numseqs;    /* resulting number of sequences found in file */
54        char  *seqlist;   /* list of sequence names, newline separated, 0 terminated */
55
56        seqlist = listSeqs( filename, skiplines, format, &numseqs, &error);
57        if (error==0)  display (seqlist);
58        free( seqlist);
59
60C.  read individual sequences as desired
61
62        short seqIndex;   /* sequence index #, or == kListSeqs for listSeqs equivalent */
63        long  seqlen;     /* length of seq */
64        char  seqid[256]; /* sequence name */
65        char  *seq;       /* sequence, 0 terminated, free when done */
66
67        seq = readSeq( seqIndex, filename, skiplines, format,
68                      &seqlen, &numseqs, &error, seqid);
69        if (error==0) manipulate(seq);
70        free(seq);
71
72D. write sequences as desired
73
74        int nlines;     /* number of lines of sequence written */
75        FILE* fout;     /* open file pointer (stdout or other) */
76        short outform;  /* output format, see ureadseq.h */
77
78        nlines = writeSeq( fout, seq, seqlen, format, outform, seqid);
79
80
81Note (30Dec92): There is various processing done by the main program (in readseq.c),
82  rather than just in the subroutines (in ureadseq.c).  Especially for interleaved
83  output formats, the writeSeq subroutine does not handle interleaving, nor some of
84  the formatting at the top and end of output files.  While seqFileFormat, listSeqs,
85  and readSeq subroutines are fairly self-contained, the writeSeq depends a lot on
86  auxiliary processing.  At some point, this may be revised so writeSeq is
87  self-contained.
88
89Note 2: The NCBI toolkit (ftp from ncbi.nlm.nih.gov) is needed for the ASN.1 format
90  reading (see ureadasn.c).  A bastard (but workable I hope) ASN.1 format is written
91  by writeSeq alone.
92
93
94
95|||||||||||  sequence formats....
96---------------------------------------------------
97
98stanford/IG
99;comments
100;...
101seq1 info
102abcd...
103efgh1 (or 2 = terminator)
104;another seq
105;....
106seq2 info
107abcd...1
108--- for e.g. ----
109;     Dro5s-T.Seq  Length: 120  April 6, 1989  21:22  Check: 9487  ..
110dro5stseq
111GCCAACGACCAUACCACGCUGAAUACAUCGGUUCUCGUCCGAUCACCGAAAUUAAGCAGCGUCGCGGGCG
112GUUAGUACUUAGAUGGGGGACCGCUUGGGAACACCGCGUGUUGUUGGCCU1
113
114;  TOIG of: Dro5srna.Seq  check: 9487  from: 1  to: 120
115---------------------------------------------------
116
117Genbank:
118LOCUS    seq1 ID..
119...
120ORIGIN ...
121123456789abcdefg....(1st 9 columns are formatting)
122     hijkl...
123//         (end of sequence)
124LOCUS     seq2 ID ..
125...
126ORIGIN
127      abcd...
128//
129---------------------------------------------------
130
131NBRF format: (from uwgcg ToNBRF)
132>DL;DRO5SRNA
133Iubio$Dua0:[Gilbertd.Gcg]Dro5srna.Seq;2 => DRO5SRNA
134
135      51  AAUUAAGCAG CGUCGCGGGC GGUUAGUACU UAGAUGGGGG ACCGCUUGGG
136     101  AACACCGCGU GUUGUUGGCC U
137
138---------------------------------------------------
139
140EMBL format
141ID345 seq1 id   (the 345 are spaces)
142... other info
143SQ345Sequence   (the 3,4,5 are spaces)
144abcd...
145hijk...
146//              (! this is proper end string: 12Oct90)
147ID    seq2 id
148...
149SQ   Sequence
150abcd...
151...
152//
153---------------------------------------------------
154
155UW GCG Format:
156comments of any form, up to ".." signal
157signal line has seq id, and " Check: ####   .."
158only 1 seq/file
159
160-- e.g. --- (GCG from GenBank)
161LOCUS       DROEST6      1819 bp ss-mRNA            INV       31-AUG-1987
162    ... much more ...
163ORIGIN      1 bp upstream of EcoRI site; chromosome BK9 region 69A1.
164
165INVERTEBRATE:DROEST6  Length: 1819  January 9, 1989  16:48  Check: 8008  ..
166
167       1  GAATTCGCCG GAGTGAGGAG CAACATGAAC TACGTGGGAC TGGGACTTAT
168
169      51  CATTGTGCTG AGCTGCCTTT GGCTCGGTTC GAACGCGAGT GATACAGATG
170
171
172---------------------------------------------------
173
174DNAStrider (Mac) = modified Stanford:
175; ### from DNA Strider  Friday, April 7, 1989   11:04:24 PM
176; DNA sequence  pBR322   4363  b.p. complete sequence
177;
178abcd...
179efgh
180//  (end of sequence)
181---------------------------------------------------
182
183Fitch format:
184Dro5srna.Seq
185 GCC AAC GAC CAU ACC ACG CUG AAU ACA UCG GUU CUC GUC CGA UCA CCG AAA UUA AGC AGC
186 GUC GCG GGC GGU UAG UAC UUA GAU GGG GGA CCG CUU GGG AAC ACC GCG UGU UGU UGG CCU
187Droest6.Seq
188 GAA TTC GCC GGA GTG AGG AGC AAC ATG AAC TAC GTG GGA CTG GGA CTT ATC ATT GTG CTG
189 AGC TGC CTT TGG CTC GGT TCG AAC GCG AGT GAT ACA GAT GAC CCT CTG TTG GTG CAG CTG
190---------------------------------------------------
191
192W.Pearson/Fasta format:
193>BOVPRL GenBank entry BOVPRL from omam file.  907 nucleotides.
194TGCTTGGCTGAGGAGCCATAGGACGAGAGCTTCCTGGTGAAGTGTGTTTCTTGAAATCAT
195
196---------------------------------------------------
197Phylip version 3.2 format (e.g., DNAML):
198
199   5   13 YF                (# seqs, #bases, YF)
200Alpha     AACGTGGCCAAAT
201          aaaagggccc...  (continued sp. alpha)
202Beta      AAGGTCGCCAAAC
203          aaaagggccc...  (continued sp. beta)
204Gamma     CATTTCGTCACAA
205          aaaagggccc...  (continued sp. Gamma)
2061234567890^-- bases must start in col 11, and run 'til #bases
207        (spaces & newlines are okay)
208---------------------------------------------------
209Phylip version 3.3 format (e.g., DNAML):
210
211  5    42  YF             (# seqs, #bases, YF)
212Turkey    AAGCTNGGGC ATTTCAGGGT
213Salmo gairAAGCCTTGGC AGTGCAGGGT
214H. SapiensACCGGTTGGC CGTTCAGGGT
215Chimp     AAACCCTTGC CGTTACGCTT
216Gorilla   AAACCCTTGC CGGTACGCTT
2171234567890^-- bases must start in col 11
218  !! this version interleaves the species -- contrary to
219     all other output formats.
220
221GAGCCCGGGC AATACAGGGT AT
222GAGCCGTGGC CGGGCACGGT AT
223ACAGGTTGGC CGTTCAGGGT AA
224AAACCGAGGC CGGGACACTC AT
225AAACCATTGC CGGTACGCTT AA
226
227---------------------------------------------------
228Phylip version 3.4 format (e.g., DNAML)
229-- Both Interleaved and sequential are permitted
230
231   5   13                (# seqs, #bases)
232Alpha     AACGTGGCCAAAT
233          aaaagggccc...  (continued sp. alpha)
234Beta      AAGGTCGCCAAAC
235          aaaagggccc...  (continued sp. beta)
236Gamma     CATTTCGTCACAA
237          aaaagggccc...  (continued sp. Gamma)
2381234567890^-- bases must start in col 11, and run 'til #bases
239        (spaces, newlines and numbers are ignored)
240
241---------------------------------------------------
242Gary Olsen (multiple) sequence editor /print format:
243
244!---------------------
245!17Oct91 -- error in original copy of olsen /print format, shifted right 1 space
246! here is correct copy:
247  301  40 Tb.thiop  CGCAGCGAAA----------GCUNUGCUAAUACCGCAUA-CGnCCUG-----------------------------------------------------  Tb.thiop
248123456789012345678901
249  301  42 Rhc.purp  CGUAGCGAAA----------GUUACGCUAAUACCGCAUA-UUCUGUG-----------------------------------------------------  Rhc.purp
250
251  301  44 Rhc.gela  nnngnCGAAA----------GCCGGAUUAAUACCGCAUA-CGACCUA-----------------------------------------------------  Rhc.gela
252!---------------------
253
254 RNase P RNA components.  on 20-FEB-90 17:23:58
255
256    1 (E.c. pr ):  Base pairing in Escherichia coli RNase P RNA.
257    2 (chrom   ):  Chromatium
258      :
259   12 (B.brevis):  Bacillus brevis RNase P RNA, B. James.
260   13 ( 90% con):   90% conserved
261   14 (100% con):  100% conserved
262   15 (gram+ pr):  pairing
263
2641
265 RNase P RNA components.  on 20-FEB-90 17:23:58
266
267 Posi-   Sequence
268 tion:   identity:   Data:
269
270     1   1 E.c. pr      <<<<<<<<<< {{{{{{{{<<:<<<<<<<<<<^<<<<<<====>>>>  E.c. pr
271     1   2 chrom        GGAGUCGGCCAGACAGUCGCUUCCGUCCU------------------  chrom
272            :
273     1  12 B.brevis  AUGCAGGAAAUGCGGGUAGCCGCUGCCGCAAUCGUCU-------------  B.brevis
2741234567890123456789012 <! this should be 21 not 22,
275! this example must be inset on left by 1 space from olsen /print files !
276     1  13  90% con           G  C G  A  CGC GC               -    -      90% con
277     1  14 100% con                G  A  CGC                             100% con
278     1  15 gram+ pr     <<<<<<<<<< {{{{{{{{<<<<<<<<<<<<<===============  gram+ pr
279
280    60   1 E.c. pr   >>>>>>^>>^>>>>:>>    <<<^<<<< {{{{{                 E.c. pr
281    60   2 chrom     -----GGUG-ACGGGGGAGGAAAGUCCGG-GCUCCAU-------------  chrom
282    :       :
283    60  10 B.stearo  ----UU-CG-GCCGUAGAGGAAAGUCCAUGCUCGCACGGUGCUGAGAUGC  B.stearo
284
285
286---------------------------------------------------
287  GCG MSF format
288Title line
289
290picorna.msf  MSF: 100  Type: P  January 17, 1991  17:53  Check: 541
291..
292Name: Cb3              Len:   100  Check: 7009  Weight:  1.00
293Name: E                Len:   100  Check:   60  Weight:  1.00
294
295//
296
297   1                                                   50
298Cb3  ...gpvedai .......t.. aaigr..vad tvgtgptnse aipaltaaet
299  E  gvenae.kgv tentna.tad fvaqpvylpe .nqt...... kv.affynrs
300
301   51                                                 100
302
303Cb3  ghtsqvvpgd tmqtrhvkny hsrsestien flcrsacvyf teykn.....
304  E  ...spi.gaf tvks...... gs.lesgfap .fsngtc.pn sviltpgpqf
305
306---------------------------------------------------
307     PIR format
308This is NBRF-PIR MAILSERVER version 1.45
309Command-> get PIR3:A31391
310\\\
311ENTRY           A31391       #Type Protein
312TITLE           *Esterase-6 - Fruit fly (Drosophila melanogaster)
313
314DATE            03-Aug-1992 #Sequence 03-Aug-1992 #Text 03-Aug-1992
315PLACEMENT          0.0    0.0    0.0    0.0    0.0
316COMMENT         *This entry is not verified.
317SOURCE          Drosophila melanogaster
318
319REFERENCE
320   #Authors     Cooke P.H., Oakeshott J.G.
321   #Citation    submitted to GenBank, April 1989
322   #Reference-number A31391
323   #Accession   A31391
324   #Cross-reference GB:J04167
325
326SUMMARY       #Molecular-weight 61125  #Length 544  #Checksum  1679
327SEQUENCE
328                5        10        15        20        25        30
329      1 M N Y V G L G L I I V L S C L W L G S N A S D T D D P L L V
330     31 Q L P Q G K L R G R D N G S Y Y S Y E S I P Y A E P P T G D
331     61 L R F E A P E P Y K Q K W S D I F D A T K T P V A C L Q W D
332     91 Q F T P G A N K L V G E E D C L T V S V Y K P K N S K R N S
333    121 F P V V A H I H G G A F M F G A A W Q N G H E N V M R E G K
334    151 F I L V K I S Y R L G P L G F V S T G D R D L P G N Y G L K
335    181 D Q R L A L K W I K Q N I A S F G G E P Q N V L L V G H S A
336    211 G G A S V H L Q M L R E D F G Q L A R A A F S F S G N A L D
337    241 P W V I Q K G A R G R A F E L G R N V G C E S A E D S T S L
338    271 K K C L K S K P A S E L V T A V R K F L I F S Y V P F A P F
339    301 S P V L E P S D A P D A I I T Q D P R D V I K S G K F G Q V
340    331 P W A V S Y V T E D G G Y N A A L L L K E R K S G I V I D D
341    361 L N E R W L E L A P Y L L F Y R D T K T K K D M D D Y S R K
342    391 I K Q E Y I G N Q R F D I E S Y S E L Q R L F T D I L F K N
343    421 S T Q E S L D L H R K Y G K S P A Y A Y V Y D N P A E K G I
344    451 A Q V L A N R T D Y D F G T V H G D D Y F L I F E N F V R D
345    481 V E M R P D E Q I I S R N F I N M L A D F A S S D N G S L K
346    511 Y G E C D F K D N V G S E K F Q L L A I Y I D G C Q N R Q H
347    541 V E F P
348///
349\\\
350---------------------------------------------------
351PAUP format:
352The NEXUS Format
353
354Every block starts with "BEGIN blockname;" and ends with "END;".
355Each block is composed of one or more statements, each
356terminated by a semicolon (;).
357
358Comments may be included in NEXUS files by enclosing them within
359square brackets, as in "[This is a comment]."
360
361NEXUS-conforming files are identified by a "#NEXUS" directive at
362the very beginning of the file (line 1, column 1).  If the
363#NEXUS is omitted PAUP issues a warning but continues
364processing.
365
366NEXUS files are entirely free-format.  Blanks, tabs, and
367newlines may be placed anywhere in the file.  Unless RESPECTCASE
368is requested, commands and data may be entered in upper case,
369lower case, or a mixture of upper and lower case.
370
371The following conventions are used in the syntax descriptions of
372the various blocks.  Upper-case items are entered exactly as
373shown.  Lower-case items inside of angle brackets -- e.g., <x>
374-- represent items to be substituted by the user.  Items inside
375of square brackets -- e.g., [X] -- are optional.  Items inside
376of curly braces and separated by vertical bars -- e.g.,  { X | Y
377| Z } -- are mutually exclusive options.
378
379
380The DATA Block
381
382The DATA block contains the data matrix and other associated
383information.  Its syntax is:
384
385BEGIN DATA;
386DIMENSIONS NTAX=<number of taxa> NCHAR=<number of characters>;
387  [ FORMAT  [ MISSING=<missing-symbol> ]
388        [ LABELPOS={ LEFT | RIGHT } ]
389        [ SYMBOLS="<symbols-list>" ]
390        [ INTERLEAVE ]
391        [ MATCHCHAR=<match-symbol> ]
392        [ EQUATE="<symbol>=<expansion> [<symbol>=<expansion>...]" ]
393        [ TRANSPOSE ]
394        [ RESPECTCASE ]
395        [ DATATYPE = { STANDARD | DNA | RNA | PROTEIN } ]; ]
396        [ OPTIONS [ IGNORE={ INVAR | UNINFORM } ]
397        [ MSTAXA = { UNCERTAIN | POLYMORPH | VARIABLE } ]
398        [ ZAP = "<list of zapped characters>" ] ; ]
399  [ CHARLABELS <label_1> <label_2> <label_NCHAR> ; ]
400  [ TAXLABELS <label1_1> <label1_2> <label1_NTAX> ; ]
401  [ STATELABELS <currently ignored by PAUP> ; ]
402  MATRIX <data-matrix> ;
403  END;
404
405--- example PAUP file
406
407#NEXUS
408
409[!Brown et al. (1982) primate mitochondrial DNA]
410
411begin data;
412  dimensions ntax=5 nchar=896;
413  format datatype=dna matchchar=. interleave missing='-';
414  matrix
415[                              2                    4                    6            8                    ]
416[         1                    1                    1                    1            1                    ]
417human     aagcttcaccggcgcagtca ttctcataatcgcccacggR cttacatcctcattactatt ctgcctagcaaactcaaact acgaacgcactcacagtcgc
418chimp     ................a.t. .c.................a ...............t.... ..................t. .t........c.........
419gorilla   ..................tg ....t.....t........a ........a......t.... .................... .......a..c.....c...
420orang     ................ac.. cc.....g..t.....t..a ..c........cc....g.. .................... .......a..c.....c...
421gibbon    ......t..a..t...ac.g .c.................a ..a..c..t..cc.g..... ......t............. .......a........c...
422
423[         8                    8                    8                    8            8              8     ]
424[         0                    2                    4                    6            8              9     ]
425[         1                    1                    1                    1            1              6     ]
426human     cttccccacaacaatattca tgtgcctagaccaagaagtt attatctcgaactgacactg agccacaacccaaacaaccc agctctccctaagctt
427chimp     t................... .a................c. ........a.....g..... ...a................ ................
428gorilla   ..................tc .a................c. ........a.g......... ...a.............tt. .a..............
429orang     ta....a...........t. .c.......ga......acc ..cg..a.a......tg... .a.a..c.....g...cta. .a.....a........
430gibbon    a..t.......t........ ....ac...........acc .....t..a........... .a.tg..........gctag .a..............
431  ;
432end;
433---------------------------------------------------
434
435
436
437
438
439
440|||||||||||  Sample SMTP mail header
441---------------------------------------------------
442
443- - - - - - - - -
444From GenBank-Retrieval-System@genbank.bio.net Sun Nov 10 17:28:56 1991
445Received: from genbank.bio.net by sunflower.bio.indiana.edu
446        (4.1/9.5jsm) id AA19328; Sun, 10 Nov 91 17:28:55 EST
447Received: by genbank.bio.net (5.65/IG-2.0)
448        id AA14458; Sun, 10 Nov 91 14:30:03 -0800
449Date: Sun, 10 Nov 91 14:30:03 -0800
450Message-Id: <9111102230.AA14458@genbank.bio.net>
451From: Database Server <GenBank-Retrieval-System@genbank.bio.net>
452To: gilbertd@sunflower.bio.indiana.edu
453Subject: Results of Query for drorna
454Status: R
455
456No matches on drorna.
457- - - - - -
458From GenBank-Retrieval-System@genbank.bio.net Sun Nov 10 17:28:49 1991
459Received: from genbank.bio.net by sunflower.bio.indiana.edu
460        (4.1/9.5jsm) id AA19323; Sun, 10 Nov 91 17:28:47 EST
461Received: by genbank.bio.net (5.65/IG-2.0)
462        id AA14461; Sun, 10 Nov 91 14:30:03 -0800
463Date: Sun, 10 Nov 91 14:30:03 -0800
464Message-Id: <9111102230.AA14461@genbank.bio.net>
465From: Database Server <GenBank-Retrieval-System@genbank.bio.net>
466To: gilbertd@sunflower.bio.indiana.edu
467Subject: Results of Query for droest6
468Status: R
469
470LOCUS       DROEST6      1819 bp ss-mRNA            INV       31-AUG-1987
471DEFINITION  D.melanogaster esterase-6 mRNA, complete cds.
472ACCESSION   M15961
473
474
475
476
477
478
479
480
481
482
483
484
485|||||||||||  GCG manual discussion of sequence symbols:
486---------------------------------------------------
487
488III_SEQUENCE_SYMBOLS
489
490
491     GCG programs allow all upper and lower  case  letters,  periods  (.),
492asterisks  (*),  pluses  (+),  ampersands  (&),  and ats (@) as symbols in
493biological sequences.  Nucleotide  symbols,  their  complements,  and  the
494standard  one-letter amino acid symbols are shown below in separate lists.
495The meanings of the symbols +, &, and @ have not  been  assigned  at  this
496writing (March, 1989).
497
498     GCG uses the  letter  codes  for  amino  acid  codes  and  nucleotide
499ambiguity    proposed    by    IUB    (Nomenclature    Committee,    1985,
500Eur. J. Biochem. 150; 1-5).  These codes are  compatible  with  the  codes
501used by the EMBL, GenBank, and NBRF data libraries.
502
503
504                               NUCLEOTIDES
505
506     The meaning of each symbol, its complement,  and  the  Cambridge  and
507Stanford  equivalents  are  shown below.  Cambridge files can be converted
508into GCG files and vice versa with the programs FROMSTADEN  and  TOSTADEN.
509IntelliGenetics  sequence  files  can  be interconverted with the programs
510FROMIG and TOIG.
511
512IUB/GCG      Meaning     Complement   Staden/Sanger  Stanford
513
514   A             A             T             A            A
515   C             C             G             C            C
516   G             G             C             G            G
517  T/U            T             A             T           T/U
518   M           A or C          K             5            J
519   R           A or G          Y             R            R
520   W           A or T          W             7            L
521   S           C or G          S             8            M
522   Y           C or T          R             Y            Y
523   K           G or T          M             6            K
524   V        A or C or G        B       not supported      N
525   H        A or C or T        D       not supported      N
526   D        A or G or T        H       not supported      N
527   B        C or G or T        V       not supported      N
528  X/N     G or A or T or C     X            -/X           N
529   .    not G or A or T or C   .       not supported      ?
530
531
532  The frame ambiguity codes used by Staden are not  supported  by  GCG
533and   are  translated  by  FROMSTADEN  as  the  lower  case  single  base
534equivalent.
535
536     Staden Code          Meaning              GCG
537
538         D                C or CC                c
539         V                T or TT                t
540         B                A or AA                a
541         H                G or GG                g
542         K                C or CX                c
543         L                T or TX                t
544         M                A or AX                a
545         N                G or GX                g
546
547
548                        AMINO ACIDS
549
550  Here is a list of the standard one-letter amino acid codes and their
551three-letter  equivalents.   The synonymous codons and their depiction in
552the IUB codes are shown.  You should recognize that the codons  following
553semicolons  (;)  are  not  sufficiently specific to define a single amino
554acid even though they represent the best possible back  translation  into
555the IUB codes!  All of the relationships in this list can be redefined by
556the user in a local data file described below.
557
558                                                      IUB
559Symbol 3-letter  Meaning      Codons                Depiction
560 A    Ala       Alanine      GCT,GCC,GCA,GCG         !GCX
561 B    Asp,Asn   Aspartic,
562                Asparagine   GAT,GAC,AAT,AAC         !RAY
563 C    Cys       Cysteine     TGT,TGC                 !TGY
564 D    Asp       Aspartic     GAT,GAC                 !GAY
565 E    Glu       Glutamic     GAA,GAG                 !GAR
566 F    Phe     Phenylalanine  TTT,TTC                 !TTY
567 G    Gly       Glycine      GGT,GGC,GGA,GGG         !GGX
568 H    His       Histidine    CAT,CAC                 !CAY
569 I    Ile       Isoleucine   ATT,ATC,ATA             !ATH
570 K    Lys       Lysine       AAA,AAG                 !AAR
571 L    Leu       Leucine      TTG,TTA,CTT,CTC,CTA,CTG
572!TTR,CTX,YTR;YTX
573 M    Met       Methionine   ATG                     !ATG
574 N    Asn       Asparagine   AAT,AAC                 !AAY
575 P    Pro       Proline      CCT,CCC,CCA,CCG         !CCX
576 Q    Gln       Glutamine    CAA,CAG                 !CAR
577 R    Arg       Arginine     CGT,CGC,CGA,CGG,AGA,AGG
578!CGX,AGR,MGR;MGX
579 S    Ser       Serine       TCT,TCC,TCA,TCG,AGT,AGC !TCX,AGY;WSX
580 T    Thr       Threonine    ACT,ACC,ACA,ACG         !ACX
581 V    Val       Valine       GTT,GTC,GTA,GTG         !GTX
582 W    Trp       Tryptophan   TGG                     !TGG
583 X    Xxx       Unknown                              !XXX
584 Y    Tyr       Tyrosine     TAT, TAC                !TAY
585 Z    Glu,Gln   Glutamic,
586                Glutamine    GAA,GAG,CAA,CAG         !SAR
587 *    End       Terminator   TAA, TAG, TGA           !TAR,TRA;TRR
588
589
590
591
592
593
594
595
596|||||||||||  docs from PSC on sequence formats:
597---------------------------------------------------
598
599
600          Nucleic Acid and Protein Sequence File Formats
601
602
603It will probably save you some time if you have your data in a usable
604format before you send it to us.  However, we do have the University of
605Wisconsin Genetics Computing Group programs running on our VAXen and
606this package includes several reformatting utilities.  Our programs
607usually recognize any of several standard formats, including GenBank,
608EMBL, NBRF, and MolGen/Stanford.  For the purposes of annotating an
609analysis we find the GenBank and EMBL formats most useful, particularly
610if you have already received an accession number from one of these
611organizations for your sequence.
612
613Our programs do not require that all of the line types available in
614GenBank, EMBL, or NBRF file formats be present for the file format to
615be recognized and processed.  The following pages outline the essential
616details required for correct processing of files by our programs.
617Additional information may be present but will generally be ignored.
618
619
620                      GenBank File Format
621
622File Header
623
6241.  The first line in the file must have "GENETIC SEQUENCE DATA BANK"
625    in spaces 20 through 46 (see LINE  1, below).
6262.  The next 8 lines may contain arbitrary text.  They are ignored but
627    are required to maintain the GenBank format (see LINE 2 - LINE 9).
628
629Sequence Data Entries
630
6313.  Each sequence entry in the file should have the following format.
632
633    a) first line:   Must have LOCUS in the first 5 spaces.  The
634                     genetic locus name or identifier must be in spaces
635                     13 - 22.  The length of the sequences is right
636                     justified in spaces 23 through 29 (see LINE  10).
637    b) second line:  Must have DEFINITION in the first 10 spaces.
638                     Spaces 13 - 80 are free form text to identify the
639                     sequence (see LINE  11).
640    c) third line:   Must have ACCESSION in the first 9 spaces.  Spaces
641                     13 - 18 must hold the primary accession number
642                     (see LINE  12).
643    d) fourth line:  Must have ORIGIN in the first 6 spaces.  Nothing
644                     else is required on this line, it indicates that
645                     the nucleic acid sequence begins on the next line
646                     (see LINE  13).
647    e) fifth line:   Begins the nucleotide sequence.  The first 9
648                     spaces of each sequence line may either be blank
649                     or may contain the position in the sequence of the
650                     first nucleotide on the line.  The next 66 spaces
651                     hold the nucleotide sequence in six blocks of ten
652                     nucleotides.  Each of the six blocks begins with a
653                     blank space followed by ten nucleotides.  Thus the
654                     first nucleotide is in space eleven of the line while
655                     the last is in space 75 (see LINE  14, LINE  15).
656    f) last line:    Must have // in the first 2 spaces to indicate
657                     termination of the sequence (see LINE  16).
658
659NOTE:  Multiple sequences may appear in each file.  To begin another
660       sequence go back to a) and start again.
661
662
663                         Example GenBank file
664
665
666LINE  1  :                   GENETIC SEQUENCE DATA BANK
667LINE  2  :
668LINE  3  :
669LINE  4  :
670LINE  5  :
671LINE  6  :
672LINE  7  :
673LINE  8  :
674LINE  9  :
675LINE 10  :LOCUS       L_Name     Length BP
676LINE 11  :DEFINITION  Describe the sequence any way you want
677LINE 12  :ACCESSION   Accession Number
678LINE 13  :ORIGIN
679LINE 14  :        1 acgtacgtac gtacgtacgt acgtacgtac gtacgtacgt a...
680LINE 15  :       61 acgt...
681LINE 16  ://
682
683
684
685                         EMBL File Format
686
687Unlike the GenBank file format the EMBL file format does not require
688a series of header lines.  Thus the first line in the file begins
689the first sequence entry of the file.
690
6911.  The first line of each sequence entry contains the two letters ID
692    in the first two spaces.  This is followed by the EMBL identifier
693    in spaces 6 through 14.  (See LINE  1).
694
6952.  The second line of each sequence entry has the two letters AC in
696    the first two spaces.  This is followed by the accession number in
697    spaces 6 through 11.  (See LINE  2).
698
6993.  The third line of each sequence entry has the two letters DE in the
700    first two spaces.  This is followed by a free form text definition
701    in spaces 6 through 72.  (See LINE  3).
702
7034.  The fourth line in each sequence entry has the two letters SQ in
704    the first two spaces.  This is followed by the length of the
705    sequence beginning at or after space 13.  After the sequence length
706    there is a blank space and the two letters BP.  (See LINE  4).
707
7085.  The nucleotide sequence begins on the fifth line of the sequence
709    entry.  Each line of sequence begins with four blank spaces. The
710    next 66 spaces hold the nucleotide sequence in six blocks of ten
711    nucleotides.  Each of the six blocks begins with a blank space
712    followed by ten nucleotides.  Thus the first nucleotide is in space
713    6 of the line while the last is in space 70.  (See LINE  5 -
714    LINE  6).
715
7166.  The last line of each sequence entry in the file is a terminator
717    line which has the two characters // in the first two spaces.
718    (See LINE  7).
719
7207.  Multiple sequences may appear in each file.  To begin another
721    sequence go back to item 1 and start again.
722
723
724                          Example EMBL file
725
726LINE  1  :ID   ID_name
727LINE  2  :AC   Accession number
728LINE  3  :DE   Describe the sequence any way you want
729LINE  4  :SQ          Length BP
730LINE  5  :     ACGTACGTAC GTACGTACGT ACGTACGTAC GTACGTA...
731LINE  6  :     ACGT...
732LINE  7  ://
733
734
735
736            NBRF (protein or nucleic acid) File Format
737
7381.  The first line of each sequence entry begins with a greater than
739  symbol, >.  This is immediately followed by the two character
740  sequence type specifier.  Space four must contain a semi-colon.
741  Beginning in space five is the sequence name or identification code
742  for the NBRF database.  The code is from four to six letters and
743  numbers.  (See LINE  1).
744
745!!!! >> add these to readseq
746          Specifier             Sequence type
747
748              P1                protein, complete
749              F1                protein, fragment
750              DL                DNA, linear
751              DC                DNA, circular
752              RL                RNA, linear
753              RC                RNA, circular
754              N1                functional RNA, other than tRNA
755              N3                tRNA
756
7572.  The second line of each sequence entry contains two kinds of
758  information.  First is the sequence name which is separated from
759  the organism or organelle name by the three character sequence
760  blank space, dash, blank space, " - ".  There is no special
761  character marking the beginning of this line.  (See LINE  2).
762
7633.  Either the amino acid or nucleic acid sequence begins on line three
764  and can begin in any space, including the first.  The sequence is
765  free format and may be interrupted by blanks for ease of reading.
766  Protein sequences may contain special punctuation to indicate
767  various indeterminacies in the sequence.  In the NBRF data files
768  all lines may be up to 500 characters long.  However some PSC
769  programs currently have a limit of 130 characters per line
770  (including blanks), and BitNet will not accept lines of over eighty
771  characters.  (See LINE  3, LINE  4, and LINE  5).
772
773  The last character in the sequence must be an asterisk, *.
774
775                       Example NBRF file
776
777 LINE  1  :>P1;CBRT
778 LINE  2  :Cytochrome b - Rat mitochondrion (SGC1)
779 LINE  3  :M T N I R K S H P L F K I I N H S F I D L P A P S
780 LINE  4  : VTHICRDVN Y GWL IRY
781 LINE  5  :TWIGGQPVEHPFIIIGQLASISYFSIILILMPISGIVEDKMLKWN*
782
783
784
785                MolGen/Stanford File Format
786
7871.  The first line in a sequence file is a comment line.  This line
788  begins with a semi-colon in the first space.  This line need
789  not be present.  If it is present it holds descriptive text.
790  There may be as many comment lines as desired at the start of the
791  sequence file.  (See LINE  1).
792
7932.  The second line must be present and contains an identifier or
794  name for the sequence in the first ten spaces.  (See LINE  2).
795
7963.  The sequence begins on the third line and occupies up to eighty
797  spaces.  Spaces may be included in the sequence for ease of
798  reading.  The sequence continues for as many lines as needed
799  and is terminated with a 1 or 2.  1 indicates a linear sequence
800  while 2 marks a circular sequence.  (See LINE  3 and LINE  4).
801
802                          Example MolGen/Stanford file
803
804LINE  1  :;  Describe the sequence any way you want
805LINE  2  :ECTRNAGLY2
806LINE  3  :ACGCACGTAC ACGTACGTAC   A C G T C C G T ACG TAC GTA CGT
807LINE  4  :  GCTTA   GG G C T A1
808
809
810
811
812|||||||||||  Phylip file format
813---------------------------------------------------
814
815        Phylip 3.3 File Format (DNA sequences)
816
817
818     The input and output formats for PROTPARS and for RESTML are described  in
819their  document  files.   In  general  their input formats are similar to those
820described here, except that the one-letter codes for data are specific to those
821programs  and  are  described in those document files.  Since the input formats
822for the eight DNA sequence programs apply to  all  eight,  they  are  described
823here.   Their  input  formats are standard: the data have A's, G's, C's and T's
824(or U's).  The first line of the input file contains the number of species  and
825the  number  of  sites.   As  with  the other programs, options information may
826follow this.  In the case of DNAML, DNAMLK,  and  DNADIST  an  additional  line
827(described  in  the  document file for these programs) may follow the first one.
828Following this, each species starts on a new line.  The first 10 characters  of
829that  line  are the species name.  There then follows the base sequence of that
830species, each character being one of the letters A, B, C, D, G, H, K, M, N,  O,
831R, S, T, U, V, W, X, Y, ?, or - (a period was also previously allowed but it is
832no longer allowed, because it sometimes is used in aligned sequences to mean
833"the  same  as  the  sequence  above").   Blanks  will  be ignored, and so will
834numerical digits.  This allows GENBANK and EMBL sequence  entries  to  be  read
835with minimum editing.
836
837     These characters can be  either  upper  or  lower  case.   The  algorithms
838convert  all  input  characters  to upper case (which is how they are treated).
839The characters constitute the IUPAC (IUB) nucleic acid code  plus  some  slight
840extensions.  They enable input of nucleic acid sequences taking full account of
841any ambiguities in the sequence.
842
843The sequences can continue over multiple lines; when this is done the sequences
844must  be  either  in  "interleaved"  format, similar to the output of alignment
845programs, or "sequential" format.  These are described  in  the  main  document
846file.   In sequential format all of one sequence is given, possibly on multiple
847lines, before the next starts.  In interleaved format the  first  part  of  the
848file  should  contain  the first part of each of the sequences, then possibly a
849line containing nothing but a carriage-return character, then the  second  part
850of  each  sequence, and so on.  Only the first parts of the sequences should be
851preceded by names.  Here is a hypothetical example of interleaved format:
852
853  5    42
854Turkey    AAGCTNGGGC ATTTCAGGGT
855Salmo gairAAGCCTTGGC AGTGCAGGGT
856H. SapiensACCGGTTGGC CGTTCAGGGT
857Chimp     AAACCCTTGC CGTTACGCTT
858Gorilla   AAACCCTTGC CGGTACGCTT
859
860GAGCCCGGGC AATACAGGGT AT
861GAGCCGTGGC CGGGCACGGT AT
862ACAGGTTGGC CGTTCAGGGT AA
863AAACCGAGGC CGGGACACTC AT
864AAACCATTGC CGGTACGCTT AA
865
866while in sequential format the same sequences would be:
867
868  5    42
869Turkey    AAGCTNGGGC ATTTCAGGGT
870GAGCCCGGGC AATACAGGGT AT
871Salmo gairAAGCCTTGGC AGTGCAGGGT
872GAGCCGTGGC CGGGCACGGT AT
873H. SapiensACCGGTTGGC CGTTCAGGGT
874ACAGGTTGGC CGTTCAGGGT AA
875Chimp     AAACCCTTGC CGTTACGCTT
876AAACCGAGGC CGGGACACTC AT
877Gorilla   AAACCCTTGC CGGTACGCTT
878AAACCATTGC CGGTACGCTT AA
879
880
881Note, of course, that a portion of a sequence like this:
882
883   300   AAGCGTGAAC GTTGTACTAA TRCAG
884
885is perfectly legal, assuming that the species name  has  gone  before,  and  is
886filled  out  to  full  length  by  blanks.  The above digits and blanks will be
887ignored, the sequence being taken as starting at the first base symbol (in this
888case an A).
889
890     The present versions of the programs may sometimes have difficulties  with
891the  blank  lines  between  groups of lines, and if so you might want to retype
892those lines, making sure that they have only a  carriage-return  and  no  blank
893characters on them, or you may perhaps have to eliminate them.  The symptoms of
894this problem are that the programs complain that the sequences are not properly
895aligned, and you can find no other cause for this complaint.
896
897------------------------------------------------
898
899
900|||||||||||  ASN.1 file format
901---------------------------------------------------
902
903
904ASN.1 -- see NCBI toolkit docs, source and examples (ncbi.nlm.nih.gov)
905
906Example asn.1 sequence file----
907
908Bioseq-set ::= {
909seq-set {
910  seq {
911    id { local id 1 } ,                 -- id essential
912    descr {  title "Dummy sequence data from nowhere"  } ,  -- optional
913    inst {                              -- inst essential
914      repr raw ,
915      mol dna ,
916      length 156 ,
917      topology linear ,
918      seq-data
919        iupacna "GAATTCATTTTTGAAACAAATCGACCTGACGACGGAATGGTACTCGAATTA
920TGGGCCAAAGGGTTTTATGGGACAAATTAATAGGTGTTCATTATATGCCACTTTCGGAGATTAGATACAGCAATGCAG
921TGGATTCAAAGCAATAGAGTTGTTCTT"
922      } } ,
923
924        seq {
925          id { local id 2 } ,
926          descr {  title "Dummy sequence 2 data from somewhere else"  } ,
927          inst {
928                repr raw ,
929                mol dna ,
930                length 150 ,
931                topology linear ,
932                seq-data
933                  iupacna "TTTTTTTTTTTTGAAACAAATCGACCTGACGACGGAATGGTACTCGAATTA
934TGGGCCAAAGGGTTTTATGGGACAAATTAATAGGTGTTCATTATATGCCACTTTCGGAGATTAGATACAGCAATGCAG
935TGGATTCAAAGCAATAGAGTT"
936            }
937          }
938        }
939      }
940
941
942partial ASN.1 description from toolkit
943
944Bioseq ::= SEQUENCE {
945    id SET OF Seq-id ,            -- equivalent identifiers
946    descr Seq-descr OPTIONAL , -- descriptors
947    inst Seq-inst ,            -- the sequence data
948    annot SET OF Seq-annot OPTIONAL }
949
950Seq-inst ::= SEQUENCE {            -- the sequence data itself
951    repr ENUMERATED {              -- representation class
952        not-set (0) ,              -- empty
953        virtual (1) ,              -- no seq data
954        raw (2) ,                  -- continuous sequence
955        seg (3) ,                  -- segmented sequence
956        const (4) ,                -- constructed sequence
957        ref (5) ,                  -- reference to another sequence
958        consen (6) ,               -- consensus sequence or pattern
959        map (7) ,                  -- ordered map (genetic, restriction)
960        other (255) } ,
961    mol ENUMERATED {               -- molecule class in living organism
962        not-set (0) ,              --   > cdna = rna
963        dna (1) ,
964        rna (2) ,
965        aa (3) ,
966        na (4) ,                   -- just a nucleic acid
967        other (255) } ,
968    length INTEGER OPTIONAL ,      -- length of sequence in residues
969    fuzz Int-fuzz OPTIONAL ,       -- length uncertainty
970    topology ENUMERATED {          -- topology of molecule
971        not-set (0) ,
972        linear (1) ,
973        circular (2) ,
974        tandem (3) ,               -- some part of tandem repeat
975        other (255) } DEFAULT linear ,
976    strand ENUMERATED {            -- strandedness in living organism
977        not-set (0) ,
978        ss (1) ,                   -- single strand
979        ds (2) ,                   -- double strand
980        mixed (3) ,
981        other (255) } OPTIONAL ,   -- default ds for DNA, ss for RNA, pept
982    seq-data Seq-data OPTIONAL ,   -- the sequence
983    ext Seq-ext OPTIONAL ,         -- extensions for special types
984  hist Seq-hist OPTIONAL }       -- sequence history
985
986------------------------------------------------
Note: See TracBrowser for help on using the repository browser.