source: tags/arb_5.2/READSEQ/Formats

Last change on this file was 2682, checked in by westram, 20 years ago
  • removed broken chars
  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 38.2 KB
Line 
1||||||||||| ReadSeq supported formats   (revised 30Dec92)
2--------------------------------------------------------
3
4    -f[ormat=]Name Format name for output:
5         1. IG/Stanford           10. Olsen (in-only)
6         2. GenBank/GB            11. Phylip3.2
7         3. NBRF                  12. Phylip
8         4. EMBL                  13. Plain/Raw
9         5. GCG                   14. PIR/CODATA
10         6. DNAStrider            15. MSF
11         7. Fitch                 16. ASN.1
12         8. Pearson/Fasta         17. PAUP
13         9. Zuker (in-only)       18. Pretty (out-only)
14
15In general, output supports only minimal subsets of each format
16needed for sequence data exchanges.  Features, descriptions
17and other format-unique information are discarded.
18
19For users of the Olsen multi-sequence editor (VMS):  The Olsen format
20here is produced with the print command:
21  print/out=some.file
22Use Genbank output from readseq to produce a format that this
23editor can read, and use the command
24  load/genbank some.file
25Dan Davison has a VMS program that will convert to/from the
26Olsen native binary data format.  E-mail davison@uh.edu
27
28Warning: Phylip format input is now supported (30Dec92), however the
29auto-detection of Phylip format is very probabilistic and messy,
30especially distinguishing sequential from interleaved versions. It
31is not recommended that one use readseq to convert files from Phylip
32format to others unless essential.
33
34
35
36||||||||||| ReadSeq usage             (revised 11Nov91)
37--------------------------------------------------------
38
39A. determine file format:
40
41        short skiplines;  /* result: number of header lines to skip (or 0) */
42        short error;      /* error result or 0 */
43        short format;     /* resulting format code, see ureadseq.h */
44        char  *filename   = "Mysequence.file";
45
46        format = seqFileFormat( filename, &skiplines, &error);
47        if (error!=0) fail;
48
49B. read number and list of sequences (optional)
50        short numseqs;    /* resulting number of sequences found in file */
51        char  *seqlist;   /* list of sequence names, newline separated, 0 terminated */
52
53        seqlist = listSeqs( filename, skiplines, format, &numseqs, &error);
54        if (error==0)  display (seqlist);
55        free( seqlist);
56
57C.  read individual sequences as desired
58        short seqIndex;   /* sequence index #, or == kListSeqs for listSeqs equivalent */
59        long  seqlen;     /* length of seq */
60        char  seqid[256]; /* sequence name */
61        char  *seq;       /* sequence, 0 terminated, free when done */
62
63        seq = readSeq( seqIndex, filename, skiplines, format,
64                      &seqlen, &numseqs, &error, seqid);
65        if (error==0) manipulate(seq);
66        free(seq);
67
68D. write sequences as desired
69        int nlines;     /* number of lines of sequence written */
70        FILE* fout;     /* open file pointer (stdout or other) */
71        short outform;  /* output format, see ureadseq.h */
72
73        nlines = writeSeq( fout, seq, seqlen, format, outform, seqid);
74
75
76Note (30Dec92): There is various processing done by the main program (in readseq.c),
77  rather than just in the subroutines (in ureadseq.c).  Especially for interleaved
78  output formats, the writeSeq subroutine does not handle interleaving, nor some of
79  the formatting at the top and end of output files.  While seqFileFormat, listSeqs,
80  and readSeq subroutines are fairly self-contained, the writeSeq depends a lot on
81  auxiliary processing.  At some point, this may be revised so writeSeq is self-
82  contained.
83
84Note 2: The NCBI toolkit (ftp from ncbi.nlm.nih.gov) is needed for the ASN.1 format
85  reading (see ureadasn.c).  A bastard (but workable I hope) ASN.1 format is written
86  by writeSeq alone.
87
88
89
90|||||||||||  sequence formats....
91---------------------------------------------------
92
93stanford/IG
94;comments
95;...
96seq1 info
97abcd...
98efgh1 (or 2 = terminator)
99;another seq
100;....
101seq2 info
102abcd...1
103--- for e.g. ----
104;     Dro5s-T.Seq  Length: 120  April 6, 1989  21:22  Check: 9487  ..
105dro5stseq
106GCCAACGACCAUACCACGCUGAAUACAUCGGUUCUCGUCCGAUCACCGAAAUUAAGCAGCGUCGCGGGCG
107GUUAGUACUUAGAUGGGGGACCGCUUGGGAACACCGCGUGUUGUUGGCCU1
108
109;  TOIG of: Dro5srna.Seq  check: 9487  from: 1  to: 120
110---------------------------------------------------
111
112Genbank:
113LOCUS    seq1 ID..
114...
115ORIGIN ...
116123456789abcdefg....(1st 9 columns are formatting)
117     hijkl...
118//         (end of sequence)
119LOCUS     seq2 ID ..
120...
121ORIGIN
122      abcd...
123//
124---------------------------------------------------
125
126NBRF format: (from uwgcg ToNBRF)
127>DL;DRO5SRNA
128Iubio$Dua0:[Gilbertd.Gcg]Dro5srna.Seq;2 => DRO5SRNA
129
130      51  AAUUAAGCAG CGUCGCGGGC GGUUAGUACU UAGAUGGGGG ACCGCUUGGG
131     101  AACACCGCGU GUUGUUGGCC U
132
133---------------------------------------------------
134
135EMBL format
136ID345 seq1 id   (the 345 are spaces)
137... other info
138SQ345Sequence   (the 3,4,5 are spaces)
139abcd...
140hijk...
141//              (! this is proper end string: 12Oct90)
142ID    seq2 id
143...
144SQ   Sequence
145abcd...
146...
147//
148---------------------------------------------------
149
150UW GCG Format:
151comments of any form, up to ".." signal
152signal line has seq id, and " Check: ####   .."
153only 1 seq/file
154
155-- e.g. --- (GCG from GenBank)
156LOCUS       DROEST6      1819 bp ss-mRNA            INV       31-AUG-1987
157    ... much more ...
158ORIGIN      1 bp upstream of EcoRI site; chromosome BK9 region 69A1.
159
160INVERTEBRATE:DROEST6  Length: 1819  January 9, 1989  16:48  Check: 8008  ..
161
162       1  GAATTCGCCG GAGTGAGGAG CAACATGAAC TACGTGGGAC TGGGACTTAT
163
164      51  CATTGTGCTG AGCTGCCTTT GGCTCGGTTC GAACGCGAGT GATACAGATG
165
166
167---------------------------------------------------
168
169DNAStrider (Mac) = modified Stanford:
170; ### from DNA Strider  Friday, April 7, 1989   11:04:24 PM
171; DNA sequence  pBR322   4363  b.p. complete sequence
172;
173abcd...
174efgh
175//  (end of sequence)
176---------------------------------------------------
177
178Fitch format:
179Dro5srna.Seq
180 GCC AAC GAC CAU ACC ACG CUG AAU ACA UCG GUU CUC GUC CGA UCA CCG AAA UUA AGC AGC
181 GUC GCG GGC GGU UAG UAC UUA GAU GGG GGA CCG CUU GGG AAC ACC GCG UGU UGU UGG CCU
182Droest6.Seq
183 GAA TTC GCC GGA GTG AGG AGC AAC ATG AAC TAC GTG GGA CTG GGA CTT ATC ATT GTG CTG
184 AGC TGC CTT TGG CTC GGT TCG AAC GCG AGT GAT ACA GAT GAC CCT CTG TTG GTG CAG CTG
185---------------------------------------------------
186
187W.Pearson/Fasta format:
188>BOVPRL GenBank entry BOVPRL from omam file.  907 nucleotides.
189TGCTTGGCTGAGGAGCCATAGGACGAGAGCTTCCTGGTGAAGTGTGTTTCTTGAAATCAT
190
191---------------------------------------------------
192Phylip version 3.2 format (e.g., DNAML):
193
194   5   13 YF                (# seqs, #bases, YF)
195Alpha     AACGTGGCCAAAT
196          aaaagggccc...  (continued sp. alpha)
197Beta      AAGGTCGCCAAAC
198          aaaagggccc...  (continued sp. beta)
199Gamma     CATTTCGTCACAA
200          aaaagggccc...  (continued sp. Gamma)
2011234567890^-- bases must start in col 11, and run 'til #bases
202        (spaces & newlines are okay)
203---------------------------------------------------
204Phylip version 3.3 format (e.g., DNAML):
205
206  5    42  YF             (# seqs, #bases, YF)
207Turkey    AAGCTNGGGC ATTTCAGGGT
208Salmo gairAAGCCTTGGC AGTGCAGGGT
209H. SapiensACCGGTTGGC CGTTCAGGGT
210Chimp     AAACCCTTGC CGTTACGCTT
211Gorilla   AAACCCTTGC CGGTACGCTT
2121234567890^-- bases must start in col 11
213  !! this version interleaves the species -- contrary to
214     all other output formats.
215
216GAGCCCGGGC AATACAGGGT AT
217GAGCCGTGGC CGGGCACGGT AT
218ACAGGTTGGC CGTTCAGGGT AA
219AAACCGAGGC CGGGACACTC AT
220AAACCATTGC CGGTACGCTT AA
221
222---------------------------------------------------
223Phylip version 3.4 format (e.g., DNAML)
224-- Both Interleaved and sequential are permitted
225
226   5   13                (# seqs, #bases)
227Alpha     AACGTGGCCAAAT
228          aaaagggccc...  (continued sp. alpha)
229Beta      AAGGTCGCCAAAC
230          aaaagggccc...  (continued sp. beta)
231Gamma     CATTTCGTCACAA
232          aaaagggccc...  (continued sp. Gamma)
2331234567890^-- bases must start in col 11, and run 'til #bases
234        (spaces, newlines and numbers are ignored)
235
236---------------------------------------------------
237Gary Olsen (multiple) sequence editor /print format:
238
239!---------------------
240!17Oct91 -- error in original copy of olsen /print format, shifted right 1 space
241! here is correct copy:
242  301  40 Tb.thiop  CGCAGCGAAA----------GCUNUGCUAAUACCGCAUA-CGnCCUG-----------------------------------------------------  Tb.thiop
243123456789012345678901
244  301  42 Rhc.purp  CGUAGCGAAA----------GUUACGCUAAUACCGCAUA-UUCUGUG-----------------------------------------------------  Rhc.purp
245
246  301  44 Rhc.gela  nnngnCGAAA----------GCCGGAUUAAUACCGCAUA-CGACCUA-----------------------------------------------------  Rhc.gela
247!---------------------
248
249 RNase P RNA components.  on 20-FEB-90 17:23:58
250
251    1 (E.c. pr ):  Base pairing in Escherichia coli RNase P RNA.
252    2 (chrom   ):  Chromatium
253      :
254   12 (B.brevis):  Bacillus brevis RNase P RNA, B. James.
255   13 ( 90% con):   90% conserved
256   14 (100% con):  100% conserved
257   15 (gram+ pr):  pairing
258
2591
260 RNase P RNA components.  on 20-FEB-90 17:23:58
261
262 Posi-   Sequence
263 tion:   identity:   Data:
264
265     1   1 E.c. pr      <<<<<<<<<< {{{{{{{{<<:<<<<<<<<<<^<<<<<<====>>>>  E.c. pr
266     1   2 chrom        GGAGUCGGCCAGACAGUCGCUUCCGUCCU------------------  chrom
267            :
268     1  12 B.brevis  AUGCAGGAAAUGCGGGUAGCCGCUGCCGCAAUCGUCU-------------  B.brevis
2691234567890123456789012 <! this should be 21 not 22,
270! this example must be inset on left by 1 space from olsen /print files !
271     1  13  90% con           G  C G  A  CGC GC               -    -      90% con
272     1  14 100% con                G  A  CGC                             100% con
273     1  15 gram+ pr     <<<<<<<<<< {{{{{{{{<<<<<<<<<<<<<===============  gram+ pr
274
275    60   1 E.c. pr   >>>>>>^>>^>>>>:>>    <<<^<<<< {{{{{                 E.c. pr
276    60   2 chrom     -----GGUG-ACGGGGGAGGAAAGUCCGG-GCUCCAU-------------  chrom
277    :       :
278    60  10 B.stearo  ----UU-CG-GCCGUAGAGGAAAGUCCAUGCUCGCACGGUGCUGAGAUGC  B.stearo
279
280
281---------------------------------------------------
282  GCG MSF format
283Title line
284
285picorna.msf  MSF: 100  Type: P  January 17, 1991  17:53  Check: 541
286..
287Name: Cb3              Len:   100  Check: 7009  Weight:  1.00
288Name: E                Len:   100  Check:   60  Weight:  1.00
289
290//
291
292   1                                                   50
293Cb3  ...gpvedai .......t.. aaigr..vad tvgtgptnse aipaltaaet
294  E  gvenae.kgv tentna.tad fvaqpvylpe .nqt...... kv.affynrs
295
296   51                                                 100
297
298Cb3  ghtsqvvpgd tmqtrhvkny hsrsestien flcrsacvyf teykn.....
299  E  ...spi.gaf tvks...... gs.lesgfap .fsngtc.pn sviltpgpqf
300
301---------------------------------------------------
302     PIR format
303This is NBRF-PIR MAILSERVER version 1.45
304Command-> get PIR3:A31391
305\\\
306ENTRY           A31391       #Type Protein
307TITLE           *Esterase-6 - Fruit fly (Drosophila melanogaster)
308
309DATE            03-Aug-1992 #Sequence 03-Aug-1992 #Text 03-Aug-1992
310PLACEMENT          0.0    0.0    0.0    0.0    0.0
311COMMENT         *This entry is not verified.
312SOURCE          Drosophila melanogaster
313
314REFERENCE
315   #Authors     Cooke P.H., Oakeshott J.G.
316   #Citation    submitted to GenBank, April 1989
317   #Reference-number A31391
318   #Accession   A31391
319   #Cross-reference GB:J04167
320
321SUMMARY       #Molecular-weight 61125  #Length 544  #Checksum  1679
322SEQUENCE
323                5        10        15        20        25        30
324      1 M N Y V G L G L I I V L S C L W L G S N A S D T D D P L L V
325     31 Q L P Q G K L R G R D N G S Y Y S Y E S I P Y A E P P T G D
326     61 L R F E A P E P Y K Q K W S D I F D A T K T P V A C L Q W D
327     91 Q F T P G A N K L V G E E D C L T V S V Y K P K N S K R N S
328    121 F P V V A H I H G G A F M F G A A W Q N G H E N V M R E G K
329    151 F I L V K I S Y R L G P L G F V S T G D R D L P G N Y G L K
330    181 D Q R L A L K W I K Q N I A S F G G E P Q N V L L V G H S A
331    211 G G A S V H L Q M L R E D F G Q L A R A A F S F S G N A L D
332    241 P W V I Q K G A R G R A F E L G R N V G C E S A E D S T S L
333    271 K K C L K S K P A S E L V T A V R K F L I F S Y V P F A P F
334    301 S P V L E P S D A P D A I I T Q D P R D V I K S G K F G Q V
335    331 P W A V S Y V T E D G G Y N A A L L L K E R K S G I V I D D
336    361 L N E R W L E L A P Y L L F Y R D T K T K K D M D D Y S R K
337    391 I K Q E Y I G N Q R F D I E S Y S E L Q R L F T D I L F K N
338    421 S T Q E S L D L H R K Y G K S P A Y A Y V Y D N P A E K G I
339    451 A Q V L A N R T D Y D F G T V H G D D Y F L I F E N F V R D
340    481 V E M R P D E Q I I S R N F I N M L A D F A S S D N G S L K
341    511 Y G E C D F K D N V G S E K F Q L L A I Y I D G C Q N R Q H
342    541 V E F P
343///
344\\\
345---------------------------------------------------
346PAUP format:
347The NEXUS Format
348
349Every block starts with "BEGIN blockname;" and ends with "END;".
350Each block is composed of one or more statements, each
351terminated by a semicolon (;).
352
353Comments may be included in NEXUS files by enclosing them within
354square brackets, as in "[This is a comment]."
355
356NEXUS-conforming files are identified by a "#NEXUS" directive at
357the very beginning of the file (line 1, column 1).  If the
358#NEXUS is omitted PAUP issues a warning but continues
359processing.
360
361NEXUS files are entirely free-format.  Blanks, tabs, and
362newlines may be placed anywhere in the file.  Unless RESPECTCASE
363is requested, commands and data may be entered in upper case,
364lower case, or a mixture of upper and lower case.
365
366The following conventions are used in the syntax descriptions of
367the various blocks.  Upper-case items are entered exactly as
368shown.  Lower-case items inside of angle brackets -- e.g., <x>
369-- represent items to be substituted by the user.  Items inside
370of square brackets -- e.g., [X] -- are optional.  Items inside
371of curly braces and separated by vertical bars -- e.g.,  { X | Y
372| Z } -- are mutually exclusive options.
373
374
375The DATA Block
376
377The DATA block contains the data matrix and other associated
378information.  Its syntax is:
379
380BEGIN DATA;
381DIMENSIONS NTAX=<number of taxa> NCHAR=<number of characters>;
382  [ FORMAT  [ MISSING=<missing-symbol> ]
383        [ LABELPOS={ LEFT | RIGHT } ]
384        [ SYMBOLS="<symbols-list>" ]
385        [ INTERLEAVE ]
386        [ MATCHCHAR=<match-symbol> ]
387        [ EQUATE="<symbol>=<expansion> [<symbol>=<expansion>...]" ]
388        [ TRANSPOSE ]
389        [ RESPECTCASE ]
390        [ DATATYPE = { STANDARD | DNA | RNA | PROTEIN } ]; ]
391        [ OPTIONS [ IGNORE={ INVAR | UNINFORM } ]
392        [ MSTAXA = { UNCERTAIN | POLYMORPH | VARIABLE } ]
393        [ ZAP = "<list of zapped characters>" ] ; ]
394  [ CHARLABELS <label_1> <label_2> <label_NCHAR> ; ]
395  [ TAXLABELS <label1_1> <label1_2> <label1_NTAX> ; ]
396  [ STATELABELS <currently ignored by PAUP> ; ]
397  MATRIX <data-matrix> ;
398  END;
399
400--- example PAUP file
401
402#NEXUS
403
404[!Brown et al. (1982) primate mitochondrial DNA]
405
406begin data;
407  dimensions ntax=5 nchar=896;
408  format datatype=dna matchchar=. interleave missing='-';
409  matrix
410[                              2                    4                    6            8                    ]
411[         1                    1                    1                    1            1                    ]
412human     aagcttcaccggcgcagtca ttctcataatcgcccacggR cttacatcctcattactatt ctgcctagcaaactcaaact acgaacgcactcacagtcgc
413chimp     ................a.t. .c.................a ...............t.... ..................t. .t........c.........
414gorilla   ..................tg ....t.....t........a ........a......t.... .................... .......a..c.....c...
415orang     ................ac.. cc.....g..t.....t..a ..c........cc....g.. .................... .......a..c.....c...
416gibbon    ......t..a..t...ac.g .c.................a ..a..c..t..cc.g..... ......t............. .......a........c...
417
418[         8                    8                    8                    8            8              8     ]
419[         0                    2                    4                    6            8              9     ]
420[         1                    1                    1                    1            1              6     ]
421human     cttccccacaacaatattca tgtgcctagaccaagaagtt attatctcgaactgacactg agccacaacccaaacaaccc agctctccctaagctt
422chimp     t................... .a................c. ........a.....g..... ...a................ ................
423gorilla   ..................tc .a................c. ........a.g......... ...a.............tt. .a..............
424orang     ta....a...........t. .c.......ga......acc ..cg..a.a......tg... .a.a..c.....g...cta. .a.....a........
425gibbon    a..t.......t........ ....ac...........acc .....t..a........... .a.tg..........gctag .a..............
426  ;
427end;
428---------------------------------------------------
429
430
431
432
433
434
435|||||||||||  Sample SMTP mail header
436---------------------------------------------------
437
438- - - - - - - - -
439From GenBank-Retrieval-System@genbank.bio.net Sun Nov 10 17:28:56 1991
440Received: from genbank.bio.net by sunflower.bio.indiana.edu
441        (4.1/9.5jsm) id AA19328; Sun, 10 Nov 91 17:28:55 EST
442Received: by genbank.bio.net (5.65/IG-2.0)
443        id AA14458; Sun, 10 Nov 91 14:30:03 -0800
444Date: Sun, 10 Nov 91 14:30:03 -0800
445Message-Id: <9111102230.AA14458@genbank.bio.net>
446From: Database Server <GenBank-Retrieval-System@genbank.bio.net>
447To: gilbertd@sunflower.bio.indiana.edu
448Subject: Results of Query for drorna
449Status: R
450
451No matches on drorna.
452- - - - - -
453From GenBank-Retrieval-System@genbank.bio.net Sun Nov 10 17:28:49 1991
454Received: from genbank.bio.net by sunflower.bio.indiana.edu
455        (4.1/9.5jsm) id AA19323; Sun, 10 Nov 91 17:28:47 EST
456Received: by genbank.bio.net (5.65/IG-2.0)
457        id AA14461; Sun, 10 Nov 91 14:30:03 -0800
458Date: Sun, 10 Nov 91 14:30:03 -0800
459Message-Id: <9111102230.AA14461@genbank.bio.net>
460From: Database Server <GenBank-Retrieval-System@genbank.bio.net>
461To: gilbertd@sunflower.bio.indiana.edu
462Subject: Results of Query for droest6
463Status: R
464
465LOCUS       DROEST6      1819 bp ss-mRNA            INV       31-AUG-1987
466DEFINITION  D.melanogaster esterase-6 mRNA, complete cds.
467ACCESSION   M15961
468
469
470
471
472
473
474
475
476
477
478
479
480|||||||||||  GCG manual discussion of sequence symbols:
481---------------------------------------------------
482
483III_SEQUENCE_SYMBOLS
484
485
486     GCG programs allow all upper and lower  case  letters,  periods  (.),
487asterisks  (*),  pluses  (+),  ampersands  (&),  and ats (@) as symbols in
488biological sequences.  Nucleotide  symbols,  their  complements,  and  the
489standard  one-letter amino acid symbols are shown below in separate lists.
490The meanings of the symbols +, &, and @ have not  been  assigned  at  this
491writing (March, 1989).
492
493     GCG uses the  letter  codes  for  amino  acid  codes  and  nucleotide
494ambiguity    proposed    by    IUB    (Nomenclature    Committee,    1985,
495Eur. J. Biochem. 150; 1-5).  These codes are  compatible  with  the  codes
496used by the EMBL, GenBank, and NBRF data libraries.
497
498
499                               NUCLEOTIDES
500
501     The meaning of each symbol, its complement,  and  the  Cambridge  and
502Stanford  equivalents  are  shown below.  Cambridge files can be converted
503into GCG files and vice versa with the programs FROMSTADEN  and  TOSTADEN.
504IntelliGenetics  sequence  files  can  be interconverted with the programs
505FROMIG and TOIG.
506
507IUB/GCG      Meaning     Complement   Staden/Sanger  Stanford
508
509   A             A             T             A            A
510   C             C             G             C            C
511   G             G             C             G            G
512  T/U            T             A             T           T/U
513   M           A or C          K             5            J
514   R           A or G          Y             R            R
515   W           A or T          W             7            L
516   S           C or G          S             8            M
517   Y           C or T          R             Y            Y
518   K           G or T          M             6            K
519   V        A or C or G        B       not supported      N
520   H        A or C or T        D       not supported      N
521   D        A or G or T        H       not supported      N
522   B        C or G or T        V       not supported      N
523  X/N     G or A or T or C     X            -/X           N
524   .    not G or A or T or C   .       not supported      ?
525
526
527  The frame ambiguity codes used by Staden are not  supported  by  GCG
528and   are  translated  by  FROMSTADEN  as  the  lower  case  single  base
529equivalent.
530
531     Staden Code          Meaning              GCG
532
533         D                C or CC                c
534         V                T or TT                t
535         B                A or AA                a
536         H                G or GG                g
537         K                C or CX                c
538         L                T or TX                t
539         M                A or AX                a
540         N                G or GX                g
541
542
543                        AMINO ACIDS
544
545  Here is a list of the standard one-letter amino acid codes and their
546three-letter  equivalents.   The synonymous codons and their depiction in
547the IUB codes are shown.  You should recognize that the codons  following
548semicolons  (;)  are  not  sufficiently specific to define a single amino
549acid even though they represent the best possible back  translation  into
550the IUB codes!  All of the relationships in this list can be redefined by
551the user in a local data file described below.
552
553                                                      IUB
554Symbol 3-letter  Meaning      Codons                Depiction
555 A    Ala       Alanine      GCT,GCC,GCA,GCG         !GCX
556 B    Asp,Asn   Aspartic,
557                Asparagine   GAT,GAC,AAT,AAC         !RAY
558 C    Cys       Cysteine     TGT,TGC                 !TGY
559 D    Asp       Aspartic     GAT,GAC                 !GAY
560 E    Glu       Glutamic     GAA,GAG                 !GAR
561 F    Phe     Phenylalanine  TTT,TTC                 !TTY
562 G    Gly       Glycine      GGT,GGC,GGA,GGG         !GGX
563 H    His       Histidine    CAT,CAC                 !CAY
564 I    Ile       Isoleucine   ATT,ATC,ATA             !ATH
565 K    Lys       Lysine       AAA,AAG                 !AAR
566 L    Leu       Leucine      TTG,TTA,CTT,CTC,CTA,CTG
567!TTR,CTX,YTR;YTX
568 M    Met       Methionine   ATG                     !ATG
569 N    Asn       Asparagine   AAT,AAC                 !AAY
570 P    Pro       Proline      CCT,CCC,CCA,CCG         !CCX
571 Q    Gln       Glutamine    CAA,CAG                 !CAR
572 R    Arg       Arginine     CGT,CGC,CGA,CGG,AGA,AGG
573!CGX,AGR,MGR;MGX
574 S    Ser       Serine       TCT,TCC,TCA,TCG,AGT,AGC !TCX,AGY;WSX
575 T    Thr       Threonine    ACT,ACC,ACA,ACG         !ACX
576 V    Val       Valine       GTT,GTC,GTA,GTG         !GTX
577 W    Trp       Tryptophan   TGG                     !TGG
578 X    Xxx       Unknown                              !XXX
579 Y    Tyr       Tyrosine     TAT, TAC                !TAY
580 Z    Glu,Gln   Glutamic,
581                Glutamine    GAA,GAG,CAA,CAG         !SAR
582 *    End       Terminator   TAA, TAG, TGA           !TAR,TRA;TRR
583
584
585
586
587
588
589
590
591|||||||||||  docs from PSC on sequence formats:
592---------------------------------------------------
593
594
595          Nucleic Acid and Protein Sequence File Formats
596
597
598It will probably save you some time if you have your data in a usable
599format before you send it to us.  However, we do have the University of
600Wisconsin Genetics Computing Group programs running on our VAXen and
601this package includes several reformatting utilities.  Our programs
602usually recognize any of several standard formats, including GenBank,
603EMBL, NBRF, and MolGen/Stanford.  For the purposes of annotating an
604analysis we find the GenBank and EMBL formats most useful, particularly
605if you have already received an accession number from one of these
606organizations for your sequence.
607
608Our programs do not require that all of the line types available in
609GenBank, EMBL, or NBRF file formats be present for the file format to
610be recognized and processed.  The following pages outline the essential
611details required for correct processing of files by our programs.
612Additional information may be present but will generally be ignored.
613
614
615                      GenBank File Format
616
617File Header
618
6191.  The first line in the file must have "GENETIC SEQUENCE DATA BANK"
620    in spaces 20 through 46 (see LINE  1, below).
6212.  The next 8 lines may contain arbitrary text.  They are ignored but
622    are required to maintain the GenBank format (see LINE 2 - LINE 9).
623
624Sequence Data Entries
625
6263.  Each sequence entry in the file should have the following format.
627    a) first line:   Must have LOCUS in the first 5 spaces.  The
628                     genetic locus name or identifier must be in spaces
629                     13 - 22.  The length of the sequences is right
630                     justified in spaces 23 through 29 (see LINE  10).
631    b) second line:  Must have DEFINITION in the first 10 spaces.
632                     Spaces 13 - 80 are free form text to identify the
633                     sequence (see LINE  11).
634    c) third line:   Must have ACCESSION in the first 9 spaces.  Spaces
635                     13 - 18 must hold the primary accession number
636                     (see LINE  12).
637    d) fourth line:  Must have ORIGIN in the first 6 spaces.  Nothing
638                     else is required on this line, it indicates that
639                     the nucleic acid sequence begins on the next line
640                     (see LINE  13).
641    e) fifth line:   Begins the nucleotide sequence.  The first 9
642                     spaces of each sequence line may either be blank
643                     or may contain the position in the sequence of the
644                     first nucleotide on the line.  The next 66 spaces
645                     hold the nucleotide sequence in six blocks of ten
646                     nucleotides.  Each of the six blocks begins with a
647                     blank space followed by ten nucleotides.  Thus the
648                     first nucleotide is in space eleven of the line while
649                     the last is in space 75 (see LINE  14, LINE  15).
650    f) last line:    Must have // in the first 2 spaces to indicate
651                     termination of the sequence (see LINE  16).
652
653NOTE:  Multiple sequences may appear in each file.  To begin another
654       sequence go back to a) and start again.
655
656
657                         Example GenBank file
658
659
660LINE  1  :                   GENETIC SEQUENCE DATA BANK
661LINE  2  :
662LINE  3  :
663LINE  4  :
664LINE  5  :
665LINE  6  :
666LINE  7  :
667LINE  8  :
668LINE  9  :
669LINE 10  :LOCUS       L_Name     Length BP
670LINE 11  :DEFINITION  Describe the sequence any way you want
671LINE 12  :ACCESSION   Accession Number
672LINE 13  :ORIGIN
673LINE 14  :        1 acgtacgtac gtacgtacgt acgtacgtac gtacgtacgt a...
674LINE 15  :       61 acgt...
675LINE 16  ://
676
677
678
679                         EMBL File Format
680
681Unlike the GenBank file format the EMBL file format does not require
682a series of header lines.  Thus the first line in the file begins
683the first sequence entry of the file.
684
6851.  The first line of each sequence entry contains the two letters ID
686    in the first two spaces.  This is followed by the EMBL identifier
687    in spaces 6 through 14.  (See LINE  1).
688
6892.  The second line of each sequence entry has the two letters AC in
690    the first two spaces.  This is followed by the accession number in
691    spaces 6 through 11.  (See LINE  2).
692
6933.  The third line of each sequence entry has the two letters DE in the
694    first two spaces.  This is followed by a free form text definition
695    in spaces 6 through 72.  (See LINE  3).
696
6974.  The fourth line in each sequence entry has the two letters SQ in
698    the first two spaces.  This is followed by the length of the
699    sequence beginning at or after space 13.  After the sequence length
700    there is a blank space and the two letters BP.  (See LINE  4).
701
7025.  The nucleotide sequence begins on the fifth line of the sequence
703    entry.  Each line of sequence begins with four blank spaces. The
704    next 66 spaces hold the nucleotide sequence in six blocks of ten
705    nucleotides.  Each of the six blocks begins with a blank space
706    followed by ten nucleotides.  Thus the first nucleotide is in space
707    6 of the line while the last is in space 70.  (See LINE  5 -
708    LINE  6).
709
7106.  The last line of each sequence entry in the file is a terminator
711    line which has the two characters // in the first two spaces.
712    (See LINE  7).
713
7147.  Multiple sequences may appear in each file.  To begin another
715    sequence go back to item 1 and start again.
716
717
718                          Example EMBL file
719
720LINE  1  :ID   ID_name
721LINE  2  :AC   Accession number
722LINE  3  :DE   Describe the sequence any way you want
723LINE  4  :SQ          Length BP
724LINE  5  :     ACGTACGTAC GTACGTACGT ACGTACGTAC GTACGTA...
725LINE  6  :     ACGT...
726LINE  7  ://
727
728
729
730            NBRF (protein or nucleic acid) File Format
731
7321.  The first line of each sequence entry begins with a greater than
733  symbol, >.  This is immediately followed by the two character
734  sequence type specifier.  Space four must contain a semi-colon.
735  Beginning in space five is the sequence name or identification code
736  for the NBRF database.  The code is from four to six letters and
737  numbers.  (See LINE  1).
738
739!!!! >> add these to readseq
740          Specifier             Sequence type
741
742              P1                protein, complete
743              F1                protein, fragment
744              DL                DNA, linear
745              DC                DNA, circular
746              RL                RNA, linear
747              RC                RNA, circular
748              N1                functional RNA, other than tRNA
749              N3                tRNA
750
7512.  The second line of each sequence entry contains two kinds of
752  information.  First is the sequence name which is separated from
753  the organism or organelle name by the three character sequence
754  blank space, dash, blank space, " - ".  There is no special
755  character marking the beginning of this line.  (See LINE  2).
756
7573.  Either the amino acid or nucleic acid sequence begins on line three
758  and can begin in any space, including the first.  The sequence is
759  free format and may be interrupted by blanks for ease of reading.
760  Protein sequences may contain special punctuation to indicate
761  various indeterminacies in the sequence.  In the NBRF data files
762  all lines may be up to 500 characters long.  However some PSC
763  programs currently have a limit of 130 characters per line
764  (including blanks), and BitNet will not accept lines of over eighty
765  characters.  (See LINE  3, LINE  4, and LINE  5).
766
767  The last character in the sequence must be an asterisk, *.
768
769                       Example NBRF file
770
771 LINE  1  :>P1;CBRT
772 LINE  2  :Cytochrome b - Rat mitochondrion (SGC1)
773 LINE  3  :M T N I R K S H P L F K I I N H S F I D L P A P S
774 LINE  4  : VTHICRDVN Y GWL IRY
775 LINE  5  :TWIGGQPVEHPFIIIGQLASISYFSIILILMPISGIVEDKMLKWN*
776
777
778
779                MolGen/Stanford File Format
780
7811.  The first line in a sequence file is a comment line.  This line
782  begins with a semi-colon in the first space.  This line need
783  not be present.  If it is present it holds descriptive text.
784  There may be as many comment lines as desired at the start of
785  the sequence file.  (See LINE  1).
786
7872.  The second line must be present and contains an identifier or
788  name for the sequence in the first ten spaces.  (See LINE  2).
789
7903.  The sequence begins on the third line and occupies up to eighty
791  spaces.  Spaces may be included in the sequence for ease of
792  reading.  The sequence continues for as many lines as needed
793  and is terminated with a 1 or 2.  1 indicates a linear sequence
794  while 2 marks a circular sequence.  (See LINE  3 and LINE  4).
795
796                          Example MolGen/Stanford file
797
798LINE  1  :;  Describe the sequence any way you want
799LINE  2  :ECTRNAGLY2
800LINE  3  :ACGCACGTAC ACGTACGTAC   A C G T C C G T ACG TAC GTA CGT
801LINE  4  :  GCTTA   GG G C T A1
802
803
804
805
806|||||||||||  Phylip file format
807---------------------------------------------------
808
809        Phylip 3.3 File Format (DNA sequences)
810
811
812     The input and output formats for PROTPARS and for RESTML are described  in
813their  document  files.   In  general  their input formats are similar to those
814described here, except that the one-letter codes for data are specific to those
815programs  and  are  described in those document files.  Since the input formats
816for the eight DNA sequence programs apply to  all  eight,  they  are  described
817here.   Their  input  formats are standard: the data have A's, G's, C's and T's
818(or U's).  The first line of the input file contains the number of species  and
819the  number  of  sites.   As  with  the other programs, options information may
820follow this.  In the case of DNAML, DNAMLK,  and  DNADIST  an  additional  line
821(described  in  the  document file for these programs) may follow the first one.
822Following this, each species starts on a new line.  The first 10 characters  of
823that  line  are the species name.  There then follows the base sequence of that
824species, each character being one of the letters A, B, C, D, G, H, K, M, N,  O,
825R, S, T, U, V, W, X, Y, ?, or - (a period was also previously allowed but it is
826no longer allowed, because it sometimes is used in aligned sequences to mean
827"the  same  as  the  sequence  above").   Blanks  will  be ignored, and so will
828numerical digits.  This allows GENBANK and EMBL sequence  entries  to  be  read
829with minimum editing.
830
831     These characters can be  either  upper  or  lower  case.   The  algorithms
832convert  all  input  characters  to upper case (which is how they are treated).
833The characters constitute the IUPAC (IUB) nucleic acid code  plus  some  slight
834extensions.  They enable input of nucleic acid sequences taking full account of
835any ambiguities in the sequence.
836
837The sequences can continue over multiple lines; when this is done the sequences
838must  be  either  in  "interleaved"  format, similar to the output of alignment
839programs, or "sequential" format.  These are described  in  the  main  document
840file.   In sequential format all of one sequence is given, possibly on multiple
841lines, before the next starts.  In interleaved format the  first  part  of  the
842file  should  contain  the first part of each of the sequences, then possibly a
843line containing nothing but a carriage-return character, then the  second  part
844of  each  sequence, and so on.  Only the first parts of the sequences should be
845preceded by names.  Here is a hypothetical example of interleaved format:
846
847  5    42
848Turkey    AAGCTNGGGC ATTTCAGGGT
849Salmo gairAAGCCTTGGC AGTGCAGGGT
850H. SapiensACCGGTTGGC CGTTCAGGGT
851Chimp     AAACCCTTGC CGTTACGCTT
852Gorilla   AAACCCTTGC CGGTACGCTT
853
854GAGCCCGGGC AATACAGGGT AT
855GAGCCGTGGC CGGGCACGGT AT
856ACAGGTTGGC CGTTCAGGGT AA
857AAACCGAGGC CGGGACACTC AT
858AAACCATTGC CGGTACGCTT AA
859
860while in sequential format the same sequences would be:
861
862  5    42
863Turkey    AAGCTNGGGC ATTTCAGGGT
864GAGCCCGGGC AATACAGGGT AT
865Salmo gairAAGCCTTGGC AGTGCAGGGT
866GAGCCGTGGC CGGGCACGGT AT
867H. SapiensACCGGTTGGC CGTTCAGGGT
868ACAGGTTGGC CGTTCAGGGT AA
869Chimp     AAACCCTTGC CGTTACGCTT
870AAACCGAGGC CGGGACACTC AT
871Gorilla   AAACCCTTGC CGGTACGCTT
872AAACCATTGC CGGTACGCTT AA
873
874
875Note, of course, that a portion of a sequence like this:
876
877   300   AAGCGTGAAC GTTGTACTAA TRCAG
878
879is perfectly legal, assuming that the species name  has  gone  before,  and  is
880filled  out  to  full  length  by  blanks.  The above digits and blanks will be
881ignored, the sequence being taken as starting at the first base symbol (in this
882case an A).
883
884     The present versions of the programs may sometimes have difficulties  with
885the  blank  lines  between  groups of lines, and if so you might want to retype
886those lines, making sure that they have only a  carriage-return  and  no  blank
887characters on them, or you may perhaps have to eliminate them.  The symptoms of
888this problem are that the programs complain that the sequences are not properly
889aligned, and you can find no other cause for this complaint.
890
891------------------------------------------------
892
893
894|||||||||||  ASN.1 file format
895---------------------------------------------------
896
897
898ASN.1 -- see NCBI toolkit docs, source and examples (ncbi.nlm.nih.gov)
899
900Example asn.1 sequence file----
901
902Bioseq-set ::= {
903seq-set {
904  seq {
905    id { local id 1 } ,                 -- id essential
906    descr {  title "Dummy sequence data from nowhere"  } ,  -- optional
907    inst {                              -- inst essential
908      repr raw ,
909      mol dna ,
910      length 156 ,
911      topology linear ,
912      seq-data
913        iupacna "GAATTCATTTTTGAAACAAATCGACCTGACGACGGAATGGTACTCGAATTA
914TGGGCCAAAGGGTTTTATGGGACAAATTAATAGGTGTTCATTATATGCCACTTTCGGAGATTAGATACAGCAATGCAG
915TGGATTCAAAGCAATAGAGTTGTTCTT"
916      } } ,
917
918        seq {
919          id { local id 2 } ,
920          descr {  title "Dummy sequence 2 data from somewhere else"  } ,
921          inst {
922                repr raw ,
923                mol dna ,
924                length 150 ,
925                topology linear ,
926                seq-data
927                  iupacna "TTTTTTTTTTTTGAAACAAATCGACCTGACGACGGAATGGTACTCGAATTA
928TGGGCCAAAGGGTTTTATGGGACAAATTAATAGGTGTTCATTATATGCCACTTTCGGAGATTAGATACAGCAATGCAG
929TGGATTCAAAGCAATAGAGTT"
930            }
931          }
932        }
933      }
934
935
936partial ASN.1 description from toolkit
937
938Bioseq ::= SEQUENCE {
939    id SET OF Seq-id ,            -- equivalent identifiers
940    descr Seq-descr OPTIONAL , -- descriptors
941    inst Seq-inst ,            -- the sequence data
942    annot SET OF Seq-annot OPTIONAL }
943
944Seq-inst ::= SEQUENCE {            -- the sequence data itself
945    repr ENUMERATED {              -- representation class
946        not-set (0) ,              -- empty
947        virtual (1) ,              -- no seq data
948        raw (2) ,                  -- continuous sequence
949        seg (3) ,                  -- segmented sequence
950        const (4) ,                -- constructed sequence
951        ref (5) ,                  -- reference to another sequence
952        consen (6) ,               -- consensus sequence or pattern
953        map (7) ,                  -- ordered map (genetic, restriction)
954        other (255) } ,
955    mol ENUMERATED {               -- molecule class in living organism
956        not-set (0) ,              --   > cdna = rna
957        dna (1) ,
958        rna (2) ,
959        aa (3) ,
960        na (4) ,                   -- just a nucleic acid
961        other (255) } ,
962    length INTEGER OPTIONAL ,      -- length of sequence in residues
963    fuzz Int-fuzz OPTIONAL ,       -- length uncertainty
964    topology ENUMERATED {          -- topology of molecule
965        not-set (0) ,
966        linear (1) ,
967        circular (2) ,
968        tandem (3) ,               -- some part of tandem repeat
969        other (255) } DEFAULT linear ,
970    strand ENUMERATED {            -- strandedness in living organism
971        not-set (0) ,
972        ss (1) ,                   -- single strand
973        ds (2) ,                   -- double strand
974        mixed (3) ,
975        other (255) } OPTIONAL ,   -- default ds for DNA, ss for RNA, pept
976    seq-data Seq-data OPTIONAL ,   -- the sequence
977    ext Seq-ext OPTIONAL ,         -- extensions for special types
978  hist Seq-hist OPTIONAL }       -- sequence history
979
980------------------------------------------------
Note: See TracBrowser for help on using the repository browser.