source: branches/stable/READSEQ/Formats

Last change on this file was 15411, checked in by westram, 7 years ago
  • fix warnings caused by converted documentation
    • conversion of html docs
      • remove lynx-"<HR/>"-output (arb-help-converter tries to add that to paragraphs)
      • add empty line after "generated from" message
    • manual reflow of one problematic section in READSEQ/Formats
  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 38.1 KB
Line 
1[file has been slightly modified to fit into arb help]
2
3||||||||||| ReadSeq supported formats   (revised 30Dec92)
4--------------------------------------------------------
5
6    -f[ormat=]Name Format name for output:
7       |  1. IG/Stanford           10. Olsen (in-only)
8       |  2. GenBank/GB            11. Phylip3.2
9       |  3. NBRF                  12. Phylip
10       |  4. EMBL                  13. Plain/Raw
11       |  5. GCG                   14. PIR/CODATA
12       |  6. DNAStrider            15. MSF
13       |  7. Fitch                 16. ASN.1
14       |  8. Pearson/Fasta         17. PAUP
15       |  9. Zuker (in-only)       18. Pretty (out-only)
16
17In general, output supports only minimal subsets of each format
18needed for sequence data exchanges.  Features, descriptions
19and other format-unique information are discarded.
20
21For users of the Olsen multi sequence editor (VMS): the Olsen format
22here is produced with the print command:
23  print/out=some.file
24Use Genbank output from readseq to produce a format that this
25editor can read, and use the command
26  load/genbank some.file
27Dan Davison has a VMS program that will convert to/from the
28Olsen native binary data format.  E-mail davison@uh.edu
29
30Warning: Phylip format input is now supported (30Dec92), however the
31auto-detection of Phylip format is very probabilistic and messy,
32especially distinguishing sequential from interleaved versions. It
33is not recommended that one use readseq to convert files from Phylip
34format to others unless essential.
35
36
37
38||||||||||| ReadSeq usage             (revised 11Nov91)
39--------------------------------------------------------
40
41A. determine file format:
42
43        short skiplines;  /* result: number of header lines to skip (or 0) */
44        short error;      /* error result or 0 */
45        short format;     /* resulting format code, see ureadseq.h */
46        char  *filename   = "Mysequence.file";
47
48        format = seqFileFormat( filename, &skiplines, &error);
49        if (error!=0) fail;
50
51B. read number and list of sequences (optional)
52
53        short numseqs;    /* resulting number of sequences found in file */
54        char  *seqlist;   /* list of sequence names, newline separated, 0 terminated */
55
56        seqlist = listSeqs( filename, skiplines, format, &numseqs, &error);
57        if (error==0)  display (seqlist);
58        free( seqlist);
59
60C.  read individual sequences as desired
61
62        short seqIndex;   /* sequence index #, or == kListSeqs for listSeqs equivalent */
63        long  seqlen;     /* length of seq */
64        char  seqid[256]; /* sequence name */
65        char  *seq;       /* sequence, 0 terminated, free when done */
66
67        seq = readSeq( seqIndex, filename, skiplines, format,
68                      &seqlen, &numseqs, &error, seqid);
69        if (error==0) manipulate(seq);
70        free(seq);
71
72D. write sequences as desired
73
74        int nlines;     /* number of lines of sequence written */
75        FILE* fout;     /* open file pointer (stdout or other) */
76        short outform;  /* output format, see ureadseq.h */
77
78        nlines = writeSeq( fout, seq, seqlen, format, outform, seqid);
79
80
81Note (30Dec92): There is various processing done by the main program (in readseq.c),
82  rather than just in the subroutines (in ureadseq.c).  Especially for interleaved
83  output formats, the writeSeq subroutine does not handle interleaving, nor some of
84  the formatting at the top and end of output files.  While seqFileFormat, listSeqs,
85  and readSeq subroutines are fairly self-contained, the writeSeq depends a lot on
86  auxiliary processing.  At some point, this may be revised so writeSeq is
87  self-contained.
88
89Note 2: The NCBI toolkit (ftp from ncbi.nlm.nih.gov) is needed for the ASN.1 format
90  reading (see ureadasn.c).  A bastard (but workable I hope) ASN.1 format is written
91  by writeSeq alone.
92
93
94
95|||||||||||  sequence formats....
96---------------------------------------------------
97
98stanford/IG
99;comments
100;...
101seq1 info
102abcd...
103efgh1 (or 2 = terminator)
104;another seq
105;....
106seq2 info
107abcd...1
108--- for e.g. ----
109;     Dro5s-T.Seq  Length: 120  April 6, 1989  21:22  Check: 9487  ..
110dro5stseq
111GCCAACGACCAUACCACGCUGAAUACAUCGGUUCUCGUCCGAUCACCGAAAUUAAGCAGCGUCGCGGGCG
112GUUAGUACUUAGAUGGGGGACCGCUUGGGAACACCGCGUGUUGUUGGCCU1
113
114;  TOIG of: Dro5srna.Seq  check: 9487  from: 1  to: 120
115---------------------------------------------------
116
117Genbank:
118LOCUS    seq1 ID..
119...
120ORIGIN ...
121123456789abcdefg....(1st 9 columns are formatting)
122     hijkl...
123//         (end of sequence)
124LOCUS     seq2 ID ..
125...
126ORIGIN
127      abcd...
128//
129---------------------------------------------------
130
131NBRF format: (from uwgcg ToNBRF)
132>DL;DRO5SRNA
133Iubio$Dua0:[Gilbertd.Gcg]Dro5srna.Seq;2 => DRO5SRNA
134
135      51  AAUUAAGCAG CGUCGCGGGC GGUUAGUACU UAGAUGGGGG ACCGCUUGGG
136     101  AACACCGCGU GUUGUUGGCC U
137
138---------------------------------------------------
139
140EMBL format
141ID345 seq1 id   (the 345 are spaces)
142... other info
143SQ345Sequence   (the 3,4,5 are spaces)
144abcd...
145hijk...
146//              (! this is proper end string: 12Oct90)
147ID    seq2 id
148...
149SQ   Sequence
150abcd...
151...
152//
153---------------------------------------------------
154
155UW GCG Format:
156comments of any form, up to ".." signal
157signal line has seq id, and " Check: ####   .."
158only 1 seq/file
159
160-- e.g. --- (GCG from GenBank)
161LOCUS       DROEST6      1819 bp ss-mRNA            INV       31-AUG-1987
162    ... much more ...
163ORIGIN      1 bp upstream of EcoRI site; chromosome BK9 region 69A1.
164
165INVERTEBRATE:DROEST6  Length: 1819  January 9, 1989  16:48  Check: 8008  ..
166
167       1  GAATTCGCCG GAGTGAGGAG CAACATGAAC TACGTGGGAC TGGGACTTAT
168
169      51  CATTGTGCTG AGCTGCCTTT GGCTCGGTTC GAACGCGAGT GATACAGATG
170
171
172---------------------------------------------------
173
174DNAStrider (Mac) = modified Stanford:
175; ### from DNA Strider  Friday, April 7, 1989   11:04:24 PM
176; DNA sequence  pBR322   4363  b.p. complete sequence
177;
178abcd...
179efgh
180//  (end of sequence)
181---------------------------------------------------
182
183Fitch format:
184Dro5srna.Seq
185 GCC AAC GAC CAU ACC ACG CUG AAU ACA UCG GUU CUC GUC CGA UCA CCG AAA UUA AGC AGC
186 GUC GCG GGC GGU UAG UAC UUA GAU GGG GGA CCG CUU GGG AAC ACC GCG UGU UGU UGG CCU
187Droest6.Seq
188 GAA TTC GCC GGA GTG AGG AGC AAC ATG AAC TAC GTG GGA CTG GGA CTT ATC ATT GTG CTG
189 AGC TGC CTT TGG CTC GGT TCG AAC GCG AGT GAT ACA GAT GAC CCT CTG TTG GTG CAG CTG
190---------------------------------------------------
191
192W.Pearson/Fasta format:
193>BOVPRL GenBank entry BOVPRL from omam file.  907 nucleotides.
194TGCTTGGCTGAGGAGCCATAGGACGAGAGCTTCCTGGTGAAGTGTGTTTCTTGAAATCAT
195
196---------------------------------------------------
197Phylip version 3.2 format (e.g., DNAML):
198
199   5   13 YF                (# seqs, #bases, YF)
200Alpha     AACGTGGCCAAAT
201          aaaagggccc...  (continued sp. alpha)
202Beta      AAGGTCGCCAAAC
203          aaaagggccc...  (continued sp. beta)
204Gamma     CATTTCGTCACAA
205          aaaagggccc...  (continued sp. Gamma)
2061234567890^-- bases must start in col 11, and run 'til #bases
207        (spaces & newlines are okay)
208---------------------------------------------------
209Phylip version 3.3 format (e.g., DNAML):
210
211  5    42  YF             (# seqs, #bases, YF)
212Turkey    AAGCTNGGGC ATTTCAGGGT
213Salmo gairAAGCCTTGGC AGTGCAGGGT
214H. SapiensACCGGTTGGC CGTTCAGGGT
215Chimp     AAACCCTTGC CGTTACGCTT
216Gorilla   AAACCCTTGC CGGTACGCTT
2171234567890^-- bases must start in col 11
218  !! this version interleaves the species -- contrary to
219     all other output formats.
220
221GAGCCCGGGC AATACAGGGT AT
222GAGCCGTGGC CGGGCACGGT AT
223ACAGGTTGGC CGTTCAGGGT AA
224AAACCGAGGC CGGGACACTC AT
225AAACCATTGC CGGTACGCTT AA
226
227---------------------------------------------------
228Phylip version 3.4 format (e.g., DNAML)
229-- Both Interleaved and sequential are permitted
230
231   5   13                (# seqs, #bases)
232Alpha     AACGTGGCCAAAT
233          aaaagggccc...  (continued sp. alpha)
234Beta      AAGGTCGCCAAAC
235          aaaagggccc...  (continued sp. beta)
236Gamma     CATTTCGTCACAA
237          aaaagggccc...  (continued sp. Gamma)
2381234567890^-- bases must start in col 11, and run 'til #bases
239        (spaces, newlines and numbers are ignored)
240
241---------------------------------------------------
242Gary Olsen (multiple) sequence editor /print format:
243
244!---------------------
245!17Oct91 -- error in original copy of olsen /print format, shifted right 1 space
246! here is correct copy:
247  301  40 Tb.thiop  CGCAGCGAAA----------GCUNUGCUAAUACCGCAUA-CGnCCUG-----------------------------------------------------  Tb.thiop
248123456789012345678901
249  301  42 Rhc.purp  CGUAGCGAAA----------GUUACGCUAAUACCGCAUA-UUCUGUG-----------------------------------------------------  Rhc.purp
250
251  301  44 Rhc.gela  nnngnCGAAA----------GCCGGAUUAAUACCGCAUA-CGACCUA-----------------------------------------------------  Rhc.gela
252!---------------------
253
254 RNase P RNA components.  on 20-FEB-90 17:23:58
255
256    1 (E.c. pr ):  Base pairing in Escherichia coli RNase P RNA.
257    2 (chrom   ):  Chromatium
258      :
259   12 (B.brevis):  Bacillus brevis RNase P RNA, B. James.
260   13 ( 90% con):   90% conserved
261   14 (100% con):  100% conserved
262   15 (gram+ pr):  pairing
263
2641
265 RNase P RNA components.  on 20-FEB-90 17:23:58
266
267 Posi-   Sequence
268 tion:   identity:   Data:
269
270     1   1 E.c. pr      <<<<<<<<<< {{{{{{{{<<:<<<<<<<<<<^<<<<<<====>>>>  E.c. pr
271     1   2 chrom        GGAGUCGGCCAGACAGUCGCUUCCGUCCU------------------  chrom
272            :
273     1  12 B.brevis  AUGCAGGAAAUGCGGGUAGCCGCUGCCGCAAUCGUCU-------------  B.brevis
2741234567890123456789012 <! this should be 21 not 22,
275! this example must be inset on left by 1 space from olsen /print files !
276     1  13  90% con           G  C G  A  CGC GC               -    -      90% con
277     1  14 100% con                G  A  CGC                             100% con
278     1  15 gram+ pr     <<<<<<<<<< {{{{{{{{<<<<<<<<<<<<<===============  gram+ pr
279
280    60   1 E.c. pr   >>>>>>^>>^>>>>:>>    <<<^<<<< {{{{{                 E.c. pr
281    60   2 chrom     -----GGUG-ACGGGGGAGGAAAGUCCGG-GCUCCAU-------------  chrom
282    :       :
283    60  10 B.stearo  ----UU-CG-GCCGUAGAGGAAAGUCCAUGCUCGCACGGUGCUGAGAUGC  B.stearo
284
285
286---------------------------------------------------
287  GCG MSF format
288Title line
289
290picorna.msf  MSF: 100  Type: P  January 17, 1991  17:53  Check: 541
291..
292Name: Cb3              Len:   100  Check: 7009  Weight:  1.00
293Name: E                Len:   100  Check:   60  Weight:  1.00
294
295//
296
297   1                                                   50
298Cb3  ...gpvedai .......t.. aaigr..vad tvgtgptnse aipaltaaet
299  E  gvenae.kgv tentna.tad fvaqpvylpe .nqt...... kv.affynrs
300
301   51                                                 100
302
303Cb3  ghtsqvvpgd tmqtrhvkny hsrsestien flcrsacvyf teykn.....
304  E  ...spi.gaf tvks...... gs.lesgfap .fsngtc.pn sviltpgpqf
305
306---------------------------------------------------
307     PIR format
308This is NBRF-PIR MAILSERVER version 1.45
309Command-> get PIR3:A31391
310\\\
311ENTRY           A31391       #Type Protein
312TITLE           *Esterase-6 - Fruit fly (Drosophila melanogaster)
313
314DATE            03-Aug-1992 #Sequence 03-Aug-1992 #Text 03-Aug-1992
315PLACEMENT          0.0    0.0    0.0    0.0    0.0
316COMMENT         *This entry is not verified.
317SOURCE          Drosophila melanogaster
318
319REFERENCE
320   #Authors     Cooke P.H., Oakeshott J.G.
321   #Citation    submitted to GenBank, April 1989
322   #Reference-number A31391
323   #Accession   A31391
324   #Cross-reference GB:J04167
325
326SUMMARY       #Molecular-weight 61125  #Length 544  #Checksum  1679
327SEQUENCE
328                5        10        15        20        25        30
329      1 M N Y V G L G L I I V L S C L W L G S N A S D T D D P L L V
330     31 Q L P Q G K L R G R D N G S Y Y S Y E S I P Y A E P P T G D
331     61 L R F E A P E P Y K Q K W S D I F D A T K T P V A C L Q W D
332     91 Q F T P G A N K L V G E E D C L T V S V Y K P K N S K R N S
333    121 F P V V A H I H G G A F M F G A A W Q N G H E N V M R E G K
334    151 F I L V K I S Y R L G P L G F V S T G D R D L P G N Y G L K
335    181 D Q R L A L K W I K Q N I A S F G G E P Q N V L L V G H S A
336    211 G G A S V H L Q M L R E D F G Q L A R A A F S F S G N A L D
337    241 P W V I Q K G A R G R A F E L G R N V G C E S A E D S T S L
338    271 K K C L K S K P A S E L V T A V R K F L I F S Y V P F A P F
339    301 S P V L E P S D A P D A I I T Q D P R D V I K S G K F G Q V
340    331 P W A V S Y V T E D G G Y N A A L L L K E R K S G I V I D D
341    361 L N E R W L E L A P Y L L F Y R D T K T K K D M D D Y S R K
342    391 I K Q E Y I G N Q R F D I E S Y S E L Q R L F T D I L F K N
343    421 S T Q E S L D L H R K Y G K S P A Y A Y V Y D N P A E K G I
344    451 A Q V L A N R T D Y D F G T V H G D D Y F L I F E N F V R D
345    481 V E M R P D E Q I I S R N F I N M L A D F A S S D N G S L K
346    511 Y G E C D F K D N V G S E K F Q L L A I Y I D G C Q N R Q H
347    541 V E F P
348///
349\\\
350---------------------------------------------------
351PAUP format:
352The NEXUS Format
353
354Every block starts with "BEGIN blockname;" and ends with "END;".
355Each block is composed of one or more statements, each
356terminated by a semicolon (;).
357
358Comments may be included in NEXUS files by enclosing them within
359square brackets, as in "[This is a comment]."
360
361NEXUS-conforming files are identified by a "#NEXUS" directive at
362the very beginning of the file (line 1, column 1).  If the
363#NEXUS is omitted PAUP issues a warning but continues
364processing.
365
366NEXUS files are entirely free-format.  Blanks, tabs, and
367newlines may be placed anywhere in the file.  Unless RESPECTCASE
368is requested, commands and data may be entered in upper case,
369lower case, or a mixture of upper and lower case.
370
371The following conventions are used in the syntax descriptions of
372the various blocks.  Upper-case items are entered exactly as
373shown.  Lower-case items inside of angle brackets -- e.g., <x>
374-- represent items to be substituted by the user.  Items inside
375of square brackets -- e.g., [X] -- are optional.  Items inside
376of curly braces and separated by vertical bars -- e.g.,  { X | Y
377| Z } -- are mutually exclusive options.
378
379
380The DATA Block
381
382The DATA block contains the data matrix and other associated
383information.  Its syntax is:
384
385BEGIN DATA;
386DIMENSIONS NTAX=<number of taxa> NCHAR=<number of characters>;
387  [ FORMAT  [ MISSING=<missing-symbol> ]
388        [ LABELPOS={ LEFT | RIGHT } ]
389        [ SYMBOLS="<symbols-list>" ]
390        [ INTERLEAVE ]
391        [ MATCHCHAR=<match-symbol> ]
392        [ EQUATE="<symbol>=<expansion> [<symbol>=<expansion>...]" ]
393        [ TRANSPOSE ]
394        [ RESPECTCASE ]
395        [ DATATYPE = { STANDARD | DNA | RNA | PROTEIN } ]; ]
396        [ OPTIONS [ IGNORE={ INVAR | UNINFORM } ]
397        [ MSTAXA = { UNCERTAIN | POLYMORPH | VARIABLE } ]
398        [ ZAP = "<list of zapped characters>" ] ; ]
399  [ CHARLABELS <label_1> <label_2> <label_NCHAR> ; ]
400  [ TAXLABELS <label1_1> <label1_2> <label1_NTAX> ; ]
401  [ STATELABELS <currently ignored by PAUP> ; ]
402  MATRIX <data-matrix> ;
403  END;
404
405--- example PAUP file
406
407#NEXUS
408
409[!Brown et al. (1982) primate mitochondrial DNA]
410
411begin data;
412  dimensions ntax=5 nchar=896;
413  format datatype=dna matchchar=. interleave missing='-';
414  matrix
415[                              2                    4                    6            8                    ]
416[         1                    1                    1                    1            1                    ]
417human     aagcttcaccggcgcagtca ttctcataatcgcccacggR cttacatcctcattactatt ctgcctagcaaactcaaact acgaacgcactcacagtcgc
418chimp     ................a.t. .c.................a ...............t.... ..................t. .t........c.........
419gorilla   ..................tg ....t.....t........a ........a......t.... .................... .......a..c.....c...
420orang     ................ac.. cc.....g..t.....t..a ..c........cc....g.. .................... .......a..c.....c...
421gibbon    ......t..a..t...ac.g .c.................a ..a..c..t..cc.g..... ......t............. .......a........c...
422
423[         8                    8                    8                    8            8              8     ]
424[         0                    2                    4                    6            8              9     ]
425[         1                    1                    1                    1            1              6     ]
426human     cttccccacaacaatattca tgtgcctagaccaagaagtt attatctcgaactgacactg agccacaacccaaacaaccc agctctccctaagctt
427chimp     t................... .a................c. ........a.....g..... ...a................ ................
428gorilla   ..................tc .a................c. ........a.g......... ...a.............tt. .a..............
429orang     ta....a...........t. .c.......ga......acc ..cg..a.a......tg... .a.a..c.....g...cta. .a.....a........
430gibbon    a..t.......t........ ....ac...........acc .....t..a........... .a.tg..........gctag .a..............
431  ;
432end;
433---------------------------------------------------
434
435
436
437
438
439
440|||||||||||  Sample SMTP mail header
441---------------------------------------------------
442
443- - - - - - - - -
444From GenBank-Retrieval-System@genbank.bio.net Sun Nov 10 17:28:56 1991
445Received: from genbank.bio.net by sunflower.bio.indiana.edu
446        (4.1/9.5jsm) id AA19328; Sun, 10 Nov 91 17:28:55 EST
447Received: by genbank.bio.net (5.65/IG-2.0)
448        id AA14458; Sun, 10 Nov 91 14:30:03 -0800
449Date: Sun, 10 Nov 91 14:30:03 -0800
450Message-Id: <9111102230.AA14458@genbank.bio.net>
451From: Database Server <GenBank-Retrieval-System@genbank.bio.net>
452To: gilbertd@sunflower.bio.indiana.edu
453Subject: Results of Query for drorna
454Status: R
455
456No matches on drorna.
457- - - - - -
458From GenBank-Retrieval-System@genbank.bio.net Sun Nov 10 17:28:49 1991
459Received: from genbank.bio.net by sunflower.bio.indiana.edu
460        (4.1/9.5jsm) id AA19323; Sun, 10 Nov 91 17:28:47 EST
461Received: by genbank.bio.net (5.65/IG-2.0)
462        id AA14461; Sun, 10 Nov 91 14:30:03 -0800
463Date: Sun, 10 Nov 91 14:30:03 -0800
464Message-Id: <9111102230.AA14461@genbank.bio.net>
465From: Database Server <GenBank-Retrieval-System@genbank.bio.net>
466To: gilbertd@sunflower.bio.indiana.edu
467Subject: Results of Query for droest6
468Status: R
469
470LOCUS       DROEST6      1819 bp ss-mRNA            INV       31-AUG-1987
471DEFINITION  D.melanogaster esterase-6 mRNA, complete cds.
472ACCESSION   M15961
473
474
475
476
477
478
479
480
481
482
483
484
485|||||||||||  GCG manual discussion of sequence symbols:
486---------------------------------------------------
487
488III_SEQUENCE_SYMBOLS
489
490
491     GCG programs allow all upper and lower  case  letters,  periods  (.),
492asterisks  (*),  pluses  (+),  ampersands  (&),  and ats (@) as symbols in
493biological sequences.  Nucleotide  symbols,  their  complements,  and  the
494standard  one-letter amino acid symbols are shown below in separate lists.
495The meanings of the symbols +, &, and @ have not  been  assigned  at  this
496writing (March, 1989).
497
498     GCG uses the  letter  codes  for  amino  acid  codes  and  nucleotide
499ambiguity    proposed    by    IUB    (Nomenclature    Committee,    1985,
500Eur. J. Biochem. 150; 1-5).  These codes are  compatible  with  the  codes
501used by the EMBL, GenBank, and NBRF data libraries.
502
503
504                               NUCLEOTIDES
505
506     The meaning of each symbol, its complement,  and  the  Cambridge  and
507Stanford  equivalents  are  shown below.  Cambridge files can be converted
508into GCG files and vice versa with the programs FROMSTADEN  and  TOSTADEN.
509IntelliGenetics  sequence  files  can  be interconverted with the programs
510FROMIG and TOIG.
511
512IUB/GCG      Meaning     Complement   Staden/Sanger  Stanford
513
514   A             A             T             A            A
515   C             C             G             C            C
516   G             G             C             G            G
517  T/U            T             A             T           T/U
518   M           A or C          K             5            J
519   R           A or G          Y             R            R
520   W           A or T          W             7            L
521   S           C or G          S             8            M
522   Y           C or T          R             Y            Y
523   K           G or T          M             6            K
524   V        A or C or G        B       not supported      N
525   H        A or C or T        D       not supported      N
526   D        A or G or T        H       not supported      N
527   B        C or G or T        V       not supported      N
528  X/N     G or A or T or C     X            -/X           N
529   .    not G or A or T or C   .       not supported      ?
530
531
532  The frame ambiguity codes used by Staden are not  supported  by  GCG
533and   are  translated  by  FROMSTADEN  as  the  lower  case  single  base
534equivalent.
535
536     Staden Code          Meaning              GCG
537
538         D                C or CC                c
539         V                T or TT                t
540         B                A or AA                a
541         H                G or GG                g
542         K                C or CX                c
543         L                T or TX                t
544         M                A or AX                a
545         N                G or GX                g
546
547
548                        AMINO ACIDS
549
550  Here is a list of the standard one-letter amino acid codes and their
551three-letter  equivalents.   The synonymous codons and their depiction in
552the IUB codes are shown.  You should recognize that the codons  following
553semicolons  (;)  are  not  sufficiently specific to define a single amino
554acid even though they represent the best possible back  translation  into
555the IUB codes!  All of the relationships in this list can be redefined by
556the user in a local data file described below.
557
558                                                      IUB
559Symbol 3-letter  Meaning      Codons                Depiction
560 A    Ala       Alanine      GCT,GCC,GCA,GCG         !GCX
561 B    Asp,Asn   Aspartic,
562                Asparagine   GAT,GAC,AAT,AAC         !RAY
563 C    Cys       Cysteine     TGT,TGC                 !TGY
564 D    Asp       Aspartic     GAT,GAC                 !GAY
565 E    Glu       Glutamic     GAA,GAG                 !GAR
566 F    Phe     Phenylalanine  TTT,TTC                 !TTY
567 G    Gly       Glycine      GGT,GGC,GGA,GGG         !GGX
568 H    His       Histidine    CAT,CAC                 !CAY
569 I    Ile       Isoleucine   ATT,ATC,ATA             !ATH
570 K    Lys       Lysine       AAA,AAG                 !AAR
571 L    Leu       Leucine      TTG,TTA,CTT,CTC,CTA,CTG
572!TTR,CTX,YTR;YTX
573 M    Met       Methionine   ATG                     !ATG
574 N    Asn       Asparagine   AAT,AAC                 !AAY
575 P    Pro       Proline      CCT,CCC,CCA,CCG         !CCX
576 Q    Gln       Glutamine    CAA,CAG                 !CAR
577 R    Arg       Arginine     CGT,CGC,CGA,CGG,AGA,AGG
578!CGX,AGR,MGR;MGX
579 S    Ser       Serine       TCT,TCC,TCA,TCG,AGT,AGC !TCX,AGY;WSX
580 T    Thr       Threonine    ACT,ACC,ACA,ACG         !ACX
581 V    Val       Valine       GTT,GTC,GTA,GTG         !GTX
582 W    Trp       Tryptophan   TGG                     !TGG
583 X    Xxx       Unknown                              !XXX
584 Y    Tyr       Tyrosine     TAT, TAC                !TAY
585 Z    Glu,Gln   Glutamic,
586                Glutamine    GAA,GAG,CAA,CAG         !SAR
587 *    End       Terminator   TAA, TAG, TGA           !TAR,TRA;TRR
588
589
590
591
592
593
594
595
596|||||||||||  docs from PSC on sequence formats:
597---------------------------------------------------
598
599
600          Nucleic Acid and Protein Sequence File Formats
601
602
603It will probably save you some time if you have your data in a usable
604format before you send it to us.  However, we do have the University of
605Wisconsin Genetics Computing Group programs running on our VAXen and
606this package includes several reformatting utilities.  Our programs
607usually recognize any of several standard formats, including GenBank,
608EMBL, NBRF, and MolGen/Stanford.  For the purposes of annotating an
609analysis we find the GenBank and EMBL formats most useful, particularly
610if you have already received an accession number from one of these
611organizations for your sequence.
612
613Our programs do not require that all of the line types available in
614GenBank, EMBL, or NBRF file formats be present for the file format to
615be recognized and processed.  The following pages outline the essential
616details required for correct processing of files by our programs.
617Additional information may be present but will generally be ignored.
618
619
620                      GenBank File Format
621
622File Header
623
6241.  The first line in the file must have "GENETIC SEQUENCE DATA BANK"
625    in spaces 20 through 46 (see LINE  1, below).
6262.  The next 8 lines may contain arbitrary text.  They are ignored but
627    are required to maintain the GenBank format (see LINE 2 - LINE 9).
628
629Sequence Data Entries
630
6313.  Each sequence entry in the file should have the following format.
632
633    a) first line:
634
635        Must have LOCUS in the first 5 spaces. The
636        genetic locus name or identifier must be in spaces
637        13 - 22. The length of the sequence is right
638        justified in spaces 23 through 29 (see LINE 10).
639
640    b) second line:
641
642        Must have DEFINITION in the first 10 spaces.
643        Spaces 13 - 80 are free form text to identify the
644        sequence (see LINE 11).
645
646    c) third line:
647
648        Must have ACCESSION in the first 9 spaces. Spaces
649        13 - 18 must hold the primary accession number
650        (see LINE 12).
651
652    d) fourth line:
653
654        Must have ORIGIN in the first 6 spaces. Nothing
655        else is required on this line, it indicates that
656        the nucleic acid sequence begins on the next line
657        (see LINE 13).
658
659    e) fifth line:
660
661        Begins the nucleotide sequence. The first 9
662        spaces of each sequence line may either be blank
663        or may contain the position in the sequence of the
664        first nucleotide on the line. The next 66 spaces
665        hold the nucleotide sequence in six blocks of ten
666        nucleotides. Each of the six blocks begins with a
667        blank space followed by ten nucleotides. Thus the
668        first nucleotide is in space eleven of the line while
669        the last is in space 75 (see LINE 14, LINE 15).
670
671    f) last line:
672
673        Must have // in the first 2 spaces to indicate
674        termination of the sequence (see LINE 16).
675
676NOTE:  Multiple sequences may appear in each file.  To begin another
677       sequence go back to a) and start again.
678
679
680                         Example GenBank file
681
682
683LINE  1  :                   GENETIC SEQUENCE DATA BANK
684LINE  2  :
685LINE  3  :
686LINE  4  :
687LINE  5  :
688LINE  6  :
689LINE  7  :
690LINE  8  :
691LINE  9  :
692LINE 10  :LOCUS       L_Name     Length BP
693LINE 11  :DEFINITION  Describe the sequence any way you want
694LINE 12  :ACCESSION   Accession Number
695LINE 13  :ORIGIN
696LINE 14  :        1 acgtacgtac gtacgtacgt acgtacgtac gtacgtacgt a...
697LINE 15  :       61 acgt...
698LINE 16  ://
699
700
701
702                         EMBL File Format
703
704Unlike the GenBank file format the EMBL file format does not require
705a series of header lines.  Thus the first line in the file begins
706the first sequence entry of the file.
707
7081.  The first line of each sequence entry contains the two letters ID
709    in the first two spaces.  This is followed by the EMBL identifier
710    in spaces 6 through 14.  (See LINE  1).
711
7122.  The second line of each sequence entry has the two letters AC in
713    the first two spaces.  This is followed by the accession number in
714    spaces 6 through 11.  (See LINE  2).
715
7163.  The third line of each sequence entry has the two letters DE in the
717    first two spaces.  This is followed by a free form text definition
718    in spaces 6 through 72.  (See LINE  3).
719
7204.  The fourth line in each sequence entry has the two letters SQ in
721    the first two spaces.  This is followed by the length of the
722    sequence beginning at or after space 13.  After the sequence length
723    there is a blank space and the two letters BP.  (See LINE  4).
724
7255.  The nucleotide sequence begins on the fifth line of the sequence
726    entry.  Each line of sequence begins with four blank spaces. The
727    next 66 spaces hold the nucleotide sequence in six blocks of ten
728    nucleotides.  Each of the six blocks begins with a blank space
729    followed by ten nucleotides.  Thus the first nucleotide is in space
730    6 of the line while the last is in space 70.  (See LINE  5 -
731    LINE  6).
732
7336.  The last line of each sequence entry in the file is a terminator
734    line which has the two characters // in the first two spaces.
735    (See LINE  7).
736
7377.  Multiple sequences may appear in each file.  To begin another
738    sequence go back to item 1 and start again.
739
740
741                          Example EMBL file
742
743LINE  1  :ID   ID_name
744LINE  2  :AC   Accession number
745LINE  3  :DE   Describe the sequence any way you want
746LINE  4  :SQ          Length BP
747LINE  5  :     ACGTACGTAC GTACGTACGT ACGTACGTAC GTACGTA...
748LINE  6  :     ACGT...
749LINE  7  ://
750
751
752
753            NBRF (protein or nucleic acid) File Format
754
7551.  The first line of each sequence entry begins with a greater than
756  symbol, >.  This is immediately followed by the two character
757  sequence type specifier.  Space four must contain a semi-colon.
758  Beginning in space five is the sequence name or identification code
759  for the NBRF database.  The code is from four to six letters and
760  numbers.  (See LINE  1).
761
762!!!! >> add these to readseq
763          Specifier             Sequence type
764
765              P1                protein, complete
766              F1                protein, fragment
767              DL                DNA, linear
768              DC                DNA, circular
769              RL                RNA, linear
770              RC                RNA, circular
771              N1                functional RNA, other than tRNA
772              N3                tRNA
773
7742.  The second line of each sequence entry contains two kinds of
775  information.  First is the sequence name which is separated from
776  the organism or organelle name by the three character sequence
777  blank space, dash, blank space, " - ".  There is no special
778  character marking the beginning of this line.  (See LINE  2).
779
7803.  Either the amino acid or nucleic acid sequence begins on line three
781  and can begin in any space, including the first.  The sequence is
782  free format and may be interrupted by blanks for ease of reading.
783  Protein sequences may contain special punctuation to indicate
784  various indeterminacies in the sequence.  In the NBRF data files
785  all lines may be up to 500 characters long.  However some PSC
786  programs currently have a limit of 130 characters per line
787  (including blanks), and BitNet will not accept lines of over eighty
788  characters.  (See LINE  3, LINE  4, and LINE  5).
789
790  The last character in the sequence must be an asterisk, *.
791
792                       Example NBRF file
793
794 LINE  1  :>P1;CBRT
795 LINE  2  :Cytochrome b - Rat mitochondrion (SGC1)
796 LINE  3  :M T N I R K S H P L F K I I N H S F I D L P A P S
797 LINE  4  : VTHICRDVN Y GWL IRY
798 LINE  5  :TWIGGQPVEHPFIIIGQLASISYFSIILILMPISGIVEDKMLKWN*
799
800
801
802                MolGen/Stanford File Format
803
8041.  The first line in a sequence file is a comment line.  This line
805  begins with a semi-colon in the first space.  This line need
806  not be present.  If it is present it holds descriptive text.
807  There may be as many comment lines as desired at the start of
808  the sequence file.  (See LINE  1).
809
8102.  The second line must be present and contains an identifier or
811  name for the sequence in the first ten spaces.  (See LINE  2).
812
8133.  The sequence begins on the third line and occupies up to eighty
814  spaces.  Spaces may be included in the sequence for ease of
815  reading.  The sequence continues for as many lines as needed
816  and is terminated with a 1 or 2.  1 indicates a linear sequence
817  while 2 marks a circular sequence.  (See LINE  3 and LINE  4).
818
819                          Example MolGen/Stanford file
820
821LINE  1  :;  Describe the sequence any way you want
822LINE  2  :ECTRNAGLY2
823LINE  3  :ACGCACGTAC ACGTACGTAC   A C G T C C G T ACG TAC GTA CGT
824LINE  4  :  GCTTA   GG G C T A1
825
826
827
828
829|||||||||||  Phylip file format
830---------------------------------------------------
831
832        Phylip 3.3 File Format (DNA sequences)
833
834
835     The input and output formats for PROTPARS and for RESTML are described  in
836their  document  files.   In  general  their input formats are similar to those
837described here, except that the one-letter codes for data are specific to those
838programs  and  are  described in those document files.  Since the input formats
839for the eight DNA sequence programs apply to  all  eight,  they  are  described
840here.   Their  input  formats are standard: the data have A's, G's, C's and T's
841(or U's).  The first line of the input file contains the number of species  and
842the  number  of  sites.   As  with  the other programs, options information may
843follow this.  In the case of DNAML, DNAMLK,  and  DNADIST  an  additional  line
844(described  in  the  document file for these programs) may follow the first one.
845Following this, each species starts on a new line.  The first 10 characters  of
846that  line  are the species name.  There then follows the base sequence of that
847species, each character being one of the letters A, B, C, D, G, H, K, M, N,  O,
848R, S, T, U, V, W, X, Y, ?, or - (a period was also previously allowed but it is
849no longer allowed, because it sometimes is used in aligned sequences  to  mean
850"the  same  as  the  sequence  above").   Blanks  will  be ignored, and so will
851numerical digits.  This allows GENBANK and EMBL sequence  entries  to  be  read
852with minimum editing.
853
854     These characters can be  either  upper  or  lower  case.   The  algorithms
855convert  all  input  characters  to upper case (which is how they are treated).
856The characters constitute the IUPAC (IUB) nucleic acid code  plus  some  slight
857extensions.  They enable input of nucleic acid sequences taking full account of
858any ambiguities in the sequence.
859
860The sequences can continue over multiple lines; when this is done the sequences
861must  be  either  in  "interleaved"  format, similar to the output of alignment
862programs, or "sequential" format.  These are described  in  the  main  document
863file.   In sequential format all of one sequence is given, possibly on multiple
864lines, before the next starts.  In interleaved format the  first  part  of  the
865file  should  contain  the first part of each of the sequences, then possibly a
866line containing nothing but a carriage-return character, then the  second  part
867of  each  sequence, and so on.  Only the first parts of the sequences should be
868preceded by names.  Here is a hypothetical example of interleaved format:
869
870  5    42
871Turkey    AAGCTNGGGC ATTTCAGGGT
872Salmo gairAAGCCTTGGC AGTGCAGGGT
873H. SapiensACCGGTTGGC CGTTCAGGGT
874Chimp     AAACCCTTGC CGTTACGCTT
875Gorilla   AAACCCTTGC CGGTACGCTT
876
877GAGCCCGGGC AATACAGGGT AT
878GAGCCGTGGC CGGGCACGGT AT
879ACAGGTTGGC CGTTCAGGGT AA
880AAACCGAGGC CGGGACACTC AT
881AAACCATTGC CGGTACGCTT AA
882
883while in sequential format the same sequences would be:
884
885  5    42
886Turkey    AAGCTNGGGC ATTTCAGGGT
887GAGCCCGGGC AATACAGGGT AT
888Salmo gairAAGCCTTGGC AGTGCAGGGT
889GAGCCGTGGC CGGGCACGGT AT
890H. SapiensACCGGTTGGC CGTTCAGGGT
891ACAGGTTGGC CGTTCAGGGT AA
892Chimp     AAACCCTTGC CGTTACGCTT
893AAACCGAGGC CGGGACACTC AT
894Gorilla   AAACCCTTGC CGGTACGCTT
895AAACCATTGC CGGTACGCTT AA
896
897
898Note, of course, that a portion of a sequence like this:
899
900   300   AAGCGTGAAC GTTGTACTAA TRCAG
901
902is perfectly legal, assuming that the species name  has  gone  before,  and  is
903filled  out  to  full  length  by  blanks.  The above digits and blanks will be
904ignored, the sequence being taken as starting at the first base symbol (in this
905case an A).
906
907     The present versions of the programs may sometimes have difficulties  with
908the  blank  lines  between  groups of lines, and if so you might want to retype
909those lines, making sure that they have only a  carriage-return  and  no  blank
910characters on them, or you may perhaps have to eliminate them.  The symptoms of
911this problem are that the programs complain that the sequences are not properly
912aligned, and you can find no other cause for this complaint.
913
914------------------------------------------------
915
916
917|||||||||||  ASN.1 file format
918---------------------------------------------------
919
920
921ASN.1 -- see NCBI toolkit docs, source and examples (ncbi.nlm.nih.gov)
922
923Example asn.1 sequence file----
924
925Bioseq-set ::= {
926seq-set {
927  seq {
928    id { local id 1 } ,                 -- id essential
929    descr {  title "Dummy sequence data from nowhere"  } ,  -- optional
930    inst {                              -- inst essential
931      repr raw ,
932      mol dna ,
933      length 156 ,
934      topology linear ,
935      seq-data
936        iupacna "GAATTCATTTTTGAAACAAATCGACCTGACGACGGAATGGTACTCGAATTA
937TGGGCCAAAGGGTTTTATGGGACAAATTAATAGGTGTTCATTATATGCCACTTTCGGAGATTAGATACAGCAATGCAG
938TGGATTCAAAGCAATAGAGTTGTTCTT"
939      } } ,
940
941        seq {
942          id { local id 2 } ,
943          descr {  title "Dummy sequence 2 data from somewhere else"  } ,
944          inst {
945                repr raw ,
946                mol dna ,
947                length 150 ,
948                topology linear ,
949                seq-data
950                  iupacna "TTTTTTTTTTTTGAAACAAATCGACCTGACGACGGAATGGTACTCGAATTA
951TGGGCCAAAGGGTTTTATGGGACAAATTAATAGGTGTTCATTATATGCCACTTTCGGAGATTAGATACAGCAATGCAG
952TGGATTCAAAGCAATAGAGTT"
953            }
954          }
955        }
956      }
957
958
959partial ASN.1 description from toolkit
960
961Bioseq ::= SEQUENCE {
962    id SET OF Seq-id ,            -- equivalent identifiers
963    descr Seq-descr OPTIONAL , -- descriptors
964    inst Seq-inst ,            -- the sequence data
965    annot SET OF Seq-annot OPTIONAL }
966
967Seq-inst ::= SEQUENCE {            -- the sequence data itself
968    repr ENUMERATED {              -- representation class
969        not-set (0) ,              -- empty
970        virtual (1) ,              -- no seq data
971        raw (2) ,                  -- continuous sequence
972        seg (3) ,                  -- segmented sequence
973        const (4) ,                -- constructed sequence
974        ref (5) ,                  -- reference to another sequence
975        consen (6) ,               -- consensus sequence or pattern
976        map (7) ,                  -- ordered map (genetic, restriction)
977        other (255) } ,
978    mol ENUMERATED {               -- molecule class in living organism
979        not-set (0) ,              --   > cdna = rna
980        dna (1) ,
981        rna (2) ,
982        aa (3) ,
983        na (4) ,                   -- just a nucleic acid
984        other (255) } ,
985    length INTEGER OPTIONAL ,      -- length of sequence in residues
986    fuzz Int-fuzz OPTIONAL ,       -- length uncertainty
987    topology ENUMERATED {          -- topology of molecule
988        not-set (0) ,
989        linear (1) ,
990        circular (2) ,
991        tandem (3) ,               -- some part of tandem repeat
992        other (255) } DEFAULT linear ,
993    strand ENUMERATED {            -- strandedness in living organism
994        not-set (0) ,
995        ss (1) ,                   -- single strand
996        ds (2) ,                   -- double strand
997        mixed (3) ,
998        other (255) } OPTIONAL ,   -- default ds for DNA, ss for RNA, pept
999    seq-data Seq-data OPTIONAL ,   -- the sequence
1000    ext Seq-ext OPTIONAL ,         -- extensions for special types
1001    hist Seq-hist OPTIONAL }       -- sequence history
1002
1003------------------------------------------------
Note: See TracBrowser for help on using the repository browser.