1 | [file has been slightly modified to fit into arb help] |
---|
2 | |
---|
3 | ||||||||||| ReadSeq supported formats (revised 30Dec92) |
---|
4 | -------------------------------------------------------- |
---|
5 | |
---|
6 | -f[ormat=]Name Format name for output: |
---|
7 | | 1. IG/Stanford 10. Olsen (in-only) |
---|
8 | | 2. GenBank/GB 11. Phylip3.2 |
---|
9 | | 3. NBRF 12. Phylip |
---|
10 | | 4. EMBL 13. Plain/Raw |
---|
11 | | 5. GCG 14. PIR/CODATA |
---|
12 | | 6. DNAStrider 15. MSF |
---|
13 | | 7. Fitch 16. ASN.1 |
---|
14 | | 8. Pearson/Fasta 17. PAUP |
---|
15 | | 9. Zuker (in-only) 18. Pretty (out-only) |
---|
16 | |
---|
17 | In general, output supports only minimal subsets of each format |
---|
18 | needed for sequence data exchanges. Features, descriptions |
---|
19 | and other format-unique information are discarded. |
---|
20 | |
---|
21 | For users of the Olsen multi-sequence editor (VMS): the Olsen format |
---|
22 | here is produced with the print command: |
---|
23 | print/out=some.file |
---|
24 | Use Genbank output from readseq to produce a format that this |
---|
25 | editor can read, and use the command |
---|
26 | load/genbank some.file |
---|
27 | Dan Davison has a VMS program that will convert to/from the |
---|
28 | Olsen native binary data format. E-mail davison@uh.edu |
---|
29 | |
---|
30 | Warning: Phylip format input is now supported (30Dec92); however, the |
---|
31 | auto-detection of Phylip format is very probabilistic and messy, |
---|
32 | especially distinguishing sequential from interleaved versions. It |
---|
33 | is not recommended that one use readseq to convert files from Phylip |
---|
34 | format to others unless essential. |
---|
35 | |
---|
36 | |
---|
37 | |
---|
38 | ||||||||||| ReadSeq usage (revised 11Nov91) |
---|
39 | -------------------------------------------------------- |
---|
40 | |
---|
41 | A. determine file format: |
---|
42 | |
---|
43 | short skiplines; /* result: number of header lines to skip (or 0) */ |
---|
44 | short error; /* error result or 0 */ |
---|
45 | short format; /* resulting format code, see ureadseq.h */ |
---|
46 | char *filename = "Mysequence.file"; |
---|
47 | |
---|
48 | format = seqFileFormat( filename, &skiplines, &error); |
---|
49 | if (error!=0) fail; |
---|
50 | |
---|
51 | B. read number and list of sequences (optional) |
---|
52 | |
---|
53 | short numseqs; /* resulting number of sequences found in file */ |
---|
54 | char *seqlist; /* list of sequence names, newline separated, 0 terminated */ |
---|
55 | |
---|
56 | seqlist = listSeqs( filename, skiplines, format, &numseqs, &error); |
---|
57 | if (error==0) display (seqlist); |
---|
58 | free( seqlist); |
---|
59 | |
---|
60 | C. read individual sequences as desired |
---|
61 | |
---|
62 | short seqIndex; /* sequence index #, or == kListSeqs for listSeqs equivalent */ |
---|
63 | long seqlen; /* length of seq */ |
---|
64 | char seqid[256]; /* sequence name */ |
---|
65 | char *seq; /* sequence, 0 terminated, free when done */ |
---|
66 | |
---|
67 | seq = readSeq( seqIndex, filename, skiplines, format, |
---|
68 | &seqlen, &numseqs, &error, seqid); |
---|
69 | if (error==0) manipulate(seq); |
---|
70 | free(seq); |
---|
71 | |
---|
72 | D. write sequences as desired |
---|
73 | |
---|
74 | int nlines; /* number of lines of sequence written */ |
---|
75 | FILE* fout; /* open file pointer (stdout or other) */ |
---|
76 | short outform; /* output format, see ureadseq.h */ |
---|
77 | |
---|
78 | nlines = writeSeq( fout, seq, seqlen, format, outform, seqid); |
---|
79 | |
---|
80 | |
---|
81 | Note (30Dec92): There is various processing done by the main program (in readseq.c), |
---|
82 | rather than just in the subroutines (in ureadseq.c). Especially for interleaved |
---|
83 | output formats, the writeSeq subroutine does not handle interleaving, nor some of |
---|
84 | the formatting at the top and end of output files. While seqFileFormat, listSeqs, |
---|
85 | and readSeq subroutines are fairly self-contained, the writeSeq depends a lot on |
---|
86 | auxiliary processing. At some point, this may be revised so writeSeq is |
---|
87 | self-contained. |
---|
88 | |
---|
89 | Note 2: The NCBI toolkit (ftp from ncbi.nlm.nih.gov) is needed for the ASN.1 format |
---|
90 | reading (see ureadasn.c). A bastard (but workable I hope) ASN.1 format is written |
---|
91 | by writeSeq alone. |
---|
92 | |
---|
93 | |
---|
94 | |
---|
95 | ||||||||||| sequence formats.... |
---|
96 | --------------------------------------------------- |
---|
97 | |
---|
98 | stanford/IG |
---|
99 | ;comments |
---|
100 | ;... |
---|
101 | seq1 info |
---|
102 | abcd... |
---|
103 | efgh1 (or 2 = terminator) |
---|
104 | ;another seq |
---|
105 | ;.... |
---|
106 | seq2 info |
---|
107 | abcd...1 |
---|
108 | --- for e.g. ---- |
---|
109 | ; Dro5s-T.Seq Length: 120 April 6, 1989 21:22 Check: 9487 .. |
---|
110 | dro5stseq |
---|
111 | GCCAACGACCAUACCACGCUGAAUACAUCGGUUCUCGUCCGAUCACCGAAAUUAAGCAGCGUCGCGGGCG |
---|
112 | GUUAGUACUUAGAUGGGGGACCGCUUGGGAACACCGCGUGUUGUUGGCCU1 |
---|
113 | |
---|
114 | ; TOIG of: Dro5srna.Seq check: 9487 from: 1 to: 120 |
---|
115 | --------------------------------------------------- |
---|
116 | |
---|
117 | Genbank: |
---|
118 | LOCUS seq1 ID.. |
---|
119 | ... |
---|
120 | ORIGIN ... |
---|
121 | 123456789abcdefg....(1st 9 columns are formatting) |
---|
122 | hijkl... |
---|
123 | // (end of sequence) |
---|
124 | LOCUS seq2 ID .. |
---|
125 | ... |
---|
126 | ORIGIN |
---|
127 | abcd... |
---|
128 | // |
---|
129 | --------------------------------------------------- |
---|
130 | |
---|
131 | NBRF format: (from uwgcg ToNBRF) |
---|
132 | >DL;DRO5SRNA |
---|
133 | Iubio$Dua0:[Gilbertd.Gcg]Dro5srna.Seq;2 => DRO5SRNA |
---|
134 | |
---|
135 | 51 AAUUAAGCAG CGUCGCGGGC GGUUAGUACU UAGAUGGGGG ACCGCUUGGG |
---|
136 | 101 AACACCGCGU GUUGUUGGCC U |
---|
137 | |
---|
138 | --------------------------------------------------- |
---|
139 | |
---|
140 | EMBL format |
---|
141 | ID345 seq1 id (the 345 are spaces) |
---|
142 | ... other info |
---|
143 | SQ345Sequence (the 3,4,5 are spaces) |
---|
144 | abcd... |
---|
145 | hijk... |
---|
146 | // (! this is proper end string: 12Oct90) |
---|
147 | ID seq2 id |
---|
148 | ... |
---|
149 | SQ Sequence |
---|
150 | abcd... |
---|
151 | ... |
---|
152 | // |
---|
153 | --------------------------------------------------- |
---|
154 | |
---|
155 | UW GCG Format: |
---|
156 | comments of any form, up to ".." signal |
---|
157 | signal line has seq id, and " Check: #### .." |
---|
158 | only 1 seq/file |
---|
159 | |
---|
160 | -- e.g. --- (GCG from GenBank) |
---|
161 | LOCUS DROEST6 1819 bp ss-mRNA INV 31-AUG-1987 |
---|
162 | ... much more ... |
---|
163 | ORIGIN 1 bp upstream of EcoRI site; chromosome BK9 region 69A1. |
---|
164 | |
---|
165 | INVERTEBRATE:DROEST6 Length: 1819 January 9, 1989 16:48 Check: 8008 .. |
---|
166 | |
---|
167 | 1 GAATTCGCCG GAGTGAGGAG CAACATGAAC TACGTGGGAC TGGGACTTAT |
---|
168 | |
---|
169 | 51 CATTGTGCTG AGCTGCCTTT GGCTCGGTTC GAACGCGAGT GATACAGATG |
---|
170 | |
---|
171 | |
---|
172 | --------------------------------------------------- |
---|
173 | |
---|
174 | DNAStrider (Mac) = modified Stanford: |
---|
175 | ; ### from DNA Strider Friday, April 7, 1989 11:04:24 PM |
---|
176 | ; DNA sequence pBR322 4363 b.p. complete sequence |
---|
177 | ; |
---|
178 | abcd... |
---|
179 | efgh |
---|
180 | // (end of sequence) |
---|
181 | --------------------------------------------------- |
---|
182 | |
---|
183 | Fitch format: |
---|
184 | Dro5srna.Seq |
---|
185 | GCC AAC GAC CAU ACC ACG CUG AAU ACA UCG GUU CUC GUC CGA UCA CCG AAA UUA AGC AGC |
---|
186 | GUC GCG GGC GGU UAG UAC UUA GAU GGG GGA CCG CUU GGG AAC ACC GCG UGU UGU UGG CCU |
---|
187 | Droest6.Seq |
---|
188 | GAA TTC GCC GGA GTG AGG AGC AAC ATG AAC TAC GTG GGA CTG GGA CTT ATC ATT GTG CTG |
---|
189 | AGC TGC CTT TGG CTC GGT TCG AAC GCG AGT GAT ACA GAT GAC CCT CTG TTG GTG CAG CTG |
---|
190 | --------------------------------------------------- |
---|
191 | |
---|
192 | W.Pearson/Fasta format: |
---|
193 | >BOVPRL GenBank entry BOVPRL from omam file. 907 nucleotides. |
---|
194 | TGCTTGGCTGAGGAGCCATAGGACGAGAGCTTCCTGGTGAAGTGTGTTTCTTGAAATCAT |
---|
195 | |
---|
196 | --------------------------------------------------- |
---|
197 | Phylip version 3.2 format (e.g., DNAML): |
---|
198 | |
---|
199 | 5 13 YF (# seqs, #bases, YF) |
---|
200 | Alpha AACGTGGCCAAAT |
---|
201 | aaaagggccc... (continued sp. alpha) |
---|
202 | Beta AAGGTCGCCAAAC |
---|
203 | aaaagggccc... (continued sp. beta) |
---|
204 | Gamma CATTTCGTCACAA |
---|
205 | aaaagggccc... (continued sp. Gamma) |
---|
206 | 1234567890^-- bases must start in col 11, and run 'til #bases |
---|
207 | (spaces & newlines are okay) |
---|
208 | --------------------------------------------------- |
---|
209 | Phylip version 3.3 format (e.g., DNAML): |
---|
210 | |
---|
211 | 5 42 YF (# seqs, #bases, YF) |
---|
212 | Turkey AAGCTNGGGC ATTTCAGGGT |
---|
213 | Salmo gairAAGCCTTGGC AGTGCAGGGT |
---|
214 | H. SapiensACCGGTTGGC CGTTCAGGGT |
---|
215 | Chimp AAACCCTTGC CGTTACGCTT |
---|
216 | Gorilla AAACCCTTGC CGGTACGCTT |
---|
217 | 1234567890^-- bases must start in col 11 |
---|
218 | !! this version interleaves the species -- contrary to |
---|
219 | all other output formats. |
---|
220 | |
---|
221 | GAGCCCGGGC AATACAGGGT AT |
---|
222 | GAGCCGTGGC CGGGCACGGT AT |
---|
223 | ACAGGTTGGC CGTTCAGGGT AA |
---|
224 | AAACCGAGGC CGGGACACTC AT |
---|
225 | AAACCATTGC CGGTACGCTT AA |
---|
226 | |
---|
227 | --------------------------------------------------- |
---|
228 | Phylip version 3.4 format (e.g., DNAML) |
---|
229 | -- Both Interleaved and sequential are permitted |
---|
230 | |
---|
231 | 5 13 (# seqs, #bases) |
---|
232 | Alpha AACGTGGCCAAAT |
---|
233 | aaaagggccc... (continued sp. alpha) |
---|
234 | Beta AAGGTCGCCAAAC |
---|
235 | aaaagggccc... (continued sp. beta) |
---|
236 | Gamma CATTTCGTCACAA |
---|
237 | aaaagggccc... (continued sp. Gamma) |
---|
238 | 1234567890^-- bases must start in col 11, and run 'til #bases |
---|
239 | (spaces, newlines and numbers are ignored) |
---|
240 | |
---|
241 | --------------------------------------------------- |
---|
242 | Gary Olsen (multiple) sequence editor /print format: |
---|
243 | |
---|
244 | !--------------------- |
---|
245 | !17Oct91 -- error in original copy of olsen /print format, shifted right 1 space |
---|
246 | ! here is correct copy: |
---|
247 | 301 40 Tb.thiop CGCAGCGAAA----------GCUNUGCUAAUACCGCAUA-CGnCCUG----------------------------------------------------- Tb.thiop |
---|
248 | 123456789012345678901 |
---|
249 | 301 42 Rhc.purp CGUAGCGAAA----------GUUACGCUAAUACCGCAUA-UUCUGUG----------------------------------------------------- Rhc.purp |
---|
250 | |
---|
251 | 301 44 Rhc.gela nnngnCGAAA----------GCCGGAUUAAUACCGCAUA-CGACCUA----------------------------------------------------- Rhc.gela |
---|
252 | !--------------------- |
---|
253 | |
---|
254 | RNase P RNA components. on 20-FEB-90 17:23:58 |
---|
255 | |
---|
256 | 1 (E.c. pr ): Base pairing in Escherichia coli RNase P RNA. |
---|
257 | 2 (chrom ): Chromatium |
---|
258 | : |
---|
259 | 12 (B.brevis): Bacillus brevis RNase P RNA, B. James. |
---|
260 | 13 ( 90% con): 90% conserved |
---|
261 | 14 (100% con): 100% conserved |
---|
262 | 15 (gram+ pr): pairing |
---|
263 | |
---|
264 | 1 |
---|
265 | RNase P RNA components. on 20-FEB-90 17:23:58 |
---|
266 | |
---|
267 | Posi- Sequence |
---|
268 | tion: identity: Data: |
---|
269 | |
---|
270 | 1 1 E.c. pr <<<<<<<<<< {{{{{{{{<<:<<<<<<<<<<^<<<<<<====>>>> E.c. pr |
---|
271 | 1 2 chrom GGAGUCGGCCAGACAGUCGCUUCCGUCCU------------------ chrom |
---|
272 | : |
---|
273 | 1 12 B.brevis AUGCAGGAAAUGCGGGUAGCCGCUGCCGCAAUCGUCU------------- B.brevis |
---|
274 | 1234567890123456789012 <! this should be 21 not 22, |
---|
275 | ! this example must be inset on left by 1 space from olsen /print files ! |
---|
276 | 1 13 90% con G C G A CGC GC - - 90% con |
---|
277 | 1 14 100% con G A CGC 100% con |
---|
278 | 1 15 gram+ pr <<<<<<<<<< {{{{{{{{<<<<<<<<<<<<<=============== gram+ pr |
---|
279 | |
---|
280 | 60 1 E.c. pr >>>>>>^>>^>>>>:>> <<<^<<<< {{{{{ E.c. pr |
---|
281 | 60 2 chrom -----GGUG-ACGGGGGAGGAAAGUCCGG-GCUCCAU------------- chrom |
---|
282 | : : |
---|
283 | 60 10 B.stearo ----UU-CG-GCCGUAGAGGAAAGUCCAUGCUCGCACGGUGCUGAGAUGC B.stearo |
---|
284 | |
---|
285 | |
---|
286 | --------------------------------------------------- |
---|
287 | GCG MSF format |
---|
288 | Title line |
---|
289 | |
---|
290 | picorna.msf MSF: 100 Type: P January 17, 1991 17:53 Check: 541 |
---|
291 | .. |
---|
292 | Name: Cb3 Len: 100 Check: 7009 Weight: 1.00 |
---|
293 | Name: E Len: 100 Check: 60 Weight: 1.00 |
---|
294 | |
---|
295 | // |
---|
296 | |
---|
297 | 1 50 |
---|
298 | Cb3 ...gpvedai .......t.. aaigr..vad tvgtgptnse aipaltaaet |
---|
299 | E gvenae.kgv tentna.tad fvaqpvylpe .nqt...... kv.affynrs |
---|
300 | |
---|
301 | 51 100 |
---|
302 | |
---|
303 | Cb3 ghtsqvvpgd tmqtrhvkny hsrsestien flcrsacvyf teykn..... |
---|
304 | E ...spi.gaf tvks...... gs.lesgfap .fsngtc.pn sviltpgpqf |
---|
305 | |
---|
306 | --------------------------------------------------- |
---|
307 | PIR format |
---|
308 | This is NBRF-PIR MAILSERVER version 1.45 |
---|
309 | Command-> get PIR3:A31391 |
---|
310 | \\\ |
---|
311 | ENTRY A31391 #Type Protein |
---|
312 | TITLE *Esterase-6 - Fruit fly (Drosophila melanogaster) |
---|
313 | |
---|
314 | DATE 03-Aug-1992 #Sequence 03-Aug-1992 #Text 03-Aug-1992 |
---|
315 | PLACEMENT 0.0 0.0 0.0 0.0 0.0 |
---|
316 | COMMENT *This entry is not verified. |
---|
317 | SOURCE Drosophila melanogaster |
---|
318 | |
---|
319 | REFERENCE |
---|
320 | #Authors Cooke P.H., Oakeshott J.G. |
---|
321 | #Citation submitted to GenBank, April 1989 |
---|
322 | #Reference-number A31391 |
---|
323 | #Accession A31391 |
---|
324 | #Cross-reference GB:J04167 |
---|
325 | |
---|
326 | SUMMARY #Molecular-weight 61125 #Length 544 #Checksum 1679 |
---|
327 | SEQUENCE |
---|
328 | 5 10 15 20 25 30 |
---|
329 | 1 M N Y V G L G L I I V L S C L W L G S N A S D T D D P L L V |
---|
330 | 31 Q L P Q G K L R G R D N G S Y Y S Y E S I P Y A E P P T G D |
---|
331 | 61 L R F E A P E P Y K Q K W S D I F D A T K T P V A C L Q W D |
---|
332 | 91 Q F T P G A N K L V G E E D C L T V S V Y K P K N S K R N S |
---|
333 | 121 F P V V A H I H G G A F M F G A A W Q N G H E N V M R E G K |
---|
334 | 151 F I L V K I S Y R L G P L G F V S T G D R D L P G N Y G L K |
---|
335 | 181 D Q R L A L K W I K Q N I A S F G G E P Q N V L L V G H S A |
---|
336 | 211 G G A S V H L Q M L R E D F G Q L A R A A F S F S G N A L D |
---|
337 | 241 P W V I Q K G A R G R A F E L G R N V G C E S A E D S T S L |
---|
338 | 271 K K C L K S K P A S E L V T A V R K F L I F S Y V P F A P F |
---|
339 | 301 S P V L E P S D A P D A I I T Q D P R D V I K S G K F G Q V |
---|
340 | 331 P W A V S Y V T E D G G Y N A A L L L K E R K S G I V I D D |
---|
341 | 361 L N E R W L E L A P Y L L F Y R D T K T K K D M D D Y S R K |
---|
342 | 391 I K Q E Y I G N Q R F D I E S Y S E L Q R L F T D I L F K N |
---|
343 | 421 S T Q E S L D L H R K Y G K S P A Y A Y V Y D N P A E K G I |
---|
344 | 451 A Q V L A N R T D Y D F G T V H G D D Y F L I F E N F V R D |
---|
345 | 481 V E M R P D E Q I I S R N F I N M L A D F A S S D N G S L K |
---|
346 | 511 Y G E C D F K D N V G S E K F Q L L A I Y I D G C Q N R Q H |
---|
347 | 541 V E F P |
---|
348 | /// |
---|
349 | \\\ |
---|
350 | --------------------------------------------------- |
---|
351 | PAUP format: |
---|
352 | The NEXUS Format |
---|
353 | |
---|
354 | Every block starts with "BEGIN blockname;" and ends with "END;". |
---|
355 | Each block is composed of one or more statements, each |
---|
356 | terminated by a semicolon (;). |
---|
357 | |
---|
358 | Comments may be included in NEXUS files by enclosing them within |
---|
359 | square brackets, as in "[This is a comment]." |
---|
360 | |
---|
361 | NEXUS-conforming files are identified by a "#NEXUS" directive at |
---|
362 | the very beginning of the file (line 1, column 1). If the |
---|
363 | #NEXUS is omitted PAUP issues a warning but continues |
---|
364 | processing. |
---|
365 | |
---|
366 | NEXUS files are entirely free-format. Blanks, tabs, and |
---|
367 | newlines may be placed anywhere in the file. Unless RESPECTCASE |
---|
368 | is requested, commands and data may be entered in upper case, |
---|
369 | lower case, or a mixture of upper and lower case. |
---|
370 | |
---|
371 | The following conventions are used in the syntax descriptions of |
---|
372 | the various blocks. Upper-case items are entered exactly as |
---|
373 | shown. Lower-case items inside of angle brackets -- e.g., <x> |
---|
374 | -- represent items to be substituted by the user. Items inside |
---|
375 | of square brackets -- e.g., [X] -- are optional. Items inside |
---|
376 | of curly braces and separated by vertical bars -- e.g., { X | Y |
---|
377 | | Z } -- are mutually exclusive options. |
---|
378 | |
---|
379 | |
---|
380 | The DATA Block |
---|
381 | |
---|
382 | The DATA block contains the data matrix and other associated |
---|
383 | information. Its syntax is: |
---|
384 | |
---|
385 | BEGIN DATA; |
---|
386 | DIMENSIONS NTAX=<number of taxa> NCHAR=<number of characters>; |
---|
387 | [ FORMAT [ MISSING=<missing-symbol> ] |
---|
388 | [ LABELPOS={ LEFT | RIGHT } ] |
---|
389 | [ SYMBOLS="<symbols-list>" ] |
---|
390 | [ INTERLEAVE ] |
---|
391 | [ MATCHCHAR=<match-symbol> ] |
---|
392 | [ EQUATE="<symbol>=<expansion> [<symbol>=<expansion>...]" ] |
---|
393 | [ TRANSPOSE ] |
---|
394 | [ RESPECTCASE ] |
---|
395 | [ DATATYPE = { STANDARD | DNA | RNA | PROTEIN } ]; ] |
---|
396 | [ OPTIONS [ IGNORE={ INVAR | UNINFORM } ] |
---|
397 | [ MSTAXA = { UNCERTAIN | POLYMORPH | VARIABLE } ] |
---|
398 | [ ZAP = "<list of zapped characters>" ] ; ] |
---|
399 | [ CHARLABELS <label_1> <label_2> <label_NCHAR> ; ] |
---|
400 | [ TAXLABELS <label1_1> <label1_2> <label1_NTAX> ; ] |
---|
401 | [ STATELABELS <currently ignored by PAUP> ; ] |
---|
402 | MATRIX <data-matrix> ; |
---|
403 | END; |
---|
404 | |
---|
405 | --- example PAUP file |
---|
406 | |
---|
407 | #NEXUS |
---|
408 | |
---|
409 | [!Brown et al. (1982) primate mitochondrial DNA] |
---|
410 | |
---|
411 | begin data; |
---|
412 | dimensions ntax=5 nchar=896; |
---|
413 | format datatype=dna matchchar=. interleave missing='-'; |
---|
414 | matrix |
---|
415 | [ 2 4 6 8 ] |
---|
416 | [ 1 1 1 1 1 ] |
---|
417 | human aagcttcaccggcgcagtca ttctcataatcgcccacggR cttacatcctcattactatt ctgcctagcaaactcaaact acgaacgcactcacagtcgc |
---|
418 | chimp ................a.t. .c.................a ...............t.... ..................t. .t........c......... |
---|
419 | gorilla ..................tg ....t.....t........a ........a......t.... .................... .......a..c.....c... |
---|
420 | orang ................ac.. cc.....g..t.....t..a ..c........cc....g.. .................... .......a..c.....c... |
---|
421 | gibbon ......t..a..t...ac.g .c.................a ..a..c..t..cc.g..... ......t............. .......a........c... |
---|
422 | |
---|
423 | [ 8 8 8 8 8 8 ] |
---|
424 | [ 0 2 4 6 8 9 ] |
---|
425 | [ 1 1 1 1 1 6 ] |
---|
426 | human cttccccacaacaatattca tgtgcctagaccaagaagtt attatctcgaactgacactg agccacaacccaaacaaccc agctctccctaagctt |
---|
427 | chimp t................... .a................c. ........a.....g..... ...a................ ................ |
---|
428 | gorilla ..................tc .a................c. ........a.g......... ...a.............tt. .a.............. |
---|
429 | orang ta....a...........t. .c.......ga......acc ..cg..a.a......tg... .a.a..c.....g...cta. .a.....a........ |
---|
430 | gibbon a..t.......t........ ....ac...........acc .....t..a........... .a.tg..........gctag .a.............. |
---|
431 | ; |
---|
432 | end; |
---|
433 | --------------------------------------------------- |
---|
434 | |
---|
435 | |
---|
436 | |
---|
437 | |
---|
438 | |
---|
439 | |
---|
440 | ||||||||||| Sample SMTP mail header |
---|
441 | --------------------------------------------------- |
---|
442 | |
---|
443 | - - - - - - - - - |
---|
444 | From GenBank-Retrieval-System@genbank.bio.net Sun Nov 10 17:28:56 1991 |
---|
445 | Received: from genbank.bio.net by sunflower.bio.indiana.edu |
---|
446 | (4.1/9.5jsm) id AA19328; Sun, 10 Nov 91 17:28:55 EST |
---|
447 | Received: by genbank.bio.net (5.65/IG-2.0) |
---|
448 | id AA14458; Sun, 10 Nov 91 14:30:03 -0800 |
---|
449 | Date: Sun, 10 Nov 91 14:30:03 -0800 |
---|
450 | Message-Id: <9111102230.AA14458@genbank.bio.net> |
---|
451 | From: Database Server <GenBank-Retrieval-System@genbank.bio.net> |
---|
452 | To: gilbertd@sunflower.bio.indiana.edu |
---|
453 | Subject: Results of Query for drorna |
---|
454 | Status: R |
---|
455 | |
---|
456 | No matches on drorna. |
---|
457 | - - - - - - |
---|
458 | From GenBank-Retrieval-System@genbank.bio.net Sun Nov 10 17:28:49 1991 |
---|
459 | Received: from genbank.bio.net by sunflower.bio.indiana.edu |
---|
460 | (4.1/9.5jsm) id AA19323; Sun, 10 Nov 91 17:28:47 EST |
---|
461 | Received: by genbank.bio.net (5.65/IG-2.0) |
---|
462 | id AA14461; Sun, 10 Nov 91 14:30:03 -0800 |
---|
463 | Date: Sun, 10 Nov 91 14:30:03 -0800 |
---|
464 | Message-Id: <9111102230.AA14461@genbank.bio.net> |
---|
465 | From: Database Server <GenBank-Retrieval-System@genbank.bio.net> |
---|
466 | To: gilbertd@sunflower.bio.indiana.edu |
---|
467 | Subject: Results of Query for droest6 |
---|
468 | Status: R |
---|
469 | |
---|
470 | LOCUS DROEST6 1819 bp ss-mRNA INV 31-AUG-1987 |
---|
471 | DEFINITION D.melanogaster esterase-6 mRNA, complete cds. |
---|
472 | ACCESSION M15961 |
---|
473 | |
---|
474 | |
---|
475 | |
---|
476 | |
---|
477 | |
---|
478 | |
---|
479 | |
---|
480 | |
---|
481 | |
---|
482 | |
---|
483 | |
---|
484 | |
---|
485 | ||||||||||| GCG manual discussion of sequence symbols: |
---|
486 | --------------------------------------------------- |
---|
487 | |
---|
488 | III_SEQUENCE_SYMBOLS |
---|
489 | |
---|
490 | |
---|
491 | GCG programs allow all upper and lower case letters, periods (.), |
---|
492 | asterisks (*), pluses (+), ampersands (&), and ats (@) as symbols in |
---|
493 | biological sequences. Nucleotide symbols, their complements, and the |
---|
494 | standard one-letter amino acid symbols are shown below in separate lists. |
---|
495 | The meanings of the symbols +, &, and @ have not been assigned at this |
---|
496 | writing (March, 1989). |
---|
497 | |
---|
498 | GCG uses the letter codes for amino acid codes and nucleotide |
---|
499 | ambiguity proposed by IUB (Nomenclature Committee, 1985, |
---|
500 | Eur. J. Biochem. 150; 1-5). These codes are compatible with the codes |
---|
501 | used by the EMBL, GenBank, and NBRF data libraries. |
---|
502 | |
---|
503 | |
---|
504 | NUCLEOTIDES |
---|
505 | |
---|
506 | The meaning of each symbol, its complement, and the Cambridge and |
---|
507 | Stanford equivalents are shown below. Cambridge files can be converted |
---|
508 | into GCG files and vice versa with the programs FROMSTADEN and TOSTADEN. |
---|
509 | IntelliGenetics sequence files can be interconverted with the programs |
---|
510 | FROMIG and TOIG. |
---|
511 | |
---|
512 | IUB/GCG Meaning Complement Staden/Sanger Stanford |
---|
513 | |
---|
514 | A A T A A |
---|
515 | C C G C C |
---|
516 | G G C G G |
---|
517 | T/U T A T T/U |
---|
518 | M A or C K 5 J |
---|
519 | R A or G Y R R |
---|
520 | W A or T W 7 L |
---|
521 | S C or G S 8 M |
---|
522 | Y C or T R Y Y |
---|
523 | K G or T M 6 K |
---|
524 | V A or C or G B not supported N |
---|
525 | H A or C or T D not supported N |
---|
526 | D A or G or T H not supported N |
---|
527 | B C or G or T V not supported N |
---|
528 | X/N G or A or T or C X -/X N |
---|
529 | . not G or A or T or C . not supported ? |
---|
530 | |
---|
531 | |
---|
532 | The frame ambiguity codes used by Staden are not supported by GCG |
---|
533 | and are translated by FROMSTADEN as the lower case single base |
---|
534 | equivalent. |
---|
535 | |
---|
536 | Staden Code Meaning GCG |
---|
537 | |
---|
538 | D C or CC c |
---|
539 | V T or TT t |
---|
540 | B A or AA a |
---|
541 | H G or GG g |
---|
542 | K C or CX c |
---|
543 | L T or TX t |
---|
544 | M A or AX a |
---|
545 | N G or GX g |
---|
546 | |
---|
547 | |
---|
548 | AMINO ACIDS |
---|
549 | |
---|
550 | Here is a list of the standard one-letter amino acid codes and their |
---|
551 | three-letter equivalents. The synonymous codons and their depiction in |
---|
552 | the IUB codes are shown. You should recognize that the codons following |
---|
553 | semicolons (;) are not sufficiently specific to define a single amino |
---|
554 | acid even though they represent the best possible back translation into |
---|
555 | the IUB codes! All of the relationships in this list can be redefined by |
---|
556 | the user in a local data file described below. |
---|
557 | |
---|
558 | IUB |
---|
559 | Symbol 3-letter Meaning Codons Depiction |
---|
560 | A Ala Alanine GCT,GCC,GCA,GCG !GCX |
---|
561 | B Asp,Asn Aspartic, |
---|
562 | Asparagine GAT,GAC,AAT,AAC !RAY |
---|
563 | C Cys Cysteine TGT,TGC !TGY |
---|
564 | D Asp Aspartic GAT,GAC !GAY |
---|
565 | E Glu Glutamic GAA,GAG !GAR |
---|
566 | F Phe Phenylalanine TTT,TTC !TTY |
---|
567 | G Gly Glycine GGT,GGC,GGA,GGG !GGX |
---|
568 | H His Histidine CAT,CAC !CAY |
---|
569 | I Ile Isoleucine ATT,ATC,ATA !ATH |
---|
570 | K Lys Lysine AAA,AAG !AAR |
---|
571 | L Leu Leucine TTG,TTA,CTT,CTC,CTA,CTG |
---|
572 | !TTR,CTX,YTR;YTX |
---|
573 | M Met Methionine ATG !ATG |
---|
574 | N Asn Asparagine AAT,AAC !AAY |
---|
575 | P Pro Proline CCT,CCC,CCA,CCG !CCX |
---|
576 | Q Gln Glutamine CAA,CAG !CAR |
---|
577 | R Arg Arginine CGT,CGC,CGA,CGG,AGA,AGG |
---|
578 | !CGX,AGR,MGR;MGX |
---|
579 | S Ser Serine TCT,TCC,TCA,TCG,AGT,AGC !TCX,AGY;WSX |
---|
580 | T Thr Threonine ACT,ACC,ACA,ACG !ACX |
---|
581 | V Val Valine GTT,GTC,GTA,GTG !GTX |
---|
582 | W Trp Tryptophan TGG !TGG |
---|
583 | X Xxx Unknown !XXX |
---|
584 | Y Tyr Tyrosine TAT, TAC !TAY |
---|
585 | Z Glu,Gln Glutamic, |
---|
586 | Glutamine GAA,GAG,CAA,CAG !SAR |
---|
587 | * End Terminator TAA, TAG, TGA !TAR,TRA;TRR |
---|
588 | |
---|
589 | |
---|
590 | |
---|
591 | |
---|
592 | |
---|
593 | |
---|
594 | |
---|
595 | |
---|
596 | ||||||||||| docs from PSC on sequence formats: |
---|
597 | --------------------------------------------------- |
---|
598 | |
---|
599 | |
---|
600 | Nucleic Acid and Protein Sequence File Formats |
---|
601 | |
---|
602 | |
---|
603 | It will probably save you some time if you have your data in a usable |
---|
604 | format before you send it to us. However, we do have the University of |
---|
605 | Wisconsin Genetics Computing Group programs running on our VAXen and |
---|
606 | this package includes several reformatting utilities. Our programs |
---|
607 | usually recognize any of several standard formats, including GenBank, |
---|
608 | EMBL, NBRF, and MolGen/Stanford. For the purposes of annotating an |
---|
609 | analysis we find the GenBank and EMBL formats most useful, particularly |
---|
610 | if you have already received an accession number from one of these |
---|
611 | organizations for your sequence. |
---|
612 | |
---|
613 | Our programs do not require that all of the line types available in |
---|
614 | GenBank, EMBL, or NBRF file formats be present for the file format to |
---|
615 | be recognized and processed. The following pages outline the essential |
---|
616 | details required for correct processing of files by our programs. |
---|
617 | Additional information may be present but will generally be ignored. |
---|
618 | |
---|
619 | |
---|
620 | GenBank File Format |
---|
621 | |
---|
622 | File Header |
---|
623 | |
---|
624 | 1. The first line in the file must have "GENETIC SEQUENCE DATA BANK" |
---|
625 | in spaces 20 through 46 (see LINE 1, below). |
---|
626 | 2. The next 8 lines may contain arbitrary text. They are ignored but |
---|
627 | are required to maintain the GenBank format (see LINE 2 - LINE 9). |
---|
628 | |
---|
629 | Sequence Data Entries |
---|
630 | |
---|
631 | 3. Each sequence entry in the file should have the following format. |
---|
632 | |
---|
633 | a) first line: |
---|
634 | |
---|
635 | Must have LOCUS in the first 5 spaces. The |
---|
636 | genetic locus name or identifier must be in spaces |
---|
637 | 13 - 22. The length of the sequences is right |
---|
638 | justified in spaces 23 through 29 (see LINE 10). |
---|
639 | |
---|
640 | b) second line: |
---|
641 | |
---|
642 | Must have DEFINITION in the first 10 spaces. |
---|
643 | Spaces 13 - 80 are free form text to identify the |
---|
644 | sequence (see LINE 11). |
---|
645 | |
---|
646 | c) third line: |
---|
647 | |
---|
648 | Must have ACCESSION in the first 9 spaces. Spaces |
---|
649 | 13 - 18 must hold the primary accession number |
---|
650 | (see LINE 12). |
---|
651 | |
---|
652 | d) fourth line: |
---|
653 | |
---|
654 | Must have ORIGIN in the first 6 spaces. Nothing |
---|
655 | else is required on this line, it indicates that |
---|
656 | the nucleic acid sequence begins on the next line |
---|
657 | (see LINE 13). |
---|
658 | |
---|
659 | e) fifth line: |
---|
660 | |
---|
661 | Begins the nucleotide sequence. The first 9 |
---|
662 | spaces of each sequence line may either be blank |
---|
663 | or may contain the position in the sequence of the |
---|
664 | first nucleotide on the line. The next 66 spaces |
---|
665 | hold the nucleotide sequence in six blocks of ten |
---|
666 | nucleotides. Each of the six blocks begins with a |
---|
667 | blank space followed by ten nucleotides. Thus the |
---|
668 | first nucleotide is in space eleven of the line while |
---|
669 | the last is in space 75 (see LINE 14, LINE 15). |
---|
670 | |
---|
671 | f) last line: |
---|
672 | |
---|
673 | Must have // in the first 2 spaces to indicate |
---|
674 | termination of the sequence (see LINE 16). |
---|
675 | |
---|
676 | NOTE: Multiple sequences may appear in each file. To begin another |
---|
677 | sequence go back to a) and start again. |
---|
678 | |
---|
679 | |
---|
680 | Example GenBank file |
---|
681 | |
---|
682 | |
---|
683 | LINE 1 : GENETIC SEQUENCE DATA BANK |
---|
684 | LINE 2 : |
---|
685 | LINE 3 : |
---|
686 | LINE 4 : |
---|
687 | LINE 5 : |
---|
688 | LINE 6 : |
---|
689 | LINE 7 : |
---|
690 | LINE 8 : |
---|
691 | LINE 9 : |
---|
692 | LINE 10 :LOCUS L_Name Length BP |
---|
693 | LINE 11 :DEFINITION Describe the sequence any way you want |
---|
694 | LINE 12 :ACCESSION Accession Number |
---|
695 | LINE 13 :ORIGIN |
---|
696 | LINE 14 : 1 acgtacgtac gtacgtacgt acgtacgtac gtacgtacgt a... |
---|
697 | LINE 15 : 61 acgt... |
---|
698 | LINE 16 :// |
---|
699 | |
---|
700 | |
---|
701 | |
---|
702 | EMBL File Format |
---|
703 | |
---|
704 | Unlike the GenBank file format the EMBL file format does not require |
---|
705 | a series of header lines. Thus the first line in the file begins |
---|
706 | the first sequence entry of the file. |
---|
707 | |
---|
708 | 1. The first line of each sequence entry contains the two letters ID |
---|
709 | in the first two spaces. This is followed by the EMBL identifier |
---|
710 | in spaces 6 through 14. (See LINE 1). |
---|
711 | |
---|
712 | 2. The second line of each sequence entry has the two letters AC in |
---|
713 | the first two spaces. This is followed by the accession number in |
---|
714 | spaces 6 through 11. (See LINE 2). |
---|
715 | |
---|
716 | 3. The third line of each sequence entry has the two letters DE in the |
---|
717 | first two spaces. This is followed by a free form text definition |
---|
718 | in spaces 6 through 72. (See LINE 3). |
---|
719 | |
---|
720 | 4. The fourth line in each sequence entry has the two letters SQ in |
---|
721 | the first two spaces. This is followed by the length of the |
---|
722 | sequence beginning at or after space 13. After the sequence length |
---|
723 | there is a blank space and the two letters BP. (See LINE 4). |
---|
724 | |
---|
725 | 5. The nucleotide sequence begins on the fifth line of the sequence |
---|
726 | entry. Each line of sequence begins with four blank spaces. The |
---|
727 | next 66 spaces hold the nucleotide sequence in six blocks of ten |
---|
728 | nucleotides. Each of the six blocks begins with a blank space |
---|
729 | followed by ten nucleotides. Thus the first nucleotide is in space |
---|
730 | 6 of the line while the last is in space 70. (See LINE 5 - |
---|
731 | LINE 6). |
---|
732 | |
---|
733 | 6. The last line of each sequence entry in the file is a terminator |
---|
734 | line which has the two characters // in the first two spaces. |
---|
735 | (See LINE 7). |
---|
736 | |
---|
737 | 7. Multiple sequences may appear in each file. To begin another |
---|
738 | sequence go back to item 1 and start again. |
---|
739 | |
---|
740 | |
---|
741 | Example EMBL file |
---|
742 | |
---|
743 | LINE 1 :ID ID_name |
---|
744 | LINE 2 :AC Accession number |
---|
745 | LINE 3 :DE Describe the sequence any way you want |
---|
746 | LINE 4 :SQ Length BP |
---|
747 | LINE 5 : ACGTACGTAC GTACGTACGT ACGTACGTAC GTACGTA... |
---|
748 | LINE 6 : ACGT... |
---|
749 | LINE 7 :// |
---|
750 | |
---|
751 | |
---|
752 | |
---|
753 | NBRF (protein or nucleic acid) File Format |
---|
754 | |
---|
755 | 1. The first line of each sequence entry begins with a greater than |
---|
756 | symbol, >. This is immediately followed by the two character |
---|
757 | sequence type specifier. Space four must contain a semi-colon. |
---|
758 | Beginning in space five is the sequence name or identification code |
---|
759 | for the NBRF database. The code is from four to six letters and |
---|
760 | numbers. (See LINE 1). |
---|
761 | |
---|
762 | !!!! >> add these to readseq |
---|
763 | Specifier Sequence type |
---|
764 | |
---|
765 | P1 protein, complete |
---|
766 | F1 protein, fragment |
---|
767 | DL DNA, linear |
---|
768 | DC DNA, circular |
---|
769 | RL RNA, linear |
---|
770 | RC RNA, circular |
---|
771 | N1 functional RNA, other than tRNA |
---|
772 | N3 tRNA |
---|
773 | |
---|
774 | 2. The second line of each sequence entry contains two kinds of |
---|
775 | information. First is the sequence name which is separated from |
---|
776 | the organism or organelle name by the three character sequence |
---|
777 | blank space, dash, blank space, " - ". There is no special |
---|
778 | character marking the beginning of this line. (See LINE 2). |
---|
779 | |
---|
780 | 3. Either the amino acid or nucleic acid sequence begins on line three |
---|
781 | and can begin in any space, including the first. The sequence is |
---|
782 | free format and may be interrupted by blanks for ease of reading. |
---|
783 | Protein sequences may contain special punctuation to indicate |
---|
784 | various indeterminacies in the sequence. In the NBRF data files |
---|
785 | all lines may be up to 500 characters long. However some PSC |
---|
786 | programs currently have a limit of 130 characters per line |
---|
787 | (including blanks), and BitNet will not accept lines of over eighty |
---|
788 | characters. (See LINE 3, LINE 4, and LINE 5). |
---|
789 | |
---|
790 | The last character in the sequence must be an asterisk, *. |
---|
791 | |
---|
792 | Example NBRF file |
---|
793 | |
---|
794 | LINE 1 :>P1;CBRT |
---|
795 | LINE 2 :Cytochrome b - Rat mitochondrion (SGC1) |
---|
796 | LINE 3 :M T N I R K S H P L F K I I N H S F I D L P A P S |
---|
797 | LINE 4 : VTHICRDVN Y GWL IRY |
---|
798 | LINE 5 :TWIGGQPVEHPFIIIGQLASISYFSIILILMPISGIVEDKMLKWN* |
---|
799 | |
---|
800 | |
---|
801 | |
---|
802 | MolGen/Stanford File Format |
---|
803 | |
---|
804 | 1. The first line in a sequence file is a comment line. This line |
---|
805 | begins with a semi-colon in the first space. This line need |
---|
806 | not be present. If it is present it holds descriptive text. |
---|
807 | There may be as many comment lines as desired at the start of |
---|
808 | the sequence file. (See LINE 1). |
---|
809 | |
---|
810 | 2. The second line must be present and contains an identifier or |
---|
811 | name for the sequence in the first ten spaces. (See LINE 2). |
---|
812 | |
---|
813 | 3. The sequence begins on the third line and occupies up to eighty |
---|
814 | spaces. Spaces may be included in the sequence for ease of |
---|
815 | reading. The sequence continues for as many lines as needed |
---|
816 | and is terminated with a 1 or 2. 1 indicates a linear sequence |
---|
817 | while 2 marks a circular sequence. (See LINE 3 and LINE 4). |
---|
818 | |
---|
819 | Example MolGen/Stanford file |
---|
820 | |
---|
821 | LINE 1 :; Describe the sequence any way you want |
---|
822 | LINE 2 :ECTRNAGLY2 |
---|
823 | LINE 3 :ACGCACGTAC ACGTACGTAC A C G T C C G T ACG TAC GTA CGT |
---|
824 | LINE 4 : GCTTA GG G C T A1 |
---|
825 | |
---|
826 | |
---|
827 | |
---|
828 | |
---|
829 | ||||||||||| Phylip file format |
---|
830 | --------------------------------------------------- |
---|
831 | |
---|
832 | Phylip 3.3 File Format (DNA sequences) |
---|
833 | |
---|
834 | |
---|
835 | The input and output formats for PROTPARS and for RESTML are described in |
---|
836 | their document files. In general their input formats are similar to those |
---|
837 | described here, except that the one-letter codes for data are specific to those |
---|
838 | programs and are described in those document files. Since the input formats |
---|
839 | for the eight DNA sequence programs apply to all eight, they are described |
---|
840 | here. Their input formats are standard: the data have A's, G's, C's and T's |
---|
841 | (or U's). The first line of the input file contains the number of species and |
---|
842 | the number of sites. As with the other programs, options information may |
---|
843 | follow this. In the case of DNAML, DNAMLK, and DNADIST an additional line |
---|
844 | (described in the document file for these programs) may follow the first one. |
---|
845 | Following this, each species starts on a new line. The first 10 characters of |
---|
846 | that line are the species name. There then follows the base sequence of that |
---|
847 | species, each character being one of the letters A, B, C, D, G, H, K, M, N, O, |
---|
848 | R, S, T, U, V, W, X, Y, ?, or - (a period was also previously allowed but it is |
---|
849 | no longer allowed, because it is sometimes used in aligned sequences to mean |
---|
850 | "the same as the sequence above"). Blanks will be ignored, and so will |
---|
851 | numerical digits. This allows GENBANK and EMBL sequence entries to be read |
---|
852 | with minimum editing. |
---|
853 | |
---|
854 | These characters can be either upper or lower case. The algorithms |
---|
855 | convert all input characters to upper case (which is how they are treated). |
---|
856 | The characters constitute the IUPAC (IUB) nucleic acid code plus some slight |
---|
857 | extensions. They enable input of nucleic acid sequences taking full account of |
---|
858 | any ambiguities in the sequence. |
---|
859 | |
---|
860 | The sequences can continue over multiple lines; when this is done the sequences |
---|
861 | must be either in "interleaved" format, similar to the output of alignment |
---|
862 | programs, or "sequential" format. These are described in the main document |
---|
863 | file. In sequential format all of one sequence is given, possibly on multiple |
---|
864 | lines, before the next starts. In interleaved format the first part of the |
---|
865 | file should contain the first part of each of the sequences, then possibly a |
---|
866 | line containing nothing but a carriage-return character, then the second part |
---|
867 | of each sequence, and so on. Only the first parts of the sequences should be |
---|
868 | preceded by names. Here is a hypothetical example of interleaved format: |
---|
869 | |
---|
870 | 5 42 |
---|
871 | Turkey AAGCTNGGGC ATTTCAGGGT |
---|
872 | Salmo gairAAGCCTTGGC AGTGCAGGGT |
---|
873 | H. SapiensACCGGTTGGC CGTTCAGGGT |
---|
874 | Chimp AAACCCTTGC CGTTACGCTT |
---|
875 | Gorilla AAACCCTTGC CGGTACGCTT |
---|
876 | |
---|
877 | GAGCCCGGGC AATACAGGGT AT |
---|
878 | GAGCCGTGGC CGGGCACGGT AT |
---|
879 | ACAGGTTGGC CGTTCAGGGT AA |
---|
880 | AAACCGAGGC CGGGACACTC AT |
---|
881 | AAACCATTGC CGGTACGCTT AA |
---|
882 | |
---|
883 | while in sequential format the same sequences would be: |
---|
884 | |
---|
885 | 5 42 |
---|
886 | Turkey AAGCTNGGGC ATTTCAGGGT |
---|
887 | GAGCCCGGGC AATACAGGGT AT |
---|
888 | Salmo gairAAGCCTTGGC AGTGCAGGGT |
---|
889 | GAGCCGTGGC CGGGCACGGT AT |
---|
890 | H. SapiensACCGGTTGGC CGTTCAGGGT |
---|
891 | ACAGGTTGGC CGTTCAGGGT AA |
---|
892 | Chimp AAACCCTTGC CGTTACGCTT |
---|
893 | AAACCGAGGC CGGGACACTC AT |
---|
894 | Gorilla AAACCCTTGC CGGTACGCTT |
---|
895 | AAACCATTGC CGGTACGCTT AA |
---|
896 | |
---|
897 | |
---|
898 | Note, of course, that a portion of a sequence like this: |
---|
899 | |
---|
900 | 300 AAGCGTGAAC GTTGTACTAA TRCAG |
---|
901 | |
---|
902 | is perfectly legal, assuming that the species name has gone before, and is |
---|
903 | filled out to full length by blanks. The above digits and blanks will be |
---|
904 | ignored, the sequence being taken as starting at the first base symbol (in this |
---|
905 | case an A). |
---|
906 | |
---|
907 | The present versions of the programs may sometimes have difficulties with |
---|
908 | the blank lines between groups of lines, and if so you might want to retype |
---|
909 | those lines, making sure that they have only a carriage-return and no blank |
---|
910 | characters on them, or you may perhaps have to eliminate them. The symptoms of |
---|
911 | this problem are that the programs complain that the sequences are not properly |
---|
912 | aligned, and you can find no other cause for this complaint. |
---|
913 | |
---|
914 | ------------------------------------------------ |
---|
915 | |
---|
916 | |
---|
917 | ||||||||||| ASN.1 file format |
---|
918 | --------------------------------------------------- |
---|
919 | |
---|
920 | |
---|
921 | ASN.1 -- see NCBI toolkit docs, source and examples (ncbi.nlm.nih.gov) |
---|
922 | |
---|
923 | Example asn.1 sequence file---- |
---|
924 | |
---|
925 | Bioseq-set ::= { |
---|
926 | seq-set { |
---|
927 | seq { |
---|
928 | id { local id 1 } , -- id essential |
---|
929 | descr { title "Dummy sequence data from nowhere" } , -- optional |
---|
930 | inst { -- inst essential |
---|
931 | repr raw , |
---|
932 | mol dna , |
---|
933 | length 156 , |
---|
934 | topology linear , |
---|
935 | seq-data |
---|
936 | iupacna "GAATTCATTTTTGAAACAAATCGACCTGACGACGGAATGGTACTCGAATTA |
---|
937 | TGGGCCAAAGGGTTTTATGGGACAAATTAATAGGTGTTCATTATATGCCACTTTCGGAGATTAGATACAGCAATGCAG |
---|
938 | TGGATTCAAAGCAATAGAGTTGTTCTT" |
---|
939 | } } , |
---|
940 | |
---|
941 | seq { |
---|
942 | id { local id 2 } , |
---|
943 | descr { title "Dummy sequence 2 data from somewhere else" } , |
---|
944 | inst { |
---|
945 | repr raw , |
---|
946 | mol dna , |
---|
947 | length 150 , |
---|
948 | topology linear , |
---|
949 | seq-data |
---|
950 | iupacna "TTTTTTTTTTTTGAAACAAATCGACCTGACGACGGAATGGTACTCGAATTA |
---|
951 | TGGGCCAAAGGGTTTTATGGGACAAATTAATAGGTGTTCATTATATGCCACTTTCGGAGATTAGATACAGCAATGCAG |
---|
952 | TGGATTCAAAGCAATAGAGTT" |
---|
953 | } |
---|
954 | } |
---|
955 | } |
---|
956 | } |
---|
957 | |
---|
958 | |
---|
959 | partial ASN.1 description from toolkit |
---|
960 | |
---|
961 | Bioseq ::= SEQUENCE { |
---|
962 | id SET OF Seq-id , -- equivalent identifiers |
---|
963 | descr Seq-descr OPTIONAL , -- descriptors |
---|
964 | inst Seq-inst , -- the sequence data |
---|
965 | annot SET OF Seq-annot OPTIONAL } |
---|
966 | |
---|
967 | Seq-inst ::= SEQUENCE { -- the sequence data itself |
---|
968 | repr ENUMERATED { -- representation class |
---|
969 | not-set (0) , -- empty |
---|
970 | virtual (1) , -- no seq data |
---|
971 | raw (2) , -- continuous sequence |
---|
972 | seg (3) , -- segmented sequence |
---|
973 | const (4) , -- constructed sequence |
---|
974 | ref (5) , -- reference to another sequence |
---|
975 | consen (6) , -- consensus sequence or pattern |
---|
976 | map (7) , -- ordered map (genetic, restriction) |
---|
977 | other (255) } , |
---|
978 | mol ENUMERATED { -- molecule class in living organism |
---|
979 | not-set (0) , -- > cdna = rna |
---|
980 | dna (1) , |
---|
981 | rna (2) , |
---|
982 | aa (3) , |
---|
983 | na (4) , -- just a nucleic acid |
---|
984 | other (255) } , |
---|
985 | length INTEGER OPTIONAL , -- length of sequence in residues |
---|
986 | fuzz Int-fuzz OPTIONAL , -- length uncertainty |
---|
987 | topology ENUMERATED { -- topology of molecule |
---|
988 | not-set (0) , |
---|
989 | linear (1) , |
---|
990 | circular (2) , |
---|
991 | tandem (3) , -- some part of tandem repeat |
---|
992 | other (255) } DEFAULT linear , |
---|
993 | strand ENUMERATED { -- strandedness in living organism |
---|
994 | not-set (0) , |
---|
995 | ss (1) , -- single strand |
---|
996 | ds (2) , -- double strand |
---|
997 | mixed (3) , |
---|
998 | other (255) } OPTIONAL , -- default ds for DNA, ss for RNA, pept |
---|
999 | seq-data Seq-data OPTIONAL , -- the sequence |
---|
1000 | ext Seq-ext OPTIONAL , -- extensions for special types |
---|
1001 | hist Seq-hist OPTIONAL } -- sequence history |
---|
1002 | |
---|
1003 | ------------------------------------------------ |
---|