1 | // -------------- genbank related subroutines ----------------- |
---|
2 | |
---|
3 | #include "genbank.h" |
---|
4 | #include "wrap.h" |
---|
5 | |
---|
6 | #define NOPERIOD 0 |
---|
7 | #define PERIOD 1 |
---|
8 | |
---|
9 | void genbank_key_word(const char *line, int index, char *key) { |
---|
10 | ca_assert((GBINDENT-index) >= 0); |
---|
11 | int len = parse_key_word(line+index, key, " \t\n"); |
---|
12 | if ((index+len) >= GBINDENT) { |
---|
13 | key[GBINDENT-index] = 0; |
---|
14 | } |
---|
15 | } |
---|
16 | |
---|
17 | static int genbank_check_blanks(const char *line, int numb) { |
---|
18 | // Check if there is (numb) of blanks at beginning of line. |
---|
19 | int blank = 1, indi, indk; |
---|
20 | |
---|
21 | for (indi = 0; blank && indi < numb; indi++) { |
---|
22 | if (line[indi] != ' ' && line[indi] != '\t') |
---|
23 | blank = 0; |
---|
24 | if (line[indi] == '\t') { |
---|
25 | indk = indi / 8 + 1; |
---|
26 | indi = 8 * indk + 1; |
---|
27 | } |
---|
28 | } |
---|
29 | |
---|
30 | return (blank); |
---|
31 | } |
---|
32 | |
---|
33 | static void genbank_continue_line(char*& Str, int numb, Reader& reader) { |
---|
34 | // if following line(s) are continued line(s), append them to 'Str'. |
---|
35 | // if 'Str' is NULL, lines only get skipped. |
---|
36 | // 'numb' = number of blanks needed at BOL to defined continued lines |
---|
37 | |
---|
38 | // check continue lines |
---|
39 | for (++reader; |
---|
40 | reader.line() && (genbank_check_blanks(reader.line(), numb) || reader.line()[0] == '\n'); |
---|
41 | ++reader) |
---|
42 | { |
---|
43 | if (reader.line()[0] != '\n') { // empty line is allowed |
---|
44 | if (Str) { |
---|
45 | // remove end-of-line, if there is any |
---|
46 | int ind = Skip_white_space(reader.line(), 0); |
---|
47 | char temp[LINESIZE]; |
---|
48 | strcpy(temp, (reader.line() + ind)); |
---|
49 | skip_eolnl_and_append_spaced(Str, temp); |
---|
50 | } |
---|
51 | } |
---|
52 | } |
---|
53 | } |
---|
54 | |
---|
55 | static void genbank_one_entry_in(char*& datastring, Reader& reader) { |
---|
56 | freedup(datastring, reader.line()+Skip_white_space(reader.line(), GBINDENT)); |
---|
57 | return genbank_continue_line(datastring, GBINDENT, reader); |
---|
58 | } |
---|
59 | |
---|
60 | static void genbank_one_comment_entry(char*& datastring, int start_index, Reader& reader) { |
---|
61 | // Read in one genbank sub-entry in comments lines. |
---|
62 | freedup(datastring, reader.line() + Skip_white_space(reader.line(), start_index)); |
---|
63 | genbank_continue_line(datastring, 20, reader); |
---|
64 | } |
---|
65 | |
---|
66 | static void genbank_source(GenBank& gbk, Reader& reader) { |
---|
67 | // Read in genbank SOURCE lines and also ORGANISM lines. |
---|
68 | genbank_one_entry_in(gbk.source, reader); |
---|
69 | char key[TOKENSIZE]; |
---|
70 | genbank_key_word(reader.line(), 2, key); |
---|
71 | if (str_equal(key, "ORGANISM")) { |
---|
72 | int indent = Skip_white_space(reader.line(), GBINDENT); |
---|
73 | freedup(gbk.organism, reader.line() + indent); |
---|
74 | |
---|
75 | char *skip_em = NULL; |
---|
76 | genbank_continue_line(skip_em, GBINDENT, reader); |
---|
77 | } |
---|
78 | } |
---|
79 | |
---|
80 | class startsWithBlanks : virtual Noncopyable { |
---|
81 | int blanks; |
---|
82 | |
---|
83 | public: |
---|
84 | startsWithBlanks(int blanks_) : blanks(blanks_) {} |
---|
85 | bool operator()(const char *line) const { return genbank_check_blanks(line, blanks); } |
---|
86 | }; |
---|
87 | |
---|
88 | |
---|
89 | static void genbank_skip_unidentified(Reader& reader, int blank_num) { |
---|
90 | // Skip the lines of unidentified keyword. |
---|
91 | ++reader; |
---|
92 | startsWithBlanks num_blanks(blank_num); |
---|
93 | reader.skipOverLinesThat(num_blanks); |
---|
94 | } |
---|
95 | |
---|
96 | static void genbank_reference(GenBank& gbk, Reader& reader) { |
---|
97 | // Read in genbank REFERENCE lines. |
---|
98 | int refnum; |
---|
99 | ASSERT_RESULT(int, 1, sscanf(reader.line() + GBINDENT, "%d", &refnum)); |
---|
100 | if (refnum <= gbk.get_refcount()) { |
---|
101 | warningf(17, "Might redefine reference %d", refnum); |
---|
102 | genbank_skip_unidentified(reader, GBINDENT); |
---|
103 | } |
---|
104 | else { |
---|
105 | gbk.resize_refs(refnum); |
---|
106 | genbank_one_entry_in(gbk.get_latest_ref().ref, reader); |
---|
107 | } |
---|
108 | |
---|
109 | GenbankRef& ref = gbk.get_latest_ref(); |
---|
110 | |
---|
111 | for (; reader.line() && reader.line()[0] == ' ' && reader.line()[1] == ' ';) { |
---|
112 | char key[TOKENSIZE]; |
---|
113 | genbank_key_word(reader.line(), 2, key); |
---|
114 | if (str_equal(key, "AUTHORS")) { |
---|
115 | if (has_content(ref.author)) warningf(10, "AUTHORS of REFERENCE %d is redefined", refnum); |
---|
116 | genbank_one_entry_in(ref.author, reader); |
---|
117 | terminate_with(ref.author, '.'); // add '.' if missing at the end |
---|
118 | } |
---|
119 | else if (str_equal(key, "TITLE")) { |
---|
120 | if (has_content(ref.title)) warningf(11, "TITLE of REFERENCE %d is redefined", refnum); |
---|
121 | genbank_one_entry_in(ref.title, reader); |
---|
122 | } |
---|
123 | else if (str_equal(key, "JOURNAL")) { |
---|
124 | if (has_content(ref.journal)) warningf(12, "JOURNAL of REFERENCE %d is redefined", refnum); |
---|
125 | genbank_one_entry_in(ref.journal, reader); |
---|
126 | } |
---|
127 | else if (str_equal(key, "STANDARD")) { |
---|
128 | if (has_content(ref.standard)) warningf(13, "STANDARD of REFERENCE %d is redefined", refnum); |
---|
129 | genbank_one_entry_in(ref.standard, reader); |
---|
130 | } |
---|
131 | else { |
---|
132 | warningf(18, "Unidentified REFERENCE subkeyword: %s#", key); |
---|
133 | genbank_skip_unidentified(reader, GBINDENT); |
---|
134 | } |
---|
135 | } |
---|
136 | } |
---|
137 | |
---|
138 | static void genbank_comments(GenBank& gbk, Reader& reader) { |
---|
139 | // Read in genbank COMMENTS lines. |
---|
140 | char key[TOKENSIZE]; |
---|
141 | |
---|
142 | if (str0len(reader.line()) <= GBINDENT) { |
---|
143 | ++reader; |
---|
144 | if (!reader.line()) return; |
---|
145 | } |
---|
146 | |
---|
147 | // replace keyword with spaces |
---|
148 | // => identical format for 1st and following lines. |
---|
149 | { |
---|
150 | char *line = strdup(reader.line()); |
---|
151 | for (int indi = 0; indi < GBINDENT; line[indi++] = ' ') {} |
---|
152 | reader.set_line(line); |
---|
153 | free(line); |
---|
154 | } |
---|
155 | |
---|
156 | |
---|
157 | for (; reader.line() && (genbank_check_blanks(reader.line(), GBINDENT) || reader.line()[0] == '\n');) { |
---|
158 | if (reader.line()[0] == '\n') { // skip empty line |
---|
159 | ++reader; |
---|
160 | continue; |
---|
161 | } |
---|
162 | |
---|
163 | int index = Skip_white_space(reader.line(), GBINDENT); |
---|
164 | ca_assert(index<TOKENSIZE); // buffer overflow ? |
---|
165 | |
---|
166 | index += comment_subkey(reader.line()+index, key); |
---|
167 | ca_assert(index<TOKENSIZE); // buffer overflow ? |
---|
168 | |
---|
169 | RDP_comment_parser one_comment_entry = genbank_one_comment_entry; |
---|
170 | RDP_comments& comments = gbk.comments; |
---|
171 | |
---|
172 | if (!parse_RDP_comment(comments, one_comment_entry, key, index, reader)) { |
---|
173 | // other comments |
---|
174 | Append(comments.others, reader.line() + GBINDENT); |
---|
175 | ++reader; |
---|
176 | } |
---|
177 | } |
---|
178 | } |
---|
179 | |
---|
180 | inline bool valid_acc_char(char ch) { return isalnum(ch) || ch == '_'; } |
---|
181 | |
---|
182 | static void genbank_verify_accession(GenBank& gbk) { |
---|
183 | // Verify accession information. |
---|
184 | if (str_equal(gbk.accession, "No information\n")) return; // @@@ really allow this ? |
---|
185 | |
---|
186 | char *new_acc = NULL; |
---|
187 | const char *sep = " \t\n;"; |
---|
188 | SmartCharPtr req_fail; |
---|
189 | SmartCharPtr copy = strdup(gbk.accession); |
---|
190 | int count = 0; |
---|
191 | |
---|
192 | for (char *acc = strtok(&*copy, sep); acc && req_fail.isNull(); acc = strtok(NULL, sep)) { |
---|
193 | count++; |
---|
194 | if (!isalpha(acc[0])) req_fail = strdup("has to start with a letter"); |
---|
195 | else { |
---|
196 | for (int i = 0; acc[i]; ++i) { |
---|
197 | if (!valid_acc_char(acc[i])) { |
---|
198 | req_fail = strf("invalid char '%c'", acc[i]); |
---|
199 | break; |
---|
200 | } |
---|
201 | } |
---|
202 | } |
---|
203 | |
---|
204 | if (new_acc) Append(new_acc, ' '); |
---|
205 | Append(new_acc, acc); |
---|
206 | } |
---|
207 | |
---|
208 | if (req_fail.isNull() && count>9) { |
---|
209 | req_fail = strf("No more than 9 accession number allowed (found %i)", count); |
---|
210 | } |
---|
211 | |
---|
212 | if (!req_fail.isNull()) { |
---|
213 | skip_eolnl_and_append(gbk.accession, ""); |
---|
214 | throw_errorf(15, "Invalid accession number '%s' (%s)", gbk.accession, &*req_fail); |
---|
215 | } |
---|
216 | |
---|
217 | Append(new_acc, '\n'); |
---|
218 | freeset(gbk.accession, new_acc); |
---|
219 | } |
---|
220 | static void genbank_verify_keywords(GenBank& gbk) { |
---|
221 | // Verify keywords. |
---|
222 | int indi, count, len; |
---|
223 | |
---|
224 | // correct missing '.' at the end |
---|
225 | terminate_with(gbk.keywords, '.'); |
---|
226 | |
---|
227 | for (indi = count = 0, len = str0len(gbk.keywords); indi < len; indi++) |
---|
228 | if (gbk.keywords[indi] == '.') |
---|
229 | count++; |
---|
230 | |
---|
231 | if (count != 1) { |
---|
232 | // @@@ raise error here ? |
---|
233 | if (Warnings::shown()) |
---|
234 | fprintf(stderr, "\nKEYWORDS: %s", gbk.keywords); |
---|
235 | warning(141, "No more than one period is allowed in KEYWORDS line."); |
---|
236 | } |
---|
237 | } |
---|
238 | void GenbankParser::parse_section() { |
---|
239 | char key[TOKENSIZE]; |
---|
240 | genbank_key_word(reader.line(), 0, key); |
---|
241 | state = ENTRY_STARTED; |
---|
242 | parse_keyed_section(key); |
---|
243 | } |
---|
244 | |
---|
245 | static void genbank_origin(Seq& seq, Reader& reader) { |
---|
246 | // Read in genbank sequence data. |
---|
247 | ca_assert(seq.is_empty()); |
---|
248 | |
---|
249 | // read in whole sequence data |
---|
250 | for (++reader; reader.line() && !is_sequence_terminator(reader.line()); ++reader) { |
---|
251 | if (has_content(reader.line())) { |
---|
252 | for (int index = 9; reader.line()[index] != '\n' && reader.line()[index] != '\0'; index++) { |
---|
253 | if (reader.line()[index] != ' ') |
---|
254 | seq.add(reader.line()[index]); |
---|
255 | } |
---|
256 | } |
---|
257 | } |
---|
258 | } |
---|
259 | |
---|
260 | void GenbankParser::parse_keyed_section(const char *key) { |
---|
261 | if (str_equal(key, "LOCUS")) { |
---|
262 | genbank_one_entry_in(gbk.locus, reader); |
---|
263 | if (!gbk.locus_contains_date()) { |
---|
264 | static bool alreadyWarned = false; |
---|
265 | if (!alreadyWarned) { |
---|
266 | warning(14, "LOCUS data might be incomplete (no date seen)"); |
---|
267 | alreadyWarned = true; |
---|
268 | } |
---|
269 | } |
---|
270 | } |
---|
271 | else if (str_equal(key, "DEFINITION")) { |
---|
272 | genbank_one_entry_in(gbk.definition, reader); |
---|
273 | terminate_with(gbk.definition, '.'); // correct missing '.' at the end |
---|
274 | } |
---|
275 | else if (str_equal(key, "ACCESSION")) { |
---|
276 | genbank_one_entry_in(gbk.accession, reader); |
---|
277 | genbank_verify_accession(gbk); |
---|
278 | } |
---|
279 | else if (str_equal(key, "KEYWORDS")) { |
---|
280 | genbank_one_entry_in(gbk.keywords, reader); |
---|
281 | genbank_verify_keywords(gbk); |
---|
282 | } |
---|
283 | else if (str_equal(key, "SOURCE")) { |
---|
284 | genbank_source(gbk, reader); |
---|
285 | terminate_with(gbk.source, '.'); // correct missing '.' at the end |
---|
286 | terminate_with(gbk.organism, '.'); |
---|
287 | } |
---|
288 | else if (str_equal(key, "REFERENCE")) { |
---|
289 | genbank_reference(gbk, reader); |
---|
290 | } |
---|
291 | else if (str_equal(key, "COMMENTS")) { |
---|
292 | genbank_comments(gbk, reader); |
---|
293 | } |
---|
294 | else if (str_equal(key, "COMMENT")) { |
---|
295 | genbank_comments(gbk, reader); |
---|
296 | } |
---|
297 | else if (str_equal(key, "ORIGIN")) { |
---|
298 | genbank_origin(seq, reader); |
---|
299 | state = ENTRY_COMPLETED; |
---|
300 | } |
---|
301 | else { |
---|
302 | genbank_skip_unidentified(reader, 2); |
---|
303 | } |
---|
304 | } |
---|
305 | |
---|
306 | static void genbank_print_lines(Writer& write, const char *key, const char *content, const WrapMode& wrapMode) { |
---|
307 | // Print one genbank line, wrap around if over column GBMAXLINE |
---|
308 | |
---|
309 | ca_assert(strlen(key) == GBINDENT); |
---|
310 | ca_assert(content[strlen(content)-1] == '\n'); |
---|
311 | |
---|
312 | wrapMode.print(write, key, " ", content, GBMAXLINE); |
---|
313 | } |
---|
314 | |
---|
315 | static void genbank_out_one_entry(Writer& write, const char *key, const char *content, const WrapMode& wrapMode, int period) { |
---|
316 | /* Print out key and content if content length > 1 |
---|
317 | * otherwise print key and "No information" w/wo |
---|
318 | * period at the end depending on flag period. |
---|
319 | */ |
---|
320 | |
---|
321 | if (!has_content(content)) { |
---|
322 | content = period ? "No information.\n" : "No information\n"; |
---|
323 | } |
---|
324 | genbank_print_lines(write, key, content, wrapMode); |
---|
325 | } |
---|
326 | |
---|
327 | static void genbank_out_one_reference(Writer& write, const GenbankRef& gbk_ref, int gbk_ref_num) { |
---|
328 | WrapMode wrapWords(true); |
---|
329 | |
---|
330 | { |
---|
331 | const char *r = gbk_ref.ref; |
---|
332 | char refnum[TOKENSIZE]; |
---|
333 | |
---|
334 | if (!has_content(r)) { |
---|
335 | sprintf(refnum, "%d\n", gbk_ref_num); |
---|
336 | r = refnum; |
---|
337 | } |
---|
338 | genbank_out_one_entry(write, "REFERENCE ", r, wrapWords, NOPERIOD); |
---|
339 | } |
---|
340 | |
---|
341 | genbank_out_one_entry(write, " AUTHORS ", gbk_ref.author, WrapMode(" "), NOPERIOD); |
---|
342 | genbank_out_one_entry(write, " JOURNAL ", gbk_ref.journal, wrapWords, NOPERIOD); |
---|
343 | genbank_out_one_entry(write, " TITLE ", gbk_ref.title, wrapWords, NOPERIOD); |
---|
344 | genbank_out_one_entry(write, " STANDARD ", gbk_ref.standard, wrapWords, NOPERIOD); |
---|
345 | } |
---|
346 | |
---|
347 | static void genbank_print_comment_if_content(Writer& write, const char *key, const char *content) { |
---|
348 | // Print one genbank line, wrap around if over column GBMAXLINE |
---|
349 | |
---|
350 | if (!has_content(content)) return; |
---|
351 | |
---|
352 | char first[LINESIZE]; sprintf(first, "%*s%s", GBINDENT+RDP_SUBKEY_INDENT, "", key); |
---|
353 | char other[LINESIZE]; sprintf(other, "%*s", GBINDENT+RDP_SUBKEY_INDENT+RDP_CONTINUED_INDENT, ""); |
---|
354 | WrapMode(true).print(write, first, other, content, GBMAXLINE); |
---|
355 | } |
---|
356 | |
---|
357 | static void genbank_out_origin(const Seq& seq, Writer& write) { // @@@ inline method |
---|
358 | // Output sequence data in genbank format. |
---|
359 | seq.out(write, GENBANK); |
---|
360 | } |
---|
361 | |
---|
362 | inline void genbank_print_completeness(Writer& write, char compX, char X) { |
---|
363 | if (compX == ' ') return; |
---|
364 | ca_assert(compX == 'y' || compX == 'n'); |
---|
365 | write.outf(" %c' end complete: %s\n", X, compX == 'y' ? "Yes" : "No"); |
---|
366 | } |
---|
367 | |
---|
368 | void genbank_out_header(const GenBank& gbk, const Seq& seq, Writer& write) { |
---|
369 | int indi; |
---|
370 | WrapMode wrapWords(true); |
---|
371 | |
---|
372 | genbank_out_one_entry(write, "LOCUS ", gbk.locus, wrapWords, NOPERIOD); |
---|
373 | genbank_out_one_entry(write, "DEFINITION ", gbk.definition, wrapWords, PERIOD); |
---|
374 | genbank_out_one_entry(write, "ACCESSION ", gbk.accession, wrapWords, NOPERIOD); |
---|
375 | genbank_out_one_entry(write, "KEYWORDS ", gbk.keywords, WrapMode(";"), PERIOD); |
---|
376 | genbank_out_one_entry(write, "SOURCE ", gbk.source, wrapWords, PERIOD); |
---|
377 | genbank_out_one_entry(write, " ORGANISM ", gbk.organism, wrapWords, PERIOD); |
---|
378 | |
---|
379 | if (gbk.has_refs()) { |
---|
380 | for (indi = 0; indi < gbk.get_refcount(); indi++) { |
---|
381 | genbank_out_one_reference(write, gbk.get_ref(indi), indi+1); |
---|
382 | } |
---|
383 | } |
---|
384 | else { |
---|
385 | genbank_out_one_reference(write, GenbankRef(), 1); |
---|
386 | } |
---|
387 | |
---|
388 | const RDP_comments& comments = gbk.comments; |
---|
389 | const OrgInfo& orginf = comments.orginf; |
---|
390 | const SeqInfo& seqinf = comments.seqinf; |
---|
391 | |
---|
392 | if (comments.exists()) { |
---|
393 | write.out("COMMENTS "); |
---|
394 | |
---|
395 | if (orginf.exists()) { |
---|
396 | write.out("Organism information\n"); |
---|
397 | |
---|
398 | genbank_print_comment_if_content(write, "Source of strain: ", orginf.source); |
---|
399 | genbank_print_comment_if_content(write, "Culture collection: ", orginf.cultcoll); // this field is used in ../lib/import/.rdp_old.ift |
---|
400 | genbank_print_comment_if_content(write, "Former name: ", orginf.formname); // other fields occur in no .ift |
---|
401 | genbank_print_comment_if_content(write, "Alternate name: ", orginf.nickname); |
---|
402 | genbank_print_comment_if_content(write, "Common name: ", orginf.commname); |
---|
403 | genbank_print_comment_if_content(write, "Host organism: ", orginf.hostorg); |
---|
404 | |
---|
405 | if (seqinf.exists() || str0len(comments.others) > 0) |
---|
406 | write.out(" "); |
---|
407 | } |
---|
408 | |
---|
409 | if (seqinf.exists()) { |
---|
410 | write.outf("Sequence information (bases 1 to %d)\n", seq.get_len()); |
---|
411 | |
---|
412 | genbank_print_comment_if_content(write, "RDP ID: ", seqinf.RDPid); |
---|
413 | genbank_print_comment_if_content(write, "Corresponding GenBank entry: ", seqinf.gbkentry); // this field is used in ../lib/import/.rdp_old.ift |
---|
414 | genbank_print_comment_if_content(write, "Sequencing methods: ", seqinf.methods); |
---|
415 | |
---|
416 | genbank_print_completeness(write, seqinf.comp5, '5'); |
---|
417 | genbank_print_completeness(write, seqinf.comp3, '3'); |
---|
418 | } |
---|
419 | |
---|
420 | // @@@ use wrapper for code below ? |
---|
421 | // print GBINDENT spaces of the first line |
---|
422 | if (str0len(comments.others) > 0) { |
---|
423 | write.repeated(' ', GBINDENT); |
---|
424 | } |
---|
425 | |
---|
426 | if (str0len(comments.others) > 0) { |
---|
427 | int length = str0len(comments.others); |
---|
428 | for (indi = 0; indi < length; indi++) { |
---|
429 | write.out(comments.others[indi]); |
---|
430 | |
---|
431 | // if another line, print GBINDENT spaces first |
---|
432 | if (comments.others[indi] == '\n' && comments.others[indi + 1] != '\0') { |
---|
433 | write.repeated(' ', GBINDENT); |
---|
434 | } |
---|
435 | } |
---|
436 | } |
---|
437 | } |
---|
438 | } |
---|
439 | |
---|
440 | void genbank_out_base_count(const Seq& seq, Writer& write) { |
---|
441 | BaseCounts bases; |
---|
442 | seq.count(bases); |
---|
443 | write.outf("BASE COUNT %6d a %6d c %6d g %6d t", bases.a, bases.c, bases.g, bases.t); |
---|
444 | if (bases.other) { // don't write 0 others |
---|
445 | write.outf(" %6d others", bases.other); |
---|
446 | } |
---|
447 | write.out('\n'); |
---|
448 | } |
---|
449 | |
---|
450 | void genbank_out(const GenBank& gbk, const Seq& seq, Writer& write) { |
---|
451 | // Output in a genbank format |
---|
452 | |
---|
453 | genbank_out_header(gbk, seq, write); |
---|
454 | genbank_out_base_count(seq, write); |
---|
455 | write.out("ORIGIN\n"); |
---|
456 | genbank_out_origin(seq, write); |
---|
457 | } |
---|
458 | |
---|
459 | bool GenbankReader::read_one_entry(Seq& seq) { |
---|
460 | data.reinit(); |
---|
461 | if (!GenbankParser(data, seq, *this).parse_entry()) abort(); |
---|
462 | return ok(); |
---|
463 | } |
---|