Context Navigation

source: branches/port5/GENOM_IMPORT/Importer.cxx

Visit:

Last change on this file was 6104, checked in by westram, 16 years ago
use GB_warningf instead of GB_warning(GBS_global_string) same for GB_internal_error, GB_export_error
File size: 27.2 KB

Line
1	// ================================================================ //
2	// //
3	// File : Importer.cxx //
4	// Purpose : Genome importer core //
5	// //
6	// Coded by Ralf Westram (coder@reallysoft.de) in November 2006 //
7	// Institute of Microbiology (Technical University Munich) //
8	// http://www.arb-home.de/ //
9	// //
10	// ================================================================ //
11
12	#include "Importer.h"
13	#include "tools.h"
14	#include "Feature.h"
15	#include "DBwriter.h"
16
17	using namespace std;
18
19	//--------------------------------------------------------------------------------
20
static bool is_escaped(const std::string& str, size_t pos) {
    // Tells whether the character at position 'pos' of 'str' is escaped by '\\'.
    //
    // A character is escaped if it is directly preceded by an odd number of
    // consecutive backslashes (with an even number the backslashes escape
    // each other). Position 0 can never be escaped.

    size_t backslashes = 0;
    for (size_t i = pos; i > 0 && str[i-1] == '\\'; --i) {
        ++backslashes;
    }
    return (backslashes % 2) == 1;
}
32
33	FeatureLine::FeatureLine(const string& line) {
34	// start parsing at position 5
35	string::size_type first_char = line.find_first_not_of(' ', 5);
36
37	orgLine = line;
38
39	if (first_char == 5) { // feature start
40	string::size_type behind_name = line.find_first_of(' ', first_char);
41	string::size_type rest_start = line.find_first_not_of(' ', behind_name);
42
43	if (rest_start == string::npos) {
44	if (behind_name == string::npos) throw "Expected space behind feature name";
45	throw "Expected some content behind feature name";
46	}
47
48	name = line.substr(first_char, behind_name-first_char);
49	rest = line.substr(rest_start);
50	type = FL_START;
51	}
52	else if (first_char >= 21) { // not feature start
53	if (first_char == 21 && line[first_char] == '/') { // qualifier start
54	string::size_type equal_pos = line.find_first_of('=', first_char);
55	if (equal_pos == string::npos) {
56	// qualifier w/o data (i.e. "/pseudo")
57	name = line.substr(first_char+1);
58	rest = "true";
59	type = FL_QUALIFIER_NODATA;
60	}
61	else {
62	name = line.substr(first_char+1, equal_pos-first_char-1);
63	rest = line.substr(equal_pos+1);
64
65	if (rest[0] == '"') {
66	size_t rlen = rest.length();
67
68	if (rlen == 1) { // special case: only one open quote behind qualifier
69	type = FL_QUALIFIER_QUOTE_OPENED;
70	}
71	else if (rest[rlen-1] == '"' && !is_escaped(rest, rlen-1)) { // closing non-escaped quote at eol
72	type = FL_QUALIFIER_QUOTED;
73	}
74	else {
75	type = FL_QUALIFIER_QUOTE_OPENED;
76	}
77	}
78	else {
79	type = FL_QUALIFIER;
80	}
81	}
82	}
83	else { // continued line
84	interpret_as_continued_line();
85	}
86	}
87	else {
88	if (first_char == string::npos) {
89	throw "Expected feature line, found empty line";
90	}
91	throw GBS_global_string("Expected feature line (first char at pos=%zu unexpected)", first_char);
92	}
93	}
94
95	void FeatureLine::interpret_as_continued_line() {
96	rest = orgLine.substr(21);
97	if (rest[rest.length()-1] == '"') {
98	type = FL_CONTINUED_QUOTE_CLOSED;
99	}
100	else {
101	type = FL_CONTINUED;
102	}
103	}
104
105	bool FeatureLine::reinterpret_as_continued_line() {
106	bool ok = false;
107
108	if (type == FL_QUALIFIER \|\| type == FL_QUALIFIER_NODATA) {
109	string::size_type first_char = orgLine.find_first_not_of(' ', 5);
110	if (first_char >= 21) {
111	interpret_as_continued_line();
112	ok = true;
113	}
114	}
115
116	return ok;
117	}
118
119	//--------------------------------------------------------------------------------
120
121	Importer::Importer(FileBuffer& Flatfile, DBwriter& DB_writer, const MetaTag *meta_description)
122	: db_writer(DB_writer)
123	, flatfile(Flatfile)
124	, tagTranslator(meta_description)
125	{ }
126
127	void Importer::warning(const char *msg) {
128	warnings.push_back(msg);
129	}
130
131	FeatureLinePtr Importer::getFeatureTableLine() {
132	FeatureLinePtr fline;
133
134	if (pushedFeatureLines.empty()) { // nothing on stack -> read new
135	string line;
136	if (readFeatureTableLine(line)) fline = new FeatureLine(line);
137	}
138	else {
139	fline = pushedFeatureLines.back();
140	pushedFeatureLines.pop_back();
141	}
142	return fline;
143	}
144
145	FeatureLinePtr Importer::getUnwrappedFeatureTableLine() {
146	FeatureLinePtr fline = getFeatureTableLine();
147	if (!fline.Null()) {
148	if (fline->type & FL_META_CONTINUED) throw "Expected start of feature or qualifier";
149
150	if (0 == (fline->type & (FL_QUALIFIER_NODATA\|FL_QUALIFIER_QUOTED))) {
151	// qualifier/featurestart may be wrapped
152	FeatureLinePtr next_fline = getFeatureTableLine();
153
154	while (!next_fline.Null() &&
155	fline->type != FL_QUALIFIER_QUOTED) // already seen closing quote
156	{
157	if ((next_fline->type&FL_META_CONTINUED) == 0) {
158	// special case: a wrapped line of a quoted qualifier may start with /xxx
159	// (in that case it is misinterpreted as qualifier start)
160	if (fline->type == FL_QUALIFIER_QUOTE_OPENED) {
161	if (!next_fline->reinterpret_as_continued_line()) {
162	throw "did not see end of quoted qualifier (instead found next qualifiert)";
163	}
164	gi_assert(next_fline->type & FL_META_CONTINUED);
165	}
166	else {
167	break;
168	}
169	}
170
171	if (next_fline->type == FL_CONTINUED_QUOTE_CLOSED) {
172	if (fline->type != FL_QUALIFIER_QUOTE_OPENED) throw "Unexpected closing quote";
173	fline->type = FL_QUALIFIER_QUOTED;
174	}
175	else {
176	gi_assert(next_fline->type == FL_CONTINUED);
177	gi_assert(fline->type == FL_START \|\| fline->type == FL_QUALIFIER \|\| fline->type == FL_QUALIFIER_QUOTE_OPENED);
178	}
179
180	fline->rest.append(next_fline->rest);
181	next_fline = getFeatureTableLine();
182	}
183
184	if (!next_fline.Null()) backFeatureTableLine(next_fline);
185	}
186	}
187	return fline;
188	}
189
190	FeaturePtr Importer::parseFeature() {
191	FeaturePtr feature;
192	FeatureLinePtr fline = getUnwrappedFeatureTableLine();
193
194	if (!fline.Null()) { // found a feature table line
195	if (fline->type != FL_START) throw "Expected feature start";
196
197	feature = new Feature(fline->name, fline->rest);
198
199	fline = getUnwrappedFeatureTableLine();
200	while (!fline.Null() && (fline->type & FL_META_QUALIFIER)) {
201	feature->addQualifiedEntry(fline->name, fline->rest);
202	fline = getUnwrappedFeatureTableLine();
203	}
204	if (!fline.Null()) backFeatureTableLine(fline);
205	}
206
207	return feature;
208	}
209
210	void Importer::parseFeatureTable() {
211	FeaturePtr feature = parseFeature();
212
213	while (!feature.Null()) {
214	feature->expectLocationInSequence(expectedSeqLength);
215	feature->fixEmptyQualifiers();
216	db_writer.writeFeature(*feature);
217	feature = parseFeature();
218	}
219	}
220
221	void Importer::show_warnings(const string& import_of_what) {
222	if (!warnings.empty()) {
223	const char *what = import_of_what.c_str();
224	stringVectorCRIter e = warnings.rend();
225	for (stringVectorCRIter i = warnings.rbegin(); i != e; ++i) {
226	GB_warningf("Warning: %s: %s", what, i->c_str());
227	}
228	warnings.clear();
229	}
230	}
231
232
233	void Importer::import() {
234	try {
235	string line;
236	while (flatfile.getLine(line)) {
237	if (!line.empty()) { // silently skip empty lines before or after section
238	flatfile.backLine(line);
239
240	// cleanup from import of previous section
241	gi_assert(pushedFeatureLines.empty()); // oops - somehow forgot a feature
242	pushedFeatureLines.clear();
243	warnings.clear();
244
245	expectedSeqLength = 0; // reset expected seq. length
246	import_section();
247
248	gi_assert(warnings.empty());
249	gi_assert(pushedFeatureLines.empty()); // oops - somehow forgot a feature
250	}
251	}
252	}
253	catch (const DBerror& err) { throw err.getMessage(); }
254	catch (const string& err) { throw flatfile.lineError(err); }
255	catch (const char *err) { throw flatfile.lineError(err); }
256	}
257
258	//--------------------------------------------------------------------------------
259	// Meta information definitions
260	//
261	//
262	// [ please keep the list of common entries in
263	// ../HELP_SOURCE/oldhelp/sp_info.hlp
264	// up to date! ]
265
266	static MetaTag genebank_meta_description[] = {
267	{ "LOCUS", "org_locus", MT_HEADER },
268
269	{ "REFERENCE", "", MT_REF_START },
270	{ " AUTHORS", "author", MT_REF },
271	{ " TITLE", "title", MT_REF },
272	{ " CONSRTM", "refgrp", MT_REF },
273	{ " JOURNAL", "journal", MT_REF },
274	{ " PUBMED", "pubmed_id", MT_REF },
275	{ " MEDLINE", "medline_id", MT_REF },
276	{ " REMARK", "refremark", MT_REF },
277
278	{ "DEFINITION", "definition", MT_BASIC },
279	{ "ACCESSION", "acc", MT_BASIC },
280	{ "VERSION", "version", MT_BASIC },
281	{ "KEYWORDS", "keywd", MT_BASIC },
282	{ "SOURCE", "full_name", MT_BASIC },
283	{ " ORGANISM", "tax", MT_BASIC },
284	{ "COMMENT", "comment", MT_BASIC },
285	{ "PROJECT", "projref", MT_BASIC },
286
287	{ "FEATURES", "", MT_FEATURE_START },
288	{ "CONTIG", "", MT_CONTIG },
289	{ "BASE", "", MT_SEQUENCE_START }, // BASE COUNT (sometimes missing)
290	{ "ORIGIN", "", MT_SEQUENCE_START }, // only used if BASE COUNT is missing
291	{ "//", "", MT_END },
292
293	{ "", "", MT_IGNORE }, // End of array
294	};
295
296	static MetaTag embl_meta_description[] = {
297	{ "ID", "org_id", MT_HEADER },
298
299	{ "RN", "", MT_REF_START },
300	{ "RA", "author", MT_REF },
301	{ "RC", "auth_comm", MT_REF },
302	{ "RG", "refgrp", MT_REF },
303	{ "RL", "journal", MT_REF },
304	{ "RP", "nuc_rp", MT_REF },
305	{ "RT", "title", MT_REF },
306	{ "RX", "", MT_REF_DBID }, // @@@ extract field 'pubmed_id' ?
307
308	{ "AC", "acc", MT_BASIC },
309	{ "AH", "assembly_header", MT_BASIC },
310	{ "AS", "assembly_info", MT_BASIC },
311	{ "CC", "comment", MT_BASIC },
312	{ "CO", "contig", MT_BASIC },
313	{ "DE", "description", MT_BASIC },
314	{ "DR", "db_xref", MT_BASIC },
315	{ "DT", "date", MT_BASIC },
316	{ "SV", "version", MT_BASIC },
317	{ "KW", "keywd", MT_BASIC },
318	{ "OS", "full_name", MT_BASIC },
319	{ "OC", "tax", MT_BASIC },
320	{ "OG", "organelle", MT_BASIC },
321	{ "PR", "projref", MT_BASIC },
322
323	{ "FH", "", MT_FEATURE_START },
324	{ "FT", "", MT_FEATURE },
325	{ "SQ", "", MT_SEQUENCE_START },
326	{ "//", "", MT_END },
327
328	{ "XX", "", MT_IGNORE }, // spacer
329
330	{ "", "", MT_IGNORE }, // End of array
331	};
332
333	//--------------------------------------------------------------------------------
334
335
336	GenebankImporter::GenebankImporter(FileBuffer& Flatfile, DBwriter& DB_writer)
337	: Importer(Flatfile, DB_writer, genebank_meta_description)
338	{ }
339
340	bool GenebankImporter::readFeatureTableLine(string& line) {
341	if (flatfile.getLine(line)) {
342	if (beginsWith(line, " ")) {
343	return true;
344	}
345	flatfile.backLine(line);
346	}
347	return false;
348	}
349
static bool splitGenebankTag(const std::string& line, std::string& tag, std::string& content) {
    // Splits 'line' into 'tag' (incl. preceding spaces) and 'content'.
    // Returns true if the line meets the format requirements.
    //
    // - wrapped lines (content starts at position 12, or the line consists of
    //   exactly 12 spaces) result in tag=""
    // - lines whose first non-space character is behind position 12 are rejected
    //   (as are lines consisting only of spaces which are not 12 characters long)

    const std::string::size_type npos      = std::string::npos;
    const std::string::size_type tag_start = line.find_first_not_of(' ');

    const bool empty_wrapped_line = (tag_start == npos && line.length() == 12);
    if (tag_start == 12 || empty_wrapped_line) { // no tag, only (maybe empty) content
        tag     = "";
        content = line.substr(12);
        return true;
    }

    if (tag_start > 12) return false;

    const std::string::size_type tag_end = line.find_first_of(' ', tag_start);
    if (tag_end == npos) { // only tag w/o spaces behind
        tag     = line;
        content = "";
        return true;
    }

    const std::string::size_type content_start = line.find_first_not_of(' ', tag_end);
    if (content_start != npos) content = line.substr(content_start);
    else                       content = ""; // line w/o content

    tag = line.substr(0, tag_end);
    return true;
}
384
385	static long scanSeqlenFromLOCUS(const string& locusContent) {
386	StringParser parser(locusContent);
387	parser.extractWord(); // id
388	parser.eatSpaces();
389
390	long bp = parser.extractNumber();
391	parser.eatSpaces();
392	parser.expectContent("bp");
393
394	return bp;
395	}
396
397	void GenebankImporter::import_section() {
398	MetaInfo meta;
399	References refs;
400
401	const MetaTag *prevTag = 0; // previously handled tag
402	string prevContent; // previously found content
403
404	bool seenHeaderLine = false;
405	bool EOS = false; // end of section ?
406
407	// read header of file
408	while (!EOS) {
409	string line, tag, content;
410	expectLine(line);
411	if (!splitGenebankTag(line, tag, content)) {
412	gi_assert(0);
413	}
414
415	if (tag.empty()) { // no tag - happens at wrapped lines
416	prevContent.append(1, ' ');
417	prevContent.append(content);
418	}
419	else { // start of new tag
420	const MetaTag *knownTag = findTag(tag);
421	if (!knownTag) throw GBS_global_string("Invalid tag '%s'", tag.c_str());
422
423	if (prevTag) { // save previous tag
424	switch (prevTag->type) {
425	case MT_REF: refs.add(prevTag->field, prevContent); break;
426	case MT_BASIC: meta.add(prevTag, prevContent, true); break;
427	case MT_HEADER:
428	meta.add(prevTag, prevContent, true); // save header line
429	// printf("Header not handled yet: '%s'\n", prevContent.c_str());
430	expectedSeqLength = scanSeqlenFromLOCUS(prevContent);
431	break;
432	case MT_REF_DBID: // embl only
433	default: gi_assert(0); break;
434	}
435	prevTag = 0;
436	}
437
438	switch (knownTag->type) {
439	case MT_HEADER:
440	if (seenHeaderLine) throw GBS_global_string("Multiple occurrences of tag '%s'", tag.c_str());
441	seenHeaderLine = true;
442	// fall-through
443	case MT_BASIC:
444	case MT_REF:
445	prevTag = knownTag;
446	prevContent = content;
447	break;
448
449	case MT_REF_START:
450	refs.start(); // start a new reference
451	break;
452
453	case MT_FEATURE_START:
454	db_writer.createOrganism(flatfile.getFilename(), "NCBI");
455	parseFeatureTable();
456	break;
457
458	case MT_SEQUENCE_START:
459	parseSequence(knownTag->tag, content);
460	EOS = true; // end of section
461	break;
462
463	case MT_IGNORE:
464	break;
465
466	case MT_END:
467	EOS = true;
468	break;
469
470	case MT_CONTIG:
471	throw GBS_global_string("Cannot import files containing CONTIG");
472
473	case MT_REF_DBID: // embl only
474	default:
475	gi_assert(0);
476	throw GBS_global_string("Tag '%s' not expected here", knownTag->tag.c_str());
477	}
478	}
479	}
480
481	db_writer.finalizeOrganism(meta, refs, *this);
482	show_warnings(meta.getAccessionNumber());
483	}
484
485	//--------------------------------------------------------------------------------
486
487
488	EmblImporter::EmblImporter(FileBuffer& Flatfile, DBwriter& DB_writer)
489	: Importer(Flatfile, DB_writer, embl_meta_description)
490	{ }
491
static bool splitEmblTag(const std::string& line, std::string& tag, std::string& content) {
    // Splits 'line' into a 2-character 'tag' and 'content'.
    // Returns true on success (i.e. if the line meets the required format):
    //
    // - a line consisting of the tag only (2 characters) has empty content
    // - otherwise the tag has to be followed by a spacer at positions 2-4 and
    //   the content starts at position 5
    //
    // Lines which are too short to hold tag and spacer are rejected here,
    // so line.substr(5) can never throw std::out_of_range.

    if (line.length() == 2) {
        tag     = line;
        content = "";
        return true;
    }

    if (line.length() < 5) return false;                  // no room for tag + spacer
    if (line[0] == ' ' || line[1] == ' ') return false;   // tag must not contain spaces

    for (std::string::size_type p = 2; p < 5; ++p) {      // expect spacer at pos 2-4
        if (line[p] != ' ') return false;
    }

    tag     = line.substr(0, 2);
    content = line.substr(5);
    return true;
}
510
511	bool EmblImporter::readFeatureTableLine(string& line) {
512	if (flatfile.getLine(line)) {
513	if (beginsWith(line, "FT ")) {
514	return true;
515	}
516	flatfile.backLine(line);
517	}
518	return false;
519	}
520
521	static long scanSeqlenFromID(const string& idContent) {
522	StringParser parser(idContent);
523	string lastWord = parser.extractWord(); // eat id
524	bool bpseen = false;
525	long bp = -1;
526
527	while (!bpseen) {
528	parser.eatSpaces();
529	string word = parser.extractWord();
530	if (word == "BP.") {
531	// basecount is in word before "BP."
532	bp = atol(lastWord.c_str());
533	bpseen = true;
534	}
535	else {
536	lastWord = word;
537	}
538	}
539
540	if (bp == -1) throw "Could not parse bp from header";
541
542	return bp;
543	}
544
545	void EmblImporter::import_section() {
546	MetaInfo meta;
547	References refs;
548
549	const MetaTag *prevTag = 0; // previously handled tag
550	string prevContent; // previously found content
551	bool prevAppendNL = false; // append '\n' into multiline tags
552
553	bool seenHeaderLine = false;
554	bool EOS = false; // end of section ?
555
556	// read header of file
557	while (!EOS) {
558	string line, tag, content;
559	expectLine(line);
560	if (!splitEmblTag(line, tag, content)) {
561	throw "Expected two-character tag at start of line";
562	}
563
564	const MetaTag *knownTag = findTag(tag);
565	if (!knownTag) throw GBS_global_string("Invalid tag '%s'", tag.c_str());
566
567	if (knownTag == prevTag) { // multiline tag
568	if (prevAppendNL) prevContent.append("\n"); // append a newline to make parsing in add_dbid() more easy
569	prevContent.append(content); // append w/o space - EMBL flatfiles have spaces at EOL when needed
570	}
571	else { // start of new tag
572	if (prevTag) { // save previous tag
573	switch (prevTag->type) {
574	case MT_REF: refs.add(prevTag->field, prevContent); break;
575	case MT_REF_DBID: refs.add_dbid(prevContent); prevAppendNL = false; break;
576	case MT_BASIC: meta.add(prevTag, prevContent, true); break;
577	case MT_HEADER:
578	meta.add(prevTag, prevContent, true);
579	// printf("Header not handled yet: '%s'\n", prevContent.c_str());
580	expectedSeqLength = scanSeqlenFromID(prevContent);
581	break;
582	default: gi_assert(0); break;
583	}
584	prevTag = 0;
585	}
586
587	switch (knownTag->type) {
588	case MT_HEADER:
589	if (seenHeaderLine) throw GBS_global_string("Multiple occurrences of tag '%s'", tag.c_str());
590	seenHeaderLine = true;
591	// fall-through
592	case MT_BASIC:
593	case MT_REF:
594	prevTag = knownTag;
595	prevContent = content;
596	break;
597
598	case MT_REF_DBID:
599	prevTag = knownTag;
600	prevContent = content;
601	prevAppendNL = true;
602	break;
603
604	case MT_REF_START:
605	refs.start(); // start a new reference
606	break;
607
608	case MT_FEATURE:
609	flatfile.backLine(line);
610	db_writer.createOrganism(flatfile.getFilename(), "EMBL");
611	parseFeatureTable();
612	break;
613
614	case MT_SEQUENCE_START:
615	parseSequence(content);
616	EOS = true; // end of section
617	break;
618
619	case MT_FEATURE_START:
620	case MT_IGNORE:
621	break;
622
623	default:
624	gi_assert(0);
625	throw GBS_global_string("Tag '%s' not expected here", knownTag->tag.c_str());
626	}
627	}
628	}
629	db_writer.finalizeOrganism(meta, refs, *this);
630	show_warnings(meta.getAccessionNumber());
631	}
632
633	// --------------------------------------------------------------------------------
634	// sequence readers:
635
636	inline bool parseCounter(bool expect, BaseCounter& headerCount, StringParser& parser, Base base, const char *word) {
637	// parses part of string (e.g. " 6021225 BP;" or " 878196 A;")
638	// if 'expect' == true -> throw exception if missing
639	// if 'expect' == false -> return false if missing
640
641	bool found = false;
642	stringCIter start = parser.getPosition();
643
644	parser.expectSpaces(0);
645
646	bool seen_number;
647	long count = parser.eatNumber(seen_number);
648
649	if (seen_number) {
650	headerCount.addCount(base, count);
651	size_t spaces = parser.eatSpaces();
652	if (spaces>0) {
653	size_t len = parser.lookingAt(word);
654	if (len>0) { // seen
655	parser.advance(len);
656	found = true;
657	}
658	}
659	}
660
661	if (!found) {
662	parser.setPosition(start); // reset position
663	if (expect) throw GBS_global_string("Expected counter '### %s', found '%s'", word, parser.rest().c_str());
664	}
665	return found;
666	}
667
668	void GenebankImporter::parseSequence(const string& tag, const string& headerline) {
669	SmartPtr<BaseCounter> headerCount;
670
671	if (tag == "BASE") { // base count not always present
672	// parse headerline :
673	headerCount = new BaseCounter("sequence header");
674	{
675	StringParser parser(headerline);
676
677	parser.expectContent("COUNT");
678
679	parseCounter(true, *headerCount, parser, BC_A, "a");
680	parseCounter(true, *headerCount, parser, BC_C, "c");
681	parseCounter(true, *headerCount, parser, BC_G, "g");
682	parseCounter(true, *headerCount, parser, BC_T, "t");
683	parseCounter(false, *headerCount, parser, BC_OTHER, "others"); // not always present
684
685	headerCount->calcOverallCounter();
686	}
687	}
688
689	// parse sequence data
690	size_t est_seq_size = headerCount.Null() ? 500000 : headerCount->getCount(BC_ALL);
691	SequenceBuffer seqData(est_seq_size);
692	{
693	string line;
694
695	if (!headerCount.Null()) {
696	// if BASE COUNT was present, check ORIGIN line
697	// otherwise ORIGIN line has already been read
698	expectLine(line);
699	if (!beginsWith(line, "ORIGIN")) throw "Expected 'ORIGIN'";
700	}
701
702	bool eos_seen = false;
703	while (!eos_seen) {
704	expectLine(line);
705	if (beginsWith(line, "//")) {
706	eos_seen = true;
707	}
708	else {
709	string data;
710	data.reserve(60);
711	StringParser parser(line);
712
713	parser.eatSpaces(); // not sure whether there really have to be spaces if number has 9 digits or more
714	size_t cur_pos = (size_t)parser.extractNumber();
715	size_t datasize = seqData.getBaseCounter().getCount(BC_ALL);
716
717	if (cur_pos != (datasize+1)) {
718	throw GBS_global_string("Got wrong base position (found=%zu, expected=%zu)", cur_pos, size_t(datasize+1));
719	}
720
721	int blocks = 0;
722	while (!parser.atEnd() && parser.at() == ' ') {
723	parser.expectSpaces(1);
724
725	stringCIter start = parser.pos;
726	stringCIter end = parser.find(' ');
727
728	data.append(start, end);
729	blocks++;
730	}
731
732	if (blocks>6) throw "Found more than 6 parts of sequence data";
733	seqData.addLine(data);
734	}
735	}
736	}
737
738	if (headerCount.Null()) {
739	warning("No 'BASE COUNT' found. Base counts have not been validated.");
740	}
741	else {
742	headerCount->expectEqual(seqData.getBaseCounter());
743	}
744	db_writer.writeSequence(seqData);
745	}
746
747	void EmblImporter::parseSequence(const string& headerline) {
748	// parse headerline:
749	BaseCounter headerCount("sequence header");
750	{
751	StringParser parser(headerline);
752
753	parser.expectContent("Sequence");
754
755	parseCounter(true, headerCount, parser, BC_ALL, "BP;");
756	parseCounter(true, headerCount, parser, BC_A, "A;");
757	parseCounter(true, headerCount, parser, BC_C, "C;");
758	parseCounter(true, headerCount, parser, BC_G, "G;");
759	parseCounter(true, headerCount, parser, BC_T, "T;");
760	parseCounter(true, headerCount, parser, BC_OTHER, "other;");
761
762	headerCount.checkOverallCounter();
763	}
764
765	// parse sequence data
766	SequenceBuffer seqData(headerCount.getCount(BC_ALL));
767	{
768	bool eos_seen = false;
769	string line;
770
771	while (!eos_seen) {
772	expectLine(line);
773	if (beginsWith(line, "//")) {
774	eos_seen = true;
775	}
776	else {
777	string data;
778	data.reserve(60);
779	StringParser parser(line);
780
781	parser.expectSpaces(5, false);
782	int blocks = 0;
783	while (!parser.atEnd() && isalpha(parser.at())) {
784	stringCIter start = parser.pos;
785	stringCIter end = parser.find(' ');
786
787	data.append(start, end);
788	blocks++;
789	parser.expectSpaces(1);
790	}
791
792	if (blocks>6) throw "Found more than 6 parts of sequence data";
793
794	size_t basecount = (size_t)parser.extractNumber();
795
796	seqData.addLine(data);
797	size_t datasize = seqData.getBaseCounter().getCount(BC_ALL);
798
799	if (basecount != datasize) {
800	throw GBS_global_string("Got wrong base counter(found=%zu, expected=%zu)", basecount, datasize);
801	}
802	}
803	}
804	}
805
806	headerCount.expectEqual(seqData.getBaseCounter());
807	db_writer.writeSequence(seqData);
808	}
809

Note: See TracBrowser for help on using the repository browser.

Download in other formats: