Context Navigation

source: branches/profile/GENOM_IMPORT/Importer.cxx

Visit:

Last change on this file was 11844, checked in by westram, 10 years ago
merge [11840:11843] from 'customize' into 'trunk' adds: log:branches/customize@11841:11843 skipped changes in ARB_GDE log:branches/customize/ARB_GDE@11841:11843 ~~(might fail to reintegrate)~~
File size: 27.0 KB

Line
1	// ================================================================ //
2	// //
3	// File : Importer.cxx //
4	// Purpose : Genome importer core //
5	// //
6	// Coded by Ralf Westram (coder@reallysoft.de) in November 2006 //
7	// Institute of Microbiology (Technical University Munich) //
8	// http://www.arb-home.de/ //
9	// //
10	// ================================================================ //
11
12	#include "tools.h"
13	#include "DBwriter.h"
14	#include <arbdb.h>
15	#include <arb_stdstr.h>
16
17	using namespace std;
18
19	// --------------------------------------------------------------------------------
20
static bool is_escaped(const std::string& str, size_t pos) {
    // Returns true if the character at 'pos' in 'str' is escaped by '\\'.
    //
    // A character is escaped when it is preceded by an odd number of
    // consecutive backslashes (an even number means the backslashes
    // escape each other). Position 0 can never be escaped.

    size_t backslashes = 0;
    while (backslashes < pos && str[pos-1-backslashes] == '\\') {
        ++backslashes;
    }
    return (backslashes % 2) == 1;
}
32
33	FeatureLine::FeatureLine(const string& line) {
34	// start parsing at position 5
35	string::size_type first_char = line.find_first_not_of(' ', 5);
36
37	orgLine = line;
38
39	if (first_char == 5) { // feature start
40	string::size_type behind_name = line.find_first_of(' ', first_char);
41	string::size_type rest_start = line.find_first_not_of(' ', behind_name);
42
43	if (rest_start == string::npos) {
44	if (behind_name == string::npos) throw "Expected space behind feature name";
45	throw "Expected some content behind feature name";
46	}
47
48	name = line.substr(first_char, behind_name-first_char);
49	rest = line.substr(rest_start);
50	type = FL_START;
51	}
52	else if (first_char >= 21) { // not feature start
53	if (first_char == 21 && line[first_char] == '/') { // qualifier start
54	string::size_type equal_pos = line.find_first_of('=', first_char);
55	if (equal_pos == string::npos) {
56	// qualifier w/o data (i.e. "/pseudo")
57	name = line.substr(first_char+1);
58	rest = "true";
59	type = FL_QUALIFIER_NODATA;
60	}
61	else {
62	name = line.substr(first_char+1, equal_pos-first_char-1);
63	rest = line.substr(equal_pos+1);
64
65	if (rest[0] == '"') {
66	size_t rlen = rest.length();
67
68	if (rlen == 1) { // special case: only one open quote behind qualifier
69	type = FL_QUALIFIER_QUOTE_OPENED;
70	}
71	else if (rest[rlen-1] == '"' && !is_escaped(rest, rlen-1)) { // closing non-escaped quote at eol
72	type = FL_QUALIFIER_QUOTED;
73	}
74	else {
75	type = FL_QUALIFIER_QUOTE_OPENED;
76	}
77	}
78	else {
79	type = FL_QUALIFIER;
80	}
81	}
82	}
83	else { // continued line
84	interpret_as_continued_line();
85	}
86	}
87	else {
88	if (first_char == string::npos) {
89	throw "Expected feature line, found empty line";
90	}
91	throw GBS_global_string("Expected feature line (first char at pos=%zu unexpected)", first_char);
92	}
93	}
94
95	void FeatureLine::interpret_as_continued_line() {
96	rest = orgLine.substr(21);
97	if (rest[rest.length()-1] == '"') {
98	type = FL_CONTINUED_QUOTE_CLOSED;
99	}
100	else {
101	type = FL_CONTINUED;
102	}
103	}
104
105	bool FeatureLine::reinterpret_as_continued_line() {
106	bool ok = false;
107
108	if (type == FL_QUALIFIER \|\| type == FL_QUALIFIER_NODATA) {
109	string::size_type first_char = orgLine.find_first_not_of(' ', 5);
110	if (first_char >= 21) {
111	interpret_as_continued_line();
112	ok = true;
113	}
114	}
115
116	return ok;
117	}
118
119	// --------------------------------------------------------------------------------
120
121	Importer::Importer(LineReader& Flatfile, DBwriter& DB_writer, const MetaTag *meta_description)
122	: db_writer(DB_writer),
123	flatfile(Flatfile),
124	tagTranslator(meta_description),
125	expectedSeqLength(-1)
126	{}
127
128	void Importer::warning(const char *msg) {
129	warnings.push_back(msg);
130	}
131
132	FeatureLinePtr Importer::getFeatureTableLine() {
133	FeatureLinePtr fline;
134
135	if (pushedFeatureLines.empty()) { // nothing on stack -> read new
136	string line;
137	if (readFeatureTableLine(line)) fline = new FeatureLine(line);
138	}
139	else {
140	fline = pushedFeatureLines.back();
141	pushedFeatureLines.pop_back();
142	}
143	return fline;
144	}
145
146	FeatureLinePtr Importer::getUnwrappedFeatureTableLine() {
147	FeatureLinePtr fline = getFeatureTableLine();
148	if (!fline.isNull()) {
149	if (fline->type & FL_META_CONTINUED) throw "Expected start of feature or qualifier";
150
151	if (0 == (fline->type & (FL_QUALIFIER_NODATA\|FL_QUALIFIER_QUOTED))) {
152	// qualifier/featurestart may be wrapped
153	FeatureLinePtr next_fline = getFeatureTableLine();
154
155	while (!next_fline.isNull() &&
156	fline->type != FL_QUALIFIER_QUOTED) // already seen closing quote
157	{
158	if ((next_fline->type&FL_META_CONTINUED) == 0) {
159	// special case: a wrapped line of a quoted qualifier may start with /xxx
160	// (in that case it is misinterpreted as qualifier start)
161	if (fline->type == FL_QUALIFIER_QUOTE_OPENED) {
162	if (!next_fline->reinterpret_as_continued_line()) {
163	throw "did not see end of quoted qualifier (instead found next qualifiert)";
164	}
165	gi_assert(next_fline->type & FL_META_CONTINUED);
166	}
167	else {
168	break;
169	}
170	}
171
172	if (next_fline->type == FL_CONTINUED_QUOTE_CLOSED) {
173	if (fline->type != FL_QUALIFIER_QUOTE_OPENED) throw "Unexpected closing quote";
174	fline->type = FL_QUALIFIER_QUOTED;
175	}
176	else {
177	gi_assert(next_fline->type == FL_CONTINUED);
178	gi_assert(fline->type == FL_START \|\| fline->type == FL_QUALIFIER \|\| fline->type == FL_QUALIFIER_QUOTE_OPENED);
179	}
180
181	fline->rest.append(next_fline->rest);
182	next_fline = getFeatureTableLine();
183	}
184
185	if (!next_fline.isNull()) backFeatureTableLine(next_fline);
186	}
187	}
188	return fline;
189	}
190
191	FeaturePtr Importer::parseFeature() {
192	FeaturePtr feature;
193	FeatureLinePtr fline = getUnwrappedFeatureTableLine();
194
195	if (!fline.isNull()) { // found a feature table line
196	if (fline->type != FL_START) throw "Expected feature start";
197
198	feature = new Feature(fline->name, fline->rest);
199
200	fline = getUnwrappedFeatureTableLine();
201	while (!fline.isNull() && (fline->type & FL_META_QUALIFIER)) {
202	feature->addQualifiedEntry(fline->name, fline->rest);
203	fline = getUnwrappedFeatureTableLine();
204	}
205	if (!fline.isNull()) backFeatureTableLine(fline);
206	}
207
208	return feature;
209	}
210
211	void Importer::parseFeatureTable() {
212	FeaturePtr feature = parseFeature();
213
214	while (!feature.isNull()) {
215	feature->expectLocationInSequence(expectedSeqLength);
216	feature->fixEmptyQualifiers();
217	db_writer.writeFeature(*feature, expectedSeqLength);
218	feature = parseFeature();
219	}
220	}
221
222	void Importer::show_warnings(const string& import_of_what) {
223	if (!warnings.empty()) {
224	const char *what = import_of_what.c_str();
225	stringVectorCRIter e = warnings.rend();
226	for (stringVectorCRIter i = warnings.rbegin(); i != e; ++i) {
227	GB_warningf("Warning: %s: %s", what, i->c_str());
228	}
229	warnings.clear();
230	}
231	}
232
233
234	void Importer::import() {
235	try {
236	string line;
237	while (flatfile.getLine(line)) {
238	if (!line.empty()) { // silently skip empty lines before or after section
239	flatfile.backLine(line);
240
241	// cleanup from import of previous section
242	gi_assert(pushedFeatureLines.empty()); // oops - somehow forgot a feature
243	pushedFeatureLines.clear();
244	warnings.clear();
245
246	expectedSeqLength = 0; // reset expected seq. length
247	import_section();
248
249	gi_assert(warnings.empty());
250	gi_assert(pushedFeatureLines.empty()); // oops - somehow forgot a feature
251	}
252	}
253	}
254	catch (const DBerror& err) { throw err.getMessage(); }
255	catch (const string& err) { throw flatfile.lineError(err); }
256	catch (const char *err) { throw flatfile.lineError(err); }
257	}
258
259	// --------------------------------------------------------------------------------
260	// Meta information definitions
261	//
262	//
263	// [ please keep the list of common entries in
264	// ../HELP_SOURCE/oldhelp/sp_info.hlp
265	// up to date! ]
266
267	static MetaTag genebank_meta_description[] = {
268	{ "LOCUS", "org_locus", MT_HEADER },
269
270	{ "REFERENCE", "", MT_REF_START },
271	{ " AUTHORS", "author", MT_REF },
272	{ " TITLE", "title", MT_REF },
273	{ " CONSRTM", "refgrp", MT_REF },
274	{ " JOURNAL", "journal", MT_REF },
275	{ " PUBMED", "pubmed_id", MT_REF },
276	{ " MEDLINE", "medline_id", MT_REF },
277	{ " REMARK", "refremark", MT_REF },
278
279	{ "DEFINITION", "definition", MT_BASIC },
280	{ "ACCESSION", "acc", MT_BASIC },
281	{ "VERSION", "version", MT_BASIC },
282	{ "KEYWORDS", "keywd", MT_BASIC },
283	{ "SOURCE", "full_name", MT_BASIC },
284	{ " ORGANISM", "tax", MT_BASIC },
285	{ "COMMENT", "comment", MT_BASIC },
286	{ "PROJECT", "projref", MT_BASIC },
287
288	{ "FEATURES", "", MT_FEATURE_START },
289	{ "CONTIG", "", MT_CONTIG },
290	{ "BASE", "", MT_SEQUENCE_START }, // BASE COUNT (sometimes missing)
291	{ "ORIGIN", "", MT_SEQUENCE_START }, // only used if BASE COUNT is missing
292	{ "//", "", MT_END },
293
294	{ "", "", MT_IGNORE }, // End of array
295	};
296
297	static MetaTag embl_meta_description[] = {
298	{ "ID", "org_id", MT_HEADER },
299
300	{ "RN", "", MT_REF_START },
301	{ "RA", "author", MT_REF },
302	{ "RC", "auth_comm", MT_REF },
303	{ "RG", "refgrp", MT_REF },
304	{ "RL", "journal", MT_REF },
305	{ "RP", "nuc_rp", MT_REF },
306	{ "RT", "title", MT_REF },
307	{ "RX", "", MT_REF_DBID }, // @@@ extract field 'pubmed_id' ?
308
309	{ "AC", "acc", MT_BASIC },
310	{ "AH", "assembly_header", MT_BASIC },
311	{ "AS", "assembly_info", MT_BASIC },
312	{ "CC", "comment", MT_BASIC },
313	{ "CO", "contig", MT_BASIC },
314	{ "DE", "description", MT_BASIC },
315	{ "DR", "db_xref", MT_BASIC },
316	{ "DT", "date", MT_BASIC },
317	{ "SV", "version", MT_BASIC },
318	{ "KW", "keywd", MT_BASIC },
319	{ "OS", "full_name", MT_BASIC },
320	{ "OC", "tax", MT_BASIC },
321	{ "OG", "organelle", MT_BASIC },
322	{ "PR", "projref", MT_BASIC },
323
324	{ "FH", "", MT_FEATURE_START },
325	{ "FT", "", MT_FEATURE },
326	{ "SQ", "", MT_SEQUENCE_START },
327	{ "//", "", MT_END },
328
329	{ "XX", "", MT_IGNORE }, // spacer
330
331	{ "", "", MT_IGNORE }, // End of array
332	};
333
334	// --------------------------------------------------------------------------------
335
336
337	GenebankImporter::GenebankImporter(LineReader& Flatfile, DBwriter& DB_writer)
338	: Importer(Flatfile, DB_writer, genebank_meta_description)
339	{}
340
341	bool GenebankImporter::readFeatureTableLine(string& line) {
342	if (flatfile.getLine(line)) {
343	if (beginsWith(line, " ")) {
344	return true;
345	}
346	flatfile.backLine(line);
347	}
348	return false;
349	}
350
static bool splitGenebankTag(const std::string& line, std::string& tag, std::string& content) {
    // Splits 'line' into 'tag' (incl. preceding spaces) and 'content'.
    // Returns true if the line suffices the format requirements
    // ('tag' and 'content' are left untouched otherwise).
    //
    // Note: delivers tag="" for wrapped lines (content starts at column 12).

    typedef std::string::size_type pos_t;

    const pos_t CONTENT_COLUMN = 12;
    const pos_t text_start     = line.find_first_not_of(' ');
    const bool  only_spaces    = text_start == std::string::npos;

    // no tag, only content (or exactly 12 spaces = empty content)
    if (text_start == CONTENT_COLUMN || (only_spaces && line.length() == CONTENT_COLUMN)) {
        tag     = "";
        content = line.substr(CONTENT_COLUMN);
        return true;
    }

    // text starts behind the content column (npos, i.e. a blank line, ends up here as well)
    if (text_start > CONTENT_COLUMN) return false;

    const pos_t tag_end = line.find(' ', text_start);
    if (tag_end == std::string::npos) { // only tag w/o spaces behind
        tag     = line;
        content = "";
        return true;
    }

    const pos_t content_start = line.find_first_not_of(' ', tag_end);
    if (content_start != std::string::npos) {
        content = line.substr(content_start);
    }
    else { // line w/o content
        content = "";
    }

    tag = line.substr(0, tag_end);
    return true;
}
385
386	static long scanSeqlenFromLOCUS(const string& locusContent) {
387	StringParser parser(locusContent);
388	parser.extractWord(); // id
389	parser.eatSpaces();
390
391	long bp = parser.extractNumber();
392	parser.eatSpaces();
393	parser.expectContent("bp");
394
395	return bp;
396	}
397
398	void GenebankImporter::import_section() {
399	MetaInfo meta;
400	References refs;
401
402	const MetaTag *prevTag = 0; // previously handled tag
403	string prevContent; // previously found content
404
405	bool seenHeaderLine = false;
406	bool EOS = false; // end of section ?
407
408	// read header of file
409	while (!EOS) {
410	string line, tag, content;
411	expectLine(line);
412	if (!splitGenebankTag(line, tag, content)) {
413	gi_assert(0);
414	}
415
416	if (tag.empty()) { // no tag - happens at wrapped lines
417	prevContent.append(1, ' ');
418	prevContent.append(content);
419	}
420	else { // start of new tag
421	const MetaTag *knownTag = findTag(tag);
422	if (!knownTag) throw GBS_global_string("Invalid tag '%s'", tag.c_str());
423
424	if (prevTag) { // save previous tag
425	switch (prevTag->type) {
426	case MT_REF: refs.add(prevTag->field, prevContent); break;
427	case MT_BASIC: meta.add(prevTag, prevContent, true); break;
428	case MT_HEADER:
429	meta.add(prevTag, prevContent, true); // save header line
430	expectedSeqLength = scanSeqlenFromLOCUS(prevContent);
431	break;
432	case MT_REF_DBID: // embl only
433	default: gi_assert(0); break;
434	}
435	prevTag = 0;
436	}
437
438	switch (knownTag->type) {
439	case MT_HEADER:
440	if (seenHeaderLine) throw GBS_global_string("Multiple occurrences of tag '%s'", tag.c_str());
441	seenHeaderLine = true;
442	// fall-through
443	case MT_BASIC:
444	case MT_REF:
445	prevTag = knownTag;
446	prevContent = content;
447	break;
448
449	case MT_REF_START:
450	refs.start(); // start a new reference
451	break;
452
453	case MT_FEATURE_START:
454	db_writer.createOrganism(flatfile.getFilename(), "NCBI");
455	parseFeatureTable();
456	break;
457
458	case MT_SEQUENCE_START:
459	parseSequence(knownTag->tag, content);
460	EOS = true; // end of section
461	break;
462
463	case MT_IGNORE:
464	break;
465
466	case MT_END:
467	EOS = true;
468	break;
469
470	case MT_CONTIG:
471	throw GBS_global_string("Cannot import files containing CONTIG");
472
473	case MT_REF_DBID: // embl only
474	default:
475	gi_assert(0);
476	throw GBS_global_string("Tag '%s' not expected here", knownTag->tag.c_str());
477	}
478	}
479	}
480
481	db_writer.finalizeOrganism(meta, refs, *this);
482	show_warnings(meta.getAccessionNumber());
483	}
484
485	// --------------------------------------------------------------------------------
486
487
488	EmblImporter::EmblImporter(LineReader& Flatfile, DBwriter& DB_writer)
489	: Importer(Flatfile, DB_writer, embl_meta_description)
490	{}
491
static bool splitEmblTag(const std::string& line, std::string& tag, std::string& content) {
    // Splits 'line' into a 2-character tag and its content.
    //
    // Accepted layouts:
    //   "TT"            -> tag only, content = ""
    //   "TT   content"  -> tag at pos 0-1, three spaces at pos 2-4, content from pos 5
    //
    // Returns true on success (i.e. if line suffices the required format);
    // 'tag' and 'content' are left untouched otherwise.

    if (line.length() == 2) {
        tag     = line;
        content = "";
        return true;
    }

    // The separator between tag and content consists of three spaces.
    // Searching for the complete separator also guarantees that the line has
    // at least 5 characters, i.e. substr(5) below cannot throw.
    const std::string::size_type spacer = line.find("   ");
    if (spacer != 2) return false; // expect spacer at pos 2-4

    tag     = line.substr(0, 2);
    content = line.substr(5);
    return true;
}
510
511	bool EmblImporter::readFeatureTableLine(string& line) {
512	if (flatfile.getLine(line)) {
513	if (beginsWith(line, "FT ")) {
514	return true;
515	}
516	flatfile.backLine(line);
517	}
518	return false;
519	}
520
521	static long scanSeqlenFromID(const string& idContent) {
522	StringParser parser(idContent);
523	string lastWord = parser.extractWord(); // eat id
524	bool bpseen = false;
525	long bp = -1;
526
527	while (!bpseen) {
528	parser.eatSpaces();
529	string word = parser.extractWord();
530	if (word == "BP.") {
531	// basecount is in word before "BP."
532	bp = atol(lastWord.c_str());
533	bpseen = true;
534	}
535	else {
536	lastWord = word;
537	}
538	}
539
540	if (bp == -1) throw "Could not parse bp from header";
541
542	return bp;
543	}
544
545	void EmblImporter::import_section() {
546	MetaInfo meta;
547	References refs;
548
549	const MetaTag *prevTag = 0; // previously handled tag
550	string prevContent; // previously found content
551	bool prevAppendNL = false; // append '\n' into multiline tags
552
553	bool seenHeaderLine = false;
554	bool EOS = false; // end of section ?
555
556	// read header of file
557	while (!EOS) {
558	string line, tag, content;
559	expectLine(line);
560	if (!splitEmblTag(line, tag, content)) {
561	throw "Expected two-character tag at start of line";
562	}
563
564	const MetaTag *knownTag = findTag(tag);
565	if (!knownTag) throw GBS_global_string("Invalid tag '%s'", tag.c_str());
566
567	if (knownTag == prevTag) { // multiline tag
568	if (prevAppendNL) prevContent.append("\n"); // append a newline to make parsing in add_dbid() more easy
569	prevContent.append(content); // append w/o space - EMBL flatfiles have spaces at EOL when needed
570	}
571	else { // start of new tag
572	if (prevTag) { // save previous tag
573	switch (prevTag->type) {
574	case MT_REF: refs.add(prevTag->field, prevContent); break;
575	case MT_REF_DBID: refs.add_dbid(prevContent); prevAppendNL = false; break;
576	case MT_BASIC: meta.add(prevTag, prevContent, true); break;
577	case MT_HEADER:
578	meta.add(prevTag, prevContent, true);
579	expectedSeqLength = scanSeqlenFromID(prevContent);
580	break;
581	default: gi_assert(0); break;
582	}
583	prevTag = 0;
584	}
585
586	switch (knownTag->type) {
587	case MT_HEADER:
588	if (seenHeaderLine) throw GBS_global_string("Multiple occurrences of tag '%s'", tag.c_str());
589	seenHeaderLine = true;
590	// fall-through
591	case MT_BASIC:
592	case MT_REF:
593	prevTag = knownTag;
594	prevContent = content;
595	break;
596
597	case MT_REF_DBID:
598	prevTag = knownTag;
599	prevContent = content;
600	prevAppendNL = true;
601	break;
602
603	case MT_REF_START:
604	refs.start(); // start a new reference
605	break;
606
607	case MT_FEATURE:
608	flatfile.backLine(line);
609	db_writer.createOrganism(flatfile.getFilename(), "EMBL");
610	parseFeatureTable();
611	break;
612
613	case MT_SEQUENCE_START:
614	parseSequence(content);
615	EOS = true; // end of section
616	break;
617
618	case MT_FEATURE_START:
619	case MT_IGNORE:
620	break;
621
622	default:
623	gi_assert(0);
624	throw GBS_global_string("Tag '%s' not expected here", knownTag->tag.c_str());
625	}
626	}
627	}
628	db_writer.finalizeOrganism(meta, refs, *this);
629	show_warnings(meta.getAccessionNumber());
630	}
631
632	// --------------------------------------------------------------------------------
633	// sequence readers:
634
635	inline bool parseCounter(bool expect, BaseCounter& headerCount, StringParser& parser, Base base, const char *word) {
636	// parses part of string (e.g. " 6021225 BP;" or " 878196 A;")
637	// if 'expect' == true -> throw exception if missing
638	// if 'expect' == false -> return false if missing
639
640	bool found = false;
641	stringCIter start = parser.getPosition();
642
643	parser.expectSpaces(0);
644
645	bool seen_number;
646	long count = parser.eatNumber(seen_number);
647
648	if (seen_number) {
649	headerCount.addCount(base, count);
650	size_t spaces = parser.eatSpaces();
651	if (spaces>0) {
652	size_t len = parser.lookingAt(word);
653	if (len>0) { // seen
654	parser.advance(len);
655	found = true;
656	}
657	}
658	}
659
660	if (!found) {
661	parser.setPosition(start); // reset position
662	if (expect) throw GBS_global_string("Expected counter '### %s', found '%s'", word, parser.rest().c_str());
663	}
664	return found;
665	}
666
667	void GenebankImporter::parseSequence(const string& tag, const string& headerline) {
668	SmartPtr<BaseCounter> headerCount;
669
670	if (tag == "BASE") { // base count not always present
671	// parse headerline :
672	headerCount = new BaseCounter("sequence header");
673	{
674	StringParser parser(headerline);
675
676	parser.expectContent("COUNT");
677
678	parseCounter(true, *headerCount, parser, BC_A, "a");
679	parseCounter(true, *headerCount, parser, BC_C, "c");
680	parseCounter(true, *headerCount, parser, BC_G, "g");
681	parseCounter(true, *headerCount, parser, BC_T, "t");
682	parseCounter(false, *headerCount, parser, BC_OTHER, "others"); // not always present
683
684	headerCount->calcOverallCounter();
685	}
686	}
687
688	// parse sequence data
689	size_t est_seq_size = headerCount.isNull() ? 500000 : headerCount->getCount(BC_ALL);
690	SequenceBuffer seqData(est_seq_size);
691	{
692	string line;
693
694	if (!headerCount.isNull()) {
695	// if BASE COUNT was present, check ORIGIN line
696	// otherwise ORIGIN line has already been read
697	expectLine(line);
698	if (!beginsWith(line, "ORIGIN")) throw "Expected 'ORIGIN'";
699	}
700
701	bool eos_seen = false;
702	while (!eos_seen) {
703	expectLine(line);
704	if (beginsWith(line, "//")) {
705	eos_seen = true;
706	}
707	else {
708	string data;
709	data.reserve(60);
710	StringParser parser(line);
711
712	parser.eatSpaces(); // not sure whether there really have to be spaces if number has 9 digits or more
713	size_t cur_pos = (size_t)parser.extractNumber();
714	size_t datasize = seqData.getBaseCounter().getCount(BC_ALL);
715
716	if (cur_pos != (datasize+1)) {
717	throw GBS_global_string("Got wrong base position (found=%zu, expected=%zu)", cur_pos, size_t(datasize+1));
718	}
719
720	int blocks = 0;
721	while (!parser.atEnd() && parser.at() == ' ') {
722	parser.expectSpaces(1);
723
724	stringCIter start = parser.pos;
725	stringCIter end = parser.find(' ');
726
727	data.append(start, end);
728	blocks++;
729	}
730
731	if (blocks>6) throw "Found more than 6 parts of sequence data";
732	seqData.addLine(data);
733	}
734	}
735	}
736
737	if (headerCount.isNull()) {
738	warning("No 'BASE COUNT' found. Base counts have not been validated.");
739	}
740	else {
741	headerCount->expectEqual(seqData.getBaseCounter());
742	}
743	db_writer.writeSequence(seqData);
744	}
745
746	void EmblImporter::parseSequence(const string& headerline) {
747	// parse headerline:
748	BaseCounter headerCount("sequence header");
749	{
750	StringParser parser(headerline);
751
752	parser.expectContent("Sequence");
753
754	parseCounter(true, headerCount, parser, BC_ALL, "BP;");
755	parseCounter(true, headerCount, parser, BC_A, "A;");
756	parseCounter(true, headerCount, parser, BC_C, "C;");
757	parseCounter(true, headerCount, parser, BC_G, "G;");
758	parseCounter(true, headerCount, parser, BC_T, "T;");
759	parseCounter(true, headerCount, parser, BC_OTHER, "other;");
760
761	headerCount.checkOverallCounter();
762	}
763
764	// parse sequence data
765	SequenceBuffer seqData(headerCount.getCount(BC_ALL));
766	{
767	bool eos_seen = false;
768	string line;
769
770	while (!eos_seen) {
771	expectLine(line);
772	if (beginsWith(line, "//")) {
773	eos_seen = true;
774	}
775	else {
776	string data;
777	data.reserve(60);
778	StringParser parser(line);
779
780	parser.expectSpaces(5, false);
781	int blocks = 0;
782	while (!parser.atEnd() && isalpha(parser.at())) {
783	stringCIter start = parser.pos;
784	stringCIter end = parser.find(' ');
785
786	data.append(start, end);
787	blocks++;
788	parser.expectSpaces(1);
789	}
790
791	if (blocks>6) throw "Found more than 6 parts of sequence data";
792
793	size_t basecount = (size_t)parser.extractNumber();
794
795	seqData.addLine(data);
796	size_t datasize = seqData.getBaseCounter().getCount(BC_ALL);
797
798	if (basecount != datasize) {
799	throw GBS_global_string("Got wrong base counter(found=%zu, expected=%zu)", basecount, datasize);
800	}
801	}
802	}
803	}
804
805	headerCount.expectEqual(seqData.getBaseCounter());
806	db_writer.writeSequence(seqData);
807	}
808

Note: See TracBrowser for help on using the repository browser.

Download in other formats: