Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

Importer.cxx

Visit:

Last change on this file was 18781, checked in by westram, 3 years ago
reintegrates 'help' into 'trunk' relocate help sources adds: log:branches/help@18763:18780
File size: 27.7 KB

Line
1	// ================================================================ //
2	// //
3	// File : Importer.cxx //
4	// Purpose : Genome importer core //
5	// //
6	// Coded by Ralf Westram (coder@reallysoft.de) in November 2006 //
7	// Institute of Microbiology (Technical University Munich) //
8	// http://www.arb-home.de/ //
9	// //
10	// ================================================================ //
11
12	#include "tools.h"
13	#include "DBwriter.h"
14	#include <arbdb.h>
15	#include <arb_stdstr.h>
16
17	using namespace std;
18
19	// --------------------------------------------------------------------------------
20
21	static bool is_escaped(const string& str, size_t pos) {
22	// returns true, if position 'pos' in string 'str' is escaped by '\\'
23
24	bool escaped = false;
25	if (pos != 0) { // pos 0 can't be escaped
26	if (str[pos-1] == '\\') { // is an escape before pos ?
27	escaped = !is_escaped(str, pos-1); // pos is escaped, if the escape isn't!
28	}
29	}
30	return escaped;
31	}
32
33	FeatureLine::FeatureLine(const string& line) {
34	// start parsing at position 5
35	string::size_type first_char = line.find_first_not_of(' ', 5);
36
37	orgLine = line;
38
39	if (first_char == 5) { // feature start
40	string::size_type behind_name = line.find_first_of(' ', first_char);
41	string::size_type rest_start = line.find_first_not_of(' ', behind_name);
42
43	if (rest_start == string::npos) {
44	if (behind_name == string::npos) throw "Expected space behind feature name";
45	throw "Expected some content behind feature name";
46	}
47
48	name = line.substr(first_char, behind_name-first_char);
49	rest = line.substr(rest_start);
50	type = FL_START;
51	}
52	else if (first_char >= 21) { // not feature start
53	if (first_char == 21 && line[first_char] == '/') { // qualifier start
54	string::size_type equal_pos = line.find_first_of('=', first_char);
55	if (equal_pos == string::npos) {
56	// qualifier w/o data (i.e. "/pseudo")
57	name = line.substr(first_char+1);
58	rest = "true";
59	type = FL_QUALIFIER_NODATA;
60	}
61	else {
62	name = line.substr(first_char+1, equal_pos-first_char-1);
63	rest = line.substr(equal_pos+1);
64
65	if (rest[0] == '"') {
66	size_t rlen = rest.length();
67
68	if (rlen == 1) { // special case: only one open quote behind qualifier
69	type = FL_QUALIFIER_QUOTE_OPENED;
70	}
71	else if (rest[rlen-1] == '"' && !is_escaped(rest, rlen-1)) { // closing non-escaped quote at eol
72	type = FL_QUALIFIER_QUOTED;
73	}
74	else {
75	type = FL_QUALIFIER_QUOTE_OPENED;
76	}
77	}
78	else {
79	type = FL_QUALIFIER;
80	}
81	}
82	}
83	else { // continued line
84	interpret_as_continued_line();
85	}
86	}
87	else {
88	if (first_char == string::npos) {
89	throw "Expected feature line, found empty line";
90	}
91	throw GBS_global_string("Expected feature line (first char at pos=%zu unexpected)", first_char);
92	}
93	}
94
95	void FeatureLine::interpret_as_continued_line() {
96	rest = orgLine.substr(21);
97	if (rest[rest.length()-1] == '"') {
98	type = FL_CONTINUED_QUOTE_CLOSED;
99	}
100	else {
101	type = FL_CONTINUED;
102	}
103	}
104
105	bool FeatureLine::reinterpret_as_continued_line() {
106	bool ok = false;
107
108	if (type == FL_QUALIFIER \|\| type == FL_QUALIFIER_NODATA) {
109	string::size_type first_char = orgLine.find_first_not_of(' ', 5);
110	if (first_char >= 21) {
111	interpret_as_continued_line();
112	ok = true;
113	}
114	}
115
116	return ok;
117	}
118
119	// --------------------------------------------------------------------------------
120
121	Importer::Importer(LineReader& Flatfile, DBwriter& DB_writer, const MetaTag *meta_description)
122	: db_writer(DB_writer),
123	flatfile(Flatfile),
124	tagTranslator(meta_description),
125	expectedSeqLength(-1)
126	{}
127
128	void Importer::warning(const char *msg) {
129	warnings.push_back(msg);
130	}
131
132	FeatureLinePtr Importer::getFeatureTableLine() {
133	FeatureLinePtr fline;
134
135	if (pushedFeatureLines.empty()) { // nothing on stack -> read new
136	string line;
137	if (readFeatureTableLine(line)) fline = new FeatureLine(line);
138	}
139	else {
140	fline = pushedFeatureLines.back();
141	pushedFeatureLines.pop_back();
142	}
143	return fline;
144	}
145
146	FeatureLinePtr Importer::getUnwrappedFeatureTableLine() {
147	FeatureLinePtr fline = getFeatureTableLine();
148	if (!fline.isNull()) {
149	if (fline->type & FL_META_CONTINUED) throw "Expected start of feature or qualifier";
150
151	if (0 == (fline->type & (FL_QUALIFIER_NODATA\|FL_QUALIFIER_QUOTED))) {
152	// qualifier/featurestart may be wrapped
153	FeatureLinePtr next_fline = getFeatureTableLine();
154
155	while (!next_fline.isNull() &&
156	fline->type != FL_QUALIFIER_QUOTED) // already seen closing quote
157	{
158	if ((next_fline->type&FL_META_CONTINUED) == 0) {
159	// special case: a wrapped line of a quoted qualifier may start with /xxx
160	// (in that case it is misinterpreted as qualifier start)
161	if (fline->type == FL_QUALIFIER_QUOTE_OPENED) {
162	if (!next_fline->reinterpret_as_continued_line()) {
163	throw "did not see end of quoted qualifier (instead found next qualifiert)";
164	}
165	gi_assert(next_fline->type & FL_META_CONTINUED);
166	}
167	else {
168	break;
169	}
170	}
171
172	if (next_fline->type == FL_CONTINUED_QUOTE_CLOSED) {
173	if (fline->type != FL_QUALIFIER_QUOTE_OPENED) throw "Unexpected closing quote";
174	fline->type = FL_QUALIFIER_QUOTED;
175	}
176	else {
177	gi_assert(next_fline->type == FL_CONTINUED);
178	gi_assert(fline->type == FL_START \|\| fline->type == FL_QUALIFIER \|\| fline->type == FL_QUALIFIER_QUOTE_OPENED);
179	}
180
181	fline->rest.append(next_fline->rest);
182	next_fline = getFeatureTableLine();
183	}
184
185	if (!next_fline.isNull()) backFeatureTableLine(next_fline);
186	}
187	}
188	return fline;
189	}
190
191	FeaturePtr Importer::parseFeature() {
192	FeaturePtr feature;
193	FeatureLinePtr fline = getUnwrappedFeatureTableLine();
194
195	if (!fline.isNull()) { // found a feature table line
196	if (fline->type != FL_START) throw "Expected feature start";
197
198	feature = new Feature(fline->name, fline->rest);
199
200	fline = getUnwrappedFeatureTableLine();
201	while (!fline.isNull() && (fline->type & FL_META_QUALIFIER)) {
202	feature->addQualifiedEntry(fline->name, fline->rest);
203	fline = getUnwrappedFeatureTableLine();
204	}
205	if (!fline.isNull()) backFeatureTableLine(fline);
206	}
207
208	return feature;
209	}
210
211	void Importer::parseFeatureTable() {
212	FeaturePtr feature = parseFeature();
213
214	while (!feature.isNull()) {
215	feature->expectLocationInSequence(expectedSeqLength);
216	feature->fixEmptyQualifiers();
217	db_writer.writeFeature(*feature, expectedSeqLength);
218	feature = parseFeature();
219	}
220	}
221
222	void Importer::show_warnings(const string& import_of_what) {
223	if (!warnings.empty()) {
224	const char *what = import_of_what.c_str();
225	stringVectorCRIter e = warnings.rend();
226	for (stringVectorCRIter i = warnings.rbegin(); i != e; ++i) {
227	GB_warningf("Warning: %s: %s", what, i->c_str());
228	}
229	warnings.clear();
230	}
231	}
232
233
234	void Importer::import() {
235	try {
236	string line;
237	while (flatfile.getLine(line)) {
238	if (!line.empty()) { // silently skip empty lines before or after section
239	flatfile.backLine(line);
240
241	// cleanup from import of previous section
242	gi_assert(pushedFeatureLines.empty()); // oops - somehow forgot a feature
243	pushedFeatureLines.clear();
244	warnings.clear();
245
246	expectedSeqLength = 0; // reset expected seq. length
247	import_section();
248
249	gi_assert(warnings.empty());
250	gi_assert(pushedFeatureLines.empty()); // oops - somehow forgot a feature
251	}
252	}
253	}
254	catch (const DBerror& err) { throw err.getMessage(); }
255	catch (const string& err) { throw flatfile.lineError(err); }
256	catch (const char *err) { throw flatfile.lineError(err); }
257	}
258
259	void Importer::check_base_counters(const SequenceBuffer& seqData, const BaseCounter *headerCount) {
260	const BaseCounter& baseCounter = seqData.getBaseCounter();
261	if (baseCounter.getCount(BC_ALL)<1) {
262	warning("Sequence data is empty (only metadata found).");
263	}
264	if (!headerCount) {
265	gi_assert(dynamic_cast<GenebankImporter*>(this) != NULp); // this case shall only happen with genebank files.
266	warning("No 'BASE COUNT' found. Base counts have not been validated.");
267	}
268	else {
269	headerCount->expectEqual(baseCounter);
270	}
271	}
272
273	// --------------------------------------------------------------------------------
274	// Meta information definitions
275	//
276	//
277	// [ please keep the list of common entries in
278	// ../HELP_SOURCE/source/sp_info.hlp@"Commonly used database entries"
279	// up to date! ]
280
281	static MetaTag genebank_meta_description[] = {
282	{ "LOCUS", "org_locus", MT_HEADER },
283
284	{ "REFERENCE", "", MT_REF_START },
285	{ " AUTHORS", "author", MT_REF },
286	{ " TITLE", "title", MT_REF },
287	{ " CONSRTM", "refgrp", MT_REF },
288	{ " JOURNAL", "journal", MT_REF },
289	{ " PUBMED", "pubmed_id", MT_REF },
290	{ " MEDLINE", "medline_id", MT_REF },
291	{ " REMARK", "refremark", MT_REF },
292
293	{ "DEFINITION", "definition", MT_BASIC },
294	{ "ACCESSION", "acc", MT_BASIC },
295	{ "VERSION", "version", MT_BASIC },
296	{ "DBLINK", "db_xref", MT_BASIC },
297	{ "KEYWORDS", "keywd", MT_BASIC },
298	{ "SOURCE", "full_name", MT_BASIC },
299	{ " ORGANISM", "tax", MT_BASIC },
300	{ "COMMENT", "comment", MT_BASIC },
301	{ "PROJECT", "projref", MT_BASIC },
302
303	{ "FEATURES", "", MT_FEATURE_START },
304	{ "CONTIG", "", MT_CONTIG },
305	{ "BASE", "", MT_SEQUENCE_START }, // BASE COUNT (sometimes missing)
306	{ "ORIGIN", "", MT_SEQUENCE_START }, // only used if BASE COUNT is missing
307	{ "//", "", MT_END },
308
309	{ "", "", MT_IGNORE }, // End of array
310	};
311
312	static MetaTag embl_meta_description[] = {
313	{ "ID", "org_id", MT_HEADER },
314
315	{ "RN", "", MT_REF_START },
316	{ "RA", "author", MT_REF },
317	{ "RC", "auth_comm", MT_REF },
318	{ "RG", "refgrp", MT_REF },
319	{ "RL", "journal", MT_REF },
320	{ "RP", "nuc_rp", MT_REF },
321	{ "RT", "title", MT_REF },
322	{ "RX", "", MT_REF_DBID }, // @@@ extract field 'pubmed_id' ?
323
324	{ "AC", "acc", MT_BASIC },
325	{ "AH", "assembly_header", MT_BASIC },
326	{ "AS", "assembly_info", MT_BASIC },
327	{ "CC", "comment", MT_BASIC },
328	{ "CO", "contig", MT_BASIC },
329	{ "DE", "description", MT_BASIC },
330	{ "DR", "db_xref", MT_BASIC },
331	{ "DT", "date", MT_BASIC },
332	{ "SV", "version", MT_BASIC },
333	{ "KW", "keywd", MT_BASIC },
334	{ "OS", "full_name", MT_BASIC },
335	{ "OC", "tax", MT_BASIC },
336	{ "OG", "organelle", MT_BASIC },
337	{ "PR", "projref", MT_BASIC },
338
339	{ "FH", "", MT_FEATURE_START },
340	{ "FT", "", MT_FEATURE },
341	{ "SQ", "", MT_SEQUENCE_START },
342	{ "//", "", MT_END },
343
344	{ "XX", "", MT_IGNORE }, // spacer
345
346	{ "", "", MT_IGNORE }, // End of array
347	};
348
349	// --------------------------------------------------------------------------------
350
351
352	GenebankImporter::GenebankImporter(LineReader& Flatfile, DBwriter& DB_writer)
353	: Importer(Flatfile, DB_writer, genebank_meta_description)
354	{}
355
356	bool GenebankImporter::readFeatureTableLine(string& line) {
357	if (flatfile.getLine(line)) {
358	if (beginsWith(line, " ")) {
359	return true;
360	}
361	flatfile.backLine(line);
362	}
363	return false;
364	}
365
366	static bool splitGenebankTag(const string& line, string& tag, string& content) {
367	// split a line into tag (incl. preceding spaces) and content
368	// returns true, if line suffices the format requirements (currently never returns false!)
369	// Note: returns tag="" at wrapped lines
370
371	string::size_type first_non_space = line.find_first_not_of(' ');
372
373	if (first_non_space >= 12 \|\| // no tag, only content
374	(first_non_space == string::npos && line.length() == 12)) { // same with empty content
375	tag = "";
376	content = line.substr(12);
377	return true;
378	}
379
380	gi_assert(first_non_space<12);
381
382	string::size_type behind_tag = line.find_first_of(' ', first_non_space);
383	if (behind_tag == string::npos) { // only tag w/o spaces behind
384	tag = line;
385	content = "";
386	return true;
387	}
388
389	string::size_type content_start = line.find_first_not_of(' ', behind_tag);
390	if (content_start == string::npos) { // line w/o content
391	content = "";
392	}
393	else {
394	content = line.substr(content_start);
395	}
396
397	tag = line.substr(0, behind_tag);
398	return true;
399	}
400
401	static long scanSeqlenFromLOCUS(const string& locusContent) {
402	StringParser parser(locusContent);
403	parser.extractWord(); // id
404	parser.eatSpaces();
405
406	long bp = parser.extractNumber();
407	parser.eatSpaces();
408	parser.expectContent("bp");
409
410	return bp;
411	}
412
413	void GenebankImporter::import_section() {
414	MetaInfo meta;
415	References refs;
416
417	const MetaTag *prevTag = NULp; // previously handled tag
418	string prevContent; // previously found content
419
420	bool seenHeaderLine = false;
421	bool EOS = false; // end of section ?
422
423	// read header of file
424	while (!EOS) {
425	string line, tag, content;
426	expectLine(line);
427	if (!splitGenebankTag(line, tag, content)) {
428	gi_assert(0);
429	}
430
431	if (tag.empty()) { // no tag - happens at wrapped lines
432	if (!content.empty()) { // do not append empty 'lines' from wrapped tag-entries.
433	prevContent.append(1, ' ');
434	prevContent.append(content);
435	}
436	}
437	else { // start of new tag
438	const MetaTag *knownTag = findTag(tag);
439	if (!knownTag) throw GBS_global_string("Invalid tag '%s'", tag.c_str());
440
441	if (prevTag) { // save previous tag
442	switch (prevTag->type) {
443	case MT_REF: refs.add(prevTag->field, prevContent); break;
444	case MT_BASIC: meta.add(prevTag, prevContent, true); break;
445	case MT_HEADER:
446	meta.add(prevTag, prevContent, true); // save header line
447	expectedSeqLength = scanSeqlenFromLOCUS(prevContent);
448	break;
449	case MT_REF_DBID: // embl only
450	default: gi_assert(0); break;
451	}
452	prevTag = NULp;
453	}
454
455	switch (knownTag->type) {
456	case MT_HEADER:
457	if (seenHeaderLine) throw GBS_global_string("Multiple occurrences of tag '%s'", tag.c_str());
458	seenHeaderLine = true;
459	// fall-through
460	case MT_BASIC:
461	case MT_REF:
462	prevTag = knownTag;
463	prevContent = content;
464	break;
465
466	case MT_REF_START:
467	refs.start(); // start a new reference
468	break;
469
470	case MT_FEATURE_START:
471	db_writer.createOrganism(flatfile.getFilename(), "NCBI");
472	parseFeatureTable();
473	break;
474
475	case MT_SEQUENCE_START:
476	parseSequence(knownTag->tag, content);
477	EOS = true; // end of section
478	break;
479
480	case MT_IGNORE:
481	break;
482
483	case MT_END:
484	EOS = true;
485	break;
486
487	case MT_CONTIG:
488	throw GBS_global_string("Cannot import files containing CONTIG");
489
490	case MT_REF_DBID: // embl only
491	default:
492	gi_assert(0);
493	throw GBS_global_string("Tag '%s' not expected here", knownTag->tag.c_str());
494	}
495	}
496	}
497
498	db_writer.finalizeOrganism(meta, refs, *this);
499	show_warnings(meta.getAccessionNumber());
500	}
501
502	// --------------------------------------------------------------------------------
503
504
505	EmblImporter::EmblImporter(LineReader& Flatfile, DBwriter& DB_writer)
506	: Importer(Flatfile, DB_writer, embl_meta_description)
507	{}
508
509	static bool splitEmblTag(const string& line, string& tag, string& content) {
510	// split a line into 2-character tag and content
511	// return true on success (i.e. if line suffices the required format)
512
513	if (line.length() == 2) {
514	tag = line;
515	content = "";
516	}
517	else {
518	string::size_type spacer = line.find(" "); // separator between tag and content
519	if (spacer != 2) return false; // expect spacer at pos 2-4
520
521	tag = line.substr(0, 2);
522	content = line.substr(5);
523	}
524
525	return true;
526	}
527
528	bool EmblImporter::readFeatureTableLine(string& line) {
529	if (flatfile.getLine(line)) {
530	if (beginsWith(line, "FT ")) {
531	return true;
532	}
533	flatfile.backLine(line);
534	}
535	return false;
536	}
537
538	static long scanSeqlenFromID(const string& idContent) {
539	StringParser parser(idContent);
540	string lastWord = parser.extractWord(); // eat id
541	bool bpseen = false;
542	long bp = -1;
543
544	while (!bpseen) {
545	parser.eatSpaces();
546	string word = parser.extractWord();
547	if (word == "BP.") {
548	// basecount is in word before "BP."
549	bp = atol(lastWord.c_str());
550	bpseen = true;
551	}
552	else {
553	lastWord = word;
554	}
555	}
556
557	if (bp == -1) throw "Could not parse bp from header";
558
559	return bp;
560	}
561
562	void EmblImporter::import_section() {
563	MetaInfo meta;
564	References refs;
565
566	const MetaTag *prevTag = NULp; // previously handled tag
567	string prevContent; // previously found content
568	bool prevAppendNL = false; // append '\n' into multiline tags
569
570	bool seenHeaderLine = false;
571	bool EOS = false; // end of section ?
572
573	// read header of file
574	while (!EOS) {
575	string line, tag, content;
576	expectLine(line);
577	if (!splitEmblTag(line, tag, content)) {
578	throw "Expected two-character tag at start of line";
579	}
580
581	const MetaTag *knownTag = findTag(tag);
582	if (!knownTag) throw GBS_global_string("Invalid tag '%s'", tag.c_str());
583
584	if (knownTag == prevTag) { // multiline tag
585	if (prevAppendNL) prevContent.append("\n"); // append a newline to make parsing in add_dbid() more easy
586	prevContent.append(content); // append w/o space - EMBL flatfiles have spaces at EOL when needed
587	}
588	else { // start of new tag
589	if (prevTag) { // save previous tag
590	switch (prevTag->type) {
591	case MT_REF: refs.add(prevTag->field, prevContent); break;
592	case MT_REF_DBID: refs.add_dbid(prevContent); prevAppendNL = false; break;
593	case MT_BASIC: meta.add(prevTag, prevContent, true); break;
594	case MT_HEADER:
595	meta.add(prevTag, prevContent, true);
596	expectedSeqLength = scanSeqlenFromID(prevContent);
597	break;
598	default: gi_assert(0); break;
599	}
600	prevTag = NULp;
601	}
602
603	switch (knownTag->type) {
604	case MT_HEADER:
605	if (seenHeaderLine) throw GBS_global_string("Multiple occurrences of tag '%s'", tag.c_str());
606	seenHeaderLine = true;
607	// fall-through
608	case MT_BASIC:
609	case MT_REF:
610	prevTag = knownTag;
611	prevContent = content;
612	break;
613
614	case MT_REF_DBID:
615	prevTag = knownTag;
616	prevContent = content;
617	prevAppendNL = true;
618	break;
619
620	case MT_REF_START:
621	refs.start(); // start a new reference
622	break;
623
624	case MT_FEATURE:
625	flatfile.backLine(line);
626	db_writer.createOrganism(flatfile.getFilename(), "EMBL");
627	parseFeatureTable();
628	break;
629
630	case MT_SEQUENCE_START:
631	parseSequence(content);
632	EOS = true; // end of section
633	break;
634
635	case MT_FEATURE_START:
636	case MT_IGNORE:
637	break;
638
639	default:
640	gi_assert(0);
641	throw GBS_global_string("Tag '%s' not expected here", knownTag->tag.c_str());
642	}
643	}
644	}
645	db_writer.finalizeOrganism(meta, refs, *this);
646	show_warnings(meta.getAccessionNumber());
647	}
648
649	// --------------------------------------------------------------------------------
650	// sequence readers:
651
652	inline bool parseCounter(bool expect, BaseCounter& headerCount, StringParser& parser, Base base, const char *word) {
653	// parses part of string (e.g. " 6021225 BP;" or " 878196 A;")
654	// if 'expect' == true -> throw exception if missing
655	// if 'expect' == false -> return false if missing
656
657	bool found = false;
658	stringCIter start = parser.getPosition();
659
660	parser.expectSpaces(0);
661
662	bool seen_number;
663	long count = parser.eatNumber(seen_number);
664
665	if (seen_number) {
666	headerCount.addCount(base, count);
667	size_t spaces = parser.eatSpaces();
668	if (spaces>0) {
669	size_t len = parser.lookingAt(word);
670	if (len>0) { // seen
671	parser.advance(len);
672	found = true;
673	}
674	}
675	}
676
677	if (!found) {
678	parser.setPosition(start); // reset position
679	if (expect) throw GBS_global_string("Expected counter '### %s', found '%s'", word, parser.rest().c_str());
680	}
681	return found;
682	}
683
684	void GenebankImporter::parseSequence(const string& tag, const string& headerline) {
685	SmartPtr<BaseCounter> headerCount;
686
687	if (tag == "BASE") { // base count not always present
688	// parse headerline :
689	headerCount = new BaseCounter("sequence header");
690	{
691	StringParser parser(headerline);
692
693	parser.expectContent("COUNT");
694
695	parseCounter(true, *headerCount, parser, BC_A, "a");
696	parseCounter(true, *headerCount, parser, BC_C, "c");
697	parseCounter(true, *headerCount, parser, BC_G, "g");
698	parseCounter(true, *headerCount, parser, BC_T, "t");
699	parseCounter(false, *headerCount, parser, BC_OTHER, "others"); // not always present
700
701	headerCount->calcOverallCounter();
702	}
703	}
704
705	// parse sequence data
706	size_t est_seq_size = headerCount.isNull() ? 500000 : headerCount->getCount(BC_ALL);
707	SequenceBuffer seqData(est_seq_size);
708	{
709	string line;
710
711	if (!headerCount.isNull()) {
712	// if BASE COUNT was present, check ORIGIN line
713	// otherwise ORIGIN line has already been read
714	expectLine(line);
715	if (!beginsWith(line, "ORIGIN")) throw "Expected 'ORIGIN'";
716	}
717
718	bool eos_seen = false;
719	while (!eos_seen) {
720	expectLine(line);
721	if (beginsWith(line, "//")) {
722	eos_seen = true;
723	}
724	else {
725	string data;
726	data.reserve(60);
727	StringParser parser(line);
728
729	parser.eatSpaces(); // not sure whether there really have to be spaces if number has 9 digits or more
730	size_t cur_pos = (size_t)parser.extractNumber();
731	size_t datasize = seqData.getBaseCounter().getCount(BC_ALL);
732
733	if (cur_pos != (datasize+1)) {
734	throw GBS_global_string("Got wrong base position (found=%zu, expected=%zu)", cur_pos, size_t(datasize+1));
735	}
736
737	int blocks = 0;
738	while (!parser.atEnd() && parser.at() == ' ') {
739	parser.expectSpaces(1);
740
741	stringCIter start = parser.pos;
742	stringCIter end = parser.find(' ');
743
744	data.append(start, end);
745	blocks++;
746	}
747
748	if (blocks>6) throw "Found more than 6 parts of sequence data";
749	seqData.addLine(data);
750	}
751	}
752	}
753
754	check_base_counters(seqData, headerCount.content());
755	db_writer.writeSequence(seqData);
756	}
757
758	void EmblImporter::parseSequence(const string& headerline) {
759	// parse headerline:
760	BaseCounter headerCount("sequence header");
761	{
762	StringParser parser(headerline);
763
764	parser.expectContent("Sequence");
765
766	parseCounter(true, headerCount, parser, BC_ALL, "BP;");
767	parseCounter(true, headerCount, parser, BC_A, "A;");
768	parseCounter(true, headerCount, parser, BC_C, "C;");
769	parseCounter(true, headerCount, parser, BC_G, "G;");
770	parseCounter(true, headerCount, parser, BC_T, "T;");
771	parseCounter(true, headerCount, parser, BC_OTHER, "other;");
772
773	headerCount.checkOverallCounter();
774	}
775
776	// parse sequence data
777	SequenceBuffer seqData(headerCount.getCount(BC_ALL));
778	{
779	bool eos_seen = false;
780	string line;
781
782	while (!eos_seen) {
783	expectLine(line);
784	if (beginsWith(line, "//")) {
785	eos_seen = true;
786	}
787	else {
788	string data;
789	data.reserve(60);
790	StringParser parser(line);
791
792	parser.expectSpaces(5, false);
793	int blocks = 0;
794	while (!parser.atEnd() && isalpha(parser.at())) {
795	stringCIter start = parser.pos;
796	stringCIter end = parser.find(' ');
797
798	data.append(start, end);
799	blocks++;
800	parser.expectSpaces(1);
801	}
802
803	if (blocks>6) throw "Found more than 6 parts of sequence data";
804
805	size_t basecount = (size_t)parser.extractNumber();
806
807	seqData.addLine(data);
808	size_t datasize = seqData.getBaseCounter().getCount(BC_ALL);
809
810	if (basecount != datasize) {
811	throw GBS_global_string("Got wrong base counter(found=%zu, expected=%zu)", basecount, datasize);
812	}
813	}
814	}
815	}
816
817	check_base_counters(seqData, &headerCount);
818	db_writer.writeSequence(seqData);
819	}
820

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: branches/help/GENOM_IMPORT/Importer.cxx

Download in other formats: