Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

arb_help2xml.cxx

Visit:

Last change on this file was 19530, checked in by westram, 6 weeks ago
Automatically remove generated files; this is required after renaming/deleting help sources (otherwise bogus errors are reported). Dead links now cause an error during development as well. Resource checker: `bugtracker.hlp` is autoused.
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 65.3 KB

Line
1	// ==================================================================== //
2	// //
3	// File : arb_help2xml.cxx //
4	// Purpose : Converts old ARB help format to XML //
5	// //
6	// Coded by Ralf Westram (coder@reallysoft.de) in October 2001 //
7	// Copyright Department of Microbiology (Technical University Munich) //
8	// //
9	// Visit our web site at: http://www.arb-home.de/ //
10	// //
11	// ==================================================================== //
12
13	#include <xml.hxx>
14	#include <arb_defs.h>
15	#include <arb_diff.h>
16	#include <static_assert.h>
17
18	#include <list>
19	#include <set>
20	#include <iostream>
21	#include <fstream>
22
23	#include <cstdlib>
24	#include <cstdarg>
25	#include <cstring>
26	#include <climits>
27
28	#include <unistd.h>
29	#include <sys/stat.h>
30
31	using namespace std;
32
// assertion used throughout this file (forwards to arb_assert)
#define h2x_assert(bed) arb_assert(bed)

// Limit the length of the TITLE/SUBTITLE of helppages.
// - TITLE has to fit into UP/SUB subwindows of arb internal help window
// - SUBTITLE has to fit into default help-textsubwindow width
#define MAX_TITLE_CHARS 42
#define MAX_SUBTITLE_CHARS 75

// Optional diagnostics: only switched on in DEBUG builds.
#if defined(DEBUG)
#define WARN_FORMATTING_PROBLEMS
#define WARN_MISSING_HELP
// #define DUMP_PARAGRAPHS
// #define PROTECT_HELP_VS_CHANGES
#endif // DEBUG


// Finer grained switches; only meaningful if formatting warnings are enabled at all.
#if defined(WARN_FORMATTING_PROBLEMS)

#define WARN_FIXED_LAYOUT_LIST_ELEMENTS
#define WARN_LONESOME_ENUM_ELEMENTS

// warnings below are useless for production and shall be disabled in SVN
// #define WARN_LONESOME_LIST_ELEMENTS
// #define WARN_POSSIBLY_WRONG_INDENTATION_CORRECTION
// #define WARN_IGNORED_ALPHA_ENUMS

#endif


// MAX_LINE_LENGTH is the size of the line buffers of class Reader (including the terminating zero byte).
#define MAX_LINE_LENGTH 200 // maximum length of lines in input stream
// TABSIZE is the tab width used when class Reader expands tabs to spaces.
#define TABSIZE 8
64
// Keywords of all known section types.
// The array is indexed by SectionType, i.e. order has to match the enum below
// (element count is verified by the STATIC_ASSERT behind the enum).
static const char *knownSections[] = {
"OCCURRENCE",
"DESCRIPTION",
"NOTES",
"EXAMPLES",
"WARNINGS",
"BUGS",
"SECTION",
};

enum SectionType {
// real section types (each has an entry in knownSections[])
SEC_OCCURRENCE,
SEC_DESCRIPTION,
SEC_NOTES,
SEC_EXAMPLES,
SEC_WARNINGS,
SEC_BUGS,
SEC_SECTION,

KNOWN_SECTION_TYPES, // == number of real section types
SEC_NONE,            // initial value used while looking up a keyword
SEC_FAKE,            // used for the TITLE pseudo-section of class Helpfile
};

STATIC_ASSERT(ARRAY_ELEMS(knownSections) == KNOWN_SECTION_TYPES);
90
91	__ATTR__VFORMAT(1) static string vstrf(const char *format, va_list argPtr) {
92	static size_t buf_size = 256;
93	static char *buffer = new char[buf_size];
94
95	size_t length;
96	while (1) {
97	if (!buffer) {
98	h2x_assert(buffer); // to stop when debugging
99	throw string("out of memory");
100	}
101
102	length = vsnprintf(buffer, buf_size, format, argPtr);
103	if (length < buf_size) break; // string fits into current buffer
104
105	// otherwise resize buffer :
106	buf_size += buf_size/2;
107	delete [] buffer;
108	buffer = new char[buf_size];
109	}
110
111	return string(buffer, length);
112	}
113
114	__ATTR__FORMAT(1) static string strf(const char *format, ...) {
115	va_list argPtr;
116	va_start(argPtr, format);
117	string result = vstrf(format, argPtr);
118	va_end(argPtr);
119
120	return result;
121	}
122
123	// -----------------------------
124	// warnings and errors
125
// A message text together with the number of the input line it refers to.
class LineAttachedMessage {
    string message; // the text
    size_t lineno;  // line the text belongs to

public:
    LineAttachedMessage(const string& message_, size_t lineno_)
        : message(message_), lineno(lineno_)
    {
    }

    const string& Message() const {
        return message;
    }
    size_t Lineno() const {
        return lineno;
    }
};
139
140	const size_t NO_LINENUMBER_INFO = -1U;
141
142	LineAttachedMessage unattached_message(const string& message) { return LineAttachedMessage(message, NO_LINENUMBER_INFO); }
143
144
145	static list<LineAttachedMessage> warnings;
146	inline void add_warning(const LineAttachedMessage& laMsg) {
147	warnings.push_back(laMsg);
148	}
149	inline void add_warning(const string& warning, size_t lineno) {
150	add_warning(LineAttachedMessage(warning, lineno));
151	}
152
153	struct MessageAttachable {
154	virtual ~MessageAttachable() {}
155
156	virtual string location_description() const = 0; // may return empty string
157	virtual size_t line_number() const = 0; // if unknown -> should return NO_LINENUMBER_INFO
158
159	LineAttachedMessage attached_message(const string& message) const {
160	string where = location_description();
161	if (where.empty()) return LineAttachedMessage(message, line_number());
162	return LineAttachedMessage(message+" ["+where+"]", line_number());
163	}
164	void attach_warning(const string& message) const {
165	add_warning(attached_message(message));
166	}
167	};
168
169
170	// ----------------------
171	// class Reader
172
173	class Reader : public MessageAttachable {
174	private:
175	istream& in;
176	char lineBuffer[MAX_LINE_LENGTH];
177	char lineBuffer2[MAX_LINE_LENGTH];
178	bool readAgain;
179	bool eof;
180	int lineNo;
181
182	string location_description() const OVERRIDE { return ""; }
183	size_t line_number() const OVERRIDE { return lineNo; }
184
185	void getline() {
186	if (!eof) {
187	if (in.eof()) eof = true;
188	else {
189	h2x_assert(in.good());
190
191	in.getline(lineBuffer, MAX_LINE_LENGTH);
192	lineNo++;
193
194	if (in.eof()) eof = true;
195	else if (in.fail()) throw "line too long";
196
197	if (strchr(lineBuffer, '\t')) {
198	int o2 = 0;
199
200	for (int o = 0; lineBuffer[o]; ++o) {
201	if (lineBuffer[o] == '\t') {
202	int spaces = TABSIZE - (o2 % TABSIZE);
203	while (spaces--) lineBuffer2[o2++] = ' ';
204	}
205	else {
206	lineBuffer2[o2++] = lineBuffer[o];
207	}
208	}
209	lineBuffer2[o2] = 0;
210	strcpy(lineBuffer, lineBuffer2);
211	}
212
213	char *eol = strchr(lineBuffer, 0)-1;
214	while (eol >= lineBuffer && isspace(eol[0])) {
215	eol[0] = 0; // trim trailing whitespace
216	eol--;
217	}
218	if (eol > lineBuffer) {
219	// now eol points to last character
220	if (eol[0] == '-' && isalnum(eol[-1])) {
221	attach_warning("manual hyphenation detected");
222	}
223	}
224	}
225	}
226	}
227
228	public:
229	Reader(istream& in_) : in(in_), readAgain(true), eof(false), lineNo(0) { getline(); }
230	virtual ~Reader() {}
231
232	const char *getNext() {
233	if (readAgain) readAgain = false;
234	else getline();
235	return eof ? NULp : lineBuffer;
236	}
237
238	void back() {
239	h2x_assert(!readAgain);
240	readAgain = true;
241	}
242
243	int getLineNo() const { return lineNo; }
244	};
245
// Kind of a paragraph (stored in each Ostring).
enum ParagraphType {
PLAIN_TEXT, // default type
ENUMERATED, // paragraph started with a number or letter enumeration (see EnumerationType)
ITEM, // paragraph started with an itemlist marker ('-' or '*')
};
// Kind of enumeration prefix (only meaningful for ParagraphType ENUMERATED).
enum EnumerationType {
NONE, // no enumeration detected
DIGITS, // e.g. "1."
ALPHA_UPPER, // e.g. "A."
ALPHA_LOWER, // e.g. "a."
};
257
258	class Ostring : public MessageAttachable {
259	string content;
260	size_t lineNo; // where string came from
261	ParagraphType type;
262
263	// only valid for type==ENUMERATED:
264	EnumerationType etype;
265	unsigned number;
266
267	public:
268
269	Ostring(const string& s, size_t line_no, ParagraphType type_)
270	: content(s),
271	lineNo(line_no),
272	type(type_),
273	etype(NONE)
274	{
275	h2x_assert(type != ENUMERATED);
276	}
277	Ostring(const string& s, size_t line_no, ParagraphType type_, EnumerationType etype_, unsigned num)
278	: content(s),
279	lineNo(line_no),
280	type(type_),
281	etype(etype_),
282	number(num)
283	{
284	h2x_assert(type == ENUMERATED);
285	h2x_assert(etype == DIGITS \|\| etype == ALPHA_UPPER \|\| etype == ALPHA_LOWER);
286	h2x_assert(num>0);
287	}
288
289	// MessageAttachable interface:
290	string location_description() const OVERRIDE { return ""; }
291	size_t line_number() const OVERRIDE { return get_lineno(); }
292
293	operator const string&() const { return content; }
294	operator string&() { return content; }
295
296	const string& as_string() const { return content; }
297	string& as_string() { return content; }
298
299	size_t get_lineno() const { return lineNo; } // @@@ replace by line_number()?
300
301	const ParagraphType& get_type() const { return type; }
302	const EnumerationType& get_enum_type() const {
303	h2x_assert(type == ENUMERATED);
304	return etype;
305	}
306	unsigned get_number() const {
307	h2x_assert(type == ENUMERATED);
308	return number;
309	}
310
311	// wrapper to make Ostring act like char*
312	const char *c_str() const { return content.c_str(); }
313	};
314
315	typedef list<Ostring> Ostrings;
316
317	#if defined(WARN_MISSING_HELP)
318	static void check_TODO(const char *line, const Reader& reader) {
319	if (strstr(line, "@@@") \|\| strstr(line, "TODO")) {
320	reader.attach_warning(strf("TODO: %s", line));
321	}
322	}
323	#else
324	inline void check_TODO(const char *, const Reader&) { }
325	#endif // WARN_MISSING_HELP
326
327	// ----------------------------
328	// class Section
329
330	class Section FINAL_TYPE : public MessageAttachable {
331	SectionType type;
332	string name;
333	Ostrings content;
334	size_t lineno;
335
336	string location_description() const OVERRIDE { return string("in SECTION '")+name+"'"; }
337
338	public:
339	Section(string name_, SectionType type_, size_t lineno_)
340	: type(type_),
341	name(name_),
342	lineno(lineno_)
343	{}
344	virtual ~Section() {}
345
346	const Ostrings& Content() const { return content; }
347	Ostrings& Content() { return content; }
348	SectionType get_type() const { return type; }
349	size_t line_number() const OVERRIDE { return lineno; }
350	const string& getName() const { return name; }
351	void setName(const string& name_) { name = name_; }
352
353	void set_line_number(size_t lineNumber) { lineno = lineNumber; }
354	};
355
356	typedef list<Section> SectionList;
357
358	// --------------------
359	// class Link
360
// A link to another helpfile plus the line (of the linking file) where it was found.
class Link {
    string target;        // link destination
    size_t source_lineno; // line containing the link

public:
    Link(const string& target_, size_t source_lineno_)
        : target(target_), source_lineno(source_lineno_)
    {
    }

    const string& Target() const {
        return target;
    }
    size_t SourceLineno() const {
        return source_lineno;
    }
};
374
375	typedef list<Link> Links;
376
377	// ------------------------
378	// class Helpfile
379
380	class Helpfile {
381	Links uplinks;
382	Links references;
383	Links auto_references;
384	Section title;
385	SectionList sections;
386	string inputfile;
387
388	void check_self_ref(const string& link) {
389	size_t slash = inputfile.find('/');
390	if (slash != string::npos) {
391	if (inputfile.substr(slash+1) == link) {
392	throw string("Invalid link to self");
393	}
394	}
395	}
396
397	public:
398	Helpfile() : title("TITLE", SEC_FAKE, NO_LINENUMBER_INFO) {}
399	virtual ~Helpfile() {}
400
401	void readHelp(istream& in, const string& filename);
402	void writeXML(FILE *out, const string& page_name);
403	void extractInternalLinks();
404
405	const Section& get_title() const { return title; }
406	};
407
// true if 'c' is a blank (tabs etc. do NOT count)
inline bool isSpace(char c) { return c == ' '; }
// true if 'c' is a blank or a linefeed
inline bool isWhitespace(char c) { return isSpace(c) || c == '\n'; }
410
411	inline bool isEmptyOrComment(const char *s) {
412	if (s[0] == '#') return true;
413	for (int off = 0; ; ++off) {
414	if (s[off] == 0) return true;
415	if (!isSpace(s[off])) break;
416	}
417
418	return false;
419	}
420
421	inline const char extractKeyword(const char line, string& keyword) {
422	// returns NULp if no keyword was found
423	// otherwise returns position behind keyword and sets value of 'keyword'
424
425	const char *space = strchr(line, ' ');
426	if (space && space>line) {
427	keyword = string(line, 0, space-line);
428	return space;
429	}
430	else if (!space) { // test for keyword w/o content behind
431	if (line[0]) { // not empty
432	keyword = line;
433	return strchr(line, 0);
434	}
435	}
436	return NULp;
437	}
438
439	inline const char eatSpace(const char line) {
440	// skip over spaces at start of 'line'
441	while (isSpace(*line)) ++line;
442	return line;
443	}
444	inline const char eatWhitespace(const char paragraph) {
445	// skip over spaces and empty lines at start of 'paragraph'
446	while (isWhitespace(*paragraph)) ++paragraph;
447	return paragraph;
448	}
449
450	inline void pushParagraph(Section& sec, string& paragraph, size_t lineNo, ParagraphType& type, EnumerationType& etype, unsigned num) {
451	if (paragraph.length()) {
452	if (type == ENUMERATED) {
453	sec.Content().push_back(Ostring(paragraph, lineNo, type, etype, num));
454	}
455	else {
456	sec.Content().push_back(Ostring(paragraph, lineNo, type));
457	}
458
459	type = PLAIN_TEXT;
460	etype = NONE;
461	paragraph = "";
462	}
463	}
464
465	inline const char firstChar(const char s) {
466	while (isSpace(s[0])) ++s;
467	return s;
468	}
469
// Tests whether 'contentStart' looks like the start of an itemlist element:
// a '-' or '*', followed by exactly one whitespace character,
// which is not followed by another whitespace character or by a '-'.
inline bool is_startof_itemlist_element(const char *contentStart) {
    const bool has_marker = contentStart[0] == '-' || contentStart[0] == '*';
    if (!has_marker) return false;

    // (<cctype> functions need values representable as unsigned char)
    if (!isspace((unsigned char)contentStart[1])) return false;

    const char behind = contentStart[2];
    return !(isspace((unsigned char)behind) || behind == '-');
}
480
481	#define MAX_ALLOWED_ENUM 99 // otherwise it starts interpreting years as enums
482
483	static EnumerationType startsWithLetter(string& s, unsigned& number) {
484	// tests if first line starts with 'letter.'
485	// if true then 'letter.' is removed from the string
486	// the letter is converted and returned in 'number' ('a'->1, 'b'->2, ..)
487
488	size_t off = s.find_first_not_of(" \n");
489	if (off == string::npos) return NONE;
490	if (!isalpha(s[off])) return NONE;
491
492	size_t astart = off;
493	EnumerationType etype = isupper(s[off]) ? ALPHA_UPPER : ALPHA_LOWER;
494
495	number = s[off]-(etype == ALPHA_UPPER ? 'A' : 'a')+1;
496	++off;
497
498	h2x_assert(number>0 && number<MAX_ALLOWED_ENUM);
499
500	if (s[off] != '.' && s[off] != ')') return NONE;
501	if (s[off+1] != ' ') return NONE;
502
503	// remove 'letter.' from string :
504	++off;
505	while (s[off+1] == ' ') ++off;
506	s.erase(astart, off-astart+1);
507
508	return etype;
509	}
510
511	static bool startsWithNumber(string& s, unsigned& number) {
512	// tests if first line starts with 'number.'
513	// if true then 'number.' is removed from the string
514
515	size_t off = s.find_first_not_of(" \n");
516	if (off == string::npos) return false;
517	if (!isdigit(s[off])) return false;
518
519	size_t num_start = off;
520	number = 0;
521
522	for (; isdigit(s[off]); ++off) {
523	number = number*10 + (s[off]-'0');
524	}
525	if (number>MAX_ALLOWED_ENUM) return false;
526
527	if (s[off] != '.' && s[off] != ')') return false;
528	if (s[off+1] != ' ') return false;
529
530	// remove 'number.' from string :
531	++off;
532	while (s[off+1] == ' ') ++off;
533	s.erase(num_start, off-num_start+1);
534
535	return true;
536	}
537
538	static EnumerationType detectLineEnumType(string& line, unsigned& number) {
539	if (startsWithNumber(line, number)) return DIGITS;
540	return startsWithLetter(line, number);
541	}
542
543	static void parseSection(Section& sec, const char *line, int indentation, Reader& reader) {
544	string paragraph = line;
545	size_t para_start_lineno = reader.getLineNo();
546
547	if (sec.line_number() == NO_LINENUMBER_INFO) { // linenumber is not known yet
548	// assume section just started (this happens with TITLE)
549	sec.set_line_number(para_start_lineno);
550	}
551
552	ParagraphType type = PLAIN_TEXT;
553	EnumerationType etype = NONE;
554	unsigned num = 0;
555
556	unsigned last_alpha_num = -1;
557
558	h2x_assert(sec.Content().empty());
559
560	while (1) {
561	line = reader.getNext();
562	if (!line) break;
563
564	if (isEmptyOrComment(line)) {
565	pushParagraph(sec, paragraph, para_start_lineno, type, etype, num);
566	check_TODO(line, reader);
567	}
568	else {
569	string keyword;
570	const char *rest = extractKeyword(line, keyword);
571
572	if (rest) { // a new keyword
573	reader.back();
574	break;
575	}
576
577	check_TODO(line, reader);
578
579	string Line = line;
580
581	if (sec.get_type() == SEC_OCCURRENCE) {
582	pushParagraph(sec, paragraph, para_start_lineno, type, etype, num);
583	}
584	else {
585	const char *firstNonWhite = firstChar(line);
586	if (is_startof_itemlist_element(firstNonWhite)) {
587	h2x_assert(firstNonWhite != line);
588
589	pushParagraph(sec, paragraph, para_start_lineno, type, etype, num);
590
591	Line[firstNonWhite-line] = ' ';
592	type = ITEM; // is reset in call to pushParagraph
593	}
594	else {
595	unsigned foundNum;
596	EnumerationType foundEtype = detectLineEnumType(Line, foundNum);
597
598	if (foundEtype == ALPHA_UPPER \|\| foundEtype == ALPHA_LOWER) {
599	if (foundNum == (last_alpha_num+1) \|\| foundNum == 1) {
600	last_alpha_num = foundNum;
601	}
602	else {
603	#if defined(WARN_IGNORED_ALPHA_ENUMS)
604	add_warning(reader.attached_message("Ignoring non-consecutive alpha-enum"));
605	#endif
606	foundEtype = NONE;
607
608	reader.back();
609	Line = reader.getNext();
610	last_alpha_num = -1;
611	}
612	}
613
614	if (foundEtype != NONE) {
615	pushParagraph(sec, paragraph, para_start_lineno, type, etype, num);
616
617	type = ENUMERATED;
618	num = foundNum;
619	etype = foundEtype;
620
621	if (!num) {
622	h2x_assert(etype == DIGITS);
623	throw "Enumerations starting with zero are not supported";
624	}
625	}
626	}
627	}
628
629	if (paragraph.length()) {
630	paragraph = paragraph+"\n"+Line;
631	}
632	else {
633	paragraph = string("\n")+Line;
634	para_start_lineno = reader.getLineNo();
635	}
636	}
637	}
638
639	pushParagraph(sec, paragraph, para_start_lineno, type, etype, num);
640
641	if (sec.Content().size()>0 && indentation>0) {
642	string spaces;
643	spaces.reserve(indentation);
644	spaces.append(indentation, ' ');
645
646	string& ostr = sec.Content().front();
647	ostr = string("\n") + spaces + ostr;
648	}
649	}
650
651	inline void check_specific_duplicates(const string& link, const Links& existing, bool add_warnings) {
652	for (Links::const_iterator ex = existing.begin(); ex != existing.end(); ++ex) {
653	if (ex->Target() == link) {
654	if (add_warnings) add_warning(strf("First Link to '%s' was found here.", ex->Target().c_str()), ex->SourceLineno());
655	throw strf("Link to '%s' duplicated here.", link.c_str());
656	}
657	}
658	}
659	inline void check_duplicates(const string& link, const Links& uplinks, const Links& references, bool add_warnings) {
660	check_specific_duplicates(link, uplinks, add_warnings);
661	check_specific_duplicates(link, references, add_warnings);
662	}
663
664	static void warnAboutDuplicate(SectionList& sections) {
665	set<string> seen;
666	SectionList::iterator end = sections.end();
667	for (SectionList::iterator s = sections.begin(); s != end; ++s) {
668	const string& sname = s->getName();
669	if (sname == "NOTES") continue; // do not warn about multiple NOTES sections
670
671	SectionList::iterator o = s; ++o;
672	for (; o != end; ++o) {
673	if (sname == o->getName()) {
674	o->attach_warning("duplicated SECTION name");
675	if (seen.find(sname) == seen.end()) {
676	s->attach_warning("name was first used");
677	seen.insert(sname);
678	}
679	}
680	}
681	}
682	}
683
684	void Helpfile::readHelp(istream& in, const string& filename) {
685	if (!in.good()) throw unattached_message(strf("Can't read from '%s'", filename.c_str()));
686
687	Reader read(in);
688
689	inputfile = filename; // remember file read (for comment)
690
691	const char *line;
692	const char *name_only = strrchr(filename.c_str(), '/');
693
694	h2x_assert(name_only);
695	++name_only;
696
697	try {
698	while (1) {
699	line = read.getNext();
700	if (!line) break;
701
702	if (isEmptyOrComment(line)) {
703	check_TODO(line, read);
704	continue;
705	}
706
707	check_TODO(line, read);
708
709	string keyword;
710	const char *rest = extractKeyword(line, keyword);
711
712	if (rest) { // found a keyword
713	if (keyword == "UP") {
714	rest = eatSpace(rest);
715	if (strlen(rest)) {
716	check_duplicates(rest, uplinks, references, true);
717	if (strcmp(name_only, rest) == 0) throw "UP link to self";
718
719	uplinks.push_back(Link(rest, read.getLineNo()));
720	}
721	}
722	else if (keyword == "SUB") {
723	rest = eatSpace(rest);
724	if (strlen(rest)) {
725	check_duplicates(rest, uplinks, references, true);
726	if (strcmp(name_only, rest) == 0) throw "SUB link to self";
727
728	references.push_back(Link(rest, read.getLineNo()));
729	}
730	}
731	else if (keyword == "TITLE") {
732	rest = eatSpace(rest);
733	parseSection(title, rest, 0, read);
734
735	if (title.Content().empty()) throw "empty TITLE not allowed";
736
737	const string& t = title.Content().front();
738	if (t.find("Standard help file form") != string::npos) {
739	throw strf("Illegal title for help file: '%s'", t.c_str());
740	}
741
742	const size_t len = t.length();
743	if (len>MAX_TITLE_CHARS) {
744	// ignore non-alphanumeric characters at end of string:
745	size_t last_alnum_pos = len-1;
746	while (!isalnum(t[last_alnum_pos])) {
747	--last_alnum_pos;
748	}
749	++last_alnum_pos;
750	arb_assert(last_alnum_pos<=len);
751
752	const size_t ignored = len-last_alnum_pos;
753	if ((len-ignored)>MAX_TITLE_CHARS) {
754	title.attach_warning(strf("TITLE too verbose (max. %i chars allowed; found %zu%s)",
755	MAX_TITLE_CHARS,
756	len,
757	ignored ? strf("; acceptable trailing chars: %zu", ignored).c_str() : ""
758	));
759	}
760	}
761	}
762	else {
763	if (keyword == "NOTE") keyword = "NOTES";
764	if (keyword == "EXAMPLE") keyword = "EXAMPLES";
765	if (keyword == "WARNING") keyword = "WARNINGS";
766
767	SectionType stype = SEC_NONE;
768	int idx;
769	for (idx = 0; idx<KNOWN_SECTION_TYPES; ++idx) {
770	if (knownSections[idx] == keyword) {
771	stype = SectionType(idx);
772	break;
773	}
774	}
775
776	size_t lineno = read.getLineNo();
777
778	if (idx >= KNOWN_SECTION_TYPES) throw strf("unknown keyword '%s'", keyword.c_str());
779
780	if (stype == SEC_SECTION) {
781	string section_name = eatSpace(rest);
782	Section sec(section_name, stype, lineno);
783	parseSection(sec, "", 0, read);
784	sections.push_back(sec);
785	}
786	else {
787	Section sec(keyword, stype, lineno);
788	rest = eatSpace(rest);
789	parseSection(sec, rest, rest-line, read);
790	sections.push_back(sec);
791	}
792	}
793	}
794	else {
795	throw strf("Unhandled line");
796	}
797	}
798
799	warnAboutDuplicate(sections);
800	}
801	catch (string& err) { throw read.attached_message(err); }
802	catch (const char *err) { throw read.attached_message(err); }
803	}
804
805	static bool shouldReflow(const string& s, int& foundIndentation) {
806	// foundIndentation is only valid if shouldReflow() returns true
807	enum { START, CHAR, SPACE, MULTIPLE, DOT, DOTSPACE } state = START;
808	bool equal_indent = true;
809	int lastIndent = -1;
810	int thisIndent = 0;
811
812	for (string::const_iterator c = s.begin(); c != s.end(); ++c, ++thisIndent) {
813	if (*c == '\n') {
814	state = START;
815	thisIndent = 0;
816	}
817	else if (isSpace(*c)) {
818	if (state == DOT \|\| state == DOTSPACE) state = DOTSPACE; // multiple spaces after DOT are allowed
819	else if (state == SPACE) state = MULTIPLE; // now seen multiple spaces
820	else if (state == CHAR) state = SPACE; // now seen 1 space
821	}
822	else {
823	if (state == MULTIPLE) return false; // character after multiple spaces
824	if (state == START) {
825	if (lastIndent == -1) lastIndent = thisIndent;
826	else if (lastIndent != thisIndent) equal_indent = false;
827	}
828	state = (c == '.' \|\| c == ',') ? DOT : CHAR;
829	}
830	}
831
832	if (lastIndent<0) {
833	equal_indent = false;
834	}
835
836	if (equal_indent) {
837	foundIndentation = lastIndent-1;
838	h2x_assert(foundIndentation >= 0);
839	}
840	return equal_indent;
841	}
842
843	static string correctSpaces(const string& text, int change) {
844	h2x_assert(text.find('\n') == string::npos);
845
846	if (!change) return text;
847
848	size_t first = text.find_first_not_of(' ');
849	if (first == string::npos) return ""; // empty line
850
851	if (change<0) {
852	int remove = -change;
853	h2x_assert(remove <= int(first));
854	return text.substr(remove);
855	}
856
857	h2x_assert(change>0); // add spaces
858	return string(change, ' ')+text;
859	}
860
861	static string correctIndentation(const string& text, int change) {
862	// removes 'remove' spaces from every line
863
864	size_t this_lineend = text.find('\n');
865	string result;
866
867	if (this_lineend == string::npos) {
868	result = correctSpaces(text, change);
869	}
870	else {
871	result = correctSpaces(text.substr(0, this_lineend), change);
872
873	while (this_lineend != string::npos) {
874	size_t next_lineend = text.find('\n', this_lineend+1);
875	if (next_lineend == string::npos) { // last line
876	result = result+"\n"+correctSpaces(text.substr(this_lineend+1), change);
877	}
878	else {
879	result = result+"\n"+correctSpaces(text.substr(this_lineend+1, next_lineend-this_lineend-1), change);
880	}
881	this_lineend = next_lineend;
882	}
883	}
884	return result;
885	}
886
// Returns the number of blanks at the start of 'text'.
// Lines which are empty or consist of blanks only yield INT_MAX.
inline size_t countSpaces(const string& text) {
    const size_t len    = text.length();
    size_t       blanks = 0;

    while (blanks<len && text[blanks] == ' ') ++blanks;

    if (blanks == len) return INT_MAX; // empty line
    return blanks;
}
892
893	static size_t scanMinIndentation(const string& text) {
894	size_t this_lineend = text.find('\n');
895	size_t min_indent = INT_MAX;
896
897	if (this_lineend == string::npos) {
898	min_indent = countSpaces(text);
899	}
900	else {
901	while (this_lineend != string::npos) {
902	size_t next_lineend = text.find('\n', this_lineend+1);
903	if (next_lineend == string::npos) {
904	min_indent = min(min_indent, countSpaces(text.substr(this_lineend+1)));
905	}
906	else {
907	min_indent = min(min_indent, countSpaces(text.substr(this_lineend+1, next_lineend-this_lineend-1)));
908	}
909	this_lineend = next_lineend;
910	}
911	}
912
913	if (min_indent == INT_MAX) min_indent = 0; // only empty lines
914	return min_indent;
915	}
916
917	// -----------------------------
918	// class ParagraphTree
919
920	class ParagraphTree FINAL_TYPE : public MessageAttachable, virtual Noncopyable {
921	ParagraphTree *brother; // has same indentation as this
922	ParagraphTree *son; // indentation + 1
923
924	Ostring otext; // text of the Section (containing linefeeds)
925
926	bool reflow; // should the paragraph be reflown ? (true if indentation is equal for all lines of text)
927	int indentation; // the real indentation of the blank (behind removed enumeration)
928
929
930	string location_description() const OVERRIDE { return "in paragraph starting here"; }
931	size_t line_number() const OVERRIDE { return otext.get_lineno(); }
932
933	ParagraphTree(Ostrings::const_iterator begin, const Ostrings::const_iterator end)
934	: son(NULp),
935	otext(*begin),
936	indentation(0)
937	{
938	h2x_assert(begin != end);
939
940	string& text = otext;
941
942	reflow = shouldReflow(text, indentation);
943	if (!reflow) {
944	size_t reststart = text.find('\n', 1);
945
946	if (reststart == 0) {
947	attach_warning("[internal] Paragraph starts with LF -> reflow calculation will probably fail");
948	}
949
950	if (reststart != string::npos) {
951	int rest_indent = -1;
952	string rest = text.substr(reststart);
953	bool rest_reflow = shouldReflow(rest, rest_indent);
954
955	if (rest_reflow) {
956	int first_indent = countSpaces(text.substr(1));
957	if (get_type() == PLAIN_TEXT) {
958	size_t last = text.find_last_not_of(' ', reststart-1);
959	bool is_header = last != string::npos && text[last] == ':';
960
961	if (!is_header && rest_indent == (first_indent+8)) {
962	#if defined(DEBUG)
963	size_t textstart = text.find_first_not_of(" \n");
964	h2x_assert(textstart != string::npos);
965	#endif // DEBUG
966
967	text = text.substr(0, reststart)+correctIndentation(rest, -8);
968	reflow = shouldReflow(text, indentation);
969	}
970	}
971	else {
972	int diff = rest_indent-first_indent;
973	if (diff>0) {
974	text = text.substr(0, reststart)+correctIndentation(rest, -diff);
975	reflow = shouldReflow(text, indentation);
976	}
977	else if (diff<0) {
978	// paragraph with more indent on first line (occurs?)
979	attach_warning(strf("[internal] unhandled: more indentation on the 1st line (diff=%i)", diff));
980	}
981	}
982	}
983	}
984	}
985
986	if (!reflow) {
987	indentation = scanMinIndentation(text);
988	}
989	text = correctIndentation(text, -indentation);
990	if (get_type() == ITEM) {
991	h2x_assert(indentation >= 2);
992	indentation -= 2;
993	}
994
995	brother = buildParagraphTree(++begin, end);
996	}
997
998	void brothers_to_sons(ParagraphTree *new_brother);
999
1000	public:
1001	virtual ~ParagraphTree() {
1002	delete brother;
1003	delete son;
1004	}
1005
1006	ParagraphType get_type() const { return otext.get_type(); }
1007
1008	bool is_itemlist_member() const { return get_type() == ITEM; }
1009	unsigned get_enumeration() const { return get_type() == ENUMERATED ? otext.get_number() : 0; }
1010	EnumerationType get_enum_type() const { return otext.get_enum_type(); }
1011
1012	const char *readable_type() const {
1013	const char *res = NULp;
1014	switch (get_type()) {
1015	case PLAIN_TEXT: res = "PLAIN_TEXT"; break;
1016	case ITEM: res = "ITEM"; break;
1017	case ENUMERATED: res = "ENUMERATED"; break;
1018	}
1019	return res;
1020	}
1021
1022	size_t countTextNodes() {
1023	size_t nodes = 1; // this
1024	if (son) nodes += son->countTextNodes();
1025	if (brother) nodes += brother->countTextNodes();
1026	return nodes;
1027	}
1028
1029	#if defined(DUMP_PARAGRAPHS)
1030	void print_indent(ostream& out, int indent) { while (indent-->0) out << ' '; }
1031	char masknl(const char text) {
1032	char *result = ARB_strdup(text);
1033	for (int i = 0; result[i]; ++i) {
1034	if (result[i] == '\n') result[i] = '\|';
1035	}
1036	return result;
1037	}
1038	void dump(ostream& out, int indent = 0) {
1039	print_indent(out, indent+1);
1040	{
1041	char *mtext = masknl(otext.as_string().c_str());
1042	out << "text='" << mtext << "'\n";
1043	free(mtext);
1044	}
1045
1046	print_indent(out, indent+1);
1047	out << "type='" << readable_type() << "' ";
1048	if (get_type() == ENUMERATED) {
1049	out << "enumeration='" << otext.get_number() << "' ";
1050	}
1051	out << "reflow='" << reflow << "' ";
1052	out << "indentation='" << indentation << "'\n";
1053
1054	if (son) {
1055	print_indent(out, indent+2); cout << "son:\n";
1056	son->dump(out, indent+2);
1057	cout << "\n";
1058	}
1059	if (brother) {
1060	print_indent(out, indent); cout << "brother:\n";
1061	brother->dump(out, indent);
1062	}
1063	}
1064	#endif // DUMP_PARAGRAPHS
1065
1066	private:
1067	static ParagraphTree* buildParagraphTree(Ostrings::const_iterator begin, const Ostrings::const_iterator end) {
1068	if (begin == end) return NULp;
1069	return new ParagraphTree(begin, end);
1070	}
1071	public:
1072	static ParagraphTree* buildParagraphTree(const Section& sec) {
1073	const Ostrings& txt = sec.Content();
1074	if (txt.empty()) throw "attempt to build an empty ParagraphTree";
1075	return buildParagraphTree(txt.begin(), txt.end());
1076	}
1077
1078	bool contains(ParagraphTree *that) {
1079	return
1080	this == that \|\|
1081	(son && son->contains(that)) \|\|
1082	(brother && brother->contains(that));
1083	}
1084
1085	ParagraphTree predecessor(ParagraphTree before_this) {
1086	if (brother == before_this) return this;
1087	if (!brother) return NULp;
1088	return brother->predecessor(before_this);
1089	}
1090
1091	void append(ParagraphTree *new_brother) {
1092	if (!brother) brother = new_brother;
1093	else brother->append(new_brother);
1094	}
1095
1096	bool is_some_brother(const ParagraphTree *other) const {
1097	return (other == brother) \|\| (brother && brother->is_some_brother(other));
1098	}
1099
1100	ParagraphTree* takeAllInFrontOf(ParagraphTree *after) {
1101	ParagraphTree *removed = this;
1102	ParagraphTree *after_pred = this;
1103
1104	h2x_assert(is_some_brother(after));
1105
1106	while (1) {
1107	h2x_assert(after_pred);
1108	h2x_assert(after_pred->brother); // takeAllInFrontOf called with non-existing 'after'
1109
1110	if (after_pred->brother == after) { // found after
1111	after_pred->brother = NULp; // unlink
1112	break;
1113	}
1114	after_pred = after_pred->brother;
1115	}
1116
1117	return removed;
1118	}
1119
1120	ParagraphTree *firstListMember() {
1121	switch (get_type()) {
1122	case PLAIN_TEXT: break;
1123	case ITEM: return this;
1124	case ENUMERATED: {
1125	if (get_enumeration() == 1) return this;
1126	break;
1127	}
1128	}
1129	if (brother) return brother->firstListMember();
1130	return NULp;
1131	}
1132
1133	ParagraphTree *nextListMemberAfter(const ParagraphTree& previous) {
1134	if (indentation<previous.indentation) return NULp;
1135	if (indentation == previous.indentation && get_type() == previous.get_type()) {
1136	if (get_type() != ENUMERATED) return this;
1137	if (get_enumeration() > previous.get_enumeration()) return this;
1138	return NULp;
1139	}
1140	if (!brother) return NULp;
1141	return brother->nextListMemberAfter(previous);
1142	}
1143	ParagraphTree *nextListMember() const {
1144	return brother ? brother->nextListMemberAfter(*this) : NULp;
1145	}
1146
1147	ParagraphTree* firstWithLessIndentThan(int wanted_indentation) {
1148	if (indentation < wanted_indentation) return this;
1149	if (!brother) return NULp;
1150	return brother->firstWithLessIndentThan(wanted_indentation);
1151	}
1152
// Nests the following brothers that are indented deeper than a paragraph as sons of that paragraph
// (applied to the whole tree).
1153	void format_indentations();
// Reformats the tree such that all members of one item list / enumeration are brothers.
1154	void format_lists();
1155
1156	private:
1157	static ParagraphTree* buildNewParagraph(const string& Text, size_t beginLineNo, ParagraphType type) {
1158	Ostrings S;
1159	S.push_back(Ostring(Text, beginLineNo, type));
1160	return new ParagraphTree(S.begin(), S.end());
1161	}
// Writes this item and all directly following item-list brothers as ENTRY tags.
// Returns the first brother that is no item-list member (may be NULp).
1162	ParagraphTree *xml_write_list_contents();
// Writes this enumerated paragraph and all directly following enumerated brothers as ENTRY tags.
// Returns the first brother without enumeration (may be NULp).
1163	ParagraphTree *xml_write_enum_contents();
// Writes the text of this paragraph as T tag (including its 'reflow' attribute).
1164	void xml_write_textblock();
1165
1166	public:
// Writes this paragraph, its sons and all following brothers as XML (ENUM, LIST or P tags).
1167	void xml_write();
1168	};
1169
1170	#if defined(DUMP_PARAGRAPHS)
// Dumps 'para' to cout, starting with indentation 0 (only compiled if DUMP_PARAGRAPHS is defined).
1171	static void dump_paragraph(ParagraphTree *para) {
1172	// helper function for use in gdb
1173	para->dump(cout, 0);
1174	}
1175	#endif
1176
1177	void ParagraphTree::brothers_to_sons(ParagraphTree *new_brother) {
1178	/*! folds down brothers to sons
1179	* @param new_brother brother of 'this->brother', will become new brother.
1180	* If new_brother == NULp -> make all brothers sons.
1181	*/
1182
1183	if (new_brother) {
1184	h2x_assert(is_some_brother(new_brother));
1185
1186	if (brother != new_brother) {
1187	#if defined(DEBUG)
1188	if (son) {
1189	son->attach_warning("Found unexpected son (in brothers_to_sons)");
1190	brother->attach_warning("while trying to transform paragraphs from here ..");
1191	new_brother->attach_warning(".. to here ..");
1192	attach_warning(".. into sons of this paragraph.");
1193	return;
1194	}
1195	#endif
1196
1197	h2x_assert(!son);
1198	h2x_assert(brother);
1199
1200	if (!new_brother) { // all brothers -> sons
1201	son = brother;
1202	brother = NULp;
1203	}
1204	else {
1205	son = brother->takeAllInFrontOf(new_brother);
1206	brother = new_brother;
1207	}
1208	}
1209	}
1210	else {
1211	h2x_assert(!son);
1212	son = brother;
1213	brother = NULp;
1214	}
1215	}
1216	void ParagraphTree::format_lists() {
1217	// reformats tree such that all items/enumerations are brothers
1218	ParagraphTree *member = firstListMember();
1219	if (member) {
1220	for (ParagraphTree *curr = this; curr != member; curr = curr->brother) {
1221	h2x_assert(curr);
1222	if (curr->son) curr->son->format_lists();
1223	}
1224
1225	for (ParagraphTree *next = member->nextListMember();
1226	next;
1227	member = next, next = member->nextListMember())
1228	{
1229	member->brothers_to_sons(next);
1230	h2x_assert(member->brother == next);
1231
1232	if (member->son) member->son->format_lists();
1233	}
1234
1235	h2x_assert(!member->son); // member is the last item
1236
1237	if (member->brother) {
1238	ParagraphTree *non_member = member->brother->firstWithLessIndentThan(member->indentation+1);
1239	member->brothers_to_sons(non_member);
1240	}
1241
1242	if (member->son) member->son->format_lists();
1243	if (member->brother) member->brother->format_lists();
1244	}
1245	else {
1246	for (ParagraphTree *curr = this; curr; curr = curr->brother) {
1247	h2x_assert(curr);
1248	if (curr->son) curr->son->format_lists();
1249	}
1250	}
1251	}
1252
1253	void ParagraphTree::format_indentations() {
1254	if (brother) {
1255	ParagraphTree *same_indent = brother->firstWithLessIndentThan(indentation+1);
1256	#if defined(WARN_POSSIBLY_WRONG_INDENTATION_CORRECTION)
1257	if (same_indent && indentation != same_indent->indentation) {
1258	same_indent->attach_warning("indentation is assumed to be same as ..");
1259	attach_warning(".. here");
1260	}
1261	#endif
1262	brothers_to_sons(same_indent); // if same_indent is NULp -> make all brothers childs
1263	if (brother) brother->format_indentations();
1264	}
1265
1266	if (son) son->format_indentations();
1267	}
1268
1269	// -----------------
1270	// LinkType
1271
// Kind of a link target (as detected by detectLinkType).
// Every known type occupies its own bit, so several types can be tested with one mask;
// LinkType2id() derives the index into link_id[] from the bit position.
enum LinkType {
    LT_UNKNOWN = 0,
    LT_HTTP    = 1<<0,
    LT_HTTPS   = 1<<1,
    LT_FTP     = 1<<2,
    LT_FILE    = 1<<3,
    LT_EMAIL   = 1<<4,
    LT_HLP     = 1<<5,
    LT_PS      = 1<<6,
    LT_PDF     = 1<<7,
    LT_TICKET  = 1<<8,
};
1284
// Names of the link types (used as value of the 'type' attribute of link tags).
// LinkType2id() indexes this table with the bit position of the LinkType value plus one
// (index 0 = LT_UNKNOWN), i.e. the order has to match the order of the LinkType values.
1285	static const char *link_id[] = {
1286	"unknown",
1287	"www", // "http:"
1288	"www", // "https:"
1289	"www", // "ftp:"
1290	"www", // "file:"
1291	"email",
1292	"hlp",
1293	"ps",
1294	"pdf",
1295	"ticket",
1296	};
1297
1298	static string LinkType2id(LinkType type) {
1299	size_t idx = 0;
1300	while (type >= 1) {
1301	idx++;
1302	type = LinkType(type>>1);
1303	}
1304	arb_assert(idx<ARRAY_ELEMS(link_id));
1305	return link_id[idx];
1306	}
1307
1308	inline const char *getExtension(const string& name) {
1309	size_t last_dot = name.find_last_of('.');
1310	if (last_dot == string::npos) {
1311	return NULp;
1312	}
1313	return name.c_str()+last_dot+1;
1314	}
1315
1316	static LinkType detectLinkType(const string& link_target) {
1317	LinkType type = LT_UNKNOWN;
1318	const char *ext = getExtension(link_target);
1319
1320	if (ext && strcasecmp(ext, "hlp") == 0) type = LT_HLP;
1321	else if (link_target.find("http://") == 0) type = LT_HTTP;
1322	else if (link_target.find("https://") == 0) type = LT_HTTPS;
1323	else if (link_target.find("ftp://") == 0) type = LT_FTP;
1324	else if (link_target.find("file://") == 0) type = LT_FILE;
1325	else if (link_target.find('@') != string::npos) type = LT_EMAIL;
1326	else if (ext && strcasecmp(ext, "ps") == 0) type = LT_PS;
1327	else if (ext && strcasecmp(ext, "pdf") == 0) type = LT_PDF;
1328	else if (link_target[0] == '#') type = LT_TICKET;
1329
1330	return type;
1331	}
1332
1333	// --------------------------------------------------------------------------------
1334
1335
1336
static string locate_helpfile(const string& helpname) {
    // search for 'helpname' in various helpfile locations
    // (returns the first existing 'directory/helpname' or "" if none exists)

    static const char *const search_dir[] = { "source/", "genhelp/" };
    const size_t             dir_count    = sizeof(search_dir)/sizeof(search_dir[0]);

    for (size_t d = 0; d<dir_count; ++d) {
        string      candidate = search_dir[d]+helpname;
        struct stat st;
        if (stat(candidate.c_str(), &st) == 0) {
            return candidate;
        }
    }
    return "";
}
1353
1354	static string locate_document(const string& docname) {
1355	// search for 'docname' or 'docname.gz' in various helpfile locations
1356
1357	string located = locate_helpfile(docname);
1358	if (located.empty()) {
1359	located = locate_helpfile(docname+".gz");
1360	}
1361	return located;
1362	}
1363
1364	static void add_link_attributes(XML_Tag& link, LinkType type, const string& dest, size_t source_line) {
1365	if (type == LT_UNKNOWN) {
1366	string msg = string("Unknown link type (dest='")+dest+"')";
1367	throw LineAttachedMessage(msg, source_line);
1368	}
1369
1370	link.add_attribute("dest", dest);
1371	link.add_attribute("type", LinkType2id(type));
1372	link.add_attribute("source_line", source_line);
1373
1374	if (type&(LT_HLP\|LT_PDF\|LT_PS)) { // other links (www, email) cannot be checked for existence here
1375	string fullhelp = ((type&LT_HLP) ? locate_helpfile : locate_document)(dest);
1376	if (fullhelp.empty()) {
1377	link.add_attribute("missing", "1");
1378	string deadlink = strf("Dead link to '%s'", dest.c_str());
1379	#if 1
1380	throw LineAttachedMessage(deadlink, source_line);
1381	#else
1382	add_warning(deadlink, source_line);
1383	#endif
1384	}
1385	}
1386	}
1387
1388	static void print_XML_Text_expanding_links(const string& text, size_t lineNo) {
1389	size_t found = text.find("LINK{", 0);
1390	if (found != string::npos) {
1391	size_t inside_link = found+5;
1392	size_t close = text.find('}', inside_link);
1393
1394	if (close == string::npos) throw "unclosed 'LINK{}'";
1395
1396	string link_target = text.substr(inside_link, close-inside_link);
1397	LinkType type = detectLinkType(link_target);
1398	string dest = link_target;
1399
1400	XML_Text(text.substr(0, found));
1401
1402	{
1403	XML_Tag link("LINK");
1404	link.set_on_extra_line(false);
1405	add_link_attributes(link, type, dest, lineNo);
1406	}
1407
1408	print_XML_Text_expanding_links(text.substr(close+1), lineNo);
1409	}
1410	else {
1411	XML_Text t(text);
1412	}
1413	}
1414
// Returns a copy of 'text' in which every ticket reference ('#' directly followed
// by one or more digits) is wrapped into 'LINK{...}', e.g. "see #12" -> "see LINK{#12}".
// A '#' that is not followed by a digit is copied unchanged.
static string autolink_ticket_references(const string& text) {
    string result;
    size_t copied = 0; // offset of the first character of 'text' not yet copied into 'result'

    for (;;) {
        const size_t hashpos = text.find('#', copied);
        if (hashpos == string::npos) break;

        // isdigit() is only defined for values representable as unsigned char (or EOF)
        // -> cast, otherwise non-ASCII characters trigger undefined behavior
        size_t behind = hashpos+1;
        while (behind<text.length() && isdigit(static_cast<unsigned char>(text[behind]))) ++behind;

        if (behind == hashpos+1) { // char after '#' is no digit => not a ticketref
            result.append(text, copied, behind-copied);
        }
        else {
            result.append(text, copied, hashpos-copied);
            result += "LINK{";
            result.append(text, hashpos, behind-hashpos);
            result += '}';
        }
        copied = behind;
    }

    result.append(text, copied, string::npos);
    return result;
}
1438
1439	inline void print_XML_Text(const string& text, size_t lineNo) {
1440	string autolinkedText = autolink_ticket_references(text);
1441	print_XML_Text_expanding_links(autolinkedText, lineNo);
1442	}
1443
1444	void ParagraphTree::xml_write_textblock() {
1445	XML_Tag textblock("T");
1446	textblock.add_attribute("reflow", reflow ? "1" : "0");
1447
1448	{
1449	string usedText;
1450	const string& text = otext;
1451	if (reflow) {
1452	usedText = correctIndentation(text, (textblock.Indent()+1) * the_XML_Document->indentation_per_level);
1453	}
1454	else {
1455	usedText = text;
1456	}
1457	print_XML_Text(usedText, otext.get_lineno());
1458	}
1459	}
1460
1461	ParagraphTree *ParagraphTree::xml_write_list_contents() {
1462	h2x_assert(is_itemlist_member());
1463	#if defined(WARN_FIXED_LAYOUT_LIST_ELEMENTS)
1464	if (!reflow) attach_warning("ITEM not reflown (check output)");
1465	#endif
1466	{
1467	XML_Tag entry("ENTRY");
1468	entry.add_attribute("item", "1");
1469	xml_write_textblock();
1470	if (son) son->xml_write();
1471	}
1472	if (brother && brother->is_itemlist_member()) {
1473	return brother->xml_write_list_contents();
1474	}
1475	return brother;
1476	}
1477	ParagraphTree *ParagraphTree::xml_write_enum_contents() {
1478	h2x_assert(get_enumeration());
1479	#if defined(WARN_FIXED_LAYOUT_LIST_ELEMENTS)
1480	if (!reflow) attach_warning("ENUMERATED not reflown (check output)");
1481	#endif
1482	{
1483	XML_Tag entry("ENTRY");
1484	switch (get_enum_type()) {
1485	case DIGITS:
1486	entry.add_attribute("enumerated", strf("%i", get_enumeration()));
1487	break;
1488	case ALPHA_UPPER:
1489	entry.add_attribute("enumerated", strf("%c", 'A'-1+get_enumeration()));
1490	break;
1491	case ALPHA_LOWER:
1492	entry.add_attribute("enumerated", strf("%c", 'a'-1+get_enumeration()));
1493	break;
1494	default:
1495	h2x_assert(0);
1496	break;
1497	}
1498	xml_write_textblock();
1499	if (son) son->xml_write();
1500	}
1501	if (brother && brother->get_enumeration()) {
1502	int diff = brother->get_enumeration()-get_enumeration();
1503	if (diff != 1) {
1504	attach_warning("Non-consecutive enumeration detected between here..");
1505	brother->attach_warning(".. and here");
1506	}
1507	return brother->xml_write_enum_contents();
1508	}
1509	return brother;
1510	}
1511
1512	void ParagraphTree::xml_write() {
1513	try {
1514	ParagraphTree *next = NULp;
1515	if (get_enumeration()) {
1516	XML_Tag enu("ENUM");
1517	if (get_enumeration() != 1) {
1518	attach_warning(strf("First enum starts with '%u.' (maybe previous enum was not detected)", get_enumeration()));
1519	}
1520	next = xml_write_enum_contents();
1521	#if defined(WARN_LONESOME_ENUM_ELEMENTS)
1522	if (next == brother) attach_warning("Suspicious single-element-ENUM");
1523	#endif
1524	}
1525	else if (is_itemlist_member()) {
1526	XML_Tag list("LIST");
1527	next = xml_write_list_contents();
1528	#if defined(WARN_LONESOME_LIST_ELEMENTS)
1529	if (next == brother) attach_warning("Suspicious single-element-LIST");
1530	#endif
1531	}
1532	else {
1533	{
1534	XML_Tag para("P");
1535	xml_write_textblock();
1536	if (son) son->xml_write();
1537	}
1538	next = brother;
1539	}
1540	if (next) next->xml_write();
1541	}
1542	catch (string& err) { throw attached_message(err); }
1543	catch (const char *err) { throw attached_message(err); }
1544	}
1545
1546	static void create_top_links(const Links& links, const char *tag) {
1547	for (Links::const_iterator s = links.begin(); s != links.end(); ++s) {
1548	XML_Tag link(tag);
1549	add_link_attributes(link, detectLinkType(s->Target()), s->Target(), s->SourceLineno());
1550	}
1551	}
1552
// Joins a multi-line paragraph into one line:
// each linefeed together with the spaces directly in front of and behind it
// is reduced to a single space; trailing spaces of the result are removed.
// Works on (and returns) a copy of the passed text.
1553	inline string remove_LF_and_indentation(string paragraph) {
1554	// remove linefeeds + spaces behind linefeed (=indentation)
1555	size_t pos = 0;
1556	while (1) {
1557	size_t lf = paragraph.find('\n', pos);
1558	if (lf == string::npos) break; // all LFs handled
1559
1560	// eliminate spaces before the LF:
1561	if (lf>0 && paragraph[lf-1] == ' ') { // LF is preceded by space(s)
1562	size_t sp = lf-1;
1563	while (sp>=1 && paragraph[sp-1] == ' ') --sp; // position to 1st space
1564	arb_assert(sp<lf);
1565	paragraph.erase(sp, lf-sp);
1566	lf = sp;
1567	}
1568	arb_assert(paragraph[lf] == '\n');
1569
1570	size_t ns = paragraph.find(' ', lf); // next space
1571	if (ns != lf+1) { // no space after LF
1572	paragraph[lf] = ' '; // -> replace LF by single space
1573	pos = lf+1;
1574	}
1575	else {
// 'pos' is not advanced here: the LF itself gets erased, so the next search
// (starting at the old 'pos') finds the following LF
1576	size_t as = paragraph.find_first_not_of(' ', ns); // pos after consecutive space(s)
1577	size_t ls = as == string::npos ? ns : as-1; // last consecutive space
1578	paragraph.erase(lf, ls-lf); // keep one space (between concatenated line contents)
1579	}
1580	}
1581	// remove trailing spaces:
1582	size_t ls = paragraph.find_last_not_of(' ');
1583	if (ls == string::npos) { // only spaces
1584	paragraph.clear();
1585	}
1586	else {
1587	++ls;
1588	paragraph.erase(ls, paragraph.length()-ls);
1589	}
1590	return paragraph;
1591	}
1592
1593	void Helpfile::writeXML(FILE *out, const string& page_name) {
1594	XML_Document xml("PAGE", "arb_help.dtd", out);
1595
1596	xml.skip_empty_tags = true;
1597	xml.indentation_per_level = 2;
1598
1599	xml.getRoot().add_attribute("name", page_name);
1600	#if defined(DEBUG)
1601	xml.getRoot().add_attribute("edit_warning", "devel"); // inserts a edit warning into development version
1602	#else
1603	xml.getRoot().add_attribute("edit_warning", "release"); // inserts a different edit warning into release version
1604	#endif // DEBUG
1605
1606	xml.getRoot().add_attribute("source", inputfile.c_str());
1607
1608	{
1609	XML_Comment(string("automatically generated from ../")+inputfile+' ');
1610	}
1611
1612	create_top_links(uplinks, "UP");
1613	create_top_links(references, "SUB");
1614	create_top_links(auto_references, "SUB");
1615
1616	try {
1617	string titleText, subtitleText;
1618
1619	const Ostrings& T = title.Content();
1620	Ostrings::const_iterator s = T.begin();
1621
1622	if (s != T.end()) titleText = *s++;
1623
1624	bool subtitleAdded = false; // @@@ not needed! (use !subtitleText.empty())
1625	for (; s != T.end(); ++s) {
1626	if (s->get_type() != PLAIN_TEXT) {
1627	throw s->attached_message("wrong paragraph type (plain text expected)");
1628	}
1629	string text = s->as_string();
1630	if (!text.empty()) { // ignore empty lines
1631	text = eatWhitespace(text.c_str());
1632	if (!text.empty()) {
1633	if (subtitleAdded) throw s->attached_message("only one subtitle accepted");
1634
1635	text = remove_LF_and_indentation(text);
1636
1637	if (text.length()>MAX_SUBTITLE_CHARS) {
1638	s->attach_warning(strf("subtitle too verbose (max. %i chars allowed; found %zu)", MAX_SUBTITLE_CHARS, text.length()));
1639	}
1640	subtitleText = text;
1641	subtitleAdded = true; // accept only one line
1642	}
1643	}
1644	}
1645
1646	{
1647	XML_Tag title_tag("TITLE"); { XML_Text text(titleText); }
1648	}
1649	if (!subtitleText.empty()) {
1650	XML_Tag title_tag("SUBTITLE"); { XML_Text text(subtitleText); }
1651	}
1652
1653	}
1654	catch (string& err) { throw title.attached_message(err); }
1655	catch (const char *err) { throw title.attached_message(err); }
1656
1657	for (SectionList::const_iterator sec = sections.begin(); sec != sections.end(); ++sec) {
1658	try {
1659	XML_Tag section_tag("SECTION");
1660	section_tag.add_attribute("name", sec->getName());
1661
1662	ParagraphTree ptree = ParagraphTree::buildParagraphTree(sec);
1663
1664	#if defined(DEBUG)
1665	size_t textnodes = ptree->countTextNodes();
1666	#endif
1667	#if defined(DUMP_PARAGRAPHS)
1668	cout << "Dump of section '" << sec->getName() << "' (before format_lists):\n";
1669	ptree->dump(cout);
1670	cout << "----------------------------------------\n";
1671	#endif
1672
1673	ptree->format_lists();
1674
1675	#if defined(DUMP_PARAGRAPHS)
1676	cout << "Dump of section '" << sec->getName() << "' (after format_lists):\n";
1677	ptree->dump(cout);
1678	cout << "----------------------------------------\n";
1679	#endif
1680	#if defined(DEBUG)
1681	size_t textnodes2 = ptree->countTextNodes();
1682	h2x_assert(textnodes2 == textnodes); // if this occurs format_lists has an error
1683	#endif
1684
1685	ptree->format_indentations();
1686
1687	#if defined(DUMP_PARAGRAPHS)
1688	cout << "Dump of section '" << sec->getName() << "' (after format_indentations):\n";
1689	ptree->dump(cout);
1690	cout << "----------------------------------------\n";
1691	#endif
1692	#if defined(DEBUG)
1693	size_t textnodes3 = ptree->countTextNodes();
1694	h2x_assert(textnodes3 == textnodes2); // if this occurs format_indentations has an error
1695	#endif
1696
1697	ptree->xml_write();
1698
1699	delete ptree;
1700	}
1701	catch (string& err) { throw sec->attached_message(err); }
1702	catch (const char *err) { throw sec->attached_message(err); }
1703	}
1704	}
1705
1706	void Helpfile::extractInternalLinks() {
1707	for (SectionList::const_iterator sec = sections.begin(); sec != sections.end(); ++sec) {
1708	try {
1709	const Ostrings& s = sec->Content();
1710
1711	for (Ostrings::const_iterator li = s.begin(); li != s.end(); ++li) {
1712	const string& line = *li;
1713	size_t start = 0;
1714
1715	while (1) {
1716	size_t found = line.find("LINK{", start);
1717	if (found == string::npos) break;
1718	found += 5;
1719	size_t close = line.find('}', found);
1720	if (close == string::npos) break;
1721
1722	string link_target = line.substr(found, close-found);
1723
1724	if (link_target.find("http://") == string::npos &&
1725	link_target.find("https://")== string::npos &&
1726	link_target.find("ftp://") == string::npos &&
1727	link_target.find("file://") == string::npos &&
1728	link_target.find('@') == string::npos)
1729	{
1730	check_self_ref(link_target);
1731
1732	try {
1733	check_specific_duplicates(link_target, references, false); // check only sublinks here
1734	check_specific_duplicates(link_target, uplinks, false); // check only uplinks here
1735	check_specific_duplicates(link_target, auto_references, false); // check only sublinks here
1736
1737	// only auto-add inline reference if none of the above checks has thrown
1738	auto_references.push_back(Link(link_target, li->line_number()));
1739	}
1740	catch (string& err) {
1741	; // silently ignore inlined
1742	}
1743	}
1744	start = close+1;
1745	}
1746	}
1747	}
1748	catch (string& err) {
1749	throw sec->attached_message("'"+err+"' while scanning LINK{}");
1750	}
1751	}
1752	}
1753
1754	static void show_err(const string& err, size_t lineno, const string& helpfile) {
1755	if (err.find(helpfile+':') != string::npos) {
1756	cerr << err;
1757	}
1758	else if (lineno == NO_LINENUMBER_INFO) {
1759	cerr << helpfile << ":1: [in unknown line] " << err;
1760	}
1761	else {
1762	cerr << helpfile << ":" << lineno << ": " << err;
1763	}
1764	cerr << '\n';
1765	}
1766	inline void show_err(const LineAttachedMessage& line_err, const string& helpfile) {
1767	show_err(line_err.Message(), line_err.Lineno(), helpfile);
1768	}
1769	inline void show_warning(const LineAttachedMessage& line_err, const string& helpfile) {
1770	show_err(string("Warning: ")+line_err.Message(), line_err.Lineno(), helpfile);
1771	}
1772	inline void show_warnings(const string& helpfile) {
1773	for (list<LineAttachedMessage>::const_iterator wi = warnings.begin(); wi != warnings.end(); ++wi) {
1774	show_warning(*wi, helpfile);
1775	}
1776	}
1777	static void show_error_and_warnings(const LineAttachedMessage& error, const string& helpfile) {
1778	show_err(error, helpfile);
1779	show_warnings(helpfile);
1780	}
1781
1782	int ARB_main(int argc, char *argv[]) {
1783	if (argc != 3) {
1784	cerr << "Usage: arb_help2xml <ARB helpfile> <XML output>\n";
1785	return EXIT_FAILURE;
1786	}
1787
1788	Helpfile help;
1789	string arb_help;
1790
1791	try {
1792	try {
1793	arb_help = argv[1];
1794	string xml_output = argv[2];
1795
1796	{
1797	ifstream in(arb_help.c_str());
1798	help.readHelp(in, arb_help);
1799	}
1800
1801	help.extractInternalLinks();
1802
1803	{
1804	FILE *out = std::fopen(xml_output.c_str(), "wt");
1805	if (!out) throw string("Can't open '")+xml_output+'\'';
1806
1807	try {
1808	// arb_help contains 'source/name.hlp'
1809	size_t slash = arb_help.find('/');
1810	size_t dot = arb_help.find_last_of('.');
1811
1812	if (slash == string::npos \|\| dot == string::npos) {
1813	throw string("parameter <ARB helpfile> has to be in format 'source/name.hlp' (not '"+arb_help+"')");
1814	}
1815
1816	string page_name(arb_help, slash+1, dot-slash-1);
1817	help.writeXML(out, page_name);
1818	fclose(out);
1819	}
1820	catch (...) {
1821	fclose(out);
1822	remove(xml_output.c_str());
1823	throw;
1824	}
1825	}
1826
1827	show_warnings(arb_help);
1828
1829	return EXIT_SUCCESS;
1830	}
1831	catch (string& err) { throw unattached_message(err); }
1832	catch (const char * err) { throw unattached_message(err); }
1833	catch (LineAttachedMessage& err) { throw; }
1834	catch (...) { throw unattached_message("unknown exception in arb_help2xml"); }
1835	}
1836	catch (LineAttachedMessage& err) { show_error_and_warnings(err, arb_help); }
1837	catch (...) { h2x_assert(0); }
1838
1839	return EXIT_FAILURE;
1840	}
1841
1842	// --------------------------------------------------------------------------------
1843
1844	#ifdef UNIT_TESTS
1845	#include <test_unit.h>
1846	#include <arb_msg.h>
1847	#include <arb_file.h>
1848
1849	// Hint: you may set ONLY_DO_UNITTEST = 1 to speed up code/test-cycle
1850	// see ./Makefile@ONLY_DO_UNITTEST
1851
// Expects that remove_LF_and_indentation(i) results in 'want'.
1852	#define TEST_REMOVE_LF_AND_INDENTATION(i,want) TEST_EXPECT_EQUAL(remove_LF_and_indentation(i).c_str(), want)
// Same check via TEST_EXPECT_EQUAL__BROKEN, passing 'got' in addition
// (NOTE(review): presumably 'got' is the currently produced wrong result - confirm in test_unit.h).
1853	#define TEST_REMOVE_LF_AND_INDENTATION__BROKEN(i,want,got) TEST_EXPECT_EQUAL__BROKEN(remove_LF_and_indentation(i).c_str(), want, got)
1854
// Unit test for remove_LF_and_indentation():
// empty text, text consisting of spaces/linefeeds only, simple line joins,
// mixed spaces around linefeeds and a real-life two-line title.
1855	void TEST_remove_LF_and_indentation() {
1856	TEST_REMOVE_LF_AND_INDENTATION("",
1857	"");
1858
1859	TEST_REMOVE_LF_AND_INDENTATION(" \n \n \n ",
1860	"");
1861	TEST_REMOVE_LF_AND_INDENTATION("hello\nNewline",
1862	"hello Newline");
1863	TEST_REMOVE_LF_AND_INDENTATION("hello\nNewline\n 1\n2 \n 3 \n4\n5\n 6 \n 7 \n 8\n",
1864	"hello Newline 1 2 3 4 5 6 7 8");
1865
1866	TEST_REMOVE_LF_AND_INDENTATION("Visualization of Three-dimensional\n structure of small subunit (16S) rRNA",
1867	"Visualization of Three-dimensional structure of small subunit (16S) rRNA");
1868	}
1869
1870	static arb_test::match_expectation help_file_compiles(const char helpfile, const char expected_title, const char *expected_error_part) {
1871	using namespace arb_test;
1872	expectation_group expected;
1873
1874	ifstream in(helpfile);
1875
1876	LineAttachedMessage *error = NULp;
1877
1878	Helpfile help;
1879	try {
1880	help.readHelp(in, helpfile);
1881	help.extractInternalLinks();
1882
1883	FILE *devnul = fopen("/dev/null", "wt");
1884	if (!devnul) throw unattached_message("can't write to null device");
1885	help.writeXML(devnul, "dummy");
1886	fclose(devnul);
1887	}
1888	catch (LineAttachedMessage& err) { error = new LineAttachedMessage(err); }
1889	catch (...) { error = new LineAttachedMessage(unattached_message("unknown exception")); }
1890
1891	if (expected_error_part) {
1892	expected.add(that(error).does_differ_from_NULL());
1893	if (error) expected.add(that(error->Message()).does_contain(expected_error_part));
1894	}
1895	else {
1896	expected.add(that(error).is_equal_to_NULL());
1897	if (!error) {
1898	Section title = help.get_title();
1899	const Ostrings& title_strings = title.Content();
1900
1901	expected.add(that(title_strings.front().as_string()).is_equal_to(expected_title));
1902	expected.add(that(title_strings.size()).is_equal_to(1));
1903	}
1904	else {
1905	show_error_and_warnings(*error, helpfile);
1906	}
1907	}
1908
1909	delete error;
1910
1911	return all().ofgroup(expected);
1912	}
1913
// Expects that helpfile 'name' converts without error and has the title 'expTitle'.
1914	#define HELP_FILE_COMPILES(name,expTitle) TEST_EXPECTATION(help_file_compiles(name,expTitle,NULp))
// Expects that converting helpfile 'name' fails with an error message containing 'expError'.
1915	#define HELP_FILE_COMPILE_ERROR(name,expError) TEST_EXPECTATION(help_file_compiles(name,NULp,expError))
1916
// Unit test: converts some existing helpfiles (checking their titles)
// and checks the error reported for a non-existing helpfile.
// Changes the working directory to ../../HELP_SOURCE first (the helpfile names are relative to it).
1917	void TEST_hlp2xml_conversion() {
1918	TEST_EXPECT_ZERO(chdir("../../HELP_SOURCE"));
1919
1920	HELP_FILE_COMPILES("genhelp/agde_treepuzzle.hlp", "treepuzzle"); // genhelp/agde_treepuzzle.hlp
1921
1922	HELP_FILE_COMPILES("source/markbyref.hlp", "Mark by reference"); // source/markbyref.hlp
1923	HELP_FILE_COMPILES("source/ad_align.hlp", "Alignment Administration"); // source/ad_align.hlp
1924	HELP_FILE_COMPILES("genhelp/copyright.hlp", "Copyrights and licenses"); // genhelp/copyright.hlp
1925
1926	// @@@ add test for helpfile with subtitle
1927
1928	HELP_FILE_COMPILE_ERROR("akjsdlkad.hlp", "Can't read from"); // no such file
1929	}
1930	TEST_PUBLISH(TEST_hlp2xml_conversion);
1931
1932
1933	// #define TEST_AUTO_UPDATE // uncomment to update expected xml // @@@ comment-out!
1934
// Unit test: compares the generated xml/html/hlp files of the tested helpfiles
// with the expected files stored in directory 'help/'.
// If TEST_AUTO_UPDATE is defined, the generated files are copied over the expected files instead.
1935	void TEST_hlp2xml_output() {
1936	string tested_helpfile[] = {
1937	"unittest"
1938	};
1939
1940	string HELP_SOURCE = "../../HELP_SOURCE/";
1941	string LIB = "../../lib/";
1942	string EXPECTED = "help/";
1943
1944	for (size_t i = 0; i<ARRAY_ELEMS(tested_helpfile); ++i) {
// generated files:
1945	string xml = HELP_SOURCE + "Xml/" + tested_helpfile[i] + ".xml";
1946	string html = LIB + "help_html/" + tested_helpfile[i] + ".html";
1947	string hlp = LIB + "help/" + tested_helpfile[i] + ".hlp";
1948
// expected files:
1949	string xml_expected = EXPECTED + tested_helpfile[i] + ".xml";
1950	string html_expected = EXPECTED + tested_helpfile[i] + ".html";
1951	string hlp_expected = EXPECTED + tested_helpfile[i] + ".hlp";
1952
1953
1954	#if defined(TEST_AUTO_UPDATE)
1955	# if defined(NDEBUG)
1956	# error please use auto-update only in DEBUG mode
1957	# endif
1958	TEST_COPY_FILE(xml.c_str(), xml_expected.c_str());
1959	TEST_COPY_FILE(html.c_str(), html_expected.c_str());
1960	TEST_COPY_FILE(hlp.c_str(), hlp_expected.c_str());
1961
1962	#else // !defined(TEST_AUTO_UPDATE)
1963
// the expected files match the DEBUG build; a non-DEBUG build is allowed to differ in a fixed number of lines
1964	# if defined(DEBUG)
1965	int expected_xml_difflines = 0;
1966	int expected_hlp_difflines = 0;
1967	# else // !defined(DEBUG)
1968	int expected_xml_difflines = 1; // value of "edit_warning" differs - see .@edit_warning
1969	int expected_hlp_difflines = 2; // resulting warning in helpfile
1970	# endif
1971	TEST_EXPECT_TEXTFILE_DIFFLINES(xml.c_str(), xml_expected.c_str(), expected_xml_difflines);
1972	TEST_EXPECT_TEXTFILE_DIFFLINES_IGNORE_DATES(html.c_str(), html_expected.c_str(), 0); // html contains the update-date
1973	TEST_EXPECT_TEXTFILE_DIFFLINES(hlp.c_str(), hlp_expected.c_str(), expected_hlp_difflines);
1974	#endif
1975	}
1976	}
1977
1978
1979	#if defined(PROTECT_HELP_VS_CHANGES)
1980	void TEST_protect_help_vs_changes() { // should normally be disabled
1981	// fails if help changes compared to another checkout
1982	// or just updates the diff w/o failing (if you comment out the last line)
1983	//
1984	// if the patch is hugo and you load it into xemacs
1985	// you might want to (turn-on-lazy-shot)
1986	//
1987	// patch-pointer: ../UNIT_TESTER/run/help_changes.patch
1988
1989	bool do_help = true;
1990	bool do_html = true;
1991
1992	const char *ref_WC = "ARB.help.ref";
1993
1994	// ---------------------------------------- config above
1995
1996	string this_base = "../..";
1997	string ref_base = this_base+"/../"+ref_WC;
1998	string to_help = "/lib/help";
1999	string to_html = "/lib/help_html";
2000	string diff_help = "diff -u "+ref_base+to_help+" "+this_base+to_help;
2001	string diff_html = "diff -u "+ref_base+to_html+" "+this_base+to_html;
2002
2003	string update_cmd;
2004
2005	if (do_help) {
2006	if (do_html) update_cmd = string("(")+diff_help+";"+diff_html+")";
2007	else update_cmd = diff_help;
2008	}
2009	else if (do_html) update_cmd = diff_html;
2010
2011	string patch = "help_changes.patch";
2012	update_cmd += " >"+patch+" \|\|true";
2013
2014	string fail_on_change_cmd = "test \"`cat "+patch+" \| grep -v '^Common subdirectories' \| wc -l`\" = \"0\" \|\| ( echo \"Error: Help changed\"; false)";
2015
2016	TEST_EXPECT_NO_ERROR(GBK_system(update_cmd.c_str()));
2017	TEST_EXPECT_NO_ERROR(GBK_system(fail_on_change_cmd.c_str())); // @@@ uncomment before commit
2018	}
2019	#endif
2020
2021	#endif // UNIT_TESTS

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: branches/help/HELP_SOURCE/arb_help2xml.cxx

Download in other formats: