Context Navigation

source: tags/ms_r17q2/ARBDB/adoptimize.cxx

Visit:

Last change on this file was 15640, checked in by westram, 7 years ago
reintegrates 'db' into 'trunk' drops support for database type `GB_LINK` adds: log:branches/db@15628:15639
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 94.4 KB

Line
1	// =============================================================== //
2	// //
3	// File : adoptimize.cxx //
4	// Purpose : //
5	// //
6	// Institute of Microbiology (Technical University Munich) //
7	// http://www.arb-home.de/ //
8	// //
9	// =============================================================== //
10
11	#include <climits>
12	#include <netinet/in.h>
13
14	#include <arb_file.h>
15	#include <arb_diff.h>
16
17	#include <arbdbt.h>
18
19	#include "gb_key.h"
20	#include "gb_compress.h"
21	#include "gb_dict.h"
22
23	#include "arb_progress.h"
24
25	#if defined(DEBUG)
26	// #define TEST_DICT
27	#endif // DEBUG
28
29	typedef unsigned char unsigned_char;
30	typedef unsigned char *u_str;
31	typedef const unsigned char *cu_str;
32
33	static int gbdByKey_cnt;
34	struct O_gbdByKey { // one for each diff. keyQuark
35	int cnt;
36	GBDATA **gbds; // gbdoff
37	};
38
39	struct FullDictTree;
40	struct SingleDictTree;
41
42	union DictTree {
43	FullDictTree *full;
44	SingleDictTree *single;
45	void *exists;
46
47	};
48
49	enum DictNodeType { SINGLE_NODE, FULL_NODE };
50
51	struct FullDictTree {
52	DictNodeType typ; // always FULL_NODE
53	int usedSons;
54	int count[256];
55	DictTree son[256]; // index == character
56	};
57
58	struct SingleDictTree {
59	DictNodeType typ; // always SINGLE_NODE
60	unsigned_char ch; // the character
61	int count; // no of occurrences of this branch
62	DictTree son;
63	DictTree brother;
64
65	};
66
67	// **************************************************
68
69	#define COMPRESSIBLE(type) ((type) >= GB_BYTES && (type)<=GB_STRING)
70	#define DICT_MEM_WEIGHT 4
71
72	#define WORD_HELPFUL(wordlen, occurrences) ((long)((occurrences)3 + DICT_MEM_WEIGHT(2*sizeof(GB_NINT)+(wordlen))) \
73	< \
74	(long)((occurrences)*(wordlen)))
75	/* (occurrences)*4 compressed size
76	* 2*sizeof(GB_NINT)+(wordlen) size in dictionary
77	* (occurrences)*(wordlen) uncompressed size
78	*/
79
80	// **************************************************
81
82	#define MIN_WORD_LEN 8 // minimum length of words in dictionary
83	#define MAX_WORD_LEN 50 // maximum length of words in dictionary
84	#define MAX_BROTHERS 10 /* maximum no of brothers linked with SingleDictTree
85	* above we use FullDictTree */
86	#define MAX_DIFFER 2 /* percentage of difference (of occurrences of strings) below which two
87	* consecutive parts are treated as EQUAL # of occurrences */
88	#define INCR_DIFFER 1 // the above percentage is incremented from 0 to MAX_DIFFER by INCR_DIFFER per step
89
90	#define DICT_STRING_INCR 1024 // dictionary string will be incremented by this size
91
92	// ***************** Tool functions ****************
93
94	inline cu_str get_data_n_size(GBDATA gbd, size_t size) {
95	GB_CSTR data;
96	*size = 0;
97
98	switch (gbd->type()) {
99	case GB_STRING: data = GB_read_char_pntr(gbd); break;
100	case GB_BYTES: data = GB_read_bytes_pntr(gbd); break;
101	case GB_INTS: data = (char*)GB_read_ints_pntr(gbd); break;
102	case GB_FLOATS: data = (char*)GB_read_floats_pntr(gbd); break;
103	default:
104	data = 0;
105	gb_assert(0);
106	break;
107	}
108
109	if (data) *size = gbd->as_entry()->uncompressed_size();
110	return (cu_str)data;
111	}
112
113	static inline long min(long a, long b) {
114	return a<b ? a : b;
115	}
116
117	// **************************************************
118
119	static void g_b_opti_scanGbdByKey(GB_MAIN_TYPE Main, GBDATA gbd, O_gbdByKey *gbk) {
120	if (gbd->is_container()) {
121	GBCONTAINER *gbc = gbd->as_container();
122	for (int idx=0; idx < gbc->d.nheader; idx++) {
123	GBDATA *gbd2 = GBCONTAINER_ELEM(gbc, idx);
124	if (gbd2) g_b_opti_scanGbdByKey(Main, gbd2, gbk);
125	}
126	}
127
128	GBQUARK quark = GB_KEY_QUARK(gbd);
129	if (quark)
130	{
131	gb_assert(gbk[quark].cnt < Main->keys[quark].nref \|\| quark==0);
132	gb_assert(gbk[quark].gbds != 0);
133
134	gbk[quark].gbds[gbk[quark].cnt] = gbd;
135	gbk[quark].cnt++;
136	}
137	}
138
139	static O_gbdByKey g_b_opti_createGbdByKey(GB_MAIN_TYPE Main)
140	{
141	O_gbdByKey *gbk = ARB_calloc<O_gbdByKey>(Main->keycnt);
142
143	gbdByKey_cnt = Main->keycnt; // always use gbdByKey_cnt instead of Main->keycnt cause Main->keycnt can change
144
145	for (int idx=1; idx<gbdByKey_cnt; idx++) {
146	gbk[idx].cnt = 0;
147
148	gb_Key& KEY = Main->keys[idx];
149	if (KEY.key && KEY.nref>0) {
150	ARB_calloc(gbk[idx].gbds, KEY.nref);
151	}
152	else {
153	gbk[idx].gbds = NULL;
154	}
155	}
156
157	gbk[0].cnt = 0;
158	ARB_calloc(gbk[0].gbds, 1);
159
160	g_b_opti_scanGbdByKey(Main, Main->gb_main(), gbk);
161
162	for (int idx=0; idx<gbdByKey_cnt; idx++) {
163	if (gbk[idx].cnt != Main->keys[idx].nref && idx)
164	{
165	printf("idx=%i gbk[idx].cnt=%i Main->keys[idx].nref=%li\n", // Main->keys[].nref ist falsch
166	idx, gbk[idx].cnt, Main->keys[idx].nref);
167
168	Main->keys[idx].nref = gbk[idx].cnt;
169	}
170	}
171	return gbk;
172	}
173
174	static void g_b_opti_freeGbdByKey(O_gbdByKey *gbk) {
175	for (int idx=0; idx<gbdByKey_cnt; idx++) free(gbk[idx].gbds);
176	free(gbk);
177	}
178
179	// ***************** Convert old compression style to new style ****************
180
181	static GB_ERROR gb_convert_compression(GBDATA *gbd) {
182	GB_ERROR error = 0;
183
184	if (gbd->is_container()) {
185	for (GBDATA *gb_child = GB_child(gbd); gb_child; gb_child = GB_nextChild(gb_child)) {
186	if (gb_child->flags.compressed_data \|\| gb_child->is_container()) {
187	error = gb_convert_compression(gb_child);
188	if (error) break;
189	}
190	}
191	}
192	else {
193	char *str = 0;
194	GBENTRY *gbe = gbd->as_entry();
195	long elems = gbe->size();
196	size_t data_size = gbe->uncompressed_size();
197	size_t new_size = -1;
198	int expectData = 1;
199
200	switch (gbd->type()) {
201	case GB_STRING:
202	case GB_OBSOLETE:
203	case GB_BYTES:
204	str = gb_uncompress_bytes(gbe->data(), data_size, &new_size);
205	if (str) {
206	gb_assert(new_size == data_size);
207	str = GB_memdup(str, data_size);
208	}
209	break;
210
211	case GB_INTS:
212	case GB_FLOATS:
213	str = gb_uncompress_longs_old(gbe->data(), elems, &new_size);
214	if (str) {
215	gb_assert(new_size == data_size);
216	str = GB_memdup(str, data_size);
217	}
218	break;
219
220	default:
221	expectData = 0;
222	break;
223	}
224
225	if (!str) {
226	if (expectData) {
227	error = GBS_global_string("Can't read old data to convert compression (Reason: %s)", GB_await_error());
228	}
229	}
230	else {
231	switch (gbd->type()) {
232	case GB_STRING:
233	error = GB_write_string(gbe, "");
234	if (!error) error = GB_write_string(gbe, str);
235	break;
236
237	case GB_OBSOLETE:
238	error = GB_set_temporary(gbe); // exclude obsolete type from next save
239	break;
240
241	case GB_BYTES:
242	error = GB_write_bytes(gbe, "", 0);
243	if (!error) error = GB_write_bytes(gbe, str, data_size);
244	break;
245
246	case GB_INTS:
247	case GB_FLOATS:
248	error = GB_write_pntr(gbe, str, data_size, elems);
249	break;
250
251	default:
252	gb_assert(0);
253	break;
254	}
255
256	free(str);
257	}
258	}
259	return error;
260	}
261
262	GB_ERROR gb_convert_V2_to_V3(GBDATA *gb_main) {
263	GB_ERROR error = 0;
264	GBDATA *gb_system = GB_search(gb_main, GB_SYSTEM_FOLDER, GB_FIND);
265
266	if (!gb_system) {
267	GB_create_container(gb_main, GB_SYSTEM_FOLDER);
268	if (GB_entry(gb_main, "extended_data")) {
269	GB_warning("Converting data from old V2.0 to V2.1 Format:\n"
270	" Please Wait (may take some time)");
271	}
272	error = gb_convert_compression(gb_main);
273	GB_disable_quicksave(gb_main, "Database converted to new format");
274	}
275	return error;
276	}
277
278
279	// ******************* Compress by dictionary ******************
280
281
282	/* compression tag format:
283	*
284	* unsigned int compressed:1;
285	* if compressed==0:
286	* unsigned int last:1; ==1 -> this is the last block
287	* unsigned int len:6; length of uncompressible bytes
288	* char[len];
289	* if compressed==1:
290	* unsigned int idxlen:1; ==0 -> 10-bit index
291	* ==1 -> 18-bit index
292	* unsigned int idxhigh:2; the 2 highest bits of the index
293	* unsigned int len:4; (length of word) - (MIN_COMPR_WORD_LEN-1)
294	* if len==0:
295	* char extralen; (length of word) -
296	* char[idxlen+1]; index (low,high)
297	*
298	* tag == 64 -> end of dictionary compressed block (if not coded in last uncompressed block)
299	*/
300
301	inline int INDEX_DICT_OFFSET(int idx, GB_DICTIONARY *dict) {
302	gb_assert(idx<dict->words);
303	return ntohl(dict->offsets[idx]);
304	}
305	inline int ALPHA_DICT_OFFSET(int idx, GB_DICTIONARY *dict) {
306	int realIndex;
307	gb_assert(idx<dict->words);
308	realIndex = ntohl(dict->resort[idx]);
309	return INDEX_DICT_OFFSET(realIndex, dict);
310	}
311
312	// #define ALPHA_DICT_OFFSET(i) ntohl(offset[ntohl(resort[i])])
313	// #define INDEX_DICT_OFFSET(i) ntohl(offset[i])
314
315	#define LEN_BITS 4
316	#define INDEX_BITS 2
317	#define INDEX_LEN_BITS 1
318
319	#define LEN_SHIFT 0
320	#define INDEX_SHIFT (LEN_SHIFT+LEN_BITS)
321	#define INDEX_LEN_SHIFT (INDEX_SHIFT+INDEX_BITS)
322
323	#define BITMASK(bits) ((1<<(bits))-1)
324	#define GETVAL(tag,typ) (((tag)>>typ##_SHIFT)&BITMASK(typ##_BITS))
325
326	#define MIN_SHORTLEN 6
327	#define MAX_SHORTLEN (BITMASK(LEN_BITS)+MIN_SHORTLEN-1)
328	#define MIN_LONGLEN (MAX_SHORTLEN+1)
329	#define MAX_LONGLEN (MIN_LONGLEN+255)
330
331	#define SHORTLEN_DECR (MIN_SHORTLEN-1) // !! zero is used as flag for long len !!
332	#define LONGLEN_DECR MIN_LONGLEN
333
334	#define MIN_COMPR_WORD_LEN MIN_SHORTLEN
335	#define MAX_COMPR_WORD_LEN MAX_LONGLEN
336
337	#define MAX_SHORT_INDEX BITMASK(INDEX_BITS+8)
338	#define MAX_LONG_INDEX BITMASK(INDEX_BITS+16)
339
340	#define LAST_COMPRESSED_BIT 64
341
342	#ifdef DEBUG
343	# define DUMP_COMPRESSION_TEST 0
344	/* 0 = only compression ratio
345	* 1 = + original/compressed/decompressed
346	* 2 = + words used to compress/uncompress
347	* 3 = + matching words in dictionary
348	* 4 = + search of words in dictionary
349	*/
350	#else
351	# define DUMP_COMPRESSION_TEST 0
352	#endif
353
354	#ifdef DEBUG
355	// #define COUNT_CHUNKS
356
357	#if defined(COUNT_CHUNKS)
358
359	static long uncompressedBlocks[64];
360	static long compressedBlocks[MAX_LONGLEN];
361
362	static void clearChunkCounters() {
363	int i;
364
365	for (i=0; i<64; i++) uncompressedBlocks[i] = 0;
366	for (i=0; i<MAX_LONGLEN; i++) compressedBlocks[i] = 0;
367	}
368
369	static void dumpChunkCounters() {
370	int i;
371
372	printf("------------------------------\n" "Uncompressed blocks used:\n");
373	for (i=0; i<64; i++) if (uncompressedBlocks[i]) printf(" size=%i used=%li\n", i, uncompressedBlocks[i]);
374	printf("------------------------------\n" "Words used:\n");
375	for (i=0; i<MAX_LONGLEN; i++) if (compressedBlocks[i]) printf(" size=%i used=%li\n", i, compressedBlocks[i]);
376	printf("------------------------------\n");
377	}
378	#endif // COUNT_CHUNKS
379
380	static cu_str lstr(cu_str s, int len) {
381	#define BUFLEN 20000
382	static unsigned_char buf[BUFLEN];
383
384	gb_assert(len<BUFLEN);
385	memcpy(buf, s, len);
386	buf[len] = 0;
387
388	return buf;
389	}
390
391	#if DUMP_COMPRESSION_TEST>=2
392
393	static cu_str dict_word(GB_DICTIONARY *dict, int idx, int len) {
394	return lstr(dict->text+INDEX_DICT_OFFSET(idx, dict), len);
395	}
396
397	#endif
398
399	#if DUMP_COMPRESSION_TEST>=1
400
401	static void dumpBinary(u_str data, long size) {
402	#define PER_LINE 12
403	int cnt = 0;
404
405	while (size--) {
406	unsigned_char c = *data++;
407	int bitval = 128;
408	int bits = 8;
409
410	while (bits--) {
411	putchar(c&bitval ? '1' : '0');
412	bitval>>=1;
413	}
414	putchar(' ');
415
416	cnt = (cnt+1)%PER_LINE;
417	if (!cnt) putchar('\n');
418	}
419
420	if (cnt) putchar('\n');
421	}
422
423	#endif
424
425	#endif
426
427	inline int GB_MEMCMP(const void vm1, const void vm2, long size) {
428	char c1 = (char)vm1,
429	c2 = (char)vm2;
430	int diff = 0;
431
432	while (size-- && !diff) diff = c1++-c2++;
433	return diff;
434	}
435
436	// --------------------------------------------------
437
438	static int searchWord(GB_DICTIONARY dict, cu_str source, long size, unsigned long wordIndex, int *wordLen)
439	{
440	int idx = -1;
441	int l = 0;
442	int h = dict->words-1;
443	cu_str text = dict->text;
444	GB_NINT *resort = dict->resort;
445	int dsize = dict->textlen;
446	int ilen = 0;
447
448	while (l<h-1) {
449	int m = (l+h)/2;
450	long off = ALPHA_DICT_OFFSET(m, dict);
451	cu_str dictword = text+off;
452	long msize = min(size, dsize-off);
453
454	#if DUMP_COMPRESSION_TEST>=4
455	printf(" %s (%i)\n", lstr(dictword, 20), m);
456	#endif
457
458	if (GB_MEMCMP(source, dictword, msize)<=0) h = m;
459	else l = m;
460	}
461
462	while (l<=h) {
463	int off = ALPHA_DICT_OFFSET(l, dict);
464	cu_str word = text+off;
465	int msize = (int)min(size, dsize-off);
466	int equal = 0;
467	cu_str s = source;
468
469	while (msize-- && s++==word++) equal++;
470
471	#if DUMP_COMPRESSION_TEST>=3
472	if (equal>=MIN_COMPR_WORD_LEN) {
473	printf(" EQUAL=%i '%s' (%i->%i, off=%i)", equal, lstr(text+off, equal), l, ntohl(resort[l]), ALPHA_DICT_OFFSET(l, dict));
474	printf(" (context=%s)\n", lstr(text+off-min(off, 20), min(off, 20)+equal+20));
475	}
476	#endif
477
478	if (equal>ilen) {
479	ilen = equal;
480	idx = ntohl(resort[l]);
481	gb_assert(idx<dict->words);
482	}
483
484	l++;
485	}
486
487	*wordIndex = idx;
488	*wordLen = (int)min(ilen, MAX_COMPR_WORD_LEN);
489
490	return idx!=-1 && ilen>=MIN_COMPR_WORD_LEN;
491	}
492
493	#ifdef DEBUG
494	int lookup_DICTIONARY(GB_DICTIONARY *dict, GB_CSTR source) { // used for debugging
495	unsigned long wordIndex;
496	int wordLen;
497	int wordFound = searchWord(dict, (cu_str)source, strlen(source), &wordIndex, &wordLen);
498
499	if (wordFound) {
500	printf("'%s' (idx=%lu, off=%i)\n", lstr(dict->text+ntohl(dict->offsets[wordIndex]), wordLen), wordIndex, ntohl(dict->offsets[wordIndex]));
501	}
502
503	return wordFound;
504	}
505	#endif
506
507
508
509	static char gb_uncompress_by_dictionary_internal(GB_DICTIONARY dict, /* GBDATA gbd, / GB_CSTR s_source, const size_t size, bool append_zero, size_t *new_size) {
510	cu_str source = (cu_str)s_source;
511	u_str dest;
512	u_str buffer;
513	cu_str text = dict->text;
514	int done = 0;
515	long left = size;
516
517	dest = buffer = (u_str)GB_give_other_buffer(s_source, size+2);
518
519	while (left && !done) {
520	int c;
521
522	if ((c=*source++)&128) { // compressed data
523	int indexLen = GETVAL(c, INDEX_LEN);
524	unsigned long idx = GETVAL(c, INDEX);
525
526	c = GETVAL(c, LEN); // ==wordLen
527	if (c) c += SHORTLEN_DECR;
528	else c = *source+++LONGLEN_DECR;
529
530	gb_assert(indexLen>=0 && indexLen<=1);
531
532	if (indexLen==0) {
533	idx = (idx << 8) \| *source++;
534	}
535	else {
536	idx = (((idx << 8) \| source[1]) << 8) \| source[0];
537	source += 2;
538	}
539
540	gb_assert(idx<(GB_ULONG)dict->words);
541
542	{
543	cu_str word = text+INDEX_DICT_OFFSET(idx, dict);
544
545	#if DUMP_COMPRESSION_TEST>=2
546	printf(" word='%s' (idx=%lu, off=%li, len=%i)\n",
547	lstr(word, c), idx, (long)ntohl(dict->offsets[idx]), c);
548	#endif
549
550	{
551	u_str d = dest;
552	gb_assert(((d + c) <= word) \|\| (d >= (word + c)));
553	while (c--) d++ = word++;
554	dest = d;
555	}
556	}
557	}
558	else { // uncompressed bytes
559	if (c & LAST_COMPRESSED_BIT) {
560	done = 1;
561	c ^= LAST_COMPRESSED_BIT;
562	}
563
564	left -= c;
565	{
566	u_str d = dest;
567	gb_assert(((d + c) <= source) \|\| (d >= (source + c)));
568	while (c--) d++ = source++;
569	dest=d;
570	}
571	}
572	}
573
574	if (append_zero) *dest++ = 0;
575
576	*new_size = dest-buffer;
577	gb_assert(size >= *new_size); // buffer overflow
578
579	return (char *)buffer;
580	}
581
582	char gb_uncompress_by_dictionary(GBDATA gbd, GB_CSTR s_source, size_t size, size_t *new_size)
583	{
584	GB_DICTIONARY *dict = gb_get_dictionary(GB_MAIN(gbd), GB_KEY_QUARK(gbd));
585	bool append_zero = gbd->is_a_string();
586
587	if (!dict) {
588	GB_ERROR error = GBS_global_string("Cannot decompress db-entry '%s' (no dictionary found)\n", GB_get_db_path(gbd));
589	GB_export_error(error);
590	return 0;
591	}
592
593	return gb_uncompress_by_dictionary_internal(dict, s_source, size, append_zero, new_size);
594	}
595
596	char gb_compress_by_dictionary(GB_DICTIONARY dict, GB_CSTR s_source, size_t size, size_t *msize, int last_flag, int search_backward, int search_forward)
597	{
598	cu_str source = (cu_str)s_source;
599	u_str dest;
600	u_str buffer;
601	cu_str unknown = source; // start of uncompressible bytes
602	u_str lastUncompressed = NULL; // ptr to start of last block of uncompressible bytes (in dest)
603
604	#if defined(ASSERTION_USED)
605	const size_t org_size = size;
606	#endif // ASSERTION_USED
607
608	gb_assert(size>0); // compression of zero-length data fails!
609
610	dest = buffer = (u_str)GB_give_other_buffer((GB_CSTR)source, 1+(size/63+1)+size);
611	*dest++ = GB_COMPRESSION_DICTIONARY \| last_flag;
612
613	while (size) {
614	unsigned long wordIndex;
615	int wordLen;
616	int wordFound;
617
618	if ((wordFound = searchWord(dict, source, size, &wordIndex, &wordLen))) {
619	int length;
620
621	takeRest :
622	length = source-unknown;
623
624	if (length) {
625	int shift;
626	int takeShift = 0;
627	int maxShift = (int)min(search_forward, wordLen-1);
628
629	for (shift=1; shift<=maxShift; shift++) {
630	unsigned long wordIndex2;
631	int wordLen2;
632	int wordFound2;
633
634	if ((wordFound2 = searchWord(dict, source+shift, size-shift, &wordIndex2, &wordLen2))) {
635	if (wordLen2>(wordLen+shift)) {
636	wordIndex = wordIndex2;
637	wordLen = wordLen2;
638	takeShift = shift;
639	}
640	}
641	}
642
643	if (takeShift) {
644	source += takeShift;
645	size -= takeShift;
646	length = source-unknown;
647	}
648	}
649
650	while (length) { // if there were uncompressible bytes
651	int take = (int)min(length, 63);
652
653	#ifdef COUNT_CHUNKS
654	uncompressedBlocks[take]++;
655	#endif
656
657	lastUncompressed = dest;
658
659	*dest++ = take; // tag byte
660	memcpy(dest, unknown, take);
661	dest += take;
662	unknown += take;
663	length -= take;
664	}
665
666	gb_assert(unknown==source);
667
668	while (wordFound) { // as long as we find words in dictionary
669	int indexLen = wordIndex>MAX_SHORT_INDEX;
670	int indexHighBits = indexLen==0 ? wordIndex>>8 : wordIndex>>16;
671	int nextWordFound;
672	int nextWordLen;
673	unsigned long nextWordIndex;
674
675	gb_assert((long)wordIndex<dict->words);
676	gb_assert((long)wordIndex <= MAX_LONG_INDEX);
677	gb_assert(indexHighBits==(indexHighBits & BITMASK(INDEX_BITS)));
678	gb_assert(wordLen>=MIN_SHORTLEN);
679
680	lastUncompressed = NULL;
681
682	{
683	cu_str source2 = source+wordLen;
684	long size2 = size-wordLen;
685
686	if (!(nextWordFound=searchWord(dict, source+wordLen, size-wordLen, &nextWordIndex, &nextWordLen))) { // no word right afterwards
687	int shift;
688
689	for (shift=1; shift<=search_backward && shift<(wordLen-MIN_COMPR_WORD_LEN); shift++) {
690	// try to cut end of word to get a better result
691	unsigned long wordIndex2;
692	int wordLen2;
693	int wordFound2;
694
695	if ((wordFound2=searchWord(dict, source2-shift, size2+shift, &wordIndex2, &wordLen2))) {
696	if (wordLen2>(shift+1)) {
697	wordLen -= shift;
698
699	nextWordFound = 1;
700	nextWordIndex = wordIndex2;
701	nextWordLen = wordLen2;
702	break;
703	}
704	}
705	}
706	}
707	}
708
709	#ifdef COUNT_CHUNKS
710	compressedBlocks[wordLen]++;
711	#endif
712
713	#if DUMP_COMPRESSION_TEST>=2
714	printf(" word='%s' (idx=%li, off=%i, len=%i)\n",
715	dict_word(dict, wordIndex, wordLen), wordIndex, (int)ntohl(dict->offsets[wordIndex]), wordLen);
716	#endif
717
718	if (wordLen<=MAX_SHORTLEN) {
719	*dest++ = 128 \|
720	(indexLen << INDEX_LEN_SHIFT) \|
721	(indexHighBits << INDEX_SHIFT) \|
722	((wordLen-SHORTLEN_DECR) << LEN_SHIFT);
723	}
724	else {
725	*dest++ = 128 \|
726	(indexLen << INDEX_LEN_SHIFT) \|
727	(indexHighBits << INDEX_SHIFT);
728	*dest++ = wordLen-LONGLEN_DECR; // extra length byte
729	}
730
731	*dest++ = (char)wordIndex; // low index byte
732	if (indexLen)
733	*dest++ = (char)(wordIndex >> 8); // high index byte
734
735	unknown = source += wordLen;
736	size -= wordLen;
737
738	wordFound = nextWordFound;
739	wordIndex = nextWordIndex;
740	wordLen = nextWordLen;
741	}
742	}
743	else {
744	source++;
745	if (--size==0) goto takeRest;
746	}
747	}
748
749	if (lastUncompressed) *lastUncompressed \|= LAST_COMPRESSED_BIT;
750	else *dest++ = LAST_COMPRESSED_BIT;
751
752	*msize = dest-buffer;
753
754	#if defined(ASSERTION_USED)
755	{
756	size_t new_size = -1;
757	char *test = gb_uncompress_by_dictionary_internal(dict, (GB_CSTR)buffer+1, org_size + GB_COMPRESSION_TAGS_SIZE_MAX, true, &new_size);
758
759	gb_assert(memcmp(test, s_source, org_size) == 0);
760	gb_assert((org_size+1) == new_size);
761	}
762	#endif // ASSERTION_USED
763
764	return (char*)buffer;
765	}
766
767
768	#if defined(TEST_DICT)
769
770	static void test_dictionary(GB_DICTIONARY dict, O_gbdByKey gbk, long uncompSum, long compSum) {
771	long uncompressed_sum = 0;
772	long compressed_sum = 0;
773
774	long dict_size = (dict->words2+1)sizeof(GB_NINT)+dict->textlen;
775
776	long char_count[256];
777	for (int i=0; i<256; i++) char_count[i] = 0;
778
779	printf(" * Testing compression..\n");
780
781	#ifdef COUNT_CHUNKS
782	clearChunkCounters();
783	#endif
784
785	for (int cnt=0; cnt<gbk->cnt; cnt++) {
786	GBDATA *gbd = gbk->gbds[cnt];
787
788	if (COMPRESSIBLE(gbd->type())) {
789	size_t size;
790	cu_str data = get_data_n_size(gbd, &size);
791
792	if (gbd->is_a_string()) size--;
793	if (size<1) continue;
794
795	u_str copy = (u_str)gbm_get_mem(size, GBM_DICT_INDEX);
796	gb_assert(copy!=0);
797	memcpy(copy, data, size);
798
799	#if DUMP_COMPRESSION_TEST>=1
800	printf("----------------------------\n");
801	printf("original : %3li b = '%s'\n", size, data);
802	#endif
803
804	int last_flag = 0;
805	size_t compressedSize;
806	u_str compressed = (u_str)gb_compress_by_dictionary(dict, (GB_CSTR)data, size, &compressedSize, last_flag, 9999, 2);
807
808	#if DUMP_COMPRESSION_TEST>=1
809	printf("compressed : %3li b = '%s'\n", compressedSize, lstr(compressed, compressedSize));
810	dumpBinary(compressed, compressedSize);
811	#endif
812
813	for (size_t i=0; i<compressedSize; i++) char_count[compressed[i]]++;
814
815	size_t new_size = -1;
816	u_str uncompressed = (u_str)gb_uncompress_by_dictionary(gbd, (char*)compressed+1, size+GB_COMPRESSION_TAGS_SIZE_MAX, &new_size);
817
818	#if DUMP_COMPRESSION_TEST>=1
819	printf("copy : %3li b = '%s'\n", size, lstr(copy, size));
820	printf("decompressed: %3li b = '%s'\n", size, lstr(uncompressed, size));
821	#endif
822
823	if (GB_MEMCMP(copy, uncompressed, size)!=0) {
824	int byte = 0;
825
826	while (copy[byte]==uncompressed[byte]) byte++;
827	printf("Error in compression (off=%i, '%s'", byte, lstr(copy+byte, 10));
828	printf("!='%s'\n", lstr(uncompressed+byte, 10));
829	}
830
831	if (compressedSize<size) {
832	uncompressed_sum += size;
833	compressed_sum += compressedSize;
834	}
835	else {
836	uncompressed_sum += size;
837	compressed_sum += size;
838	}
839
840	gbm_free_mem(copy, size, GBM_DICT_INDEX);
841	}
842	}
843
844	#ifdef COUNT_CHUNKS
845	dumpChunkCounters();
846	#endif
847
848	{
849	long compressed_plus_dict = compressed_sum+dict_size;
850	char *dict_text = GBS_global_string_copy("+dict %li b", dict_size);
851	long ratio = (compressed_plus_dict*100)/uncompressed_sum;
852
853	printf(" uncompressed size = %10li b\n"
854	" compressed size = %10li b\n"
855	" %17s = %10li b (Ratio=%li%%)\n",
856	uncompressed_sum,
857	compressed_sum,
858	dict_text, compressed_plus_dict, ratio);
859
860	free(dict_text);
861	}
862
863	*uncompSum += uncompressed_sum;
864	*compSum += compressed_sum+dict_size;
865	}
866
867	#endif // TEST_DICT
868
869
870	// ***************** Build dictionary ****************
871
872	#ifdef DEBUG
873	#define TEST // test trees?
874	// #define DUMP_TREE // dump trees?
875
876	// #define DUMP_EXPAND
877	/*
878	#define SELECT_WORDS
879	#define SELECTED_WORDS "oropl"
880	*/
881
882	# ifdef SELECT_WORDS
883	static char strnstr(char s1, int len, char *s2) {
884	char c = *s2;
885	int len2 = strlen(s2);
886
887	while (len-->=len2) {
888	if (*s1==c) {
889	if (strncmp(s1, s2, len2)==0) return s1;
890	}
891	s1++;
892	}
893
894	return NULL;
895	}
896	# endif
897
898	#ifdef DUMP_TREE
899	static void dump_dtree(int deep, DictTree tree)
900	{
901	static unsigned_char buffer[1024];
902
903	if (tree.full) {
904	switch (tree.full->typ) {
905	case FULL_NODE: {
906	int idx;
907
908	for (idx=0; idx<256; idx++) {
909	buffer[deep] = idx;
910	buffer[deep+1] = 0;
911
912	if (tree.full->son[idx].exists) dump_dtree(deep+1, tree.full->son[idx]);
913	else if (tree.full->count[idx]>0) printf(" '%s' (%i) [array]\n", buffer, tree.full->count[idx]);
914	}
915	break;
916	}
917	case SINGLE_NODE: {
918	buffer[deep] = tree.single->ch;
919	buffer[deep+1] = 0;
920
921	if (tree.single->son.exists) dump_dtree(deep+1, tree.single->son);
922	else printf(" '%s' (%i) [single]\n", buffer, tree.single->count);
923
924	if (tree.single->brother.exists) dump_dtree(deep, tree.single->brother);
925	break;
926	}
927	}
928	}
929	}
930	#endif
931
932	#else
933	#ifdef DUMP_TREE
934	# define dump_dtree(deep, tree)
935	#endif
936	#endif
937
938	#ifdef TEST
939	static int testCounts(DictTree tree) {
940	// tests if all inner nodes have correct 'count's
941	int cnt = 0;
942
943	if (tree.exists) {
944	switch (tree.full->typ) {
945	case SINGLE_NODE: {
946	while (tree.exists) {
947	if (tree.single->son.exists) {
948	int son_cnt = testCounts(tree.single->son);
949	#ifdef COUNT_EQUAL
950	gb_assert(son_cnt==tree.single->count);
951	#else
952	gb_assert(son_cnt<=tree.single->count);
953	#endif
954	}
955
956	gb_assert(tree.single->count>0);
957	cnt += tree.single->count;
958	tree = tree.single->brother;
959	}
960	break;
961	}
962	case FULL_NODE: {
963	int idx,
964	sons = 0;
965
966	for (idx=0; idx<256; idx++) {
967	if (tree.full->son[idx].exists) {
968	int son_cnt = testCounts(tree.full->son[idx]);
969	#ifdef COUNT_EQUAL
970	gb_assert(son_cnt==tree.full->count[idx]);
971	#else
972	gb_assert(son_cnt<=tree.full->count[idx]);
973	#endif
974	if (tree.full->usedSons) gb_assert(tree.full->count[idx]>0);
975	else gb_assert(tree.full->count[idx]==0);
976
977	sons++;
978	}
979	else if (tree.full->count[idx]) {
980	sons++;
981	}
982
983	cnt += tree.full->count[idx];
984	}
985
986	gb_assert(sons==tree.full->usedSons);
987	break;
988	}
989	}
990	}
991
992	return cnt;
993	}
994
995	// #define TEST_MAX_OCCUR_COUNT
996
997	#ifdef TEST_MAX_OCCUR_COUNT
998	#define MAX_OCCUR_COUNT 600000
999	#endif
1000
1001	static DictTree test_dtree(DictTree tree)
1002	// only correct while tree is under contruction (build_dict_tree())
1003	{
1004	if (tree.exists) {
1005	switch (tree.full->typ) {
1006	case SINGLE_NODE: {
1007	#if defined(TEST_MAX_OCCUR_COUNT)
1008	gb_assert(tree.single->count<MAX_OCCUR_COUNT); // quite improbable
1009	#endif // TEST_MAX_OCCUR_COUNT
1010
1011	if (tree.single->son.exists) {
1012	gb_assert(tree.single->count==0);
1013	test_dtree(tree.single->son);
1014	}
1015	else {
1016	gb_assert(tree.single->count>0);
1017	}
1018
1019	if (tree.single->brother.exists) test_dtree(tree.single->brother);
1020	break;
1021	}
1022	case FULL_NODE: {
1023	int idx;
1024	int countSons = 0;
1025
1026	for (idx=0; idx<256; idx++) {
1027	#if defined(TEST_MAX_OCCUR_COUNT)
1028	gb_assert(tree.full->count[idx]<MAX_OCCUR_COUNT); // quite improbable
1029	#endif // TEST_MAX_OCCUR_COUNT
1030
1031	if (tree.full->son[idx].exists) {
1032	gb_assert(tree.full->count[idx]==0);
1033	test_dtree(tree.full->son[idx]);
1034	countSons++;
1035	}
1036	else {
1037	gb_assert(tree.full->count[idx]>=0);
1038	if (tree.full->count[idx]>0)
1039	countSons++;
1040	}
1041	}
1042
1043	gb_assert(countSons==tree.full->usedSons);
1044
1045	break;
1046	}
1047	}
1048	}
1049
1050	return tree;
1051	}
1052
1053	#else
1054	# define test_dtree(tree) // (tree)
1055	# define testCounts(tree) // 0
1056	#endif
1057
1058
1059	static DictTree new_dtree(cu_str text, long len, long *memcount) {
1060	// creates a new (sub-)tree from 'text' (which has length 'len')
1061	DictTree tree;
1062
1063	if (len) {
1064	SingleDictTree *tail = NULL;
1065	SingleDictTree *head = NULL;
1066
1067	while (len) {
1068	if (tail) tail = tail->son.single = (SingleDictTree)gbm_get_mem(sizeof(tail), GBM_DICT_INDEX);
1069	else tail = head = (SingleDictTree)gbm_get_mem(sizeof(tail), GBM_DICT_INDEX);
1070
1071	(memcount) += sizeof(tail);
1072
1073	tail->typ = SINGLE_NODE;
1074	tail->ch = *text++;
1075	len--;
1076
1077	tail->brother.single = NULL;
1078	tail->son.single = NULL;
1079	}
1080
1081	tail->count = 1;
1082	tree.single = head;
1083	}
1084	else {
1085	tree.single = NULL;
1086	}
1087
1088	return tree;
1089	}
1090
1091	static DictTree single2full_dtree(DictTree tree, long *memcount) {
1092	if (tree.exists && tree.single->typ==SINGLE_NODE) {
1093	FullDictTree full = (FullDictTree)gbm_get_mem(sizeof(*full), GBM_DICT_INDEX);
1094	int idx;
1095
1096	(memcount) += sizeof(full);
1097	full->typ = FULL_NODE;
1098	full->usedSons = 0;
1099
1100	for (idx=0; idx<256; idx++) {
1101	full->son[idx].exists = NULL;
1102	full->count[idx] = 0;
1103	}
1104
1105	while (tree.exists) {
1106	SingleDictTree *t = tree.single;
1107
1108	gb_assert(t->typ==SINGLE_NODE);
1109	gb_assert(full->son[t->ch].exists==NULL);
1110
1111	full->son[t->ch] = t->son;
1112	full->count[t->ch] = t->count;
1113	full->usedSons++;
1114
1115	tree.single = t->brother.single;
1116
1117	gbm_free_mem(t, sizeof(*t), GBM_DICT_INDEX);
1118	(memcount) -= sizeof(t);
1119	}
1120
1121	tree.full = full;
1122	}
1123
1124	return tree;
1125	}
1126
1127	static void free_dtree(DictTree tree)
1128	{
1129	if (tree.exists) {
1130	switch (tree.full->typ) {
1131	case SINGLE_NODE: {
1132	if (tree.single->son.exists) free_dtree(tree.single->son);
1133	if (tree.single->brother.exists) free_dtree(tree.single->brother);
1134
1135	gbm_free_mem(tree.single, sizeof(*(tree.single)), GBM_DICT_INDEX);
1136	break;
1137	}
1138	case FULL_NODE: {
1139	int idx;
1140
1141	for (idx=0; idx<256; idx++) if (tree.full->son[idx].exists) free_dtree(tree.full->son[idx]);
1142	gbm_free_mem(tree.full, sizeof(*(tree.full)), GBM_DICT_INDEX);
1143	break;
1144	}
1145	}
1146	}
1147	}
1148
1149
1150
1151	static DictTree cut_dtree(DictTree tree, int cut_count, long memcount, long leafcount)
1152	/* removes all branches from 'tree' which are referenced less/equal than cut_count
1153	* returns: the reduced tree */
1154	{
1155	if (tree.exists) {
1156	switch (tree.full->typ) {
1157	case SINGLE_NODE: {
1158	if (tree.single->son.exists) tree.single->son = cut_dtree(tree.single->son, cut_count, memcount, leafcount);
1159
1160	if (!tree.single->son.exists) { // leaf
1161	if (tree.single->count<=cut_count) { // leaf with less/equal references
1162	DictTree brother = tree.single->brother;
1163
1164	gbm_free_mem(tree.single, sizeof(*tree.single), GBM_DICT_INDEX);
1165	(memcount) -= sizeof(tree.single);
1166	if (brother.exists) return cut_dtree(brother, cut_count, memcount, leafcount);
1167
1168	tree.single = NULL;
1169	break;
1170	}
1171	else {
1172	(*leafcount)++;
1173	}
1174	}
1175
1176	if (tree.single->brother.exists) tree.single->brother = cut_dtree(tree.single->brother, cut_count, memcount, leafcount);
1177	break;
1178	}
1179	case FULL_NODE: {
1180	int idx;
1181	int count = 0;
1182
1183	for (idx=0; idx<256; idx++) {
1184	if (tree.full->son[idx].exists) {
1185	tree.full->son[idx] = cut_dtree(tree.full->son[idx], cut_count, memcount, leafcount);
1186
1187	if (tree.full->son[idx].exists) count++;
1188	else tree.full->count[idx] = 0;
1189	}
1190	else if (tree.full->count[idx]>0) {
1191	if (tree.full->count[idx]<=cut_count) {
1192	tree.full->count[idx] = 0;
1193	}
1194	else {
1195	count++;
1196	(*leafcount)++;
1197	}
1198	}
1199	}
1200
1201	tree.full->usedSons = count;
1202
1203	if (!count) { // no more sons
1204	gbm_free_mem(tree.full, sizeof(*(tree.full)), GBM_DICT_INDEX);
1205	(memcount) -= sizeof((tree.full));
1206	tree.exists = NULL;
1207	}
1208
1209	break;
1210	}
1211	}
1212	}
1213
1214	return tree;
1215	}
1216	static DictTree cut_useless_words(DictTree tree, int deep, long *removed)
1217	/* removes/shortens all branches of 'tree' which are not useful for compression
1218	* 'deep' should be zero (incremented by cut_useless_words)
1219	* 'removed' will be set to the # of removed occurrences
1220	* returns: the reduced tree
1221	*/
1222	{
1223	*removed = 0;
1224
1225	if (tree.exists) {
1226	deep++;
1227
1228	switch (tree.full->typ) {
1229	long removed_single;
1230
1231	case SINGLE_NODE: {
1232	if (tree.single->son.exists) {
1233	tree.single->son = cut_useless_words(tree.single->son, deep, &removed_single);
1234	tree.single->count -= removed_single;
1235	*removed += removed_single;
1236	}
1237
1238	if (!tree.single->son.exists && !WORD_HELPFUL(deep, tree.single->count)) {
1239	DictTree brother = tree.single->brother;
1240
1241	*removed += tree.single->count;
1242	gbm_free_mem(tree.single, sizeof(*tree.single), GBM_DICT_INDEX);
1243
1244	if (brother.exists) {
1245	tree = cut_useless_words(brother, deep-1, &removed_single);
1246	*removed += removed_single;
1247	}
1248	else {
1249	tree.exists = NULL;
1250	}
1251
1252	break;
1253	}
1254
1255	if (tree.single->brother.exists) {
1256	tree.single->brother = cut_useless_words(tree.single->brother, deep-1, &removed_single);
1257	*removed += removed_single;
1258	}
1259
1260	break;
1261	}
1262	case FULL_NODE: {
1263	int idx;
1264	int count = 0;
1265
1266	for (idx=0; idx<256; idx++) {
1267	if (tree.full->son[idx].exists) {
1268	tree.full->son[idx] = cut_useless_words(tree.full->son[idx], deep, &removed_single);
1269	tree.full->count[idx] -= removed_single;
1270	*removed += removed_single;
1271	}
1272
1273	if (tree.full->son[idx].exists) {
1274	count++;
1275	}
1276	else if (tree.full->count[idx]) {
1277	if (!WORD_HELPFUL(deep, tree.full->count[idx])) { // useless!
1278	*removed += tree.full->count[idx];
1279	tree.full->count[idx] = 0;
1280	}
1281	else {
1282	count++;
1283	}
1284	}
1285	}
1286
1287	tree.full->usedSons = count;
1288
1289	if (!count) { // no more sons
1290	gbm_free_mem(tree.full, sizeof(*(tree.full)), GBM_DICT_INDEX);
1291	tree.exists = NULL;
1292	}
1293
1294	break;
1295	}
1296	}
1297	}
1298
1299	return tree;
1300	}
1301
1302	static DictTree add_dtree_to_dtree(DictTree toAdd, DictTree to, long *memcount)
1303	/* adds 'toAdd' as brother of 'to' (must be leftmost of all SINGLE_NODEs or a FULL_NODE)
1304	* returns: the leftmost of all SINGLE_NODEs or a FULL_NODE
1305	*/
1306	{
1307	DictTree tree = toAdd;
1308
1309	gb_assert(toAdd.single->typ==SINGLE_NODE);
1310
1311	if (to.exists) {
1312	switch (to.full->typ) {
1313	case SINGLE_NODE: {
1314	SingleDictTree *left = to.single;
1315
1316	gb_assert(left!=0);
1317
1318	if (toAdd.single->ch < to.single->ch) {
1319	toAdd.single->brother = to;
1320	return toAdd;
1321	}
1322
1323	while (to.single->brother.exists) {
1324	if (toAdd.single->ch < to.single->brother.single->ch) {
1325	toAdd.single->brother = to.single->brother;
1326	to.single->brother = toAdd;
1327
1328	tree.single = left;
1329	return tree;
1330	}
1331	to = to.single->brother;
1332	}
1333
1334	to.single->brother = toAdd;
1335	tree.single = left;
1336	break;
1337	}
1338	case FULL_NODE: {
1339	unsigned_char ch = toAdd.single->ch;
1340
1341	gb_assert(to.full->son[ch].exists==NULL);
1342	gb_assert(to.full->count[ch]==0); // if this fails, count must be added & tested
1343	gb_assert(toAdd.single->brother.exists==NULL);
1344
1345	to.full->son[ch] = toAdd.single->son;
1346	to.full->count[ch] = toAdd.single->count;
1347	to.full->usedSons++;
1348
1349	tree = to;
1350
1351	gbm_free_mem(toAdd.single, sizeof(*(toAdd.single)), GBM_DICT_INDEX);
1352	(*memcount) -= sizeof(toAdd.single);
1353
1354	break;
1355	}
1356	}
1357	}
1358
1359	return tree;
1360	}
1361
1362	static DictTree add_to_dtree(DictTree tree, cu_str text, long len, long *memcount)
1363	/* adds the string 'text' (which has length 'len') to 'tree'
1364	* returns: new tree
1365	*/
1366	{
1367	if (tree.exists) {
1368	switch (tree.full->typ) {
1369	case SINGLE_NODE: {
1370	SingleDictTree *t = tree.single;
1371	int count = 0;
1372
1373	do {
1374	count++;
1375	if (t->ch==text[0]) { // we found an existing subtree
1376	if (len>1) {
1377	t->son = add_to_dtree(t->son, text+1, len-1, memcount); // add rest of text to subtree
1378	}
1379	else {
1380	gb_assert(len==1);
1381	gb_assert(t->son.exists==NULL);
1382	t->count++;
1383	}
1384
1385	return count>MAX_BROTHERS ? single2full_dtree(tree, memcount) : tree;
1386	}
1387	else if (t->ch > text[0]) {
1388	break;
1389	}
1390	}
1391	while ((t=t->brother.single)!=NULL);
1392
1393	tree = add_dtree_to_dtree(new_dtree(text, len, memcount), // otherwise we create a new subtree
1394	count>MAX_BROTHERS ? single2full_dtree(tree, memcount) : tree,
1395	memcount);
1396	break;
1397	}
1398	case FULL_NODE: {
1399	unsigned_char ch = text[0];
1400
1401	if (tree.full->son[ch].exists) {
1402	tree.full->son[ch] = add_to_dtree(tree.full->son[ch], text+1, len-1, memcount);
1403	}
1404	else {
1405	tree.full->son[ch] = new_dtree(text+1, len-1, memcount);
1406	if (!tree.full->son[ch].exists) {
1407	if (tree.full->count[ch]==0) tree.full->usedSons++;
1408	tree.full->count[ch]++;
1409	}
1410	else {
1411	tree.full->usedSons++;
1412	}
1413	}
1414	break;
1415	}
1416	}
1417
1418	return tree;
1419	}
1420
1421	return new_dtree(text, len, memcount);
1422	}
1423
1424	static long calcCounts(DictTree tree)
1425	{
1426	long cnt = 0;
1427
1428	gb_assert(tree.exists!=0);
1429
1430	switch (tree.full->typ) {
1431	case SINGLE_NODE: {
1432	while (tree.exists) {
1433	if (tree.single->son.exists) tree.single->count = calcCounts(tree.single->son);
1434	gb_assert(tree.single->count>0);
1435	cnt += tree.single->count;
1436	tree = tree.single->brother;
1437	}
1438	break;
1439	}
1440	case FULL_NODE: {
1441	int idx;
1442
1443	for (idx=0; idx<256; idx++) {
1444	if (tree.full->son[idx].exists) {
1445	tree.full->count[idx] = calcCounts(tree.full->son[idx]);
1446	gb_assert(tree.full->count[idx]>0);
1447	}
1448	else {
1449	gb_assert(tree.full->count[idx]>=0);
1450	}
1451	cnt += tree.full->count[idx];
1452	}
1453	break;
1454	}
1455	}
1456
1457	return cnt;
1458	}
1459
1460	static int count_dtree_leafs(DictTree tree, int deep, int *maxdeep) {
1461	// returns # of leafs and max. depth of tree
1462	int leafs = 0;
1463
1464	gb_assert(tree.exists!=0);
1465
1466	if (++deep>maxdeep) maxdeep = deep;
1467
1468	switch (tree.full->typ) {
1469	case SINGLE_NODE: {
1470	if (tree.single->son.exists) leafs += count_dtree_leafs(tree.single->son, deep, maxdeep);
1471	else leafs++;
1472	if (tree.single->brother.exists) leafs += count_dtree_leafs(tree.single->brother, deep, maxdeep);
1473	break;
1474	}
1475	case FULL_NODE: {
1476	int idx;
1477
1478	for (idx=0; idx<256; idx++) {
1479	if (tree.full->son[idx].exists) leafs += count_dtree_leafs(tree.full->son[idx], deep, maxdeep);
1480	else if (tree.full->count[idx]) leafs++;
1481	}
1482	break;
1483	}
1484	}
1485
1486	return leafs;
1487	}
1488
1489	static int COUNT(DictTree tree) {
1490	// counts sum of # of occurrences of tree
1491	int cnt = 0;
1492
1493	switch (tree.single->typ) {
1494	case SINGLE_NODE: {
1495	while (tree.exists) {
1496	cnt += tree.single->count;
1497	tree = tree.single->brother;
1498	}
1499	break;
1500	}
1501	case FULL_NODE: {
1502	int idx;
1503
1504	for (idx=0; idx<256; idx++) cnt += tree.full->count[idx];
1505	break;
1506	}
1507	}
1508
1509	return cnt;
1510	}
1511
1512	static DictTree removeSubsequentString(DictTree *tree_pntr, cu_str buffer, int len, int max_occur) {
1513	/* searches tree for 'buffer' (length='len')
1514	*
1515	* returns - rest below found string
1516	* (if found and if the # of occurrences of the string is less/equal than 'max_occur')
1517	* - NULL otherwise
1518	*
1519	* removes the whole found string from the tree (not only the rest!)
1520	*/
1521	DictTree tree = *tree_pntr, rest;
1522	static int restCount;
1523
1524	rest.exists = NULL;
1525
1526	gb_assert(tree.exists!=0);
1527	gb_assert(len>0);
1528
1529	switch (tree.full->typ) {
1530	case SINGLE_NODE: {
1531	while (tree.single->ch <= buffer[0]) {
1532	if (tree.single->ch == buffer[0]) { // found wanted character
1533	if (tree.single->son.exists) {
1534	if (len==1) {
1535	if (tree.single->count <= max_occur) {
1536	rest = tree.single->son;
1537	restCount = COUNT(rest);
1538	tree.single->son.exists = NULL;
1539	}
1540	}
1541	else {
1542	rest = removeSubsequentString(&tree.single->son, buffer+1, len-1, max_occur);
1543	}
1544	}
1545
1546	if (rest.exists) { // the string was found
1547	tree.single->count -= restCount;
1548	gb_assert(tree.single->count >= 0);
1549
1550	if (!tree.single->count) { // empty subtree -> delete myself
1551	DictTree brother = tree.single->brother;
1552
1553	tree.single->brother.exists = NULL; // elsewise it would be freed by free_dtree
1554	free_dtree(tree);
1555	*tree_pntr = tree = brother;
1556	}
1557	}
1558
1559	break;
1560	}
1561
1562	tree_pntr = &(tree.single->brother);
1563	if (!(tree = tree.single->brother).exists) break;
1564	}
1565
1566	break;
1567	}
1568	case FULL_NODE: {
1569	unsigned_char ch;
1570
1571	if (tree.full->son[ch=buffer[0]].exists) {
1572	if (len==1) {
1573	if (tree.full->count[ch] <= max_occur) {
1574	rest = tree.full->son[ch];
1575	restCount = COUNT(rest);
1576	tree.full->son[ch].exists = NULL;
1577	}
1578	}
1579	else {
1580	rest = removeSubsequentString(&tree.full->son[ch], buffer+1, len-1, max_occur);
1581	}
1582
1583	if (rest.exists) {
1584	gb_assert(restCount>0);
1585	tree.full->count[ch] -= restCount;
1586	gb_assert(tree.full->count[ch]>=0);
1587	if (tree.full->count[ch]==0) {
1588	gb_assert(tree.full->son[ch].exists==NULL);
1589
1590	if (--tree.full->usedSons==0) { // last son deleted -> delete myself
1591	free_dtree(tree);
1592	tree.exists = NULL;
1593	*tree_pntr = tree;
1594	}
1595	}
1596	}
1597	}
1598
1599	break;
1600	}
1601	}
1602
1603	return rest;
1604	}
1605
1606	static cu_str memstr(cu_str stringStart, int stringStartLen, cu_str inString, int inStringLen) {
1607	if (!inStringLen) return stringStart; // string of length zero is found everywhere
1608
1609	while (stringStartLen) {
1610	cu_str found = (cu_str)memchr(stringStart, inString[0], stringStartLen);
1611
1612	if (!found) break;
1613
1614	stringStartLen -= found-stringStart;
1615	stringStart = found;
1616
1617	if (stringStartLen<inStringLen) break;
1618
1619	if (GB_MEMCMP(stringStart, inString, inStringLen)==0) return stringStart;
1620
1621	stringStart++;
1622	stringStartLen--;
1623	}
1624
1625	return NULL;
1626	}
1627
1628
1629	static int expandBranches(u_str buffer, int deep, int minwordlen, int maxdeep, DictTree tree, DictTree root, int max_percent) {
1630	/* expands all branches in 'tree'
1631	*
1632	* this is done by searching every of these branches in 'root' and moving any subsequent parts from there to 'tree'
1633	* (this is only done, if the # of occurrences of the found part does not exceed the # of occurrences of 'tree' more than 'max_percent' percent)
1634	*
1635	* 'buffer' strings are rebuild here while descending the tree (length of buffer==MAX_WORD_LEN)
1636	* 'deep' recursion level
1637	* 'maxdeep' maximum recursion level
1638	* 'minwordlen' is the length of the words to search (usually equal to MIN_WORD_LEN-1)
1639	*
1640	* returns the # of occurrences which were added to 'tree'
1641	*/
1642	int expand = 0; // calculate count-sum of added subsequent parts
1643
1644	gb_assert(tree.exists!=0);
1645
1646	if (deep<maxdeep) {
1647	switch (tree.full->typ) {
1648	case SINGLE_NODE: {
1649	while (tree.exists) {
1650	buffer[deep] = tree.single->ch;
1651
1652	if (!tree.single->son.exists) {
1653	DictTree rest;
1654	u_str buf = buffer+1;
1655	int len = deep;
1656
1657	if (len>minwordlen) { // do not search more than MIN_WORD_LEN-1 chars
1658	buf += len-minwordlen;
1659	len = minwordlen;
1660	}
1661
1662	if (len==minwordlen) {
1663	cu_str self = memstr(buffer, deep+1, buf, len);
1664
1665	gb_assert(self!=0);
1666	if (self==buf) rest = removeSubsequentString(&root, buf, len, ((100+max_percent)*tree.single->count)/100);
1667	else rest.exists = NULL;
1668	}
1669	else {
1670	rest.exists = NULL;
1671	}
1672
1673	if (rest.exists) {
1674	int cnt = COUNT(rest);
1675
1676	tree.single->son = rest;
1677	tree.single->count += cnt;
1678	expand += cnt;
1679	#ifdef DUMP_EXPAND
1680	#define DUMP_MORE 1
1681	printf("expanding '%s'", lstr(buffer, deep+1+DUMP_MORE));
1682	printf(" (searching for '%s') -> found %i nodes\n", lstr(buf, len+DUMP_MORE), cnt);
1683	#endif
1684	}
1685	}
1686
1687	if (tree.single->son.exists) {
1688	int added = expandBranches(buffer, deep+1, minwordlen, maxdeep, tree.single->son, root, max_percent);
1689
1690	expand += added;
1691	tree.single->count += added;
1692	}
1693
1694	tree = tree.single->brother;
1695	}
1696
1697	break;
1698	}
1699	case FULL_NODE: {
1700	int idx;
1701
1702	for (idx=0; idx<256; idx++) {
1703	buffer[deep] = idx;
1704
1705	if (!tree.full->son[idx].exists && tree.full->count[idx]) { // leaf
1706	DictTree rest;
1707	u_str buf = buffer+1;
1708	int len = deep;
1709
1710	if (len>minwordlen) { // do not search more than MIN_WORD_LEN-1 chars
1711	buf += len-minwordlen;
1712	len = minwordlen;
1713	}
1714
1715	if (len==minwordlen) {
1716	cu_str self = memstr(buffer, deep+1, buf, len);
1717
1718	gb_assert(self!=0);
1719	if (self==buf)
1720	rest = removeSubsequentString(&root, buf, len, ((100+max_percent)*tree.full->count[idx])/100);
1721	else
1722	rest.exists = NULL;
1723	}
1724	else {
1725	rest.exists = NULL;
1726	}
1727
1728	if (rest.exists) { // substring found!
1729	int cnt = COUNT(rest);
1730
1731	if (tree.full->count[idx]==0) tree.full->usedSons++;
1732	tree.full->son[idx] = rest;
1733	tree.full->count[idx] += cnt;
1734
1735	expand += cnt;
1736	#ifdef DUMP_EXPAND
1737	printf("expanding '%s'", lstr(buffer, deep+1+DUMP_MORE));
1738	printf(" (searching for '%s') -> found %i nodes\n", lstr(buf, len+DUMP_MORE), cnt);
1739	#endif
1740	}
1741	}
1742
1743	if (tree.full->son[idx].exists) {
1744	int added = expandBranches(buffer, deep+1, minwordlen, maxdeep, tree.full->son[idx], root, max_percent);
1745
1746	expand += added;
1747	tree.full->count[idx] += added;
1748	}
1749	}
1750
1751	break;
1752	}
1753	}
1754	}
1755
1756	return expand;
1757	}
1758
1759	static DictTree build_dict_tree(O_gbdByKey gbk, long maxmem, long maxdeep, size_t minwordlen, long data_sum)
1760	/* builds a tree of the most used words
1761	*
1762	* 'maxmem' is the amount of memory that will be allocated
1763	* 'maxdeep' is the maximum length of the _returned_ words
1764	* 'minwordlen' is the minimum length a word needs to get into the tree
1765	* this is used in the first pass as maximum tree depth
1766	* 'data_sum' will be set to the overall-size of data of which the tree was built
1767	*/
1768	{
1769	DictTree tree;
1770	long memcount = 0;
1771	long leafs = 0;
1772
1773	*data_sum = 0;
1774
1775	{
1776	int cnt;
1777	long lowmem = (maxmem*9)/10;
1778	int cut_count = 1;
1779
1780	// Build 8-level-deep tree of all existing words
1781
1782	tree.exists = NULL; // empty tree
1783
1784	for (cnt=0; cnt<gbk->cnt; cnt++) {
1785	GBDATA *gbd = gbk->gbds[cnt];
1786
1787	if (COMPRESSIBLE(gbd->type())) {
1788	size_t size;
1789	cu_str data = get_data_n_size(gbd, &size);
1790	cu_str lastWord;
1791
1792	if (gbd->is_a_string()) size--;
1793	if (size<minwordlen) continue;
1794
1795	*data_sum += size;
1796	lastWord = data+size-minwordlen;
1797
1798	#ifdef SELECT_WORDS
1799	if (strnstr(data, size, SELECTED_WORDS)) // test some words only
1800	#endif
1801	{
1802
1803	for (; data<=lastWord; data++) {
1804	tree = add_to_dtree(tree, data, minwordlen, &memcount);
1805
1806	while (memcount>maxmem) {
1807	leafs = 0;
1808	tree = cut_dtree(tree, cut_count, &memcount, &leafs);
1809	if (memcount<=lowmem) break;
1810	cut_count++;
1811	}
1812	}
1813	}
1814	}
1815	}
1816	}
1817
1818	{
1819	int cutoff = 1;
1820
1821	leafs = 0;
1822	tree = cut_dtree(tree, cutoff, &memcount, &leafs); // cut all single elements
1823	test_dtree(tree);
1824
1825	#if defined(DEBUG)
1826	if (tree.exists) {
1827	int maxdeep2 = 0;
1828	long counted = count_dtree_leafs(tree, 0, &maxdeep2);
1829	gb_assert(leafs == counted);
1830	}
1831	#endif // DEBUG
1832
1833	// avoid directory overflow (max. 18bit)
1834	while (leafs >= MAX_LONG_INDEX) {
1835	leafs = 0;
1836	++cutoff;
1837	#if defined(DEBUG)
1838	printf("Directory overflow (%li) -- reducing size (cutoff = %i)\n", leafs, cutoff);
1839	#endif // DEBUG
1840	tree = cut_dtree(tree, cutoff, &memcount, &leafs);
1841	}
1842	}
1843	#ifdef DUMP_TREE
1844	printf("----------------------- tree with short branches:\n");
1845	dump_dtree(0, tree);
1846	printf("---------------------------\n");
1847	#endif
1848
1849	// Try to create longer branches
1850
1851	if (tree.exists) {
1852	int add_count;
1853	u_str buffer = (u_str)gbm_get_mem(maxdeep, GBM_DICT_INDEX);
1854	int max_differ;
1855	long dummy;
1856
1857	if (tree.full->typ != FULL_NODE) tree = single2full_dtree(tree, &memcount); // ensure root is FULL_NODE
1858
1859	test_dtree(tree);
1860	calcCounts(tree); // calculate counters of inner nodes
1861	testCounts(tree);
1862
1863	for (max_differ=0; max_differ<=MAX_DIFFER; max_differ+=INCR_DIFFER) { // percent of allowed difference for concatenating tree branches
1864	do {
1865	int idx;
1866	add_count = 0;
1867
1868	for (idx=0; idx<256; idx++) {
1869	if (tree.full->son[idx].exists) {
1870	int added;
1871
1872	buffer[0] = idx;
1873	added = expandBranches(buffer, 1, minwordlen-1, maxdeep, tree.full->son[idx], tree, max_differ);
1874	tree.full->count[idx] += added;
1875	add_count += added;
1876	}
1877	}
1878	}
1879	while (add_count);
1880	}
1881
1882	gbm_free_mem(buffer, maxdeep, GBM_DICT_INDEX);
1883
1884	tree = cut_useless_words(tree, 0, &dummy);
1885	}
1886
1887	#ifdef DUMP_TREE
1888	printf("----------------------- tree with expanded branches:\n");
1889	dump_dtree(0, tree);
1890	printf("-----------------------\n");
1891	#endif
1892	testCounts(tree);
1893
1894	return tree;
1895	}
1896
1897	static DictTree remove_word_from_dtree(DictTree tree, cu_str wordStart, int wordLen, u_str resultBuffer, int resultLen, long resultFrequency, long *removed) {
1898	/* searches 'tree' for a word starting with 'wordStart' an removes it from the tree
1899	* if there are more than one possibilities, the returned word will be the one with the most occurrences
1900	* if there was no possibility -> resultLen==0, tree unchanged
1901	* otherwise: resultBuffer contains the word, returns new tree with word removed
1902	*/
1903	long removed_single = 0;
1904	gb_assert(tree.exists!=0);
1905	*removed = 0;
1906
1907	if (wordLen) { // search wanted path into tree
1908	switch (tree.full->typ) {
1909	case SINGLE_NODE: {
1910	if (tree.single->ch==*wordStart) {
1911	resultBuffer = wordStart;
1912
1913	if (tree.single->son.exists) {
1914	gb_assert(tree.single->count>0);
1915	tree.single->son = remove_word_from_dtree(tree.single->son, wordStart+1, wordLen-1,
1916	resultBuffer+1, resultLen, resultFrequency,
1917	&removed_single);
1918	if (*resultLen) { // word removed
1919	gb_assert(tree.single->count>=removed_single);
1920	tree.single->count -= removed_single;
1921	*removed += removed_single;
1922	(*resultLen)++;
1923	}
1924	}
1925	else {
1926	*resultLen = wordLen==1; // if wordLen==1 -> fully overlapping word found
1927	*resultFrequency = tree.single->count;
1928	}
1929
1930	if (!tree.single->son.exists && *resultLen) { // if no son and a word was found -> remove branch
1931	DictTree brother = tree.single->brother;
1932
1933	*removed += tree.single->count;
1934	gbm_free_mem(tree.single, sizeof(*tree.single), GBM_DICT_INDEX);
1935
1936	if (brother.exists) tree = brother;
1937	else tree.exists = NULL;
1938	}
1939	}
1940	else if (tree.single->ch < *wordStart && tree.single->brother.exists) {
1941	tree.single->brother = remove_word_from_dtree(tree.single->brother, wordStart, wordLen,
1942	resultBuffer, resultLen, resultFrequency,
1943	&removed_single);
1944	if (resultLen) removed += removed_single;
1945	}
1946	else {
1947	*resultLen = 0; // not found
1948	}
1949
1950	break;
1951	}
1952	case FULL_NODE: {
1953	unsigned_char ch = *wordStart;
1954	*resultBuffer = ch;
1955
1956	if (tree.full->son[ch].exists) {
1957	tree.full->son[ch] = remove_word_from_dtree(tree.full->son[ch], wordStart+1, wordLen-1,
1958	resultBuffer+1, resultLen, resultFrequency,
1959	&removed_single);
1960	if (*resultLen) {
1961	if (tree.full->son[ch].exists) { // another son?
1962	tree.full->count[ch] -= removed_single;
1963	}
1964	else { // last son -> remove whole branch
1965	removed_single = tree.full->count[ch];
1966	tree.full->count[ch] = 0;
1967	tree.full->usedSons--;
1968	}
1969
1970	*removed += removed_single;
1971	(*resultLen)++;
1972	}
1973	}
1974	else if (tree.full->count[ch]) {
1975	*resultLen = (wordLen==1);
1976
1977	if (*resultLen) {
1978	removed += removed_single = resultFrequency = tree.full->count[ch];
1979	tree.full->count[ch] = 0;
1980	tree.full->usedSons--;
1981	}
1982	}
1983	else {
1984	*resultLen = 0; // not found
1985	}
1986
1987	if (!tree.full->usedSons) {
1988	free_dtree(tree);
1989	tree.exists = NULL;
1990	}
1991
1992	break;
1993	}
1994	}
1995	}
1996	else { // take any word
1997	switch (tree.full->typ) {
1998	case SINGLE_NODE: {
1999	*resultBuffer = tree.single->ch;
2000	gb_assert(tree.single->count>0);
2001
2002	if (tree.single->son.exists) {
2003	tree.single->son = remove_word_from_dtree(tree.single->son, wordStart, wordLen,
2004	resultBuffer+1, resultLen, resultFrequency,
2005	&removed_single);
2006	gb_assert(*resultLen);
2007	(*resultLen)++;
2008	}
2009	else {
2010	*resultLen = 1;
2011	*resultFrequency = tree.single->count;
2012	removed_single = tree.single->count;
2013	}
2014
2015	gb_assert(*resultFrequency>0);
2016
2017	if (tree.single->son.exists) {
2018	gb_assert(tree.single->count>=removed_single);
2019	tree.single->count -= removed_single;
2020	*removed += removed_single;
2021	}
2022	else {
2023	DictTree brother = tree.single->brother;
2024
2025	*removed += tree.single->count;
2026	gbm_free_mem(tree.single, sizeof(*tree.single), GBM_DICT_INDEX);
2027
2028	if (brother.exists) tree = brother;
2029	else tree.exists = NULL;
2030	}
2031
2032	break;
2033	}
2034	case FULL_NODE: {
2035	int idx;
2036
2037	for (idx=0; idx<256; idx++) {
2038	if (tree.full->son[idx].exists) {
2039	*resultBuffer = idx;
2040	tree.full->son[idx] = remove_word_from_dtree(tree.full->son[idx], wordStart, wordLen,
2041	resultBuffer+1, resultLen, resultFrequency,
2042	&removed_single);
2043	gb_assert(*resultLen);
2044	(*resultLen)++;
2045
2046	if (!tree.full->son[idx].exists) { // son branch removed -> zero count
2047	removed_single = tree.full->count[idx];
2048	tree.full->count[idx] = 0;
2049	tree.full->usedSons--;
2050	}
2051	else {
2052	tree.full->count[idx] -= removed_single;
2053	gb_assert(tree.full->count[idx]>0);
2054	}
2055
2056	break;
2057	}
2058	else if (tree.full->count[idx]) {
2059	*resultBuffer = idx;
2060	*resultLen = 1;
2061	*resultFrequency = tree.full->count[idx];
2062	removed_single = tree.full->count[idx];
2063	tree.full->count[idx] = 0;
2064	tree.full->usedSons--;
2065	break;
2066	}
2067	}
2068
2069	gb_assert(idx<256); // gb_assert break was used to exit loop (== node had a son)
2070
2071	*removed += removed_single;
2072
2073	if (!tree.full->usedSons) {
2074	free_dtree(tree);
2075	tree.exists = NULL;
2076	}
2077
2078	break;
2079	}
2080	}
2081	}
2082
2083	#ifdef DEBUG
2084	if (*resultLen) {
2085	gb_assert(*resultLen>0);
2086	gb_assert(*resultFrequency>0);
2087	gb_assert(*resultLen>=wordLen);
2088	}
2089	#endif
2090
2091	return tree;
2092	}
2093
2094	#define cmp(i1, i2) (heap2[i1]-heap2[i2])
2095	#define swap(i1, i2) do \
2096	{ \
2097	int s = heap[i1]; \
2098	heap[i1] = heap[i2]; \
2099	heap[i2] = s; \
2100	\
2101	s = heap2[i1]; \
2102	heap2[i1] = heap2[i2]; \
2103	heap2[i2] = s; \
2104	} \
2105	while (0)
2106
2107	static void downheap(int heap, int heap2, int me, int num) {
2108	int lson = me*2;
2109	int rson = lson+1;
2110
2111	gb_assert(me>=1);
2112	if (lson>num) return;
2113
2114	if (cmp(lson, me)<0) { // left son smaller than me? (we sort in descending order!!!)
2115	if (rson<=num && cmp(lson, rson)>0) { // right son smaller than left son?
2116	swap(me, rson);
2117	downheap(heap, heap2, rson, num);
2118	}
2119	else {
2120	swap(me, lson);
2121	downheap(heap, heap2, lson, num);
2122	}
2123	}
2124	else if (rson<=num && cmp(me, rson)>0) { // right son smaller than me?
2125	swap(me, rson);
2126	downheap(heap, heap2, rson, num);
2127	}
2128	}
2129
2130	#undef cmp
2131	#undef swap
2132
2133
2134
2135	#define cmp(i1, i2) GB_MEMCMP(dict->text+dict->offsets[heap[i1]], dict->text+dict->offsets[heap[i2]], dict->textlen)
2136	#define swap(i1, i2) do { int s = heap[i1]; heap[i1] = heap[i2]; heap[i2] = s; } while (0)
2137
2138	static void downheap2(int heap, GB_DICTIONARY dict, int me, int num) {
2139	int lson = me*2;
2140	int rson = lson+1;
2141
2142	gb_assert(me>=1);
2143	if (lson>num) return;
2144
2145	if (cmp(lson, me)>0) { // left son bigger than me?
2146	if (rson<=num && cmp(lson, rson)<0) { // right son bigger than left son?
2147	swap(me, rson);
2148	downheap2(heap, dict, rson, num);
2149	}
2150	else {
2151	swap(me, lson);
2152	downheap2(heap, dict, lson, num);
2153	}
2154	}
2155	else if (rson<=num && cmp(me, rson)<0) { // right son bigger than me?
2156	swap(me, rson);
2157	downheap2(heap, dict, rson, num);
2158	}
2159	}
2160
2161	#undef cmp
2162	#undef swap
2163
2164	static void sort_dict_offsets(GB_DICTIONARY *dict) {
2165	/* 1. sorts the 'dict->offsets' by frequency
2166	* (frequency of each offset is stored in the 'dict->resort' with the same index)
2167	* 2. initializes & sorts 'dict->resort' in alphabetic order
2168	*/
2169	int i;
2170	int num = dict->words;
2171	int *heap = dict->offsets-1;
2172	int *heap2 = dict->resort-1;
2173
2174	// sort offsets
2175
2176	for (i=num/2; i>=1; i--) downheap(heap, heap2, i, num); // make heap
2177
2178	while (num>1) { // sort heap
2179	int big = heap[1];
2180	int big2 = heap2[1];
2181
2182	heap[1] = heap[num];
2183	heap2[1] = heap2[num];
2184
2185	downheap(heap, heap2, 1, num-1);
2186
2187	heap[num] = big;
2188	heap2[num] = big2;
2189
2190	num--;
2191	}
2192
2193	// initialize dict->resort
2194
2195	for (i=0, num=dict->words; i<num; i++) dict->resort[i] = i;
2196
2197	// sort dictionary alphabetically
2198
2199	for (i=num/2; i>=1; i--) downheap2(heap2, dict, i, num); // make heap
2200
2201	while (num>1) {
2202	int big = heap2[1];
2203
2204	heap2[1] = heap2[num];
2205	downheap2(heap2, dict, 1, num-1);
2206	heap2[num] = big;
2207	num--;
2208	}
2209	}
2210
2211	// Warning dictionary is not in network byte order !!!!
2212	static GB_DICTIONARY gb_create_dictionary(O_gbdByKey gbk, long maxmem) {
2213	long data_sum;
2214	DictTree tree = build_dict_tree(gbk, maxmem, MAX_WORD_LEN, MIN_WORD_LEN, &data_sum);
2215
2216	if (tree.exists) {
2217	GB_DICTIONARY dict = (GB_DICTIONARY)gbm_get_mem(sizeof(*dict), GBM_DICT_INDEX);
2218	int maxdeep = 0;
2219	int words = count_dtree_leafs(tree, 0, &maxdeep);
2220	u_str word;
2221
2222	int wordLen;
2223	long wordFrequency;
2224	int offset = 0; // next free position in dict->text
2225	int overlap = 0; // # of bytes overlapping with last word
2226	u_str buffer;
2227	long dummy;
2228	#if defined(DEBUG)
2229	long word_sum = 0;
2230	long overlap_sum = 0;
2231	long max_overlap = 0;
2232	#endif
2233
2234	// reduce tree as long as it has to many leafs (>MAX_LONG_INDEX)
2235	while (words >= MAX_LONG_INDEX) {
2236
2237	words = count_dtree_leafs(tree, 0, &maxdeep);
2238	}
2239
2240	buffer = (u_str)gbm_get_mem(maxdeep, GBM_DICT_INDEX);
2241
2242	calcCounts(tree);
2243	testCounts(tree);
2244
2245	#if DEBUG
2246	printf(" examined data was %li bytes\n", data_sum);
2247	printf(" tree contains %i words *** maximum tree depth = %i\n", words, maxdeep);
2248	#endif
2249
2250	dict->words = 0;
2251	dict->textlen = DICT_STRING_INCR;
2252
2253	dict->text = (u_str)gbm_get_mem(DICT_STRING_INCR, GBM_DICT_INDEX);
2254	dict->offsets = (GB_NINT)gbm_get_mem(sizeof((dict->offsets))*words, GBM_DICT_INDEX);
2255	dict->resort = (GB_NINT)gbm_get_mem(sizeof((dict->resort))*words, GBM_DICT_INDEX);
2256
2257	memset(buffer, '*', maxdeep);
2258	tree = remove_word_from_dtree(tree, NULL, 0, buffer, &wordLen, &wordFrequency, &dummy);
2259	testCounts(tree);
2260
2261	while (1) {
2262	int nextWordLen = 0;
2263	int len;
2264
2265	#if DUMP_COMPRESSION_TEST>=4
2266	printf("word='%s' (occur=%li overlap=%i)\n", lstr(buffer, wordLen), wordFrequency, overlap);
2267	#endif
2268
2269	#if defined(DEBUG)
2270	overlap_sum += overlap;
2271	if (overlap>max_overlap) max_overlap = overlap;
2272	word_sum += wordLen;
2273	#endif
2274
2275	if (offset-overlap+wordLen > dict->textlen) { // if not enough space allocated -> reallocate dictionary string
2276	u_str ntext = (u_str)gbm_get_mem(dict->textlen+DICT_STRING_INCR, GBM_DICT_INDEX);
2277
2278	memcpy(ntext, dict->text, dict->textlen);
2279	gbm_free_mem(dict->text, dict->textlen, GBM_DICT_INDEX);
2280
2281	dict->text = ntext;
2282	dict->textlen += DICT_STRING_INCR;
2283	}
2284
2285	dict->offsets[dict->words] = offset-overlap;
2286	dict->resort[dict->words] = wordFrequency; // temporarily miss-use this to store frequency
2287	dict->words++;
2288
2289	word = dict->text+offset-overlap;
2290	gb_assert(overlap==0 \|\| GB_MEMCMP(word, buffer, overlap)==0); // test overlapping string-part
2291	memcpy(word, buffer, wordLen); // word -> dictionary string
2292	offset += wordLen-overlap;
2293
2294	if (!tree.exists) break;
2295
2296	for (len=min(10, wordLen-1); len>=0 && nextWordLen==0; len--) {
2297	memset(buffer, '*', maxdeep);
2298	tree = remove_word_from_dtree(tree, word+wordLen-len, len, buffer, &nextWordLen, &wordFrequency, &dummy);
2299	overlap = len;
2300	}
2301
2302	wordLen = nextWordLen;
2303	}
2304
2305	gb_assert(dict->words <= MAX_LONG_INDEX);
2306	gb_assert(dict->words==words); /* dict->words == # of words stored in dictionary string
2307	* words == # of words pre-calculated */
2308
2309	#if DEBUG
2310	printf(" word_sum=%li overlap_sum=%li (%li%%) max_overlap=%li\n",
2311	word_sum, overlap_sum, (overlap_sum*100)/word_sum, max_overlap);
2312	#endif
2313
2314	if (offset<dict->textlen) { // reallocate dict->text if it was allocated too large
2315	u_str ntext = (u_str)gbm_get_mem(offset, GBM_DICT_INDEX);
2316
2317	memcpy(ntext, dict->text, offset);
2318	gbm_free_mem(dict->text, dict->textlen, GBM_DICT_INDEX);
2319
2320	dict->text = ntext;
2321	dict->textlen = offset;
2322	}
2323
2324	sort_dict_offsets(dict);
2325
2326	gbm_free_mem(buffer, maxdeep, GBM_DICT_INDEX);
2327	free_dtree(tree);
2328
2329	return dict;
2330	}
2331
2332	return NULL;
2333	}
2334
2335	static void gb_free_dictionary(GB_DICTIONARY*& dict) {
2336	gbm_free_mem(dict->text, dict->textlen, GBM_DICT_INDEX);
2337	gbm_free_mem(dict->offsets, sizeof((dict->offsets))dict->words, GBM_DICT_INDEX);
2338	gbm_free_mem(dict->resort, sizeof((dict->resort))dict->words, GBM_DICT_INDEX);
2339
2340	gbm_free_mem(dict, sizeof(*dict), GBM_DICT_INDEX);
2341	dict = NULL;
2342	}
2343
2344	static GB_ERROR readAndWrite(O_gbdByKey *gbkp, size_t& old_size, size_t& new_size) {
2345	GB_ERROR error = 0;
2346
2347	old_size = 0;
2348	new_size = 0;
2349
2350	for (int i=0; i<gbkp->cnt && !error; i++) {
2351	GBDATA *gbd = gbkp->gbds[i];
2352
2353	if (COMPRESSIBLE(gbd->type())) {
2354	size_t size;
2355	char *data;
2356
2357	{
2358	cu_str d = (cu_str)get_data_n_size(gbd, &size);
2359	old_size += gbd->as_entry()->memsize();
2360
2361	data = (char*)gbm_get_mem(size, GBM_DICT_INDEX);
2362	memcpy(data, d, size);
2363	gb_assert(data[size-1] == 0);
2364	}
2365
2366	switch (gbd->type()) {
2367	case GB_STRING:
2368	error = GB_write_string(gbd, "");
2369	if (!error) error = GB_write_string(gbd, data);
2370	break;
2371	case GB_BYTES:
2372	error = GB_write_bytes(gbd, 0, 0);
2373	if (!error) error = GB_write_bytes(gbd, data, size);
2374	break;
2375	case GB_INTS:
2376	error = GB_write_ints(gbd, (GB_UINT4 *)0, 0);
2377	if (!error) error = GB_write_ints(gbd, (GB_UINT4 *)data, size);
2378	break;
2379	case GB_FLOATS:
2380	error = GB_write_floats(gbd, (float*)0, 0);
2381	if (!error) error = GB_write_floats(gbd, (float)(void)data, size);
2382	break;
2383	default:
2384	gb_assert(0);
2385	break;
2386	}
2387
2388	new_size += gbd->as_entry()->memsize();
2389
2390	gbm_free_mem(data, size, GBM_DICT_INDEX);
2391	}
2392	}
2393	return error;
2394	}
2395
2396	static GB_ERROR gb_create_dictionaries(GB_MAIN_TYPE *Main, long maxmem) {
2397	GB_ERROR error = NULL;
2398	#if defined(TEST_DICT)
2399	long uncompressed_sum = 0;
2400	long compressed_sum = 0;
2401	#endif // TEST_DICT
2402
2403	printf("Creating GBDATA-Arrays..\n");
2404
2405	if (!error) {
2406	O_gbdByKey *gbk = g_b_opti_createGbdByKey(Main);
2407	int idx = -1;
2408
2409	printf("Creating dictionaries..\n");
2410
2411	#ifdef DEBUG
2412	// #define TEST_ONE // test only key specified below
2413	// #define TEST_SOME // test only some keys specified below
2414	#if defined(TEST_ONE)
2415	// select wanted index
2416	for (idx=0; idx<gbdByKey_cnt; idx++) { // title author dew_author ebi_journal name ua_tax date full_name ua_title
2417	if (gbk[idx].cnt && strcmp(quark2key(Main, idx), "tree")==0) break;
2418	}
2419	gb_assert(idx<gbdByKey_cnt);
2420	#endif
2421	#endif
2422
2423	#ifdef TEST_ONE
2424	// only create dictionary for index selected above (no loop)
2425	#else
2426	// create dictionaries for all indices (this is the normal operation)
2427	arb_progress progress("Optimizing key data", gbdByKey_cnt-1);
2428	for (idx = gbdByKey_cnt-1; idx >= 1 && !error; --idx, progress.inc_and_check_user_abort(error))
2429	#endif
2430
2431	{
2432	GB_DICTIONARY *dict;
2433
2434	GB_CSTR key_name = quark2key(Main, idx);
2435	GBDATA *gb_main = Main->gb_main();
2436
2437	#ifdef TEST_SOME
2438	if (!( // add all wanted keys here
2439	strcmp(key_name, "REF") == 0 \|\|
2440	strcmp(key_name, "ref") == 0
2441	)) continue;
2442	#endif // TEST_SOME
2443
2444	#ifndef TEST_ONE
2445	if (!gbk[idx].cnt) continue; // there are no entries with this quark
2446
2447	GB_TYPES type = gbk[idx].gbds[0]->type();
2448
2449	GB_begin_transaction(gb_main);
2450	int compression_mask = gb_get_compression_mask(Main, idx, type);
2451	GB_commit_transaction(gb_main);
2452
2453	if ((compression_mask & GB_COMPRESSION_DICTIONARY) == 0) continue; // compression with dictionary is not allowed
2454	if (strcmp(key_name, "data") == 0) continue;
2455	if (strcmp(key_name, "quality") == 0) continue;
2456	#endif
2457
2458	printf("- dictionary for '%s' (idx=%i)\n", key_name, idx);
2459	GB_begin_transaction(gb_main);
2460	dict = gb_create_dictionary(&(gbk[idx]), maxmem);
2461
2462	if (dict) {
2463	/* decompress with old dictionary and write
2464	all data of actual type without compression: */
2465
2466	printf(" * Uncompressing all with old dictionary ...\n");
2467
2468	size_t old_compressed_size;
2469
2470	{
2471	int& compr_mask = Main->keys[idx].compression_mask;
2472	int old_compr_mask = compr_mask;
2473
2474	size_t new_size;
2475
2476	compr_mask &= ~GB_COMPRESSION_DICTIONARY;
2477	error = readAndWrite(&gbk[idx], old_compressed_size, new_size);
2478	compr_mask = old_compr_mask;
2479	}
2480
2481	if (!error) {
2482	/* dictionary is saved in the following format:
2483	*
2484	* GB_NINT size
2485	* GB_NINT offsets[dict->words]
2486	* GB_NINT resort[dict->words]
2487	* char *text
2488	*/
2489
2490	int dict_buffer_size = sizeof(GB_NINT) * (1+dict->words*2) + dict->textlen;
2491	char dict_buffer = (char)gbm_get_mem(dict_buffer_size, GBM_DICT_INDEX);
2492	long old_dict_buffer_size;
2493	char *old_dict_buffer;
2494
2495	{
2496	GB_NINT nint = (GB_NINT)dict_buffer;
2497	int n;
2498
2499	*nint++ = htonl(dict->words);
2500	for (n=0; n<dict->words; n++) *nint++ = htonl(dict->offsets[n]);
2501	for (n=0; n<dict->words; n++) *nint++ = htonl(dict->resort[n]);
2502
2503	memcpy(nint, dict->text, dict->textlen);
2504	}
2505
2506	const char *key = Main->keys[idx].key;
2507
2508	error = gb_load_dictionary_data(gb_main, key, &old_dict_buffer, &old_dict_buffer_size);
2509	if (!error) {
2510	gb_save_dictionary_data(gb_main, key, dict_buffer, dict_buffer_size);
2511
2512	// compress all data with new dictionary
2513	printf(" * Compressing all with new dictionary ...\n");
2514
2515	size_t old_size, new_compressed_size;
2516	error = readAndWrite(&gbk[idx], old_size, new_compressed_size);
2517
2518	if (!error) {
2519	printf(" (compressed size: old=%zu new=%zu ratio=%.1f%%)\n",
2520	old_compressed_size, new_compressed_size, (new_compressed_size*100.0)/old_compressed_size);
2521
2522	// @@@ for some keys compression fails (e.g. 'ref');
2523	// need to find out why dictionary is so bad
2524	// gb_assert(new_compressed_size <= old_compressed_size); // @@@ enable this (fails in unit-test)
2525	}
2526
2527	if (error) {
2528	/* critical state: new dictionary has been written, but transaction will be aborted below.
2529	* Solution: Write back old dictionary.
2530	*/
2531	gb_save_dictionary_data(gb_main, key, old_dict_buffer, old_dict_buffer_size);
2532	}
2533	}
2534
2535	gbm_free_mem(dict_buffer, dict_buffer_size, GBM_DICT_INDEX);
2536	if (old_dict_buffer) gbm_free_mem(old_dict_buffer, old_dict_buffer_size, GBM_DICT_INDEX);
2537
2538	#if defined(TEST_DICT)
2539	if (!error) {
2540	GB_DICTIONARY *dict_reloaded = gb_get_dictionary(Main, idx);
2541	test_dictionary(dict_reloaded, &(gbk[idx]), &uncompressed_sum, &compressed_sum);
2542	}
2543	#endif // TEST_DICT
2544	}
2545
2546	gb_free_dictionary(dict);
2547	}
2548
2549	error = GB_end_transaction(gb_main, error);
2550	}
2551
2552	#ifdef TEST_DICT
2553	if (!error) {
2554	printf(" overall uncompressed size = %li b\n"
2555	" overall compressed size = %li b (Ratio=%li%%)\n",
2556	uncompressed_sum, compressed_sum,
2557	(compressed_sum*100)/uncompressed_sum);
2558	}
2559	#endif // TEST_DICT
2560
2561	printf("Done.\n");
2562
2563	g_b_opti_freeGbdByKey(gbk);
2564	}
2565
2566	return error;
2567	}
2568
2569	GB_ERROR GB_optimize(GBDATA *gb_main) {
2570	unsigned long maxKB = GB_get_usable_memory();
2571	long maxMem;
2572	GB_ERROR error = 0;
2573	GB_UNDO_TYPE prev_undo_type = GB_get_requested_undo_type(gb_main);
2574
2575	#ifdef DEBUG
2576	maxKB /= 2;
2577	#endif
2578
2579	if (maxKB<=(LONG_MAX/1024)) maxMem = maxKB*1024;
2580	else maxMem = LONG_MAX;
2581
2582	error = GB_request_undo_type(gb_main, GB_UNDO_KILL);
2583	if (!error) {
2584	error = gb_create_dictionaries(GB_MAIN(gb_main), maxMem);
2585	if (!error) GB_disable_quicksave(gb_main, "Database optimized");
2586	ASSERT_NO_ERROR(GB_request_undo_type(gb_main, prev_undo_type));
2587	}
2588	return error;
2589	}
2590
2591	// --------------------------------------------------------------------------------
2592
2593	#ifdef UNIT_TESTS
2594	#ifndef TEST_UNIT_H
2595	#include <test_unit.h>
2596	#endif
2597
2598	// #define TEST_AUTO_UPDATE // uncomment to auto-update binary result of DB optimization
2599	// #define TEST_AUTO_UPDATE_ASCII // uncomment to auto-update ascii-input from output (be careful with this!)
2600
2601	void TEST_SLOW_optimize() {
2602	// test DB optimization (optimizes compression)
2603	// [current coverage ~89%]
2604	//
2605	// Note: a test for sequence compression is in adseqcompr.cxx@TEST_SLOW_sequence_compression
2606
2607	GB_shell shell;
2608
2609	const char *source_ascii = "TEST_opti_ascii_in.arb";
2610	const char *target_ascii = "TEST_opti_ascii_out.arb";
2611
2612	const char *nonopti = "TEST_opti_none.arb";
2613	const char *optimized = "TEST_opti_initial.arb";
2614	const char *reoptimized = "TEST_opti_again.arb";
2615	const char *expected = "TEST_opti_expected.arb"; // expected result after optimize
2616
2617	// initial optimization of ASCII-DB
2618	{
2619	GBDATA *gb_main = GB_open(source_ascii, "rw");
2620
2621	TEST_EXPECT_NO_ERROR(GB_save_as(gb_main, nonopti, "b"));
2622
2623	GB_push_my_security(gb_main);
2624	TEST_EXPECT_NO_ERROR(GB_optimize(gb_main));
2625	GB_pop_my_security(gb_main);
2626
2627	GB_flush_cache(gb_main);
2628
2629	TEST_EXPECT_NO_ERROR(GB_save_as(gb_main, optimized, "b"));
2630	TEST_EXPECT_NO_ERROR(GB_save_as(gb_main, target_ascii, "a"));
2631
2632	#if defined(TEST_AUTO_UPDATE)
2633	TEST_COPY_FILE(optimized, expected);
2634	#endif
2635	#if defined(TEST_AUTO_UPDATE_ASCII)
2636	TEST_COPY_FILE(target_ascii, source_ascii);
2637	#endif
2638
2639	TEST_EXPECT_TEXTFILE_DIFFLINES(source_ascii, target_ascii, 0);
2640	TEST_EXPECT_FILES_EQUAL(optimized, expected);
2641
2642	// check that optimization made sense:
2643	long nonopti_size = GB_size_of_file(nonopti);
2644	long optimized_size = GB_size_of_file(optimized);
2645	TEST_EXPECT_LESS(optimized_size, nonopti_size); // did file shrink?
2646	TEST_EXPECT_EQUAL(optimized_size*100/nonopti_size, 74); // document compression ratio (in percent)
2647
2648	GB_close(gb_main);
2649	}
2650
2651	// re-optimize DB (which is already compressed)
2652	{
2653	GBDATA *gb_main = GB_open(optimized, "rw");
2654
2655	GB_push_my_security(gb_main);
2656	TEST_EXPECT_NO_ERROR(GB_optimize(gb_main));
2657	GB_pop_my_security(gb_main);
2658
2659	TEST_EXPECT_NO_ERROR(GB_save_as(gb_main, reoptimized, "b"));
2660	TEST_EXPECT_NO_ERROR(GB_save_as(gb_main, target_ascii, "a"));
2661
2662	TEST_EXPECT_TEXTFILE_DIFFLINES(source_ascii, target_ascii, 0);
2663	TEST_EXPECT_FILES_EQUAL(reoptimized, expected); // reoptimize should produce same result as initial optimize
2664
2665	GB_close(gb_main);
2666	}
2667
2668	TEST_EXPECT_NO_ERROR(GBK_system(GBS_global_string("rm %s %s %s %s", target_ascii, nonopti, optimized, reoptimized)));
2669	}
2670
2671	void TEST_ticket_742() {
2672	// reproduce problem reported in ticket #742
2673	GB_shell shell;
2674	{
2675	const char *opti_db = "TEST_opti_expected.arb"; // expected result after optimize
2676	GBDATA *gb_main = GBT_open(opti_db, "rw"); // use GBT_open to create index for 'name'
2677
2678	for (int suf = 2; suf<=4; ++suf) {
2679	char *sainame = GBS_global_string_copy("forcecompress_%i", suf);
2680	char *mod_sainame = GBS_global_string_copy("%s.mod", sainame);
2681
2682	GBDATA *gb_sai_name;
2683	{
2684	GB_transaction ta(gb_main);
2685
2686	GBDATA *gb_sai = GBT_find_SAI(gb_main, sainame); TEST_REJECT_NULL(gb_sai);
2687	gb_sai_name = GB_entry(gb_sai, "name"); TEST_REJECT_NULL(gb_sai_name);
2688
2689	TEST_EXPECT_EQUAL(GB_read_char_pntr(gb_sai_name), sainame);
2690
2691	TEST_EXPECT_NO_ERROR(GB_write_string(gb_sai_name, mod_sainame));
2692	TEST_EXPECT_EQUAL(GB_read_char_pntr(gb_sai_name), mod_sainame); // fixed
2693
2694	switch (suf) {
2695	case 2:
2696	// do nothing special (only test value in next TA below)
2697	break;
2698	case 3:
2699	// flushing the cache "solves" the problem => GB_write_string does not flush correctly
2700	GB_flush_cache(gb_sai_name);
2701	TEST_EXPECT_EQUAL(GB_read_char_pntr(gb_sai_name), mod_sainame);
2702	break;
2703	case 4:
2704	// writing twice also "solves" the problem
2705	TEST_EXPECT_NO_ERROR(GB_write_string(gb_sai_name, mod_sainame));
2706	TEST_EXPECT_EQUAL(GB_read_char_pntr(gb_sai_name), mod_sainame);
2707	break;
2708
2709
2710	default:
2711	TEST_REJECT(true);
2712	break;
2713	}
2714	}
2715
2716	{
2717	GB_transaction ta(gb_main);
2718	TEST_EXPECT_EQUAL(GB_read_char_pntr(gb_sai_name), mod_sainame); // ok in next TA
2719	}
2720
2721	free(mod_sainame);
2722	free(sainame);
2723	}
2724
2725	{
2726	GB_transaction ta(gb_main);
2727
2728	GBDATA *gb_species = GBT_find_species(gb_main, "FrnPhilo"); TEST_REJECT_NULL(gb_species);
2729	GBDATA *gb_data = GB_create(gb_species, "data", GB_STRING); TEST_REJECT_NULL(gb_data);
2730
2731	const char *seq1 = ".....................AUUCUGGUU-----GA--U-CC-U-G------------..............";
2732	const char *seq2 = "........A-A--CU---------------C-A-A-A-G-GA-G--AC---A-C-U-G...............";
2733
2734	TEST_EXPECT_NO_ERROR(GB_write_string(gb_data, seq1));
2735	TEST_EXPECT_EQUAL(GB_read_char_pntr(gb_data), seq1);
2736	TEST_EXPECT_EQUAL(gb_data->flags.compressed_data, 1);
2737	TEST_EXPECT_EQUAL(gb_data->flags2.extern_data, 1);
2738
2739	TEST_EXPECT_NO_ERROR(GB_write_string(gb_data, seq2));
2740	TEST_EXPECT_EQUAL(GB_read_char_pntr(gb_data), seq2);
2741	TEST_EXPECT_EQUAL(gb_data->flags.compressed_data, 1);
2742	TEST_EXPECT_EQUAL(gb_data->flags2.extern_data, 1);
2743
2744	TEST_EXPECT_NO_ERROR(GB_create_index(GBT_get_species_data(gb_main), "data", GB_IGNORE_CASE, 20));
2745
2746	TEST_EXPECT_NO_ERROR(GB_write_string(gb_data, seq1));
2747	TEST_EXPECT_EQUAL(GB_read_char_pntr(gb_data), seq1); // fixed
2748
2749	TEST_EXPECT_NO_ERROR(GB_write_string(gb_data, seq2));
2750	TEST_EXPECT_EQUAL(GB_read_char_pntr(gb_data), seq2); // seems to work ...
2751
2752	GB_flush_cache(gb_data);
2753	TEST_EXPECT_EQUAL(GB_read_char_pntr(gb_data), seq2); // fixed
2754	}
2755
2756	GB_close(gb_main);
2757	}
2758	}
2759
2760	void TEST_streamed_ascii_save_asUsedBy_silva_pipeline() {
2761	GB_shell shell;
2762
2763	const char *loadname = "TEST_loadsave_ascii.arb";
2764	const char *savename = "TEST_streamsaved.arb";
2765
2766	{
2767	GBDATA *gb_main1 = GB_open(loadname, "r"); TEST_REJECT_NULL(gb_main1);
2768	GBDATA *gb_main2 = GB_open(loadname, "rw"); TEST_REJECT_NULL(gb_main2);
2769
2770	// delete all species from DB2
2771	GBDATA *gb_species_data2 = NULL;
2772	{
2773	GB_transaction ta(gb_main2);
2774	GB_push_my_security(gb_main2);
2775	gb_species_data2 = GBT_get_species_data(gb_main2);
2776	for (GBDATA *gb_species = GBT_first_species_rel_species_data(gb_species_data2);
2777	gb_species;
2778	gb_species = GBT_next_species(gb_species))
2779	{
2780	TEST_EXPECT_NO_ERROR(GB_delete(gb_species));
2781	}
2782	GB_pop_my_security(gb_main2);
2783	}
2784
2785	for (int saveWhileTransactionOpen = 0; saveWhileTransactionOpen<=1; ++saveWhileTransactionOpen) {
2786	{
2787	ArbDBWriter *writer = NULL;
2788	TEST_EXPECT_NO_ERROR(GB_start_streamed_save_as(gb_main2, savename, "a", writer));
2789	TEST_EXPECT_NO_ERROR(GB_stream_save_part(writer, gb_main2, gb_species_data2));
2790
2791	{
2792	GB_transaction ta1(gb_main1);
2793	GB_transaction ta2(gb_main2);
2794
2795	for (GBDATA *gb_species1 = GBT_first_species(gb_main1);
2796	gb_species1;
2797	gb_species1 = GBT_next_species(gb_species1))
2798	{
2799	GBDATA *gb_species2 = GB_create_container(gb_species_data2, "species");
2800	if (!gb_species2) {
2801	TEST_EXPECT_NO_ERROR(GB_await_error());
2802	}
2803	else {
2804	TEST_EXPECT_NO_ERROR(GB_copy_with_protection(gb_species2, gb_species1, true));
2805	GB_write_flag(gb_species2, GB_read_flag(gb_species1)); // copy marked flag
2806	}
2807
2808	if (!saveWhileTransactionOpen) GB_commit_transaction(gb_main2);
2809	TEST_EXPECT_NO_ERROR(GB_stream_save_part(writer, gb_species2, gb_species2));
2810	if (!saveWhileTransactionOpen) GB_begin_transaction(gb_main2);
2811
2812	GB_push_my_security(gb_main2);
2813	TEST_EXPECT_NO_ERROR(GB_delete(gb_species2));
2814	GB_pop_my_security(gb_main2);
2815	}
2816	}
2817
2818	TEST_EXPECT_NO_ERROR(GB_stream_save_part(writer, gb_species_data2, gb_main2));
2819	TEST_EXPECT_NO_ERROR(GB_finish_stream_save(writer));
2820	}
2821
2822	// test file content
2823	TEST_EXPECT_TEXTFILES_EQUAL(savename, loadname);
2824	TEST_EXPECT_ZERO_OR_SHOW_ERRNO(GB_unlink(savename));
2825	}
2826
2827	// test some error cases:
2828	{
2829	// 1. stream binary (not supported)
2830	ArbDBWriter *writer = NULL;
2831	TEST_EXPECT_NO_ERROR(GB_start_streamed_save_as(gb_main2, savename, "b", writer));
2832	TEST_EXPECT_ERROR_CONTAINS(GB_stream_save_part(writer, gb_main2, gb_species_data2), "only supported for ascii");
2833	GB_finish_stream_save(writer);
2834	TEST_EXPECT(!GB_is_regularfile(savename)); // no partial file remains
2835
2836	// 2. invalid use of GB_stream_save_part (not ancestors)
2837	TEST_EXPECT_NO_ERROR(GB_start_streamed_save_as(gb_main2, savename, "a", writer));
2838	TEST_EXPECT_NO_ERROR(GB_stream_save_part(writer, gb_main2, gb_species_data2));
2839	GBDATA *gb_extended_data2;
2840	{
2841	GB_transaction ta(gb_main2);
2842	gb_extended_data2 = GBT_get_SAI_data(gb_main2);
2843	}
2844	TEST_EXPECT_ERROR_CONTAINS(GB_stream_save_part(writer, gb_species_data2, gb_extended_data2), "has to be an ancestor");
2845	GB_finish_stream_save(writer);
2846	TEST_EXPECT(!GB_is_regularfile(savename)); // no partial file remains
2847
2848	}
2849	// TEST_EXPECT_ZERO_OR_SHOW_ERRNO(GB_unlink(savename));
2850
2851	GB_close(gb_main2);
2852	GB_close(gb_main1);
2853	}
2854	}
2855
2856	TEST_PUBLISH(TEST_streamed_ascii_save_asUsedBy_silva_pipeline);
2857
2858	#endif // UNIT_TESTS
2859
2860	// --------------------------------------------------------------------------------
2861

Note: See TracBrowser for help on using the repository browser.

Download in other formats: