Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

adoptimize.cxx

Visit:

Last change on this file was 19506, checked in by westram, 3 months ago
fix gcc10.x vectorization checks: update `LOOP_VECTORIZED` comment: fix one vectorization for gcc 10.x series. Depends on UNIT_TEST=0\|1. document vectorization count may depend on UNIT_TEST.
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 94.7 KB

Line
1	// =============================================================== //
2	// //
3	// File : adoptimize.cxx //
4	// Purpose : //
5	// //
6	// Institute of Microbiology (Technical University Munich) //
7	// http://www.arb-home.de/ //
8	// //
9	// =============================================================== //
10
11	#include <climits>
12	#include <netinet/in.h>
13
14	#include <arb_file.h>
15	#include <arb_diff.h>
16
17	#include <arbdbt.h>
18
19	#include "gb_key.h"
20	#include "gb_compress.h"
21	#include "gb_dict.h"
22
23	#include "arb_progress.h"
24
25	#if defined(DEBUG)
26	// #define TEST_DICT
27	#endif // DEBUG
28
29	typedef unsigned char unsigned_char;
30	typedef unsigned char *u_str;
31	typedef const unsigned char *cu_str;
32
33	static int gbdByKey_cnt;
34	struct O_gbdByKey { // one for each diff. keyQuark
35	int cnt;
36	GBDATA **gbds; // gbdoff
37	};
38
39	struct FullDictTree;
40	struct SingleDictTree;
41
42	union DictTree {
43	FullDictTree *full;
44	SingleDictTree *single;
45	void *exists;
46
47	};
48
49	enum DictNodeType { SINGLE_NODE, FULL_NODE };
50
51	struct FullDictTree {
52	DictNodeType typ; // always FULL_NODE
53	int usedSons;
54	int count[256];
55	DictTree son[256]; // index == character
56	};
57
58	struct SingleDictTree {
59	DictNodeType typ; // always SINGLE_NODE
60	unsigned_char ch; // the character
61	int count; // no of occurrences of this branch
62	DictTree son;
63	DictTree brother;
64
65	};
66
67	// **************************************************
68
69	#define COMPRESSIBLE(type) ((type) >= GB_BYTES && (type)<=GB_STRING)
70	#define DICT_MEM_WEIGHT 4
71
72	#define WORD_HELPFUL(wordlen, occurrences) ((long)((occurrences)3 + DICT_MEM_WEIGHT(2*sizeof(GB_NINT)+(wordlen))) \
73	< \
74	(long)((occurrences)*(wordlen)))
75	/* (occurrences)*4 compressed size
76	* 2*sizeof(GB_NINT)+(wordlen) size in dictionary
77	* (occurrences)*(wordlen) uncompressed size
78	*/
79
80	// **************************************************
81
82	#define MIN_WORD_LEN 8 // minimum length of words in dictionary
83	#define MAX_WORD_LEN 50 // maximum length of words in dictionary
84	#define MAX_BROTHERS 10 /* maximum no of brothers linked with SingleDictTree
85	* above we use FullDictTree */
86	#define MAX_DIFFER 2 /* percentage of difference (of occurrences of strings) below which two
87	* consecutive parts are treated as EQUAL # of occurrences */
88	#define INCR_DIFFER 1 // the above percentage is incremented from 0 to MAX_DIFFER by INCR_DIFFER per step
89
90	#define DICT_STRING_INCR 1024 // dictionary string will be incremented by this size
91
92	// ***************** Tool functions ****************
93
94	inline cu_str get_data_n_size(GBDATA gbd, size_t size) {
95	GB_CSTR data;
96	*size = 0;
97
98	switch (gbd->type()) {
99	case GB_STRING: data = GB_read_char_pntr(gbd); break;
100	case GB_BYTES: data = GB_read_bytes_pntr(gbd); break;
101	case GB_INTS: data = (char*)GB_read_ints_pntr(gbd); break;
102	case GB_FLOATS: data = (char*)GB_read_floats_pntr(gbd); break;
103	default:
104	data = NULp;
105	gb_assert(0);
106	break;
107	}
108
109	if (data) *size = gbd->as_entry()->uncompressed_size();
110	return (cu_str)data;
111	}
112
113	static inline long min(long a, long b) {
114	return a<b ? a : b;
115	}
116
117	// **************************************************
118
119	static void g_b_opti_scanGbdByKey(GB_MAIN_TYPE Main, GBDATA gbd, O_gbdByKey *gbk) {
120	if (gbd->is_container()) {
121	GBCONTAINER *gbc = gbd->as_container();
122	for (int idx=0; idx < gbc->d.nheader; idx++) {
123	GBDATA *gbd2 = GBCONTAINER_ELEM(gbc, idx);
124	if (gbd2) g_b_opti_scanGbdByKey(Main, gbd2, gbk);
125	}
126	}
127
128	GBQUARK quark = GB_KEY_QUARK(gbd);
129	if (quark) {
130	gb_assert(gbk[quark].cnt < Main->keys[quark].nref \|\| quark==0);
131	gb_assert(gbk[quark].gbds);
132
133	gbk[quark].gbds[gbk[quark].cnt] = gbd;
134	gbk[quark].cnt++;
135	}
136	}
137
138	static O_gbdByKey g_b_opti_createGbdByKey(GB_MAIN_TYPE Main) {
139	O_gbdByKey *gbk = ARB_calloc<O_gbdByKey>(Main->keycnt);
140
141	gbdByKey_cnt = Main->keycnt; // always use gbdByKey_cnt instead of Main->keycnt cause Main->keycnt can change
142
143	for (int idx=1; idx<gbdByKey_cnt; idx++) {
144	gbk[idx].cnt = 0;
145
146	gb_Key& KEY = Main->keys[idx];
147	if (KEY.key && KEY.nref>0) {
148	ARB_calloc(gbk[idx].gbds, KEY.nref);
149	}
150	else {
151	gbk[idx].gbds = NULp;
152	}
153	}
154
155	gbk[0].cnt = 0;
156	ARB_calloc(gbk[0].gbds, 1);
157
158	g_b_opti_scanGbdByKey(Main, Main->gb_main(), gbk);
159
160	for (int idx=0; idx<gbdByKey_cnt; idx++) {
161	if (gbk[idx].cnt != Main->keys[idx].nref && idx) {
162	printf("idx=%i gbk[idx].cnt=%i Main->keys[idx].nref=%li\n", // Main->keys[].nref ist falsch
163	idx, gbk[idx].cnt, Main->keys[idx].nref);
164
165	Main->keys[idx].nref = gbk[idx].cnt;
166	}
167	}
168	return gbk;
169	}
170
171	static void g_b_opti_freeGbdByKey(O_gbdByKey *gbk) {
172	for (int idx=0; idx<gbdByKey_cnt; idx++) free(gbk[idx].gbds);
173	free(gbk);
174	}
175
176	// ***************** Convert old compression style to new style ****************
177
178	static GB_ERROR gb_convert_compression(GBDATA *gbd) {
179	GB_ERROR error = NULp;
180
181	if (gbd->is_container()) {
182	for (GBDATA *gb_child = GB_child(gbd); gb_child; gb_child = GB_nextChild(gb_child)) {
183	if (gb_child->flags.compressed_data \|\| gb_child->is_container()) {
184	error = gb_convert_compression(gb_child);
185	if (error) break;
186	}
187	}
188	}
189	else {
190	char *str = NULp;
191	GBENTRY *gbe = gbd->as_entry();
192	long elems = gbe->size();
193	size_t data_size = gbe->uncompressed_size();
194	size_t new_size = -1;
195	int expectData = 1;
196
197	switch (gbd->type()) {
198	case GB_STRING:
199	case GB_OBSOLETE:
200	case GB_BYTES:
201	str = gb_uncompress_bytes(gbe->data(), data_size, &new_size);
202	if (str) {
203	gb_assert(new_size == data_size);
204	str = GB_memdup(str, data_size);
205	}
206	break;
207
208	case GB_INTS:
209	case GB_FLOATS:
210	str = gb_uncompress_longs_old(gbe->data(), elems, &new_size);
211	if (str) {
212	gb_assert(new_size == data_size);
213	str = GB_memdup(str, data_size);
214	}
215	break;
216
217	default:
218	expectData = 0;
219	break;
220	}
221
222	if (!str) {
223	if (expectData) {
224	error = GBS_global_string("Can't read old data to convert compression (Reason: %s)", GB_await_error());
225	}
226	}
227	else {
228	switch (gbd->type()) {
229	case GB_STRING:
230	error = GB_write_string(gbe, "");
231	if (!error) error = GB_write_string(gbe, str);
232	break;
233
234	case GB_OBSOLETE:
235	error = GB_set_temporary(gbe); // exclude obsolete type from next save
236	break;
237
238	case GB_BYTES:
239	error = GB_write_bytes(gbe, "", 0);
240	if (!error) error = GB_write_bytes(gbe, str, data_size);
241	break;
242
243	case GB_INTS:
244	case GB_FLOATS:
245	error = GB_write_pntr(gbe, str, data_size, elems);
246	break;
247
248	default:
249	gb_assert(0);
250	break;
251	}
252
253	free(str);
254	}
255	}
256	return error;
257	}
258
259	GB_ERROR gb_convert_V2_to_V3(GBDATA *gb_main) {
260	GB_ERROR error = NULp;
261	GBDATA *gb_system = GB_search(gb_main, GB_SYSTEM_FOLDER, GB_FIND);
262
263	if (!gb_system) {
264	GB_create_container(gb_main, GB_SYSTEM_FOLDER);
265	if (GB_entry(gb_main, "extended_data")) {
266	GB_warning("Converting data from old V2.0 to V2.1 Format:\n"
267	" Please Wait (may take some time)");
268	}
269	error = gb_convert_compression(gb_main);
270	GB_disable_quicksave(gb_main, "Database converted to new format");
271	}
272	return error;
273	}
274
275
276	// ******************* Compress by dictionary ******************
277
278
279	/* compression tag format:
280	*
281	* unsigned int compressed:1;
282	* if compressed==0:
283	* unsigned int last:1; ==1 -> this is the last block
284	* unsigned int len:6; length of uncompressible bytes
285	* char[len];
286	* if compressed==1:
287	* unsigned int idxlen:1; ==0 -> 10-bit index
288	* ==1 -> 18-bit index
289	* unsigned int idxhigh:2; the 2 highest bits of the index
290	* unsigned int len:4; (length of word) - (MIN_COMPR_WORD_LEN-1)
291	* if len==0:
292	* char extralen; (length of word) -
293	* char[idxlen+1]; index (low,high)
294	*
295	* tag == 64 -> end of dictionary compressed block (if not coded in last uncompressed block)
296	*/
297
298	inline int INDEX_DICT_OFFSET(int idx, GB_DICTIONARY *dict) {
299	gb_assert(idx<dict->words);
300	return ntohl(dict->offsets[idx]);
301	}
302	inline int ALPHA_DICT_OFFSET(int idx, GB_DICTIONARY *dict) {
303	int realIndex;
304	gb_assert(idx<dict->words);
305	realIndex = ntohl(dict->resort[idx]);
306	return INDEX_DICT_OFFSET(realIndex, dict);
307	}
308
309	// #define ALPHA_DICT_OFFSET(i) ntohl(offset[ntohl(resort[i])])
310	// #define INDEX_DICT_OFFSET(i) ntohl(offset[i])
311
312	#define LEN_BITS 4
313	#define INDEX_BITS 2
314	#define INDEX_LEN_BITS 1
315
316	#define LEN_SHIFT 0
317	#define INDEX_SHIFT (LEN_SHIFT+LEN_BITS)
318	#define INDEX_LEN_SHIFT (INDEX_SHIFT+INDEX_BITS)
319
320	#define BITMASK(bits) ((1<<(bits))-1)
321	#define GETVAL(tag,typ) (((tag)>>typ##_SHIFT)&BITMASK(typ##_BITS))
322
323	#define MIN_SHORTLEN 6
324	#define MAX_SHORTLEN (BITMASK(LEN_BITS)+MIN_SHORTLEN-1)
325	#define MIN_LONGLEN (MAX_SHORTLEN+1)
326	#define MAX_LONGLEN (MIN_LONGLEN+255)
327
328	#define SHORTLEN_DECR (MIN_SHORTLEN-1) // !! zero is used as flag for long len !!
329	#define LONGLEN_DECR MIN_LONGLEN
330
331	#define MIN_COMPR_WORD_LEN MIN_SHORTLEN
332	#define MAX_COMPR_WORD_LEN MAX_LONGLEN
333
334	#define MAX_SHORT_INDEX BITMASK(INDEX_BITS+8)
335	#define MAX_LONG_INDEX BITMASK(INDEX_BITS+16)
336
337	#define LAST_COMPRESSED_BIT 64
338
339	#ifdef DEBUG
340	# define DUMP_COMPRESSION_TEST 0
341	/* 0 = only compression ratio
342	* 1 = + original/compressed/decompressed
343	* 2 = + words used to compress/uncompress
344	* 3 = + matching words in dictionary
345	* 4 = + search of words in dictionary
346	*/
347	#else
348	# define DUMP_COMPRESSION_TEST 0
349	#endif
350
351	#ifdef DEBUG
352	// #define COUNT_CHUNKS
353
354	#if defined(COUNT_CHUNKS)
355
356	static long uncompressedBlocks[64];
357	static long compressedBlocks[MAX_LONGLEN];
358
359	static void clearChunkCounters() {
360	int i;
361
362	for (i=0; i<64; i++) uncompressedBlocks[i] = 0;
363	for (i=0; i<MAX_LONGLEN; i++) compressedBlocks[i] = 0;
364	}
365
366	static void dumpChunkCounters() {
367	int i;
368
369	printf("------------------------------\n" "Uncompressed blocks used:\n");
370	for (i=0; i<64; i++) if (uncompressedBlocks[i]) printf(" size=%i used=%li\n", i, uncompressedBlocks[i]);
371	printf("------------------------------\n" "Words used:\n");
372	for (i=0; i<MAX_LONGLEN; i++) if (compressedBlocks[i]) printf(" size=%i used=%li\n", i, compressedBlocks[i]);
373	printf("------------------------------\n");
374	}
375	#endif // COUNT_CHUNKS
376
377	static cu_str lstr(cu_str s, int len) {
378	#define BUFLEN 20000
379	static unsigned_char buf[BUFLEN];
380
381	gb_assert(len<BUFLEN);
382	memcpy(buf, s, len);
383	buf[len] = 0;
384
385	return buf;
386	}
387
388	#if DUMP_COMPRESSION_TEST>=2
389
390	static cu_str dict_word(GB_DICTIONARY *dict, int idx, int len) {
391	return lstr(dict->text+INDEX_DICT_OFFSET(idx, dict), len);
392	}
393
394	#endif
395
396	#if DUMP_COMPRESSION_TEST>=1
397
398	static void dumpBinary(u_str data, long size) {
399	#define PER_LINE 12
400	int cnt = 0;
401
402	while (size--) {
403	unsigned_char c = *data++;
404	int bitval = 128;
405	int bits = 8;
406
407	while (bits--) {
408	putchar(c&bitval ? '1' : '0');
409	bitval>>=1;
410	}
411	putchar(' ');
412
413	cnt = (cnt+1)%PER_LINE;
414	if (!cnt) putchar('\n');
415	}
416
417	if (cnt) putchar('\n');
418	}
419
420	#endif
421
422	#endif
423
424	inline int GB_MEMCMP(const void vm1, const void vm2, long size) {
425	char c1 = (char)vm1,
426	c2 = (char)vm2;
427	int diff = 0;
428
429	while (size-- && !diff) diff = c1++-c2++;
430	return diff;
431	}
432
433	// --------------------------------------------------
434
435	static int searchWord(GB_DICTIONARY dict, cu_str source, long size, unsigned long wordIndex, int *wordLen) {
436	int idx = -1;
437	int l = 0;
438	int h = dict->words-1;
439	cu_str text = dict->text;
440	GB_NINT *resort = dict->resort;
441	int dsize = dict->textlen;
442	int ilen = 0;
443
444	while (l<h-1) {
445	int m = (l+h)/2;
446	long off = ALPHA_DICT_OFFSET(m, dict);
447	cu_str dictword = text+off;
448	long msize = min(size, dsize-off);
449
450	#if DUMP_COMPRESSION_TEST>=4
451	printf(" %s (%i)\n", lstr(dictword, 20), m);
452	#endif
453
454	if (GB_MEMCMP(source, dictword, msize)<=0) h = m;
455	else l = m;
456	}
457
458	while (l<=h) {
459	int off = ALPHA_DICT_OFFSET(l, dict);
460	cu_str word = text+off;
461	int msize = (int)min(size, dsize-off);
462	int equal = 0;
463	cu_str s = source;
464
465	while (msize-- && s++==word++) equal++;
466
467	#if DUMP_COMPRESSION_TEST>=3
468	if (equal>=MIN_COMPR_WORD_LEN) {
469	printf(" EQUAL=%i '%s' (%i->%i, off=%i)", equal, lstr(text+off, equal), l, ntohl(resort[l]), ALPHA_DICT_OFFSET(l, dict));
470	printf(" (context=%s)\n", lstr(text+off-min(off, 20), min(off, 20)+equal+20));
471	}
472	#endif
473
474	if (equal>ilen) {
475	ilen = equal;
476	idx = ntohl(resort[l]);
477	gb_assert(idx<dict->words);
478	}
479
480	l++;
481	}
482
483	*wordIndex = idx;
484	*wordLen = (int)min(ilen, MAX_COMPR_WORD_LEN);
485
486	return idx!=-1 && ilen>=MIN_COMPR_WORD_LEN;
487	}
488
489	#ifdef DEBUG
490	int lookup_DICTIONARY(GB_DICTIONARY *dict, GB_CSTR source) { // used for debugging
491	unsigned long wordIndex;
492	int wordLen;
493	int wordFound = searchWord(dict, (cu_str)source, strlen(source), &wordIndex, &wordLen);
494
495	if (wordFound) {
496	printf("'%s' (idx=%lu, off=%i)\n", lstr(dict->text+ntohl(dict->offsets[wordIndex]), wordLen), wordIndex, ntohl(dict->offsets[wordIndex]));
497	}
498
499	return wordFound;
500	}
501	#endif
502
503
504
505	static char gb_uncompress_by_dictionary_internal(GB_DICTIONARY dict, /* GBDATA gbd, / GB_CSTR s_source, const size_t size, bool append_zero, size_t *new_size) {
506	cu_str source = (cu_str)s_source;
507	u_str dest;
508	u_str buffer;
509	cu_str text = dict->text;
510	int done = 0;
511	long left = size;
512
513	dest = buffer = (u_str)GB_give_other_buffer(s_source, size+2);
514
515	while (left && !done) {
516	int c;
517
518	if ((c=*source++)&128) { // compressed data
519	int indexLen = GETVAL(c, INDEX_LEN);
520	unsigned long idx = GETVAL(c, INDEX);
521
522	c = GETVAL(c, LEN); // ==wordLen
523	if (c) c += SHORTLEN_DECR;
524	else c = *source+++LONGLEN_DECR;
525
526	gb_assert(indexLen>=0 && indexLen<=1);
527
528	if (indexLen==0) {
529	idx = (idx << 8) \| *source++;
530	}
531	else {
532	idx = (((idx << 8) \| source[1]) << 8) \| source[0];
533	source += 2;
534	}
535
536	gb_assert(idx<(GB_ULONG)dict->words);
537
538	{
539	cu_str word = text+INDEX_DICT_OFFSET(idx, dict);
540
541	#if DUMP_COMPRESSION_TEST>=2
542	printf(" word='%s' (idx=%lu, off=%li, len=%i)\n",
543	lstr(word, c), idx, (long)ntohl(dict->offsets[idx]), c);
544	#endif
545
546	{
547	u_str d = dest;
548	gb_assert(((d + c) <= word) \|\| (d >= (word + c)));
549	while (c--) d++ = word++; // LOOP_VECTORIZED
550	dest = d;
551	}
552	}
553	}
554	else { // uncompressed bytes
555	if (c & LAST_COMPRESSED_BIT) {
556	done = 1;
557	c ^= LAST_COMPRESSED_BIT;
558	}
559
560	left -= c;
561	{
562	u_str d = dest;
563	gb_assert(((d + c) <= source) \|\| (d >= (source + c)));
564	while (c--) d++ = source++; // LOOP_VECTORIZED
565	dest=d;
566	}
567	}
568	}
569
570	if (append_zero) *dest++ = 0;
571
572	*new_size = dest-buffer;
573	gb_assert(size >= *new_size); // buffer overflow
574
575	return (char *)buffer;
576	}
577
578	char gb_uncompress_by_dictionary(GBDATA gbd, GB_CSTR s_source, size_t size, size_t *new_size) {
579	GB_DICTIONARY *dict = gb_get_dictionary(GB_MAIN(gbd), GB_KEY_QUARK(gbd));
580	bool append_zero = gbd->is_a_string();
581
582	if (!dict) {
583	GB_ERROR error = GBS_global_string("Cannot decompress db-entry '%s' (no dictionary found)\n", GB_get_db_path(gbd));
584	GB_export_error(error);
585	return NULp;
586	}
587
588	return gb_uncompress_by_dictionary_internal(dict, s_source, size, append_zero, new_size);
589	}
590
591	char gb_compress_by_dictionary(GB_DICTIONARY dict, GB_CSTR s_source, size_t size, size_t *msize, int last_flag, int search_backward, int search_forward) {
592	cu_str source = (cu_str)s_source;
593	u_str dest;
594	u_str buffer;
595	cu_str unknown = source; // start of uncompressible bytes
596	u_str lastUncompressed = NULp; // ptr to start of last block of uncompressible bytes (in dest)
597
598	#if defined(ASSERTION_USED)
599	const size_t org_size = size;
600	#endif // ASSERTION_USED
601
602	gb_assert(size>0); // compression of zero-length data fails!
603
604	dest = buffer = (u_str)GB_give_other_buffer((GB_CSTR)source, 1+(size/63+1)+size);
605	*dest++ = GB_COMPRESSION_DICTIONARY \| last_flag;
606
607	while (size) {
608	unsigned long wordIndex;
609	int wordLen;
610	int wordFound;
611
612	if ((wordFound = searchWord(dict, source, size, &wordIndex, &wordLen))) {
613	int length;
614
615	takeRest :
616	length = source-unknown;
617
618	if (length) {
619	int shift;
620	int takeShift = 0;
621	int maxShift = (int)min(search_forward, wordLen-1);
622
623	for (shift=1; shift<=maxShift; shift++) {
624	unsigned long wordIndex2;
625	int wordLen2;
626	int wordFound2;
627
628	if ((wordFound2 = searchWord(dict, source+shift, size-shift, &wordIndex2, &wordLen2))) {
629	if (wordLen2>(wordLen+shift)) {
630	wordIndex = wordIndex2;
631	wordLen = wordLen2;
632	takeShift = shift;
633	}
634	}
635	}
636
637	if (takeShift) {
638	source += takeShift;
639	size -= takeShift;
640	length = source-unknown;
641	}
642	}
643
644	while (length) { // if there were uncompressible bytes
645	int take = (int)min(length, 63);
646
647	#ifdef COUNT_CHUNKS
648	uncompressedBlocks[take]++;
649	#endif
650
651	lastUncompressed = dest;
652
653	*dest++ = take; // tag byte
654	memcpy(dest, unknown, take);
655	dest += take;
656	unknown += take;
657	length -= take;
658	}
659
660	gb_assert(unknown==source);
661
662	while (wordFound) { // as long as we find words in dictionary
663	int indexLen = wordIndex>MAX_SHORT_INDEX;
664	int indexHighBits = indexLen==0 ? wordIndex>>8 : wordIndex>>16;
665	int nextWordFound;
666	int nextWordLen;
667	unsigned long nextWordIndex;
668
669	gb_assert((long)wordIndex<dict->words);
670	gb_assert((long)wordIndex <= MAX_LONG_INDEX);
671	gb_assert(indexHighBits==(indexHighBits & BITMASK(INDEX_BITS)));
672	gb_assert(wordLen>=MIN_SHORTLEN);
673
674	lastUncompressed = NULp;
675
676	{
677	cu_str source2 = source+wordLen;
678	long size2 = size-wordLen;
679
680	if (!(nextWordFound=searchWord(dict, source+wordLen, size-wordLen, &nextWordIndex, &nextWordLen))) { // no word right afterwards
681	int shift;
682
683	for (shift=1; shift<=search_backward && shift<(wordLen-MIN_COMPR_WORD_LEN); shift++) {
684	// try to cut end of word to get a better result
685	unsigned long wordIndex2;
686	int wordLen2;
687	int wordFound2;
688
689	if ((wordFound2=searchWord(dict, source2-shift, size2+shift, &wordIndex2, &wordLen2))) {
690	if (wordLen2>(shift+1)) {
691	wordLen -= shift;
692
693	nextWordFound = 1;
694	nextWordIndex = wordIndex2;
695	nextWordLen = wordLen2;
696	break;
697	}
698	}
699	}
700	}
701	}
702
703	#ifdef COUNT_CHUNKS
704	compressedBlocks[wordLen]++;
705	#endif
706
707	#if DUMP_COMPRESSION_TEST>=2
708	printf(" word='%s' (idx=%li, off=%i, len=%i)\n",
709	dict_word(dict, wordIndex, wordLen), wordIndex, (int)ntohl(dict->offsets[wordIndex]), wordLen);
710	#endif
711
712	if (wordLen<=MAX_SHORTLEN) {
713	*dest++ = 128 \|
714	(indexLen << INDEX_LEN_SHIFT) \|
715	(indexHighBits << INDEX_SHIFT) \|
716	((wordLen-SHORTLEN_DECR) << LEN_SHIFT);
717	}
718	else {
719	*dest++ = 128 \|
720	(indexLen << INDEX_LEN_SHIFT) \|
721	(indexHighBits << INDEX_SHIFT);
722	*dest++ = wordLen-LONGLEN_DECR; // extra length byte
723	}
724
725	*dest++ = (char)wordIndex; // low index byte
726	if (indexLen)
727	*dest++ = (char)(wordIndex >> 8); // high index byte
728
729	unknown = source += wordLen;
730	size -= wordLen;
731
732	wordFound = nextWordFound;
733	wordIndex = nextWordIndex;
734	wordLen = nextWordLen;
735	}
736	}
737	else {
738	source++;
739	if (--size==0) goto takeRest;
740	}
741	}
742
743	if (lastUncompressed) *lastUncompressed \|= LAST_COMPRESSED_BIT;
744	else *dest++ = LAST_COMPRESSED_BIT;
745
746	*msize = dest-buffer;
747
748	#if defined(ASSERTION_USED)
749	{
750	size_t new_size = -1;
751	char *test = gb_uncompress_by_dictionary_internal(dict, (GB_CSTR)buffer+1, org_size + GB_COMPRESSION_TAGS_SIZE_MAX, true, &new_size);
752
753	gb_assert(memcmp(test, s_source, org_size) == 0);
754	gb_assert((org_size+1) == new_size);
755	}
756	#endif // ASSERTION_USED
757
758	return (char*)buffer;
759	}
760
761
762	#if defined(TEST_DICT)
763
764	static void test_dictionary(GB_DICTIONARY dict, O_gbdByKey gbk, long uncompSum, long compSum) {
765	long uncompressed_sum = 0;
766	long compressed_sum = 0;
767
768	long dict_size = (dict->words2+1)sizeof(GB_NINT)+dict->textlen;
769
770	long char_count[256];
771	for (int i=0; i<256; i++) char_count[i] = 0;
772
773	printf(" * Testing compression..\n");
774
775	#ifdef COUNT_CHUNKS
776	clearChunkCounters();
777	#endif
778
779	for (int cnt=0; cnt<gbk->cnt; cnt++) {
780	GBDATA *gbd = gbk->gbds[cnt];
781
782	if (COMPRESSIBLE(gbd->type())) {
783	size_t size;
784	cu_str data = get_data_n_size(gbd, &size);
785
786	if (gbd->is_a_string()) size--;
787	if (size<1) continue;
788
789	u_str copy = (u_str)gbm_get_mem(size, GBM_DICT_INDEX);
790	gb_assert(copy!=0);
791	memcpy(copy, data, size);
792
793	#if DUMP_COMPRESSION_TEST>=1
794	printf("----------------------------\n");
795	printf("original : %3li b = '%s'\n", size, data);
796	#endif
797
798	int last_flag = 0;
799	size_t compressedSize;
800	u_str compressed = (u_str)gb_compress_by_dictionary(dict, (GB_CSTR)data, size, &compressedSize, last_flag, 9999, 2);
801
802	#if DUMP_COMPRESSION_TEST>=1
803	printf("compressed : %3li b = '%s'\n", compressedSize, lstr(compressed, compressedSize));
804	dumpBinary(compressed, compressedSize);
805	#endif
806
807	for (size_t i=0; i<compressedSize; i++) char_count[compressed[i]]++;
808
809	size_t new_size = -1;
810	u_str uncompressed = (u_str)gb_uncompress_by_dictionary(gbd, (char*)compressed+1, size+GB_COMPRESSION_TAGS_SIZE_MAX, &new_size);
811
812	#if DUMP_COMPRESSION_TEST>=1
813	printf("copy : %3li b = '%s'\n", size, lstr(copy, size));
814	printf("decompressed: %3li b = '%s'\n", size, lstr(uncompressed, size));
815	#endif
816
817	if (GB_MEMCMP(copy, uncompressed, size)!=0) {
818	int byte = 0;
819
820	while (copy[byte]==uncompressed[byte]) byte++;
821	printf("Error in compression (off=%i, '%s'", byte, lstr(copy+byte, 10));
822	printf("!='%s'\n", lstr(uncompressed+byte, 10));
823	}
824
825	if (compressedSize<size) {
826	uncompressed_sum += size;
827	compressed_sum += compressedSize;
828	}
829	else {
830	uncompressed_sum += size;
831	compressed_sum += size;
832	}
833
834	gbm_free_mem(copy, size, GBM_DICT_INDEX);
835	}
836	}
837
838	#ifdef COUNT_CHUNKS
839	dumpChunkCounters();
840	#endif
841
842	{
843	long compressed_plus_dict = compressed_sum+dict_size;
844	char *dict_text = GBS_global_string_copy("+dict %li b", dict_size);
845	long ratio = (compressed_plus_dict*100)/uncompressed_sum;
846
847	printf(" uncompressed size = %10li b\n"
848	" compressed size = %10li b\n"
849	" %17s = %10li b (Ratio=%li%%)\n",
850	uncompressed_sum,
851	compressed_sum,
852	dict_text, compressed_plus_dict, ratio);
853
854	free(dict_text);
855	}
856
857	*uncompSum += uncompressed_sum;
858	*compSum += compressed_sum+dict_size;
859	}
860
861	#endif // TEST_DICT
862
863
864	// ***************** Build dictionary ****************
865
866	#ifdef DEBUG
867	#define TEST // test trees?
868	// #define DUMP_TREE // dump trees?
869
870	// #define DUMP_EXPAND
871	/*
872	#define SELECT_WORDS
873	#define SELECTED_WORDS "oropl"
874	*/
875
876	# ifdef SELECT_WORDS
877	static char strnstr(char s1, int len, char *s2) {
878	char c = *s2;
879	int len2 = strlen(s2);
880
881	while (len-->=len2) {
882	if (*s1==c) {
883	if (strncmp(s1, s2, len2)==0) return s1;
884	}
885	s1++;
886	}
887
888	return NULp;
889	}
890	# endif
891
892	#ifdef DUMP_TREE
893	static void dump_dtree(int deep, DictTree tree) {
894	static unsigned_char buffer[1024];
895
896	if (tree.full) {
897	switch (tree.full->typ) {
898	case FULL_NODE: {
899	int idx;
900
901	for (idx=0; idx<256; idx++) {
902	buffer[deep] = idx;
903	buffer[deep+1] = 0;
904
905	if (tree.full->son[idx].exists) dump_dtree(deep+1, tree.full->son[idx]);
906	else if (tree.full->count[idx]>0) printf(" '%s' (%i) [array]\n", buffer, tree.full->count[idx]);
907	}
908	break;
909	}
910	case SINGLE_NODE: {
911	buffer[deep] = tree.single->ch;
912	buffer[deep+1] = 0;
913
914	if (tree.single->son.exists) dump_dtree(deep+1, tree.single->son);
915	else printf(" '%s' (%i) [single]\n", buffer, tree.single->count);
916
917	if (tree.single->brother.exists) dump_dtree(deep, tree.single->brother);
918	break;
919	}
920	}
921	}
922	}
923	#endif
924
925	#else
926	#ifdef DUMP_TREE
927	# define dump_dtree(deep, tree)
928	#endif
929	#endif
930
931	#ifdef TEST
932	static int testCounts(DictTree tree) {
933	// tests if all inner nodes have correct 'count's
934	int cnt = 0;
935
936	if (tree.exists) {
937	switch (tree.full->typ) {
938	case SINGLE_NODE: {
939	while (tree.exists) {
940	if (tree.single->son.exists) {
941	int son_cnt = testCounts(tree.single->son);
942	#ifdef COUNT_EQUAL
943	gb_assert(son_cnt==tree.single->count);
944	#else
945	gb_assert(son_cnt<=tree.single->count);
946	#endif
947	}
948
949	gb_assert(tree.single->count>0);
950	cnt += tree.single->count;
951	tree = tree.single->brother;
952	}
953	break;
954	}
955	case FULL_NODE: {
956	int idx,
957	sons = 0;
958
959	for (idx=0; idx<256; idx++) {
960	if (tree.full->son[idx].exists) {
961	int son_cnt = testCounts(tree.full->son[idx]);
962	#ifdef COUNT_EQUAL
963	gb_assert(son_cnt==tree.full->count[idx]);
964	#else
965	gb_assert(son_cnt<=tree.full->count[idx]);
966	#endif
967	if (tree.full->usedSons) gb_assert(tree.full->count[idx]>0);
968	else gb_assert(tree.full->count[idx]==0);
969
970	sons++;
971	}
972	else if (tree.full->count[idx]) {
973	sons++;
974	}
975
976	cnt += tree.full->count[idx];
977	}
978
979	gb_assert(sons==tree.full->usedSons);
980	break;
981	}
982	}
983	}
984
985	return cnt;
986	}
987
988	// #define TEST_MAX_OCCUR_COUNT
989
990	#ifdef TEST_MAX_OCCUR_COUNT
991	#define MAX_OCCUR_COUNT 600000
992	#endif
993
994	static DictTree test_dtree(DictTree tree) {
995	// only correct while tree is under contruction (build_dict_tree())
996
997	if (tree.exists) {
998	switch (tree.full->typ) {
999	case SINGLE_NODE: {
1000	#if defined(TEST_MAX_OCCUR_COUNT)
1001	gb_assert(tree.single->count<MAX_OCCUR_COUNT); // quite improbable
1002	#endif // TEST_MAX_OCCUR_COUNT
1003
1004	if (tree.single->son.exists) {
1005	gb_assert(tree.single->count==0);
1006	test_dtree(tree.single->son);
1007	}
1008	else {
1009	gb_assert(tree.single->count>0);
1010	}
1011
1012	if (tree.single->brother.exists) test_dtree(tree.single->brother);
1013	break;
1014	}
1015	case FULL_NODE: {
1016	int idx;
1017	int countSons = 0;
1018
1019	for (idx=0; idx<256; idx++) {
1020	#if defined(TEST_MAX_OCCUR_COUNT)
1021	gb_assert(tree.full->count[idx]<MAX_OCCUR_COUNT); // quite improbable
1022	#endif // TEST_MAX_OCCUR_COUNT
1023
1024	if (tree.full->son[idx].exists) {
1025	gb_assert(tree.full->count[idx]==0);
1026	test_dtree(tree.full->son[idx]);
1027	countSons++;
1028	}
1029	else {
1030	gb_assert(tree.full->count[idx]>=0);
1031	if (tree.full->count[idx]>0)
1032	countSons++;
1033	}
1034	}
1035
1036	gb_assert(countSons==tree.full->usedSons);
1037
1038	break;
1039	}
1040	}
1041	}
1042
1043	return tree;
1044	}
1045
1046	#else
1047	# define test_dtree(tree) // (tree)
1048	# define testCounts(tree) // 0
1049	#endif
1050
1051
1052	static DictTree new_dtree(cu_str text, long len, long *memcount) {
1053	// creates a new (sub-)tree from 'text' (which has length 'len')
1054	DictTree tree;
1055
1056	if (len) {
1057	SingleDictTree *tail = NULp;
1058	SingleDictTree *head = NULp;
1059
1060	while (len) {
1061	if (tail) tail = tail->son.single = (SingleDictTree)gbm_get_mem(sizeof(tail), GBM_DICT_INDEX);
1062	else tail = head = (SingleDictTree)gbm_get_mem(sizeof(tail), GBM_DICT_INDEX);
1063
1064	(memcount) += sizeof(tail);
1065
1066	tail->typ = SINGLE_NODE;
1067	tail->ch = *text++;
1068	len--;
1069
1070	tail->brother.single = NULp;
1071	tail->son.single = NULp;
1072	}
1073
1074	tail->count = 1;
1075	tree.single = head;
1076	}
1077	else {
1078	tree.single = NULp;
1079	}
1080
1081	return tree;
1082	}
1083
1084	static DictTree single2full_dtree(DictTree tree, long *memcount) {
1085	if (tree.exists && tree.single->typ==SINGLE_NODE) {
1086	FullDictTree full = (FullDictTree)gbm_get_mem(sizeof(*full), GBM_DICT_INDEX);
1087	int idx;
1088
1089	(memcount) += sizeof(full);
1090	full->typ = FULL_NODE;
1091	full->usedSons = 0;
1092
1093	for (idx=0; idx<256; idx++) { // LOOP_VECTORIZED =1\|3 // depends on whether this function gets inlined (1x(UNIT_TESTS=1) or 3x(UNIT_TESTS=0))
1094	full->son[idx].exists = NULp;
1095	full->count[idx] = 0;
1096	}
1097
1098	while (tree.exists) {
1099	SingleDictTree *t = tree.single;
1100
1101	gb_assert(t->typ==SINGLE_NODE);
1102	gb_assert(!full->son[t->ch].exists);
1103
1104	full->son[t->ch] = t->son;
1105	full->count[t->ch] = t->count;
1106	full->usedSons++;
1107
1108	tree.single = t->brother.single;
1109
1110	gbm_free_mem(t, sizeof(*t), GBM_DICT_INDEX);
1111	(memcount) -= sizeof(t);
1112	}
1113
1114	tree.full = full;
1115	}
1116
1117	return tree;
1118	}
1119
1120	static void free_dtree(DictTree tree) {
1121	if (tree.exists) {
1122	switch (tree.full->typ) {
1123	case SINGLE_NODE: {
1124	if (tree.single->son.exists) free_dtree(tree.single->son);
1125	if (tree.single->brother.exists) free_dtree(tree.single->brother);
1126
1127	gbm_free_mem(tree.single, sizeof(*(tree.single)), GBM_DICT_INDEX);
1128	break;
1129	}
1130	case FULL_NODE: {
1131	int idx;
1132
1133	for (idx=0; idx<256; idx++) if (tree.full->son[idx].exists) free_dtree(tree.full->son[idx]);
1134	gbm_free_mem(tree.full, sizeof(*(tree.full)), GBM_DICT_INDEX);
1135	break;
1136	}
1137	}
1138	}
1139	}
1140
1141
1142
1143	static DictTree cut_dtree(DictTree tree, int cut_count, long memcount, long leafcount) {
1144	/* removes all branches from 'tree' which are referenced less/equal than cut_count
1145	* returns: the reduced tree
1146	*/
1147	if (tree.exists) {
1148	switch (tree.full->typ) {
1149	case SINGLE_NODE: {
1150	if (tree.single->son.exists) tree.single->son = cut_dtree(tree.single->son, cut_count, memcount, leafcount);
1151
1152	if (!tree.single->son.exists) { // leaf
1153	if (tree.single->count<=cut_count) { // leaf with less/equal references
1154	DictTree brother = tree.single->brother;
1155
1156	gbm_free_mem(tree.single, sizeof(*tree.single), GBM_DICT_INDEX);
1157	(memcount) -= sizeof(tree.single);
1158	if (brother.exists) return cut_dtree(brother, cut_count, memcount, leafcount);
1159
1160	tree.single = NULp;
1161	break;
1162	}
1163	else {
1164	(*leafcount)++;
1165	}
1166	}
1167
1168	if (tree.single->brother.exists) tree.single->brother = cut_dtree(tree.single->brother, cut_count, memcount, leafcount);
1169	break;
1170	}
1171	case FULL_NODE: {
1172	int idx;
1173	int count = 0;
1174
1175	for (idx=0; idx<256; idx++) {
1176	if (tree.full->son[idx].exists) {
1177	tree.full->son[idx] = cut_dtree(tree.full->son[idx], cut_count, memcount, leafcount);
1178
1179	if (tree.full->son[idx].exists) count++;
1180	else tree.full->count[idx] = 0;
1181	}
1182	else if (tree.full->count[idx]>0) {
1183	if (tree.full->count[idx]<=cut_count) {
1184	tree.full->count[idx] = 0;
1185	}
1186	else {
1187	count++;
1188	(*leafcount)++;
1189	}
1190	}
1191	}
1192
1193	tree.full->usedSons = count;
1194
1195	if (!count) { // no more sons
1196	gbm_free_mem(tree.full, sizeof(*(tree.full)), GBM_DICT_INDEX);
1197	(memcount) -= sizeof((tree.full));
1198	tree.exists = NULp;
1199	}
1200
1201	break;
1202	}
1203	}
1204	}
1205
1206	return tree;
1207	}
1208	static DictTree cut_useless_words(DictTree tree, int deep, long *removed) {
1209	/* removes/shortens all branches of 'tree' which are not useful for compression
1210	* 'deep' should be zero (incremented by cut_useless_words)
1211	* 'removed' will be set to the # of removed occurrences
1212	* returns: the reduced tree
1213	*/
1214	*removed = 0;
1215
1216	if (tree.exists) {
1217	deep++;
1218
1219	switch (tree.full->typ) {
1220	long removed_single;
1221
1222	case SINGLE_NODE: {
1223	if (tree.single->son.exists) {
1224	tree.single->son = cut_useless_words(tree.single->son, deep, &removed_single);
1225	tree.single->count -= removed_single;
1226	*removed += removed_single;
1227	}
1228
1229	if (!tree.single->son.exists && !WORD_HELPFUL(deep, tree.single->count)) {
1230	DictTree brother = tree.single->brother;
1231
1232	*removed += tree.single->count;
1233	gbm_free_mem(tree.single, sizeof(*tree.single), GBM_DICT_INDEX);
1234
1235	if (brother.exists) {
1236	tree = cut_useless_words(brother, deep-1, &removed_single);
1237	*removed += removed_single;
1238	}
1239	else {
1240	tree.exists = NULp;
1241	}
1242
1243	break;
1244	}
1245
1246	if (tree.single->brother.exists) {
1247	tree.single->brother = cut_useless_words(tree.single->brother, deep-1, &removed_single);
1248	*removed += removed_single;
1249	}
1250
1251	break;
1252	}
1253	case FULL_NODE: {
1254	int idx;
1255	int count = 0;
1256
1257	for (idx=0; idx<256; idx++) {
1258	if (tree.full->son[idx].exists) {
1259	tree.full->son[idx] = cut_useless_words(tree.full->son[idx], deep, &removed_single);
1260	tree.full->count[idx] -= removed_single;
1261	*removed += removed_single;
1262	}
1263
1264	if (tree.full->son[idx].exists) {
1265	count++;
1266	}
1267	else if (tree.full->count[idx]) {
1268	if (!WORD_HELPFUL(deep, tree.full->count[idx])) { // useless!
1269	*removed += tree.full->count[idx];
1270	tree.full->count[idx] = 0;
1271	}
1272	else {
1273	count++;
1274	}
1275	}
1276	}
1277
1278	tree.full->usedSons = count;
1279
1280	if (!count) { // no more sons
1281	gbm_free_mem(tree.full, sizeof(*(tree.full)), GBM_DICT_INDEX);
1282	tree.exists = NULp;
1283	}
1284
1285	break;
1286	}
1287	}
1288	}
1289
1290	return tree;
1291	}
1292
1293	static DictTree add_dtree_to_dtree(DictTree toAdd, DictTree to, long *memcount) {
1294	/* adds 'toAdd' as brother of 'to' (must be leftmost of all SINGLE_NODEs or a FULL_NODE)
1295	* returns: the leftmost of all SINGLE_NODEs or a FULL_NODE
1296	*/
1297	DictTree tree = toAdd;
1298
1299	gb_assert(toAdd.single->typ==SINGLE_NODE);
1300
1301	if (to.exists) {
1302	switch (to.full->typ) {
1303	case SINGLE_NODE: {
1304	SingleDictTree *left = to.single;
1305
1306	gb_assert(left);
1307
1308	if (toAdd.single->ch < to.single->ch) {
1309	toAdd.single->brother = to;
1310	return toAdd;
1311	}
1312
1313	while (to.single->brother.exists) {
1314	if (toAdd.single->ch < to.single->brother.single->ch) {
1315	toAdd.single->brother = to.single->brother;
1316	to.single->brother = toAdd;
1317
1318	tree.single = left;
1319	return tree;
1320	}
1321	to = to.single->brother;
1322	}
1323
1324	to.single->brother = toAdd;
1325	tree.single = left;
1326	break;
1327	}
1328	case FULL_NODE: {
1329	unsigned_char ch = toAdd.single->ch;
1330
1331	gb_assert(!to.full->son[ch].exists);
1332	gb_assert(to.full->count[ch]==0); // if this fails, count must be added & tested
1333	gb_assert(!toAdd.single->brother.exists);
1334
1335	to.full->son[ch] = toAdd.single->son;
1336	to.full->count[ch] = toAdd.single->count;
1337	to.full->usedSons++;
1338
1339	tree = to;
1340
1341	gbm_free_mem(toAdd.single, sizeof(*(toAdd.single)), GBM_DICT_INDEX);
1342	(*memcount) -= sizeof(toAdd.single);
1343
1344	break;
1345	}
1346	}
1347	}
1348
1349	return tree;
1350	}
1351
1352	static DictTree add_to_dtree(DictTree tree, cu_str text, long len, long *memcount) {
1353	/* adds the string 'text' (which has length 'len') to 'tree'
1354	* returns: new tree
1355	*/
1356	if (tree.exists) {
1357	switch (tree.full->typ) {
1358	case SINGLE_NODE: {
1359	SingleDictTree *t = tree.single;
1360	int count = 0;
1361
1362	do {
1363	count++;
1364	if (t->ch==text[0]) { // we found an existing subtree
1365	if (len>1) {
1366	t->son = add_to_dtree(t->son, text+1, len-1, memcount); // add rest of text to subtree
1367	}
1368	else {
1369	gb_assert(len==1);
1370	gb_assert(!t->son.exists);
1371	t->count++;
1372	}
1373
1374	return count>MAX_BROTHERS ? single2full_dtree(tree, memcount) : tree;
1375	}
1376	else if (t->ch > text[0]) {
1377	break;
1378	}
1379	}
1380	while ((t=t->brother.single));
1381
1382	tree = add_dtree_to_dtree(new_dtree(text, len, memcount), // otherwise we create a new subtree
1383	count>MAX_BROTHERS ? single2full_dtree(tree, memcount) : tree,
1384	memcount);
1385	break;
1386	}
1387	case FULL_NODE: {
1388	unsigned_char ch = text[0];
1389
1390	if (tree.full->son[ch].exists) {
1391	tree.full->son[ch] = add_to_dtree(tree.full->son[ch], text+1, len-1, memcount);
1392	}
1393	else {
1394	tree.full->son[ch] = new_dtree(text+1, len-1, memcount);
1395	if (!tree.full->son[ch].exists) {
1396	if (tree.full->count[ch]==0) tree.full->usedSons++;
1397	tree.full->count[ch]++;
1398	}
1399	else {
1400	tree.full->usedSons++;
1401	}
1402	}
1403	break;
1404	}
1405	}
1406
1407	return tree;
1408	}
1409
1410	return new_dtree(text, len, memcount);
1411	}
1412
1413	static long calcCounts(DictTree tree) {
1414	long cnt = 0;
1415
1416	gb_assert(tree.exists);
1417
1418	switch (tree.full->typ) {
1419	case SINGLE_NODE: {
1420	while (tree.exists) {
1421	if (tree.single->son.exists) tree.single->count = calcCounts(tree.single->son);
1422	gb_assert(tree.single->count>0);
1423	cnt += tree.single->count;
1424	tree = tree.single->brother;
1425	}
1426	break;
1427	}
1428	case FULL_NODE: {
1429	int idx;
1430
1431	for (idx=0; idx<256; idx++) {
1432	if (tree.full->son[idx].exists) {
1433	tree.full->count[idx] = calcCounts(tree.full->son[idx]);
1434	gb_assert(tree.full->count[idx]>0);
1435	}
1436	else {
1437	gb_assert(tree.full->count[idx]>=0);
1438	}
1439	cnt += tree.full->count[idx];
1440	}
1441	break;
1442	}
1443	}
1444
1445	return cnt;
1446	}
1447
1448	static int count_dtree_leafs(DictTree tree, int deep, int *maxdeep) {
1449	// returns # of leafs and max. depth of tree
1450	int leafs = 0;
1451
1452	gb_assert(tree.exists);
1453
1454	if (++deep>maxdeep) maxdeep = deep;
1455
1456	switch (tree.full->typ) {
1457	case SINGLE_NODE: {
1458	if (tree.single->son.exists) leafs += count_dtree_leafs(tree.single->son, deep, maxdeep);
1459	else leafs++;
1460	if (tree.single->brother.exists) leafs += count_dtree_leafs(tree.single->brother, deep, maxdeep);
1461	break;
1462	}
1463	case FULL_NODE: {
1464	int idx;
1465
1466	for (idx=0; idx<256; idx++) {
1467	if (tree.full->son[idx].exists) leafs += count_dtree_leafs(tree.full->son[idx], deep, maxdeep);
1468	else if (tree.full->count[idx]) leafs++;
1469	}
1470	break;
1471	}
1472	}
1473
1474	return leafs;
1475	}
1476
1477	static int COUNT(DictTree tree) {
1478	// counts sum of # of occurrences of tree
1479	int cnt = 0;
1480
1481	switch (tree.single->typ) {
1482	case SINGLE_NODE: {
1483	while (tree.exists) {
1484	cnt += tree.single->count;
1485	tree = tree.single->brother;
1486	}
1487	break;
1488	}
1489	case FULL_NODE: {
1490	int idx;
1491
1492	for (idx=0; idx<256; idx++) cnt += tree.full->count[idx]; // LOOP_VECTORIZED =4[<10] =8\|18 // 4x with 4.9.2 + 5.x; 8x(UNIT_TESTS=1) or 18x(UNIT_TESTS=0) with gcc 10.1 - 10.5
1493	break;
1494	}
1495	}
1496
1497	return cnt;
1498	}
1499
1500	static DictTree removeSubsequentString(DictTree *tree_pntr, cu_str buffer, int len, int max_occur) {
1501	/* searches tree for 'buffer' (length='len')
1502	*
1503	* returns - rest below found string
1504	* (if found and if the # of occurrences of the string is less/equal than 'max_occur')
1505	* - NULp otherwise
1506	*
1507	* removes the whole found string from the tree (not only the rest!)
1508	*/
1509	DictTree tree = *tree_pntr, rest;
1510	static int restCount;
1511
1512	rest.exists = NULp;
1513
1514	gb_assert(tree.exists);
1515	gb_assert(len>0);
1516
1517	switch (tree.full->typ) {
1518	case SINGLE_NODE: {
1519	while (tree.single->ch <= buffer[0]) {
1520	if (tree.single->ch == buffer[0]) { // found wanted character
1521	if (tree.single->son.exists) {
1522	if (len==1) {
1523	if (tree.single->count <= max_occur) {
1524	rest = tree.single->son;
1525	restCount = COUNT(rest);
1526	tree.single->son.exists = NULp;
1527	}
1528	}
1529	else {
1530	rest = removeSubsequentString(&tree.single->son, buffer+1, len-1, max_occur);
1531	}
1532	}
1533
1534	if (rest.exists) { // the string was found
1535	tree.single->count -= restCount;
1536	gb_assert(tree.single->count >= 0);
1537
1538	if (!tree.single->count) { // empty subtree -> delete myself
1539	DictTree brother = tree.single->brother;
1540
1541	tree.single->brother.exists = NULp; // elsewise it would be freed by free_dtree
1542	free_dtree(tree);
1543	*tree_pntr = tree = brother;
1544	}
1545	}
1546
1547	break;
1548	}
1549
1550	tree_pntr = &(tree.single->brother);
1551	if (!(tree = tree.single->brother).exists) break;
1552	}
1553
1554	break;
1555	}
1556	case FULL_NODE: {
1557	unsigned_char ch;
1558
1559	if (tree.full->son[ch=buffer[0]].exists) {
1560	if (len==1) {
1561	if (tree.full->count[ch] <= max_occur) {
1562	rest = tree.full->son[ch];
1563	restCount = COUNT(rest);
1564	tree.full->son[ch].exists = NULp;
1565	}
1566	}
1567	else {
1568	rest = removeSubsequentString(&tree.full->son[ch], buffer+1, len-1, max_occur);
1569	}
1570
1571	if (rest.exists) {
1572	gb_assert(restCount>0);
1573	tree.full->count[ch] -= restCount;
1574	gb_assert(tree.full->count[ch]>=0);
1575	if (tree.full->count[ch]==0) {
1576	gb_assert(!tree.full->son[ch].exists);
1577
1578	if (--tree.full->usedSons==0) { // last son deleted -> delete myself
1579	free_dtree(tree);
1580	tree.exists = NULp;
1581	*tree_pntr = tree;
1582	}
1583	}
1584	}
1585	}
1586
1587	break;
1588	}
1589	}
1590
1591	return rest;
1592	}
1593
1594	static cu_str memstr(cu_str stringStart, int stringStartLen, cu_str inString, int inStringLen) {
1595	if (!inStringLen) return stringStart; // string of length zero is found everywhere
1596
1597	while (stringStartLen) {
1598	cu_str found = (cu_str)memchr(stringStart, inString[0], stringStartLen);
1599
1600	if (!found) break;
1601
1602	stringStartLen -= found-stringStart;
1603	stringStart = found;
1604
1605	if (stringStartLen<inStringLen) break;
1606
1607	if (GB_MEMCMP(stringStart, inString, inStringLen)==0) return stringStart;
1608
1609	stringStart++;
1610	stringStartLen--;
1611	}
1612
1613	return NULp;
1614	}
1615
1616
1617	static int expandBranches(u_str buffer, int deep, int minwordlen, int maxdeep, DictTree tree, DictTree root, int max_percent) {
1618	/* expands all branches in 'tree'
1619	*
1620	* this is done by searching every of these branches in 'root' and moving any subsequent parts from there to 'tree'
1621	* (this is only done, if the # of occurrences of the found part does not exceed the # of occurrences of 'tree' more than 'max_percent' percent)
1622	*
1623	* 'buffer' strings are rebuild here while descending the tree (length of buffer==MAX_WORD_LEN)
1624	* 'deep' recursion level
1625	* 'maxdeep' maximum recursion level
1626	* 'minwordlen' is the length of the words to search (usually equal to MIN_WORD_LEN-1)
1627	*
1628	* returns the # of occurrences which were added to 'tree'
1629	*/
1630	int expand = 0; // calculate count-sum of added subsequent parts
1631
1632	gb_assert(tree.exists);
1633
1634	if (deep<maxdeep) {
1635	switch (tree.full->typ) {
1636	case SINGLE_NODE: {
1637	while (tree.exists) {
1638	buffer[deep] = tree.single->ch;
1639
1640	if (!tree.single->son.exists) {
1641	DictTree rest;
1642	u_str buf = buffer+1;
1643	int len = deep;
1644
1645	if (len>minwordlen) { // do not search more than MIN_WORD_LEN-1 chars
1646	buf += len-minwordlen;
1647	len = minwordlen;
1648	}
1649
1650	if (len==minwordlen) {
1651	cu_str self = memstr(buffer, deep+1, buf, len);
1652
1653	gb_assert(self);
1654	if (self==buf) rest = removeSubsequentString(&root, buf, len, ((100+max_percent)*tree.single->count)/100);
1655	else rest.exists = NULp;
1656	}
1657	else {
1658	rest.exists = NULp;
1659	}
1660
1661	if (rest.exists) {
1662	int cnt = COUNT(rest);
1663
1664	tree.single->son = rest;
1665	tree.single->count += cnt;
1666	expand += cnt;
1667	#ifdef DUMP_EXPAND
1668	#define DUMP_MORE 1
1669	printf("expanding '%s'", lstr(buffer, deep+1+DUMP_MORE));
1670	printf(" (searching for '%s') -> found %i nodes\n", lstr(buf, len+DUMP_MORE), cnt);
1671	#endif
1672	}
1673	}
1674
1675	if (tree.single->son.exists) {
1676	int added = expandBranches(buffer, deep+1, minwordlen, maxdeep, tree.single->son, root, max_percent);
1677
1678	expand += added;
1679	tree.single->count += added;
1680	}
1681
1682	tree = tree.single->brother;
1683	}
1684
1685	break;
1686	}
1687	case FULL_NODE: {
1688	int idx;
1689
1690	for (idx=0; idx<256; idx++) {
1691	buffer[deep] = idx;
1692
1693	if (!tree.full->son[idx].exists && tree.full->count[idx]) { // leaf
1694	DictTree rest;
1695	u_str buf = buffer+1;
1696	int len = deep;
1697
1698	if (len>minwordlen) { // do not search more than MIN_WORD_LEN-1 chars
1699	buf += len-minwordlen;
1700	len = minwordlen;
1701	}
1702
1703	if (len==minwordlen) {
1704	cu_str self = memstr(buffer, deep+1, buf, len);
1705
1706	gb_assert(self);
1707	if (self==buf)
1708	rest = removeSubsequentString(&root, buf, len, ((100+max_percent)*tree.full->count[idx])/100);
1709	else
1710	rest.exists = NULp;
1711	}
1712	else {
1713	rest.exists = NULp;
1714	}
1715
1716	if (rest.exists) { // substring found!
1717	int cnt = COUNT(rest);
1718
1719	if (tree.full->count[idx]==0) tree.full->usedSons++;
1720	tree.full->son[idx] = rest;
1721	tree.full->count[idx] += cnt;
1722
1723	expand += cnt;
1724	#ifdef DUMP_EXPAND
1725	printf("expanding '%s'", lstr(buffer, deep+1+DUMP_MORE));
1726	printf(" (searching for '%s') -> found %i nodes\n", lstr(buf, len+DUMP_MORE), cnt);
1727	#endif
1728	}
1729	}
1730
1731	if (tree.full->son[idx].exists) {
1732	int added = expandBranches(buffer, deep+1, minwordlen, maxdeep, tree.full->son[idx], root, max_percent);
1733
1734	expand += added;
1735	tree.full->count[idx] += added;
1736	}
1737	}
1738
1739	break;
1740	}
1741	}
1742	}
1743
1744	return expand;
1745	}
1746
1747	static DictTree build_dict_tree(O_gbdByKey gbk, long maxmem, long maxdeep, size_t minwordlen, long data_sum) {
1748	/* builds a tree of the most used words
1749	*
1750	* 'maxmem' is the amount of memory that will be allocated
1751	* 'maxdeep' is the maximum length of the _returned_ words
1752	* 'minwordlen' is the minimum length a word needs to get into the tree
1753	* this is used in the first pass as maximum tree depth
1754	* 'data_sum' will be set to the overall-size of data of which the tree was built
1755	*/
1756	DictTree tree;
1757	long memcount = 0;
1758	long leafs = 0;
1759
1760	*data_sum = 0;
1761
1762	{
1763	int cnt;
1764	long lowmem = (maxmem*9)/10;
1765	int cut_count = 1;
1766
1767	// Build 8-level-deep tree of all existing words
1768
1769	tree.exists = NULp; // empty tree
1770
1771	for (cnt=0; cnt<gbk->cnt; cnt++) {
1772	GBDATA *gbd = gbk->gbds[cnt];
1773
1774	if (COMPRESSIBLE(gbd->type())) {
1775	size_t size;
1776	cu_str data = get_data_n_size(gbd, &size);
1777	cu_str lastWord;
1778
1779	if (gbd->is_a_string()) size--;
1780	if (size<minwordlen) continue;
1781
1782	*data_sum += size;
1783	lastWord = data+size-minwordlen;
1784
1785	#ifdef SELECT_WORDS
1786	if (strnstr(data, size, SELECTED_WORDS)) // test some words only
1787	#endif
1788	{
1789
1790	for (; data<=lastWord; data++) {
1791	tree = add_to_dtree(tree, data, minwordlen, &memcount);
1792
1793	while (memcount>maxmem) {
1794	leafs = 0;
1795	tree = cut_dtree(tree, cut_count, &memcount, &leafs);
1796	if (memcount<=lowmem) break;
1797	cut_count++;
1798	}
1799	}
1800	}
1801	}
1802	}
1803	}
1804
1805	{
1806	int cutoff = 1;
1807
1808	leafs = 0;
1809	tree = cut_dtree(tree, cutoff, &memcount, &leafs); // cut all single elements
1810	test_dtree(tree);
1811
1812	#if defined(DEBUG)
1813	if (tree.exists) {
1814	int maxdeep2 = 0;
1815	long counted = count_dtree_leafs(tree, 0, &maxdeep2);
1816	gb_assert(leafs == counted);
1817	}
1818	#endif // DEBUG
1819
1820	// avoid directory overflow (max. 18bit)
1821	while (leafs >= MAX_LONG_INDEX) {
1822	leafs = 0;
1823	++cutoff;
1824	#if defined(DEBUG)
1825	printf("Directory overflow (%li) -- reducing size (cutoff = %i)\n", leafs, cutoff);
1826	#endif // DEBUG
1827	tree = cut_dtree(tree, cutoff, &memcount, &leafs);
1828	}
1829	}
1830	#ifdef DUMP_TREE
1831	printf("----------------------- tree with short branches:\n");
1832	dump_dtree(0, tree);
1833	printf("---------------------------\n");
1834	#endif
1835
1836	// Try to create longer branches
1837
1838	if (tree.exists) {
1839	int add_count;
1840	u_str buffer = (u_str)gbm_get_mem(maxdeep, GBM_DICT_INDEX);
1841	int max_differ;
1842	long dummy;
1843
1844	if (tree.full->typ != FULL_NODE) tree = single2full_dtree(tree, &memcount); // ensure root is FULL_NODE
1845
1846	test_dtree(tree);
1847	calcCounts(tree); // calculate counters of inner nodes
1848	testCounts(tree);
1849
1850	for (max_differ=0; max_differ<=MAX_DIFFER; max_differ+=INCR_DIFFER) { // percent of allowed difference for concatenating tree branches
1851	do {
1852	int idx;
1853	add_count = 0;
1854
1855	for (idx=0; idx<256; idx++) {
1856	if (tree.full->son[idx].exists) {
1857	int added;
1858
1859	buffer[0] = idx;
1860	added = expandBranches(buffer, 1, minwordlen-1, maxdeep, tree.full->son[idx], tree, max_differ);
1861	tree.full->count[idx] += added;
1862	add_count += added;
1863	}
1864	}
1865	}
1866	while (add_count);
1867	}
1868
1869	gbm_free_mem(buffer, maxdeep, GBM_DICT_INDEX);
1870
1871	tree = cut_useless_words(tree, 0, &dummy);
1872	}
1873
1874	#ifdef DUMP_TREE
1875	printf("----------------------- tree with expanded branches:\n");
1876	dump_dtree(0, tree);
1877	printf("-----------------------\n");
1878	#endif
1879	testCounts(tree);
1880
1881	return tree;
1882	}
1883
1884	static DictTree remove_word_from_dtree(DictTree tree, cu_str wordStart, int wordLen, u_str resultBuffer, int resultLen, long resultFrequency, long *removed) {
1885	/* searches 'tree' for a word starting with 'wordStart' an removes it from the tree
1886	* if there are more than one possibilities, the returned word will be the one with the most occurrences
1887	* if there was no possibility -> resultLen==0, tree unchanged
1888	* otherwise: resultBuffer contains the word, returns new tree with word removed
1889	*/
1890	long removed_single = 0;
1891	gb_assert(tree.exists);
1892	*removed = 0;
1893
1894	if (wordLen) { // search wanted path into tree
1895	switch (tree.full->typ) {
1896	case SINGLE_NODE: {
1897	if (tree.single->ch==*wordStart) {
1898	resultBuffer = wordStart;
1899
1900	if (tree.single->son.exists) {
1901	gb_assert(tree.single->count>0);
1902	tree.single->son = remove_word_from_dtree(tree.single->son, wordStart+1, wordLen-1,
1903	resultBuffer+1, resultLen, resultFrequency,
1904	&removed_single);
1905	if (*resultLen) { // word removed
1906	gb_assert(tree.single->count>=removed_single);
1907	tree.single->count -= removed_single;
1908	*removed += removed_single;
1909	(*resultLen)++;
1910	}
1911	}
1912	else {
1913	*resultLen = wordLen==1; // if wordLen==1 -> fully overlapping word found
1914	*resultFrequency = tree.single->count;
1915	}
1916
1917	if (!tree.single->son.exists && *resultLen) { // if no son and a word was found -> remove branch
1918	DictTree brother = tree.single->brother;
1919
1920	*removed += tree.single->count;
1921	gbm_free_mem(tree.single, sizeof(*tree.single), GBM_DICT_INDEX);
1922
1923	if (brother.exists) tree = brother;
1924	else tree.exists = NULp;
1925	}
1926	}
1927	else if (tree.single->ch < *wordStart && tree.single->brother.exists) {
1928	tree.single->brother = remove_word_from_dtree(tree.single->brother, wordStart, wordLen,
1929	resultBuffer, resultLen, resultFrequency,
1930	&removed_single);
1931	if (resultLen) removed += removed_single;
1932	}
1933	else {
1934	*resultLen = 0; // not found
1935	}
1936
1937	break;
1938	}
1939	case FULL_NODE: {
1940	unsigned_char ch = *wordStart;
1941	*resultBuffer = ch;
1942
1943	if (tree.full->son[ch].exists) {
1944	tree.full->son[ch] = remove_word_from_dtree(tree.full->son[ch], wordStart+1, wordLen-1,
1945	resultBuffer+1, resultLen, resultFrequency,
1946	&removed_single);
1947	if (*resultLen) {
1948	if (tree.full->son[ch].exists) { // another son?
1949	tree.full->count[ch] -= removed_single;
1950	}
1951	else { // last son -> remove whole branch
1952	removed_single = tree.full->count[ch];
1953	tree.full->count[ch] = 0;
1954	tree.full->usedSons--;
1955	}
1956
1957	*removed += removed_single;
1958	(*resultLen)++;
1959	}
1960	}
1961	else if (tree.full->count[ch]) {
1962	*resultLen = (wordLen==1);
1963
1964	if (*resultLen) {
1965	removed += removed_single = resultFrequency = tree.full->count[ch];
1966	tree.full->count[ch] = 0;
1967	tree.full->usedSons--;
1968	}
1969	}
1970	else {
1971	*resultLen = 0; // not found
1972	}
1973
1974	if (!tree.full->usedSons) {
1975	free_dtree(tree);
1976	tree.exists = NULp;
1977	}
1978
1979	break;
1980	}
1981	}
1982	}
1983	else { // take any word
1984	switch (tree.full->typ) {
1985	case SINGLE_NODE: {
1986	*resultBuffer = tree.single->ch;
1987	gb_assert(tree.single->count>0);
1988
1989	if (tree.single->son.exists) {
1990	tree.single->son = remove_word_from_dtree(tree.single->son, wordStart, wordLen,
1991	resultBuffer+1, resultLen, resultFrequency,
1992	&removed_single);
1993	gb_assert(*resultLen);
1994	(*resultLen)++;
1995	}
1996	else {
1997	*resultLen = 1;
1998	*resultFrequency = tree.single->count;
1999	removed_single = tree.single->count;
2000	}
2001
2002	gb_assert(*resultFrequency>0);
2003
2004	if (tree.single->son.exists) {
2005	gb_assert(tree.single->count>=removed_single);
2006	tree.single->count -= removed_single;
2007	*removed += removed_single;
2008	}
2009	else {
2010	DictTree brother = tree.single->brother;
2011
2012	*removed += tree.single->count;
2013	gbm_free_mem(tree.single, sizeof(*tree.single), GBM_DICT_INDEX);
2014
2015	if (brother.exists) tree = brother;
2016	else tree.exists = NULp;
2017	}
2018
2019	break;
2020	}
2021	case FULL_NODE: {
2022	int idx;
2023
2024	for (idx=0; idx<256; idx++) {
2025	if (tree.full->son[idx].exists) {
2026	*resultBuffer = idx;
2027	tree.full->son[idx] = remove_word_from_dtree(tree.full->son[idx], wordStart, wordLen,
2028	resultBuffer+1, resultLen, resultFrequency,
2029	&removed_single);
2030	gb_assert(*resultLen);
2031	(*resultLen)++;
2032
2033	if (!tree.full->son[idx].exists) { // son branch removed -> zero count
2034	removed_single = tree.full->count[idx];
2035	tree.full->count[idx] = 0;
2036	tree.full->usedSons--;
2037	}
2038	else {
2039	tree.full->count[idx] -= removed_single;
2040	gb_assert(tree.full->count[idx]>0);
2041	}
2042
2043	break;
2044	}
2045	else if (tree.full->count[idx]) {
2046	*resultBuffer = idx;
2047	*resultLen = 1;
2048	*resultFrequency = tree.full->count[idx];
2049	removed_single = tree.full->count[idx];
2050	tree.full->count[idx] = 0;
2051	tree.full->usedSons--;
2052	break;
2053	}
2054	}
2055
2056	gb_assert(idx<256); // gb_assert break was used to exit loop (== node had a son)
2057
2058	*removed += removed_single;
2059
2060	if (!tree.full->usedSons) {
2061	free_dtree(tree);
2062	tree.exists = NULp;
2063	}
2064
2065	break;
2066	}
2067	}
2068	}
2069
2070	#ifdef DEBUG
2071	if (*resultLen) {
2072	gb_assert(*resultLen>0);
2073	gb_assert(*resultFrequency>0);
2074	gb_assert(*resultLen>=wordLen);
2075	}
2076	#endif
2077
2078	return tree;
2079	}
2080
2081	#define cmp(i1, i2) (heap2[i1]-heap2[i2])
2082	#define swap(i1, i2) do \
2083	{ \
2084	int s = heap[i1]; \
2085	heap[i1] = heap[i2]; \
2086	heap[i2] = s; \
2087	\
2088	s = heap2[i1]; \
2089	heap2[i1] = heap2[i2]; \
2090	heap2[i2] = s; \
2091	} \
2092	while (0)
2093
2094	static void downheap(int heap, int heap2, int me, int num) {
2095	int lson = me*2;
2096	int rson = lson+1;
2097
2098	gb_assert(me>=1);
2099	if (lson>num) return;
2100
2101	if (cmp(lson, me)<0) { // left son smaller than me? (we sort in descending order!!!)
2102	if (rson<=num && cmp(lson, rson)>0) { // right son smaller than left son?
2103	swap(me, rson);
2104	downheap(heap, heap2, rson, num);
2105	}
2106	else {
2107	swap(me, lson);
2108	downheap(heap, heap2, lson, num);
2109	}
2110	}
2111	else if (rson<=num && cmp(me, rson)>0) { // right son smaller than me?
2112	swap(me, rson);
2113	downheap(heap, heap2, rson, num);
2114	}
2115	}
2116
2117	#undef cmp
2118	#undef swap
2119
2120
2121
2122	#define cmp(i1, i2) GB_MEMCMP(dict->text+dict->offsets[heap[i1]], dict->text+dict->offsets[heap[i2]], dict->textlen)
2123	#define swap(i1, i2) do { int s = heap[i1]; heap[i1] = heap[i2]; heap[i2] = s; } while (0)
2124
2125	static void downheap2(int heap, GB_DICTIONARY dict, int me, int num) {
2126	int lson = me*2;
2127	int rson = lson+1;
2128
2129	gb_assert(me>=1);
2130	if (lson>num) return;
2131
2132	if (cmp(lson, me)>0) { // left son bigger than me?
2133	if (rson<=num && cmp(lson, rson)<0) { // right son bigger than left son?
2134	swap(me, rson);
2135	downheap2(heap, dict, rson, num);
2136	}
2137	else {
2138	swap(me, lson);
2139	downheap2(heap, dict, lson, num);
2140	}
2141	}
2142	else if (rson<=num && cmp(me, rson)<0) { // right son bigger than me?
2143	swap(me, rson);
2144	downheap2(heap, dict, rson, num);
2145	}
2146	}
2147
2148	#undef cmp
2149	#undef swap
2150
2151	static void sort_dict_offsets(GB_DICTIONARY *dict) {
2152	/* 1. sorts the 'dict->offsets' by frequency
2153	* (frequency of each offset is stored in the 'dict->resort' with the same index)
2154	* 2. initializes & sorts 'dict->resort' in alphabetic order
2155	*/
2156	int i;
2157	int num = dict->words;
2158	int *heap = dict->offsets-1;
2159	int *heap2 = dict->resort-1;
2160
2161	// sort offsets
2162
2163	for (i=num/2; i>=1; i--) downheap(heap, heap2, i, num); // make heap
2164
2165	while (num>1) { // sort heap
2166	int big = heap[1];
2167	int big2 = heap2[1];
2168
2169	heap[1] = heap[num];
2170	heap2[1] = heap2[num];
2171
2172	downheap(heap, heap2, 1, num-1);
2173
2174	heap[num] = big;
2175	heap2[num] = big2;
2176
2177	num--;
2178	}
2179
2180	// initialize dict->resort
2181
2182	for (i=0, num=dict->words; i<num; i++) dict->resort[i] = i; // LOOP_VECTORIZED
2183
2184	// sort dictionary alphabetically
2185
2186	for (i=num/2; i>=1; i--) downheap2(heap2, dict, i, num); // make heap
2187
2188	while (num>1) {
2189	int big = heap2[1];
2190
2191	heap2[1] = heap2[num];
2192	downheap2(heap2, dict, 1, num-1);
2193	heap2[num] = big;
2194	num--;
2195	}
2196	}
2197
2198	// Warning dictionary is not in network byte order !!!!
2199	static GB_DICTIONARY gb_create_dictionary(O_gbdByKey gbk, long maxmem) {
2200	long data_sum;
2201	DictTree tree = build_dict_tree(gbk, maxmem, MAX_WORD_LEN, MIN_WORD_LEN, &data_sum);
2202
2203	if (tree.exists) {
2204	GB_DICTIONARY dict = (GB_DICTIONARY)gbm_get_mem(sizeof(*dict), GBM_DICT_INDEX);
2205	int maxdeep = 0;
2206	int words = count_dtree_leafs(tree, 0, &maxdeep);
2207	u_str word;
2208
2209	int wordLen;
2210	long wordFrequency;
2211	int offset = 0; // next free position in dict->text
2212	int overlap = 0; // # of bytes overlapping with last word
2213	u_str buffer;
2214	long dummy;
2215	#if defined(DEBUG)
2216	long word_sum = 0;
2217	long overlap_sum = 0;
2218	long max_overlap = 0;
2219	#endif
2220
2221	// reduce tree as long as it has to many leafs (>MAX_LONG_INDEX)
2222	while (words >= MAX_LONG_INDEX) {
2223
2224	words = count_dtree_leafs(tree, 0, &maxdeep);
2225	}
2226
2227	buffer = (u_str)gbm_get_mem(maxdeep, GBM_DICT_INDEX);
2228
2229	calcCounts(tree);
2230	testCounts(tree);
2231
2232	#if DEBUG
2233	printf(" examined data was %li bytes\n", data_sum);
2234	printf(" tree contains %i words *** maximum tree depth = %i\n", words, maxdeep);
2235	#endif
2236
2237	dict->words = 0;
2238	dict->textlen = DICT_STRING_INCR;
2239
2240	dict->text = (u_str)gbm_get_mem(DICT_STRING_INCR, GBM_DICT_INDEX);
2241	dict->offsets = (GB_NINT)gbm_get_mem(sizeof((dict->offsets))*words, GBM_DICT_INDEX);
2242	dict->resort = (GB_NINT)gbm_get_mem(sizeof((dict->resort))*words, GBM_DICT_INDEX);
2243
2244	memset(buffer, '*', maxdeep);
2245	tree = remove_word_from_dtree(tree, NULp, 0, buffer, &wordLen, &wordFrequency, &dummy);
2246	testCounts(tree);
2247
2248	while (1) {
2249	int nextWordLen = 0;
2250	int len;
2251
2252	#if DUMP_COMPRESSION_TEST>=4
2253	printf("word='%s' (occur=%li overlap=%i)\n", lstr(buffer, wordLen), wordFrequency, overlap);
2254	#endif
2255
2256	#if defined(DEBUG)
2257	overlap_sum += overlap;
2258	if (overlap>max_overlap) max_overlap = overlap;
2259	word_sum += wordLen;
2260	#endif
2261
2262	if (offset-overlap+wordLen > dict->textlen) { // if not enough space allocated -> reallocate dictionary string
2263	u_str ntext = (u_str)gbm_get_mem(dict->textlen+DICT_STRING_INCR, GBM_DICT_INDEX);
2264
2265	memcpy(ntext, dict->text, dict->textlen);
2266	gbm_free_mem(dict->text, dict->textlen, GBM_DICT_INDEX);
2267
2268	dict->text = ntext;
2269	dict->textlen += DICT_STRING_INCR;
2270	}
2271
2272	dict->offsets[dict->words] = offset-overlap;
2273	dict->resort[dict->words] = wordFrequency; // temporarily miss-use this to store frequency
2274	dict->words++;
2275
2276	word = dict->text+offset-overlap;
2277	gb_assert(overlap==0 \|\| GB_MEMCMP(word, buffer, overlap)==0); // test overlapping string-part
2278	memcpy(word, buffer, wordLen); // word -> dictionary string
2279	offset += wordLen-overlap;
2280
2281	if (!tree.exists) break;
2282
2283	for (len=min(10, wordLen-1); len>=0 && nextWordLen==0; len--) {
2284	memset(buffer, '*', maxdeep);
2285	tree = remove_word_from_dtree(tree, word+wordLen-len, len, buffer, &nextWordLen, &wordFrequency, &dummy);
2286	overlap = len;
2287	}
2288
2289	wordLen = nextWordLen;
2290	}
2291
2292	gb_assert(dict->words <= MAX_LONG_INDEX);
2293	gb_assert(dict->words==words); /* dict->words == # of words stored in dictionary string
2294	* words == # of words pre-calculated */
2295
2296	#if DEBUG
2297	printf(" word_sum=%li overlap_sum=%li (%li%%) max_overlap=%li\n",
2298	word_sum, overlap_sum, (overlap_sum*100)/word_sum, max_overlap);
2299	#endif
2300
2301	if (offset<dict->textlen) { // reallocate dict->text if it was allocated too large
2302	u_str ntext = (u_str)gbm_get_mem(offset, GBM_DICT_INDEX);
2303
2304	memcpy(ntext, dict->text, offset);
2305	gbm_free_mem(dict->text, dict->textlen, GBM_DICT_INDEX);
2306
2307	dict->text = ntext;
2308	dict->textlen = offset;
2309	}
2310
2311	sort_dict_offsets(dict);
2312
2313	gbm_free_mem(buffer, maxdeep, GBM_DICT_INDEX);
2314	free_dtree(tree);
2315
2316	return dict;
2317	}
2318
2319	return NULp;
2320	}
2321
2322	static void gb_free_dictionary(GB_DICTIONARY*& dict) {
2323	gbm_free_mem(dict->text, dict->textlen, GBM_DICT_INDEX);
2324	gbm_free_mem(dict->offsets, sizeof((dict->offsets))dict->words, GBM_DICT_INDEX);
2325	gbm_free_mem(dict->resort, sizeof((dict->resort))dict->words, GBM_DICT_INDEX);
2326
2327	gbm_free_mem(dict, sizeof(*dict), GBM_DICT_INDEX);
2328	dict = NULp;
2329	}
2330
2331	static GB_ERROR readAndWrite(O_gbdByKey *gbkp, size_t& old_size, size_t& new_size) {
2332	GB_ERROR error = NULp;
2333
2334	old_size = 0;
2335	new_size = 0;
2336
2337	for (int i=0; i<gbkp->cnt && !error; i++) {
2338	GBDATA *gbd = gbkp->gbds[i];
2339
2340	if (COMPRESSIBLE(gbd->type())) {
2341	size_t size;
2342	char *data;
2343
2344	{
2345	cu_str d = (cu_str)get_data_n_size(gbd, &size);
2346	old_size += gbd->as_entry()->memsize();
2347
2348	data = (char*)gbm_get_mem(size, GBM_DICT_INDEX);
2349	memcpy(data, d, size);
2350	gb_assert(data[size-1] == 0);
2351	}
2352
2353	switch (gbd->type()) {
2354	case GB_STRING:
2355	error = GB_write_string(gbd, "");
2356	if (!error) error = GB_write_string(gbd, data);
2357	break;
2358	case GB_BYTES:
2359	error = GB_write_bytes(gbd, NULp, 0);
2360	if (!error) error = GB_write_bytes(gbd, data, size);
2361	break;
2362	case GB_INTS:
2363	error = GB_write_ints(gbd, (GB_UINT4 *)NULp, 0);
2364	if (!error) error = GB_write_ints(gbd, (GB_UINT4 *)data, size);
2365	break;
2366	case GB_FLOATS:
2367	error = GB_write_floats(gbd, (float*)NULp, 0);
2368	if (!error) error = GB_write_floats(gbd, (float)(void)data, size);
2369	break;
2370	default:
2371	gb_assert(0);
2372	break;
2373	}
2374
2375	new_size += gbd->as_entry()->memsize();
2376
2377	gbm_free_mem(data, size, GBM_DICT_INDEX);
2378	}
2379	}
2380	return error;
2381	}
2382
2383	static GB_ERROR gb_create_dictionaries(GB_MAIN_TYPE *Main, long maxmem) {
2384	GB_ERROR error = NULp;
2385	#if defined(TEST_DICT)
2386	long uncompressed_sum = 0;
2387	long compressed_sum = 0;
2388	#endif // TEST_DICT
2389
2390	printf("Creating GBDATA-Arrays..\n");
2391
2392	if (!error) {
2393	O_gbdByKey *gbk = g_b_opti_createGbdByKey(Main);
2394	int idx = -1;
2395
2396	printf("Creating dictionaries..\n");
2397
2398	#ifdef DEBUG
2399	// #define TEST_ONE // test only key specified below
2400	// #define TEST_SOME // test only some keys specified below
2401	#if defined(TEST_ONE)
2402	// select wanted index
2403	for (idx=0; idx<gbdByKey_cnt; idx++) { // title author dew_author ebi_journal name ua_tax date full_name ua_title
2404	if (gbk[idx].cnt && strcmp(quark2key(Main, idx), "tree")==0) break;
2405	}
2406	gb_assert(idx<gbdByKey_cnt);
2407	#endif
2408	#endif
2409
2410	#ifdef TEST_ONE
2411	// only create dictionary for index selected above (no loop)
2412	#else
2413	// create dictionaries for all indices (this is the normal operation)
2414	arb_progress progress("Optimizing key data", long(gbdByKey_cnt-1));
2415	for (idx = gbdByKey_cnt-1; idx >= 1 && !error; --idx, progress.inc_and_check_user_abort(error))
2416	#endif
2417
2418	{
2419	GB_DICTIONARY *dict;
2420
2421	GB_CSTR key_name = quark2key(Main, idx);
2422	GBDATA *gb_main = Main->gb_main();
2423
2424	#ifdef TEST_SOME
2425	if (!( // add all wanted keys here
2426	strcmp(key_name, "REF") == 0 \|\|
2427	strcmp(key_name, "ref") == 0
2428	)) continue;
2429	#endif // TEST_SOME
2430
2431	#ifndef TEST_ONE
2432	if (!gbk[idx].cnt) continue; // there are no entries with this quark
2433
2434	GB_TYPES type = gbk[idx].gbds[0]->type();
2435
2436	GB_begin_transaction(gb_main);
2437	int compression_mask = gb_get_compression_mask(Main, idx, type);
2438	GB_commit_transaction(gb_main);
2439
2440	if ((compression_mask & GB_COMPRESSION_DICTIONARY) == 0) continue; // compression with dictionary is not allowed
2441	if (strcmp(key_name, "data") == 0) continue;
2442	if (strcmp(key_name, "quality") == 0) continue;
2443	#endif
2444
2445	printf("- dictionary for '%s' (idx=%i)\n", key_name, idx);
2446	GB_begin_transaction(gb_main);
2447	dict = gb_create_dictionary(&(gbk[idx]), maxmem);
2448
2449	if (dict) {
2450	/* decompress with old dictionary and write
2451	all data of actual type without compression: */
2452
2453	printf(" * Uncompressing all with old dictionary ...\n");
2454
2455	size_t old_compressed_size;
2456
2457	{
2458	int& compr_mask = Main->keys[idx].compression_mask;
2459	int old_compr_mask = compr_mask;
2460
2461	size_t new_size;
2462
2463	compr_mask &= ~GB_COMPRESSION_DICTIONARY;
2464	error = readAndWrite(&gbk[idx], old_compressed_size, new_size);
2465	compr_mask = old_compr_mask;
2466	}
2467
2468	if (!error) {
2469	/* dictionary is saved in the following format:
2470	*
2471	* GB_NINT size
2472	* GB_NINT offsets[dict->words]
2473	* GB_NINT resort[dict->words]
2474	* char *text
2475	*/
2476
2477	int dict_buffer_size = sizeof(GB_NINT) * (1+dict->words*2) + dict->textlen;
2478	char dict_buffer = (char)gbm_get_mem(dict_buffer_size, GBM_DICT_INDEX);
2479	long old_dict_buffer_size;
2480	char *old_dict_buffer;
2481
2482	{
2483	GB_NINT nint = (GB_NINT)dict_buffer;
2484	int n;
2485
2486	*nint++ = htonl(dict->words);
2487	for (n=0; n<dict->words; n++) *nint++ = htonl(dict->offsets[n]);
2488	for (n=0; n<dict->words; n++) *nint++ = htonl(dict->resort[n]);
2489
2490	memcpy(nint, dict->text, dict->textlen);
2491	}
2492
2493	const char *key = Main->keys[idx].key;
2494
2495	error = gb_load_dictionary_data(gb_main, key, &old_dict_buffer, &old_dict_buffer_size);
2496	if (!error) {
2497	gb_save_dictionary_data(gb_main, key, dict_buffer, dict_buffer_size);
2498
2499	// compress all data with new dictionary
2500	printf(" * Compressing all with new dictionary ...\n");
2501
2502	size_t old_size, new_compressed_size;
2503	error = readAndWrite(&gbk[idx], old_size, new_compressed_size);
2504
2505	if (!error) {
2506	printf(" (compressed size: old=%zu new=%zu ratio=%.1f%%)\n",
2507	old_compressed_size, new_compressed_size, (new_compressed_size*100.0)/old_compressed_size);
2508
2509	// @@@ for some keys compression fails (e.g. 'ref');
2510	// need to find out why dictionary is so bad
2511	// gb_assert(new_compressed_size <= old_compressed_size); // @@@ enable this (fails in unit-test)
2512	}
2513
2514	if (error) {
2515	/* critical state: new dictionary has been written, but transaction will be aborted below.
2516	* Solution: Write back old dictionary.
2517	*/
2518	gb_save_dictionary_data(gb_main, key, old_dict_buffer, old_dict_buffer_size);
2519	}
2520	}
2521
2522	gbm_free_mem(dict_buffer, dict_buffer_size, GBM_DICT_INDEX);
2523	if (old_dict_buffer) gbm_free_mem(old_dict_buffer, old_dict_buffer_size, GBM_DICT_INDEX);
2524
2525	#if defined(TEST_DICT)
2526	if (!error) {
2527	GB_DICTIONARY *dict_reloaded = gb_get_dictionary(Main, idx);
2528	test_dictionary(dict_reloaded, &(gbk[idx]), &uncompressed_sum, &compressed_sum);
2529	}
2530	#endif // TEST_DICT
2531	}
2532
2533	gb_free_dictionary(dict);
2534	}
2535
2536	error = GB_end_transaction(gb_main, error);
2537	}
2538
2539	#ifdef TEST_DICT
2540	if (!error) {
2541	printf(" overall uncompressed size = %li b\n"
2542	" overall compressed size = %li b (Ratio=%li%%)\n",
2543	uncompressed_sum, compressed_sum,
2544	(compressed_sum*100)/uncompressed_sum);
2545	}
2546	#endif // TEST_DICT
2547
2548	printf("Done.\n");
2549
2550	g_b_opti_freeGbdByKey(gbk);
2551	}
2552
2553	return error;
2554	}
2555
2556	GB_ERROR GB_optimize(GBDATA *gb_main) {
2557	unsigned long maxKB = GB_get_usable_memory();
2558	long maxMem;
2559	GB_ERROR error = NULp;
2560	GB_UNDO_TYPE prev_undo_type = GB_get_requested_undo_type(gb_main);
2561
2562	#ifdef DEBUG
2563	maxKB /= 2;
2564	#endif
2565
2566	if (maxKB<=(LONG_MAX/1024)) maxMem = maxKB*1024;
2567	else maxMem = LONG_MAX;
2568
2569	error = GB_request_undo_type(gb_main, GB_UNDO_KILL);
2570	if (!error) {
2571	error = gb_create_dictionaries(GB_MAIN(gb_main), maxMem);
2572	if (!error) GB_disable_quicksave(gb_main, "Database optimized");
2573	ASSERT_NO_ERROR(GB_request_undo_type(gb_main, prev_undo_type));
2574	}
2575	return error;
2576	}
2577
2578	// --------------------------------------------------------------------------------
2579
2580	#ifdef UNIT_TESTS
2581	#ifndef TEST_UNIT_H
2582	#include <test_unit.h>
2583	#endif
2584
2585	// #define TEST_AUTO_UPDATE // uncomment to auto-update binary result of DB optimization
2586	// #define TEST_AUTO_UPDATE_ASCII // uncomment to auto-update ascii-input from output (be careful with this!)
2587
2588	void TEST_SLOW_optimize() {
2589	// test DB optimization (optimizes compression)
2590	// [current coverage ~89%]
2591	//
2592	// Note: a test for sequence compression is in adseqcompr.cxx@TEST_SLOW_sequence_compression
2593
2594	GB_shell shell;
2595
2596	const char *source_ascii = "TEST_opti_ascii_in.arb";
2597	const char *target_ascii = "TEST_opti_ascii_out.arb";
2598
2599	const char *nonopti = "TEST_opti_none.arb";
2600	const char *optimized = "TEST_opti_initial.arb";
2601	const char *reoptimized = "TEST_opti_again.arb";
2602	const char *expected = "TEST_opti_expected.arb"; // expected result after optimize
2603
2604	// initial optimization of ASCII-DB
2605	{
2606	GBDATA *gb_main = GB_open(source_ascii, "rw");
2607
2608	TEST_EXPECT_NO_ERROR(GB_save_as(gb_main, nonopti, "b"));
2609
2610	{
2611	GB_topSecurityLevel unsecured(gb_main);
2612	TEST_EXPECT_NO_ERROR(GB_optimize(gb_main));
2613	}
2614
2615	GB_flush_cache(gb_main);
2616
2617	TEST_EXPECT_NO_ERROR(GB_save_as(gb_main, optimized, "b"));
2618	TEST_EXPECT_NO_ERROR(GB_save_as(gb_main, target_ascii, "a"));
2619
2620	#if defined(TEST_AUTO_UPDATE)
2621	TEST_COPY_FILE(optimized, expected);
2622	#endif
2623	#if defined(TEST_AUTO_UPDATE_ASCII)
2624	TEST_COPY_FILE(target_ascii, source_ascii);
2625	#endif
2626
2627	TEST_EXPECT_TEXTFILE_DIFFLINES(source_ascii, target_ascii, 0);
2628	TEST_EXPECT_FILES_EQUAL(optimized, expected);
2629
2630	// check that optimization made sense:
2631	long nonopti_size = GB_size_of_file(nonopti);
2632	long optimized_size = GB_size_of_file(optimized);
2633	TEST_EXPECT_LESS(optimized_size, nonopti_size); // did file shrink?
2634	TEST_EXPECT_EQUAL(optimized_size*100/nonopti_size, 74); // document compression ratio (in percent)
2635
2636	GB_close(gb_main);
2637	}
2638
2639	// re-optimize DB (which is already compressed)
2640	{
2641	GBDATA *gb_main = GB_open(optimized, "rw");
2642
2643	{
2644	GB_topSecurityLevel unsecured(gb_main);
2645	TEST_EXPECT_NO_ERROR(GB_optimize(gb_main));
2646	}
2647
2648	TEST_EXPECT_NO_ERROR(GB_save_as(gb_main, reoptimized, "b"));
2649	TEST_EXPECT_NO_ERROR(GB_save_as(gb_main, target_ascii, "a"));
2650
2651	TEST_EXPECT_TEXTFILE_DIFFLINES(source_ascii, target_ascii, 0);
2652	TEST_EXPECT_FILES_EQUAL(reoptimized, expected); // reoptimize should produce same result as initial optimize
2653
2654	GB_close(gb_main);
2655	}
2656
2657	TEST_EXPECT_NO_ERROR(GBK_system(GBS_global_string("rm %s %s %s %s", target_ascii, nonopti, optimized, reoptimized)));
2658	}
2659
2660	void TEST_ticket_742() {
2661	// reproduce problem reported in ticket #742
2662	GB_shell shell;
2663	{
2664	const char *opti_db = "TEST_opti_expected.arb"; // expected result after optimize
2665	GBDATA *gb_main = GBT_open(opti_db, "rw"); // use GBT_open to create index for 'name'
2666
2667	for (int suf = 2; suf<=4; ++suf) {
2668	char *sainame = GBS_global_string_copy("forcecompress_%i", suf);
2669	char *mod_sainame = GBS_global_string_copy("%s.mod", sainame);
2670
2671	GBDATA *gb_sai_name;
2672	{
2673	GB_transaction ta(gb_main);
2674
2675	GBDATA *gb_sai = GBT_find_SAI(gb_main, sainame); TEST_REJECT_NULL(gb_sai);
2676	gb_sai_name = GB_entry(gb_sai, "name"); TEST_REJECT_NULL(gb_sai_name);
2677
2678	TEST_EXPECT_EQUAL(GB_read_char_pntr(gb_sai_name), sainame);
2679
2680	TEST_EXPECT_NO_ERROR(GB_write_string(gb_sai_name, mod_sainame));
2681	TEST_EXPECT_EQUAL(GB_read_char_pntr(gb_sai_name), mod_sainame); // fixed
2682
2683	switch (suf) {
2684	case 2:
2685	// do nothing special (only test value in next TA below)
2686	break;
2687	case 3:
2688	// flushing the cache "solves" the problem => GB_write_string does not flush correctly
2689	GB_flush_cache(gb_sai_name);
2690	TEST_EXPECT_EQUAL(GB_read_char_pntr(gb_sai_name), mod_sainame);
2691	break;
2692	case 4:
2693	// writing twice also "solves" the problem
2694	TEST_EXPECT_NO_ERROR(GB_write_string(gb_sai_name, mod_sainame));
2695	TEST_EXPECT_EQUAL(GB_read_char_pntr(gb_sai_name), mod_sainame);
2696	break;
2697
2698
2699	default:
2700	TEST_REJECT(true);
2701	break;
2702	}
2703	}
2704
2705	{
2706	GB_transaction ta(gb_main);
2707	TEST_EXPECT_EQUAL(GB_read_char_pntr(gb_sai_name), mod_sainame); // ok in next TA
2708	}
2709
2710	free(mod_sainame);
2711	free(sainame);
2712	}
2713
2714	{
2715	GB_transaction ta(gb_main);
2716
2717	GBDATA *gb_species = GBT_find_species(gb_main, "FrnPhilo"); TEST_REJECT_NULL(gb_species);
2718	GBDATA *gb_data = GB_create(gb_species, "data", GB_STRING); TEST_REJECT_NULL(gb_data);
2719
2720	const char *seq1 = ".....................AUUCUGGUU-----GA--U-CC-U-G------------..............";
2721	const char *seq2 = "........A-A--CU---------------C-A-A-A-G-GA-G--AC---A-C-U-G...............";
2722
2723	TEST_EXPECT_NO_ERROR(GB_write_string(gb_data, seq1));
2724	TEST_EXPECT_EQUAL(GB_read_char_pntr(gb_data), seq1);
2725	TEST_EXPECT_EQUAL(gb_data->flags.compressed_data, 1);
2726	TEST_EXPECT_EQUAL(gb_data->flags2.extern_data, 1);
2727
2728	TEST_EXPECT_NO_ERROR(GB_write_string(gb_data, seq2));
2729	TEST_EXPECT_EQUAL(GB_read_char_pntr(gb_data), seq2);
2730	TEST_EXPECT_EQUAL(gb_data->flags.compressed_data, 1);
2731	TEST_EXPECT_EQUAL(gb_data->flags2.extern_data, 1);
2732
2733	TEST_EXPECT_NO_ERROR(GB_create_index(GBT_get_species_data(gb_main), "data", GB_IGNORE_CASE, 20));
2734
2735	TEST_EXPECT_NO_ERROR(GB_write_string(gb_data, seq1));
2736	TEST_EXPECT_EQUAL(GB_read_char_pntr(gb_data), seq1); // fixed
2737
2738	TEST_EXPECT_NO_ERROR(GB_write_string(gb_data, seq2));
2739	TEST_EXPECT_EQUAL(GB_read_char_pntr(gb_data), seq2); // seems to work ...
2740
2741	GB_flush_cache(gb_data);
2742	TEST_EXPECT_EQUAL(GB_read_char_pntr(gb_data), seq2); // fixed
2743	}
2744
2745	GB_close(gb_main);
2746	}
2747	}
2748
2749	void TEST_AFTER_SLOW_streamed_ascii_save_asUsedBy_silva_pipeline() { // run after TEST_SLOW_loadsave!
2750	GB_shell shell;
2751
2752	const char *loadname = "TEST_loadsave_ascii.arb";
2753	const char *savename = "TEST_streamsaved.arb";
2754
2755	{
2756	GBDATA *gb_main1 = GB_open(loadname, "r"); TEST_REJECT_NULL(gb_main1);
2757	GBDATA *gb_main2 = GB_open(loadname, "rw"); TEST_REJECT_NULL(gb_main2);
2758
2759	// delete all species from DB2
2760	GBDATA *gb_species_data2 = NULp;
2761	{
2762	GB_transaction ta(gb_main2);
2763	GB_topSecurityLevel unsecured(gb_main2);
2764	gb_species_data2 = GBT_get_species_data(gb_main2);
2765	for (GBDATA *gb_species = GBT_first_species_rel_species_data(gb_species_data2);
2766	gb_species;
2767	gb_species = GBT_next_species(gb_species))
2768	{
2769	TEST_EXPECT_NO_ERROR(GB_delete(gb_species));
2770	}
2771	}
2772
2773	for (int saveWhileTransactionOpen = 0; saveWhileTransactionOpen<=1; ++saveWhileTransactionOpen) {
2774	{
2775	ArbDBWriter *writer = NULp;
2776	TEST_EXPECT_NO_ERROR(GB_start_streamed_save_as(gb_main2, savename, "a", writer));
2777	TEST_EXPECT_NO_ERROR(GB_stream_save_part(writer, gb_main2, gb_species_data2));
2778
2779	{
2780	GB_transaction ta1(gb_main1);
2781	GB_transaction ta2(gb_main2);
2782
2783	for (GBDATA *gb_species1 = GBT_first_species(gb_main1);
2784	gb_species1;
2785	gb_species1 = GBT_next_species(gb_species1))
2786	{
2787	GBDATA *gb_species2 = GB_create_container(gb_species_data2, "species");
2788	if (!gb_species2) {
2789	TEST_EXPECT_NO_ERROR(GB_await_error());
2790	}
2791	else {
2792	TEST_EXPECT_NO_ERROR(GB_copy_dropMarksAndTempstate(gb_species2, gb_species1));
2793	GB_write_flag(gb_species2, GB_read_flag(gb_species1)); // copy marked flag
2794	}
2795
2796	if (!saveWhileTransactionOpen) GB_commit_transaction(gb_main2);
2797	TEST_EXPECT_NO_ERROR(GB_stream_save_part(writer, gb_species2, gb_species2));
2798	if (!saveWhileTransactionOpen) GB_begin_transaction(gb_main2);
2799
2800	GB_topSecurityLevel unsecured(gb_main2);
2801	TEST_EXPECT_NO_ERROR(GB_delete(gb_species2));
2802	}
2803	}
2804
2805	TEST_EXPECT_NO_ERROR(GB_stream_save_part(writer, gb_species_data2, gb_main2));
2806	TEST_EXPECT_NO_ERROR(GB_finish_stream_save(writer));
2807	}
2808
2809	// test file content
2810	TEST_EXPECT_TEXTFILES_EQUAL(savename, loadname); // (Note: if this fails, run auto-update in TEST_SLOW_loadsave)
2811	TEST_EXPECT_ZERO_OR_SHOW_ERRNO(GB_unlink(savename));
2812	}
2813
2814	// test some error cases:
2815	{
2816	// 1. stream binary (not supported)
2817	ArbDBWriter *writer = NULp;
2818	TEST_EXPECT_NO_ERROR(GB_start_streamed_save_as(gb_main2, savename, "b", writer));
2819	TEST_EXPECT_ERROR_CONTAINS(GB_stream_save_part(writer, gb_main2, gb_species_data2), "only supported for ascii");
2820	GB_finish_stream_save(writer);
2821	TEST_EXPECT(!GB_is_regularfile(savename)); // no partial file remains
2822
2823	// 2. invalid use of GB_stream_save_part (not ancestors)
2824	TEST_EXPECT_NO_ERROR(GB_start_streamed_save_as(gb_main2, savename, "a", writer));
2825	TEST_EXPECT_NO_ERROR(GB_stream_save_part(writer, gb_main2, gb_species_data2));
2826	GBDATA *gb_extended_data2;
2827	{
2828	GB_transaction ta(gb_main2);
2829	gb_extended_data2 = GBT_get_SAI_data(gb_main2);
2830	}
2831	TEST_EXPECT_ERROR_CONTAINS(GB_stream_save_part(writer, gb_species_data2, gb_extended_data2), "has to be an ancestor");
2832	GB_finish_stream_save(writer);
2833	TEST_EXPECT(!GB_is_regularfile(savename)); // no partial file remains
2834
2835	}
2836	// TEST_EXPECT_ZERO_OR_SHOW_ERRNO(GB_unlink(savename));
2837
2838	GB_close(gb_main2);
2839	GB_close(gb_main1);
2840	}
2841	}
2842
2843	#endif // UNIT_TESTS
2844
2845	// --------------------------------------------------------------------------------
2846

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/ARBDB/adoptimize.cxx

Download in other formats: