Context Navigation

source: tags/arb-6.0.5/ARBDB/adoptimize.cxx

Visit:

Last change on this file was 11838, checked in by westram, 12 years ago
code visibility made functions static moved functions into UNIT_TESTS section
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 86.9 KB

Line
1	// =============================================================== //
2	// //
3	// File : adoptimize.cxx //
4	// Purpose : //
5	// //
6	// Institute of Microbiology (Technical University Munich) //
7	// http://www.arb-home.de/ //
8	// //
9	// =============================================================== //
10
11	#include <climits>
12	#include <netinet/in.h>
13
14	#include <arb_file.h>
15	#include <arb_diff.h>
16
17	#include <arbdbt.h>
18
19	#include "gb_key.h"
20	#include "gb_compress.h"
21	#include "gb_dict.h"
22
23	#include "arb_progress.h"
24
25	#if defined(DEBUG)
26	// #define TEST_DICT
27	#endif // DEBUG
28
29	typedef unsigned char unsigned_char;
30	typedef unsigned char *u_str;
31	typedef const unsigned char *cu_str;
32
33	static int gbdByKey_cnt;
34	struct O_gbdByKey { // one for each diff. keyQuark
35	int cnt;
36	GBDATA **gbds; // gbdoff
37	};
38
39	struct FullDictTree;
40	struct SingleDictTree;
41
42	union DictTree {
43	FullDictTree *full;
44	SingleDictTree *single;
45	void *exists;
46
47	};
48
49	enum DictNodeType { SINGLE_NODE, FULL_NODE };
50
51	struct FullDictTree {
52	DictNodeType typ; // always FULL_NODE
53	int usedSons;
54	int count[256];
55	DictTree son[256]; // index == character
56	};
57
58	struct SingleDictTree {
59	DictNodeType typ; // always SINGLE_NODE
60	unsigned_char ch; // the character
61	int count; // no of occurrences of this branch
62	DictTree son;
63	DictTree brother;
64
65	};
66
67	// **************************************************
68
69	#define COMPRESSIBLE(type) ((type) >= GB_BYTES && (type)<=GB_STRING)
70	#define DICT_MEM_WEIGHT 4
71
72	#define WORD_HELPFUL(wordlen, occurrences) ((long)((occurrences)3 + DICT_MEM_WEIGHT(2*sizeof(GB_NINT)+(wordlen))) \
73	< \
74	(long)((occurrences)*(wordlen)))
75	/* (occurrences)*4 compressed size
76	* 2*sizeof(GB_NINT)+(wordlen) size in dictionary
77	* (occurrences)*(wordlen) uncompressed size
78	*/
79
80	// **************************************************
81
82	#define MIN_WORD_LEN 8 // minimum length of words in dictionary
83	#define MAX_WORD_LEN 50 // maximum length of words in dictionary
84	#define MAX_BROTHERS 10 /* maximum no of brothers linked with SingleDictTree
85	* above we use FullDictTree */
86	#define MAX_DIFFER 2 /* percentage of difference (of occurrences of strings) below which two
87	* consecutive parts are treated as EQUAL # of occurrences */
88	#define INCR_DIFFER 1 // the above percentage is incremented from 0 to MAX_DIFFER by INCR_DIFFER per step
89
90	#define DICT_STRING_INCR 1024 // dictionary string will be incremented by this size
91
92	// ***************** Tool functions ****************
93
94	inline cu_str get_data_n_size(GBDATA gbd, size_t size) {
95	GB_CSTR data;
96	*size = 0;
97
98	switch (gbd->type()) {
99	case GB_STRING: data = GB_read_char_pntr(gbd); break;
100	case GB_LINK: data = GB_read_link_pntr(gbd); break;
101	case GB_BYTES: data = GB_read_bytes_pntr(gbd); break;
102	case GB_INTS: data = (char*)GB_read_ints_pntr(gbd); break;
103	case GB_FLOATS: data = (char*)GB_read_floats_pntr(gbd); break;
104	default:
105	data = 0;
106	gb_assert(0);
107	break;
108	}
109
110	if (data) *size = gbd->as_entry()->uncompressed_size();
111	return (cu_str)data;
112	}
113
114	static inline long min(long a, long b) {
115	return a<b ? a : b;
116	}
117
118	// **************************************************
119
120	static void g_b_opti_scanGbdByKey(GB_MAIN_TYPE Main, GBDATA gbd, O_gbdByKey *gbk) {
121	if (gbd->is_container()) {
122	GBCONTAINER *gbc = gbd->as_container();
123	for (int idx=0; idx < gbc->d.nheader; idx++) {
124	GBDATA *gbd2 = GBCONTAINER_ELEM(gbc, idx);
125	if (gbd2) g_b_opti_scanGbdByKey(Main, gbd2, gbk);
126	}
127	}
128
129	GBQUARK quark = GB_KEY_QUARK(gbd);
130	if (quark)
131	{
132	gb_assert(gbk[quark].cnt < Main->keys[quark].nref \|\| quark==0);
133	gb_assert(gbk[quark].gbds != 0);
134
135	gbk[quark].gbds[gbk[quark].cnt] = gbd;
136	gbk[quark].cnt++;
137	}
138	}
139
140	static O_gbdByKey g_b_opti_createGbdByKey(GB_MAIN_TYPE Main)
141	{
142	int idx;
143	O_gbdByKey gbk = (O_gbdByKey )GB_calloc(Main->keycnt, sizeof(O_gbdByKey));
144
145	gbdByKey_cnt = Main->keycnt; // always use gbdByKey_cnt instead of Main->keycnt cause Main->keycnt can change
146
147	for (idx=1; idx<gbdByKey_cnt; idx++) {
148	gbk[idx].cnt = 0;
149
150	if (Main->keys[idx].key && Main->keys[idx].nref>0) {
151	gbk[idx].gbds = (GBDATA *) GB_calloc(Main->keys[idx].nref, sizeof(GBDATA));
152	}
153	else {
154	gbk[idx].gbds = NULL;
155	}
156	}
157
158	gbk[0].cnt = 0;
159	gbk[0].gbds = (GBDATA *)GB_calloc(1, sizeof(GBDATA));
160
161	g_b_opti_scanGbdByKey(Main, Main->gb_main(), gbk);
162
163	for (idx=0; idx<gbdByKey_cnt; idx++) {
164	if (gbk[idx].cnt != Main->keys[idx].nref && idx)
165	{
166	printf("idx=%i gbk[idx].cnt=%i Main->keys[idx].nref=%li\n", // Main->keys[].nref ist falsch
167	idx, gbk[idx].cnt, Main->keys[idx].nref);
168
169	Main->keys[idx].nref = gbk[idx].cnt;
170	}
171	}
172	return gbk;
173	}
174
175	static void g_b_opti_freeGbdByKey(O_gbdByKey *gbk) {
176	for (int idx=0; idx<gbdByKey_cnt; idx++) free(gbk[idx].gbds);
177	free(gbk);
178	}
179
180	// ***************** Convert old compression style to new style ****************
181
182	static GB_ERROR gb_convert_compression(GBDATA *gbd) {
183	GB_ERROR error = 0;
184
185	if (gbd->is_container()) {
186	for (GBDATA *gb_child = GB_child(gbd); gb_child; gb_child = GB_nextChild(gb_child)) {
187	if (gb_child->flags.compressed_data \|\| gb_child->is_container()) {
188	error = gb_convert_compression(gb_child);
189	if (error) break;
190	}
191	}
192	}
193	else {
194	char *str = 0;
195	GBENTRY *gbe = gbd->as_entry();
196	long elems = gbe->size();
197	size_t data_size = gbe->uncompressed_size();
198	size_t new_size = -1;
199	int expectData = 1;
200
201	switch (gbd->type()) {
202	case GB_STRING:
203	case GB_LINK:
204	case GB_BYTES:
205	str = gb_uncompress_bytes(gbe->data(), data_size, &new_size);
206	if (str) {
207	gb_assert(new_size == data_size);
208	str = GB_memdup(str, data_size);
209	}
210	break;
211
212	case GB_INTS:
213	case GB_FLOATS:
214	str = gb_uncompress_longs_old(gbe->data(), elems, &new_size);
215	if (str) {
216	gb_assert(new_size == data_size);
217	str = GB_memdup(str, data_size);
218	}
219	break;
220
221	default:
222	expectData = 0;
223	break;
224	}
225
226	if (!str) {
227	if (expectData) {
228	error = GBS_global_string("Can't read old data to convert compression (Reason: %s)", GB_await_error());
229	}
230	}
231	else {
232	switch (gbd->type()) {
233	case GB_STRING:
234	error = GB_write_string(gbe, "");
235	if (!error) error = GB_write_string(gbe, str);
236	break;
237
238	case GB_LINK:
239	error = GB_write_link(gbe, "");
240	if (!error) error = GB_write_link(gbe, str);
241	break;
242
243	case GB_BYTES:
244	error = GB_write_bytes(gbe, "", 0);
245	if (!error) error = GB_write_bytes(gbe, str, data_size);
246	break;
247
248	case GB_INTS:
249	case GB_FLOATS:
250	error = GB_write_pntr(gbe, str, data_size, elems);
251	break;
252
253	default:
254	gb_assert(0);
255	break;
256	}
257
258	free(str);
259	}
260	}
261	return error;
262	}
263
264	GB_ERROR gb_convert_V2_to_V3(GBDATA *gb_main) {
265	GB_ERROR error = 0;
266	GBDATA *gb_system = GB_search(gb_main, GB_SYSTEM_FOLDER, GB_FIND);
267
268	if (!gb_system) {
269	GB_create_container(gb_main, GB_SYSTEM_FOLDER);
270	if (GB_entry(gb_main, "extended_data")) {
271	GB_warning("Converting data from old V2.0 to V2.1 Format:\n"
272	" Please Wait (may take some time)");
273	}
274	error = gb_convert_compression(gb_main);
275	GB_disable_quicksave(gb_main, "Database converted to new format");
276	}
277	return error;
278	}
279
280
281	// ******************* Compress by dictionary ******************
282
283
284	/* compression tag format:
285	*
286	* unsigned int compressed:1;
287	* if compressed==0:
288	* unsigned int last:1; ==1 -> this is the last block
289	* unsigned int len:6; length of uncompressible bytes
290	* char[len];
291	* if compressed==1:
292	* unsigned int idxlen:1; ==0 -> 10-bit index
293	* ==1 -> 18-bit index
294	* unsigned int idxhigh:2; the 2 highest bits of the index
295	* unsigned int len:4; (length of word) - (MIN_COMPR_WORD_LEN-1)
296	* if len==0:
297	* char extralen; (length of word) -
298	* char[idxlen+1]; index (low,high)
299	*
300	* tag == 64 -> end of dictionary compressed block (if not coded in last uncompressed block)
301	*/
302
303	inline int INDEX_DICT_OFFSET(int idx, GB_DICTIONARY *dict) {
304	gb_assert(idx<dict->words);
305	return ntohl(dict->offsets[idx]);
306	}
307	inline int ALPHA_DICT_OFFSET(int idx, GB_DICTIONARY *dict) {
308	int realIndex;
309	gb_assert(idx<dict->words);
310	realIndex = ntohl(dict->resort[idx]);
311	return INDEX_DICT_OFFSET(realIndex, dict);
312	}
313
314	// #define ALPHA_DICT_OFFSET(i) ntohl(offset[ntohl(resort[i])])
315	// #define INDEX_DICT_OFFSET(i) ntohl(offset[i])
316
317	#define LEN_BITS 4
318	#define INDEX_BITS 2
319	#define INDEX_LEN_BITS 1
320
321	#define LEN_SHIFT 0
322	#define INDEX_SHIFT (LEN_SHIFT+LEN_BITS)
323	#define INDEX_LEN_SHIFT (INDEX_SHIFT+INDEX_BITS)
324
325	#define BITMASK(bits) ((1<<(bits))-1)
326	#define GETVAL(tag,typ) (((tag)>>typ##_SHIFT)&BITMASK(typ##_BITS))
327
328	#define MIN_SHORTLEN 6
329	#define MAX_SHORTLEN (BITMASK(LEN_BITS)+MIN_SHORTLEN-1)
330	#define MIN_LONGLEN (MAX_SHORTLEN+1)
331	#define MAX_LONGLEN (MIN_LONGLEN+255)
332
333	#define SHORTLEN_DECR (MIN_SHORTLEN-1) // !! zero is used as flag for long len !!
334	#define LONGLEN_DECR MIN_LONGLEN
335
336	#define MIN_COMPR_WORD_LEN MIN_SHORTLEN
337	#define MAX_COMPR_WORD_LEN MAX_LONGLEN
338
339	#define MAX_SHORT_INDEX BITMASK(INDEX_BITS+8)
340	#define MAX_LONG_INDEX BITMASK(INDEX_BITS+16)
341
342	#define LAST_COMPRESSED_BIT 64
343
344	#ifdef DEBUG
345	# define DUMP_COMPRESSION_TEST 0
346	/* 0 = only compression ratio
347	* 1 = + original/compressed/decompressed
348	* 2 = + words used to compress/uncompress
349	* 3 = + matching words in dictionary
350	* 4 = + search of words in dictionary
351	*/
352	#else
353	# define DUMP_COMPRESSION_TEST 0
354	#endif
355
356	#ifdef DEBUG
357	// #define COUNT_CHUNKS
358
359	#if defined(COUNT_CHUNKS)
360
361	static long uncompressedBlocks[64];
362	static long compressedBlocks[MAX_LONGLEN];
363
364	static void clearChunkCounters() {
365	int i;
366
367	for (i=0; i<64; i++) uncompressedBlocks[i] = 0;
368	for (i=0; i<MAX_LONGLEN; i++) compressedBlocks[i] = 0;
369	}
370
371	static void dumpChunkCounters() {
372	int i;
373
374	printf("------------------------------\n" "Uncompressed blocks used:\n");
375	for (i=0; i<64; i++) if (uncompressedBlocks[i]) printf(" size=%i used=%li\n", i, uncompressedBlocks[i]);
376	printf("------------------------------\n" "Words used:\n");
377	for (i=0; i<MAX_LONGLEN; i++) if (compressedBlocks[i]) printf(" size=%i used=%li\n", i, compressedBlocks[i]);
378	printf("------------------------------\n");
379	}
380	#endif // COUNT_CHUNKS
381
382	static cu_str lstr(cu_str s, int len) {
383	#define BUFLEN 20000
384	static unsigned_char buf[BUFLEN];
385
386	gb_assert(len<BUFLEN);
387	memcpy(buf, s, len);
388	buf[len] = 0;
389
390	return buf;
391	}
392
393	#if DUMP_COMPRESSION_TEST>=2
394
395	static cu_str dict_word(GB_DICTIONARY *dict, int idx, int len) {
396	return lstr(dict->text+INDEX_DICT_OFFSET(idx, dict), len);
397	}
398
399	#endif
400
401	#if DUMP_COMPRESSION_TEST>=1
402
403	static void dumpBinary(u_str data, long size) {
404	#define PER_LINE 12
405	int cnt = 0;
406
407	while (size--) {
408	unsigned_char c = *data++;
409	int bitval = 128;
410	int bits = 8;
411
412	while (bits--) {
413	putchar(c&bitval ? '1' : '0');
414	bitval>>=1;
415	}
416	putchar(' ');
417
418	cnt = (cnt+1)%PER_LINE;
419	if (!cnt) putchar('\n');
420	}
421
422	if (cnt) putchar('\n');
423	}
424
425	#endif
426
427	#endif
428
429	inline int GB_MEMCMP(const void vm1, const void vm2, long size) {
430	char c1 = (char)vm1,
431	c2 = (char)vm2;
432	int diff = 0;
433
434	while (size-- && !diff) diff = c1++-c2++;
435	return diff;
436	}
437
438	// --------------------------------------------------
439
440	static int searchWord(GB_DICTIONARY dict, cu_str source, long size, unsigned long wordIndex, int *wordLen)
441	{
442	int idx = -1;
443	int l = 0;
444	int h = dict->words-1;
445	cu_str text = dict->text;
446	GB_NINT *resort = dict->resort;
447	int dsize = dict->textlen;
448	int ilen = 0;
449
450	while (l<h-1) {
451	int m = (l+h)/2;
452	long off = ALPHA_DICT_OFFSET(m, dict);
453	cu_str dictword = text+off;
454	long msize = min(size, dsize-off);
455
456	#if DUMP_COMPRESSION_TEST>=4
457	printf(" %s (%i)\n", lstr(dictword, 20), m);
458	#endif
459
460	if (GB_MEMCMP(source, dictword, msize)<=0) h = m;
461	else l = m;
462	}
463
464	while (l<=h) {
465	int off = ALPHA_DICT_OFFSET(l, dict);
466	cu_str word = text+off;
467	int msize = (int)min(size, dsize-off);
468	int equal = 0;
469	cu_str s = source;
470
471	while (msize-- && s++==word++) equal++;
472
473	#if DUMP_COMPRESSION_TEST>=3
474	if (equal>=MIN_COMPR_WORD_LEN) {
475	printf(" EQUAL=%i '%s' (%i->%i, off=%i)", equal, lstr(text+off, equal), l, ntohl(resort[l]), ALPHA_DICT_OFFSET(l, dict));
476	printf(" (context=%s)\n", lstr(text+off-min(off, 20), min(off, 20)+equal+20));
477	}
478	#endif
479
480	if (equal>ilen) {
481	ilen = equal;
482	idx = ntohl(resort[l]);
483	gb_assert(idx<dict->words);
484	}
485
486	l++;
487	}
488
489	*wordIndex = idx;
490	*wordLen = (int)min(ilen, MAX_COMPR_WORD_LEN);
491
492	return idx!=-1 && ilen>=MIN_COMPR_WORD_LEN;
493	}
494
495	#ifdef DEBUG
496	int lookup_DICTIONARY(GB_DICTIONARY *dict, GB_CSTR source) { // used for debugging
497	unsigned long wordIndex;
498	int wordLen;
499	int wordFound = searchWord(dict, (cu_str)source, strlen(source), &wordIndex, &wordLen);
500
501	if (wordFound) {
502	printf("'%s' (idx=%lu, off=%i)\n", lstr(dict->text+ntohl(dict->offsets[wordIndex]), wordLen), wordIndex, ntohl(dict->offsets[wordIndex]));
503	}
504
505	return wordFound;
506	}
507	#endif
508
509
510
511	static char gb_uncompress_by_dictionary_internal(GB_DICTIONARY dict, /* GBDATA gbd, / GB_CSTR s_source, const size_t size, bool append_zero, size_t *new_size) {
512	cu_str source = (cu_str)s_source;
513	u_str dest;
514	u_str buffer;
515	cu_str text = dict->text;
516	int done = 0;
517	long left = size;
518
519	dest = buffer = (u_str)GB_give_other_buffer(s_source, size+2);
520
521	while (left && !done) {
522	int c;
523
524	if ((c=*source++)&128) { // compressed data
525	int indexLen = GETVAL(c, INDEX_LEN);
526	unsigned long idx = GETVAL(c, INDEX);
527
528	c = GETVAL(c, LEN); // ==wordLen
529	if (c) c += SHORTLEN_DECR;
530	else c = *source+++LONGLEN_DECR;
531
532	gb_assert(indexLen>=0 && indexLen<=1);
533
534	if (indexLen==0) {
535	idx = (idx << 8) \| *source++;
536	}
537	else {
538	idx = (((idx << 8) \| source[1]) << 8) \| source[0];
539	source += 2;
540	}
541
542	gb_assert(idx<(GB_ULONG)dict->words);
543
544	{
545	cu_str word = text+INDEX_DICT_OFFSET(idx, dict);
546
547	#if DUMP_COMPRESSION_TEST>=2
548	printf(" word='%s' (idx=%lu, off=%li, len=%i)\n",
549	lstr(word, c), idx, (long)ntohl(dict->offsets[idx]), c);
550	#endif
551
552	{
553	u_str d = dest;
554	gb_assert(((d + c) <= word) \|\| (d >= (word + c)));
555	while (c--) d++ = word++;
556	dest = d;
557	}
558	}
559	}
560	else { // uncompressed bytes
561	if (c & LAST_COMPRESSED_BIT) {
562	done = 1;
563	c ^= LAST_COMPRESSED_BIT;
564	}
565
566	left -= c;
567	{
568	u_str d = dest;
569	gb_assert(((d + c) <= source) \|\| (d >= (source + c)));
570	while (c--) d++ = source++;
571	dest=d;
572	}
573	}
574	}
575
576	if (append_zero) *dest++ = 0;
577
578	*new_size = dest-buffer;
579	gb_assert(size >= *new_size); // buffer overflow
580
581	return (char *)buffer;
582	}
583
584	char gb_uncompress_by_dictionary(GBDATA gbd, GB_CSTR s_source, size_t size, size_t *new_size)
585	{
586	GB_DICTIONARY *dict = gb_get_dictionary(GB_MAIN(gbd), GB_KEY_QUARK(gbd));
587	bool append_zero = gbd->is_a_string();
588
589	if (!dict) {
590	GB_ERROR error = GBS_global_string("Cannot decompress db-entry '%s' (no dictionary found)\n", GB_get_db_path(gbd));
591	GB_export_error(error);
592	return 0;
593	}
594
595	return gb_uncompress_by_dictionary_internal(dict, s_source, size, append_zero, new_size);
596	}
597
598	char gb_compress_by_dictionary(GB_DICTIONARY dict, GB_CSTR s_source, size_t size, size_t *msize, int last_flag, int search_backward, int search_forward)
599	{
600	cu_str source = (cu_str)s_source;
601	u_str dest;
602	u_str buffer;
603	cu_str unknown = source; // start of uncompressible bytes
604	u_str lastUncompressed = NULL; // ptr to start of last block of uncompressible bytes (in dest)
605
606	#if defined(ASSERTION_USED)
607	const size_t org_size = size;
608	#endif // ASSERTION_USED
609
610	gb_assert(size>0); // compression of zero-length data fails!
611
612	dest = buffer = (u_str)GB_give_other_buffer((GB_CSTR)source, 1+(size/63+1)+size);
613	*dest++ = GB_COMPRESSION_DICTIONARY \| last_flag;
614
615	while (size) {
616	unsigned long wordIndex;
617	int wordLen;
618	int wordFound;
619
620	if ((wordFound = searchWord(dict, source, size, &wordIndex, &wordLen))) {
621	int length;
622
623	takeRest :
624	length = source-unknown;
625
626	if (length) {
627	int shift;
628	int takeShift = 0;
629	int maxShift = (int)min(search_forward, wordLen-1);
630
631	for (shift=1; shift<=maxShift; shift++) {
632	unsigned long wordIndex2;
633	int wordLen2;
634	int wordFound2;
635
636	if ((wordFound2 = searchWord(dict, source+shift, size-shift, &wordIndex2, &wordLen2))) {
637	if (wordLen2>(wordLen+shift)) {
638	wordIndex = wordIndex2;
639	wordLen = wordLen2;
640	takeShift = shift;
641	}
642	}
643	}
644
645	if (takeShift) {
646	source += takeShift;
647	size -= takeShift;
648	length = source-unknown;
649	}
650	}
651
652	while (length) { // if there were uncompressible bytes
653	int take = (int)min(length, 63);
654
655	#ifdef COUNT_CHUNKS
656	uncompressedBlocks[take]++;
657	#endif
658
659	lastUncompressed = dest;
660
661	*dest++ = take; // tag byte
662	memcpy(dest, unknown, take);
663	dest += take;
664	unknown += take;
665	length -= take;
666	}
667
668	gb_assert(unknown==source);
669
670	while (wordFound) { // as long as we find words in dictionary
671	int indexLen = wordIndex>MAX_SHORT_INDEX;
672	int indexHighBits = indexLen==0 ? wordIndex>>8 : wordIndex>>16;
673	int nextWordFound;
674	int nextWordLen;
675	unsigned long nextWordIndex;
676
677	gb_assert((long)wordIndex<dict->words);
678	gb_assert((long)wordIndex <= MAX_LONG_INDEX);
679	gb_assert(indexHighBits==(indexHighBits & BITMASK(INDEX_BITS)));
680	gb_assert(wordLen>=MIN_SHORTLEN);
681
682	lastUncompressed = NULL;
683
684	{
685	cu_str source2 = source+wordLen;
686	long size2 = size-wordLen;
687
688	if (!(nextWordFound=searchWord(dict, source+wordLen, size-wordLen, &nextWordIndex, &nextWordLen))) { // no word right afterwards
689	int shift;
690
691	for (shift=1; shift<=search_backward && shift<(wordLen-MIN_COMPR_WORD_LEN); shift++) {
692	// try to cut end of word to get a better result
693	unsigned long wordIndex2;
694	int wordLen2;
695	int wordFound2;
696
697	if ((wordFound2=searchWord(dict, source2-shift, size2+shift, &wordIndex2, &wordLen2))) {
698	if (wordLen2>(shift+1)) {
699	wordLen -= shift;
700
701	nextWordFound = 1;
702	nextWordIndex = wordIndex2;
703	nextWordLen = wordLen2;
704	break;
705	}
706	}
707	}
708	}
709	}
710
711	#ifdef COUNT_CHUNKS
712	compressedBlocks[wordLen]++;
713	#endif
714
715	#if DUMP_COMPRESSION_TEST>=2
716	printf(" word='%s' (idx=%li, off=%i, len=%i)\n",
717	dict_word(dict, wordIndex, wordLen), wordIndex, (int)ntohl(dict->offsets[wordIndex]), wordLen);
718	#endif
719
720	if (wordLen<=MAX_SHORTLEN) {
721	*dest++ = 128 \|
722	(indexLen << INDEX_LEN_SHIFT) \|
723	(indexHighBits << INDEX_SHIFT) \|
724	((wordLen-SHORTLEN_DECR) << LEN_SHIFT);
725	}
726	else {
727	*dest++ = 128 \|
728	(indexLen << INDEX_LEN_SHIFT) \|
729	(indexHighBits << INDEX_SHIFT);
730	*dest++ = wordLen-LONGLEN_DECR; // extra length byte
731	}
732
733	*dest++ = (char)wordIndex; // low index byte
734	if (indexLen)
735	*dest++ = (char)(wordIndex >> 8); // high index byte
736
737	unknown = source += wordLen;
738	size -= wordLen;
739
740	wordFound = nextWordFound;
741	wordIndex = nextWordIndex;
742	wordLen = nextWordLen;
743	}
744	}
745	else {
746	source++;
747	if (--size==0) goto takeRest;
748	}
749	}
750
751	if (lastUncompressed) *lastUncompressed \|= LAST_COMPRESSED_BIT;
752	else *dest++ = LAST_COMPRESSED_BIT;
753
754	*msize = dest-buffer;
755
756	#if defined(ASSERTION_USED)
757	{
758	size_t new_size = -1;
759	char *test = gb_uncompress_by_dictionary_internal(dict, (GB_CSTR)buffer+1, org_size + GB_COMPRESSION_TAGS_SIZE_MAX, true, &new_size);
760
761	gb_assert(memcmp(test, s_source, org_size) == 0);
762	gb_assert((org_size+1) == new_size);
763	}
764	#endif // ASSERTION_USED
765
766	return (char*)buffer;
767	}
768
769
770	#if defined(TEST_DICT)
771
772	static void test_dictionary(GB_DICTIONARY dict, O_gbdByKey gbk, long uncompSum, long compSum) {
773	long uncompressed_sum = 0;
774	long compressed_sum = 0;
775
776	long dict_size = (dict->words2+1)sizeof(GB_NINT)+dict->textlen;
777
778	long char_count[256];
779	for (int i=0; i<256; i++) char_count[i] = 0;
780
781	printf(" * Testing compression..\n");
782
783	#ifdef COUNT_CHUNKS
784	clearChunkCounters();
785	#endif
786
787	for (int cnt=0; cnt<gbk->cnt; cnt++) {
788	GBDATA *gbd = gbk->gbds[cnt];
789
790	if (COMPRESSIBLE(gbd->type())) {
791	size_t size;
792	cu_str data = get_data_n_size(gbd, &size);
793
794	if (gbd->is_a_string()) size--;
795	if (size<1) continue;
796
797	u_str copy = (u_str)gbm_get_mem(size, GBM_DICT_INDEX);
798	gb_assert(copy!=0);
799	memcpy(copy, data, size);
800
801	#if DUMP_COMPRESSION_TEST>=1
802	printf("----------------------------\n");
803	printf("original : %3li b = '%s'\n", size, data);
804	#endif
805
806	int last_flag = 0;
807	size_t compressedSize;
808	u_str compressed = (u_str)gb_compress_by_dictionary(dict, (GB_CSTR)data, size, &compressedSize, last_flag, 9999, 2);
809
810	#if DUMP_COMPRESSION_TEST>=1
811	printf("compressed : %3li b = '%s'\n", compressedSize, lstr(compressed, compressedSize));
812	dumpBinary(compressed, compressedSize);
813	#endif
814
815	for (size_t i=0; i<compressedSize; i++) char_count[compressed[i]]++;
816
817	size_t new_size = -1;
818	u_str uncompressed = (u_str)gb_uncompress_by_dictionary(gbd, (char*)compressed+1, size+GB_COMPRESSION_TAGS_SIZE_MAX, &new_size);
819
820	#if DUMP_COMPRESSION_TEST>=1
821	printf("copy : %3li b = '%s'\n", size, lstr(copy, size));
822	printf("decompressed: %3li b = '%s'\n", size, lstr(uncompressed, size));
823	#endif
824
825	if (GB_MEMCMP(copy, uncompressed, size)!=0) {
826	int byte = 0;
827
828	while (copy[byte]==uncompressed[byte]) byte++;
829	printf("Error in compression (off=%i, '%s'", byte, lstr(copy+byte, 10));
830	printf("!='%s'\n", lstr(uncompressed+byte, 10));
831	}
832
833	if (compressedSize<size) {
834	uncompressed_sum += size;
835	compressed_sum += compressedSize;
836	}
837	else {
838	uncompressed_sum += size;
839	compressed_sum += size;
840	}
841
842	gbm_free_mem(copy, size, GBM_DICT_INDEX);
843	}
844	}
845
846	#ifdef COUNT_CHUNKS
847	dumpChunkCounters();
848	#endif
849
850	{
851	long compressed_plus_dict = compressed_sum+dict_size;
852	char *dict_text = GBS_global_string_copy("+dict %li b", dict_size);
853	long ratio = (compressed_plus_dict*100)/uncompressed_sum;
854
855	printf(" uncompressed size = %10li b\n"
856	" compressed size = %10li b\n"
857	" %17s = %10li b (Ratio=%li%%)\n",
858	uncompressed_sum,
859	compressed_sum,
860	dict_text, compressed_plus_dict, ratio);
861
862	free(dict_text);
863	}
864
865	*uncompSum += uncompressed_sum;
866	*compSum += compressed_sum+dict_size;
867	}
868
869	#endif // TEST_DICT
870
871
872	// ***************** Build dictionary ****************
873
874	#ifdef DEBUG
875	#define TEST // test trees?
876	// #define DUMP_TREE // dump trees?
877
878	// #define DUMP_EXPAND
879	/*
880	#define SELECT_WORDS
881	#define SELECTED_WORDS "oropl"
882	*/
883
884	# ifdef SELECT_WORDS
885	static char strnstr(char s1, int len, char *s2) {
886	char c = *s2;
887	int len2 = strlen(s2);
888
889	while (len-->=len2) {
890	if (*s1==c) {
891	if (strncmp(s1, s2, len2)==0) return s1;
892	}
893	s1++;
894	}
895
896	return NULL;
897	}
898	# endif
899
900	#ifdef DUMP_TREE
901	static void dump_dtree(int deep, DictTree tree)
902	{
903	static unsigned_char buffer[1024];
904
905	if (tree.full) {
906	switch (tree.full->typ) {
907	case FULL_NODE: {
908	int idx;
909
910	for (idx=0; idx<256; idx++) {
911	buffer[deep] = idx;
912	buffer[deep+1] = 0;
913
914	if (tree.full->son[idx].exists) dump_dtree(deep+1, tree.full->son[idx]);
915	else if (tree.full->count[idx]>0) printf(" '%s' (%i) [array]\n", buffer, tree.full->count[idx]);
916	}
917	break;
918	}
919	case SINGLE_NODE: {
920	buffer[deep] = tree.single->ch;
921	buffer[deep+1] = 0;
922
923	if (tree.single->son.exists) dump_dtree(deep+1, tree.single->son);
924	else printf(" '%s' (%i) [single]\n", buffer, tree.single->count);
925
926	if (tree.single->brother.exists) dump_dtree(deep, tree.single->brother);
927	break;
928	}
929	}
930	}
931	}
932	#endif
933
934	#else
935	#ifdef DUMP_TREE
936	# define dump_dtree(deep, tree)
937	#endif
938	#endif
939
940	#ifdef TEST
941	static int testCounts(DictTree tree) {
942	// tests if all inner nodes have correct 'count's
943	int cnt = 0;
944
945	if (tree.exists) {
946	switch (tree.full->typ) {
947	case SINGLE_NODE: {
948	while (tree.exists) {
949	if (tree.single->son.exists) {
950	int son_cnt = testCounts(tree.single->son);
951	#ifdef COUNT_EQUAL
952	gb_assert(son_cnt==tree.single->count);
953	#else
954	gb_assert(son_cnt<=tree.single->count);
955	#endif
956	}
957
958	gb_assert(tree.single->count>0);
959	cnt += tree.single->count;
960	tree = tree.single->brother;
961	}
962	break;
963	}
964	case FULL_NODE: {
965	int idx,
966	sons = 0;
967
968	for (idx=0; idx<256; idx++) {
969	if (tree.full->son[idx].exists) {
970	int son_cnt = testCounts(tree.full->son[idx]);
971	#ifdef COUNT_EQUAL
972	gb_assert(son_cnt==tree.full->count[idx]);
973	#else
974	gb_assert(son_cnt<=tree.full->count[idx]);
975	#endif
976	if (tree.full->usedSons) gb_assert(tree.full->count[idx]>0);
977	else gb_assert(tree.full->count[idx]==0);
978
979	sons++;
980	}
981	else if (tree.full->count[idx]) {
982	sons++;
983	}
984
985	cnt += tree.full->count[idx];
986	}
987
988	gb_assert(sons==tree.full->usedSons);
989	break;
990	}
991	}
992	}
993
994	return cnt;
995	}
996
997	// #define TEST_MAX_OCCUR_COUNT
998
999	#ifdef TEST_MAX_OCCUR_COUNT
1000	#define MAX_OCCUR_COUNT 600000
1001	#endif
1002
1003	static DictTree test_dtree(DictTree tree)
1004	// only correct while tree is under contruction (build_dict_tree())
1005	{
1006	if (tree.exists) {
1007	switch (tree.full->typ) {
1008	case SINGLE_NODE: {
1009	#if defined(TEST_MAX_OCCUR_COUNT)
1010	gb_assert(tree.single->count<MAX_OCCUR_COUNT); // quite improbable
1011	#endif // TEST_MAX_OCCUR_COUNT
1012
1013	if (tree.single->son.exists) {
1014	gb_assert(tree.single->count==0);
1015	test_dtree(tree.single->son);
1016	}
1017	else {
1018	gb_assert(tree.single->count>0);
1019	}
1020
1021	if (tree.single->brother.exists) test_dtree(tree.single->brother);
1022	break;
1023	}
1024	case FULL_NODE: {
1025	int idx;
1026	int countSons = 0;
1027
1028	for (idx=0; idx<256; idx++) {
1029	#if defined(TEST_MAX_OCCUR_COUNT)
1030	gb_assert(tree.full->count[idx]<MAX_OCCUR_COUNT); // quite improbable
1031	#endif // TEST_MAX_OCCUR_COUNT
1032
1033	if (tree.full->son[idx].exists) {
1034	gb_assert(tree.full->count[idx]==0);
1035	test_dtree(tree.full->son[idx]);
1036	countSons++;
1037	}
1038	else {
1039	gb_assert(tree.full->count[idx]>=0);
1040	if (tree.full->count[idx]>0)
1041	countSons++;
1042	}
1043	}
1044
1045	gb_assert(countSons==tree.full->usedSons);
1046
1047	break;
1048	}
1049	}
1050	}
1051
1052	return tree;
1053	}
1054
1055	#else
1056	# define test_dtree(tree) // (tree)
1057	# define testCounts(tree) // 0
1058	#endif
1059
1060
1061	static DictTree new_dtree(cu_str text, long len, long *memcount) {
1062	// creates a new (sub-)tree from 'text' (which has length 'len')
1063	DictTree tree;
1064
1065	if (len) {
1066	SingleDictTree *tail = NULL;
1067	SingleDictTree *head = NULL;
1068
1069	while (len) {
1070	if (tail) tail = tail->son.single = (SingleDictTree)gbm_get_mem(sizeof(tail), GBM_DICT_INDEX);
1071	else tail = head = (SingleDictTree)gbm_get_mem(sizeof(tail), GBM_DICT_INDEX);
1072
1073	(memcount) += sizeof(tail);
1074
1075	tail->typ = SINGLE_NODE;
1076	tail->ch = *text++;
1077	len--;
1078
1079	tail->brother.single = NULL;
1080	tail->son.single = NULL;
1081	}
1082
1083	tail->count = 1;
1084	tree.single = head;
1085	}
1086	else {
1087	tree.single = NULL;
1088	}
1089
1090	return tree;
1091	}
1092
1093	static DictTree single2full_dtree(DictTree tree, long *memcount) {
1094	if (tree.exists && tree.single->typ==SINGLE_NODE) {
1095	FullDictTree full = (FullDictTree)gbm_get_mem(sizeof(*full), GBM_DICT_INDEX);
1096	int idx;
1097
1098	(memcount) += sizeof(full);
1099	full->typ = FULL_NODE;
1100	full->usedSons = 0;
1101
1102	for (idx=0; idx<256; idx++) {
1103	full->son[idx].exists = NULL;
1104	full->count[idx] = 0;
1105	}
1106
1107	while (tree.exists) {
1108	SingleDictTree *t = tree.single;
1109
1110	gb_assert(t->typ==SINGLE_NODE);
1111	gb_assert(full->son[t->ch].exists==NULL);
1112
1113	full->son[t->ch] = t->son;
1114	full->count[t->ch] = t->count;
1115	full->usedSons++;
1116
1117	tree.single = t->brother.single;
1118
1119	gbm_free_mem(t, sizeof(*t), GBM_DICT_INDEX);
1120	(memcount) -= sizeof(t);
1121	}
1122
1123	tree.full = full;
1124	}
1125
1126	return tree;
1127	}
1128
1129	static void free_dtree(DictTree tree)
1130	{
1131	if (tree.exists) {
1132	switch (tree.full->typ) {
1133	case SINGLE_NODE: {
1134	if (tree.single->son.exists) free_dtree(tree.single->son);
1135	if (tree.single->brother.exists) free_dtree(tree.single->brother);
1136
1137	gbm_free_mem(tree.single, sizeof(*(tree.single)), GBM_DICT_INDEX);
1138	break;
1139	}
1140	case FULL_NODE: {
1141	int idx;
1142
1143	for (idx=0; idx<256; idx++) if (tree.full->son[idx].exists) free_dtree(tree.full->son[idx]);
1144	gbm_free_mem(tree.full, sizeof(*(tree.full)), GBM_DICT_INDEX);
1145	break;
1146	}
1147	}
1148	}
1149	}
1150
1151
1152
1153	static DictTree cut_dtree(DictTree tree, int cut_count, long memcount, long leafcount)
1154	/* removes all branches from 'tree' which are referenced less/equal than cut_count
1155	* returns: the reduced tree */
1156	{
1157	if (tree.exists) {
1158	switch (tree.full->typ) {
1159	case SINGLE_NODE: {
1160	if (tree.single->son.exists) tree.single->son = cut_dtree(tree.single->son, cut_count, memcount, leafcount);
1161
1162	if (!tree.single->son.exists) { // leaf
1163	if (tree.single->count<=cut_count) { // leaf with less/equal references
1164	DictTree brother = tree.single->brother;
1165
1166	gbm_free_mem(tree.single, sizeof(*tree.single), GBM_DICT_INDEX);
1167	(memcount) -= sizeof(tree.single);
1168	if (brother.exists) return cut_dtree(brother, cut_count, memcount, leafcount);
1169
1170	tree.single = NULL;
1171	break;
1172	}
1173	else {
1174	(*leafcount)++;
1175	}
1176	}
1177
1178	if (tree.single->brother.exists) tree.single->brother = cut_dtree(tree.single->brother, cut_count, memcount, leafcount);
1179	break;
1180	}
1181	case FULL_NODE: {
1182	int idx;
1183	int count = 0;
1184
1185	for (idx=0; idx<256; idx++) {
1186	if (tree.full->son[idx].exists) {
1187	tree.full->son[idx] = cut_dtree(tree.full->son[idx], cut_count, memcount, leafcount);
1188
1189	if (tree.full->son[idx].exists) count++;
1190	else tree.full->count[idx] = 0;
1191	}
1192	else if (tree.full->count[idx]>0) {
1193	if (tree.full->count[idx]<=cut_count) {
1194	tree.full->count[idx] = 0;
1195	}
1196	else {
1197	count++;
1198	(*leafcount)++;
1199	}
1200	}
1201	}
1202
1203	tree.full->usedSons = count;
1204
1205	if (!count) { // no more sons
1206	gbm_free_mem(tree.full, sizeof(*(tree.full)), GBM_DICT_INDEX);
1207	(memcount) -= sizeof((tree.full));
1208	tree.exists = NULL;
1209	}
1210
1211	break;
1212	}
1213	}
1214	}
1215
1216	return tree;
1217	}
1218	static DictTree cut_useless_words(DictTree tree, int deep, long *removed)
1219	/* removes/shortens all branches of 'tree' which are not useful for compression
1220	* 'deep' should be zero (incremented by cut_useless_words)
1221	* 'removed' will be set to the # of removed occurrences
1222	* returns: the reduced tree
1223	*/
1224	{
1225	*removed = 0;
1226
1227	if (tree.exists) {
1228	deep++;
1229
1230	switch (tree.full->typ) {
1231	long removed_single;
1232
1233	case SINGLE_NODE: {
1234	if (tree.single->son.exists) {
1235	tree.single->son = cut_useless_words(tree.single->son, deep, &removed_single);
1236	tree.single->count -= removed_single;
1237	*removed += removed_single;
1238	}
1239
1240	if (!tree.single->son.exists && !WORD_HELPFUL(deep, tree.single->count)) {
1241	DictTree brother = tree.single->brother;
1242
1243	*removed += tree.single->count;
1244	gbm_free_mem(tree.single, sizeof(*tree.single), GBM_DICT_INDEX);
1245
1246	if (brother.exists) {
1247	tree = cut_useless_words(brother, deep-1, &removed_single);
1248	*removed += removed_single;
1249	}
1250	else {
1251	tree.exists = NULL;
1252	}
1253
1254	break;
1255	}
1256
1257	if (tree.single->brother.exists) {
1258	tree.single->brother = cut_useless_words(tree.single->brother, deep-1, &removed_single);
1259	*removed += removed_single;
1260	}
1261
1262	break;
1263	}
1264	case FULL_NODE: {
1265	int idx;
1266	int count = 0;
1267
1268	for (idx=0; idx<256; idx++) {
1269	if (tree.full->son[idx].exists) {
1270	tree.full->son[idx] = cut_useless_words(tree.full->son[idx], deep, &removed_single);
1271	tree.full->count[idx] -= removed_single;
1272	*removed += removed_single;
1273	}
1274
1275	if (tree.full->son[idx].exists) {
1276	count++;
1277	}
1278	else if (tree.full->count[idx]) {
1279	if (!WORD_HELPFUL(deep, tree.full->count[idx])) { // useless!
1280	*removed += tree.full->count[idx];
1281	tree.full->count[idx] = 0;
1282	}
1283	else {
1284	count++;
1285	}
1286	}
1287	}
1288
1289	tree.full->usedSons = count;
1290
1291	if (!count) { // no more sons
1292	gbm_free_mem(tree.full, sizeof(*(tree.full)), GBM_DICT_INDEX);
1293	tree.exists = NULL;
1294	}
1295
1296	break;
1297	}
1298	}
1299	}
1300
1301	return tree;
1302	}
1303
1304	static DictTree add_dtree_to_dtree(DictTree toAdd, DictTree to, long *memcount)
1305	/* adds 'toAdd' as brother of 'to' (must be leftmost of all SINGLE_NODEs or a FULL_NODE)
1306	* returns: the leftmost of all SINGLE_NODEs or a FULL_NODE
1307	*/
1308	{
1309	DictTree tree = toAdd;
1310
1311	gb_assert(toAdd.single->typ==SINGLE_NODE);
1312
1313	if (to.exists) {
1314	switch (to.full->typ) {
1315	case SINGLE_NODE: {
1316	SingleDictTree *left = to.single;
1317
1318	gb_assert(left!=0);
1319
1320	if (toAdd.single->ch < to.single->ch) {
1321	toAdd.single->brother = to;
1322	return toAdd;
1323	}
1324
1325	while (to.single->brother.exists) {
1326	if (toAdd.single->ch < to.single->brother.single->ch) {
1327	toAdd.single->brother = to.single->brother;
1328	to.single->brother = toAdd;
1329
1330	tree.single = left;
1331	return tree;
1332	}
1333	to = to.single->brother;
1334	}
1335
1336	to.single->brother = toAdd;
1337	tree.single = left;
1338	break;
1339	}
1340	case FULL_NODE: {
1341	unsigned_char ch = toAdd.single->ch;
1342
1343	gb_assert(to.full->son[ch].exists==NULL);
1344	gb_assert(to.full->count[ch]==0); // if this fails, count must be added & tested
1345	gb_assert(toAdd.single->brother.exists==NULL);
1346
1347	to.full->son[ch] = toAdd.single->son;
1348	to.full->count[ch] = toAdd.single->count;
1349	to.full->usedSons++;
1350
1351	tree = to;
1352
1353	gbm_free_mem(toAdd.single, sizeof(*(toAdd.single)), GBM_DICT_INDEX);
1354	(*memcount) -= sizeof(toAdd.single);
1355
1356	break;
1357	}
1358	}
1359	}
1360
1361	return tree;
1362	}
1363
1364	static DictTree add_to_dtree(DictTree tree, cu_str text, long len, long *memcount)
1365	/* adds the string 'text' (which has length 'len') to 'tree'
1366	* returns: new tree
1367	*/
1368	{
1369	if (tree.exists) {
1370	switch (tree.full->typ) {
1371	case SINGLE_NODE: {
1372	SingleDictTree *t = tree.single;
1373	int count = 0;
1374
1375	do {
1376	count++;
1377	if (t->ch==text[0]) { // we found an existing subtree
1378	if (len>1) {
1379	t->son = add_to_dtree(t->son, text+1, len-1, memcount); // add rest of text to subtree
1380	}
1381	else {
1382	gb_assert(len==1);
1383	gb_assert(t->son.exists==NULL);
1384	t->count++;
1385	}
1386
1387	return count>MAX_BROTHERS ? single2full_dtree(tree, memcount) : tree;
1388	}
1389	else if (t->ch > text[0]) {
1390	break;
1391	}
1392	}
1393	while ((t=t->brother.single)!=NULL);
1394
1395	tree = add_dtree_to_dtree(new_dtree(text, len, memcount), // otherwise we create a new subtree
1396	count>MAX_BROTHERS ? single2full_dtree(tree, memcount) : tree,
1397	memcount);
1398	break;
1399	}
1400	case FULL_NODE: {
1401	unsigned_char ch = text[0];
1402
1403	if (tree.full->son[ch].exists) {
1404	tree.full->son[ch] = add_to_dtree(tree.full->son[ch], text+1, len-1, memcount);
1405	}
1406	else {
1407	tree.full->son[ch] = new_dtree(text+1, len-1, memcount);
1408	if (!tree.full->son[ch].exists) {
1409	if (tree.full->count[ch]==0) tree.full->usedSons++;
1410	tree.full->count[ch]++;
1411	}
1412	else {
1413	tree.full->usedSons++;
1414	}
1415	}
1416	break;
1417	}
1418	}
1419
1420	return tree;
1421	}
1422
1423	return new_dtree(text, len, memcount);
1424	}
1425
1426	static long calcCounts(DictTree tree)
1427	{
1428	long cnt = 0;
1429
1430	gb_assert(tree.exists!=0);
1431
1432	switch (tree.full->typ) {
1433	case SINGLE_NODE: {
1434	while (tree.exists) {
1435	if (tree.single->son.exists) tree.single->count = calcCounts(tree.single->son);
1436	gb_assert(tree.single->count>0);
1437	cnt += tree.single->count;
1438	tree = tree.single->brother;
1439	}
1440	break;
1441	}
1442	case FULL_NODE: {
1443	int idx;
1444
1445	for (idx=0; idx<256; idx++) {
1446	if (tree.full->son[idx].exists) {
1447	tree.full->count[idx] = calcCounts(tree.full->son[idx]);
1448	gb_assert(tree.full->count[idx]>0);
1449	}
1450	else {
1451	gb_assert(tree.full->count[idx]>=0);
1452	}
1453	cnt += tree.full->count[idx];
1454	}
1455	break;
1456	}
1457	}
1458
1459	return cnt;
1460	}
1461
1462	static int count_dtree_leafs(DictTree tree, int deep, int *maxdeep) {
1463	// returns # of leafs and max. depth of tree
1464	int leafs = 0;
1465
1466	gb_assert(tree.exists!=0);
1467
1468	if (++deep>maxdeep) maxdeep = deep;
1469
1470	switch (tree.full->typ) {
1471	case SINGLE_NODE: {
1472	if (tree.single->son.exists) leafs += count_dtree_leafs(tree.single->son, deep, maxdeep);
1473	else leafs++;
1474	if (tree.single->brother.exists) leafs += count_dtree_leafs(tree.single->brother, deep, maxdeep);
1475	break;
1476	}
1477	case FULL_NODE: {
1478	int idx;
1479
1480	for (idx=0; idx<256; idx++) {
1481	if (tree.full->son[idx].exists) leafs += count_dtree_leafs(tree.full->son[idx], deep, maxdeep);
1482	else if (tree.full->count[idx]) leafs++;
1483	}
1484	break;
1485	}
1486	}
1487
1488	return leafs;
1489	}
1490
1491	static int COUNT(DictTree tree) {
1492	// counts sum of # of occurrences of tree
1493	int cnt = 0;
1494
1495	switch (tree.single->typ) {
1496	case SINGLE_NODE: {
1497	while (tree.exists) {
1498	cnt += tree.single->count;
1499	tree = tree.single->brother;
1500	}
1501	break;
1502	}
1503	case FULL_NODE: {
1504	int idx;
1505
1506	for (idx=0; idx<256; idx++) cnt += tree.full->count[idx];
1507	break;
1508	}
1509	}
1510
1511	return cnt;
1512	}
1513
1514	static DictTree removeSubsequentString(DictTree *tree_pntr, cu_str buffer, int len, int max_occur) {
1515	/* searches tree for 'buffer' (length='len')
1516	*
1517	* returns - rest below found string
1518	* (if found and if the # of occurrences of the string is less/equal than 'max_occur')
1519	* - NULL otherwise
1520	*
1521	* removes the whole found string from the tree (not only the rest!)
1522	*/
1523	DictTree tree = *tree_pntr, rest;
1524	static int restCount;
1525
1526	rest.exists = NULL;
1527
1528	gb_assert(tree.exists!=0);
1529	gb_assert(len>0);
1530
1531	switch (tree.full->typ) {
1532	case SINGLE_NODE: {
1533	while (tree.single->ch <= buffer[0]) {
1534	if (tree.single->ch == buffer[0]) { // found wanted character
1535	if (tree.single->son.exists) {
1536	if (len==1) {
1537	if (tree.single->count <= max_occur) {
1538	rest = tree.single->son;
1539	restCount = COUNT(rest);
1540	tree.single->son.exists = NULL;
1541	}
1542	}
1543	else {
1544	rest = removeSubsequentString(&tree.single->son, buffer+1, len-1, max_occur);
1545	}
1546	}
1547
1548	if (rest.exists) { // the string was found
1549	tree.single->count -= restCount;
1550	gb_assert(tree.single->count >= 0);
1551
1552	if (!tree.single->count) { // empty subtree -> delete myself
1553	DictTree brother = tree.single->brother;
1554
1555	tree.single->brother.exists = NULL; // elsewise it would be freed by free_dtree
1556	free_dtree(tree);
1557	*tree_pntr = tree = brother;
1558	}
1559	}
1560
1561	break;
1562	}
1563
1564	tree_pntr = &(tree.single->brother);
1565	if (!(tree = tree.single->brother).exists) break;
1566	}
1567
1568	break;
1569	}
1570	case FULL_NODE: {
1571	unsigned_char ch;
1572
1573	if (tree.full->son[ch=buffer[0]].exists) {
1574	if (len==1) {
1575	if (tree.full->count[ch] <= max_occur) {
1576	rest = tree.full->son[ch];
1577	restCount = COUNT(rest);
1578	tree.full->son[ch].exists = NULL;
1579	}
1580	}
1581	else {
1582	rest = removeSubsequentString(&tree.full->son[ch], buffer+1, len-1, max_occur);
1583	}
1584
1585	if (rest.exists) {
1586	gb_assert(restCount>0);
1587	tree.full->count[ch] -= restCount;
1588	gb_assert(tree.full->count[ch]>=0);
1589	if (tree.full->count[ch]==0) {
1590	gb_assert(tree.full->son[ch].exists==NULL);
1591
1592	if (--tree.full->usedSons==0) { // last son deleted -> delete myself
1593	free_dtree(tree);
1594	tree.exists = NULL;
1595	*tree_pntr = tree;
1596	}
1597	}
1598	}
1599	}
1600
1601	break;
1602	}
1603	}
1604
1605	return rest;
1606	}
1607
1608	static cu_str memstr(cu_str stringStart, int stringStartLen, cu_str inString, int inStringLen) {
1609	if (!inStringLen) return stringStart; // string of length zero is found everywhere
1610
1611	while (stringStartLen) {
1612	cu_str found = (cu_str)memchr(stringStart, inString[0], stringStartLen);
1613
1614	if (!found) break;
1615
1616	stringStartLen -= found-stringStart;
1617	stringStart = found;
1618
1619	if (stringStartLen<inStringLen) break;
1620
1621	if (GB_MEMCMP(stringStart, inString, inStringLen)==0) return stringStart;
1622
1623	stringStart++;
1624	stringStartLen--;
1625	}
1626
1627	return NULL;
1628	}
1629
1630
1631	static int expandBranches(u_str buffer, int deep, int minwordlen, int maxdeep, DictTree tree, DictTree root, int max_percent) {
1632	/* expands all branches in 'tree'
1633	*
1634	* this is done by searching every of these branches in 'root' and moving any subsequent parts from there to 'tree'
1635	* (this is only done, if the # of occurrences of the found part does not exceed the # of occurrences of 'tree' more than 'max_percent' percent)
1636	*
1637	* 'buffer' strings are rebuild here while descending the tree (length of buffer==MAX_WORD_LEN)
1638	* 'deep' recursion level
1639	* 'maxdeep' maximum recursion level
1640	* 'minwordlen' is the length of the words to search (usually equal to MIN_WORD_LEN-1)
1641	*
1642	* returns the # of occurrences which were added to 'tree'
1643	*/
1644	int expand = 0; // calculate count-sum of added subsequent parts
1645
1646	gb_assert(tree.exists!=0);
1647
1648	if (deep<maxdeep) {
1649	switch (tree.full->typ) {
1650	case SINGLE_NODE: {
1651	while (tree.exists) {
1652	buffer[deep] = tree.single->ch;
1653
1654	if (!tree.single->son.exists) {
1655	DictTree rest;
1656	u_str buf = buffer+1;
1657	int len = deep;
1658
1659	if (len>minwordlen) { // do not search more than MIN_WORD_LEN-1 chars
1660	buf += len-minwordlen;
1661	len = minwordlen;
1662	}
1663
1664	if (len==minwordlen) {
1665	cu_str self = memstr(buffer, deep+1, buf, len);
1666
1667	gb_assert(self!=0);
1668	if (self==buf) rest = removeSubsequentString(&root, buf, len, ((100+max_percent)*tree.single->count)/100);
1669	else rest.exists = NULL;
1670	}
1671	else {
1672	rest.exists = NULL;
1673	}
1674
1675	if (rest.exists) {
1676	int cnt = COUNT(rest);
1677
1678	tree.single->son = rest;
1679	tree.single->count += cnt;
1680	expand += cnt;
1681	#ifdef DUMP_EXPAND
1682	#define DUMP_MORE 1
1683	printf("expanding '%s'", lstr(buffer, deep+1+DUMP_MORE));
1684	printf(" (searching for '%s') -> found %i nodes\n", lstr(buf, len+DUMP_MORE), cnt);
1685	#endif
1686	}
1687	}
1688
1689	if (tree.single->son.exists) {
1690	int added = expandBranches(buffer, deep+1, minwordlen, maxdeep, tree.single->son, root, max_percent);
1691
1692	expand += added;
1693	tree.single->count += added;
1694	}
1695
1696	tree = tree.single->brother;
1697	}
1698
1699	break;
1700	}
1701	case FULL_NODE: {
1702	int idx;
1703
1704	for (idx=0; idx<256; idx++) {
1705	buffer[deep] = idx;
1706
1707	if (!tree.full->son[idx].exists && tree.full->count[idx]) { // leaf
1708	DictTree rest;
1709	u_str buf = buffer+1;
1710	int len = deep;
1711
1712	if (len>minwordlen) { // do not search more than MIN_WORD_LEN-1 chars
1713	buf += len-minwordlen;
1714	len = minwordlen;
1715	}
1716
1717	if (len==minwordlen) {
1718	cu_str self = memstr(buffer, deep+1, buf, len);
1719
1720	gb_assert(self!=0);
1721	if (self==buf)
1722	rest = removeSubsequentString(&root, buf, len, ((100+max_percent)*tree.full->count[idx])/100);
1723	else
1724	rest.exists = NULL;
1725	}
1726	else {
1727	rest.exists = NULL;
1728	}
1729
1730	if (rest.exists) { // substring found!
1731	int cnt = COUNT(rest);
1732
1733	if (tree.full->count[idx]==0) tree.full->usedSons++;
1734	tree.full->son[idx] = rest;
1735	tree.full->count[idx] += cnt;
1736
1737	expand += cnt;
1738	#ifdef DUMP_EXPAND
1739	printf("expanding '%s'", lstr(buffer, deep+1+DUMP_MORE));
1740	printf(" (searching for '%s') -> found %i nodes\n", lstr(buf, len+DUMP_MORE), cnt);
1741	#endif
1742	}
1743	}
1744
1745	if (tree.full->son[idx].exists) {
1746	int added = expandBranches(buffer, deep+1, minwordlen, maxdeep, tree.full->son[idx], root, max_percent);
1747
1748	expand += added;
1749	tree.full->count[idx] += added;
1750	}
1751	}
1752
1753	break;
1754	}
1755	}
1756	}
1757
1758	return expand;
1759	}
1760
1761	static DictTree build_dict_tree(O_gbdByKey gbk, long maxmem, long maxdeep, size_t minwordlen, long data_sum)
1762	/* builds a tree of the most used words
1763	*
1764	* 'maxmem' is the amount of memory that will be allocated
1765	* 'maxdeep' is the maximum length of the _returned_ words
1766	* 'minwordlen' is the minimum length a word needs to get into the tree
1767	* this is used in the first pass as maximum tree depth
1768	* 'data_sum' will be set to the overall-size of data of which the tree was built
1769	*/
1770	{
1771	DictTree tree;
1772	long memcount = 0;
1773	long leafs = 0;
1774
1775	*data_sum = 0;
1776
1777	{
1778	int cnt;
1779	long lowmem = (maxmem*9)/10;
1780	int cut_count = 1;
1781
1782	// Build 8-level-deep tree of all existing words
1783
1784	tree.exists = NULL; // empty tree
1785
1786	for (cnt=0; cnt<gbk->cnt; cnt++) {
1787	GBDATA *gbd = gbk->gbds[cnt];
1788
1789	if (COMPRESSIBLE(gbd->type())) {
1790	size_t size;
1791	cu_str data = get_data_n_size(gbd, &size);
1792	cu_str lastWord;
1793
1794	if (gbd->is_a_string()) size--;
1795	if (size<minwordlen) continue;
1796
1797	*data_sum += size;
1798	lastWord = data+size-minwordlen;
1799
1800	#ifdef SELECT_WORDS
1801	if (strnstr(data, size, SELECTED_WORDS)) // test some words only
1802	#endif
1803	{
1804
1805	for (; data<=lastWord; data++) {
1806	tree = add_to_dtree(tree, data, minwordlen, &memcount);
1807
1808	while (memcount>maxmem) {
1809	leafs = 0;
1810	tree = cut_dtree(tree, cut_count, &memcount, &leafs);
1811	if (memcount<=lowmem) break;
1812	cut_count++;
1813	}
1814	}
1815	}
1816	}
1817	}
1818	}
1819
1820	{
1821	int cutoff = 1;
1822
1823	leafs = 0;
1824	tree = cut_dtree(tree, cutoff, &memcount, &leafs); // cut all single elements
1825	test_dtree(tree);
1826
1827	#if defined(DEBUG)
1828	if (tree.exists) {
1829	int maxdeep2 = 0;
1830	long counted = count_dtree_leafs(tree, 0, &maxdeep2);
1831	gb_assert(leafs == counted);
1832	}
1833	#endif // DEBUG
1834
1835	// avoid directory overflow (max. 18bit)
1836	while (leafs >= MAX_LONG_INDEX) {
1837	leafs = 0;
1838	++cutoff;
1839	#if defined(DEBUG)
1840	printf("Directory overflow (%li) -- reducing size (cutoff = %i)\n", leafs, cutoff);
1841	#endif // DEBUG
1842	tree = cut_dtree(tree, cutoff, &memcount, &leafs);
1843	}
1844	}
1845	#ifdef DUMP_TREE
1846	printf("----------------------- tree with short branches:\n");
1847	dump_dtree(0, tree);
1848	printf("---------------------------\n");
1849	#endif
1850
1851	// Try to create longer branches
1852
1853	if (tree.exists) {
1854	int add_count;
1855	u_str buffer = (u_str)gbm_get_mem(maxdeep, GBM_DICT_INDEX);
1856	int max_differ;
1857	long dummy;
1858
1859	if (tree.full->typ != FULL_NODE) tree = single2full_dtree(tree, &memcount); // ensure root is FULL_NODE
1860
1861	test_dtree(tree);
1862	calcCounts(tree); // calculate counters of inner nodes
1863	testCounts(tree);
1864
1865	for (max_differ=0; max_differ<=MAX_DIFFER; max_differ+=INCR_DIFFER) { // percent of allowed difference for concatenating tree branches
1866	do {
1867	int idx;
1868	add_count = 0;
1869
1870	for (idx=0; idx<256; idx++) {
1871	if (tree.full->son[idx].exists) {
1872	int added;
1873
1874	buffer[0] = idx;
1875	added = expandBranches(buffer, 1, minwordlen-1, maxdeep, tree.full->son[idx], tree, max_differ);
1876	tree.full->count[idx] += added;
1877	add_count += added;
1878	}
1879	}
1880	}
1881	while (add_count);
1882	}
1883
1884	gbm_free_mem(buffer, maxdeep, GBM_DICT_INDEX);
1885
1886	tree = cut_useless_words(tree, 0, &dummy);
1887	}
1888
1889	#ifdef DUMP_TREE
1890	printf("----------------------- tree with expanded branches:\n");
1891	dump_dtree(0, tree);
1892	printf("-----------------------\n");
1893	#endif
1894	testCounts(tree);
1895
1896	return tree;
1897	}
1898
1899	static DictTree remove_word_from_dtree(DictTree tree, cu_str wordStart, int wordLen, u_str resultBuffer, int resultLen, long resultFrequency, long *removed) {
1900	/* searches 'tree' for a word starting with 'wordStart' an removes it from the tree
1901	* if there are more than one possibilities, the returned word will be the one with the most occurrences
1902	* if there was no possibility -> resultLen==0, tree unchanged
1903	* otherwise: resultBuffer contains the word, returns new tree with word removed
1904	*/
1905	long removed_single = 0;
1906	gb_assert(tree.exists!=0);
1907	*removed = 0;
1908
1909	if (wordLen) { // search wanted path into tree
1910	switch (tree.full->typ) {
1911	case SINGLE_NODE: {
1912	if (tree.single->ch==*wordStart) {
1913	resultBuffer = wordStart;
1914
1915	if (tree.single->son.exists) {
1916	gb_assert(tree.single->count>0);
1917	tree.single->son = remove_word_from_dtree(tree.single->son, wordStart+1, wordLen-1,
1918	resultBuffer+1, resultLen, resultFrequency,
1919	&removed_single);
1920	if (*resultLen) { // word removed
1921	gb_assert(tree.single->count>=removed_single);
1922	tree.single->count -= removed_single;
1923	*removed += removed_single;
1924	(*resultLen)++;
1925	}
1926	}
1927	else {
1928	*resultLen = wordLen==1; // if wordLen==1 -> fully overlapping word found
1929	*resultFrequency = tree.single->count;
1930	}
1931
1932	if (!tree.single->son.exists && *resultLen) { // if no son and a word was found -> remove branch
1933	DictTree brother = tree.single->brother;
1934
1935	*removed += tree.single->count;
1936	gbm_free_mem(tree.single, sizeof(*tree.single), GBM_DICT_INDEX);
1937
1938	if (brother.exists) tree = brother;
1939	else tree.exists = NULL;
1940	}
1941	}
1942	else if (tree.single->ch < *wordStart && tree.single->brother.exists) {
1943	tree.single->brother = remove_word_from_dtree(tree.single->brother, wordStart, wordLen,
1944	resultBuffer, resultLen, resultFrequency,
1945	&removed_single);
1946	if (resultLen) removed += removed_single;
1947	}
1948	else {
1949	*resultLen = 0; // not found
1950	}
1951
1952	break;
1953	}
1954	case FULL_NODE: {
1955	unsigned_char ch = *wordStart;
1956	*resultBuffer = ch;
1957
1958	if (tree.full->son[ch].exists) {
1959	tree.full->son[ch] = remove_word_from_dtree(tree.full->son[ch], wordStart+1, wordLen-1,
1960	resultBuffer+1, resultLen, resultFrequency,
1961	&removed_single);
1962	if (*resultLen) {
1963	if (tree.full->son[ch].exists) { // another son?
1964	tree.full->count[ch] -= removed_single;
1965	}
1966	else { // last son -> remove whole branch
1967	removed_single = tree.full->count[ch];
1968	tree.full->count[ch] = 0;
1969	tree.full->usedSons--;
1970	}
1971
1972	*removed += removed_single;
1973	(*resultLen)++;
1974	}
1975	}
1976	else if (tree.full->count[ch]) {
1977	*resultLen = (wordLen==1);
1978
1979	if (*resultLen) {
1980	removed += removed_single = resultFrequency = tree.full->count[ch];
1981	tree.full->count[ch] = 0;
1982	tree.full->usedSons--;
1983	}
1984	}
1985	else {
1986	*resultLen = 0; // not found
1987	}
1988
1989	if (!tree.full->usedSons) {
1990	free_dtree(tree);
1991	tree.exists = NULL;
1992	}
1993
1994	break;
1995	}
1996	}
1997	}
1998	else { // take any word
1999	switch (tree.full->typ) {
2000	case SINGLE_NODE: {
2001	*resultBuffer = tree.single->ch;
2002	gb_assert(tree.single->count>0);
2003
2004	if (tree.single->son.exists) {
2005	tree.single->son = remove_word_from_dtree(tree.single->son, wordStart, wordLen,
2006	resultBuffer+1, resultLen, resultFrequency,
2007	&removed_single);
2008	gb_assert(*resultLen);
2009	(*resultLen)++;
2010	}
2011	else {
2012	*resultLen = 1;
2013	*resultFrequency = tree.single->count;
2014	removed_single = tree.single->count;
2015	}
2016
2017	gb_assert(*resultFrequency>0);
2018
2019	if (tree.single->son.exists) {
2020	gb_assert(tree.single->count>=removed_single);
2021	tree.single->count -= removed_single;
2022	*removed += removed_single;
2023	}
2024	else {
2025	DictTree brother = tree.single->brother;
2026
2027	*removed += tree.single->count;
2028	gbm_free_mem(tree.single, sizeof(*tree.single), GBM_DICT_INDEX);
2029
2030	if (brother.exists) tree = brother;
2031	else tree.exists = NULL;
2032	}
2033
2034	break;
2035	}
2036	case FULL_NODE: {
2037	int idx;
2038
2039	for (idx=0; idx<256; idx++) {
2040	if (tree.full->son[idx].exists) {
2041	*resultBuffer = idx;
2042	tree.full->son[idx] = remove_word_from_dtree(tree.full->son[idx], wordStart, wordLen,
2043	resultBuffer+1, resultLen, resultFrequency,
2044	&removed_single);
2045	gb_assert(*resultLen);
2046	(*resultLen)++;
2047
2048	if (!tree.full->son[idx].exists) { // son branch removed -> zero count
2049	removed_single = tree.full->count[idx];
2050	tree.full->count[idx] = 0;
2051	tree.full->usedSons--;
2052	}
2053	else {
2054	tree.full->count[idx] -= removed_single;
2055	gb_assert(tree.full->count[idx]>0);
2056	}
2057
2058	break;
2059	}
2060	else if (tree.full->count[idx]) {
2061	*resultBuffer = idx;
2062	*resultLen = 1;
2063	*resultFrequency = tree.full->count[idx];
2064	removed_single = tree.full->count[idx];
2065	tree.full->count[idx] = 0;
2066	tree.full->usedSons--;
2067	break;
2068	}
2069	}
2070
2071	gb_assert(idx<256); // gb_assert break was used to exit loop (== node had a son)
2072
2073	*removed += removed_single;
2074
2075	if (!tree.full->usedSons) {
2076	free_dtree(tree);
2077	tree.exists = NULL;
2078	}
2079
2080	break;
2081	}
2082	}
2083	}
2084
2085	#ifdef DEBUG
2086	if (*resultLen) {
2087	gb_assert(*resultLen>0);
2088	gb_assert(*resultFrequency>0);
2089	gb_assert(*resultLen>=wordLen);
2090	}
2091	#endif
2092
2093	return tree;
2094	}
2095
2096	#define cmp(i1, i2) (heap2[i1]-heap2[i2])
2097	#define swap(i1, i2) do \
2098	{ \
2099	int s = heap[i1]; \
2100	heap[i1] = heap[i2]; \
2101	heap[i2] = s; \
2102	\
2103	s = heap2[i1]; \
2104	heap2[i1] = heap2[i2]; \
2105	heap2[i2] = s; \
2106	} \
2107	while (0)
2108
2109	static void downheap(int heap, int heap2, int me, int num) {
2110	int lson = me*2;
2111	int rson = lson+1;
2112
2113	gb_assert(me>=1);
2114	if (lson>num) return;
2115
2116	if (cmp(lson, me)<0) { // left son smaller than me? (we sort in descending order!!!)
2117	if (rson<=num && cmp(lson, rson)>0) { // right son smaller than left son?
2118	swap(me, rson);
2119	downheap(heap, heap2, rson, num);
2120	}
2121	else {
2122	swap(me, lson);
2123	downheap(heap, heap2, lson, num);
2124	}
2125	}
2126	else if (rson<=num && cmp(me, rson)>0) { // right son smaller than me?
2127	swap(me, rson);
2128	downheap(heap, heap2, rson, num);
2129	}
2130	}
2131
2132	#undef cmp
2133	#undef swap
2134
2135
2136
2137	#define cmp(i1, i2) GB_MEMCMP(dict->text+dict->offsets[heap[i1]], dict->text+dict->offsets[heap[i2]], dict->textlen)
2138	#define swap(i1, i2) do { int s = heap[i1]; heap[i1] = heap[i2]; heap[i2] = s; } while (0)
2139
2140	static void downheap2(int heap, GB_DICTIONARY dict, int me, int num) {
2141	int lson = me*2;
2142	int rson = lson+1;
2143
2144	gb_assert(me>=1);
2145	if (lson>num) return;
2146
2147	if (cmp(lson, me)>0) { // left son bigger than me?
2148	if (rson<=num && cmp(lson, rson)<0) { // right son bigger than left son?
2149	swap(me, rson);
2150	downheap2(heap, dict, rson, num);
2151	}
2152	else {
2153	swap(me, lson);
2154	downheap2(heap, dict, lson, num);
2155	}
2156	}
2157	else if (rson<=num && cmp(me, rson)<0) { // right son bigger than me?
2158	swap(me, rson);
2159	downheap2(heap, dict, rson, num);
2160	}
2161	}
2162
2163	#undef cmp
2164	#undef swap
2165
2166	static void sort_dict_offsets(GB_DICTIONARY *dict) {
2167	/* 1. sorts the 'dict->offsets' by frequency
2168	* (frequency of each offset is stored in the 'dict->resort' with the same index)
2169	* 2. initializes & sorts 'dict->resort' in alphabetic order
2170	*/
2171	int i;
2172	int num = dict->words;
2173	int *heap = dict->offsets-1;
2174	int *heap2 = dict->resort-1;
2175
2176	// sort offsets
2177
2178	for (i=num/2; i>=1; i--) downheap(heap, heap2, i, num); // make heap
2179
2180	while (num>1) { // sort heap
2181	int big = heap[1];
2182	int big2 = heap2[1];
2183
2184	heap[1] = heap[num];
2185	heap2[1] = heap2[num];
2186
2187	downheap(heap, heap2, 1, num-1);
2188
2189	heap[num] = big;
2190	heap2[num] = big2;
2191
2192	num--;
2193	}
2194
2195	// initialize dict->resort
2196
2197	for (i=0, num=dict->words; i<num; i++) dict->resort[i] = i;
2198
2199	// sort dictionary alphabetically
2200
2201	for (i=num/2; i>=1; i--) downheap2(heap2, dict, i, num); // make heap
2202
2203	while (num>1) {
2204	int big = heap2[1];
2205
2206	heap2[1] = heap2[num];
2207	downheap2(heap2, dict, 1, num-1);
2208	heap2[num] = big;
2209	num--;
2210	}
2211	}
2212
2213	// Warning dictionary is not in network byte order !!!!
2214	static GB_DICTIONARY gb_create_dictionary(O_gbdByKey gbk, long maxmem) {
2215	long data_sum;
2216	DictTree tree = build_dict_tree(gbk, maxmem, MAX_WORD_LEN, MIN_WORD_LEN, &data_sum);
2217
2218	if (tree.exists) {
2219	GB_DICTIONARY dict = (GB_DICTIONARY)gbm_get_mem(sizeof(*dict), GBM_DICT_INDEX);
2220	int maxdeep = 0;
2221	int words = count_dtree_leafs(tree, 0, &maxdeep);
2222	u_str word;
2223
2224	int wordLen;
2225	long wordFrequency;
2226	int offset = 0; // next free position in dict->text
2227	int overlap = 0; // # of bytes overlapping with last word
2228	u_str buffer;
2229	long dummy;
2230	#if defined(DEBUG)
2231	long word_sum = 0;
2232	long overlap_sum = 0;
2233	long max_overlap = 0;
2234	#endif
2235
2236	// reduce tree as long as it has to many leafs (>MAX_LONG_INDEX)
2237	while (words >= MAX_LONG_INDEX) {
2238
2239	words = count_dtree_leafs(tree, 0, &maxdeep);
2240	}
2241
2242	buffer = (u_str)gbm_get_mem(maxdeep, GBM_DICT_INDEX);
2243
2244	calcCounts(tree);
2245	testCounts(tree);
2246
2247	#if DEBUG
2248	printf(" examined data was %li bytes\n", data_sum);
2249	printf(" tree contains %i words *** maximum tree depth = %i\n", words, maxdeep);
2250	#endif
2251
2252	dict->words = 0;
2253	dict->textlen = DICT_STRING_INCR;
2254
2255	dict->text = (u_str)gbm_get_mem(DICT_STRING_INCR, GBM_DICT_INDEX);
2256	dict->offsets = (GB_NINT)gbm_get_mem(sizeof((dict->offsets))*words, GBM_DICT_INDEX);
2257	dict->resort = (GB_NINT)gbm_get_mem(sizeof((dict->resort))*words, GBM_DICT_INDEX);
2258
2259	memset(buffer, '*', maxdeep);
2260	tree = remove_word_from_dtree(tree, NULL, 0, buffer, &wordLen, &wordFrequency, &dummy);
2261	testCounts(tree);
2262
2263	while (1) {
2264	int nextWordLen = 0;
2265	int len;
2266
2267	#if DUMP_COMPRESSION_TEST>=4
2268	printf("word='%s' (occur=%li overlap=%i)\n", lstr(buffer, wordLen), wordFrequency, overlap);
2269	#endif
2270
2271	#if defined(DEBUG)
2272	overlap_sum += overlap;
2273	if (overlap>max_overlap) max_overlap = overlap;
2274	word_sum += wordLen;
2275	#endif
2276
2277	if (offset-overlap+wordLen > dict->textlen) { // if not enough space allocated -> realloc dictionary string
2278	u_str ntext = (u_str)gbm_get_mem(dict->textlen+DICT_STRING_INCR, GBM_DICT_INDEX);
2279
2280	memcpy(ntext, dict->text, dict->textlen);
2281	gbm_free_mem(dict->text, dict->textlen, GBM_DICT_INDEX);
2282
2283	dict->text = ntext;
2284	dict->textlen += DICT_STRING_INCR;
2285	}
2286
2287	dict->offsets[dict->words] = offset-overlap;
2288	dict->resort[dict->words] = wordFrequency; // temporarily miss-use this to store frequency
2289	dict->words++;
2290
2291	word = dict->text+offset-overlap;
2292	gb_assert(overlap==0 \|\| GB_MEMCMP(word, buffer, overlap)==0); // test overlapping string-part
2293	memcpy(word, buffer, wordLen); // word -> dictionary string
2294	offset += wordLen-overlap;
2295
2296	if (!tree.exists) break;
2297
2298	for (len=min(10, wordLen-1); len>=0 && nextWordLen==0; len--) {
2299	memset(buffer, '*', maxdeep);
2300	tree = remove_word_from_dtree(tree, word+wordLen-len, len, buffer, &nextWordLen, &wordFrequency, &dummy);
2301	overlap = len;
2302	}
2303
2304	wordLen = nextWordLen;
2305	}
2306
2307	gb_assert(dict->words <= MAX_LONG_INDEX);
2308	gb_assert(dict->words==words); /* dict->words == # of words stored in dictionary string
2309	* words == # of words pre-calculated */
2310
2311	#if DEBUG
2312	printf(" word_sum=%li overlap_sum=%li (%li%%) max_overlap=%li\n",
2313	word_sum, overlap_sum, (overlap_sum*100)/word_sum, max_overlap);
2314	#endif
2315
2316	if (offset<dict->textlen) { // reallocate dict->text if it was allocated too large
2317	u_str ntext = (u_str)gbm_get_mem(offset, GBM_DICT_INDEX);
2318
2319	memcpy(ntext, dict->text, offset);
2320	gbm_free_mem(dict->text, dict->textlen, GBM_DICT_INDEX);
2321
2322	dict->text = ntext;
2323	dict->textlen = offset;
2324	}
2325
2326	sort_dict_offsets(dict);
2327
2328	gbm_free_mem(buffer, maxdeep, GBM_DICT_INDEX);
2329	free_dtree(tree);
2330
2331	return dict;
2332	}
2333
2334	return NULL;
2335	}
2336
2337	static void gb_free_dictionary(GB_DICTIONARY*& dict) {
2338	gbm_free_mem(dict->text, dict->textlen, GBM_DICT_INDEX);
2339	gbm_free_mem(dict->offsets, sizeof((dict->offsets))dict->words, GBM_DICT_INDEX);
2340	gbm_free_mem(dict->resort, sizeof((dict->resort))dict->words, GBM_DICT_INDEX);
2341
2342	gbm_free_mem(dict, sizeof(*dict), GBM_DICT_INDEX);
2343	dict = NULL;
2344	}
2345
2346	static GB_ERROR readAndWrite(O_gbdByKey *gbkp, size_t& old_size, size_t& new_size) {
2347	GB_ERROR error = 0;
2348
2349	old_size = 0;
2350	new_size = 0;
2351
2352	for (int i=0; i<gbkp->cnt && !error; i++) {
2353	GBDATA *gbd = gbkp->gbds[i];
2354
2355	if (COMPRESSIBLE(gbd->type())) {
2356	size_t size;
2357	char *data;
2358
2359	{
2360	cu_str d = (cu_str)get_data_n_size(gbd, &size);
2361	old_size += gbd->as_entry()->memsize();
2362
2363	data = (char*)gbm_get_mem(size, GBM_DICT_INDEX);
2364	memcpy(data, d, size);
2365	gb_assert(data[size-1] == 0);
2366	}
2367
2368	switch (gbd->type()) {
2369	case GB_STRING:
2370	error = GB_write_string(gbd, "");
2371	if (!error) error = GB_write_string(gbd, data);
2372	break;
2373	case GB_LINK:
2374	error = GB_write_link(gbd, "");
2375	if (!error) error = GB_write_link(gbd, data);
2376	break;
2377	case GB_BYTES:
2378	error = GB_write_bytes(gbd, 0, 0);
2379	if (!error) error = GB_write_bytes(gbd, data, size);
2380	break;
2381	case GB_INTS:
2382	error = GB_write_ints(gbd, (GB_UINT4 *)0, 0);
2383	if (!error) error = GB_write_ints(gbd, (GB_UINT4 *)data, size);
2384	break;
2385	case GB_FLOATS:
2386	error = GB_write_floats(gbd, (float*)0, 0);
2387	if (!error) error = GB_write_floats(gbd, (float)(void)data, size);
2388	break;
2389	default:
2390	gb_assert(0);
2391	break;
2392	}
2393
2394	new_size += gbd->as_entry()->memsize();
2395
2396	gbm_free_mem(data, size, GBM_DICT_INDEX);
2397	}
2398	}
2399	return error;
2400	}
2401
2402	static GB_ERROR gb_create_dictionaries(GB_MAIN_TYPE *Main, long maxmem) {
2403	GB_ERROR error = NULL;
2404	#if defined(TEST_DICT)
2405	long uncompressed_sum = 0;
2406	long compressed_sum = 0;
2407	#endif // TEST_DICT
2408
2409	printf("Creating GBDATA-Arrays..\n");
2410
2411	if (!error) {
2412	O_gbdByKey *gbk = g_b_opti_createGbdByKey(Main);
2413	int idx = -1;
2414
2415	printf("Creating dictionaries..\n");
2416
2417	#ifdef DEBUG
2418	// #define TEST_ONE // test only key specified below
2419	// #define TEST_SOME // test only some keys specified below
2420	#if defined(TEST_ONE)
2421	// select wanted index
2422	for (idx=0; idx<gbdByKey_cnt; idx++) { // title author dew_author ebi_journal name ua_tax date full_name ua_title
2423	if (gbk[idx].cnt && strcmp(Main->keys[idx].key, "tree")==0) break;
2424	}
2425	gb_assert(idx<gbdByKey_cnt);
2426	#endif
2427	#endif
2428
2429	#ifdef TEST_ONE
2430	// only create dictionary for index selected above (no loop)
2431	#else
2432	// create dictionaries for all indices (this is the normal operation)
2433	arb_progress progress("Optimizing key data", gbdByKey_cnt-1);
2434	for (idx = gbdByKey_cnt-1; idx >= 1 && !error; --idx, progress.inc_and_check_user_abort(error))
2435	#endif
2436
2437	{
2438	GB_DICTIONARY *dict;
2439	GB_CSTR key_name = Main->keys[idx].key;
2440	GBDATA *gb_main = Main->gb_main();
2441
2442	#ifdef TEST_SOME
2443	if (!( // add all wanted keys here
2444	strcmp(key_name, "REF") == 0 \|\|
2445	strcmp(key_name, "ref") == 0
2446	)) continue;
2447	#endif // TEST_SOME
2448
2449	#ifndef TEST_ONE
2450	if (!gbk[idx].cnt) continue; // there are no entries with this quark
2451
2452	GB_TYPES type = gbk[idx].gbds[0]->type();
2453
2454	GB_begin_transaction(gb_main);
2455	int compression_mask = gb_get_compression_mask(Main, idx, type);
2456	GB_commit_transaction(gb_main);
2457
2458	if ((compression_mask & GB_COMPRESSION_DICTIONARY) == 0) continue; // compression with dictionary is not allowed
2459	if (strcmp(key_name, "data") == 0) continue;
2460	if (strcmp(key_name, "quality") == 0) continue;
2461	#endif
2462
2463	printf("- dictionary for '%s' (idx=%i)\n", key_name, idx);
2464	GB_begin_transaction(gb_main);
2465	dict = gb_create_dictionary(&(gbk[idx]), maxmem);
2466
2467	if (dict) {
2468	/* decompress with old dictionary and write
2469	all data of actual type without compression: */
2470
2471	printf(" * Uncompressing all with old dictionary ...\n");
2472
2473	size_t old_compressed_size;
2474
2475	{
2476	int& compr_mask = Main->keys[idx].compression_mask;
2477	int old_compr_mask = compr_mask;
2478
2479	size_t new_size;
2480
2481	// UNCOVERED();
2482
2483	compr_mask &= ~GB_COMPRESSION_DICTIONARY;
2484	error = readAndWrite(&gbk[idx], old_compressed_size, new_size);
2485	compr_mask = old_compr_mask;
2486	}
2487
2488	if (!error) {
2489	/* dictionary is saved in the following format:
2490	*
2491	* GB_NINT size
2492	* GB_NINT offsets[dict->words]
2493	* GB_NINT resort[dict->words]
2494	* char *text
2495	*/
2496
2497	int dict_buffer_size = sizeof(GB_NINT) * (1+dict->words*2) + dict->textlen;
2498	char dict_buffer = (char)gbm_get_mem(dict_buffer_size, GBM_DICT_INDEX);
2499	long old_dict_buffer_size;
2500	char *old_dict_buffer;
2501
2502	{
2503	GB_NINT nint = (GB_NINT)dict_buffer;
2504	int n;
2505
2506	*nint++ = htonl(dict->words);
2507	for (n=0; n<dict->words; n++) *nint++ = htonl(dict->offsets[n]);
2508	for (n=0; n<dict->words; n++) *nint++ = htonl(dict->resort[n]);
2509
2510	memcpy(nint, dict->text, dict->textlen);
2511	}
2512
2513	error = gb_load_dictionary_data(gb_main, Main->keys[idx].key, &old_dict_buffer, &old_dict_buffer_size);
2514	if (!error) {
2515	gb_save_dictionary_data(gb_main, Main->keys[idx].key, dict_buffer, dict_buffer_size);
2516
2517	// compress all data with new dictionary
2518	printf(" * Compressing all with new dictionary ...\n");
2519
2520	size_t old_size, new_compressed_size;
2521	error = readAndWrite(&gbk[idx], old_size, new_compressed_size);
2522
2523	if (!error) {
2524	printf(" (compressed size: old=%zu new=%zu ratio=%.1f%%)\n",
2525	old_compressed_size, new_compressed_size, (new_compressed_size*100.0)/old_compressed_size);
2526
2527	// @@@ for some keys compression fails (e.g. 'ref');
2528	// need to find out why dictionary is so bad
2529	// gb_assert(new_compressed_size <= old_compressed_size); // @@@ enable this (fails in unit-test)
2530	}
2531
2532	if (error) {
2533	/* critical state: new dictionary has been written, but transaction will be aborted below.
2534	* Solution: Write back old dictionary.
2535	*/
2536	gb_save_dictionary_data(gb_main, Main->keys[idx].key, old_dict_buffer, old_dict_buffer_size);
2537	}
2538	}
2539
2540	gbm_free_mem(dict_buffer, dict_buffer_size, GBM_DICT_INDEX);
2541	if (old_dict_buffer) gbm_free_mem(old_dict_buffer, old_dict_buffer_size, GBM_DICT_INDEX);
2542
2543	#if defined(TEST_DICT)
2544	if (!error) {
2545	GB_DICTIONARY *dict_reloaded = gb_get_dictionary(Main, idx);
2546	test_dictionary(dict_reloaded, &(gbk[idx]), &uncompressed_sum, &compressed_sum);
2547	}
2548	#endif // TEST_DICT
2549	}
2550
2551	gb_free_dictionary(dict);
2552	}
2553
2554	error = GB_end_transaction(gb_main, error);
2555	}
2556
2557	#ifdef TEST_DICT
2558	if (!error) {
2559	printf(" overall uncompressed size = %li b\n"
2560	" overall compressed size = %li b (Ratio=%li%%)\n",
2561	uncompressed_sum, compressed_sum,
2562	(compressed_sum*100)/uncompressed_sum);
2563	}
2564	#endif // TEST_DICT
2565
2566	printf("Done.\n");
2567
2568	g_b_opti_freeGbdByKey(gbk);
2569	}
2570
2571	return error;
2572	}
2573
2574	GB_ERROR GB_optimize(GBDATA *gb_main) {
2575	unsigned long maxKB = GB_get_usable_memory();
2576	long maxMem;
2577	GB_ERROR error = 0;
2578	GB_UNDO_TYPE prev_undo_type = GB_get_requested_undo_type(gb_main);
2579
2580	#ifdef DEBUG
2581	maxKB /= 2;
2582	#endif
2583
2584	if (maxKB<=(LONG_MAX/1024)) maxMem = maxKB*1024;
2585	else maxMem = LONG_MAX;
2586
2587	error = GB_request_undo_type(gb_main, GB_UNDO_KILL);
2588	if (!error) {
2589	error = gb_create_dictionaries(GB_MAIN(gb_main), maxMem);
2590	if (!error) GB_disable_quicksave(gb_main, "Database optimized");
2591	ASSERT_NO_ERROR(GB_request_undo_type(gb_main, prev_undo_type));
2592	}
2593	return error;
2594	}
2595
2596	// --------------------------------------------------------------------------------
2597
2598	#ifdef UNIT_TESTS
2599	#ifndef TEST_UNIT_H
2600	#include <test_unit.h>
2601	#endif
2602
2603	// #define TEST_AUTO_UPDATE // uncomment to auto-update binary result of DB optimization
2604	// #define TEST_AUTO_UPDATE_ASCII // uncomment to auto-update ascii-input from output (be careful with this!)
2605
2606	void TEST_SLOW_optimize() {
2607	// test DB optimization (optimizes compression)
2608	// [current coverage ~89%]
2609	//
2610	// Note: a test for sequence compression is in adseqcompr.cxx@TEST_SLOW_sequence_compression
2611
2612	GB_shell shell;
2613
2614	const char *source_ascii = "TEST_opti_ascii_in.arb";
2615	const char *target_ascii = "TEST_opti_ascii_out.arb";
2616
2617	const char *nonopti = "TEST_opti_none.arb";
2618	const char *optimized = "TEST_opti_initial.arb";
2619	const char *reoptimized = "TEST_opti_again.arb";
2620	const char *expected = "TEST_opti_expected.arb"; // expected result after optimize
2621
2622	// initial optimization of ASCII-DB
2623	{
2624	GBDATA *gb_main = GB_open(source_ascii, "rw");
2625
2626	TEST_EXPECT_NO_ERROR(GB_save_as(gb_main, nonopti, "b"));
2627
2628	GB_push_my_security(gb_main);
2629	TEST_EXPECT_NO_ERROR(GB_optimize(gb_main));
2630	GB_pop_my_security(gb_main);
2631
2632	GB_flush_cache(gb_main);
2633
2634	TEST_EXPECT_NO_ERROR(GB_save_as(gb_main, optimized, "b"));
2635	TEST_EXPECT_NO_ERROR(GB_save_as(gb_main, target_ascii, "a"));
2636
2637	#if defined(TEST_AUTO_UPDATE)
2638	TEST_COPY_FILE(optimized, expected);
2639	#endif
2640	#if defined(TEST_AUTO_UPDATE_ASCII)
2641	TEST_COPY_FILE(target_ascii, source_ascii);
2642	#endif
2643
2644	TEST_EXPECT_TEXTFILE_DIFFLINES(source_ascii, target_ascii, 0);
2645	TEST_EXPECT_FILES_EQUAL(optimized, expected);
2646
2647	// check that optimization made sense:
2648	long nonopti_size = GB_size_of_file(nonopti);
2649	long optimized_size = GB_size_of_file(optimized);
2650	TEST_EXPECT_LESS(optimized_size, nonopti_size); // did file shrink?
2651	TEST_EXPECT_EQUAL(optimized_size*100/nonopti_size, 74); // document compression ratio (in percent)
2652
2653	GB_close(gb_main);
2654	}
2655
2656	// re-optimize DB (which is already compressed)
2657	{
2658	GBDATA *gb_main = GB_open(optimized, "rw");
2659
2660	GB_push_my_security(gb_main);
2661	TEST_EXPECT_NO_ERROR(GB_optimize(gb_main));
2662	GB_pop_my_security(gb_main);
2663
2664	TEST_EXPECT_NO_ERROR(GB_save_as(gb_main, reoptimized, "b"));
2665	TEST_EXPECT_NO_ERROR(GB_save_as(gb_main, target_ascii, "a"));
2666
2667	TEST_EXPECT_TEXTFILE_DIFFLINES(source_ascii, target_ascii, 0);
2668	TEST_EXPECT_FILES_EQUAL(reoptimized, expected); // reoptimize should produce same result as initial optimize
2669
2670	GB_close(gb_main);
2671	}
2672
2673	TEST_EXPECT_NO_ERROR(GBK_system(GBS_global_string("rm %s %s %s %s", target_ascii, nonopti, optimized, reoptimized)));
2674	}
2675
2676	#endif // UNIT_TESTS
2677
2678	// --------------------------------------------------------------------------------
2679

Note: See TracBrowser for help on using the repository browser.

Download in other formats: