Context Navigation

source: tags/svn.1.5.4/ARBDB/adoptimize.cxx

Visit:

Last change on this file was 8309, checked in by westram, 14 years ago

moved much code into static scope

(partly reverted by [8310])

Property svn:eol-style set to native
Property svn:keywords set to Author Date Id Revision

File size: 83.1 KB

Line
1	// =============================================================== //
2	// //
3	// File : adoptimize.cxx //
4	// Purpose : //
5	// //
6	// Institute of Microbiology (Technical University Munich) //
7	// http://www.arb-home.de/ //
8	// //
9	// =============================================================== //
10
11	#include <climits>
12	#include <netinet/in.h>
13
14	#include <arbdbt.h>
15
16	#include "gb_compress.h"
17	#include "gb_dict.h"
18	#include "arb_progress.h"
19
20	#if defined(DEBUG)
21	// #define TEST_DICT
22	#endif // DEBUG
23
24	typedef unsigned char unsigned_char;
25	typedef unsigned char *u_str;
26	typedef const unsigned char *cu_str;
27
28	static int gbdByKey_cnt;
29	struct O_gbdByKey { // one for each diff. keyQuark
30	int cnt;
31	GBDATA **gbds; // gbdoff
32	};
33
34	struct FullDictTree;
35	struct SingleDictTree;
36
37	union DictTree {
38	FullDictTree *full;
39	SingleDictTree *single;
40	void *exists;
41
42	};
43
44	enum DictNodeType { SINGLE_NODE, FULL_NODE };
45
46	struct FullDictTree {
47	DictNodeType typ; // always FULL_NODE
48	int usedSons;
49	int count[256];
50	DictTree son[256]; // index == character
51	};
52
53	struct SingleDictTree {
54	DictNodeType typ; // always SINGLE_NODE
55	unsigned_char ch; // the character
56	int count; // no of occurrences of this branch
57	DictTree son;
58	DictTree brother;
59
60	};
61
62	// **************************************************
63
64	#define COMPRESSIBLE(type) ((type) >= GB_BYTES && (type)<=GB_STRING)
65	#define DICT_MEM_WEIGHT 4
66
67	#define WORD_HELPFUL(wordlen, occurrences) ((long)((occurrences)3 + DICT_MEM_WEIGHT(2*sizeof(GB_NINT)+(wordlen))) \
68	< \
69	(long)((occurrences)*(wordlen)))
70	/* (occurrences)*4 compressed size
71	* 2*sizeof(GB_NINT)+(wordlen) size in dictionary
72	* (occurrences)*(wordlen) uncompressed size
73	*/
74
75	// **************************************************
76
77	#define MIN_WORD_LEN 8 // minimum length of words in dictionary
78	#define MAX_WORD_LEN 50 // maximum length of words in dictionary
79	#define MAX_BROTHERS 10 /* maximum no of brothers linked with SingleDictTree
80	* above we use FullDictTree */
81	#define MAX_DIFFER 2 /* percentage of difference (of occurrences of strings) below which two
82	* consecutive parts are treated as EQUAL # of occurrences */
83	#define INCR_DIFFER 1 // the above percentage is incremented from 0 to MAX_DIFFER by INCR_DIFFER per step
84
85	#define DICT_STRING_INCR 1024 // dictionary string will be incremented by this size
86
87	// ***************** Tool functions ****************
88
89	static inline cu_str get_data_n_size(GBDATA gbd, long size) {
90	GB_CSTR data;
91	int type = GB_TYPE(gbd);
92
93	*size = 0;
94
95	switch (type) {
96	case GB_STRING: data = GB_read_char_pntr(gbd); break;
97	case GB_LINK: data = GB_read_link_pntr(gbd); break;
98	case GB_BYTES: data = GB_read_bytes_pntr(gbd); break;
99	case GB_INTS: data = (char*)GB_read_ints_pntr(gbd); break;
100	case GB_FLOATS: data = (char*)GB_read_floats_pntr(gbd); break;
101	default:
102	data = 0;
103	gb_assert(0);
104	break;
105	}
106
107	if (data) *size = GB_UNCOMPRESSED_SIZE(gbd, type);
108	return (cu_str)data;
109	}
110
111	static inline long min(long a, long b) {
112	return a<b ? a : b;
113	}
114
115	// **************************************************
116
117	static void g_b_opti_scanGbdByKey(GB_MAIN_TYPE Main, GBDATA gbd, O_gbdByKey *gbk)
118	{
119	GBQUARK quark;
120
121	if (GB_TYPE(gbd) == GB_DB) // CONTAINER
122	{
123	int idx;
124	GBCONTAINER gbc = (GBCONTAINER )gbd;
125	GBDATA *gbd2;
126
127	for (idx=0; idx < gbc->d.nheader; idx++)
128	if ((gbd2=GBCONTAINER_ELEM(gbc, idx))!=NULL)
129	g_b_opti_scanGbdByKey(Main, gbd2, gbk);
130	}
131
132	quark = GB_KEY_QUARK(gbd);
133
134	if (quark)
135	{
136	gb_assert(gbk[quark].cnt < Main->keys[quark].nref \|\| quark==0);
137	gb_assert(gbk[quark].gbds != 0);
138
139	gbk[quark].gbds[gbk[quark].cnt] = gbd;
140	gbk[quark].cnt++;
141	}
142	}
143
144	static O_gbdByKey g_b_opti_createGbdByKey(GB_MAIN_TYPE Main)
145	{
146	int idx;
147	O_gbdByKey gbk = (O_gbdByKey )GB_calloc(Main->keycnt, sizeof(O_gbdByKey));
148
149	gbdByKey_cnt = Main->keycnt; // always use gbdByKey_cnt instead of Main->keycnt cause Main->keycnt can change
150
151	for (idx=1; idx<gbdByKey_cnt; idx++) {
152	gbk[idx].cnt = 0;
153
154	if (Main->keys[idx].key && Main->keys[idx].nref>0) {
155	gbk[idx].gbds = (GBDATA *) GB_calloc(Main->keys[idx].nref, sizeof(GBDATA));
156	}
157	else {
158	gbk[idx].gbds = NULL;
159	}
160	}
161
162	gbk[0].cnt = 0;
163	gbk[0].gbds = (GBDATA *)GB_calloc(1, sizeof(GBDATA));
164
165	g_b_opti_scanGbdByKey(Main, (GBDATA*)Main->data, gbk);
166
167	for (idx=0; idx<gbdByKey_cnt; idx++) {
168	if (gbk[idx].cnt != Main->keys[idx].nref && idx)
169	{
170	printf("idx=%i gbk[idx].cnt=%i Main->keys[idx].nref=%li\n", // Main->keys[].nref ist falsch
171	idx, gbk[idx].cnt, Main->keys[idx].nref);
172
173	Main->keys[idx].nref = gbk[idx].cnt;
174	}
175	}
176	return gbk;
177	}
178
179	static void g_b_opti_freeGbdByKey(O_gbdByKey *gbk) {
180	for (int idx=0; idx<gbdByKey_cnt; idx++) free(gbk[idx].gbds);
181	free(gbk);
182	}
183
184	// ***************** Convert old compression style to new style ****************
185
186	static GB_ERROR gb_convert_compression(GBDATA *source) {
187	GB_ERROR error = 0;
188	long type = GB_TYPE(source);
189
190	if (type == GB_DB) {
191	GBDATA *gb_p;
192	for (gb_p = GB_child(source); gb_p; gb_p = GB_nextChild(gb_p)) {
193	if (gb_p->flags.compressed_data \|\| GB_TYPE(gb_p) == GB_DB) {
194	error = gb_convert_compression(gb_p);
195	if (error) break;
196	}
197	}
198	}
199	else {
200	char *string = 0;
201	long elems = GB_GETSIZE(source);
202	long data_size = GB_UNCOMPRESSED_SIZE(source, type);
203	long new_size = -1;
204	int expectData = 1;
205
206	switch (type) {
207	case GB_STRING:
208	case GB_LINK:
209	case GB_BYTES:
210	string = gb_uncompress_bytes(GB_GETDATA(source), data_size, &new_size);
211	if (string) {
212	gb_assert(new_size == data_size);
213	string = GB_memdup(string, data_size);
214	}
215	break;
216
217	case GB_INTS:
218	case GB_FLOATS:
219	string = gb_uncompress_longs_old(GB_GETDATA(source), elems, &new_size);
220	if (string) {
221	gb_assert(new_size == data_size);
222	string = GB_memdup(string, data_size);
223	}
224	break;
225
226	default:
227	expectData = 0;
228	break;
229	}
230
231	if (!string) {
232	if (expectData) {
233	error = GBS_global_string("Can't read old data to convert compression (Reason: %s)", GB_await_error());
234	}
235	}
236	else {
237	switch (type) {
238	case GB_STRING:
239	error = GB_write_string(source, "");
240	if (!error) error = GB_write_string(source, string);
241	break;
242
243	case GB_LINK:
244	error = GB_write_link(source, "");
245	if (!error) error = GB_write_link(source, string);
246	break;
247
248	case GB_BYTES:
249	error = GB_write_bytes(source, "", 0);
250	if (!error) error = GB_write_bytes(source, string, data_size);
251	break;
252
253	case GB_INTS:
254	case GB_FLOATS:
255	error = GB_write_pntr(source, string, data_size, elems);
256	break;
257
258	default:
259	gb_assert(0);
260	break;
261	}
262
263	free(string);
264	}
265	}
266	return error;
267	}
268
269	GB_ERROR gb_convert_V2_to_V3(GBDATA *gb_main) {
270	GB_ERROR error = 0;
271	GBDATA *gb_system = GB_search(gb_main, GB_SYSTEM_FOLDER, GB_FIND);
272
273	if (!gb_system) {
274	gb_system = GB_create_container(gb_main, GB_SYSTEM_FOLDER);
275	if (GB_entry(gb_main, "extended_data")) {
276	GB_warning("Converting data from old V2.0 to V2.1 Format:\n"
277	" Please Wait (may take some time)");
278	}
279	error = gb_convert_compression(gb_main);
280	GB_disable_quicksave(gb_main, "Database converted to new format");
281	}
282	return error;
283	}
284
285
286	// ******************* Compress by dictionary ******************
287
288
289	/* compression tag format:
290	*
291	* unsigned int compressed:1;
292	* if compressed==0:
293	* unsigned int last:1; ==1 -> this is the last block
294	* unsigned int len:6; length of uncompressible bytes
295	* char[len];
296	* if compressed==1:
297	* unsigned int idxlen:1; ==0 -> 10-bit index
298	* ==1 -> 18-bit index
299	* unsigned int idxhigh:2; the 2 highest bits of the index
300	* unsigned int len:4; (length of word) - (MIN_COMPR_WORD_LEN-1)
301	* if len==0:
302	* char extralen; (length of word) -
303	* char[idxlen+1]; index (low,high)
304	*
305	* tag == 64 -> end of dictionary compressed block (if not coded in last uncompressed block)
306	*/
307
308	inline int INDEX_DICT_OFFSET(int idx, GB_DICTIONARY *dict) {
309	gb_assert(idx<dict->words);
310	return ntohl(dict->offsets[idx]);
311	}
312	inline int ALPHA_DICT_OFFSET(int idx, GB_DICTIONARY *dict) {
313	int realIndex;
314	gb_assert(idx<dict->words);
315	realIndex = ntohl(dict->resort[idx]);
316	return INDEX_DICT_OFFSET(realIndex, dict);
317	}
318
319	// #define ALPHA_DICT_OFFSET(i) ntohl(offset[ntohl(resort[i])])
320	// #define INDEX_DICT_OFFSET(i) ntohl(offset[i])
321
322	#define LEN_BITS 4
323	#define INDEX_BITS 2
324	#define INDEX_LEN_BITS 1
325
326	#define LEN_SHIFT 0
327	#define INDEX_SHIFT (LEN_SHIFT+LEN_BITS)
328	#define INDEX_LEN_SHIFT (INDEX_SHIFT+INDEX_BITS)
329
330	#define BITMASK(bits) ((1<<(bits))-1)
331	#define GETVAL(tag, typ) (((tag)>>typ##_SHIFT)&BITMASK(typ##_BITS))
332
333	#define MIN_SHORTLEN 6
334	#define MAX_SHORTLEN (BITMASK(LEN_BITS)+MIN_SHORTLEN-1)
335	#define MIN_LONGLEN (MAX_SHORTLEN+1)
336	#define MAX_LONGLEN (MIN_LONGLEN+255)
337
338	#define SHORTLEN_DECR (MIN_SHORTLEN-1) // !! zero is used as flag for long len !!
339	#define LONGLEN_DECR MIN_LONGLEN
340
341	#define MIN_COMPR_WORD_LEN MIN_SHORTLEN
342	#define MAX_COMPR_WORD_LEN MAX_LONGLEN
343
344	#define MAX_SHORT_INDEX BITMASK(INDEX_BITS+8)
345	#define MAX_LONG_INDEX BITMASK(INDEX_BITS+16)
346
347	#define LAST_COMPRESSED_BIT 64
348
349	#ifdef DEBUG
350	# define DUMP_COMPRESSION_TEST 0
351	/* 0 = only compression ratio
352	* 1 = + original/compressed/decompressed
353	* 2 = + words used to compress/uncompress
354	* 3 = + matching words in dictionary
355	* 4 = + search of words in dictionary
356	*/
357	#else
358	# define DUMP_COMPRESSION_TEST 0
359	#endif
360
361	#ifdef DEBUG
362	// #define COUNT_CHUNKS
363
364	#if defined(COUNT_CHUNKS)
365
366	static long uncompressedBlocks[64];
367	static long compressedBlocks[MAX_LONGLEN];
368
369	static void clearChunkCounters() {
370	int i;
371
372	for (i=0; i<64; i++) uncompressedBlocks[i] = 0;
373	for (i=0; i<MAX_LONGLEN; i++) compressedBlocks[i] = 0;
374	}
375
376	static void dumpChunkCounters() {
377	int i;
378
379	printf("------------------------------\n" "Uncompressed blocks used:\n");
380	for (i=0; i<64; i++) if (uncompressedBlocks[i]) printf(" size=%i used=%li\n", i, uncompressedBlocks[i]);
381	printf("------------------------------\n" "Words used:\n");
382	for (i=0; i<MAX_LONGLEN; i++) if (compressedBlocks[i]) printf(" size=%i used=%li\n", i, compressedBlocks[i]);
383	printf("------------------------------\n");
384	}
385	#endif // COUNT_CHUNKS
386
387	static cu_str lstr(cu_str s, int len) {
388	#define BUFLEN 10000
389	static unsigned_char buf[BUFLEN];
390
391	gb_assert(len<BUFLEN);
392	memcpy(buf, s, len);
393	buf[len] = 0;
394
395	return buf;
396	}
397
398	#if DUMP_COMPRESSION_TEST>=2
399
400	static cu_str dict_word(GB_DICTIONARY *dict, int idx, int len) {
401	return lstr(dict->text+INDEX_DICT_OFFSET(idx, dict), len);
402	}
403
404	#endif
405
406	#if DUMP_COMPRESSION_TEST>=1
407
408	static void dumpBinary(u_str data, long size) {
409	#define PER_LINE 12
410	int cnt = 0;
411
412	while (size--) {
413	unsigned_char c = *data++;
414	int bitval = 128;
415	int bits = 8;
416
417	while (bits--) {
418	putchar(c&bitval ? '1' : '0');
419	bitval>>=1;
420	}
421	putchar(' ');
422
423	cnt = (cnt+1)%PER_LINE;
424	if (!cnt) putchar('\n');
425	}
426
427	if (cnt) putchar('\n');
428	}
429
430	#endif
431
432	#endif
433
434	inline int GB_MEMCMP(const void vm1, const void vm2, long size) {
435	char c1 = (char)vm1,
436	c2 = (char)vm2;
437	int diff = 0;
438
439	while (size-- && !diff) diff = c1++-c2++;
440	return diff;
441	}
442
443	// --------------------------------------------------
444
445	static int searchWord(GB_DICTIONARY dict, cu_str source, long size, unsigned long wordIndex, int *wordLen)
446	{
447	int idx = -1;
448	int l = 0;
449	int h = dict->words-1;
450	cu_str text = dict->text;
451	GB_NINT *resort = dict->resort;
452	int dsize = dict->textlen;
453	int ilen = 0;
454
455	while (l<h-1) {
456	int m = (l+h)/2;
457	long off = ALPHA_DICT_OFFSET(m, dict);
458	cu_str dictword = text+off;
459	long msize = min(size, dsize-off);
460
461	#if DUMP_COMPRESSION_TEST>=4
462	printf(" %s (%i)\n", lstr(dictword, 20), m);
463	#endif
464
465	if (GB_MEMCMP(source, dictword, msize)<=0) h = m;
466	else l = m;
467	}
468
469	while (l<=h) {
470	int off = ALPHA_DICT_OFFSET(l, dict);
471	cu_str word = text+off;
472	int msize = (int)min(size, dsize-off);
473	int equal = 0;
474	cu_str s = source;
475
476	while (msize-- && s++==word++) equal++;
477
478	#if DUMP_COMPRESSION_TEST>=3
479	if (equal>=MIN_COMPR_WORD_LEN) {
480	printf(" EQUAL=%i '%s' (%i->%i, off=%i)", equal, lstr(text+off, equal), l, ntohl(resort[l]), ALPHA_DICT_OFFSET(l, dict));
481	printf(" (context=%s)\n", lstr(text+off-min(off, 20), min(off, 20)+equal+20));
482	}
483	#endif
484
485	if (equal>ilen) {
486	ilen = equal;
487	idx = ntohl(resort[l]);
488	gb_assert(idx<dict->words);
489	}
490
491	l++;
492	}
493
494	*wordIndex = idx;
495	*wordLen = (int)min(ilen, MAX_COMPR_WORD_LEN);
496
497	return idx!=-1 && ilen>=MIN_COMPR_WORD_LEN;
498	}
499
500	#ifdef DEBUG
501	int look(GB_DICTIONARY *dict, GB_CSTR source) {
502	unsigned long wordIndex;
503	int wordLen;
504	int wordFound = searchWord(dict, (cu_str)source, strlen(source), &wordIndex, &wordLen);
505
506	if (wordFound) {
507	printf("'%s' (idx=%lu, off=%i)\n", lstr(dict->text+ntohl(dict->offsets[wordIndex]), wordLen), wordIndex, ntohl(dict->offsets[wordIndex]));
508	}
509
510	return wordFound;
511	}
512	#endif
513
514
515
516	static char gb_uncompress_by_dictionary_internal(GB_DICTIONARY dict, /* GBDATA gbd, / GB_CSTR s_source, const long size, bool append_zero, long *new_size) {
517	cu_str source = (cu_str)s_source;
518	u_str dest;
519	u_str buffer;
520	cu_str text = dict->text;
521	int done = 0;
522	long left = size;
523
524	dest = buffer = (u_str)GB_give_other_buffer(s_source, size+2);
525
526	while (left && !done) {
527	int c;
528
529	if ((c=*source++)&128) { // compressed data
530	int indexLen = GETVAL(c, INDEX_LEN);
531	unsigned long idx = GETVAL(c, INDEX);
532
533	c = GETVAL(c, LEN); // ==wordLen
534	if (c) c += SHORTLEN_DECR;
535	else c = *source+++LONGLEN_DECR;
536
537	gb_assert(indexLen>=0 && indexLen<=1);
538
539	if (indexLen==0) {
540	idx = (idx << 8) \| *source++;
541	}
542	else {
543	idx = (((idx << 8) \| source[1]) << 8) \| source[0];
544	source += 2;
545	}
546
547	gb_assert(idx<(GB_ULONG)dict->words);
548
549	{
550	cu_str word = text+INDEX_DICT_OFFSET(idx, dict);
551
552	#if DUMP_COMPRESSION_TEST>=2
553	printf(" word='%s' (idx=%lu, off=%li, len=%i)\n",
554	lstr(word, c), idx, (long)ntohl(dict->offsets[idx]), c);
555	#endif
556
557	{
558	u_str d = dest;
559	gb_assert(((d + c) <= word) \|\| (d >= (word + c)));
560	while (c--) d++ = word++;
561	dest = d;
562	}
563	}
564	}
565	else { // uncompressed bytes
566	if (c & LAST_COMPRESSED_BIT) {
567	done = 1;
568	c ^= LAST_COMPRESSED_BIT;
569	}
570
571	left -= c;
572	{
573	u_str d = dest;
574	gb_assert(((d + c) <= source) \|\| (d >= (source + c)));
575	while (c--) d++ = source++;
576	dest=d;
577	}
578	}
579	}
580
581	if (append_zero) *dest++ = 0;
582
583	*new_size = dest-buffer;
584	gb_assert(size >= *new_size); // buffer overflow
585
586	return (char *)buffer;
587	}
588
589	char gb_uncompress_by_dictionary(GBDATA gbd, GB_CSTR s_source, long size, long *new_size)
590	{
591	GB_DICTIONARY *dict = gb_get_dictionary(GB_MAIN(gbd), GB_KEY_QUARK(gbd));
592	bool append_zero = GB_TYPE(gbd)==GB_STRING \|\| GB_TYPE(gbd) == GB_LINK;
593
594	if (!dict) {
595	GB_ERROR error = GBS_global_string("Cannot decompress db-entry '%s' (no dictionary found)\n", GB_get_db_path(gbd));
596	GB_export_error(error);
597	return 0;
598	}
599
600	return gb_uncompress_by_dictionary_internal(dict, s_source, size, append_zero, new_size);
601	}
602
603	char gb_compress_by_dictionary(GB_DICTIONARY dict, GB_CSTR s_source, long size, long *msize, int last_flag, int search_backward, int search_forward)
604	{
605	cu_str source = (cu_str)s_source;
606	u_str dest;
607	u_str buffer;
608	cu_str unknown = source; // start of uncompressible bytes
609	u_str lastUncompressed = NULL; // ptr to start of last block of uncompressible bytes (in dest)
610
611	#if defined(ASSERTION_USED)
612	const long org_size = size;
613	#endif // ASSERTION_USED
614
615	gb_assert(size>0); // compression of zero-length data fails!
616
617	dest = buffer = (u_str)GB_give_other_buffer((GB_CSTR)source, 1+(size/63+1)+size);
618	*dest++ = GB_COMPRESSION_DICTIONARY \| last_flag;
619
620	while (size) {
621	unsigned long wordIndex;
622	int wordLen;
623	int wordFound;
624
625	if ((wordFound = searchWord(dict, source, size, &wordIndex, &wordLen))) {
626	int length;
627
628	takeRest :
629	length = source-unknown;
630
631	if (length) {
632	int shift;
633	int takeShift = 0;
634	int maxShift = (int)min(search_forward, wordLen-1);
635
636	for (shift=1; shift<=maxShift; shift++) {
637	unsigned long wordIndex2;
638	int wordLen2;
639	int wordFound2;
640
641	if ((wordFound2 = searchWord(dict, source+shift, size-shift, &wordIndex2, &wordLen2))) {
642	if (wordLen2>(wordLen+shift)) {
643	wordIndex = wordIndex2;
644	wordLen = wordLen2;
645	takeShift = shift;
646	}
647	}
648	}
649
650	if (takeShift) {
651	source += takeShift;
652	size -= takeShift;
653	length = source-unknown;
654	}
655	}
656
657	while (length) { // if there were uncompressible bytes
658	int take = (int)min(length, 63);
659
660	#ifdef COUNT_CHUNKS
661	uncompressedBlocks[take]++;
662	#endif
663
664	lastUncompressed = dest;
665
666	*dest++ = take; // tag byte
667	memcpy(dest, unknown, take);
668	dest += take;
669	unknown += take;
670	length -= take;
671	}
672
673	gb_assert(unknown==source);
674
675	while (wordFound) { // as long as we find words in dictionary
676	int indexLen = wordIndex>MAX_SHORT_INDEX;
677	int indexHighBits = indexLen==0 ? wordIndex>>8 : wordIndex>>16;
678	int nextWordFound;
679	int nextWordLen;
680	unsigned long nextWordIndex;
681
682	gb_assert((long)wordIndex<dict->words);
683	gb_assert((long)wordIndex <= MAX_LONG_INDEX);
684	gb_assert(indexHighBits==(indexHighBits & BITMASK(INDEX_BITS)));
685	gb_assert(wordLen>=MIN_SHORTLEN);
686
687	lastUncompressed = NULL;
688
689	{
690	cu_str source2 = source+wordLen;
691	long size2 = size-wordLen;
692
693	if (!(nextWordFound=searchWord(dict, source+wordLen, size-wordLen, &nextWordIndex, &nextWordLen))) { // no word right afterwards
694	int shift;
695
696	for (shift=1; shift<=search_backward && shift<(wordLen-MIN_COMPR_WORD_LEN); shift++) {
697	// try to cut end of word to get a better result
698	unsigned long wordIndex2;
699	int wordLen2;
700	int wordFound2;
701
702	if ((wordFound2=searchWord(dict, source2-shift, size2+shift, &wordIndex2, &wordLen2))) {
703	if (wordLen2>(shift+1)) {
704	wordLen -= shift;
705
706	nextWordFound = 1;
707	nextWordIndex = wordIndex2;
708	nextWordLen = wordLen2;
709	break;
710	}
711	}
712	}
713	}
714	}
715
716	#ifdef COUNT_CHUNKS
717	compressedBlocks[wordLen]++;
718	#endif
719
720	#if DUMP_COMPRESSION_TEST>=2
721	printf(" word='%s' (idx=%li, off=%i, len=%i)\n",
722	dict_word(dict, wordIndex, wordLen), wordIndex, (int)ntohl(dict->offsets[wordIndex]), wordLen);
723	#endif
724
725	if (wordLen<=MAX_SHORTLEN) {
726	*dest++ = 128 \|
727	(indexLen << INDEX_LEN_SHIFT) \|
728	(indexHighBits << INDEX_SHIFT) \|
729	((wordLen-SHORTLEN_DECR) << LEN_SHIFT);
730	}
731	else {
732	*dest++ = 128 \|
733	(indexLen << INDEX_LEN_SHIFT) \|
734	(indexHighBits << INDEX_SHIFT);
735	*dest++ = wordLen-LONGLEN_DECR; // extra length byte
736	}
737
738	*dest++ = (char)wordIndex; // low index byte
739	if (indexLen)
740	*dest++ = (char)(wordIndex >> 8); // high index byte
741
742	unknown = source += wordLen;
743	size -= wordLen;
744
745	wordFound = nextWordFound;
746	wordIndex = nextWordIndex;
747	wordLen = nextWordLen;
748	}
749	}
750	else {
751	source++;
752	if (--size==0) goto takeRest;
753	}
754	}
755
756	if (lastUncompressed) *lastUncompressed \|= LAST_COMPRESSED_BIT;
757	else *dest++ = LAST_COMPRESSED_BIT;
758
759	*msize = dest-buffer;
760
761	#if defined(ASSERTION_USED)
762	{
763	long new_size = -1;
764	char *test = gb_uncompress_by_dictionary_internal(dict, (GB_CSTR)buffer+1, org_size + GB_COMPRESSION_TAGS_SIZE_MAX, true, &new_size);
765
766	gb_assert(memcmp(test, s_source, org_size) == 0);
767	gb_assert((org_size+1) == new_size);
768	}
769	#endif // ASSERTION_USED
770
771	return (char*)buffer;
772	}
773
774
775	#if defined(TEST_DICT)
776
777	static void test_dictionary(GB_DICTIONARY dict, O_gbdByKey gbk, long uncompSum, long compSum)
778	{
779	int cnt;
780	long uncompressed_sum = 0;
781	long compressed_sum = 0;
782	long dict_size = (dict->words2+1)sizeof(GB_NINT)+dict->textlen;
783	int i;
784	long char_count[256];
785
786	for (i=0; i<256; i++) char_count[i] = 0;
787
788	printf(" * Testing compression..\n");
789
790	#ifdef COUNT_CHUNKS
791	clearChunkCounters();
792	#endif
793
794	for (cnt=0; cnt<gbk->cnt; cnt++) {
795	GBDATA *gbd = gbk->gbds[cnt];
796	int type = GB_TYPE(gbd);
797
798	if (COMPRESSIBLE(type)) {
799	long size;
800	cu_str data = get_data_n_size(gbd, &size);
801	u_str copy;
802	long compressedSize;
803	int last_flag = 0;
804	u_str compressed;
805	u_str uncompressed;
806
807	if (type==GB_STRING \|\| type == GB_LINK) size--;
808
809	if (size<1) continue;
810
811	#ifndef NDEBUG
812	copy = (u_str)gbm_get_mem(size, GBM_DICT_INDEX);
813	gb_assert(copy!=0);
814	memcpy(copy, data, size);
815	#endif
816
817	#if DUMP_COMPRESSION_TEST>=1
818	printf("----------------------------\n");
819	printf("original : %3li b = '%s'\n", size, data);
820	#endif
821
822	compressed = (u_str)gb_compress_by_dictionary(dict, (GB_CSTR)data, size, &compressedSize, last_flag, 9999, 2);
823
824	#if DUMP_COMPRESSION_TEST>=1
825	printf("compressed : %3li b = '%s'\n", compressedSize, lstr(compressed, compressedSize));
826	dumpBinary(compressed, compressedSize);
827	#endif
828
829	for (i=0; i<compressedSize; i++) char_count[compressed[i]]++;
830
831	uncompressed = (u_str)gb_uncompress_by_dictionary(gbd, (char*)compressed+1, size);
832
833	#if DUMP_COMPRESSION_TEST>=1
834	printf("copy : %3li b = '%s'\n", size, lstr(copy, size));
835	printf("decompressed: %3li b = '%s'\n", size, lstr(uncompressed, size));
836	#endif
837
838	if (GB_MEMCMP(copy, uncompressed, size)!=0) {
839	int byte = 0;
840
841	while (copy[byte]==uncompressed[byte]) byte++;
842	printf("Error in compression (off=%i, '%s'", byte, lstr(copy+byte, 10));
843	printf("!='%s'\n", lstr(uncompressed+byte, 10));
844	}
845
846	if (compressedSize<size) {
847	uncompressed_sum += size;
848	compressed_sum += compressedSize;
849	}
850	else {
851	uncompressed_sum += size;
852	compressed_sum += size;
853	}
854
855	gbm_free_mem(copy, size, GBM_DICT_INDEX);
856	}
857	}
858
859	#ifdef COUNT_CHUNKS
860	dumpChunkCounters();
861	#endif
862
863	{
864	long compressed_plus_dict = compressed_sum+dict_size;
865	char *dict_text = GBS_global_string_copy("+dict %li b", dict_size);
866	long ratio = (compressed_plus_dict*100)/uncompressed_sum;
867
868	printf(" uncompressed size = %10li b\n"
869	" compressed size = %10li b\n"
870	" %17s = %10li b (Ratio=%li%%)\n",
871	uncompressed_sum,
872	compressed_sum,
873	dict_text, compressed_plus_dict, ratio);
874
875	free(dict_text);
876	}
877
878	*uncompSum += uncompressed_sum;
879	*compSum += compressed_sum+dict_size;
880	}
881
882	#endif // TEST_DICT
883
884
885	// ***************** Build dictionary ****************
886
887	#ifdef DEBUG
888	#define TEST // test trees?
889	// #define DUMP_TREE // dump trees?
890
891	// #define DUMP_EXPAND
892	/*
893	#define SELECT_WORDS
894	#define SELECTED_WORDS "oropl"
895	*/
896
897	# ifdef SELECT_WORDS
898	static char strnstr(char s1, int len, char *s2) {
899	char c = *s2;
900	int len2 = strlen(s2);
901
902	while (len-->=len2) {
903	if (*s1==c) {
904	if (strncmp(s1, s2, len2)==0) return s1;
905	}
906	s1++;
907	}
908
909	return NULL;
910	}
911	# endif
912
913	#ifdef DUMP_TREE
914	static void dump_dtree(int deep, DictTree tree)
915	{
916	static unsigned_char buffer[1024];
917
918	if (tree.full) {
919	switch (tree.full->typ) {
920	case FULL_NODE: {
921	int idx;
922
923	for (idx=0; idx<256; idx++) {
924	buffer[deep] = idx;
925	buffer[deep+1] = 0;
926
927	if (tree.full->son[idx].exists) dump_dtree(deep+1, tree.full->son[idx]);
928	else if (tree.full->count[idx]>0) printf(" '%s' (%i) [array]\n", buffer, tree.full->count[idx]);
929	}
930	break;
931	}
932	case SINGLE_NODE: {
933	buffer[deep] = tree.single->ch;
934	buffer[deep+1] = 0;
935
936	if (tree.single->son.exists) dump_dtree(deep+1, tree.single->son);
937	else printf(" '%s' (%i) [single]\n", buffer, tree.single->count);
938
939	if (tree.single->brother.exists) dump_dtree(deep, tree.single->brother);
940	break;
941	}
942	}
943	}
944	}
945	#endif
946
947	#else
948	#ifdef DUMP_TREE
949	# define dump_dtree(deep, tree)
950	#endif
951	#endif
952
953	#ifdef TEST
954	static int testCounts(DictTree tree) {
955	// tests if all inner nodes have correct 'count's
956	int cnt = 0;
957
958	if (tree.exists) {
959	switch (tree.full->typ) {
960	case SINGLE_NODE: {
961	while (tree.exists) {
962	if (tree.single->son.exists) {
963	int son_cnt = testCounts(tree.single->son);
964	#ifdef COUNT_EQUAL
965	gb_assert(son_cnt==tree.single->count);
966	#else
967	gb_assert(son_cnt<=tree.single->count);
968	#endif
969	}
970
971	gb_assert(tree.single->count>0);
972	cnt += tree.single->count;
973	tree = tree.single->brother;
974	}
975	break;
976	}
977	case FULL_NODE: {
978	int idx,
979	sons = 0;
980
981	for (idx=0; idx<256; idx++) {
982	if (tree.full->son[idx].exists) {
983	int son_cnt = testCounts(tree.full->son[idx]);
984	#ifdef COUNT_EQUAL
985	gb_assert(son_cnt==tree.full->count[idx]);
986	#else
987	gb_assert(son_cnt<=tree.full->count[idx]);
988	#endif
989	if (tree.full->usedSons) gb_assert(tree.full->count[idx]>0);
990	else gb_assert(tree.full->count[idx]==0);
991
992	sons++;
993	}
994	else if (tree.full->count[idx]) {
995	sons++;
996	}
997
998	cnt += tree.full->count[idx];
999	}
1000
1001	gb_assert(sons==tree.full->usedSons);
1002	break;
1003	}
1004	}
1005	}
1006
1007	return cnt;
1008	}
1009
1010	// #define TEST_MAX_OCCUR_COUNT
1011
1012	#ifdef TEST_MAX_OCCUR_COUNT
1013	#define MAX_OCCUR_COUNT 600000
1014	#endif
1015
1016	static DictTree test_dtree(DictTree tree)
1017	// only correct while tree is under contruction (build_dict_tree())
1018	{
1019	if (tree.exists) {
1020	switch (tree.full->typ) {
1021	case SINGLE_NODE: {
1022	#if defined(TEST_MAX_OCCUR_COUNT)
1023	gb_assert(tree.single->count<MAX_OCCUR_COUNT); // quite improbable
1024	#endif // TEST_MAX_OCCUR_COUNT
1025
1026	if (tree.single->son.exists) {
1027	gb_assert(tree.single->count==0);
1028	test_dtree(tree.single->son);
1029	}
1030	else {
1031	gb_assert(tree.single->count>0);
1032	}
1033
1034	if (tree.single->brother.exists) test_dtree(tree.single->brother);
1035	break;
1036	}
1037	case FULL_NODE: {
1038	int idx;
1039	int countSons = 0;
1040
1041	for (idx=0; idx<256; idx++) {
1042	#if defined(TEST_MAX_OCCUR_COUNT)
1043	gb_assert(tree.full->count[idx]<MAX_OCCUR_COUNT); // quite improbable
1044	#endif // TEST_MAX_OCCUR_COUNT
1045
1046	if (tree.full->son[idx].exists) {
1047	gb_assert(tree.full->count[idx]==0);
1048	test_dtree(tree.full->son[idx]);
1049	countSons++;
1050	}
1051	else {
1052	gb_assert(tree.full->count[idx]>=0);
1053	if (tree.full->count[idx]>0)
1054	countSons++;
1055	}
1056	}
1057
1058	gb_assert(countSons==tree.full->usedSons);
1059
1060	break;
1061	}
1062	}
1063	}
1064
1065	return tree;
1066	}
1067
1068	#else
1069	# define test_dtree(tree) // (tree)
1070	# define testCounts(tree) // 0
1071	#endif
1072
1073
1074	static DictTree new_dtree(cu_str text, long len, long *memcount) {
1075	// creates a new (sub-)tree from 'text' (which has length 'len')
1076	DictTree tree;
1077
1078	if (len) {
1079	SingleDictTree *tail = NULL;
1080	SingleDictTree *head = NULL;
1081
1082	while (len) {
1083	if (tail) tail = tail->son.single = (SingleDictTree)gbm_get_mem(sizeof(tail), GBM_DICT_INDEX);
1084	else tail = head = (SingleDictTree)gbm_get_mem(sizeof(tail), GBM_DICT_INDEX);
1085
1086	(memcount) += sizeof(tail);
1087
1088	tail->typ = SINGLE_NODE;
1089	tail->ch = *text++;
1090	len--;
1091
1092	tail->brother.single = NULL;
1093	tail->son.single = NULL;
1094	}
1095
1096	tail->count = 1;
1097	tree.single = head;
1098	}
1099	else {
1100	tree.single = NULL;
1101	}
1102
1103	return tree;
1104	}
1105
1106	static DictTree single2full_dtree(DictTree tree, long *memcount) {
1107	if (tree.exists && tree.single->typ==SINGLE_NODE) {
1108	FullDictTree full = (FullDictTree)gbm_get_mem(sizeof(*full), GBM_DICT_INDEX);
1109	int idx;
1110
1111	(memcount) += sizeof(full);
1112	full->typ = FULL_NODE;
1113	full->usedSons = 0;
1114
1115	for (idx=0; idx<256; idx++) {
1116	full->son[idx].exists = NULL;
1117	full->count[idx] = 0;
1118	}
1119
1120	while (tree.exists) {
1121	SingleDictTree *t = tree.single;
1122
1123	gb_assert(t->typ==SINGLE_NODE);
1124	gb_assert(full->son[t->ch].exists==NULL);
1125
1126	full->son[t->ch] = t->son;
1127	full->count[t->ch] = t->count;
1128	full->usedSons++;
1129
1130	tree.single = t->brother.single;
1131
1132	gbm_free_mem(t, sizeof(*t), GBM_DICT_INDEX);
1133	(memcount) -= sizeof(t);
1134	}
1135
1136	tree.full = full;
1137	}
1138
1139	return tree;
1140	}
1141
1142	static void free_dtree(DictTree tree)
1143	{
1144	if (tree.exists) {
1145	switch (tree.full->typ) {
1146	case SINGLE_NODE: {
1147	if (tree.single->son.exists) free_dtree(tree.single->son);
1148	if (tree.single->brother.exists) free_dtree(tree.single->brother);
1149
1150	gbm_free_mem(tree.single, sizeof(*(tree.single)), GBM_DICT_INDEX);
1151	break;
1152	}
1153	case FULL_NODE: {
1154	int idx;
1155
1156	for (idx=0; idx<256; idx++) if (tree.full->son[idx].exists) free_dtree(tree.full->son[idx]);
1157	gbm_free_mem(tree.full, sizeof(*(tree.full)), GBM_DICT_INDEX);
1158	break;
1159	}
1160	}
1161	}
1162	}
1163
1164
1165
1166	static DictTree cut_dtree(DictTree tree, int cut_count, long memcount, long leafcount)
1167	/* removes all branches from 'tree' which are referenced less/equal than cut_count
1168	* returns: the reduced tree */
1169	{
1170	if (tree.exists) {
1171	switch (tree.full->typ) {
1172	case SINGLE_NODE: {
1173	if (tree.single->son.exists) tree.single->son = cut_dtree(tree.single->son, cut_count, memcount, leafcount);
1174
1175	if (!tree.single->son.exists) { // leaf
1176	if (tree.single->count<=cut_count) { // leaf with less/equal references
1177	DictTree brother = tree.single->brother;
1178
1179	gbm_free_mem(tree.single, sizeof(*tree.single), GBM_DICT_INDEX);
1180	(memcount) -= sizeof(tree.single);
1181	if (brother.exists) return cut_dtree(brother, cut_count, memcount, leafcount);
1182
1183	tree.single = NULL;
1184	break;
1185	}
1186	else {
1187	(*leafcount)++;
1188	}
1189	}
1190
1191	if (tree.single->brother.exists) tree.single->brother = cut_dtree(tree.single->brother, cut_count, memcount, leafcount);
1192	break;
1193	}
1194	case FULL_NODE: {
1195	int idx;
1196	int count = 0;
1197
1198	for (idx=0; idx<256; idx++) {
1199	if (tree.full->son[idx].exists) {
1200	tree.full->son[idx] = cut_dtree(tree.full->son[idx], cut_count, memcount, leafcount);
1201
1202	if (tree.full->son[idx].exists) count++;
1203	else tree.full->count[idx] = 0;
1204	}
1205	else if (tree.full->count[idx]>0) {
1206	if (tree.full->count[idx]<=cut_count) {
1207	tree.full->count[idx] = 0;
1208	}
1209	else {
1210	count++;
1211	(*leafcount)++;
1212	}
1213	}
1214	}
1215
1216	tree.full->usedSons = count;
1217
1218	if (!count) { // no more sons
1219	gbm_free_mem(tree.full, sizeof(*(tree.full)), GBM_DICT_INDEX);
1220	(memcount) -= sizeof((tree.full));
1221	tree.exists = NULL;
1222	}
1223
1224	break;
1225	}
1226	}
1227	}
1228
1229	return tree;
1230	}
1231	static DictTree cut_useless_words(DictTree tree, int deep, long *removed)
1232	/* removes/shortens all branches of 'tree' which are not useful for compression
1233	* 'deep' should be zero (incremented by cut_useless_words)
1234	* 'removed' will be set to the # of removed occurrences
1235	* returns: the reduced tree
1236	*/
1237	{
1238	*removed = 0;
1239
1240	if (tree.exists) {
1241	deep++;
1242
1243	switch (tree.full->typ) {
1244	long removed_single;
1245
1246	case SINGLE_NODE: {
1247	if (tree.single->son.exists) {
1248	tree.single->son = cut_useless_words(tree.single->son, deep, &removed_single);
1249	tree.single->count -= removed_single;
1250	*removed += removed_single;
1251	}
1252
1253	if (!tree.single->son.exists && !WORD_HELPFUL(deep, tree.single->count)) {
1254	DictTree brother = tree.single->brother;
1255
1256	*removed += tree.single->count;
1257	gbm_free_mem(tree.single, sizeof(*tree.single), GBM_DICT_INDEX);
1258
1259	if (brother.exists) {
1260	tree = cut_useless_words(brother, deep-1, &removed_single);
1261	*removed += removed_single;
1262	}
1263	else {
1264	tree.exists = NULL;
1265	}
1266
1267	break;
1268	}
1269
1270	if (tree.single->brother.exists) {
1271	tree.single->brother = cut_useless_words(tree.single->brother, deep-1, &removed_single);
1272	*removed += removed_single;
1273	}
1274
1275	break;
1276	}
1277	case FULL_NODE: {
1278	int idx;
1279	int count = 0;
1280
1281	for (idx=0; idx<256; idx++) {
1282	if (tree.full->son[idx].exists) {
1283	tree.full->son[idx] = cut_useless_words(tree.full->son[idx], deep, &removed_single);
1284	tree.full->count[idx] -= removed_single;
1285	*removed += removed_single;
1286	}
1287
1288	if (tree.full->son[idx].exists) {
1289	count++;
1290	}
1291	else if (tree.full->count[idx]) {
1292	if (!WORD_HELPFUL(deep, tree.full->count[idx])) { // useless!
1293	*removed += tree.full->count[idx];
1294	tree.full->count[idx] = 0;
1295	}
1296	else {
1297	count++;
1298	}
1299	}
1300	}
1301
1302	tree.full->usedSons = count;
1303
1304	if (!count) { // no more sons
1305	gbm_free_mem(tree.full, sizeof(*(tree.full)), GBM_DICT_INDEX);
1306	tree.exists = NULL;
1307	}
1308
1309	break;
1310	}
1311	}
1312	}
1313
1314	return tree;
1315	}
1316
1317	static DictTree add_dtree_to_dtree(DictTree toAdd, DictTree to, long *memcount)
1318	/* adds 'toAdd' as brother of 'to' (must be leftmost of all SINGLE_NODEs or a FULL_NODE)
1319	* returns: the leftmost of all SINGLE_NODEs or a FULL_NODE
1320	*/
1321	{
1322	DictTree tree = toAdd;
1323
1324	gb_assert(toAdd.single->typ==SINGLE_NODE);
1325
1326	if (to.exists) {
1327	switch (to.full->typ) {
1328	case SINGLE_NODE: {
1329	SingleDictTree *left = to.single;
1330
1331	gb_assert(left!=0);
1332
1333	if (toAdd.single->ch < to.single->ch) {
1334	toAdd.single->brother = to;
1335	return toAdd;
1336	}
1337
1338	while (to.single->brother.exists) {
1339	if (toAdd.single->ch < to.single->brother.single->ch) {
1340	toAdd.single->brother = to.single->brother;
1341	to.single->brother = toAdd;
1342
1343	tree.single = left;
1344	return tree;
1345	}
1346	to = to.single->brother;
1347	}
1348
1349	to.single->brother = toAdd;
1350	tree.single = left;
1351	break;
1352	}
1353	case FULL_NODE: {
1354	unsigned_char ch = toAdd.single->ch;
1355
1356	gb_assert(to.full->son[ch].exists==NULL);
1357	gb_assert(to.full->count[ch]==0); // if this fails, count must be added & tested
1358	gb_assert(toAdd.single->brother.exists==NULL);
1359
1360	to.full->son[ch] = toAdd.single->son;
1361	to.full->count[ch] = toAdd.single->count;
1362	to.full->usedSons++;
1363
1364	tree = to;
1365
1366	gbm_free_mem(toAdd.single, sizeof(*(toAdd.single)), GBM_DICT_INDEX);
1367	(*memcount) -= sizeof(toAdd.single);
1368
1369	break;
1370	}
1371	}
1372	}
1373
1374	return tree;
1375	}
1376
1377	static DictTree add_to_dtree(DictTree tree, cu_str text, long len, long *memcount)
1378	/* adds the string 'text' (which has length 'len') to 'tree'
1379	* returns: new tree
1380	*/
1381	{
1382	if (tree.exists) {
1383	switch (tree.full->typ) {
1384	case SINGLE_NODE: {
1385	SingleDictTree *t = tree.single;
1386	int count = 0;
1387
1388	do {
1389	count++;
1390	if (t->ch==text[0]) { // we found an existing subtree
1391	if (len>1) {
1392	t->son = add_to_dtree(t->son, text+1, len-1, memcount); // add rest of text to subtree
1393	}
1394	else {
1395	gb_assert(len==1);
1396	gb_assert(t->son.exists==NULL);
1397	t->count++;
1398	}
1399
1400	return count>MAX_BROTHERS ? single2full_dtree(tree, memcount) : tree;
1401	}
1402	else if (t->ch > text[0]) {
1403	break;
1404	}
1405	}
1406	while ((t=t->brother.single)!=NULL);
1407
1408	tree = add_dtree_to_dtree(new_dtree(text, len, memcount), // otherwise we create a new subtree
1409	count>MAX_BROTHERS ? single2full_dtree(tree, memcount) : tree,
1410	memcount);
1411	break;
1412	}
1413	case FULL_NODE: {
1414	unsigned_char ch = text[0];
1415
1416	if (tree.full->son[ch].exists) {
1417	tree.full->son[ch] = add_to_dtree(tree.full->son[ch], text+1, len-1, memcount);
1418	}
1419	else {
1420	tree.full->son[ch] = new_dtree(text+1, len-1, memcount);
1421	if (!tree.full->son[ch].exists) {
1422	if (tree.full->count[ch]==0) tree.full->usedSons++;
1423	tree.full->count[ch]++;
1424	}
1425	else {
1426	tree.full->usedSons++;
1427	}
1428	}
1429	break;
1430	}
1431	}
1432
1433	return tree;
1434	}
1435
1436	return new_dtree(text, len, memcount);
1437	}
1438
1439	static long calcCounts(DictTree tree)
1440	{
1441	long cnt = 0;
1442
1443	gb_assert(tree.exists!=0);
1444
1445	switch (tree.full->typ) {
1446	case SINGLE_NODE: {
1447	while (tree.exists) {
1448	if (tree.single->son.exists) tree.single->count = calcCounts(tree.single->son);
1449	gb_assert(tree.single->count>0);
1450	cnt += tree.single->count;
1451	tree = tree.single->brother;
1452	}
1453	break;
1454	}
1455	case FULL_NODE: {
1456	int idx;
1457
1458	for (idx=0; idx<256; idx++) {
1459	if (tree.full->son[idx].exists) {
1460	tree.full->count[idx] = calcCounts(tree.full->son[idx]);
1461	gb_assert(tree.full->count[idx]>0);
1462	}
1463	else {
1464	gb_assert(tree.full->count[idx]>=0);
1465	}
1466	cnt += tree.full->count[idx];
1467	}
1468	break;
1469	}
1470	}
1471
1472	return cnt;
1473	}
1474
1475	static int count_dtree_leafs(DictTree tree, int deep, int *maxdeep) {
1476	// returns # of leafs and max. depth of tree
1477	int leafs = 0;
1478
1479	gb_assert(tree.exists!=0);
1480
1481	if (++deep>maxdeep) maxdeep = deep;
1482
1483	switch (tree.full->typ) {
1484	case SINGLE_NODE: {
1485	if (tree.single->son.exists) leafs += count_dtree_leafs(tree.single->son, deep, maxdeep);
1486	else leafs++;
1487	if (tree.single->brother.exists) leafs += count_dtree_leafs(tree.single->brother, deep, maxdeep);
1488	break;
1489	}
1490	case FULL_NODE: {
1491	int idx;
1492
1493	for (idx=0; idx<256; idx++) {
1494	if (tree.full->son[idx].exists) leafs += count_dtree_leafs(tree.full->son[idx], deep, maxdeep);
1495	else if (tree.full->count[idx]) leafs++;
1496	}
1497	break;
1498	}
1499	}
1500
1501	return leafs;
1502	}
1503
1504	static int COUNT(DictTree tree) {
1505	// counts sum of # of occurrences of tree
1506	int cnt = 0;
1507
1508	switch (tree.single->typ) {
1509	case SINGLE_NODE: {
1510	while (tree.exists) {
1511	cnt += tree.single->count;
1512	tree = tree.single->brother;
1513	}
1514	break;
1515	}
1516	case FULL_NODE: {
1517	int idx;
1518
1519	for (idx=0; idx<256; idx++) cnt += tree.full->count[idx];
1520	break;
1521	}
1522	}
1523
1524	return cnt;
1525	}
1526
1527	static DictTree removeSubsequentString(DictTree *tree_pntr, cu_str buffer, int len, int max_occur) {
1528	/* searches tree for 'buffer' (length='len')
1529	*
1530	* returns - rest below found string
1531	* (if found and if the # of occurrences of the string is less/equal than 'max_occur')
1532	* - NULL otherwise
1533	*
1534	* removes the whole found string from the tree (not only the rest!)
1535	*/
1536	DictTree tree = *tree_pntr, rest;
1537	static int restCount;
1538
1539	rest.exists = NULL;
1540
1541	gb_assert(tree.exists!=0);
1542	gb_assert(len>0);
1543
1544	switch (tree.full->typ) {
1545	case SINGLE_NODE: {
1546	while (tree.single->ch <= buffer[0]) {
1547	if (tree.single->ch == buffer[0]) { // found wanted character
1548	if (tree.single->son.exists) {
1549	if (len==1) {
1550	if (tree.single->count <= max_occur) {
1551	rest = tree.single->son;
1552	restCount = COUNT(rest);
1553	tree.single->son.exists = NULL;
1554	}
1555	}
1556	else {
1557	rest = removeSubsequentString(&tree.single->son, buffer+1, len-1, max_occur);
1558	}
1559	}
1560
1561	if (rest.exists) { // the string was found
1562	tree.single->count -= restCount;
1563	gb_assert(tree.single->count >= 0);
1564
1565	if (!tree.single->count) { // empty subtree -> delete myself
1566	DictTree brother = tree.single->brother;
1567
1568	tree.single->brother.exists = NULL; // elsewise it would be freed by free_dtree
1569	free_dtree(tree);
1570	*tree_pntr = tree = brother;
1571	}
1572	}
1573
1574	break;
1575	}
1576
1577	tree_pntr = &(tree.single->brother);
1578	if (!(tree = tree.single->brother).exists) break;
1579	}
1580
1581	break;
1582	}
1583	case FULL_NODE: {
1584	unsigned_char ch;
1585
1586	if (tree.full->son[ch=buffer[0]].exists) {
1587	if (len==1) {
1588	if (tree.full->count[ch] <= max_occur) {
1589	rest = tree.full->son[ch];
1590	restCount = COUNT(rest);
1591	tree.full->son[ch].exists = NULL;
1592	}
1593	}
1594	else {
1595	rest = removeSubsequentString(&tree.full->son[ch], buffer+1, len-1, max_occur);
1596	}
1597
1598	if (rest.exists) {
1599	gb_assert(restCount>0);
1600	tree.full->count[ch] -= restCount;
1601	gb_assert(tree.full->count[ch]>=0);
1602	if (tree.full->count[ch]==0) {
1603	gb_assert(tree.full->son[ch].exists==NULL);
1604
1605	if (--tree.full->usedSons==0) { // last son deleted -> delete myself
1606	free_dtree(tree);
1607	tree.exists = NULL;
1608	*tree_pntr = tree;
1609	}
1610	}
1611	}
1612	}
1613
1614	break;
1615	}
1616	}
1617
1618	return rest;
1619	}
1620
1621	static cu_str memstr(cu_str stringStart, int stringStartLen, cu_str inString, int inStringLen) {
1622	if (!inStringLen) return stringStart; // string of length zero is found everywhere
1623
1624	while (stringStartLen) {
1625	cu_str found = (cu_str)memchr(stringStart, inString[0], stringStartLen);
1626
1627	if (!found) break;
1628
1629	stringStartLen -= found-stringStart;
1630	stringStart = found;
1631
1632	if (stringStartLen<inStringLen) break;
1633
1634	if (GB_MEMCMP(stringStart, inString, inStringLen)==0) return stringStart;
1635
1636	stringStart++;
1637	stringStartLen--;
1638	}
1639
1640	return NULL;
1641	}
1642
1643
1644	static int expandBranches(u_str buffer, int deep, int minwordlen, int maxdeep, DictTree tree, DictTree root, int max_percent) {
1645	/* expands all branches in 'tree'
1646	*
1647	* this is done by searching every of these branches in 'root' and moving any subsequent parts from there to 'tree'
1648	* (this is only done, if the # of occurrences of the found part does not exceed the # of occurrences of 'tree' more than 'max_percent' percent)
1649	*
1650	* 'buffer' strings are rebuild here while descending the tree (length of buffer==MAX_WORD_LEN)
1651	* 'deep' recursion level
1652	* 'maxdeep' maximum recursion level
1653	* 'minwordlen' is the length of the words to search (usually equal to MIN_WORD_LEN-1)
1654	*
1655	* returns the # of occurrences which were added to 'tree'
1656	*/
1657	int expand = 0; // calculate count-sum of added subsequent parts
1658
1659	gb_assert(tree.exists!=0);
1660
1661	if (deep<maxdeep) {
1662	switch (tree.full->typ) {
1663	case SINGLE_NODE: {
1664	while (tree.exists) {
1665	buffer[deep] = tree.single->ch;
1666
1667	if (!tree.single->son.exists) {
1668	DictTree rest;
1669	u_str buf = buffer+1;
1670	int len = deep;
1671
1672	if (len>minwordlen) { // do not search more than MIN_WORD_LEN-1 chars
1673	buf += len-minwordlen;
1674	len = minwordlen;
1675	}
1676
1677	if (len==minwordlen) {
1678	cu_str self = memstr(buffer, deep+1, buf, len);
1679
1680	gb_assert(self!=0);
1681	if (self==buf) rest = removeSubsequentString(&root, buf, len, ((100+max_percent)*tree.single->count)/100);
1682	else rest.exists = NULL;
1683	}
1684	else {
1685	rest.exists = NULL;
1686	}
1687
1688	if (rest.exists) {
1689	int cnt = COUNT(rest);
1690
1691	tree.single->son = rest;
1692	tree.single->count += cnt;
1693	expand += cnt;
1694	#ifdef DUMP_EXPAND
1695	#define DUMP_MORE 1
1696	printf("expanding '%s'", lstr(buffer, deep+1+DUMP_MORE));
1697	printf(" (searching for '%s') -> found %i nodes\n", lstr(buf, len+DUMP_MORE), cnt);
1698	#endif
1699	}
1700	}
1701
1702	if (tree.single->son.exists) {
1703	int added = expandBranches(buffer, deep+1, minwordlen, maxdeep, tree.single->son, root, max_percent);
1704
1705	expand += added;
1706	tree.single->count += added;
1707	}
1708
1709	tree = tree.single->brother;
1710	}
1711
1712	break;
1713	}
1714	case FULL_NODE: {
1715	int idx;
1716
1717	for (idx=0; idx<256; idx++) {
1718	buffer[deep] = idx;
1719
1720	if (!tree.full->son[idx].exists && tree.full->count[idx]) { // leaf
1721	DictTree rest;
1722	u_str buf = buffer+1;
1723	int len = deep;
1724
1725	if (len>minwordlen) { // do not search more than MIN_WORD_LEN-1 chars
1726	buf += len-minwordlen;
1727	len = minwordlen;
1728	}
1729
1730	if (len==minwordlen) {
1731	cu_str self = memstr(buffer, deep+1, buf, len);
1732
1733	gb_assert(self!=0);
1734	if (self==buf)
1735	rest = removeSubsequentString(&root, buf, len, ((100+max_percent)*tree.full->count[idx])/100);
1736	else
1737	rest.exists = NULL;
1738	}
1739	else {
1740	rest.exists = NULL;
1741	}
1742
1743	if (rest.exists) { // substring found!
1744	int cnt = COUNT(rest);
1745
1746	if (tree.full->count[idx]==0) tree.full->usedSons++;
1747	tree.full->son[idx] = rest;
1748	tree.full->count[idx] += cnt;
1749
1750	expand += cnt;
1751	#ifdef DUMP_EXPAND
1752	printf("expanding '%s'", lstr(buffer, deep+1+DUMP_MORE));
1753	printf(" (searching for '%s') -> found %i nodes\n", lstr(buf, len+DUMP_MORE), cnt);
1754	#endif
1755	}
1756	}
1757
1758	if (tree.full->son[idx].exists) {
1759	int added = expandBranches(buffer, deep+1, minwordlen, maxdeep, tree.full->son[idx], root, max_percent);
1760
1761	expand += added;
1762	tree.full->count[idx] += added;
1763	}
1764	}
1765
1766	break;
1767	}
1768	}
1769	}
1770
1771	return expand;
1772	}
1773
1774	static DictTree build_dict_tree(O_gbdByKey gbk, long maxmem, long maxdeep, long minwordlen, long data_sum)
1775	/* builds a tree of the most used words
1776	*
1777	* 'maxmem' is the amount of memory that will be allocated
1778	* 'maxdeep' is the maximum length of the _returned_ words
1779	* 'minwordlen' is the minimum length a word needs to get into the tree
1780	* this is used in the first pass as maximum tree depth
1781	* 'data_sum' will be set to the overall-size of data of which the tree was built
1782	*/
1783	{
1784	DictTree tree;
1785	long memcount = 0;
1786	long leafs = 0;
1787
1788	*data_sum = 0;
1789
1790	{
1791	int cnt;
1792	long lowmem = (maxmem*9)/10;
1793	int cut_count = 1;
1794
1795	// Build 8-level-deep tree of all existing words
1796
1797	tree.exists = NULL; // empty tree
1798
1799	for (cnt=0; cnt<gbk->cnt; cnt++) {
1800	GBDATA *gbd = gbk->gbds[cnt];
1801	int type = GB_TYPE(gbd);
1802
1803	if (COMPRESSIBLE(type)) {
1804	long size;
1805	cu_str data = get_data_n_size(gbd, &size);
1806	cu_str lastWord;
1807
1808	if (type==GB_STRING \|\| type == GB_LINK) size--;
1809	if (size<minwordlen) continue;
1810
1811	*data_sum += size;
1812	lastWord = data+size-minwordlen;
1813
1814	#ifdef SELECT_WORDS
1815	if (strnstr(data, size, SELECTED_WORDS)) // test some words only
1816	#endif
1817	{
1818
1819	for (; data<=lastWord; data++) {
1820	tree = add_to_dtree(tree, data, minwordlen, &memcount);
1821
1822	while (memcount>maxmem) {
1823	leafs = 0;
1824	tree = cut_dtree(tree, cut_count, &memcount, &leafs);
1825	if (memcount<=lowmem) break;
1826	cut_count++;
1827	}
1828	}
1829	}
1830	}
1831	}
1832	}
1833
1834	{
1835	int cutoff = 1;
1836
1837	leafs = 0;
1838	tree = cut_dtree(tree, cutoff, &memcount, &leafs); // cut all single elements
1839	test_dtree(tree);
1840
1841	#if defined(DEBUG)
1842	if (tree.exists) {
1843	int maxdeep2 = 0;
1844	long counted = count_dtree_leafs(tree, 0, &maxdeep2);
1845	gb_assert(leafs == counted);
1846	}
1847	#endif // DEBUG
1848
1849	// avoid directory overflow (max. 18bit)
1850	while (leafs >= MAX_LONG_INDEX) {
1851	leafs = 0;
1852	++cutoff;
1853	#if defined(DEBUG)
1854	printf("Directory overflow (%li) -- reducing size (cutoff = %i)\n", leafs, cutoff);
1855	#endif // DEBUG
1856	tree = cut_dtree(tree, cutoff, &memcount, &leafs);
1857	}
1858	}
1859	#ifdef DUMP_TREE
1860	printf("----------------------- tree with short branches:\n");
1861	dump_dtree(0, tree);
1862	printf("---------------------------\n");
1863	#endif
1864
1865	// Try to create longer branches
1866
1867	if (tree.exists) {
1868	int add_count;
1869	u_str buffer = (u_str)gbm_get_mem(maxdeep, GBM_DICT_INDEX);
1870	int max_differ;
1871	long dummy;
1872
1873	if (tree.full->typ != FULL_NODE) tree = single2full_dtree(tree, &memcount); // ensure root is FULL_NODE
1874
1875	test_dtree(tree);
1876	calcCounts(tree); // calculate counters of inner nodes
1877	testCounts(tree);
1878
1879	for (max_differ=0; max_differ<=MAX_DIFFER; max_differ+=INCR_DIFFER) { // percent of allowed difference for concatenating tree branches
1880	do {
1881	int idx;
1882	add_count = 0;
1883
1884	for (idx=0; idx<256; idx++) {
1885	if (tree.full->son[idx].exists) {
1886	int added;
1887
1888	buffer[0] = idx;
1889	added = expandBranches(buffer, 1, minwordlen-1, maxdeep, tree.full->son[idx], tree, max_differ);
1890	tree.full->count[idx] += added;
1891	add_count += added;
1892	}
1893	}
1894	}
1895	while (add_count);
1896	}
1897
1898	gbm_free_mem(buffer, maxdeep, GBM_DICT_INDEX);
1899
1900	tree = cut_useless_words(tree, 0, &dummy);
1901	}
1902
1903	#ifdef DUMP_TREE
1904	printf("----------------------- tree with expanded branches:\n");
1905	dump_dtree(0, tree);
1906	printf("-----------------------\n");
1907	#endif
1908	testCounts(tree);
1909
1910	return tree;
1911	}
1912
1913	static DictTree remove_word_from_dtree(DictTree tree, cu_str wordStart, int wordLen, u_str resultBuffer, int resultLen, long resultFrequency, long *removed) {
1914	/* searches 'tree' for a word starting with 'wordStart' an removes it from the tree
1915	* if there are more than one possibilities, the returned word will be the one with the most occurrences
1916	* if there was no possibility -> resultLen==0, tree unchanged
1917	* otherwise: resultBuffer contains the word, returns new tree with word removed
1918	*/
1919	long removed_single = 0;
1920	gb_assert(tree.exists!=0);
1921	*removed = 0;
1922
1923	if (wordLen) { // search wanted path into tree
1924	switch (tree.full->typ) {
1925	case SINGLE_NODE: {
1926	if (tree.single->ch==*wordStart) {
1927	resultBuffer = wordStart;
1928
1929	if (tree.single->son.exists) {
1930	gb_assert(tree.single->count>0);
1931	tree.single->son = remove_word_from_dtree(tree.single->son, wordStart+1, wordLen-1,
1932	resultBuffer+1, resultLen, resultFrequency,
1933	&removed_single);
1934	if (*resultLen) { // word removed
1935	gb_assert(tree.single->count>=removed_single);
1936	tree.single->count -= removed_single;
1937	*removed += removed_single;
1938	(*resultLen)++;
1939	}
1940	}
1941	else {
1942	*resultLen = wordLen==1; // if wordLen==1 -> fully overlapping word found
1943	*resultFrequency = tree.single->count;
1944	}
1945
1946	if (!tree.single->son.exists && *resultLen) { // if no son and a word was found -> remove branch
1947	DictTree brother = tree.single->brother;
1948
1949	*removed += tree.single->count;
1950	gbm_free_mem(tree.single, sizeof(*tree.single), GBM_DICT_INDEX);
1951
1952	if (brother.exists) tree = brother;
1953	else tree.exists = NULL;
1954	}
1955	}
1956	else if (tree.single->ch < *wordStart && tree.single->brother.exists) {
1957	tree.single->brother = remove_word_from_dtree(tree.single->brother, wordStart, wordLen,
1958	resultBuffer, resultLen, resultFrequency,
1959	&removed_single);
1960	if (resultLen) removed += removed_single;
1961	}
1962	else {
1963	*resultLen = 0; // not found
1964	}
1965
1966	break;
1967	}
1968	case FULL_NODE: {
1969	unsigned_char ch = *wordStart;
1970	*resultBuffer = ch;
1971
1972	if (tree.full->son[ch].exists) {
1973	tree.full->son[ch] = remove_word_from_dtree(tree.full->son[ch], wordStart+1, wordLen-1,
1974	resultBuffer+1, resultLen, resultFrequency,
1975	&removed_single);
1976	if (*resultLen) {
1977	if (tree.full->son[ch].exists) { // another son?
1978	tree.full->count[ch] -= removed_single;
1979	}
1980	else { // last son -> remove whole branch
1981	removed_single = tree.full->count[ch];
1982	tree.full->count[ch] = 0;
1983	tree.full->usedSons--;
1984	}
1985
1986	*removed += removed_single;
1987	(*resultLen)++;
1988	}
1989	}
1990	else if (tree.full->count[ch]) {
1991	*resultLen = (wordLen==1);
1992
1993	if (*resultLen) {
1994	removed += removed_single = resultFrequency = tree.full->count[ch];
1995	tree.full->count[ch] = 0;
1996	tree.full->usedSons--;
1997	}
1998	}
1999	else {
2000	*resultLen = 0; // not found
2001	}
2002
2003	if (!tree.full->usedSons) {
2004	free_dtree(tree);
2005	tree.exists = NULL;
2006	}
2007
2008	break;
2009	}
2010	}
2011	}
2012	else { // take any word
2013	switch (tree.full->typ) {
2014	case SINGLE_NODE: {
2015	*resultBuffer = tree.single->ch;
2016	gb_assert(tree.single->count>0);
2017
2018	if (tree.single->son.exists) {
2019	tree.single->son = remove_word_from_dtree(tree.single->son, wordStart, wordLen,
2020	resultBuffer+1, resultLen, resultFrequency,
2021	&removed_single);
2022	gb_assert(*resultLen);
2023	(*resultLen)++;
2024	}
2025	else {
2026	*resultLen = 1;
2027	*resultFrequency = tree.single->count;
2028	removed_single = tree.single->count;
2029	}
2030
2031	gb_assert(*resultFrequency>0);
2032
2033	if (tree.single->son.exists) {
2034	gb_assert(tree.single->count>=removed_single);
2035	tree.single->count -= removed_single;
2036	*removed += removed_single;
2037	}
2038	else {
2039	DictTree brother = tree.single->brother;
2040
2041	*removed += tree.single->count;
2042	gbm_free_mem(tree.single, sizeof(*tree.single), GBM_DICT_INDEX);
2043
2044	if (brother.exists) tree = brother;
2045	else tree.exists = NULL;
2046	}
2047
2048	break;
2049	}
2050	case FULL_NODE: {
2051	int idx;
2052
2053	for (idx=0; idx<256; idx++) {
2054	if (tree.full->son[idx].exists) {
2055	*resultBuffer = idx;
2056	tree.full->son[idx] = remove_word_from_dtree(tree.full->son[idx], wordStart, wordLen,
2057	resultBuffer+1, resultLen, resultFrequency,
2058	&removed_single);
2059	gb_assert(*resultLen);
2060	(*resultLen)++;
2061
2062	if (!tree.full->son[idx].exists) { // son branch removed -> zero count
2063	removed_single = tree.full->count[idx];
2064	tree.full->count[idx] = 0;
2065	tree.full->usedSons--;
2066	}
2067	else {
2068	tree.full->count[idx] -= removed_single;
2069	gb_assert(tree.full->count[idx]>0);
2070	}
2071
2072	break;
2073	}
2074	else if (tree.full->count[idx]) {
2075	*resultBuffer = idx;
2076	*resultLen = 1;
2077	*resultFrequency = tree.full->count[idx];
2078	removed_single = tree.full->count[idx];
2079	tree.full->count[idx] = 0;
2080	tree.full->usedSons--;
2081	break;
2082	}
2083	}
2084
2085	gb_assert(idx<256); // gb_assert break was used to exit loop (== node had a son)
2086
2087	*removed += removed_single;
2088
2089	if (!tree.full->usedSons) {
2090	free_dtree(tree);
2091	tree.exists = NULL;
2092	}
2093
2094	break;
2095	}
2096	}
2097	}
2098
2099	#ifdef DEBUG
2100	if (*resultLen) {
2101	gb_assert(*resultLen>0);
2102	gb_assert(*resultFrequency>0);
2103	gb_assert(*resultLen>=wordLen);
2104	}
2105	#endif
2106
2107	return tree;
2108	}
2109
2110	#define cmp(i1, i2) (heap2[i1]-heap2[i2])
2111	#define swap(i1, i2) do \
2112	{ \
2113	int s = heap[i1]; \
2114	heap[i1] = heap[i2]; \
2115	heap[i2] = s; \
2116	\
2117	s = heap2[i1]; \
2118	heap2[i1] = heap2[i2]; \
2119	heap2[i2] = s; \
2120	} \
2121	while (0)
2122
2123	static void downheap(int heap, int heap2, int me, int num) {
2124	int lson = me*2;
2125	int rson = lson+1;
2126
2127	gb_assert(me>=1);
2128	if (lson>num) return;
2129
2130	if (cmp(lson, me)<0) { // left son smaller than me? (we sort in descending order!!!)
2131	if (rson<=num && cmp(lson, rson)>0) { // right son smaller than left son?
2132	swap(me, rson);
2133	downheap(heap, heap2, rson, num);
2134	}
2135	else {
2136	swap(me, lson);
2137	downheap(heap, heap2, lson, num);
2138	}
2139	}
2140	else if (rson<=num && cmp(me, rson)>0) { // right son smaller than me?
2141	swap(me, rson);
2142	downheap(heap, heap2, rson, num);
2143	}
2144	}
2145
2146	#undef cmp
2147	#undef swap
2148
2149
2150
2151	#define cmp(i1, i2) GB_MEMCMP(dict->text+dict->offsets[heap[i1]], dict->text+dict->offsets[heap[i2]], dict->textlen)
2152	#define swap(i1, i2) do { int s = heap[i1]; heap[i1] = heap[i2]; heap[i2] = s; } while (0)
2153
2154	static void downheap2(int heap, GB_DICTIONARY dict, int me, int num) {
2155	int lson = me*2;
2156	int rson = lson+1;
2157
2158	gb_assert(me>=1);
2159	if (lson>num) return;
2160
2161	if (cmp(lson, me)>0) { // left son bigger than me?
2162	if (rson<=num && cmp(lson, rson)<0) { // right son bigger than left son?
2163	swap(me, rson);
2164	downheap2(heap, dict, rson, num);
2165	}
2166	else {
2167	swap(me, lson);
2168	downheap2(heap, dict, lson, num);
2169	}
2170	}
2171	else if (rson<=num && cmp(me, rson)<0) { // right son bigger than me?
2172	swap(me, rson);
2173	downheap2(heap, dict, rson, num);
2174	}
2175	}
2176
2177	#undef cmp
2178	#undef swap
2179
2180	static void sort_dict_offsets(GB_DICTIONARY *dict) {
2181	/* 1. sorts the 'dict->offsets' by frequency
2182	* (frequency of each offset is stored in the 'dict->resort' with the same index)
2183	* 2. initializes & sorts 'dict->resort' in alphabetic order
2184	*/
2185	int i;
2186	int num = dict->words;
2187	int *heap = dict->offsets-1;
2188	int *heap2 = dict->resort-1;
2189
2190	// sort offsets
2191
2192	for (i=num/2; i>=1; i--) downheap(heap, heap2, i, num); // make heap
2193
2194	while (num>1) { // sort heap
2195	int big = heap[1];
2196	int big2 = heap2[1];
2197
2198	heap[1] = heap[num];
2199	heap2[1] = heap2[num];
2200
2201	downheap(heap, heap2, 1, num-1);
2202
2203	heap[num] = big;
2204	heap2[num] = big2;
2205
2206	num--;
2207	}
2208
2209	// initialize dict->resort
2210
2211	for (i=0, num=dict->words; i<num; i++) dict->resort[i] = i;
2212
2213	// sort dictionary alphabetically
2214
2215	for (i=num/2; i>=1; i--) downheap2(heap2, dict, i, num); // make heap
2216
2217	while (num>1) {
2218	int big = heap2[1];
2219
2220	heap2[1] = heap2[num];
2221	downheap2(heap2, dict, 1, num-1);
2222	heap2[num] = big;
2223	num--;
2224	}
2225	}
2226
2227	// Warning dictionary is not in network byte order !!!!
2228	static GB_DICTIONARY gb_create_dictionary(O_gbdByKey gbk, long maxmem) {
2229	long data_sum;
2230	DictTree tree = build_dict_tree(gbk, maxmem, MAX_WORD_LEN, MIN_WORD_LEN, &data_sum);
2231
2232	if (tree.exists) {
2233	GB_DICTIONARY dict = (GB_DICTIONARY)gbm_get_mem(sizeof(*dict), GBM_DICT_INDEX);
2234	int maxdeep = 0;
2235	int words = count_dtree_leafs(tree, 0, &maxdeep);
2236	u_str word;
2237
2238	int wordLen;
2239	long wordFrequency;
2240	int offset = 0; // next free position in dict->text
2241	int overlap = 0; // # of bytes overlapping with last word
2242	u_str buffer;
2243	long dummy;
2244	long word_sum = 0;
2245	long overlap_sum = 0;
2246	long max_overlap = 0;
2247
2248	// reduce tree as long as it has to many leafs (>MAX_LONG_INDEX)
2249	while (words >= MAX_LONG_INDEX) {
2250
2251	words = count_dtree_leafs(tree, 0, &maxdeep);
2252	}
2253
2254	buffer = (u_str)gbm_get_mem(maxdeep, GBM_DICT_INDEX);
2255
2256	calcCounts(tree);
2257	testCounts(tree);
2258
2259	#if DEBUG
2260	printf(" examined data was %li bytes\n", data_sum);
2261	printf(" tree contains %i words *** maximum tree depth = %i\n", words, maxdeep);
2262	#endif
2263
2264	dict->words = 0;
2265	dict->textlen = DICT_STRING_INCR;
2266	dict->text = (u_str)gbm_get_mem(DICT_STRING_INCR, GBM_DICT_INDEX);
2267	dict->offsets = (GB_NINT)gbm_get_mem(sizeof((dict->offsets))*words, GBM_DICT_INDEX);
2268	dict->resort = (GB_NINT)gbm_get_mem(sizeof((dict->resort))*words, GBM_DICT_INDEX);
2269
2270	memset(buffer, '*', maxdeep);
2271	tree = remove_word_from_dtree(tree, NULL, 0, buffer, &wordLen, &wordFrequency, &dummy);
2272	testCounts(tree);
2273
2274	while (1) {
2275	int nextWordLen = 0;
2276	int len;
2277
2278	#if DUMP_COMPRESSION_TEST>=4
2279	printf("word='%s' (occur=%li overlap=%i)\n", lstr(buffer, wordLen), wordFrequency, overlap);
2280	#endif
2281
2282	overlap_sum += overlap;
2283	if (overlap>max_overlap) max_overlap = overlap;
2284	word_sum += wordLen;
2285
2286	if (offset-overlap+wordLen > dict->textlen) { // if not enough space allocated -> realloc dictionary string
2287	u_str ntext = (u_str)gbm_get_mem(dict->textlen+DICT_STRING_INCR, GBM_DICT_INDEX);
2288
2289	memcpy(ntext, dict->text, dict->textlen);
2290	gbm_free_mem(dict->text, dict->textlen, GBM_DICT_INDEX);
2291
2292	dict->text = ntext;
2293	dict->textlen += DICT_STRING_INCR;
2294	}
2295
2296	dict->offsets[dict->words] = offset-overlap;
2297	dict->resort[dict->words] = wordFrequency; // temporarily miss-use this to store frequency
2298	dict->words++;
2299
2300	word = dict->text+offset-overlap;
2301	gb_assert(overlap==0 \|\| GB_MEMCMP(word, buffer, overlap)==0); // test overlapping string-part
2302	memcpy(word, buffer, wordLen); // word -> dictionary string
2303	offset += wordLen-overlap;
2304
2305	if (!tree.exists) break;
2306
2307	for (len=min(10, wordLen-1); len>=0 && nextWordLen==0; len--) {
2308	memset(buffer, '*', maxdeep);
2309	tree = remove_word_from_dtree(tree, word+wordLen-len, len, buffer, &nextWordLen, &wordFrequency, &dummy);
2310	overlap = len;
2311	}
2312
2313	wordLen = nextWordLen;
2314	}
2315
2316	gb_assert(dict->words <= MAX_LONG_INDEX);
2317	gb_assert(dict->words==words); /* dict->words == # of words stored in dictionary string
2318	* words == # of words pre-calculated */
2319
2320	#if DEBUG
2321	printf(" word_sum=%li overlap_sum=%li (%li%%) max_overlap=%li\n",
2322	word_sum, overlap_sum, (overlap_sum*100)/word_sum, max_overlap);
2323	#endif
2324
2325	if (offset<dict->textlen) { // reallocate dict->text if it was allocated too large
2326	u_str ntext = (u_str)gbm_get_mem(offset, GBM_DICT_INDEX);
2327
2328	memcpy(ntext, dict->text, offset);
2329	gbm_free_mem(dict->text, dict->textlen, GBM_DICT_INDEX);
2330
2331	dict->text = ntext;
2332	dict->textlen = offset;
2333	}
2334
2335	sort_dict_offsets(dict);
2336
2337	gbm_free_mem(buffer, maxdeep, GBM_DICT_INDEX);
2338	free_dtree(tree);
2339
2340	return dict;
2341	}
2342
2343	return NULL;
2344	}
2345
2346	static GB_ERROR readAndWrite(O_gbdByKey *gbkp) {
2347	int i;
2348	GB_ERROR error = 0;
2349
2350	for (i=0; i<gbkp->cnt && !error; i++) {
2351	GBDATA *gbd = gbkp->gbds[i];
2352	int type = GB_TYPE(gbd);
2353
2354	if (COMPRESSIBLE(type)) {
2355	long size;
2356	char *data;
2357
2358	{
2359	char d = (char)get_data_n_size(gbd, &size);
2360
2361	data = (char*)gbm_get_mem(size, GBM_DICT_INDEX);
2362	memcpy(data, d, size);
2363	gb_assert(data[size-1] == 0);
2364	}
2365
2366	switch (type) {
2367	case GB_STRING:
2368	error = GB_write_string(gbd, "");
2369	if (!error) error = GB_write_string(gbd, data);
2370	break;
2371	case GB_LINK:
2372	error = GB_write_link(gbd, "");
2373	if (!error) error = GB_write_link(gbd, data);
2374	break;
2375	case GB_BYTES:
2376	error = GB_write_bytes(gbd, 0, 0);
2377	if (!error) error = GB_write_bytes(gbd, data, size);
2378	break;
2379	case GB_INTS:
2380	error = GB_write_ints(gbd, (GB_UINT4 *)0, 0);
2381	if (!error) error = GB_write_ints(gbd, (GB_UINT4 *)data, size);
2382	break;
2383	case GB_FLOATS:
2384	error = GB_write_floats(gbd, (float*)0, 0);
2385	if (!error) error = GB_write_floats(gbd, (float*)data, size);
2386	break;
2387	default:
2388	gb_assert(0);
2389	break;
2390	}
2391
2392	gbm_free_mem(data, size, GBM_DICT_INDEX);
2393	}
2394	}
2395	return error;
2396	}
2397
2398	static GB_ERROR gb_create_dictionaries(GB_MAIN_TYPE *Main, long maxmem) {
2399	GB_ERROR error = NULL;
2400	#if defined(TEST_DICT)
2401	long uncompressed_sum = 0;
2402	long compressed_sum = 0;
2403	#endif // TEST_DICT
2404
2405	printf("Creating GBDATA-Arrays..\n");
2406
2407	if (!error) {
2408	O_gbdByKey *gbk = g_b_opti_createGbdByKey(Main);
2409	int idx;
2410
2411	printf("Creating dictionaries..\n");
2412
2413	#ifdef DEBUG
2414	// #define TEST_ONE // test only key specified below
2415	// #define TEST_SOME // test only some keys specified below
2416	#if defined(TEST_ONE)
2417	// select wanted index
2418	for (idx=0; idx<gbdByKey_cnt; idx++) { // title author dew_author ebi_journal name ua_tax date full_name ua_title
2419	if (gbk[idx].cnt && strcmp(Main->keys[idx].key, "tree")==0) break;
2420	}
2421	gb_assert(idx<gbdByKey_cnt);
2422	#endif
2423	#endif
2424
2425	#ifdef TEST_ONE
2426	// only create dictionary for index selected above (no loop)
2427	#else
2428	// create dictionaries for all indices (this is the normal operation)
2429	arb_progress progress("Optimizing key data", gbdByKey_cnt-1);
2430	for (idx = gbdByKey_cnt-1; idx >= 1 && !error; --idx, progress.inc_and_check_user_abort(error))
2431	#endif
2432
2433	{
2434	GB_DICTIONARY *dict;
2435	int compression_mask;
2436	GB_CSTR key_name = Main->keys[idx].key;
2437	int type;
2438	GBDATA gb_main = (GBDATA)Main->data;
2439
2440	#ifdef TEST_SOME
2441	if (!( // add all wanted keys here
2442	strcmp(key_name, "REF") == 0 \|\|
2443	strcmp(key_name, "ref") == 0
2444	)) continue;
2445	#endif // TEST_SOME
2446
2447	#ifndef TEST_ONE
2448	if (!gbk[idx].cnt) continue; // there are no entries with this quark
2449
2450	type = GB_TYPE(gbk[idx].gbds[0]);
2451	GB_begin_transaction(gb_main);
2452	compression_mask = gb_get_compression_mask(Main, idx, type);
2453	GB_commit_transaction(gb_main);
2454
2455	if ((compression_mask & GB_COMPRESSION_DICTIONARY) == 0) continue; // compression with dictionary is not allowed
2456	if (strcmp(key_name, "data") == 0) continue;
2457	if (strcmp(key_name, "quality") == 0) continue;
2458	#endif
2459
2460	printf("- dictionary for '%s' (idx=%i)\n", key_name, idx);
2461	GB_begin_transaction(gb_main);
2462	dict = gb_create_dictionary(&(gbk[idx]), maxmem);
2463
2464	if (dict) {
2465	/* decompress with old dictionary and write
2466	all data of actual type without compression: */
2467
2468	printf(" * Uncompressing all with old dictionary ...\n");
2469
2470	{
2471	int old_compression_mask = Main->keys[idx].compression_mask;
2472
2473	Main->keys[idx].compression_mask &= ~GB_COMPRESSION_DICTIONARY;
2474	error = readAndWrite(&gbk[idx]);
2475	Main->keys[idx].compression_mask = old_compression_mask;
2476	}
2477
2478	if (!error) {
2479	/* dictionary is saved in the following format:
2480	*
2481	* GB_NINT size
2482	* GB_NINT offsets[dict->words]
2483	* GB_NINT resort[dict->words]
2484	* char *text
2485	*/
2486
2487	int dict_buffer_size = sizeof(GB_NINT) * (1+dict->words*2) + dict->textlen;
2488	char dict_buffer = (char)gbm_get_mem(dict_buffer_size, GBM_DICT_INDEX);
2489	long old_dict_buffer_size;
2490	char *old_dict_buffer;
2491
2492	{
2493	GB_NINT nint = (GB_NINT)dict_buffer;
2494	int n;
2495
2496	*nint++ = htonl(dict->words);
2497	for (n=0; n<dict->words; n++) *nint++ = htonl(dict->offsets[n]);
2498	for (n=0; n<dict->words; n++) *nint++ = htonl(dict->resort[n]);
2499
2500	memcpy(nint, dict->text, dict->textlen);
2501	}
2502
2503	error = gb_load_dictionary_data(gb_main, Main->keys[idx].key, &old_dict_buffer, &old_dict_buffer_size);
2504	if (!error) {
2505	gb_save_dictionary_data(gb_main, Main->keys[idx].key, dict_buffer, dict_buffer_size);
2506
2507	// compress all data with new dictionary
2508	printf(" * Compressing all with new dictionary ...\n");
2509	error = readAndWrite(&gbk[idx]);
2510	if (error) {
2511	/* critical state: new dictionary has been written, but transaction will be aborted below.
2512	* Solution: Write back old dictionary.
2513	*/
2514	gb_save_dictionary_data(gb_main, Main->keys[idx].key, old_dict_buffer, old_dict_buffer_size);
2515	}
2516	}
2517
2518	gbm_free_mem(dict_buffer, dict_buffer_size, GBM_DICT_INDEX);
2519	if (old_dict_buffer) gbm_free_mem(old_dict_buffer, old_dict_buffer_size, GBM_DICT_INDEX);
2520
2521	#if defined(TEST_DICT)
2522	if (!error) {
2523	GB_DICTIONARY *dict_reloaded = gb_get_dictionary(Main, idx);
2524	test_dictionary(dict_reloaded, &(gbk[idx]), &uncompressed_sum, &compressed_sum);
2525	}
2526	#endif // TEST_DICT
2527	}
2528	}
2529
2530	error = GB_end_transaction(gb_main, error);
2531	}
2532
2533	#ifdef TEST_DICT
2534	if (!error) {
2535	printf(" overall uncompressed size = %li b\n"
2536	" overall compressed size = %li b (Ratio=%li%%)\n",
2537	uncompressed_sum, compressed_sum,
2538	(compressed_sum*100)/uncompressed_sum);
2539	}
2540	#endif // TEST_DICT
2541
2542	printf("Done.\n");
2543
2544	g_b_opti_freeGbdByKey(gbk);
2545	}
2546
2547	return error;
2548	}
2549
2550	GB_ERROR GB_optimize(GBDATA *gb_main) {
2551	unsigned long maxKB = GB_get_physical_memory();
2552	long maxMem;
2553	GB_ERROR error = 0;
2554	GB_UNDO_TYPE prev_undo_type = GB_get_requested_undo_type(gb_main);
2555
2556	#ifdef DEBUG
2557	maxKB /= 2;
2558	#endif
2559
2560	if (maxKB<=(LONG_MAX/1024)) maxMem = maxKB*1024;
2561	else maxMem = LONG_MAX;
2562
2563	error = GB_request_undo_type(gb_main, GB_UNDO_KILL);
2564	if (!error) {
2565	error = gb_create_dictionaries(GB_MAIN(gb_main), maxMem);
2566	if (!error) GB_disable_quicksave(gb_main, "Database optimized");
2567	ASSERT_NO_ERROR(GB_request_undo_type(gb_main, prev_undo_type));
2568	}
2569	return error;
2570	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: