Context Navigation

source: tags/arb_5.0/ARBDB/adoptimize.c

Visit:

Last change on this file was 5825, checked in by westram, 16 years ago
replaced GB_entry/GB_create/GB_search+GB_read_XXX by GBT_read_XXX improved error handling in many functions
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 84.5 KB

Line
1	#include <stdio.h>
2	#include <stdlib.h>
3	#include <string.h>
4	#include <limits.h>
5	#include <sys/types.h>
6	#include <netinet/in.h>
7
8	#include "adlocal.h"
9	#include "arbdbt.h"
10
11	#if defined(DEBUG)
12	/* #define TEST_DICT */
13	#endif /* DEBUG */
14
15	typedef unsigned char unsigned_char;
16	typedef unsigned char *u_str;
17	typedef const unsigned char *cu_str;
18
19	struct S_GB_FULL_DICT_TREE;
20	struct S_GB_SINGLE_DICT_TREE;
21
22	static int gbdByKey_cnt;
23	typedef struct /* one for each diff. keyQuark */
24	{
25	int cnt;
26	GBDATA *gbds; / gbdoff */
27	} O_gbdByKey;
28
29	typedef union U_GB_DICT_TREE
30	{
31	struct S_GB_FULL_DICT_TREE *full;
32	struct S_GB_SINGLE_DICT_TREE *single;
33	void *exists;
34
35	} GB_DICT_TREE;
36
37	typedef enum { SINGLE_NODE, FULL_NODE } GB_DICT_NODE_TYPE;
38
39	typedef struct S_GB_FULL_DICT_TREE
40	{
41	GB_DICT_NODE_TYPE typ; /* always FULL_NODE */
42	int usedSons;
43	int count[256];
44	GB_DICT_TREE son[256]; /* index == character */
45
46	} GB_FULL_DICT_TREE;
47
48	typedef struct S_GB_SINGLE_DICT_TREE
49	{
50	GB_DICT_NODE_TYPE typ; /* always SINGLE_NODE */
51	unsigned_char ch; /* the character */
52	int count; /* no of occurrences of this branch */
53	GB_DICT_TREE son;
54	GB_DICT_TREE brother;
55
56	} GB_SINGLE_DICT_TREE;
57
58	/****************************************************/
59
60	#define COMPRESSABLE(type) ((type) >= GB_BYTES && (type)<=GB_STRING)
61	#define DICT_MEM_WEIGHT 4
62
63	#define WORD_HELPFUL(wordlen, occurrences) ((long)((occurrences)3 + DICT_MEM_WEIGHT(2*sizeof(GB_NINT)+(wordlen))) \
64	< \
65	(long)((occurrences)*(wordlen)))
66	/* (occurrences)*4 compressed size
67	* 2*sizeof(GB_NINT)+(wordlen) size in dictionary
68	* (occurrences)*(wordlen) uncompressed size
69	*/
70
71	/****************************************************/
72
73	#define MIN_WORD_LEN 8 /* minimum length of words in dictionary */
74	#define MAX_WORD_LEN 50 /* maximum length of words in dictionary */
75	#define MAX_BROTHERS 10 /* maximum no of brothers linked with GB_SINGLE_DICT_TREE
76	* above we use GB_FULL_DICT_TREE */
77	#define MAX_DIFFER 2 /* percentage of difference (of occurrences of strings) below which two
78	* consecutive parts are treated as EQUAL # of occurrences */
79	#define INCR_DIFFER 1 /* the above percentage is incremented from 0 to MAX_DIFFER by INCR_DIFFER per step */
80
81	#define DICT_STRING_INCR 1024 /* dictionary string will be incremented by this size */
82
83	/****************** Tool functions *****************/
84
85	static GB_INLINE cu_str get_data_n_size(GBDATA gbd, long size) {
86	GB_CSTR data;
87	int type = GB_TYPE(gbd);
88
89	*size = 0;
90
91	switch (type) {
92	case GB_STRING: data = GB_read_char_pntr(gbd); break;
93	case GB_LINK: data = GB_read_link_pntr(gbd); break;
94	case GB_BYTES: data = GB_read_bytes_pntr(gbd); break;
95	case GB_INTS: data = (char*)GB_read_ints_pntr(gbd); break;
96	case GB_FLOATS: data = (char*)GB_read_floats_pntr(gbd); break;
97	default:
98	data = 0;
99	ad_assert(0);
100	break;
101	}
102
103	if (data) *size = GB_UNCOMPRESSED_SIZE(gbd, type);
104	return (cu_str)data;
105	}
106
107	static GB_INLINE long min(long a, long b) {
108	return a<b ? a : b;
109	}
110
111	/****************************************************/
112
113	static void g_b_opti_scanGbdByKey(GB_MAIN_TYPE Main, GBDATA gbd, O_gbdByKey *gbk)
114	{
115	unsigned int quark;
116
117	if (GB_TYPE(gbd) == GB_DB) /* CONTAINER */
118	{
119	int idx;
120	GBCONTAINER gbc = (GBCONTAINER )gbd;
121	GBDATA *gbd2;
122
123	for (idx=0; idx < gbc->d.nheader; idx++)
124	if ((gbd2=GBCONTAINER_ELEM(gbc,idx))!=NULL)
125	g_b_opti_scanGbdByKey(Main,gbd2,gbk);
126	}
127
128	quark = GB_KEY_QUARK(gbd);
129
130	if (quark)
131	{
132	ad_assert(gbk[quark].cnt < Main->keys[quark].nref \|\| quark==0);
133	ad_assert(gbk[quark].gbds != 0);
134
135	gbk[quark].gbds[ gbk[quark].cnt ] = gbd;
136	gbk[quark].cnt++;
137	}
138	}
139
140	static O_gbdByKey g_b_opti_createGbdByKey(GB_MAIN_TYPE Main)
141	{
142	int idx;
143	O_gbdByKey gbk = (O_gbdByKey )GB_calloc(Main->keycnt, sizeof(O_gbdByKey));
144
145	gbdByKey_cnt = Main->keycnt; /* always use gbdByKey_cnt instead of Main->keycnt cause Main->keycnt can change */
146
147	for (idx=1; idx<gbdByKey_cnt; idx++){
148	gbk[idx].cnt = 0;
149
150	if (Main->keys[idx].key && Main->keys[idx].nref>0) {
151	gbk[idx].gbds = (GBDATA *) GB_calloc(Main->keys[idx].nref, sizeof(GBDATA));
152	}
153	else {
154	gbk[idx].gbds = NULL;
155	}
156	}
157
158	gbk[0].cnt = 0;
159	gbk[0].gbds = (GBDATA *)GB_calloc(1,sizeof(GBDATA));
160
161	g_b_opti_scanGbdByKey(Main,(GBDATA*)Main->data,gbk);
162
163	for (idx=0; idx<gbdByKey_cnt; idx++){
164	if (gbk[idx].cnt != Main->keys[idx].nref && idx)
165	{
166	printf("idx=%i gbk[idx].cnt=%i Main->keys[idx].nref=%li\n", /* Main->keys[].nref ist falsch */
167	idx,gbk[idx].cnt,Main->keys[idx].nref);
168
169	Main->keys[idx].nref = gbk[idx].cnt;
170	}
171	/ad_assert(gbk[idx].cnt == Main->keys[idx].nref \|\| idx==0);/
172	}
173	return gbk;
174	}
175
176	static void g_b_opti_freeGbdByKey(GB_MAIN_TYPE Main, O_gbdByKey gbk)
177	{
178	int idx;
179	GBUSE(Main);
180	for (idx=0; idx<gbdByKey_cnt; idx++) free(gbk[idx].gbds);
181	free(gbk);
182	}
183
184	/****************** Convert old compression style to new style *****************/
185
186	static GB_ERROR gb_convert_compression(GBDATA *source) {
187	GB_ERROR error = 0;
188	long type = GB_TYPE(source);
189
190	if (type == GB_DB){
191	GBDATA *gb_p;
192	for (gb_p = GB_child(source); gb_p; gb_p = GB_nextChild(gb_p)) {
193	if (gb_p->flags.compressed_data \|\| GB_TYPE(gb_p) == GB_DB){
194	error = gb_convert_compression(gb_p);
195	if (error) break;
196	}
197	}
198	}
199	else {
200	char *string = 0;
201	long elems = GB_GETSIZE(source);
202	long data_size = GB_UNCOMPRESSED_SIZE(source, type);
203	long new_size = -1;
204	int expectData = 1;
205
206	switch (type) {
207	case GB_STRING:
208	case GB_LINK:
209	case GB_BYTES:
210	string = gb_uncompress_bytes(GB_GETDATA(source), data_size, &new_size);
211	if (string) {
212	ad_assert(new_size == data_size);
213	string = gbs_malloc_copy(string, data_size);
214	}
215	break;
216
217	case GB_INTS:
218	case GB_FLOATS:
219	string = gb_uncompress_longs_old(GB_GETDATA(source), elems, &new_size);
220	if (string) {
221	ad_assert(new_size == data_size);
222	string = gbs_malloc_copy(string, data_size);
223	}
224	break;
225
226	default:
227	expectData = 0;
228	break;
229	}
230
231	if (!string) {
232	if (expectData) {
233	error = GBS_global_string("Can't read old data to convert compression (Reason: %s)", GB_await_error());
234	}
235	}
236	else {
237	switch (type) {
238	case GB_STRING:
239	error = GB_write_string(source, "");
240	if (!error) error = GB_write_string(source, string);
241	break;
242
243	case GB_LINK:
244	error = GB_write_link(source, "");
245	if (!error) error = GB_write_link(source, string);
246	break;
247
248	case GB_BYTES:
249	error = GB_write_bytes(source, "", 0);
250	if (!error) error = GB_write_bytes(source, string, data_size);
251	break;
252
253	case GB_INTS:
254	case GB_FLOATS:
255	error = GB_write_pntr(source, string, data_size, elems);
256	break;
257
258	default:
259	ad_assert(0);
260	break;
261	}
262
263	free(string);
264	}
265	}
266	return error;
267	}
268
269	GB_ERROR gb_convert_V2_to_V3(GBDATA *gb_main){
270	GB_ERROR error = 0;
271	GBDATA *gb_system = GB_search(gb_main,GB_SYSTEM_FOLDER, GB_FIND);
272
273	if (!gb_system) {
274	gb_system = GB_create_container(gb_main, GB_SYSTEM_FOLDER);
275	if (GB_entry(gb_main,"extended_data")){
276	GB_warning("Converting data from old V2.0 to V2.1 Format:\n"
277	" Please Wait (may take some time)");
278	}
279	error = gb_convert_compression(gb_main);
280	GB_disable_quicksave(gb_main,"Database converted to new format");
281	}
282	return error;
283	}
284
285
286	/* ******************* Compress by dictionary ****************** */
287
288
289	/* compression tag format:
290	*
291	* unsigned int compressed:1;
292	* if compressed==0:
293	* unsigned int last:1; ==1 -> this is the last block
294	* unsigned int len:6; length of uncompressable bytes
295	* char[len];
296	* if compressed==1:
297	* unsigned int idxlen:1; ==0 -> 10-bit index
298	* ==1 -> 18-bit index
299	* unsigned int idxhigh:2; the 2 highest bits of the index
300	* unsigned int len:4; (length of word) - (MIN_COMPR_WORD_LEN-1)
301	* if len==0:
302	* char extralen; (length of word) -
303	* char[idxlen+1]; index (low,high)
304	*
305	* tag == 64 -> end of dictionary compressed block (if not coded in last uncompressed block)
306	*/
307
308	GB_INLINE int INDEX_DICT_OFFSET(int idx, GB_DICTIONARY *dict) {
309	ad_assert(idx<dict->words);
310	return ntohl(dict->offsets[idx]);
311	}
312	GB_INLINE int ALPHA_DICT_OFFSET(int idx, GB_DICTIONARY *dict) {
313	int realIndex;
314	ad_assert(idx<dict->words);
315	realIndex = ntohl(dict->resort[idx]);
316	return INDEX_DICT_OFFSET(realIndex, dict);
317	}
318
319	/* #define ALPHA_DICT_OFFSET(i) ntohl(offset[ntohl(resort[i])]) */
320	/* #define INDEX_DICT_OFFSET(i) ntohl(offset[i]) */
321
322	#define LEN_BITS 4
323	#define INDEX_BITS 2
324	#define INDEX_LEN_BITS 1
325
326	#define LEN_SHIFT 0
327	#define INDEX_SHIFT (LEN_SHIFT+LEN_BITS)
328	#define INDEX_LEN_SHIFT (INDEX_SHIFT+INDEX_BITS)
329
330	#define BITMASK(bits) ((1<<(bits))-1)
331	#define GETVAL(tag,typ) (((tag)>>typ##_SHIFT)&BITMASK(typ##_BITS))
332
333	#define MIN_SHORTLEN 6
334	#define MAX_SHORTLEN (BITMASK(LEN_BITS)+MIN_SHORTLEN-1)
335	#define MIN_LONGLEN (MAX_SHORTLEN+1)
336	#define MAX_LONGLEN (MIN_LONGLEN+255)
337
338	#define SHORTLEN_DECR (MIN_SHORTLEN-1) /* !! zero is used as flag for long len !! */
339	#define LONGLEN_DECR MIN_LONGLEN
340
341	#define MIN_COMPR_WORD_LEN MIN_SHORTLEN
342	#define MAX_COMPR_WORD_LEN MAX_LONGLEN
343
344	#define MAX_SHORT_INDEX BITMASK(INDEX_BITS+8)
345	#define MAX_LONG_INDEX BITMASK(INDEX_BITS+16)
346
347	#define LAST_COMPRESSED_BIT 64
348
349	#ifdef DEBUG
350	# define DUMP_COMPRESSION_TEST 0
351	/* 0 = only compression ratio
352	* 1 = + original/compressed/decompressed
353	* 2 = + words used to compress/uncompress
354	* 3 = + matching words in dictionary
355	* 4 = + search of words in dictionary
356	*/
357	#else
358	# define DUMP_COMPRESSION_TEST 0
359	#endif
360
361	#ifdef DEBUG
362	/* #define COUNT_CHUNKS */
363
364	#if defined(COUNT_CHUNKS)
365
366	static long uncompressedBlocks[64];
367	static long compressedBlocks[MAX_LONGLEN];
368
369	static void clearChunkCounters(void) {
370	int i;
371
372	for (i=0; i<64; i++) uncompressedBlocks[i] = 0;
373	for (i=0; i<MAX_LONGLEN; i++) compressedBlocks[i] = 0;
374	}
375
376	static void dumpChunkCounters(void) {
377	int i;
378
379	printf("------------------------------\n" "Uncompressed blocks used:\n");
380	for (i=0; i<64; i++) if (uncompressedBlocks[i]) printf(" size=%i used=%li\n", i, uncompressedBlocks[i]);
381	printf("------------------------------\n" "Words used:\n");
382	for (i=0; i<MAX_LONGLEN; i++) if (compressedBlocks[i]) printf(" size=%i used=%li\n", i, compressedBlocks[i]);
383	printf("------------------------------\n");
384	}
385	#endif /* COUNT_CHUNKS */
386
387	static cu_str lstr(cu_str s, int len) {
388	#define BUFLEN 10000
389	static unsigned_char buf[BUFLEN];
390
391	ad_assert(len<BUFLEN);
392	memcpy(buf,s,len);
393	buf[len] = 0;
394
395	return buf;
396	}
397
398	#if DUMP_COMPRESSION_TEST>=2
399
400	static cu_str dict_word(GB_DICTIONARY *dict, int idx, int len) {
401	/* GB_NINT offset = dict->offsets; /
402	return lstr(dict->text+INDEX_DICT_OFFSET(idx, dict), len);
403	}
404
405	#endif
406
407	#if DUMP_COMPRESSION_TEST>=1
408
409	static void dumpBinary(u_str data, long size) {
410	#define PER_LINE 12
411	int cnt = 0;
412
413	while (size--) {
414	unsigned_char c = *data++;
415	int bitval = 128;
416	int bits = 8;
417
418	while (bits--) {
419	putchar(c&bitval?'1':'0');
420	bitval>>=1;
421	}
422	putchar(' ');
423
424	cnt = (cnt+1)%PER_LINE;
425	if (!cnt) putchar('\n');
426	}
427
428	if (cnt) putchar('\n');
429	}
430
431	#endif
432
433	#endif
434
435	static GB_INLINE int GB_MEMCMP(const void vm1, const void vm2, long size) {
436	char c1 = (char)vm1,
437	c2 = (char)vm2;
438	int diff = 0;
439
440	while (size-- && !diff) diff = c1++-c2++;
441	return diff;
442	}
443
444	/* -------------------------------------------------- */
445
446	static int searchWord(GB_DICTIONARY dict, cu_str source, long size, unsigned long wordIndex, int *wordLen)
447	{
448	int idx = -1;
449	int l = 0;
450	int h = dict->words-1;
451	cu_str text = dict->text;
452	/* GB_NINT offset = dict->offsets; /
453	GB_NINT *resort = dict->resort;
454	int dsize = dict->textlen;
455	int ilen = 0;
456
457	while (l<h-1) {
458	int m = (l+h)/2;
459	long off = ALPHA_DICT_OFFSET(m, dict);
460	cu_str dictword = text+off;
461	long msize = min(size,dsize-off);
462
463	#if DUMP_COMPRESSION_TEST>=4
464	printf(" %s (%i)\n", lstr(dictword,20), m);
465	#endif
466
467	if (GB_MEMCMP(source, dictword, msize)<=0) h = m;
468	else l = m;
469	}
470
471	while (l<=h) {
472	int off = ALPHA_DICT_OFFSET(l, dict);
473	cu_str word = text+off;
474	int msize = (int)min(size, dsize-off);
475	int equal = 0;
476	cu_str s = source;
477
478	while (msize-- && s++==word++) equal++;
479
480	#if DUMP_COMPRESSION_TEST>=3
481	if (equal>=MIN_COMPR_WORD_LEN) {
482	printf(" EQUAL=%i '%s' (%i->%i, off=%i)", equal, lstr(text+off,equal), l, ntohl(resort[l]), ALPHA_DICT_OFFSET(l, dict));
483	printf(" (context=%s)\n", lstr(text+off-min(off,20),min(off,20)+equal+20));
484	}
485	#endif
486
487	if (equal>ilen) {
488	ilen = equal;
489	idx = ntohl(resort[l]);
490	ad_assert(idx<dict->words);
491	}
492
493	l++;
494	}
495
496	*wordIndex = idx;
497	*wordLen = (int)min(ilen,MAX_COMPR_WORD_LEN);
498
499	return idx!=-1 && ilen>=MIN_COMPR_WORD_LEN;
500	}
501
502	#ifdef DEBUG
503	int look(GB_DICTIONARY *dict, GB_CSTR source) {
504	unsigned long wordIndex;
505	int wordLen;
506	int wordFound = searchWord(dict, (cu_str)source, strlen(source), &wordIndex, &wordLen);
507
508	if (wordFound) {
509	printf("'%s' (idx=%lu, off=%i)\n", lstr(dict->text+ntohl(dict->offsets[wordIndex]), wordLen), wordIndex, ntohl(dict->offsets[wordIndex]));
510	}
511
512	return wordFound;
513	}
514	#endif
515
516
517
518	static char gb_uncompress_by_dictionary_internal(GB_DICTIONARY dict, /GBDATA gbd, /GB_CSTR s_source, const long size, GB_BOOL append_zero, long new_size) {
519	cu_str source = (cu_str)s_source;
520	u_str dest;
521	u_str buffer;
522	cu_str text = dict->text;
523	/* GB_NINT offset = dict->offsets; /
524	int done = 0;
525	long left = size;
526
527	dest = buffer = (u_str)GB_give_other_buffer(s_source,size+2);
528
529	while (left && !done) {
530	int c;
531
532	if ((c=source++)&128) / compressed data */ {
533	int indexLen = GETVAL(c,INDEX_LEN);
534	unsigned long idx = GETVAL(c,INDEX);
535
536	c = GETVAL(c,LEN); /* ==wordLen */
537	if (c) c += SHORTLEN_DECR;
538	else c = *source+++LONGLEN_DECR;
539
540	ad_assert(indexLen>=0 && indexLen<=1);
541
542	if (indexLen==0) {
543	idx = (idx <<8) \| *source++;
544	}
545	else {
546	idx = (((idx << 8) \| source[1]) << 8) \| source[0];
547	source += 2;
548	}
549
550	ad_assert(idx<(GB_ULONG)dict->words);
551
552	{
553	cu_str word = text+INDEX_DICT_OFFSET(idx, dict);
554
555	#if DUMP_COMPRESSION_TEST>=2
556	printf(" word='%s' (idx=%lu, off=%li, len=%i)\n",
557	lstr(word, c), idx, (long)ntohl(dict->offsets[idx]), c);
558	#endif
559
560	{
561	u_str d = dest;
562	ad_assert(((d + c) <= word) \|\| (d >= (word + c)));
563	while (c--) d++ = word++;
564	dest = d;
565	}
566	}
567	}
568	else { /* uncompressed bytes */
569	if (c & LAST_COMPRESSED_BIT) {
570	done = 1;
571	c ^= LAST_COMPRESSED_BIT;
572	}
573
574	left -= c;
575	{
576	u_str d = dest;
577	ad_assert(((d + c) <= source) \|\| (d >= (source + c)));
578	while (c--) d++ = source++;
579	dest=d;
580	}
581	}
582	}
583
584	if (append_zero == GB_TRUE) *dest++ = 0;
585
586	*new_size = dest-buffer;
587	ad_assert(size >= *new_size); // buffer overflow
588
589	return (char *)buffer;
590	}
591
592	char gb_uncompress_by_dictionary(GBDATA gbd, GB_CSTR s_source, long size, long *new_size)
593	{
594	GB_DICTIONARY *dict = gb_get_dictionary(GB_MAIN(gbd), GB_KEY_QUARK(gbd));
595	GB_BOOL append_zero = GB_TYPE(gbd)==GB_STRING \|\| GB_TYPE(gbd) == GB_LINK;
596
597	if (!dict) {
598	GB_ERROR error = GBS_global_string("Cannot decompress db-entry '%s' (no dictionary found)\n", GB_get_db_path(gbd));
599	GB_export_error(error);
600	return 0;
601	}
602
603	return gb_uncompress_by_dictionary_internal(dict, s_source, size, append_zero, new_size);
604	}
605
606	char gb_compress_by_dictionary(GB_DICTIONARY dict, GB_CSTR s_source, long size, long *msize, int last_flag, int search_backward, int search_forward)
607	{
608	cu_str source = (cu_str)s_source;
609	u_str dest;
610	u_str buffer;
611	cu_str unknown = source; /* start of uncompressable bytes */
612	u_str lastUncompressed = NULL; /* ptr to start of last block of uncompressable bytes (in dest) */
613
614	#if defined(ASSERTION_USED)
615	const long org_size = size;
616	#endif /* ASSERTION_USED */
617
618	ad_assert(size>0); /* compression of zero-length data fails! */
619
620	dest = buffer = (u_str)GB_give_other_buffer((GB_CSTR)source, 1+(size/63+1)+size);
621	*dest++ = GB_COMPRESSION_DICTIONARY \| last_flag;
622
623	while (size) {
624	unsigned long wordIndex;
625	int wordLen;
626	int wordFound;
627
628	if ( (wordFound = searchWord(dict, source, size, &wordIndex, &wordLen)) ) {
629	int length;
630
631	takeRest:
632	length = source-unknown;
633
634	if (length) {
635	int shift;
636	int takeShift = 0;
637	int maxShift = (int)min(search_forward, wordLen-1);
638
639	for (shift=1; shift<=maxShift; shift++) {
640	unsigned long wordIndex2;
641	int wordLen2;
642	int wordFound2;
643
644	if ( (wordFound2 = searchWord(dict, source+shift, size-shift, &wordIndex2, &wordLen2))) {
645	if (wordLen2>(wordLen+shift)) {
646	wordIndex = wordIndex2;
647	wordLen = wordLen2;
648	takeShift = shift;
649	}
650	}
651	}
652
653	if (takeShift) {
654	source += takeShift;
655	size -= takeShift;
656	length = source-unknown;
657	}
658	}
659
660	while (length) { /* if there were uncompressable bytes */
661	int take = (int)min(length,63);
662
663	#ifdef COUNT_CHUNKS
664	uncompressedBlocks[take]++;
665	#endif
666
667	lastUncompressed = dest;
668
669	dest++ = take; / tag byte */
670	memcpy(dest, unknown, take);
671	dest += take;
672	unknown += take;
673	length -= take;
674	}
675
676	ad_assert(unknown==source);
677
678	while (wordFound) { /* as long as we find words in dictionary */
679	int indexLen = wordIndex>MAX_SHORT_INDEX;
680	int indexHighBits = indexLen==0 ? wordIndex>>8 : wordIndex>>16;
681	int nextWordFound;
682	int nextWordLen;
683	unsigned long nextWordIndex;
684
685	ad_assert((long)wordIndex<dict->words);
686	ad_assert((long)wordIndex <= MAX_LONG_INDEX);
687	ad_assert(indexHighBits==(indexHighBits & BITMASK(INDEX_BITS)));
688	ad_assert(wordLen>=MIN_SHORTLEN);
689
690	lastUncompressed = NULL;
691
692	{
693	cu_str source2 = source+wordLen;
694	long size2 = size-wordLen;
695
696	if (!(nextWordFound=searchWord(dict, source+wordLen, size-wordLen, &nextWordIndex, &nextWordLen))) { /* no word right afterwards */
697	int shift;
698
699	for (shift=1; shift<=search_backward && shift<(wordLen-MIN_COMPR_WORD_LEN); shift++) {
700	/* try to cut end of word to get a better result */
701	unsigned long wordIndex2;
702	int wordLen2;
703	int wordFound2;
704
705	if ( (wordFound2=searchWord(dict, source2-shift, size2+shift, &wordIndex2, &wordLen2))) {
706	if (wordLen2>(shift+1)) {
707	wordLen -= shift;
708
709	nextWordFound = 1;
710	nextWordIndex = wordIndex2;
711	nextWordLen = wordLen2;
712	break;
713	}
714	}
715	}
716	}
717	}
718
719	#ifdef COUNT_CHUNKS
720	compressedBlocks[wordLen]++;
721	#endif
722
723	#if DUMP_COMPRESSION_TEST>=2
724	printf(" word='%s' (idx=%li, off=%i, len=%i)\n",
725	dict_word(dict, wordIndex, wordLen), wordIndex, (int)ntohl(dict->offsets[wordIndex]), wordLen);
726	#endif
727
728	if (wordLen<=MAX_SHORTLEN) {
729	*dest++ = 128 \|
730	(indexLen << INDEX_LEN_SHIFT) \|
731	(indexHighBits << INDEX_SHIFT) \|
732	((wordLen-SHORTLEN_DECR) << LEN_SHIFT);
733	}
734	else {
735	*dest++ = 128 \|
736	(indexLen << INDEX_LEN_SHIFT) \|
737	(indexHighBits << INDEX_SHIFT);
738	dest++ = wordLen-LONGLEN_DECR; / extra length byte */
739	}
740
741	dest++ = (char)wordIndex; / low index byte */
742	if (indexLen)
743	dest++ = (char)(wordIndex >> 8); / high index byte */
744
745	unknown = source += wordLen;
746	size -= wordLen;
747
748	wordFound = nextWordFound;
749	wordIndex = nextWordIndex;
750	wordLen = nextWordLen;
751	}
752	}
753	else {
754	source++;
755	if (--size==0) goto takeRest;
756	}
757	}
758
759	if (lastUncompressed) *lastUncompressed \|= LAST_COMPRESSED_BIT;
760	else *dest++ = LAST_COMPRESSED_BIT;
761
762	*msize = dest-buffer;
763
764	#if defined(ASSERTION_USED)
765	{
766	long new_size = -1;
767	char *test = gb_uncompress_by_dictionary_internal(dict, (GB_CSTR)buffer+1, org_size + GB_COMPRESSION_TAGS_SIZE_MAX, GB_TRUE, &new_size);
768
769	ad_assert(memcmp(test, s_source, org_size) == 0);
770	ad_assert((org_size+1) == new_size);
771	}
772	#endif /* ASSERTION_USED */
773
774	return (char*)buffer;
775	}
776
777
778	#if defined(TEST_DICT)
779
780	static void test_dictionary(GB_DICTIONARY dict, O_gbdByKey gbk, long uncompSum, long compSum)
781	{
782	int cnt;
783	long uncompressed_sum = 0;
784	long compressed_sum = 0;
785	long dict_size = (dict->words2+1)sizeof(GB_NINT)+dict->textlen;
786	int i;
787	long char_count[256];
788
789	for (i=0; i<256; i++) char_count[i] = 0;
790
791	printf(" * Testing compression..\n");
792
793	#ifdef COUNT_CHUNKS
794	clearChunkCounters();
795	#endif
796
797	for (cnt=0; cnt<gbk->cnt; cnt++) {
798	GBDATA *gbd = gbk->gbds[cnt];
799	int type = GB_TYPE(gbd);
800
801	if (COMPRESSABLE(type)) {
802	long size;
803	cu_str data = get_data_n_size(gbd, &size);
804	u_str copy;
805	long compressedSize;
806	int last_flag = 0;
807	u_str compressed;
808	u_str uncompressed;
809
810	if (type==GB_STRING \|\| type == GB_LINK) size--;
811
812	if (size<1) continue;
813
814	#ifndef NDEBUG
815	copy = (u_str)gbm_get_mem(size, GBM_DICT_INDEX);
816	ad_assert(copy!=0);
817	memcpy(copy, data, size);
818	#endif
819
820	#if DUMP_COMPRESSION_TEST>=1
821	printf("----------------------------\n");
822	printf("original : %3li b = '%s'\n", size, data);
823	#endif
824
825	compressed = (u_str)gb_compress_by_dictionary(dict, (GB_CSTR)data, size, &compressedSize, last_flag, 9999, 2);
826
827	#if DUMP_COMPRESSION_TEST>=1
828	printf("compressed : %3li b = '%s'\n", compressedSize, lstr(compressed,compressedSize));
829	dumpBinary(compressed, compressedSize);
830	#endif
831
832	for (i=0; i<compressedSize; i++) char_count[compressed[i]]++;
833
834	uncompressed = (u_str)gb_uncompress_by_dictionary(gbd, (char*)compressed+1, size);
835
836	#if DUMP_COMPRESSION_TEST>=1
837	printf("copy : %3li b = '%s'\n", size, lstr(copy,size));
838	printf("decompressed: %3li b = '%s'\n", size, lstr(uncompressed,size));
839	#endif
840
841	if (GB_MEMCMP(copy, uncompressed, size)!=0) {
842	int byte = 0;
843
844	while (copy[byte]==uncompressed[byte]) byte++;
845	printf("Error in compression (off=%i, '%s'", byte, lstr(copy+byte,10));
846	printf("!='%s'\n", lstr(uncompressed+byte,10));
847	}
848
849	if (compressedSize<size) {
850	uncompressed_sum += size;
851	compressed_sum += compressedSize;
852	}
853	else {
854	uncompressed_sum += size;
855	compressed_sum += size;
856	}
857
858	gbm_free_mem((char*)copy, size, GBM_DICT_INDEX);
859	}
860	}
861
862	#ifdef COUNT_CHUNKS
863	dumpChunkCounters();
864	#endif
865
866	{
867	long compressed_plus_dict = compressed_sum+dict_size;
868	char *dict_text = GBS_global_string_copy("+dict %li b", dict_size);
869	long ratio = (compressed_plus_dict*100)/uncompressed_sum;
870
871	printf(" uncompressed size = %10li b\n"
872	" compressed size = %10li b\n"
873	" %17s = %10li b (Ratio=%li%%)\n",
874	uncompressed_sum,
875	compressed_sum,
876	dict_text, compressed_plus_dict, ratio);
877
878	free(dict_text);
879	}
880
881	*uncompSum += uncompressed_sum;
882	*compSum += compressed_sum+dict_size;
883	}
884
885	#endif /* TEST_DICT */
886
887
888	/****************** Build dictionary *****************/
889
890	#ifdef DEBUG
891	#define TEST /* test trees? */
892	/* #define DUMP_TREE / / dump trees? */
893
894	/* #define DUMP_EXPAND */
895	/*
896	#define SELECT_WORDS
897	#define SELECTED_WORDS "oropl"
898	*/
899
900	# ifdef SELECT_WORDS
901	static char strnstr(char s1, int len, char *s2) {
902	char c = *s2;
903	int len2 = strlen(s2);
904
905	while (len-->=len2) {
906	if (*s1==c) {
907	if (strncmp(s1,s2,len2)==0) return s1;
908	}
909	s1++;
910	}
911
912	return NULL;
913	}
914	# endif
915
916	#ifdef DUMP_TREE
917	static void dump_dtree(int deep, GB_DICT_TREE tree)
918	{
919	static unsigned_char buffer[1024];
920
921	if (tree.full) {
922	switch (tree.full->typ) {
923	case FULL_NODE: {
924	int idx;
925
926	for (idx=0; idx<256; idx++) {
927	buffer[deep] = idx;
928	buffer[deep+1] = 0;
929
930	if (tree.full->son[idx].exists) dump_dtree(deep+1, tree.full->son[idx]);
931	else if (tree.full->count[idx]>0) printf(" '%s' (%i) [array]\n", buffer, tree.full->count[idx]);
932	}
933	break;
934	}
935	case SINGLE_NODE: {
936	buffer[deep] = tree.single->ch;
937	buffer[deep+1] = 0;
938
939	if (tree.single->son.exists) dump_dtree(deep+1, tree.single->son);
940	else printf(" '%s' (%i) [single]\n", buffer, tree.single->count);
941
942	if (tree.single->brother.exists) dump_dtree(deep, tree.single->brother);
943	break;
944	}
945	}
946	}
947	}
948	#endif
949
950	#else
951	#ifdef DUMP_TREE
952	# define dump_dtree(deep,tree)
953	#endif
954	#endif
955
956	#ifdef TEST
957	static int testCounts(GB_DICT_TREE tree)
958	/*
959	* tests if all inner nodes have correct 'count's
960	*/
961	{
962	int cnt = 0;
963
964	if (tree.exists) {
965	switch (tree.full->typ) {
966	case SINGLE_NODE: {
967	while (tree.exists) {
968	if (tree.single->son.exists) {
969	int son_cnt = testCounts(tree.single->son);
970	#ifdef COUNT_EQUAL
971	ad_assert(son_cnt==tree.single->count);
972	#else
973	ad_assert(son_cnt<=tree.single->count);
974	#endif
975	}
976
977	ad_assert(tree.single->count>0);
978	cnt += tree.single->count;
979	tree = tree.single->brother;
980	}
981	break;
982	}
983	case FULL_NODE: {
984	int idx,
985	sons = 0;
986
987	for (idx=0; idx<256; idx++) {
988	if (tree.full->son[idx].exists) {
989	int son_cnt = testCounts(tree.full->son[idx]);
990	#ifdef COUNT_EQUAL
991	ad_assert(son_cnt==tree.full->count[idx]);
992	#else
993	ad_assert(son_cnt<=tree.full->count[idx]);
994	#endif
995	if (tree.full->usedSons) ad_assert(tree.full->count[idx]>0);
996	else ad_assert(tree.full->count[idx]==0);
997
998	sons++;
999	}
1000	else if (tree.full->count[idx]) {
1001	sons++;
1002	}
1003
1004	cnt += tree.full->count[idx];
1005	}
1006
1007	ad_assert(sons==tree.full->usedSons);
1008	break;
1009	}
1010	}
1011	}
1012
1013	return cnt;
1014	}
1015
1016	/* #define TEST_MAX_OCCUR_COUNT */
1017
1018	#ifdef TEST_MAX_OCCUR_COUNT
1019	#define MAX_OCCUR_COUNT 600000
1020	#endif
1021
1022	static GB_DICT_TREE test_dtree(GB_DICT_TREE tree)
1023	/* only correct while tree is under contruction (build_dict_tree()) */
1024	{
1025	if (tree.exists) {
1026	switch (tree.full->typ) {
1027	case SINGLE_NODE: {
1028	#if defined(TEST_MAX_OCCUR_COUNT)
1029	ad_assert(tree.single->count<MAX_OCCUR_COUNT); /* eher unwahrscheinlich */
1030	#endif // TEST_MAX_OCCUR_COUNT
1031
1032	if (tree.single->son.exists) {
1033	ad_assert(tree.single->count==0);
1034	test_dtree(tree.single->son);
1035	}
1036	else {
1037	ad_assert(tree.single->count>0);
1038	}
1039
1040	if (tree.single->brother.exists) test_dtree(tree.single->brother);
1041	break;
1042	}
1043	case FULL_NODE: {
1044	int idx;
1045	int countSons = 0;
1046
1047	for (idx=0; idx<256; idx++) {
1048	#if defined(TEST_MAX_OCCUR_COUNT)
1049	ad_assert(tree.full->count[idx]<MAX_OCCUR_COUNT); /* eher unwahrscheinlich */
1050	#endif // TEST_MAX_OCCUR_COUNT
1051
1052	if (tree.full->son[idx].exists) {
1053	ad_assert(tree.full->count[idx]==0);
1054	test_dtree(tree.full->son[idx]);
1055	countSons++;
1056	}
1057	else {
1058	ad_assert(tree.full->count[idx]>=0);
1059	if (tree.full->count[idx]>0)
1060	countSons++;
1061	}
1062	}
1063
1064	ad_assert(countSons==tree.full->usedSons);
1065
1066	break;
1067	}
1068	}
1069	}
1070
1071	return tree;
1072	}
1073
1074	#else
1075	# define test_dtree(tree) /* (tree) */
1076	# define testCounts(tree) /* 0 */
1077	#endif
1078
1079
1080	static GB_DICT_TREE new_dtree(cu_str text, long len, long *memcount)
1081	/* creates a new (sub-)tree from 'text' (which has length 'len') */
1082	{
1083	GB_DICT_TREE tree;
1084
1085	if (len) {
1086	GB_SINGLE_DICT_TREE *tail = NULL,
1087	*head = NULL;
1088
1089	while (len) {
1090	if (tail) tail = tail->son.single = (GB_SINGLE_DICT_TREE)gbm_get_mem(sizeof(tail),GBM_DICT_INDEX);
1091	else tail = head = (GB_SINGLE_DICT_TREE)gbm_get_mem(sizeof(tail),GBM_DICT_INDEX);
1092
1093	(memcount) += sizeof(tail);
1094
1095	tail->typ = SINGLE_NODE;
1096	tail->ch = *text++;
1097	len--;
1098
1099	tail->brother.single = NULL;
1100	tail->son.single = NULL;
1101	}
1102
1103	tail->count = 1;
1104	tree.single = head;
1105	}
1106	else {
1107	tree.single = NULL;
1108	}
1109
1110	return tree;
1111	}
1112
1113	GB_DICT_TREE single2full_dtree(GB_DICT_TREE tree, long *memcount)
1114	{
1115	if (tree.exists && tree.single->typ==SINGLE_NODE) {
1116	GB_FULL_DICT_TREE full = (GB_FULL_DICT_TREE)gbm_get_mem(sizeof(*full), GBM_DICT_INDEX);
1117	int idx;
1118
1119	(memcount) += sizeof(full);
1120	full->typ = FULL_NODE;
1121	full->usedSons = 0;
1122
1123	for (idx=0; idx<256; idx++) {
1124	full->son[idx].exists = NULL;
1125	full->count[idx] = 0;
1126	}
1127
1128	while (tree.exists) {
1129	GB_SINGLE_DICT_TREE *t = tree.single;
1130
1131	ad_assert(t->typ==SINGLE_NODE);
1132	ad_assert(full->son[t->ch].exists==NULL);
1133
1134	full->son[t->ch] = t->son;
1135	full->count[t->ch] = t->count;
1136	full->usedSons++;
1137
1138	tree.single = t->brother.single;
1139
1140	gbm_free_mem((char)t, sizeof(t), GBM_DICT_INDEX);
1141	(memcount) -= sizeof(t);
1142	}
1143
1144	tree.full = full;
1145	}
1146
1147	return tree;
1148	}
1149
1150	static void free_dtree(GB_DICT_TREE tree)
1151	{
1152	if (tree.exists) {
1153	switch(tree.full->typ) {
1154	case SINGLE_NODE: {
1155	if (tree.single->son.exists) free_dtree(tree.single->son);
1156	if (tree.single->brother.exists) free_dtree(tree.single->brother);
1157
1158	gbm_free_mem((char)tree.single, sizeof((tree.single)), GBM_DICT_INDEX);
1159	break;
1160	}
1161	case FULL_NODE: {
1162	int idx;
1163
1164	for (idx=0; idx<256; idx++) if (tree.full->son[idx].exists) free_dtree(tree.full->son[idx]);
1165	gbm_free_mem((char)tree.full, sizeof((tree.full)), GBM_DICT_INDEX);
1166	break;
1167	}
1168	}
1169	}
1170	}
1171
1172
1173
1174	static GB_DICT_TREE cut_dtree(GB_DICT_TREE tree, int cut_count, long memcount, long leafcount)
1175	/* removes all branches from 'tree' which are referenced less/equal than cut_count
1176	* returns: the reduced tree */
1177	{
1178	if (tree.exists) {
1179	switch (tree.full->typ) {
1180	case SINGLE_NODE: {
1181	if (tree.single->son.exists) tree.single->son = cut_dtree(tree.single->son, cut_count, memcount, leafcount);
1182
1183	if (!tree.single->son.exists) { /* leaf */
1184	if (tree.single->count<=cut_count) { /* leaf with less/equal references */
1185	GB_DICT_TREE brother = tree.single->brother;
1186
1187	gbm_free_mem((char)tree.single, sizeof(tree.single), GBM_DICT_INDEX);
1188	(memcount) -= sizeof(tree.single);
1189	if (brother.exists) return cut_dtree(brother, cut_count, memcount, leafcount);
1190
1191	tree.single = NULL;
1192	break;
1193	}
1194	else {
1195	(*leafcount)++;
1196	}
1197	}
1198
1199	if (tree.single->brother.exists) tree.single->brother = cut_dtree(tree.single->brother, cut_count, memcount, leafcount);
1200	break;
1201	}
1202	case FULL_NODE: {
1203	int idx;
1204	int count = 0;
1205
1206	for (idx=0; idx<256; idx++) {
1207	if (tree.full->son[idx].exists) {
1208	tree.full->son[idx] = cut_dtree(tree.full->son[idx], cut_count, memcount, leafcount);
1209
1210	if (tree.full->son[idx].exists) count++;
1211	else tree.full->count[idx] = 0;
1212	}
1213	else if (tree.full->count[idx]>0) {
1214	if (tree.full->count[idx]<=cut_count) {
1215	tree.full->count[idx] = 0;
1216	}
1217	else {
1218	count++;
1219	(*leafcount)++;
1220	}
1221	}
1222	}
1223
1224	tree.full->usedSons = count;
1225
1226	if (!count) { /* no more sons */
1227	gbm_free_mem((char)tree.full, sizeof((tree.full)), GBM_DICT_INDEX);
1228	(memcount) -= sizeof((tree.full));
1229	tree.exists = NULL;
1230	}
1231
1232	break;
1233	}
1234	}
1235	}
1236
1237	return tree;
1238	}
1239	static GB_DICT_TREE cut_useless_words(GB_DICT_TREE tree, int deep, long *removed)
1240	/* removes/shortens all branches of 'tree' which are not useful for compression
1241	* 'deep' should be zero (incremented by cut_useless_words)
1242	* 'removed' will be set to the # of removed occurrences
1243	* returns: the reduced tree
1244	*/
1245	{
1246	*removed = 0;
1247
1248	if (tree.exists) {
1249	deep++;
1250
1251	switch (tree.full->typ) {
1252	long removed_single;
1253
1254	case SINGLE_NODE: {
1255	if (tree.single->son.exists) {
1256	tree.single->son = cut_useless_words(tree.single->son, deep, &removed_single);
1257	tree.single->count -= removed_single;
1258	*removed += removed_single;
1259	}
1260
1261	if (!tree.single->son.exists && !WORD_HELPFUL(deep,tree.single->count)) {
1262	GB_DICT_TREE brother = tree.single->brother;
1263
1264	*removed += tree.single->count;
1265	gbm_free_mem((char)tree.single, sizeof(tree.single), GBM_DICT_INDEX);
1266
1267	if (brother.exists) {
1268	tree = cut_useless_words(brother, deep-1, &removed_single);
1269	*removed += removed_single;
1270	}
1271	else {
1272	tree.exists = NULL;
1273	}
1274
1275	break;
1276	}
1277
1278	if (tree.single->brother.exists) {
1279	tree.single->brother = cut_useless_words(tree.single->brother, deep-1, &removed_single);
1280	*removed += removed_single;
1281	}
1282
1283	break;
1284	}
1285	case FULL_NODE: {
1286	int idx;
1287	int count = 0;
1288
1289	for (idx=0; idx<256; idx++) {
1290	if (tree.full->son[idx].exists) {
1291	tree.full->son[idx] = cut_useless_words(tree.full->son[idx], deep, &removed_single);
1292	tree.full->count[idx] -= removed_single;
1293	*removed += removed_single;
1294	}
1295
1296	if (tree.full->son[idx].exists) {
1297	count++;
1298	}
1299	else if (tree.full->count[idx]) {
1300	if (!WORD_HELPFUL(deep, tree.full->count[idx])) { /* useless! */
1301	*removed += tree.full->count[idx];
1302	tree.full->count[idx] = 0;
1303	}
1304	else {
1305	count++;
1306	}
1307	}
1308	}
1309
1310	tree.full->usedSons = count;
1311
1312	if (!count) { /* no more sons */
1313	gbm_free_mem((char)tree.full, sizeof((tree.full)), GBM_DICT_INDEX);
1314	tree.exists = NULL;
1315	}
1316
1317	break;
1318	}
1319	}
1320	}
1321
1322	return tree;
1323	}
1324
1325	static GB_DICT_TREE add_dtree_to_dtree(GB_DICT_TREE toAdd, GB_DICT_TREE to, long *memcount)
1326	/* adds 'toAdd' as brother of 'to' (must be leftmost of all SINGLE_NODEs or a FULL_NODE)
1327	* returns: the leftmost of all SINGLE_NODEs or a FULL_NODE
1328	*/
1329	{
1330	GB_DICT_TREE tree = toAdd;
1331
1332	ad_assert(toAdd.single->typ==SINGLE_NODE);
1333
1334	if (to.exists) {
1335	switch (to.full->typ) {
1336	case SINGLE_NODE: {
1337	GB_SINGLE_DICT_TREE *left = to.single;
1338
1339	ad_assert(left!=0);
1340
1341	if (toAdd.single->ch < to.single->ch) {
1342	toAdd.single->brother = to;
1343	return toAdd;
1344	}
1345
1346	while (to.single->brother.exists) {
1347	if (toAdd.single->ch < to.single->brother.single->ch) {
1348	toAdd.single->brother = to.single->brother;
1349	to.single->brother = toAdd;
1350
1351	tree.single = left;
1352	return tree;
1353	}
1354	to = to.single->brother;
1355	}
1356
1357	to.single->brother = toAdd;
1358	tree.single = left;
1359	break;
1360	}
1361	case FULL_NODE: {
1362	unsigned_char ch = toAdd.single->ch;
1363
1364	ad_assert(to.full->son[ch].exists==NULL);
1365	ad_assert(to.full->count[ch]==0); /* if this fails, count must be added & tested */
1366	ad_assert(toAdd.single->brother.exists==NULL);
1367
1368	to.full->son[ch] = toAdd.single->son;
1369	to.full->count[ch] = toAdd.single->count;
1370	to.full->usedSons++;
1371
1372	tree = to;
1373
1374	gbm_free_mem((char)toAdd.single, sizeof((toAdd.single)), GBM_DICT_INDEX);
1375	(*memcount) -= sizeof(toAdd.single);
1376
1377	break;
1378	}
1379	}
1380	}
1381
1382	return tree;
1383	}
1384
1385	static GB_DICT_TREE add_to_dtree(GB_DICT_TREE tree, cu_str text, long len, long *memcount)
1386	/* adds the string 'text' (which has length 'len') to 'tree'
1387	* returns: new tree
1388	*/
1389	{
1390	if (tree.exists) {
1391	switch (tree.full->typ) {
1392	case SINGLE_NODE: {
1393	GB_SINGLE_DICT_TREE *t = tree.single;
1394	int count = 0;
1395
1396	do {
1397	count++;
1398	if (t->ch==text[0]) { /* we found an existing subtree */
1399	if (len>1) {
1400	t->son = add_to_dtree(t->son, text+1, len-1, memcount); /* add rest of text to subtree */
1401	}
1402	else {
1403	ad_assert(len==1);
1404	ad_assert(t->son.exists==NULL);
1405	t->count++;
1406	}
1407
1408	return count>MAX_BROTHERS ? single2full_dtree(tree, memcount) : tree;
1409	}
1410	else if (t->ch > text[0]) {
1411	break;
1412	}
1413	}
1414	while ((t=t->brother.single)!=NULL);
1415
1416	tree = add_dtree_to_dtree(new_dtree(text,len,memcount), /* otherwise we create a new subtree */
1417	count>MAX_BROTHERS ? single2full_dtree(tree, memcount) : tree,
1418	memcount);
1419	break;
1420	}
1421	case FULL_NODE: {
1422	unsigned_char ch = text[0];
1423
1424	if (tree.full->son[ch].exists) {
1425	tree.full->son[ch] = add_to_dtree(tree.full->son[ch], text+1, len-1, memcount);
1426	}
1427	else {
1428	tree.full->son[ch] = new_dtree(text+1, len-1, memcount);
1429	if (!tree.full->son[ch].exists) {
1430	if (tree.full->count[ch]==0) tree.full->usedSons++;
1431	tree.full->count[ch]++;
1432	}
1433	else {
1434	tree.full->usedSons++;
1435	}
1436	}
1437	break;
1438	}
1439	}
1440
1441	return tree;
1442	}
1443
1444	return new_dtree(text,len,memcount);
1445	}
1446
1447	static long calcCounts(GB_DICT_TREE tree)
1448	{
1449	long cnt = 0;
1450
1451	ad_assert(tree.exists!=0);
1452
1453	switch (tree.full->typ) {
1454	case SINGLE_NODE: {
1455	while (tree.exists) {
1456	if (tree.single->son.exists) tree.single->count = calcCounts(tree.single->son);
1457	ad_assert(tree.single->count>0);
1458	cnt += tree.single->count;
1459	tree = tree.single->brother;
1460	}
1461	break;
1462	}
1463	case FULL_NODE: {
1464	int idx;
1465
1466	for (idx=0; idx<256; idx++) {
1467	if (tree.full->son[idx].exists) {
1468	tree.full->count[idx] = calcCounts(tree.full->son[idx]);
1469	ad_assert(tree.full->count[idx]>0);
1470	}
1471	else {
1472	ad_assert(tree.full->count[idx]>=0);
1473	}
1474	cnt += tree.full->count[idx];
1475	}
1476	break;
1477	}
1478	}
1479
1480	return cnt;
1481	}
1482
1483	static int count_dtree_leafs(GB_DICT_TREE tree, int deep, int *maxdeep) {
1484	/* returns # of leafs and max. depth of tree */
1485	int leafs = 0;
1486
1487	ad_assert(tree.exists!=0);
1488
1489	if (++deep>maxdeep) maxdeep = deep;
1490
1491	switch(tree.full->typ) {
1492	case SINGLE_NODE: {
1493	if (tree.single->son.exists) leafs += count_dtree_leafs(tree.single->son, deep, maxdeep);
1494	else leafs++;
1495	if (tree.single->brother.exists) leafs += count_dtree_leafs(tree.single->brother, deep, maxdeep);
1496	break;
1497	}
1498	case FULL_NODE: {
1499	int idx;
1500
1501	for (idx=0; idx<256; idx++) {
1502	if (tree.full->son[idx].exists) leafs += count_dtree_leafs(tree.full->son[idx], deep, maxdeep);
1503	else if (tree.full->count[idx]) leafs++;
1504	}
1505	break;
1506	}
1507	}
1508
1509	return leafs;
1510	}
1511
1512	static int COUNT(GB_DICT_TREE tree) {
1513	/* counts sum of # of occurencies of tree */
1514	int cnt = 0;
1515
1516	switch(tree.single->typ) {
1517	case SINGLE_NODE: {
1518	while (tree.exists) {
1519	cnt += tree.single->count;
1520	tree = tree.single->brother;
1521	}
1522	break;
1523	}
1524	case FULL_NODE: {
1525	int idx;
1526
1527	for (idx=0; idx<256; idx++) cnt += tree.full->count[idx];
1528	break;
1529	}
1530	}
1531
1532	return cnt;
1533	}
1534
1535	static GB_DICT_TREE removeSubsequentString(GB_DICT_TREE *tree_pntr, cu_str buffer, int len, int max_occur) {
1536	/*
1537	* searches tree for 'buffer' (length='len')
1538	*
1539	* returns - rest below found string
1540	* (if found and if the # of occurrences of the string is less/equal than 'max_occur')
1541	* - NULL otherwise
1542	*
1543	* removes the whole found string from the tree (not only the rest!)
1544	*/
1545	GB_DICT_TREE tree = *tree_pntr, rest;
1546	static int restCount;
1547
1548	rest.exists = NULL;
1549
1550	ad_assert(tree.exists!=0);
1551	ad_assert(len>0);
1552
1553	switch (tree.full->typ) {
1554	case SINGLE_NODE: {
1555	while (tree.single->ch <= buffer[0]) {
1556	if (tree.single->ch == buffer[0]) { /* found wanted character */
1557	if (tree.single->son.exists) {
1558	if (len==1) {
1559	if (tree.single->count <= max_occur) {
1560	rest = tree.single->son;
1561	restCount = COUNT(rest);
1562	tree.single->son.exists = NULL;
1563	}
1564	}
1565	else {
1566	rest = removeSubsequentString(&tree.single->son, buffer+1, len-1, max_occur);
1567	}
1568	}
1569
1570	if (rest.exists) { /* the string was found */
1571	tree.single->count -= restCount;
1572	ad_assert(tree.single->count >= 0);
1573
1574	if (!tree.single->count) { /* empty subtree -> delete myself */
1575	GB_DICT_TREE brother = tree.single->brother;
1576
1577	tree.single->brother.exists = NULL; /* elsewise it would be free'ed by free_dtree */
1578	free_dtree(tree);
1579	*tree_pntr = tree = brother;
1580	}
1581	}
1582
1583	break;
1584	}
1585
1586	tree_pntr = &(tree.single->brother);
1587	if (!(tree = tree.single->brother).exists) break;
1588	}
1589
1590	break;
1591	}
1592	case FULL_NODE: {
1593	unsigned_char ch;
1594
1595	if (tree.full->son[ch=buffer[0]].exists) {
1596	if (len==1) {
1597	if (tree.full->count[ch] <= max_occur) {
1598	rest = tree.full->son[ch];
1599	restCount = COUNT(rest);
1600	tree.full->son[ch].exists = NULL;
1601	}
1602	}
1603	else {
1604	rest = removeSubsequentString(&tree.full->son[ch], buffer+1, len-1, max_occur);
1605	}
1606
1607	if (rest.exists) {
1608	ad_assert(restCount>0);
1609	tree.full->count[ch] -= restCount;
1610	ad_assert(tree.full->count[ch]>=0);
1611	if (tree.full->count[ch]==0) {
1612	ad_assert(tree.full->son[ch].exists==NULL);
1613
1614	if (--tree.full->usedSons==0) { /* last son deleted -> delete myself */
1615	free_dtree(tree);
1616	tree.exists = NULL;
1617	*tree_pntr = tree;
1618	}
1619	}
1620	}
1621	}
1622
1623	break;
1624	}
1625	}
1626
1627	return rest;
1628	}
1629
1630	static cu_str memstr(cu_str stringStart, int stringStartLen, cu_str inString, int inStringLen) {
1631	if (!inStringLen) return stringStart; /* string of length zero is found everywhere */
1632
1633	while (stringStartLen) {
1634	cu_str found = (cu_str)memchr(stringStart, inString[0], stringStartLen);
1635
1636	if (!found) break;
1637
1638	stringStartLen -= found-stringStart;
1639	stringStart = found;
1640
1641	if (stringStartLen<inStringLen) break;
1642
1643	if (GB_MEMCMP(stringStart,inString,inStringLen)==0) return stringStart;
1644
1645	stringStart++;
1646	stringStartLen--;
1647	}
1648
1649	return NULL;
1650	}
1651
1652
1653	static int expandBranches(u_str buffer, int deep, int minwordlen, int maxdeep, GB_DICT_TREE tree, GB_DICT_TREE root, int max_percent) {
1654	/*
1655	* expands all branches in 'tree'
1656	*
1657	* this is done by searching every of these branches in 'root' and moving any subsequent parts from there to 'tree'
1658	* (this is only done, if the # of occurrences of the found part does not exceed the # of occurrences of 'tree' more than 'max_percent' percent)
1659	*
1660	* 'buffer' strings are rebuild here while descending the tree (length of buffer==MAX_WORD_LEN)
1661	* 'deep' recursion level
1662	* 'maxdeep' maximum recursion level
1663	* 'minwordlen' is the length of the words to search (usually equal to MIN_WORD_LEN-1)
1664	*
1665	* returns the # of occurrences which were added to 'tree'
1666	*/
1667	int expand = 0; /* calculate count-sum of added subsequent parts */
1668
1669	ad_assert(tree.exists!=0);
1670
1671	if (deep<maxdeep) {
1672	switch (tree.full->typ) {
1673	case SINGLE_NODE: {
1674	while (tree.exists) {
1675	buffer[deep] = tree.single->ch;
1676
1677	if (!tree.single->son.exists) {
1678	GB_DICT_TREE rest;
1679	u_str buf = buffer+1;
1680	int len = deep;
1681
1682	if (len>minwordlen) { /* do not search more than MIN_WORD_LEN-1 chars */
1683	buf += len-minwordlen;
1684	len = minwordlen;
1685	}
1686
1687	if (len==minwordlen) {
1688	cu_str self = memstr(buffer, deep+1, buf, len);
1689
1690	ad_assert(self!=0);
1691	if (self==buf) rest = removeSubsequentString(&root, buf, len, ((100+max_percent)*tree.single->count)/100);
1692	else rest.exists = NULL;
1693	}
1694	else {
1695	rest.exists = NULL;
1696	}
1697
1698	if (rest.exists) {
1699	int cnt = COUNT(rest);
1700
1701	tree.single->son = rest;
1702	tree.single->count += cnt;
1703	expand += cnt;
1704	#ifdef DUMP_EXPAND
1705	#define DUMP_MORE 1
1706	printf("expanding '%s'", lstr(buffer, deep+1+DUMP_MORE));
1707	printf(" (searching for '%s') -> found %i nodes\n", lstr(buf, len+DUMP_MORE), cnt);
1708	#endif
1709	}
1710	}
1711
1712	if (tree.single->son.exists) {
1713	int added = expandBranches(buffer, deep+1, minwordlen, maxdeep, tree.single->son, root, max_percent);
1714
1715	expand += added;
1716	tree.single->count += added;
1717	}
1718
1719	tree = tree.single->brother;
1720	}
1721
1722	break;
1723	}
1724	case FULL_NODE: {
1725	int idx;
1726
1727	for (idx=0; idx<256; idx++) {
1728	buffer[deep] = idx;
1729
1730	if (!tree.full->son[idx].exists && tree.full->count[idx]) /* leaf */ {
1731	GB_DICT_TREE rest;
1732	u_str buf = buffer+1;
1733	int len = deep;
1734
1735	if (len>minwordlen) { /* do not search more than MIN_WORD_LEN-1 chars */
1736	buf += len-minwordlen;
1737	len = minwordlen;
1738	}
1739
1740	if (len==minwordlen) {
1741	cu_str self = memstr(buffer, deep+1, buf, len);
1742
1743	ad_assert(self!=0);
1744	if (self==buf)
1745	rest = removeSubsequentString(&root, buf, len, ((100+max_percent)*tree.full->count[idx])/100);
1746	else
1747	rest.exists = NULL;
1748	}
1749	else {
1750	rest.exists = NULL;
1751	}
1752
1753	if (rest.exists) { /* substring found! */
1754	int cnt = COUNT(rest);
1755
1756	if (tree.full->count[idx]==0) tree.full->usedSons++;
1757	tree.full->son[idx] = rest;
1758	tree.full->count[idx] += cnt;
1759
1760	expand += cnt;
1761	#ifdef DUMP_EXPAND
1762	printf("expanding '%s'", lstr(buffer, deep+1+DUMP_MORE));
1763	printf(" (searching for '%s') -> found %i nodes\n", lstr(buf, len+DUMP_MORE), cnt);
1764	#endif
1765	}
1766	}
1767
1768	if (tree.full->son[idx].exists) {
1769	int added = expandBranches(buffer, deep+1, minwordlen, maxdeep, tree.full->son[idx], root, max_percent);
1770
1771	expand += added;
1772	tree.full->count[idx] += added;
1773	}
1774	}
1775
1776	break;
1777	}
1778	}
1779	}
1780
1781	return expand;
1782	}
1783
1784	static GB_DICT_TREE build_dict_tree(O_gbdByKey gbk, long maxmem, long maxdeep, long minwordlen, long data_sum)
1785	/* builds a tree of the most used words
1786	*
1787	* 'maxmem' is the amount of memory that will be allocated
1788	* 'maxdeep' is the maximum length of the _returned_ words
1789	* 'minwordlen' is the minimum length a word needs to get into the tree
1790	* this is used in the first pass as maximum tree depth
1791	* 'data_sum' will be set to the overall-size of data of which the tree was built
1792	*/
1793	{
1794	GB_DICT_TREE tree;
1795	long memcount = 0;
1796	long leafs = 0;
1797
1798	*data_sum = 0;
1799
1800	{
1801	int cnt;
1802	long lowmem = (maxmem*9)/10;
1803	int cut_count = 1;
1804
1805	/* Build 8-level-deep tree of all existing words */
1806
1807	tree.exists = NULL; /* empty tree */
1808
1809	for (cnt=0; cnt<gbk->cnt; cnt++) {
1810	GBDATA *gbd = gbk->gbds[cnt];
1811	int type = GB_TYPE(gbd);
1812
1813	if (COMPRESSABLE(type)) {
1814	long size;
1815	cu_str data = get_data_n_size(gbd, &size);
1816	cu_str lastWord;
1817
1818	if (type==GB_STRING \|\| type == GB_LINK) size--;
1819	if (size<minwordlen) continue;
1820
1821	*data_sum += size;
1822	lastWord = data+size-minwordlen;
1823
1824	#ifdef SELECT_WORDS
1825	if (strnstr(data,size,SELECTED_WORDS)) /* test some words only */
1826	#endif
1827	{
1828
1829	for ( ; data<=lastWord; data++) {
1830	tree = add_to_dtree(tree, data, minwordlen, &memcount);
1831
1832	while (memcount>maxmem) {
1833	leafs = 0;
1834	tree = cut_dtree(tree, cut_count, &memcount, &leafs);
1835	if (memcount<=lowmem) break;
1836	cut_count++;
1837	}
1838	}
1839	}
1840	}
1841	}
1842	}
1843
1844	/test_dtree(tree); /
1845	{
1846	int cutoff = 1;
1847
1848	leafs = 0;
1849	tree = cut_dtree(tree, cutoff, &memcount, &leafs); /* cut all single elements */
1850	test_dtree(tree);
1851
1852	#if defined(DEBUG)
1853	if (tree.exists) {
1854	int maxdeep2 = 0;
1855	long counted = count_dtree_leafs(tree, 0, &maxdeep2);
1856	ad_assert(leafs == counted);
1857	}
1858	#endif // DEBUG
1859
1860	/* avoid directory overflow (max. 18bit) */
1861	while (leafs >= MAX_LONG_INDEX) {
1862	leafs = 0;
1863	++cutoff;
1864	#if defined(DEBUG)
1865	printf("Directory overflow (%li) -- reducing size (cutoff = %i)\n", leafs, cutoff);
1866	#endif // DEBUG
1867	tree = cut_dtree(tree, cutoff, &memcount, &leafs);
1868	}
1869	}
1870	#ifdef DUMP_TREE
1871	printf("----------------------- tree with short branches:\n");
1872	dump_dtree(0,tree);
1873	printf("---------------------------\n");
1874	#endif
1875
1876	/* Try to create longer branches */
1877
1878	if (tree.exists) {
1879	int add_count;
1880	u_str buffer = (u_str)gbm_get_mem(maxdeep, GBM_DICT_INDEX);
1881	int max_differ;
1882	long dummy;
1883
1884	if (tree.full->typ != FULL_NODE) tree = single2full_dtree(tree,&memcount); /* ensure root is FULL_NODE */
1885
1886	test_dtree(tree);
1887	calcCounts(tree); /* calculate counters of inner nodes */
1888	testCounts(tree);
1889
1890	for (max_differ=0; max_differ<=MAX_DIFFER; max_differ+=INCR_DIFFER) { /* percent of allowed difference for concatenating tree branches */
1891	do {
1892	int idx;
1893	add_count = 0;
1894
1895	for (idx=0; idx<256; idx++) {
1896	if (tree.full->son[idx].exists) {
1897	int added;
1898
1899	buffer[0] = idx;
1900	added = expandBranches(buffer, 1, minwordlen-1, maxdeep, tree.full->son[idx], tree, max_differ);
1901	tree.full->count[idx] += added;
1902	add_count += added;
1903	}
1904	}
1905	}
1906	while (add_count);
1907	}
1908
1909	gbm_free_mem((char*)buffer, maxdeep, GBM_DICT_INDEX);
1910
1911	tree = cut_useless_words(tree,0,&dummy);
1912	}
1913
1914	#ifdef DUMP_TREE
1915	printf("----------------------- tree with expanded branches:\n");
1916	dump_dtree(0,tree);
1917	printf("-----------------------\n");
1918	#endif
1919	testCounts(tree);
1920
1921	return tree;
1922	}
1923
1924	static GB_DICT_TREE remove_word_from_dtree(GB_DICT_TREE tree, cu_str wordStart, int wordLen, u_str resultBuffer, int resultLen, long resultFrequency, long *removed) {
1925	/* searches 'tree' for a word starting with 'wordStart' an removes it from the tree
1926	* if there are more than one possibilities, the returned word will be the one with the most occurrences
1927	* if there was no possibility -> resultLen==0, tree unchanged
1928	* otherwise: resultBuffer contains the word, returns new tree with word removed
1929	*/
1930	long removed_single = 0;
1931	ad_assert(tree.exists!=0);
1932	*removed = 0;
1933
1934	if (wordLen) { /* search wanted path into tree */
1935	switch(tree.full->typ) {
1936	case SINGLE_NODE: {
1937	if (tree.single->ch==*wordStart) {
1938	resultBuffer = wordStart;
1939
1940	if (tree.single->son.exists) {
1941	ad_assert(tree.single->count>0);
1942	tree.single->son = remove_word_from_dtree(tree.single->son, wordStart+1, wordLen-1,
1943	resultBuffer+1, resultLen, resultFrequency,
1944	&removed_single);
1945	if (resultLen) { / word removed */
1946	ad_assert(tree.single->count>=removed_single);
1947	tree.single->count -= removed_single;
1948	*removed += removed_single;
1949	(*resultLen)++;
1950	}
1951	}
1952	else {
1953	resultLen = wordLen==1; / if wordLen==1 -> fully overlapping word found */
1954	*resultFrequency = tree.single->count;
1955	}
1956
1957	if (!tree.single->son.exists && resultLen) { / if no son and a word was found -> remove branch */
1958	GB_DICT_TREE brother = tree.single->brother;
1959
1960	*removed += tree.single->count;
1961	gbm_free_mem((char)tree.single, sizeof(tree.single), GBM_DICT_INDEX);
1962
1963	if (brother.exists) tree = brother;
1964	else tree.exists = NULL;
1965	}
1966	}
1967	else if (tree.single->ch < *wordStart && tree.single->brother.exists) {
1968	tree.single->brother = remove_word_from_dtree(tree.single->brother, wordStart, wordLen,
1969	resultBuffer, resultLen, resultFrequency,
1970	&removed_single);
1971	if (resultLen) removed += removed_single;
1972	}
1973	else {
1974	resultLen = 0; / not found */
1975	}
1976
1977	break;
1978	}
1979	case FULL_NODE: {
1980	unsigned_char ch = *wordStart;
1981	*resultBuffer = ch;
1982
1983	if (tree.full->son[ch].exists) {
1984	tree.full->son[ch] = remove_word_from_dtree(tree.full->son[ch], wordStart+1, wordLen-1,
1985	resultBuffer+1, resultLen, resultFrequency,
1986	&removed_single);
1987	if (*resultLen) {
1988	if (tree.full->son[ch].exists) { /* another son? */
1989	tree.full->count[ch] -= removed_single;
1990	}
1991	else { /* last son -> remove whole branch */
1992	removed_single = tree.full->count[ch];
1993	tree.full->count[ch] = 0;
1994	tree.full->usedSons--;
1995	}
1996
1997	*removed += removed_single;
1998	(*resultLen)++;
1999	}
2000	}
2001	else if (tree.full->count[ch]) {
2002	*resultLen = (wordLen==1);
2003
2004	if (*resultLen) {
2005	removed += removed_single = resultFrequency = tree.full->count[ch];
2006	tree.full->count[ch] = 0;
2007	tree.full->usedSons--;
2008	}
2009	}
2010	else {
2011	resultLen = 0; / not found */
2012	}
2013
2014	if (!tree.full->usedSons) {
2015	free_dtree(tree);
2016	tree.exists = NULL;
2017	}
2018
2019	break;
2020	}
2021	}
2022	}
2023	else { /* take any word */
2024	switch(tree.full->typ) {
2025	case SINGLE_NODE: {
2026	*resultBuffer = tree.single->ch;
2027	ad_assert(tree.single->count>0);
2028
2029	if (tree.single->son.exists) {
2030	tree.single->son = remove_word_from_dtree(tree.single->son, wordStart, wordLen,
2031	resultBuffer+1, resultLen, resultFrequency,
2032	&removed_single);
2033	ad_assert(*resultLen);
2034	(*resultLen)++;
2035	}
2036	else {
2037	*resultLen = 1;
2038	*resultFrequency = tree.single->count;
2039	removed_single = tree.single->count;
2040	}
2041
2042	ad_assert(*resultFrequency>0);
2043
2044	if (tree.single->son.exists) {
2045	ad_assert(tree.single->count>=removed_single);
2046	tree.single->count -= removed_single;
2047	*removed += removed_single;
2048	}
2049	else {
2050	GB_DICT_TREE brother = tree.single->brother;
2051
2052	*removed += tree.single->count;
2053	gbm_free_mem((char)tree.single, sizeof(tree.single), GBM_DICT_INDEX);
2054
2055	if (brother.exists) tree = brother;
2056	else tree.exists = NULL;
2057	}
2058
2059	break;
2060	}
2061	case FULL_NODE: {
2062	int idx;
2063
2064	for (idx=0; idx<256; idx++) {
2065	if (tree.full->son[idx].exists) {
2066	*resultBuffer = idx;
2067	tree.full->son[idx] = remove_word_from_dtree(tree.full->son[idx], wordStart, wordLen,
2068	resultBuffer+1, resultLen, resultFrequency,
2069	&removed_single);
2070	ad_assert(*resultLen);
2071	(*resultLen)++;
2072
2073	if (!tree.full->son[idx].exists) { /* son branch removed -> zero count */
2074	removed_single = tree.full->count[idx];
2075	tree.full->count[idx] = 0;
2076	tree.full->usedSons--;
2077	}
2078	else {
2079	tree.full->count[idx] -= removed_single;
2080	ad_assert(tree.full->count[idx]>0);
2081	}
2082
2083	break;
2084	}
2085	else if (tree.full->count[idx]) {
2086	*resultBuffer = idx;
2087	*resultLen = 1;
2088	*resultFrequency = tree.full->count[idx];
2089	removed_single = tree.full->count[idx];
2090	tree.full->count[idx] = 0;
2091	tree.full->usedSons--;
2092	break;
2093	}
2094	}
2095
2096	ad_assert(idx<256); /* ad_assert break was used to exit loop (== node had a son) */
2097
2098	*removed += removed_single;
2099
2100	if (!tree.full->usedSons) {
2101	free_dtree(tree);
2102	tree.exists = NULL;
2103	}
2104
2105	break;
2106	}
2107	}
2108	}
2109
2110	#ifdef DEBUG
2111	if (*resultLen) {
2112	ad_assert(*resultLen>0);
2113	ad_assert(*resultFrequency>0);
2114	ad_assert(*resultLen>=wordLen);
2115	}
2116	#endif
2117
2118	return tree;
2119	}
2120
2121	#if 0
2122	# define TEST_SORTED
2123	static void dump_dictionary(GB_DICTIONARY *dict)
2124	{
2125	int idx;
2126
2127	printf("dictionary\n"
2128	" words = %i\n"
2129	" textlen = %i\n",
2130	dict->words,
2131	dict->textlen);
2132
2133	for (idx=0; idx<dict->words; idx++) {
2134	u_str word = dict->text+ALPHA_DICT_OFFSET(idx, dict);
2135	/dict->offsets[dict->resort[idx]];/ /* @@@@ ntoh */
2136
2137	printf(" word%03i='%s' (%i)\n", idx, lstr(word,30), ntohl(dict->resort[idx]));
2138	}
2139	}
2140	#endif
2141
2142
2143	#define cmp(i1,i2) (heap2[i1]-heap2[i2])
2144	#define swap(i1,i2) do \
2145	{ \
2146	int s = heap[i1]; \
2147	heap[i1] = heap[i2]; \
2148	heap[i2] = s; \
2149	\
2150	s = heap2[i1]; \
2151	heap2[i1] = heap2[i2]; \
2152	heap2[i2] = s; \
2153	} \
2154	while (0)
2155
2156	static void downheap(int heap, int heap2, int me, int num) {
2157	int lson = me*2;
2158	int rson = lson+1;
2159
2160	ad_assert(me>=1);
2161	if (lson>num) return;
2162
2163	if (cmp(lson,me)<0) { /* left son smaller than me? (we sort in descending order!!!) */
2164	if (rson<=num && cmp(lson,rson)>0) { /* right son smaller than left son? */
2165	swap(me, rson);
2166	downheap(heap, heap2, rson, num);
2167	}
2168	else {
2169	swap(me, lson);
2170	downheap(heap, heap2, lson, num);
2171	}
2172	}
2173	else if (rson<=num && cmp(me, rson)>0) { /* right son smaller than me? */
2174	swap(me, rson);
2175	downheap(heap, heap2, rson, num);
2176	}
2177	}
2178
2179	#undef cmp
2180	#undef swap
2181
2182
2183
2184	#define cmp(i1,i2) GB_MEMCMP(dict->text+dict->offsets[heap[i1]], dict->text+dict->offsets[heap[i2]], dict->textlen)
2185	#define swap(i1,i2) do { int s = heap[i1]; heap[i1] = heap[i2]; heap[i2] = s; } while(0)
2186
2187	static void downheap2(int heap, GB_DICTIONARY dict, int me, int num) {
2188	int lson = me*2;
2189	int rson = lson+1;
2190
2191	ad_assert(me>=1);
2192	if (lson>num) return;
2193
2194	if (cmp(lson,me)>0) { /* left son bigger than me? */
2195	if (rson<=num && cmp(lson,rson)<0) { /* right son bigger than left son? */
2196	swap(me, rson);
2197	downheap2(heap, dict, rson, num);
2198	}
2199	else {
2200	swap(me, lson);
2201	downheap2(heap, dict, lson, num);
2202	}
2203	}
2204	else if (rson<=num && cmp(me, rson)<0) { /* right son bigger than me? */
2205	swap(me, rson);
2206	downheap2(heap, dict, rson, num);
2207	}
2208	}
2209
2210	#undef cmp
2211	#undef swap
2212
2213	static void sort_dict_offsets(GB_DICTIONARY *dict) {
2214	/* 1. sorts the 'dict->offsets' by frequency
2215	* (frequency of each offset is stored in the 'dict->resort' with the same index)
2216	* 2. initializes & sorts 'dict->resort' in alphabethical order
2217	*/
2218	int i;
2219	int num = dict->words;
2220	int *heap = dict->offsets-1;
2221	int *heap2 = dict->resort-1;
2222
2223	/* sort offsets */
2224
2225	for (i=num/2; i>=1; i--) downheap(heap, heap2, i, num); /* make heap */
2226
2227	while (num>1) { /* sort heap */
2228	int big = heap[1];
2229	int big2 = heap2[1];
2230
2231	heap[1] = heap[num];
2232	heap2[1] = heap2[num];
2233
2234	downheap(heap, heap2, 1, num-1);
2235
2236	heap[num] = big;
2237	heap2[num] = big2;
2238
2239	num--;
2240	}
2241
2242	#ifdef TEST_SORTED
2243	for (i=2,num=dict->words; i<=num; i++) ad_assert(heap2[i-1]>=heap2[i]); /* test if sorted correctly */
2244	#endif
2245
2246	/* initialize dict->resort */
2247
2248	for (i=0, num=dict->words; i<num; i++) dict->resort[i] = i;
2249
2250	/* sort dictionary alphabetically */
2251
2252	for (i=num/2; i>=1; i--) downheap2(heap2, dict, i, num); /* make heap */
2253
2254	while (num>1) {
2255	int big = heap2[1];
2256
2257	heap2[1] = heap2[num];
2258	downheap2(heap2, dict, 1, num-1);
2259	heap2[num] = big;
2260	num--;
2261	}
2262
2263	#ifdef TEST_SORTED
2264	for (i=1,num=dict->words; i<num; i++) {
2265	u_str word1 = dict->text+dict->offsets[dict->resort[i-1]];
2266	u_str word2 = dict->text+dict->offsets[dict->resort[i]];
2267	ad_assert(GB_MEMCMP(word1,word2,dict->textlen)<=0);
2268	}
2269	#endif
2270	}
2271
2272	/* Warning dictionary is not in network byte order !!!! */
2273	static GB_DICTIONARY gb_create_dictionary(O_gbdByKey gbk, long maxmem) {
2274	long data_sum;
2275	GB_DICT_TREE tree = build_dict_tree(gbk, maxmem, MAX_WORD_LEN, MIN_WORD_LEN, &data_sum);
2276
2277	if (tree.exists) {
2278	GB_DICTIONARY dict = (GB_DICTIONARY)gbm_get_mem(sizeof(*dict), GBM_DICT_INDEX);
2279	int maxdeep = 0;
2280	int words = count_dtree_leafs(tree, 0, &maxdeep);
2281	u_str word;
2282
2283	int wordLen;
2284	long wordFrequency;
2285	int offset = 0; /* next free position in dict->text */
2286	int overlap = 0; /* # of bytes overlapping with last word */
2287	u_str buffer;
2288	long dummy;
2289	long word_sum = 0;
2290	long overlap_sum = 0;
2291	long max_overlap = 0;
2292
2293	/* reduce tree as long as it has to many leafs (>MAX_LONG_INDEX) */
2294	while (words >= MAX_LONG_INDEX) {
2295
2296	words = count_dtree_leafs(tree, 0, &maxdeep);
2297	}
2298
2299	buffer = (u_str)gbm_get_mem(maxdeep, GBM_DICT_INDEX);
2300
2301	calcCounts(tree);
2302	testCounts(tree);
2303
2304	#if DEBUG
2305	printf(" examined data was %li bytes\n", data_sum);
2306	printf(" tree contains %i words *** maximum tree depth = %i\n", words, maxdeep);
2307	#endif
2308
2309	dict->words = 0;
2310	dict->textlen = DICT_STRING_INCR;
2311	dict->text = (u_str)gbm_get_mem(DICT_STRING_INCR, GBM_DICT_INDEX);
2312	dict->offsets = (GB_NINT)gbm_get_mem(sizeof((dict->offsets))*words, GBM_DICT_INDEX);
2313	dict->resort = (GB_NINT)gbm_get_mem(sizeof((dict->resort))*words, GBM_DICT_INDEX);
2314
2315	memset(buffer, '*', maxdeep);
2316	tree = remove_word_from_dtree(tree, NULL, 0, buffer, &wordLen, &wordFrequency, &dummy);
2317	testCounts(tree);
2318
2319	while (1) {
2320	int nextWordLen = 0;
2321	int len;
2322
2323	#if DUMP_COMPRESSION_TEST>=4
2324	printf("word='%s' (occur=%li overlap=%i)\n", lstr(buffer, wordLen), wordFrequency, overlap);
2325	#endif
2326
2327	overlap_sum += overlap;
2328	if (overlap>max_overlap) max_overlap = overlap;
2329	word_sum += wordLen;
2330
2331	if (offset-overlap+wordLen > dict->textlen) { /* if not enough space allocated -> realloc dictionary string */
2332	u_str ntext = (u_str)gbm_get_mem(dict->textlen+DICT_STRING_INCR, GBM_DICT_INDEX);
2333
2334	memcpy(ntext, dict->text, dict->textlen);
2335	gbm_free_mem((char*)dict->text, dict->textlen, GBM_DICT_INDEX);
2336
2337	dict->text = ntext;
2338	dict->textlen += DICT_STRING_INCR;
2339	}
2340
2341	dict->offsets[dict->words] = offset-overlap;
2342	dict->resort[dict->words] = wordFrequency; /* temporarily miss-use this to store frequency */
2343	dict->words++;
2344
2345	word = dict->text+offset-overlap;
2346	ad_assert(overlap==0 \|\| GB_MEMCMP(word,buffer,overlap)==0); /* test overlapping string-part */
2347	memcpy(word, buffer, wordLen); /* word -> dictionary string */
2348	offset += wordLen-overlap;
2349
2350	if (!tree.exists) break;
2351
2352	for (len=min(10,wordLen-1); len>=0 && nextWordLen==0; len--) {
2353	memset(buffer, '*', maxdeep);
2354	tree = remove_word_from_dtree(tree, word+wordLen-len, len, buffer, &nextWordLen, &wordFrequency, &dummy);
2355	overlap = len;
2356	}
2357
2358	wordLen = nextWordLen;
2359	}
2360
2361	ad_assert(dict->words <= MAX_LONG_INDEX);
2362	ad_assert(dict->words==words); /* dict->words == # of words stored in dictionary string
2363	* words == # of words pre-calculated */
2364
2365	#if DEBUG
2366	printf(" word_sum=%li overlap_sum=%li (%li%%) max_overlap=%li\n",
2367	word_sum, overlap_sum, (overlap_sum*100)/word_sum, max_overlap);
2368	#endif
2369
2370	if (offset<dict->textlen) { /* reallocate dict->text if it was allocated too large */
2371	u_str ntext = (u_str)gbm_get_mem(offset, GBM_DICT_INDEX);
2372
2373	memcpy(ntext, dict->text, offset);
2374	gbm_free_mem((char*)dict->text, dict->textlen, GBM_DICT_INDEX);
2375
2376	dict->text = ntext;
2377	dict->textlen = offset;
2378	}
2379
2380	sort_dict_offsets(dict);
2381
2382	gbm_free_mem((char*)buffer, maxdeep, GBM_DICT_INDEX);
2383	free_dtree(tree);
2384
2385	return dict;
2386	}
2387
2388	return NULL;
2389	}
2390
2391	static GB_ERROR readAndWrite(O_gbdByKey *gbkp) {
2392	int i;
2393	GB_ERROR error = 0;
2394
2395	for (i=0; i<gbkp->cnt && !error; i++) {
2396	GBDATA *gbd = gbkp->gbds[i];
2397	int type = GB_TYPE(gbd);
2398
2399	if (COMPRESSABLE(type)) {
2400	long size;
2401	char *data;
2402
2403	{
2404	char d = (char)get_data_n_size(gbd, &size);
2405
2406	data = gbm_get_mem(size, GBM_DICT_INDEX);
2407	memcpy(data, d, size);
2408	ad_assert(data[size-1] == 0);
2409	}
2410
2411	switch(type) {
2412	case GB_STRING:
2413	error = GB_write_string(gbd, "");
2414	if (!error) error = GB_write_string(gbd, data);
2415	break;
2416	case GB_LINK:
2417	error = GB_write_link(gbd, "");
2418	if (!error) error = GB_write_link(gbd, data);
2419	break;
2420	case GB_BYTES:
2421	error = GB_write_bytes(gbd, 0, 0);
2422	if (!error) error = GB_write_bytes(gbd, data, size);
2423	break;
2424	case GB_INTS:
2425	error = GB_write_ints(gbd, (GB_UINT4 *)0, 0);
2426	if (!error) error = GB_write_ints(gbd, (GB_UINT4 *)data, size);
2427	break;
2428	case GB_FLOATS:
2429	error = GB_write_floats(gbd, (float*)0, 0);
2430	if (!error) error = GB_write_floats(gbd, (float*)data, size);
2431	break;
2432	default:
2433	ad_assert(0);
2434	break;
2435	}
2436
2437	gbm_free_mem(data, size, GBM_DICT_INDEX);
2438	}
2439	}
2440	return error;
2441	}
2442
2443	GB_ERROR gb_create_dictionaries(GB_MAIN_TYPE *Main, long maxmem) {
2444	GB_ERROR error = NULL;
2445	#if defined(TEST_DICT)
2446	long uncompressed_sum = 0;
2447	long compressed_sum = 0;
2448	#endif /* TEST_DICT */
2449
2450	/error = GB_begin_transaction((GBDATA)Main->data);*/
2451
2452	printf("Creating GBDATA-Arrays..\n");
2453
2454	if (!error) {
2455	O_gbdByKey *gbk = g_b_opti_createGbdByKey(Main);
2456	int idx;
2457
2458	printf("Creating dictionaries..\n");
2459
2460	#ifdef DEBUG
2461	/* #define TEST_ONE / / test only key specified below */
2462	/* #define TEST_SOME / / test only some keys specified below */
2463	#if defined(TEST_ONE)
2464	/* select wanted index */
2465	for (idx=0; idx<gbdByKey_cnt; idx++) { /* title author dew_author ebi_journal name ua_tax date full_name ua_title */
2466	if (gbk[idx].cnt && strcmp(Main->keys[idx].key, "tree")==0) break;
2467	}
2468	ad_assert(idx<gbdByKey_cnt);
2469	#endif
2470	#endif
2471
2472	#ifdef TEST_ONE
2473	/* only create dictionary for index selected above (no loop) */
2474	#else
2475	/* create dictionaries for all indices (this is the normal operation) */
2476	for (idx = gbdByKey_cnt-1; idx >= 1 && !error; --idx)
2477	/* for (idx = 1; idx<gbdByKey_cnt && !error; ++idx) */
2478	#endif
2479
2480	{
2481	GB_DICTIONARY *dict;
2482	int compression_mask;
2483	GB_CSTR key_name = Main->keys[idx].key;
2484	int type;
2485	GBDATA gb_main = (GBDATA)Main->data;
2486
2487	#ifdef TEST_SOME
2488	if (!( /* add all wanted keys here */
2489	strcmp(key_name, "REF") == 0 \|\|
2490	strcmp(key_name, "ref") == 0
2491	)) continue;
2492	#endif /* TEST_SOME */
2493
2494	#ifndef TEST_ONE
2495	GB_status(idx/(double)gbdByKey_cnt);
2496	if (!gbk[idx].cnt) continue; /* there are no entries with this quark */
2497
2498	type = GB_TYPE(gbk[idx].gbds[0]);
2499	compression_mask = gb_get_compression_mask(Main, idx, type);
2500	if ((compression_mask & GB_COMPRESSION_DICTIONARY)==0) continue; /* compression with dictionary is not allowed */
2501	if (strcmp(key_name,"data")==0) continue;
2502	if (strcmp(key_name,"quality")==0) continue;
2503	#endif
2504
2505	printf("- dictionary for '%s' (idx=%i)\n", key_name, idx);
2506	GB_begin_transaction(gb_main);
2507	dict = gb_create_dictionary(&(gbk[idx]), maxmem);
2508
2509	if (dict) {
2510	/* decompress with old dictionary and write
2511	all data of actual type without compression: */
2512
2513	printf(" * Uncompressing all with old dictionary ...\n");
2514
2515	{
2516	int old_compression_mask = Main->keys[idx].compression_mask;
2517
2518	Main->keys[idx].compression_mask &= ~GB_COMPRESSION_DICTIONARY;
2519	error = readAndWrite(&gbk[idx]);
2520	Main->keys[idx].compression_mask = old_compression_mask;
2521	}
2522
2523	if (!error) {
2524	/* dictionary is saved in the following format:
2525	*
2526	* GB_NINT size
2527	* GB_NINT offsets[dict->words]
2528	* GB_NINT resort[dict->words]
2529	* char *text
2530	*/
2531
2532	int dict_buffer_size = sizeof(GB_NINT) * (1+dict->words*2) + dict->textlen;
2533	char *dict_buffer = gbm_get_mem(dict_buffer_size, GBM_DICT_INDEX);
2534	long old_dict_buffer_size;
2535	char *old_dict_buffer;
2536
2537	{
2538	GB_NINT nint = (GB_NINT)dict_buffer;
2539	int n;
2540
2541	*nint++ = htonl(dict->words);
2542	for (n=0; n<dict->words; n++) *nint++ = htonl(dict->offsets[n]);
2543	for (n=0; n<dict->words; n++) *nint++ = htonl(dict->resort[n]);
2544
2545	memcpy(nint, dict->text, dict->textlen);
2546	}
2547
2548	error = gb_load_dictionary_data(gb_main, Main->keys[idx].key, &old_dict_buffer, &old_dict_buffer_size);
2549	if (!error) {
2550	gb_save_dictionary_data(gb_main, Main->keys[idx].key, dict_buffer, dict_buffer_size);
2551
2552	/* compress all data with new dictionary */
2553	printf(" * Compressing all with new dictionary ...\n");
2554	error = readAndWrite(&gbk[idx]);
2555	if (error) {
2556	/* critical state: new dictionary has been written, but transaction will be aborted below.
2557	* Solution: Write back old dictionary.
2558	*/
2559	gb_save_dictionary_data(gb_main, Main->keys[idx].key, old_dict_buffer, old_dict_buffer_size);
2560	}
2561	}
2562
2563	gbm_free_mem(dict_buffer, dict_buffer_size, GBM_DICT_INDEX);
2564	if (old_dict_buffer) gbm_free_mem(old_dict_buffer, old_dict_buffer_size, GBM_DICT_INDEX);
2565
2566	#if defined(TEST_DICT)
2567	if (!error) {
2568	GB_DICTIONARY *dict_reloaded = gb_get_dictionary(Main, idx);
2569	test_dictionary(dict_reloaded,&(gbk[idx]), &uncompressed_sum, &compressed_sum);
2570	}
2571	#endif /* TEST_DICT */
2572	}
2573	}
2574
2575	error = GB_end_transaction(gb_main, error);
2576	}
2577
2578	#ifdef TEST_DICT
2579	if (!error) {
2580	printf(" overall uncompressed size = %li b\n"
2581	" overall compressed size = %li b (Ratio=%li%%)\n",
2582	uncompressed_sum, compressed_sum,
2583	(compressed_sum*100)/uncompressed_sum);
2584	}
2585	#endif /* TEST_DICT */
2586
2587	printf("Done.\n");
2588
2589	g_b_opti_freeGbdByKey(Main,gbk);
2590	/* error = GB_commit_transaction((GBDATA)Main->data);/
2591	}
2592
2593	return error;
2594	}
2595
2596	GB_ERROR GB_optimize(GBDATA *gb_main) {
2597	unsigned long maxKB = GB_get_physical_memory();
2598	long maxMem;
2599	GB_ERROR error = 0;
2600	GB_UNDO_TYPE undo_type = GB_get_requested_undo_type(gb_main);
2601
2602	#ifdef DEBUG
2603	maxKB /= 2;
2604	#endif
2605
2606	if (maxKB<=(LONG_MAX/1024)) maxMem = maxKB*1024;
2607	else maxMem = LONG_MAX;
2608
2609	GB_request_undo_type(gb_main,GB_UNDO_KILL);
2610
2611	error = gb_create_dictionaries(GB_MAIN(gb_main), maxMem);
2612
2613	GB_disable_quicksave(gb_main,"Database optimized");
2614	GB_request_undo_type(gb_main,undo_type);
2615
2616	return error;
2617	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: