source: branches/port5/ptpan/PTP_buildtree.cxx

Last change on this file was 5908, checked in by westram, 16 years ago
  • source files with identical names are really a pain when using valgrind
File size: 99.5 KB
Line 
1#include <stdio.h>
2#include <stdlib.h>
3#include <string.h>
4// #include <malloc.h>
5#include <unistd.h>
6#include <PT_server.h>
7#include <PT_server_prototypes.h>
8#include "ptpan.h"
9#include "pt_prototypes.h"
10
11/* nice niffy macro to get a compressed sequence code */
12#define GetSeqCodeQuick(pos) \
13  (((seqptr[(pos) / MAXCODEFITLONG] \
14      >> pg->pg_BitsShiftTable[MAXCODEFITLONG]) \
15     / pg->pg_PowerTable[MAXCODEFITLONG - ((pos) % MAXCODEFITLONG) - 1]) \
16    % pg->pg_AlphaSize)
17
18#define GetSeqCode(pos) ((pos >= pg->pg_TotalRawSize) ? SEQCODE_N : GetSeqCodeQuick(pos))
19
20/* /// "BuildStdSuffixTree()" */
21BOOL BuildStdSuffixTree(struct PTPanGlobal *pg)
22{
23  STRPTR newtreename;
24  struct PTPanPartition *pp;
25  ULONG memfree;
26
27  printf("********************************\n"
28        "* Building Std Suffix Index... *\n"
29        "********************************\n");
30  // Delete old tree first (why, can't we just build a new one and
31  // then rename it? Needs some extra disk space then though)
32  if(unlink(pg->pg_IndexName))
33  {
34    if(GB_size_of_file(pg->pg_IndexName) >= 0)
35    {
36      fprintf(stderr, "Cannot remove %s\n", pg->pg_IndexName);
37      return(FALSE);
38    }
39  }
40
41  // allocate memory for a temporary filename
42  newtreename = (STRPTR) malloc(strlen(pg->pg_IndexName) + 2);
43  strcpy(newtreename, pg->pg_IndexName);
44  strcat(newtreename, "~");
45
46  pg->pg_IndexFile = fopen(newtreename, "w"); /* open file for output */
47  if(!pg->pg_IndexFile)
48  {
49    fprintf(stderr, "Cannot open %s for output.\n", newtreename);
50    free(newtreename);
51    return(FALSE);
52  }
53  GB_set_mode_of_file(newtreename, 0666);
54
55  //GB_begin_transaction(pg->pg_MainDB);
56
57  /* build index */
58  BuildMergedDatabase(pg);
59
60  printf("Freeing alignment cache to save memory...");
61  memfree = FlushCache(pg->pg_SpeciesCache);
62  printf("%ld KB freed.\n", memfree >> 10);
63
64  /* everything has to fit into one partition */
65  pp = (struct PTPanPartition *) calloc(1, sizeof(struct PTPanPartition));
66  if(!pp)
67  {
68    return(FALSE); /* out of memory */
69  }
70
71  /* fill in sensible values */
72  pp->pp_PTPanGlobal = pg;
73  pp->pp_ID = 0;
74  pp->pp_Prefix = 0;
75  pp->pp_PrefixLen = 0;
76  pp->pp_Size = pg->pg_TotalRawSize;
77  pp->pp_RawOffset = 0;
78  pp->pp_PartitionName = (STRPTR) calloc(strlen(pg->pg_IndexName) + 5, 1);
79  strncpy(pp->pp_PartitionName, pg->pg_IndexName, strlen(pg->pg_IndexName) - 3);
80  strcat(pp->pp_PartitionName, "sfx");
81  AddTail(&pg->pg_Partitions, &pp->pp_Node);
82  pg->pg_NumPartitions = 1;
83  printf("Using only one partition for %ld leaves.\n", pp->pp_Size);
84
85  WriteIndexHeader(pg);
86  fclose(pg->pg_IndexFile);
87
88  BuildMemoryStdSuffixTree(pp);
89
90  /* write out tree */
91  printf(">>> Phase 3: Writing tree to secondary storage... <<<\n");
92  WriteStdSuffixTreeToDisk(pp);
93
94  printf(">>> Phase 4: Freeing memory and cleaning it up... <<<\n");
95
96  /* return some memory not used anymore */
97  freeset(pp->pp_StdSfxNodes, NULL);
98
99  if(GB_rename_file(newtreename, pg->pg_IndexName))
100  {
101    GB_print_error();
102  }
103
104  if(GB_set_mode_of_file(pg->pg_IndexName, 0666))
105  {
106    GB_print_error();
107  }
108  free(newtreename);
109  return(TRUE);
110}
111/* \\\ */
112
113/* /// "BuildMemoryStdSuffixTree()" */
114BOOL BuildMemoryStdSuffixTree(struct PTPanPartition *pp)
115{
116  struct PTPanGlobal *pg = pp->pp_PTPanGlobal;
117  ULONG oldheadnum;
118  ULONG oldleafnum;
119  ULONG nodecnt;
120  //struct StdSfxNode *parnode;
121  struct StdSfxNode *vnode;
122  struct StdSfxNode *oldheadnode;
123  struct StdSfxNode *oldleafnode;
124  ULONG vnum, parnum;
125
126  BenchTimePassed(pg);
127
128  pp->pp_SfxMemorySize = pp->pp_Size * sizeof(struct StdSfxNode) * 2;
129  pp->pp_StdSfxNodes = (struct StdSfxNode *) calloc(pp->pp_Size * 2, sizeof(struct StdSfxNode));
130  if(!pp->pp_StdSfxNodes)
131  {
132    printf("Couldn't allocate %ld KB for suffix nodes.\n",
133           pp->pp_SfxMemorySize >> 10);
134    return(FALSE);
135  }
136  /* init pointing offsets */
137  pp->pp_NumBigNodes = 0;
138
139  printf("Allocated %ld KB suffix nodes buffer.\n", pp->pp_SfxMemorySize >> 10);
140
141  /* fill in root node */
142  vnode = pp->pp_StdSfxNodes;
143  vnode->ssn_Parent = 0;
144  vnode->ssn_FirstChild = 1;
145  vnode++;
146  vnode->ssn_Parent = 0;
147  vnode->ssn_StartPos = 0;
148  vnode->ssn_EdgeLen = pg->pg_TotalRawSize;
149  vnode->ssn_FirstChild = 0;
150  vnode->ssn_NextSibling = 0;
151  vnum = 1;
152  oldheadnum = 0;
153  oldleafnum = 1;
154  pp->pp_NumBigNodes = 2;
155
156  /* main loop to build up the tree */
157  /* NOTE: as a special precaution, all longwords have MAXCODEFITLONG code length */
158
159  for(nodecnt = 1; nodecnt < pg->pg_TotalRawSize; nodecnt++)
160  {
161    ULONG gst, gend; // gamma
162    ULONG bst, bend; // beta
163
164    oldleafnode = &pp->pp_StdSfxNodes[oldleafnum];
165    oldheadnode = &pp->pp_StdSfxNodes[oldheadnum];
166    gst = oldleafnode->ssn_StartPos;
167    gend = gst + oldleafnode->ssn_EdgeLen;
168
169    //printf("%ld\n", nodecnt);
170
171    //cerr << "Step " << i << ": oldleaf(" << source.substr(gst, gend - gst) << ")->" << source.substr(i, n - i) << endl;
172    //printTree(source, r, 0);
173
174    if(!oldheadnum) // oldhead ist wurzel
175    {
176      //printf("  oldhead == root\n");
177      //vnode = pp->pp_StdSfxNodes;
178      vnum = 0;
179      gst++;
180    }
181    else if((vnum = oldheadnode->ssn_Prime)) // oldhead ist nicht Wurzel hat aber einen Querlink
182    {
183      //printf("  oldhead has nodeprime\n");
184    } else { // oldhead ist nicht Wurzel hat aber keinen Querlink
185      parnum = oldheadnode->ssn_Parent;
186      bst = oldheadnode->ssn_StartPos;
187      bend = bst + oldheadnode->ssn_EdgeLen;
188      //cerr << "  oldhead(" << source.substr(bst, bend - bst) << ") has no nodeprime and is not root" << endl;
189      if(!parnum)
190      {
191        //printf("    p was root\n");
192        bst++;
193        vnum = FastFindStdSfxNode(pp, 0, bst, bend);
194      } else {
195        vnum = FastFindStdSfxNode(pp, (pp->pp_StdSfxNodes[parnum]).ssn_Prime, bst, bend);
196        //printf("    p->nodeprime was leaf\n");
197        /*cerr << "    p->nodeprime was leaf("
198          << source.substr(u->sfxstart, u->sfxend - u->sfxstart) << endl;*/
199      }
200      /*cerr << "  Call to fastFindNode(., ., " << bst << ", " << bend
201        << ", '" << source.substr(bst, bend - bst) << "')" << endl;*/
202      oldheadnode->ssn_Prime = vnum;
203    }
204    /*cerr << "  Call to findNode(., ., " << gst << ", " << gend
205      << ", '" << source.substr(gst, gend - gst) << "')" << endl;*/
206    //printf("FindNode %ld, %ld-%ld\n", vnum, gst, gend);
207    oldheadnum = FindStdSfxNode(pp, vnum, gst, gend);
208    /*cerr << "  Call to insertNode(., " << gst << ", " << gend
209      << ", '" << source.substr(gst, gend - gst) << "')" << endl;*/
210    //printf("InsertNode %ld, %ld-%ld\n", oldheadnum, gst, gend);
211    oldleafnum = InsertStdSfxNode(pp, gst, gend, oldheadnum);
212    if((nodecnt & 0x3fff) == 0)
213    {
214      if((nodecnt >> 14) % 50)
215      {
216        printf(".");
217        fflush(stdout);
218      } else {
219        printf(". %2ld%%\n", nodecnt / (pg->pg_TotalRawSize / 100));
220      }
221    }
222  }
223  printf("DONE! (%ld KB unused)\n", (pp->pp_Sfx2EdgeOffset - pp->pp_SfxNEdgeOffset) >> 10);
224
225  printf("Nodes     : %6ld\n", pp->pp_NumBigNodes);
226  pg->pg_Bench.ts_MemTree += BenchTimePassed(pg);
227  return(TRUE);
228}
229/* \\\ */
230
231// Suche nach bestimmter Kindkante (konstante Zeit, da alphabetgroesse konstant).
232/* /// "FindStdSfxChildNode()" */
233inline
234ULONG FindStdSfxChildNode(struct PTPanPartition *pp, ULONG nodenum, ULONG pos)
235{
236  struct PTPanGlobal *pg = pp->pp_PTPanGlobal;
237  ULONG *seqptr = pg->pg_MergedRawData;
238  ULONG c = GetSeqCodeQuick(pos);
239  struct StdSfxNode *node;
240
241  //printf("Searching %ld at %ld in node %ld: ", c, pos, nodenum);
242  nodenum = (pp->pp_StdSfxNodes[nodenum]).ssn_FirstChild;
243  while(nodenum)
244  {
245    node = &pp->pp_StdSfxNodes[nodenum];
246    if(GetSeqCodeQuick(node->ssn_StartPos) == c)
247    {
248      break;
249    }
250    nodenum = node->ssn_NextSibling;
251  }
252  //printf("%ld\n", nodenum);
253  return(nodenum);
254}
255/* \\\ */
256
257/* /// "SplitStdSfxNode()" */
258// Neue Node einfuegen und initialisieren
259ULONG SplitStdSfxNode(struct PTPanPartition *pp, ULONG leafnum)
260{
261  struct StdSfxNode *leafnode = &pp->pp_StdSfxNodes[leafnum];
262  ULONG parnum = leafnode->ssn_Parent;
263  struct StdSfxNode *parnode = &pp->pp_StdSfxNodes[parnum];
264  ULONG inum = pp->pp_NumBigNodes++;
265  struct StdSfxNode *inode = &pp->pp_StdSfxNodes[inum];
266  struct StdSfxNode *tmpnode;
267
268  //printf("Split node %ld\n", leafnum);
269  if(parnode->ssn_FirstChild == leafnum) // case 1: leaf is first child
270  {
271    // correct all linkages
272    parnode->ssn_FirstChild = inum;
273  } else { // case 2 leaf is some other child
274    // find previous sibling of leaf
275    tmpnode = &pp->pp_StdSfxNodes[parnode->ssn_FirstChild];
276    while(tmpnode->ssn_NextSibling != leafnum)
277    {
278      tmpnode = &pp->pp_StdSfxNodes[tmpnode->ssn_NextSibling];
279    }
280    // correct all linkages
281    tmpnode->ssn_NextSibling = inum;
282  }
283  inode->ssn_FirstChild = leafnum;
284  inode->ssn_NextSibling = leafnode->ssn_NextSibling;
285  inode->ssn_Parent = parnum;
286  leafnode->ssn_NextSibling = 0;
287  leafnode->ssn_Parent = inum;
288  return(inum);
289}
290/* \\\ */
291
292/* /// "FindStdSfxNode()" */
293// langsames Finden der naechsten Node (wir wissen nicht, ob diese existiert)
294ULONG FindStdSfxNode(struct PTPanPartition *pp, ULONG snum, ULONG &sfxstart, ULONG sfxend)
295{
296  struct PTPanGlobal *pg = pp->pp_PTPanGlobal;
297  ULONG *seqptr = pg->pg_MergedRawData;
298  //struct StdSfxNode *snode;
299
300  //printf("FindStdSfxNode: %ld, (%ld-%ld)\n", snum, sfxstart, sfxend);
301  if(sfxstart == sfxend) // terminal character is always found
302  {
303    return(snum);
304  }
305  //snode = &pp->pp_StdSfxNodes[snum];
306  do
307  {
308    ULONG i,ic;
309    ULONG fst, fend;
310    struct StdSfxNode *leafnode;
311    struct StdSfxNode *inode;
312    ULONG inum;
313    ULONG leafnum;
314
315    leafnum = FindStdSfxChildNode(pp, snum, sfxstart);
316    if(!leafnum)
317    {
318      return(snum);
319    }
320    leafnode = &pp->pp_StdSfxNodes[leafnum];
321    fst = leafnode->ssn_StartPos;
322    fend = fst + leafnode->ssn_EdgeLen;
323
324    // Zeichen fuer Zeichen ueberpruefen
325    for(i = fst+1, ic = sfxstart+1; i < fend; i++, ic++)
326    {
327      if(i-fst+sfxstart >= sfxend) // Suchstringende erreicht -> "$" Blatt
328        break;
329      if(GetSeqCodeQuick(i) != GetSeqCodeQuick(ic)) // Zeichendifferenz?
330        break;
331    }
332    //printf("i: %ld\n", i);
333    if(i != fend) // Muss leaf gesplitted werden?
334    {
335      inum = SplitStdSfxNode(pp, leafnum); // Generate new node
336      inode = &pp->pp_StdSfxNodes[inum];
337      inode->ssn_StartPos = fst; // neues Kind hat selben Suffixstart
338      inode->ssn_EdgeLen = i - fst; // aber Suffix endet zwischen den beiden
339      leafnode->ssn_StartPos = i; // leaf hat neuen Suffixstart bei i
340      leafnode->ssn_EdgeLen = fend - i; // leaf Suffixend bleibt unveraendert
341      /*printf("Inode %ld (%ld-%ld), leafnode %ld (%ld-%ld)\n",
342        inum, fst, i, leafnum, i, fend);*/
343      sfxstart += i - fst; // Suffixstart fuer insertNode() korrigieren
344      return(inum);
345    } else { // Knoten vollstaendig gematcht
346      snum = leafnum;
347      sfxstart += fend - fst;
348    }
349  } while(TRUE);
350}
351/* \\\ */
352
353/* /// "FastFindStdSfxNode()" */
354ULONG FastFindStdSfxNode(struct PTPanPartition *pp, ULONG snum, ULONG sfxstart, ULONG sfxend)
355{
356  //struct StdSfxNode *snode;
357
358  /* fast finding of next node (we know that it has to exist) */
359
360  if(sfxstart == sfxend)
361  {
362    return(snum);
363  }
364  do
365  {
366    ULONG fst, fend;
367    ULONG i;
368    struct StdSfxNode *leafnode;
369    struct StdSfxNode *inode;
370    ULONG inum;
371    ULONG leafnum;
372
373    leafnum = FindStdSfxChildNode(pp, snum, sfxstart);
374    if(!leafnum)
375    {
376      printf("Shit!\n");
377      exit(1);
378    }
379    leafnode = &pp->pp_StdSfxNodes[leafnum];
380    i = leafnode->ssn_EdgeLen;
381
382    if(sfxstart + i == sfxend) // do we terminate at a leaf?
383    {
384      return(leafnum);
385    }
386    fst = leafnode->ssn_StartPos;
387    fend = fst + i;
388    if(sfxstart + i > sfxend) // Bleiben wir innerhalb des Blattes haengen?
389    {
390      inum = SplitStdSfxNode(pp, leafnum); // Generate new node
391      inode = &pp->pp_StdSfxNodes[inum];
392      inode->ssn_StartPos = fst; // neues Kind hat selben Suffixstart
393      inode->ssn_EdgeLen = sfxend - sfxstart; // aber Suffix endet zwischen den beiden
394      leafnode->ssn_StartPos = fst + (sfxend - sfxstart); // leaf hat neuen Suffixstart bei i
395      leafnode->ssn_EdgeLen = fend - leafnode->ssn_StartPos; // leaf Suffixend bleibt unveraendert
396      return(inum);
397    }
398    sfxstart += i;
399    snum = leafnum;
400  } while(TRUE);
401}
402/* \\\ */
403
404/* /// "InsertStdSfxNode()" */
405ULONG InsertStdSfxNode(struct PTPanPartition *pp, ULONG sfxstart, ULONG sfxend, ULONG parnum)
406{
407  struct StdSfxNode *parnode = &pp->pp_StdSfxNodes[parnum];
408  ULONG inum = pp->pp_NumBigNodes++;
409  struct StdSfxNode *inode = &pp->pp_StdSfxNodes[inum];
410  ULONG tmpnum;
411  struct StdSfxNode *tmpnode;
412
413  inode->ssn_Parent = parnum;
414  inode->ssn_StartPos = sfxstart;
415  inode->ssn_EdgeLen = sfxend - sfxstart;
416  if((tmpnum = parnode->ssn_FirstChild))
417  {
418    tmpnode = &pp->pp_StdSfxNodes[tmpnum];
419    while((tmpnum = tmpnode->ssn_NextSibling))
420    {
421      tmpnode = &pp->pp_StdSfxNodes[tmpnum];
422    }
423    tmpnode->ssn_NextSibling = inum;
424  } else {
425    parnode->ssn_FirstChild = inum;
426  }
427  return(inum);
428}
429/* \\\ */
430
431/* /// "WriteStdSuffixTreeToDisk()" */
432BOOL WriteStdSuffixTreeToDisk(struct PTPanPartition *pp)
433{
434  struct PTPanGlobal *pg = pp->pp_PTPanGlobal;
435  struct StdSfxNode *ssn;
436  struct StdSfxNodeOnDisk *ssndisk;
437  ULONG cnt;
438  ULONG buffill;
439  ULONG chunksize = 1UL<<20;
440  ULONG pval;
441  ULONG *seqptr;
442  UWORD lcnt;
443  STRPTR tarseq;
444
445  pg->pg_Bench.ts_Reloc += BenchTimePassed(pg);
446  /* now finally write it to disk */
447  pp->pp_PartitionFile = fopen(pp->pp_PartitionName, "w");
448  if(!pp->pp_PartitionFile)
449  {
450    printf("ERROR: Couldn't open partition file %s for writing!\n",
451        pp->pp_PartitionName);
452    return(FALSE);
453  }
454
455  pp->pp_DiskTreeSize = pp->pp_NumBigNodes * sizeof(struct StdSfxNodeOnDisk);
456  pp->pp_DiskBufferSize = sizeof(StdSfxNodeOnDisk) * chunksize;
457  pp->pp_DiskBuffer = (UBYTE *) calloc(1, pp->pp_DiskBufferSize);
458  pp->pp_DiskPos = 0;
459
460  WriteStdSuffixTreeHeader(pp);
461
462  printf("Writing tree (%ld KB)",pp->pp_DiskTreeSize >> 10);
463  fflush(NULL);
464  cnt = pp->pp_NumBigNodes;
465  ssn = pp->pp_StdSfxNodes;
466  do
467  {
468    ssndisk = (struct StdSfxNodeOnDisk *) pp->pp_DiskBuffer;
469    buffill = 0;
470    do
471    {
472      ssndisk->ssn_StartPos = ssn->ssn_StartPos;
473      ssndisk->ssn_EdgeLen = ssn->ssn_EdgeLen;
474      ssndisk->ssn_FirstChild = ssn->ssn_FirstChild;
475      ssndisk->ssn_NextSibling = ssn->ssn_NextSibling;
476      ssndisk++;
477      ssn++;
478      cnt--;
479    } while((++buffill < chunksize) && cnt);
480
481    printf(".");
482    fflush(NULL);
483    fwrite(pp->pp_DiskBuffer, sizeof(struct StdSfxNodeOnDisk) * buffill, 1, pp->pp_PartitionFile);
484  } while(cnt);
485  printf(".\n");
486  printf("Writing raw text (%ld KB)",pp->pp_DiskTreeSize >> 10);
487  cnt = (pg->pg_TotalRawSize / MAXCODEFITLONG) + 1;
488  seqptr = pg->pg_MergedRawData;
489  tarseq = (STRPTR) pp->pp_DiskBuffer;
490  pp->pp_DiskPos = 0;
491  do
492  {
493    /* get next longword */
494    pval = *seqptr++;
495    lcnt = MAXCODEFITLONG;
496    pval >>= pg->pg_BitsShiftTable[MAXCODEFITLONG];
497    /* unpack compressed longword */
498    do
499    {
500      *tarseq++ = (pval / pg->pg_PowerTable[--lcnt]) % pg->pg_AlphaSize;
501    } while(lcnt);
502    pp->pp_DiskPos += MAXCODEFITLONG;
503    if(pp->pp_DiskPos > pp->pp_DiskBufferSize - MAXCODEFITLONG)
504    {
505      fwrite(pp->pp_DiskBuffer, pp->pp_DiskPos, 1, pp->pp_PartitionFile);
506      tarseq = (STRPTR) pp->pp_DiskBuffer;
507      pp->pp_DiskPos = 0;
508    }
509  } while(--cnt);
510  pp->pp_DiskIdxSpace = ftell(pp->pp_PartitionFile);
511  fclose(pp->pp_PartitionFile);
512  free(pp->pp_DiskBuffer);
513
514  pg->pg_Bench.ts_Writing += BenchTimePassed(pg);
515
516  return(TRUE);
517}
518/* \\\ */
519
520
521/* /// "BuildPTPanIndex()" */
522/* build a whole new fresh and tidy index (main routine) */
523BOOL BuildPTPanIndex(struct PTPanGlobal *pg)
524{
525  STRPTR newtreename;
526  struct PTPanPartition *pp;
527  ULONG memfree;
528
529  printf("********************************\n"
530        "* Building new PT Pan Index... *\n"
531        "********************************\n");
532  // Delete old tree first (why, can't we just build a new one and
533  // then rename it? Needs some extra disk space then though)
534  if(unlink(pg->pg_IndexName))
535  {
536    if(GB_size_of_file(pg->pg_IndexName) >= 0)
537    {
538      fprintf(stderr, "Cannot remove %s\n", pg->pg_IndexName);
539      return(FALSE);
540    }
541  }
542
543  // allocate memory for a temporary filename
544  newtreename = (STRPTR) malloc(strlen(pg->pg_IndexName) + 2);
545  strcpy(newtreename, pg->pg_IndexName);
546  strcat(newtreename, "~");
547
548  pg->pg_IndexFile = fopen(newtreename, "w"); /* open file for output */
549  if(!pg->pg_IndexFile)
550  {
551    fprintf(stderr, "Cannot open %s for output.\n", newtreename);
552    free(newtreename);
553    return(FALSE);
554  }
555  GB_set_mode_of_file(newtreename, 0666);
556
557  //GB_begin_transaction(pg->pg_MainDB);
558
559  /* build index */
560  BuildMergedDatabase(pg);
561
562  printf("Freeing alignment cache to save memory...");
563  memfree = FlushCache(pg->pg_SpeciesCache);
564  printf("%ld KB freed.\n", memfree >> 10);
565
566  PartitionPrefixScan(pg);
567
568  WriteIndexHeader(pg);
569  fclose(pg->pg_IndexFile);
570
571  /* build tree for each partition */
572  pp = (struct PTPanPartition *) pg->pg_Partitions.lh_Head;
573  while(pp->pp_Node.ln_Succ)
574  {
575    CreateTreeForPartition(pp);
576    pp = (struct PTPanPartition *) pp->pp_Node.ln_Succ;
577  }
578  //CreatePartitionLookup(pg);
579
580  //GB_commit_transaction(pg->pg_MainDB);
581
582  //if(GB_rename_file(newtreename, pg->pg_IndexName)) *** FIXME ***
583  if(GB_rename_file(newtreename, pg->pg_IndexName))
584  {
585    GB_print_error();
586  }
587
588  if(GB_set_mode_of_file(pg->pg_IndexName, 0666))
589  {
590    GB_print_error();
591  }
592  free(newtreename);
593  return(TRUE);
594}
595/* \\\ */
596
597/* /// "BuildMergedDatabase()" */
598BOOL BuildMergedDatabase(struct PTPanGlobal *pg)
599{
600  struct PTPanSpecies *ps;
601  ULONG *seqptr;
602  ULONG seqcode;
603  ULONG cnt;
604  ULONG pval;
605  ULONG len;
606  BOOL dbopen = FALSE;
607  ULONG verlen;
608  ULONG abspos;
609  ULONG specabspos;
610  ULONG hash;
611
612  BenchTimePassed(pg);
613
614  /* allocate memory for compressed data */
615  /* about the +3: 1 for rounding, 1 for MAXCODEFITLONG*SEQCODE_N
616     and 1 for terminal */
617  pg->pg_MergedRawData = (ULONG *) malloc(((pg->pg_TotalRawSize /
618                         MAXCODEFITLONG) + 3) * sizeof(ULONG));
619  if(!pg->pg_MergedRawData)
620  {
621    printf("Sorry, couldn't allocate %ld KB of memory for the compressed DB!\n",
622           (((pg->pg_TotalRawSize / MAXCODEFITLONG) + 3) * sizeof(ULONG)) >> 10);
623    return(FALSE);
624  }
625
626  /* init */
627  seqptr = pg->pg_MergedRawData;
628  cnt = 0;
629  pval = 0;
630  len = 4;
631  abspos = 0;
632
633  /* note: This has to be modified a bit to support compressed databases of >2GB
634     alignment data using pp->pp_RawPartitionOffset */
635
636  printf("Step 1: Building compressed database...\n");
637  /* doing a linear scan -- caching is useless */
638  DisableCache(pg->pg_SpeciesCache);
639
640  /* traverse all species */
641  ps = (struct PTPanSpecies *) pg->pg_Species.lh_Head;
642  while(ps->ps_Node.ln_Succ)
643  {
644    /* compress sequence */
645    STRPTR srcstr;
646    UBYTE code;
647
648    if(abspos != ps->ps_AbsOffset)
649    {
650      /* species seems to be corrupt! */
651      printf("AbsPos %ld != %ld mismatch at %s\n",
652        abspos, ps->ps_AbsOffset, ps->ps_Name);
653    }
654
655
656    verlen = 0;
657    hash = 0;
658    specabspos = 0;
659    ULONG bitpos = 0;
660    ULONG count;
661    while((code = GetNextCharacter(pg, ps->ps_SeqDataCompressed, bitpos, count)) != 0xff)
662    {
663#ifdef ALLOWDOTSINMATCH
664      if ((code == '.') && (count == 1))
665      {
666#if 1       // debug       
667            ULONG tmpbitpos = bitpos;
668            ULONG tmpcount;
669            printf("Species: %s, abspos: %li, AbsOffset: %li >> %li*%c, ", 
670                   ps->ps_Name, abspos, ps->ps_AbsOffset, tmpcount, code);
671            for (int i = 0; i < 10; ++i)
672            {
673                code = GetNextCharacter(pg, ps->ps_SeqDataCompressed, tmpbitpos, tmpcount);
674                if (code == 0xff) break;
675                printf("%li*%c, ", tmpcount, code);
676            }
677            printf("\n");
678#endif           
679            code = 'N';
680      }
681#endif
682      if(pg->pg_SeqCodeValidTable[code])
683      {
684        /* add sequence code */
685        if(verlen++ < ps->ps_RawDataSize)
686        {
687          abspos++;
688          specabspos++;
689          seqcode = pg->pg_CompressTable[code];
690          pval *= pg->pg_AlphaSize;
691          pval += seqcode;
692          /* calculate hash */
693          hash *= pg->pg_AlphaSize;
694          hash += seqcode;
695          hash %= HASHPRIME;
696          /* check, if storage capacity was reached? */
697          if(++cnt == MAXCODEFITLONG)
698          {
699            /* write out compressed longword (with eof bit) */
700            //printf("[%08lx]", pval | pg->pg_BitsMaskTable[cnt]);
701            *seqptr++ = (pval << pg->pg_BitsShiftTable[cnt]) | pg->pg_BitsMaskTable[cnt];
702            cnt = 0;
703            pval = 0;
704            len += 4;
705          }
706        }
707      }
708    }
709    if(verlen != ps->ps_RawDataSize)
710    {
711      printf("Len %ld != %ld mismatch with %s\n",
712        verlen, ps->ps_RawDataSize, ps->ps_Name);
713      printf("Please check if this alignment is somehow corrupt!\n");
714    }
715    //printf("\n");
716    ps->ps_SeqHash = hash;
717    ps = (struct PTPanSpecies *) ps->ps_Node.ln_Succ;
718  }
719
720  /* write pending bits (with eof bit) */
721  /* after a lot of experimenting, padding with SEQCODE_N is the only sensible thing
722     to keep code size down and reduce cases */
723  *seqptr++ = ((pval * pg->pg_PowerTable[MAXCODEFITLONG - cnt])
724               << pg->pg_BitsShiftTable[MAXCODEFITLONG])
725    | pg->pg_BitsMaskTable[MAXCODEFITLONG];
726  //printf("[%08lx]\n", seqptr[-1]);
727
728  /* add a final padding longword with SEQCODE_N */
729  *seqptr++ = pg->pg_BitsMaskTable[MAXCODEFITLONG];
730
731  /* and a terminating bit */
732  *seqptr = pg->pg_BitsMaskTable[0];
733
734  if(dbopen) /* close DB, if open */
735  {
736    GB_commit_transaction(pg->pg_MainDB);
737  }
738
739  /* Enable caching again */
740  EnableCache(pg->pg_SpeciesCache);
741
742#if 0 /* debug */
743  {
744    FILE *fh;
745    char tmpbuf[80];
746    STRPTR str;
747
748    sprintf(tmpbuf, "%s.raw", pg->pg_IndexName);
749    str = DecompressSequence(pg, pg->pg_MergedRawData);
750    fh = fopen(tmpbuf, "w");
751    fputs(str, fh);
752    fclose(fh);
753    free(str);
754  }
755#endif
756
757  /* calculate hash sum over all data */
758  //pg->pg_AllHashSum = GetSeqHash(pg, 0, pg->pg_TotalRawSize, 0);
759  /* formerly a global hash, now only a random key to check integrity
760     with other files. Database up2date state is checked with sequence
761     hashes */
762  pg->pg_AllHashSum = rand();
763  pg->pg_Bench.ts_MergeDB = BenchTimePassed(pg);
764
765  printf("Merged compressed database size: %ld KB\n", len >> 10);
766  return(TRUE);
767}
768/* \\\ */
769
770/* /// "PartitionPrefixScan() */
771BOOL PartitionPrefixScan(struct PTPanGlobal *pg)
772{
773  struct PTPanPartition *pp;
774  ULONG cnt;
775  ULONG prefix;
776  ULONG seqpos;
777  ULONG *seqptr;
778  ULONG pval;
779  ULONG maxpartblock;
780  ULONG partsize;
781  ULONG leftpresize;
782  ULONG rightpresize;
783  ULONG prefixstart;
784  UWORD partcnt;
785
786  BenchTimePassed(pg);
787  printf("Step 2: Partition calculation...\n");
788  if(pg->pg_TotalRawSize < pg->pg_MaxPartitionSize)
789  {
790    /* everything fits into one partition */
791    pp = (struct PTPanPartition *) calloc(1, sizeof(struct PTPanPartition));
792    if(!pp)
793    {
794      return(FALSE); /* out of memory */
795    }
796
797    /* fill in sensible values */
798    pp->pp_PTPanGlobal = pg;
799    pp->pp_ID = 0;
800    pp->pp_Prefix = 0;
801    pp->pp_PrefixLen = 0;
802    pp->pp_Size = pg->pg_TotalRawSize;
803    pp->pp_RawOffset = 0;
804    pp->pp_PartitionName = (STRPTR) calloc(strlen(pg->pg_IndexName) + 5, 1);
805    strncpy(pp->pp_PartitionName, pg->pg_IndexName, strlen(pg->pg_IndexName) - 2);
806    strcat(pp->pp_PartitionName, "t000");
807    AddTail(&pg->pg_Partitions, &pp->pp_Node);
808    pg->pg_NumPartitions = 1;
809    printf("Using only one partition for %ld leaves.\n", pp->pp_Size);
810    return(TRUE);
811  }
812  if(!pg->pg_MergedRawData) /* safe checking */
813  {
814    printf("Huh? No merged raw data. Blame your programmer NOW!\n");
815    return(FALSE);
816  }
817
818  /* make histogram */
819  pg->pg_HistoTable = (ULONG *) calloc(pg->pg_PowerTable[MAXPREFIXSIZE], sizeof(ULONG));
820  if(!pg->pg_HistoTable)
821  {
822    printf("Out of memory for histogram!\n");
823    return(FALSE);
824  }
825
826  /* NOTE: ordering for the index of a prefix in the table is:
827     c_{m} + c_{m-1} * 5 + ... + c_{1} * 5^{m-1}
828     This means that the last part of the prefix has the lowest
829     significance. */
830
831  /* scan through the compressed database */
832  printf("Scanning through compact data...\n");
833  seqptr = pg->pg_MergedRawData;
834  prefix = 0;
835  cnt = 0;
836  pval = 0;
837  for(seqpos = 0; seqpos < pg->pg_TotalRawSize + MAXPREFIXSIZE - 1; seqpos++)
838  {
839    /* get sequence code from packed data */
840    if(!cnt)
841    {
842      pval = *seqptr++;
843      cnt = GetCompressedLongSize(pg, pval);
844      pval >>= pg->pg_BitsShiftTable[cnt];
845    }
846
847    /* generate new prefix code */
848    prefix %= pg->pg_PowerTable[MAXPREFIXSIZE - 1];
849    prefix *= pg->pg_AlphaSize;
850    prefix += (pval / pg->pg_PowerTable[--cnt]) % pg->pg_AlphaSize;
851
852    /* increase histogram value */
853    if(seqpos >= MAXPREFIXSIZE - 1)
854    {
855      pg->pg_HistoTable[prefix]++;
856    }
857  }
858
859  /* generate partitions */
860  cnt = 0;
861  prefixstart = 0;
862  maxpartblock = pg->pg_PowerTable[MAXPREFIXSIZE];
863  partsize = 0;
864  partcnt = 0;
865  pg->pg_MaxPrefixLen = 0;
866  for(prefix = 0; prefix < pg->pg_PowerTable[MAXPREFIXSIZE];)
867  {
868    partsize++;
869    cnt += pg->pg_HistoTable[prefix];
870    if((cnt > pg->pg_MaxPartitionSize) || (partsize >= maxpartblock))
871    {
872      /* partition is full! */
873      if(prefixstart == prefix)
874      {
875        printf("Warning: Partition overflow! Increase MAXPREFIXSIZE!\n");
876        break;
877      } else {
878        ULONG ppos;
879        /* check first, if we had to partition the thing anyway */
880        if(partsize < maxpartblock)
881        {
882          /* find out, how many leaves of the tree can be merged */
883          for(leftpresize = 0; leftpresize < MAXPREFIXSIZE; leftpresize++)
884          {
885            if(((prefixstart / pg->pg_PowerTable[leftpresize]) %
886                pg->pg_AlphaSize) != 0)
887            {
888              break;
889            }
890          }
891          /* check, if we're still in the same block */
892          if(prefix / pg->pg_PowerTable[leftpresize] ==
893             prefixstart / pg->pg_PowerTable[leftpresize])
894          {
895            for(rightpresize = 0; rightpresize <= leftpresize; rightpresize++)
896            {
897              if(prefixstart + pg->pg_PowerTable[rightpresize] > prefix)
898              {
899                break;
900              }
901            }
902            rightpresize--;
903            /* setup maxpartblock for all subpartions */
904            maxpartblock = pg->pg_PowerTable[rightpresize];
905            prefix = prefixstart + maxpartblock - 1;
906          } else {
907            /* we crossed the boundary, do last partition */
908            prefix = prefixstart + pg->pg_PowerTable[leftpresize] - 1;
909            maxpartblock = pg->pg_PowerTable[MAXPREFIXSIZE] - prefix - 1;
910          }
911        } else {
912          /* we had to split the tree before, so this is just
913             another leaf */
914          if((prefix + 1) % (maxpartblock * pg->pg_AlphaSize) == 0)
915          {
916            /* we have reached the last block, go back to the
917               normal search */
918            maxpartblock = pg->pg_PowerTable[MAXPREFIXSIZE] - prefix - 1;
919          }
920        }
921        //printf("New range: %ld - %ld\n", prefixstart, prefix);
922        /* recalculate leaf count */
923        cnt = 0;
924        for(ppos = prefixstart; ppos <= prefix; ppos++)
925        {
926          cnt += pg->pg_HistoTable[ppos];
927        }
928
929        /* don't create empty trees! */
930        if(cnt)
931        {
932          /* get prefix length */
933          for(leftpresize = 1; leftpresize < MAXPREFIXSIZE; leftpresize++)
934          {
935            if((prefix - prefixstart) < pg->pg_PowerTable[leftpresize])
936            {
937              break;
938            }
939          }
940          pp = (struct PTPanPartition *) calloc(1, sizeof(struct PTPanPartition));
941          if(!pp)
942          {
943            return(FALSE); /* out of memory */
944          }
945
946          /* fill in sensible values */
947          pp->pp_PTPanGlobal = pg;
948          pp->pp_ID = partcnt++;
949          pp->pp_Prefix = prefix / pg->pg_PowerTable[leftpresize];
950          pp->pp_PrefixLen = MAXPREFIXSIZE - leftpresize;
951          pp->pp_Size = cnt;
952      /* this is the point where you would change some things to support larger
953         raw datasets than 2 GB */
954      pp->pp_RawOffset = 0; /* FIXME */
955      pp->pp_PartitionName = (STRPTR) calloc(strlen(pg->pg_IndexName) + 5, 1);
956      strncpy(pp->pp_PartitionName, pg->pg_IndexName, strlen(pg->pg_IndexName) - 2);
957      sprintf(&pp->pp_PartitionName[strlen(pg->pg_IndexName) - 2], "t%03ld",
958      pp->pp_ID);
959          AddTail(&pg->pg_Partitions, &pp->pp_Node);
960          /* check, if we need a bigger prefix table */
961          if(pp->pp_PrefixLen > pg->pg_MaxPrefixLen)
962          {
963            pg->pg_MaxPrefixLen = pp->pp_PrefixLen;
964          }
965#ifdef DEBUG
966    {
967        STRPTR tarseq = pg->pg_TempBuffer;
968        ULONG ccnt = pp->pp_PrefixLen;
969        do
970        {
971        *tarseq++ = pg->pg_DecompressTable[(pp->pp_Prefix / pg->pg_PowerTable[--ccnt])
972                                                        % pg->pg_AlphaSize];
973        } while(ccnt);
974        *tarseq = 0;
975        printf("New partition %ld - %ld, size %ld, prefix = %ld, prefixlen = %ld %s\n",
976        prefixstart, prefix, cnt, pp->pp_Prefix, pp->pp_PrefixLen,
977        pg->pg_TempBuffer);
978    }
979#endif
980        } else {
981#ifdef DEBUG
982          printf("Empty partition %ld - %ld\n", prefixstart, prefix);
983#endif
984        }
985        prefix++;
986        /* restart */
987        prefixstart = prefix;
988        cnt = 0;
989        partsize = 0;
990      }
991    } else {
992      prefix++;
993    }
994  }
995  free(pg->pg_HistoTable);
996  pg->pg_NumPartitions = partcnt;
997  printf("Using %d partitions.\n", partcnt);
998  pg->pg_Bench.ts_PrefixScan = BenchTimePassed(pg);
999  return(TRUE);
1000}
1001/* \\\ */
1002
1003/* /// "CreateTreeForPartition()" */
1004BOOL CreateTreeForPartition(struct PTPanPartition *pp)
1005{
1006  struct PTPanGlobal *pg = pp->pp_PTPanGlobal;
1007
1008  printf("*** Partition %ld/%d: %ld nodes (%ld%%) ***\n",
1009    pp->pp_ID + 1, pg->pg_NumPartitions,
1010    pp->pp_Size, pp->pp_Size / (pg->pg_TotalRawSize / 100));
1011
1012  printf(">>> Phase 1: Building up suffix tree in memory... <<<\n");
1013  BuildMemoryTree(pp);
1014
1015  /* prepare tree building */
1016  printf(">>> Phase 2: Calculating tree statistical data... <<<\n");
1017  CalculateTreeStats(pp);
1018
1019  /* write out tree */
1020  printf(">>> Phase 3: Writing tree to secondary storage... <<<\n");
1021  WriteTreeToDisk(pp);
1022
1023  printf(">>> Phase 4: Freeing memory and cleaning it up... <<<\n");
1024
1025  /* return some memory not used anymore */
1026  freeset(pp->pp_SfxNodes, NULL);
1027  freeset(pp->pp_BranchCode, NULL);
1028  freeset(pp->pp_ShortEdgeCode, NULL);
1029  freeset(pp->pp_LongEdgeLenCode, NULL);
1030  freeset(pp->pp_LongDictRaw, NULL);
1031  freeset(pp->pp_LeafBuffer, NULL);
1032
1033  pp->pp_LevelStats     = NULL;
1034  pp->pp_LeafBufferSize = 0;
1035
1036  return(TRUE);
1037}
1038/* \\\ */
1039
1040/* /// "BuildMemoryTree()" */
1041BOOL BuildMemoryTree(struct PTPanPartition *pp)
1042{
1043  struct PTPanGlobal *pg = pp->pp_PTPanGlobal;
1044  ULONG *seqptr ;
1045  ULONG pval, pvalnext;
1046  ULONG cnt;
1047  ULONG nodecnt;
1048  ULONG pos;
1049  ULONG len;
1050  ULONG window;
1051
1052  BenchTimePassed(pg);
1053  /* setup node buffer:
1054     some notes about the node organisation: nodes are kept in a partly
1055     compressed way in memory already: they only store the number of edges they
1056     need. Therefore the big node buffer is split into two parts:
1057     - Nodes with three to five edges starting from the first offset and increasing
1058     - Nodes with two edges starting from the very end and decreasing
1059     - Leaf nodes are stored directly as offset inside a node with the MSB indicating
1060       that this ptr is a leaf.
1061  */
1062  /* this is based on empirical data: Big Nodes are approx 8-15%, small nodes 60-90% */
1063  pp->pp_SfxMemorySize = ((((pp->pp_Size / 100) * SMALLNODESPERCENT) *
1064                           sizeof(struct SfxNode2Edges)) +
1065                          (((pp->pp_Size / 100) * BIGNODESPERCENT) *
1066                           sizeof(struct SfxNodeNEdges)) + (4UL << 20)) & ~3;
1067  if(pp->pp_SfxMemorySize >= (1UL << 30))
1068  {
1069    pp->pp_SfxMemorySize = (1UL << 30)-4;
1070    printf("Warning! Memory limited to 1 GB! Might run out!\n");
1071  }
1072  pp->pp_SfxNodes = (UBYTE *) malloc(pp->pp_SfxMemorySize);
1073  if(!pp->pp_SfxNodes)
1074  {
1075    printf("Couldn't allocate %ld KB for suffix nodes.\n",
1076           pp->pp_SfxMemorySize >> 10);
1077    return(FALSE);
1078  }
1079  /* init pointing offsets */
1080  pp->pp_SfxNEdgeOffset = 0;
1081  pp->pp_Sfx2EdgeOffset = pp->pp_SfxMemorySize;
1082
1083  printf("Allocated %ld KB suffix nodes buffer.\n", pp->pp_SfxMemorySize >> 10);
1084
1085  /* fill in root node */
1086  pp->pp_SfxRoot = (struct SfxNode *) &pp->pp_SfxNodes[pp->pp_SfxNEdgeOffset];
1087  pp->pp_SfxRoot->sn_Parent = 0;
1088  pp->pp_SfxRoot->sn_StartPos = 0;//pos;
1089  pp->pp_SfxRoot->sn_EdgeLen = 0;//pg->pg_TotalRawSize - len;
1090  memset(pp->pp_SfxRoot->sn_Children, 0, pg->pg_AlphaSize * sizeof(ULONG));
1091  pp->pp_SfxNEdgeOffset += sizeof(struct SfxNodeNEdges);
1092
1093  /* main loop to build up the tree */
1094  /* NOTE: as a special precaution, all longwords have MAXCODEFITLONG code length */
1095
1096  /* allocate quick lookup table. This is used to speed up traversal of the
1097     MAXQPREFIXLOOKUPSIZE lowest levels */
1098  pp->pp_QuickPrefixLookup = (ULONG *) calloc(pg->pg_PowerTable[MAXQPREFIXLOOKUPSIZE],
1099                                              sizeof(ULONG));
1100  if(!pp->pp_QuickPrefixLookup)
1101  {
1102    printf("Out of memory for Quick Prefix Lookup!\n");
1103    return(FALSE);
1104  }
1105
1106  len = pg->pg_TotalRawSize;
1107  seqptr = pg->pg_MergedRawData;
1108  /* get first longword */
1109  pval = *seqptr++;
1110  pval >>= pg->pg_BitsShiftTable[MAXCODEFITLONG];
1111  cnt = MAXCODEFITLONG;
1112  /* get second longword */
1113  pvalnext = *seqptr++;
1114  pvalnext >>= pg->pg_BitsShiftTable[MAXCODEFITLONG];
1115  pos = 0;
1116  nodecnt = 0;
1117  pp->pp_QuickPrefixCount = 0;
1118  len -= MAXCODEFITLONG;
1119  do
1120  {
1121    BOOL takepos;
1122
1123    window = ((pval % pg->pg_PowerTable[cnt]) *
1124              pg->pg_PowerTable[MAXCODEFITLONG - cnt]) +
1125      (pvalnext / pg->pg_PowerTable[cnt]);
1126    if(pp->pp_PrefixLen)
1127    {
1128      /* check, if the prefix matches */
1129      takepos = (window / pg->pg_PowerTable[MAXCODEFITLONG - pp->pp_PrefixLen] == pp->pp_Prefix);
1130    } else {
1131      takepos = TRUE; /* it's all one big partition */
1132    }
1133
1134    if(takepos) /* only add this position, if it matches the prefix */
1135    {
1136      if(!(InsertTreePos(pp, pos, window)))
1137      {
1138        break;
1139      }
1140      if((++nodecnt & 0x3fff) == 0)
1141      {
1142        if((nodecnt >> 14) % 50)
1143        {
1144          printf(".");
1145          fflush(stdout);
1146        } else {
1147          printf(". %2ld%% (%6ld KB free)\n",
1148                 pos / (pg->pg_TotalRawSize / 100),
1149                 (pp->pp_Sfx2EdgeOffset - pp->pp_SfxNEdgeOffset) >> 10);
1150        }
1151      }
1152    }
1153
1154    /* get next byte */
1155    cnt--;
1156    if(len)
1157    {
1158      if(!cnt)
1159      {
1160        /* get next position */
1161        pval = pvalnext;
1162        pvalnext = *seqptr++;
1163        pvalnext >>= pg->pg_BitsShiftTable[MAXCODEFITLONG];
1164        cnt = MAXCODEFITLONG;
1165      }
1166      len--;
1167    } else {
1168      if(!cnt)
1169      {
1170        pval = pvalnext;
1171        pvalnext = 0;
1172        cnt = MAXCODEFITLONG;
1173      }
1174    }
1175  } while(++pos < pg->pg_TotalRawSize);
1176  pp->pp_NumBigNodes = pp->pp_SfxNEdgeOffset / sizeof(struct SfxNodeNEdges);
1177  pp->pp_NumSmallNodes = (pp->pp_SfxMemorySize - pp->pp_Sfx2EdgeOffset) /
1178    sizeof(struct SfxNode2Edges);
1179
1180  printf("DONE! (%ld KB unused)\n", (pp->pp_Sfx2EdgeOffset - pp->pp_SfxNEdgeOffset) >> 10);
1181
1182  /* free some memory not required anymore */
1183  freeset(pp->pp_QuickPrefixLookup, NULL);
1184
1185  printf("Quick Prefix Lookup speedup: %ld%% (%ld)\n",
1186         (pp->pp_QuickPrefixCount * 100) / pp->pp_Size, pp->pp_QuickPrefixCount);
1187
1188  if(pp->pp_Size != nodecnt)
1189  {
1190    printf("Something very bad has happened! Predicted partition size [%ld] didn't\n"
1191           "match the actual generated nodes [%ld].\n",
1192           pp->pp_Size, nodecnt);
1193    return(FALSE);
1194  }
1195
1196  printf("Nodes     : %6ld\n", nodecnt);
1197  printf("SmallNodes: %6ld (%ld%%)\n",
1198         pp->pp_NumSmallNodes,
1199         (pp->pp_NumSmallNodes * 100) / nodecnt);
1200  printf("BigNodes  : %6ld (%ld%%)\n",
1201         pp->pp_NumBigNodes,
1202         (pp->pp_NumBigNodes * 100) / nodecnt);
1203
1204  pg->pg_Bench.ts_MemTree += BenchTimePassed(pg);
1205  return(TRUE);
1206}
1207/* \\\ */
1208
1209/* /// "CommonSequenceLength()" */
1210ULONG CommonSequenceLength(struct PTPanPartition *pp,
1211                           ULONG spos1, ULONG spos2, ULONG maxlen)
1212{
1213  struct PTPanGlobal *pg = pp->pp_PTPanGlobal;
1214  ULONG *seqptr = pg->pg_MergedRawData;
1215  ULONG off1 = spos1 / MAXCODEFITLONG;
1216  ULONG cnt1 = spos1 % MAXCODEFITLONG;
1217  ULONG off2 = spos2 / MAXCODEFITLONG;
1218  ULONG cnt2 = spos2 % MAXCODEFITLONG;
1219  ULONG len = 0;
1220
1221  /* a note on the implementation: this routine will not work correctly,
1222     if the sequences are identical or completely of SEQCODE_N, as it
1223     doesn't detect the end of sequence bits, but assumes MAXCODEFITLONG
1224     entries for all longwords. */
1225  /* compare one code */
1226  while((((seqptr[off1] >> pg->pg_BitsShiftTable[MAXCODEFITLONG]) /
1227         pg->pg_PowerTable[MAXCODEFITLONG - cnt1 - 1]) % pg->pg_AlphaSize) ==
1228       (((seqptr[off2] >> pg->pg_BitsShiftTable[MAXCODEFITLONG]) /
1229         pg->pg_PowerTable[MAXCODEFITLONG - cnt2 - 1]) % pg->pg_AlphaSize))
1230  {
1231    /* loop while code is identical */
1232    len++;
1233    if(len >= maxlen)
1234    {
1235      break;
1236    }
1237    if(++cnt1 >= MAXCODEFITLONG)
1238    {
1239      cnt1 = 0;
1240      off1++;
1241    }
1242    if(++cnt2 >= MAXCODEFITLONG)
1243    {
1244      cnt2 = 0;
1245      off2++;
1246    }
1247  }
1248  return(len);
1249}
1250/* \\\ */
1251
1252/* /// "CompareCompressedSequence()" */
1253LONG CompareCompressedSequence(struct PTPanGlobal *pg, ULONG spos1, ULONG spos2)
1254{
1255  ULONG *seqptr = pg->pg_MergedRawData;
1256  ULONG off1 = spos1 / MAXCODEFITLONG;
1257  ULONG cnt1 = spos1 % MAXCODEFITLONG;
1258  ULONG off2 = spos2 / MAXCODEFITLONG;
1259  ULONG cnt2 = spos2 % MAXCODEFITLONG;
1260  ULONG window1, window2;
1261
1262  /* a note on the implementation: this routine will not work correctly,
1263     if the sequences are identical or completely of SEQCODE_N, as it
1264     doesn't detect the end of sequence bits, but assumes MAXCODEFITLONG
1265     entries for all longwords. */
1266  do
1267  {
1268    /* get windows */
1269    /* I know this looks like very obfusciated code, but take your time and you will
1270       understand: take a few codebits from the one longword, add a few bits from
1271       the other */
1272    window1 = (((seqptr[off1] >> pg->pg_BitsShiftTable[MAXCODEFITLONG])
1273                % pg->pg_PowerTable[MAXCODEFITLONG - cnt1]) * pg->pg_PowerTable[cnt1])
1274      + ((seqptr[off1 + 1] >> pg->pg_BitsShiftTable[MAXCODEFITLONG]) /
1275         pg->pg_PowerTable[MAXCODEFITLONG - cnt1]);
1276    window2 = (((seqptr[off2] >> pg->pg_BitsShiftTable[MAXCODEFITLONG])
1277                % pg->pg_PowerTable[MAXCODEFITLONG - cnt2]) * pg->pg_PowerTable[cnt2])
1278      + ((seqptr[off2 + 1] >> pg->pg_BitsShiftTable[MAXCODEFITLONG]) /
1279         pg->pg_PowerTable[MAXCODEFITLONG - cnt2]);
1280
1281    /* compare the windows */
1282    if(window1 == window2)
1283    {
1284      /* was the same, look at next position */
1285      if(++cnt1 >= MAXCODEFITLONG)
1286      {
1287        cnt1 = 0;
1288        off1++;
1289      }
1290      if(++cnt2 >= MAXCODEFITLONG)
1291      {
1292        cnt2 = 0;
1293        off2++;
1294      }
1295    } else {
1296      return(((LONG) window1) - ((LONG) window2));
1297    }
1298  } while(TRUE);
1299  return(0); /* never reached */
1300}
1301/* \\\ */
1302
1303/* /// "InsertTreePos()" */
1304BOOL InsertTreePos(struct PTPanPartition *pp, ULONG pos, ULONG window)
1305{
1306  struct PTPanGlobal *pg = pp->pp_PTPanGlobal;
1307  struct SfxNode *sfxnode = pp->pp_SfxRoot;
1308  struct SfxNode *prevnode;
1309  ULONG *seqptr = pg->pg_MergedRawData;
1310  ULONG relptr;
1311  ULONG len;
1312  UBYTE seqcode;
1313  ULONG childptr;
1314  UWORD childidx, childcnt;
1315  UWORD previdx;
1316  BOOL childisleaf;
1317  ULONG prefix;
1318  ULONG treepos;
1319
1320  prefix = window / pg->pg_PowerTable[MAXCODEFITLONG - MAXQPREFIXLOOKUPSIZE];
1321  /* see if we can use a quick lookup to skip the root levels of the tree */
1322  if((childptr = pp->pp_QuickPrefixLookup[prefix]) &&
1323    (pos + MAXQPREFIXLOOKUPSIZE < pg->pg_TotalRawSize))
1324  {
1325    pp->pp_QuickPrefixCount++;
1326    sfxnode = (struct SfxNode *) &pp->pp_SfxNodes[childptr];
1327    pos += MAXQPREFIXLOOKUPSIZE;
1328    treepos = MAXQPREFIXLOOKUPSIZE;
1329  } else {
1330    treepos = 0;
1331  }
1332  len = pg->pg_TotalRawSize - pos;
1333  while(len)
1334  {
1335    /* get first sequence code */
1336    seqcode = GetSeqCodeQuick(pos);
1337
1338    /*printf("[%ld%c] ", pos, //((ULONG) sfxnode) - ((ULONG) pp->pp_SfxNodes),
1339      pg->pg_DecompressTable[seqcode]);*/
1340
1341    /* check, if there's already a child */
1342    relptr = 0;
1343    childidx = sfxnode->sn_Parent >> RELOFFSETBITS;
1344
1345    while(childidx--)
1346    {
1347      childptr = sfxnode->sn_Children[childidx];
1348      if(childptr >> LEAFBIT)
1349      {
1350        /* this is a leaf pointer and doesn't contain a seqcode */
1351        childptr &= ~LEAFMASK;
1352    childptr += treepos;
1353        //printf("<%c>", pg->pg_DecompressTable[GetSeqCodeQuick(childptr)]);
1354        if(GetSeqCodeQuick(childptr) == seqcode)
1355        {
1356          /* fill in a dummy relptr -- we'll use childptr later */
1357          relptr = ~0UL;
1358          childisleaf = TRUE;
1359          break;
1360        }
1361      } else {
1362        //printf("[%c]", pg->pg_DecompressTable[childptr >> RELOFFSETBITS]);
1363        if((childptr >> RELOFFSETBITS) == seqcode)
1364        {
1365          /* hey, we actually found the right child */
1366          relptr = (childptr & RELOFFSETMASK) << 2;
1367          childisleaf = FALSE;
1368          break;
1369        }
1370      }
1371    }
1372    /* did we find a child? */
1373    if(relptr)
1374    {
1375      ULONG matchsize;
1376
1377      if(childisleaf)
1378      {
1379        struct SfxNode *splitnode;
1380        /* relptr is no pointer to a node, but a startpos instead */
1381        matchsize = CommonSequenceLength(pp, pos, childptr, pg->pg_TotalRawSize - childptr);
1382
1383        /* this will always lead to partial matches! */
1384
1385        /* allocate a new branching node */
1386        pp->pp_Sfx2EdgeOffset -= sizeof(struct SfxNode2Edges);
1387
1388        /*printf("Leaf split (pos %ld, len %ld): %d (%c != %c) -> [%ld]\n",
1389               pos, len, matchsize,
1390               pg->pg_DecompressTable[GetSeqCodeQuick(childptr + matchsize)],
1391               pg->pg_DecompressTable[GetSeqCodeQuick(pos + matchsize)],
1392               pp->pp_Sfx2EdgeOffset);*/
1393
1394        splitnode = (struct SfxNode *) &pp->pp_SfxNodes[pp->pp_Sfx2EdgeOffset];
1395        /* fill in the node with the two leaves */
1396        splitnode->sn_Parent = ((((ULONG) sfxnode) - ((ULONG) pp->pp_SfxNodes)) >> 2) |
1397          (2 << RELOFFSETBITS);
1398        splitnode->sn_StartPos = childptr;
1399        splitnode->sn_EdgeLen = matchsize;
1400        splitnode->sn_Children[0] = LEAFMASK | (childptr - treepos);
1401        splitnode->sn_Children[1] = LEAFMASK | (pos - treepos);
1402    /*printf("Child0 = %ld, Child1 = %ld\n",
1403        splitnode->sn_Children[0] & ~LEAFMASK,
1404        splitnode->sn_Children[1] & ~LEAFMASK);*/
1405#if 0 // debug
1406    if(GetSeqCodeQuick(childptr + matchsize) == GetSeqCodeQuick(pos + matchsize))
1407    {
1408    printf("CIS: %ld<->%ld [%ld|%ld] (matchsize %ld)\n",
1409        GetSeqCodeQuick(childptr + matchsize),
1410        GetSeqCodeQuick(pos + matchsize),
1411        childptr,
1412        pos,
1413        matchsize);
1414    }
1415#endif
1416        /* fix downlink (child) pointer */
1417        sfxnode->sn_Children[childidx] = (seqcode << RELOFFSETBITS) | (pp->pp_Sfx2EdgeOffset >> 2);
1418        break;
1419      }
1420      //printf("->[%ld]", relptr);
1421      /* okay, there is a child, get it */
1422      prevnode = sfxnode;
1423      previdx = childidx; /* is needed for correcting the downlink ptr */
1424      sfxnode = (struct SfxNode *) &pp->pp_SfxNodes[relptr];
1425      /* compare its edge */
1426      if(sfxnode->sn_EdgeLen > 1)
1427      {
1428        matchsize = CommonSequenceLength(pp, pos, sfxnode->sn_StartPos, sfxnode->sn_EdgeLen);
1429      } else {
1430        matchsize = 1;
1431      }
1432      if(matchsize < sfxnode->sn_EdgeLen) /* did the whole edge match? */
1433      {
1434        struct SfxNode *upnode;
1435        //printf("Partmatch(%ld, %ld): %d\n", pos, len, matchsize);
1436        /* we only had a partial match, we need to split the node */
1437
1438        /* allocate a new node */
1439        pp->pp_Sfx2EdgeOffset -= sizeof(struct SfxNode2Edges);
1440        upnode = (struct SfxNode *) &pp->pp_SfxNodes[pp->pp_Sfx2EdgeOffset];
1441
1442        /* fix linkage of middle node and set two edges */
1443        upnode->sn_Parent = ((((ULONG) prevnode) - ((ULONG) pp->pp_SfxNodes)) >> 2) |
1444          (2 << RELOFFSETBITS);
1445        upnode->sn_StartPos = sfxnode->sn_StartPos;
1446        upnode->sn_EdgeLen = matchsize;
1447        upnode->sn_Children[0] = ((((ULONG) sfxnode) - ((ULONG) pp->pp_SfxNodes)) >> 2) |
1448          (GetSeqCodeQuick(upnode->sn_StartPos + matchsize) << RELOFFSETBITS);
1449        /* enter child leaf node */
1450        upnode->sn_Children[1] = LEAFMASK | (pos - treepos);
1451
1452#if 0 // debug
1453    if(GetSeqCodeQuick(upnode->sn_StartPos + matchsize) == GetSeqCodeQuick(pos + matchsize))
1454    {
1455    printf("SN: %ld<->%ld\n",
1456        GetSeqCodeQuick(upnode->sn_StartPos + matchsize),
1457        GetSeqCodeQuick(pos + matchsize));
1458    }
1459#endif
1460
1461        /* fix sfxnode linkage and edge */
1462        sfxnode->sn_Parent = (pp->pp_Sfx2EdgeOffset >> 2) |
1463          (sfxnode->sn_Parent & ~RELOFFSETMASK);
1464        sfxnode->sn_StartPos += matchsize;
1465        sfxnode->sn_EdgeLen -= matchsize;
1466
1467        /* fix prevnode */
1468        prevnode->sn_Children[childidx] = (pp->pp_Sfx2EdgeOffset >> 2) |
1469          (prevnode->sn_Children[childidx] & ~RELOFFSETMASK);
1470
1471        if(pp->pp_SfxNEdgeOffset >= pp->pp_Sfx2EdgeOffset)
1472        {
1473          printf("Node buffer was too small!\n");
1474          return(FALSE);
1475        }
1476        break;
1477      } else {
1478        /* the whole edge matched, just follow the path */
1479        /*printf("Wholematch(%ld, %ld): %d [%d] -> %ld\n",
1480          pos, len, matchsize, seqcode, (((ULONG) sfxnode) - ((ULONG) pp->pp_SfxNodes)));*/
1481        pos += matchsize;
1482        len -= matchsize;
1483        treepos += matchsize;
1484      }
1485    } else {
1486      childcnt = (sfxnode->sn_Parent >> RELOFFSETBITS);
1487      /*printf("New leaf[%d]-(pos %ld, len %ld): (%c)\n",
1488        childcnt, pos, len, pg->pg_DecompressTable[seqcode]);*/
1489      if((childcnt == 2) && (sfxnode != pp->pp_SfxRoot))
1490      {
1491        struct SfxNode *bignode;
1492        struct SfxNode *lastnode;
1493
1494        /* we need to expand this node from 2 to 5 branches first */
1495        /*printf("2T5 [%ld]->[%ld] [%ld]->[%ld]\n",
1496               ((ULONG) sfxnode) - ((ULONG) pp->pp_SfxNodes),
1497               pp->pp_SfxNEdgeOffset,
1498               pp->pp_Sfx2EdgeOffset,
1499               ((ULONG) sfxnode) - ((ULONG) pp->pp_SfxNodes));*/
1500
1501        /* allocate a new big node */
1502        bignode = (struct SfxNode *) &pp->pp_SfxNodes[pp->pp_SfxNEdgeOffset];
1503
1504        /* copy node */
1505        memcpy(bignode, sfxnode, sizeof(struct SfxNode2Edges));
1506
1507        /* fix prevnode -> bignode child pointer */
1508        prevnode->sn_Children[previdx] = (pp->pp_SfxNEdgeOffset >> 2) |
1509          (prevnode->sn_Children[previdx] & ~RELOFFSETMASK);
1510
1511        /* fix children -> bignode parent pointers */
1512        childidx = 2; //(bignode->sn_Parent >> RELOFFSETBITS);
1513        while(childidx--)
1514        {
1515          childptr = bignode->sn_Children[childidx];
1516          if(!(childptr >> LEAFBIT)) /* only check for real nodes */
1517          {
1518            relptr = (childptr & RELOFFSETMASK) << 2;
1519            //printf("Fixup childidx=%d from %ld\n", childidx, relptr);
1520            /* fix the pointer to new location */
1521            prevnode = (struct SfxNode *) &pp->pp_SfxNodes[relptr];
1522            prevnode->sn_Parent = (prevnode->sn_Parent & ~RELOFFSETMASK) |
1523              (((ULONG) bignode) - ((ULONG) pp->pp_SfxNodes) >> 2);
1524          }
1525        }
1526
1527        /* avoid copy, if both are the same */
1528        if(pp->pp_Sfx2EdgeOffset !=
1529           ((ULONG) sfxnode) - ((ULONG) pp->pp_SfxNodes))
1530        {
1531          /* regain memory by copying last two edge node into hole */
1532          lastnode = (struct SfxNode *) &pp->pp_SfxNodes[pp->pp_Sfx2EdgeOffset];
1533
1534          /* copy node */
1535          memcpy(sfxnode, lastnode, sizeof(struct SfxNode2Edges));
1536
1537          /* find lastnode->parent->lastnode downward pointer */
1538          prevnode = (struct SfxNode *) &pp->pp_SfxNodes[(lastnode->sn_Parent & RELOFFSETMASK) << 2];
1539          childidx = (prevnode->sn_Parent >> RELOFFSETBITS);
1540          while(childidx--)
1541          {
1542            childptr = prevnode->sn_Children[childidx];
1543            if(!(childptr >> LEAFBIT)) /* only check for real nodes */
1544            {
1545              if((childptr & RELOFFSETMASK) == (pp->pp_Sfx2EdgeOffset >> 2))
1546              {
1547                /* fix the pointer to new location */
1548                /*printf("Fixdown childidx=%d from %ld\n",
1549                  childidx, lastnode->sn_Parent & RELOFFSETMASK);*/
1550                prevnode->sn_Children[childidx] = (childptr & ~RELOFFSETMASK) |
1551                  (((ULONG) sfxnode) - ((ULONG) pp->pp_SfxNodes) >> 2);
1552                break;
1553              }
1554            }
1555          }
1556
1557          /* fix children->parent upward pointers */
1558          childidx = 2;//(lastnode->sn_Parent >> RELOFFSETBITS);
1559          while(childidx--)
1560          {
1561            childptr = lastnode->sn_Children[childidx];
1562            if(!(childptr >> LEAFBIT)) /* only check for real nodes */
1563            {
1564              relptr = (childptr & RELOFFSETMASK) << 2;
1565              //printf("Fixup childidx=%d from %ld\n", childidx, relptr);
1566              /* fix the pointer to new location */
1567              prevnode = (struct SfxNode *) &pp->pp_SfxNodes[relptr];
1568              prevnode->sn_Parent = (prevnode->sn_Parent & ~RELOFFSETMASK) |
1569                (((ULONG) sfxnode) - ((ULONG) pp->pp_SfxNodes) >> 2);
1570            }
1571          }
1572        }
1573        /* we're done with the fixing, now correct memory usage */
1574        sfxnode = bignode;
1575        pp->pp_Sfx2EdgeOffset += sizeof(struct SfxNode2Edges);
1576        pp->pp_SfxNEdgeOffset += sizeof(struct SfxNodeNEdges);
1577        if(treepos == MAXQPREFIXLOOKUPSIZE)
1578        {
1579          pp->pp_QuickPrefixLookup[prefix] = ((ULONG) sfxnode) - ((ULONG) pp->pp_SfxNodes);
1580        }
1581      }
1582
1583      /* enter new child */
1584      sfxnode->sn_Children[childcnt] = LEAFMASK | (pos - treepos);
1585      //printf("New child: %ld (%ld)\n", pos-treepos, treepos);
1586      /* increase edgecount */
1587      sfxnode->sn_Parent += (1UL << RELOFFSETBITS);
1588      break;
1589    }
1590  }
1591  //printf("Done (2E: %ld NE: %ld)\n", pp->pp_Sfx2EdgeOffset, pp->pp_SfxNEdgeOffset);
1592  return(TRUE);
1593}
1594/* \\\ */
1595
1596/* /// "CalculateTreeStats()" */
1597BOOL CalculateTreeStats(struct PTPanPartition *pp)
1598{
1599  struct PTPanGlobal *pg = pp->pp_PTPanGlobal;
1600  ULONG cnt;
1601  ULONG edgetotal;
1602  ULONG bitsused;
1603  ULONG threshold;
1604
1605  BenchTimePassed(pg);
1606  /* now we need some statistical data. this includes:
1607     a) the maximum depth of the tree
1608     b) the number of nodes and leaves on each level (i.e. population)
1609        out of b) we will generate the TREE PRUNING DEPTH
1610     c) the appearance of all branch combinations (branch combo mask) up to the pruning depth
1611        out of c) we will generate a HUFFMAN CODE for the branch ptr mask
1612     d) the occurrences of the short edges codes and the number of long edges
1613        out of d) we will generate a HUFFMAN CODE for the edge codes
1614        moreover, it replaces the short edges by the indexes to the huffman code (sn_StartPos
1615        is recycled, gets bit 30 set).
1616     e) generating a HUFFMAN CODE fo the long edge lengths
1617     f) generating a lookup DICTIONARY for the long edge codes
1618        f) will allow storing shorter label pointers and only a fraction of memory for lookup
1619        long edges will be replaced by the position in the dictionary (sn_StartPos is recycled,
1620        gets bit 31 set).
1621  */
1622  //memset(pp->pp_EdgeBranchTable, 0, ALPHASIZE * sizeof(ULONG));
1623
1624#if 0 /* debug */
1625  pp->pp_VerifyArray = (UBYTE *) calloc(pg->pg_TotalRawSize+1, 1);
1626
1627  /* verify the correctness of the tree */
1628  printf("Verifying correctness of the tree...\n");
1629  GetTreeStatsVerifyRec(pp, 0, 0, 0);
1630
1631  for(cnt = 0; cnt < ((pg->pg_TotalRawSize + 7) >> 3); cnt++)
1632  {
1633    if(pp->pp_VerifyArray[cnt] != 0xff)
1634    {
1635        printf("Hole in Position %ld to %ld (%02x)\n",
1636        cnt << 3, (cnt << 3)+7, pp->pp_VerifyArray[cnt]);
1637    } else {
1638      /*printf("Good in Position %ld to %ld (%02x)\n",
1639    cnt << 3, (cnt << 3)+7, pp->pp_VerifyArray[cnt]);*/
1640    }
1641  }
1642  free(pp->pp_VerifyArray);
1643#endif
1644
1645  /* calculate maximum depth of the tree */
1646  pp->pp_MaxTreeDepth = 0;
1647  GetTreeStatsTreeDepthRec(pp, 0, 0);
1648  pp->pp_MaxTreeDepth++; /* increase by one due to leaf level */
1649
1650  printf("Max tree depth: %ld\n", pp->pp_MaxTreeDepth);
1651
1652  /* generate statistical data, part 2: level population */
1653  pp->pp_LevelStats = (struct TreeLevelStats *) calloc(pp->pp_MaxTreeDepth,
1654                                sizeof(struct TreeLevelStats));
1655
1656  GetTreeStatsLevelRec(pp, 0, 0);
1657
1658  /* calculate missing total values and pruning position */
1659  if(pg->pg_PruneLength)
1660  {
1661    pp->pp_TreePruneDepth = pg->pg_PruneLength;
1662    pp->pp_TreePruneLength = pg->pg_PruneLength;
1663  } else {
1664    pp->pp_TreePruneDepth = 20; /* FIXME */
1665    pp->pp_TreePruneLength = 20; /* FIXME */
1666  }
1667  cnt = pp->pp_MaxTreeDepth;
1668  while(cnt--)
1669  {
1670    pp->pp_LevelStats[cnt].tls_TotalLeafCount = pp->pp_LevelStats[cnt].tls_LeafCount;
1671    pp->pp_LevelStats[cnt].tls_TotalNodeCount = pp->pp_LevelStats[cnt].tls_NodeCount;
1672    if(cnt < pp->pp_MaxTreeDepth-1)
1673    {
1674      pp->pp_LevelStats[cnt].tls_TotalLeafCount += pp->pp_LevelStats[cnt+1].tls_TotalLeafCount;
1675      pp->pp_LevelStats[cnt].tls_TotalNodeCount += pp->pp_LevelStats[cnt+1].tls_TotalNodeCount;
1676
1677      /* calculate tree pruning depth. currently, will prune at 66% of the leaves
1678    covered */
1679      if(!pp->pp_TreePruneDepth)
1680      {
1681    if(pp->pp_LevelStats[cnt].tls_TotalLeafCount > pp->pp_Size / 3)
1682    {
1683    pp->pp_TreePruneDepth = cnt;
1684    }
1685      }
1686    }
1687  }
1688
1689  /* debug output */
1690#if 0
1691  for(cnt = 0; cnt < pp->pp_MaxTreeDepth; cnt++)
1692  {
1693    printf("Level %3ld: Nodes=%6ld, Leaves=%6ld, TotalNodes=%6ld, TotalLeaves=%6ld\n",
1694    cnt,
1695    pp->pp_LevelStats[cnt].tls_NodeCount,
1696    pp->pp_LevelStats[cnt].tls_LeafCount,
1697    pp->pp_LevelStats[cnt].tls_TotalNodeCount,
1698    pp->pp_LevelStats[cnt].tls_TotalLeafCount);
1699  }
1700#endif
1701
1702  printf("Tree pruning at depth %ld, length %ld.\n",
1703    pp->pp_TreePruneDepth,
1704    pp->pp_TreePruneLength);
1705
1706  freeset(pp->pp_LevelStats, NULL);
1707
1708  /* allocate branch histogram */
1709  pp->pp_BranchCode = (struct HuffCode *) calloc(1UL << pg->pg_AlphaSize,
1710                    sizeof(struct HuffCode));
1711  if(!pp->pp_BranchCode)
1712  {
1713    printf("Out of memory for Branch Histogram!\n");
1714    return(FALSE);
1715  }
1716  GetTreeStatsBranchHistoRec(pp, 0, 0, 0);
1717
1718  /* generate statistical data, part 3: edge lengths and combinations */
1719
1720  /* allocate short edge code histogram (for edges between of 2-7 base pairs) */
1721  bitsused = pg->pg_BitsUseTable[SHORTEDGEMAX]+1;
1722  pp->pp_ShortEdgeCode = (struct HuffCode *) calloc((1UL << bitsused),
1723                        sizeof(struct HuffCode));
1724  if(!pp->pp_ShortEdgeCode)
1725  {
1726    printf("Out of memory for Short Edge Histogram!\n");
1727    return(FALSE);
1728  }
1729
1730  /* get short edge stats */
1731  pp->pp_EdgeCount = 0;
1732  pp->pp_ShortEdgeCode[1].hc_Weight++;
1733  GetTreeStatsShortEdgesRec(pp, 0, 0, 0);
1734
1735  /* define threshold for small edge optimization */
1736  threshold = pp->pp_EdgeCount / 10000;
1737
1738  /* calculate the number of longedges required */
1739  edgetotal = pp->pp_EdgeCount;
1740  for(cnt = 1; cnt < (1UL << bitsused); cnt++)
1741  {
1742    if(pp->pp_ShortEdgeCode[cnt].hc_Weight > threshold) /* code will remain in the table */
1743    {
1744      edgetotal -= pp->pp_ShortEdgeCode[cnt].hc_Weight;
1745    }
1746  }
1747  /* code 0 will be used for long edges */
1748  pp->pp_ShortEdgeCode[0].hc_Weight = edgetotal;
1749
1750  printf("Considering %ld (%ld+%ld) edges for the final tree.\n",
1751    pp->pp_EdgeCount, pp->pp_EdgeCount - edgetotal, edgetotal);
1752
1753  /* generate huffman code for short edge, but only for codes that provide at least
1754     1/10000th of the edges (other stuff goes into the long edges) */
1755  printf("Generating huffman code for short edges\n");
1756  BuildHuffmanCode(pp->pp_ShortEdgeCode, (1UL << bitsused), threshold);
1757
1758#if 0
1759  /* debug */
1760  for(cnt = 0; cnt < (1UL << bitsused); cnt++)
1761  {
1762    WORD bitcnt;
1763    if(pp->pp_ShortEdgeCode[cnt].hc_CodeLength)
1764    {
1765      printf("%6ld: %7ld -> %2d ", cnt, pp->pp_ShortEdgeCode[cnt].hc_Weight,
1766        pp->pp_ShortEdgeCode[cnt].hc_CodeLength);
1767      for(bitcnt = pp->pp_ShortEdgeCode[cnt].hc_CodeLength - 1; bitcnt >= 0; bitcnt--)
1768      {
1769    printf("%s", pp->pp_ShortEdgeCode[cnt].hc_Codec & (1UL << bitcnt) ? "1" : "0");
1770      }
1771      printf("\n");
1772    }
1773  }
1774#endif
1775
1776  /* now generate dictionary for the long edges. This is done by generating an array of
1777     all long edges and then sorting it according to the length. Then, the dictionary
1778     string is built up. It replaces the starting pos with the pos in the dictionary
1779     and sets bit 31 to indicate this.
1780   */
1781  //GetTreeStatsDebugRec(pp, 0, 0);
1782  pg->pg_Bench.ts_TreeStats += BenchTimePassed(pg);
1783
1784  BuildLongEdgeDictionary(pp);
1785
1786  BenchTimePassed(pg);
1787
1788  /* generate huffman code for edge bit mask, saves 1-2 bits per node */
1789  printf("Generating huffman code for branch mask\n");
1790  BuildHuffmanCode(pp->pp_BranchCode, (1UL << pg->pg_AlphaSize), 0);
1791
1792  /* debug output */
1793#if 0
1794  for(cnt = 0; cnt < (1UL << pg->pg_AlphaSize); cnt++)
1795  {
1796    WORD bitcnt;
1797    if(pp->pp_BranchCode[cnt].hc_Weight)
1798    {
1799      printf("%2ld: %6ld -> %2d ", cnt, pp->pp_BranchCode[cnt].hc_Weight,
1800        pp->pp_BranchCode[cnt].hc_CodeLength);
1801      for(bitcnt = 0; bitcnt < pg->pg_AlphaSize; bitcnt++)
1802      {
1803    printf("%c", (cnt & (1UL << bitcnt)) ? pg->pg_DecompressTable[bitcnt] : ' ');
1804      }
1805      printf(" ");
1806      for(bitcnt = pp->pp_BranchCode[cnt].hc_CodeLength - 1; bitcnt >= 0; bitcnt--)
1807      {
1808    printf("%s", pp->pp_BranchCode[cnt].hc_Codec & (1UL << bitcnt) ? "1" : "0");
1809      }
1810      printf("\n");
1811    }
1812  }
1813#endif
1814
1815  pg->pg_Bench.ts_TreeStats += BenchTimePassed(pg);
1816  //GetTreeStatsDebugRec(pp, 0, 0);
1817
1818  return(TRUE);
1819}
1820/* \\\ */
1821
1822/* /// "GetTreeStatsDebugRec()" */
1823void GetTreeStatsDebugRec(struct PTPanPartition *pp, ULONG pos, ULONG level)
1824{
1825  struct PTPanGlobal *pg = pp->pp_PTPanGlobal;
1826  struct SfxNode *sfxnode;
1827  ULONG childptr;
1828  UWORD childidx;
1829  ULONG cnt;
1830  ULONG *seqptr = pg->pg_MergedRawData;
1831
1832  level++;
1833  sfxnode = (struct SfxNode *) &pp->pp_SfxNodes[pos];
1834  childidx = sfxnode->sn_Parent >> RELOFFSETBITS;
1835  printf("Pos: [%08lx], Level %ld, Edge [",
1836    pos, level);
1837  if(sfxnode->sn_StartPos & (3UL << 30))
1838  {
1839    printf("%08lx", sfxnode->sn_StartPos);
1840  } else {
1841    for(cnt = 0; cnt < sfxnode->sn_EdgeLen; cnt++)
1842    {
1843      printf("%c", pg->pg_DecompressTable[GetSeqCodeQuick(cnt + sfxnode->sn_StartPos)]);
1844    }
1845  }
1846  printf("] EdgeStart %ld, EdgeLen %d children %d\nChildren: ",
1847    sfxnode->sn_StartPos, sfxnode->sn_EdgeLen, childidx);
1848  while(childidx--)
1849  {
1850    childptr = sfxnode->sn_Children[childidx];
1851    printf("[%08lx] ", childptr);
1852  }
1853  printf("\n");
1854
1855  if(level < 3)
1856  {
1857    /* traverse children */
1858    childidx = sfxnode->sn_Parent >> RELOFFSETBITS;
1859    while(childidx--)
1860    {
1861      childptr = sfxnode->sn_Children[childidx];
1862      if(!(childptr >> LEAFBIT))
1863      {
1864    /* this is a normal node pointer, recurse */
1865    GetTreeStatsDebugRec(pp, (childptr & RELOFFSETMASK) << 2, level);
1866      }
1867    }
1868  }
1869  printf("End Level %ld\n", level);
1870}
1871/* \\\ */
1872
1873/* /// "GetTreeStatsTreeDepthRec()" */
1874void GetTreeStatsTreeDepthRec(struct PTPanPartition *pp, ULONG pos, ULONG level)
1875{
1876  struct SfxNode *sfxnode;
1877  ULONG childptr;
1878  UWORD childidx;
1879
1880  /* calculate maximum tree depth */
1881  if(++level > pp->pp_MaxTreeDepth)
1882  {
1883    pp->pp_MaxTreeDepth = level;
1884  }
1885
1886  /* traverse children */
1887  sfxnode = (struct SfxNode *) &pp->pp_SfxNodes[pos];
1888  childidx = sfxnode->sn_Parent >> RELOFFSETBITS;
1889  while(childidx--)
1890  {
1891    childptr = sfxnode->sn_Children[childidx];
1892    if(!(childptr >> LEAFBIT))
1893    {
1894      /* this is a normal node pointer, recurse */
1895      GetTreeStatsTreeDepthRec(pp, (childptr & RELOFFSETMASK) << 2, level);
1896    }
1897  }
1898}
1899/* \\\ */
1900
1901/* /// "GetTreeStatsLevelRec()" */
1902void GetTreeStatsLevelRec(struct PTPanPartition *pp, ULONG pos, ULONG level)
1903{
1904  struct SfxNode *sfxnode;
1905  ULONG childptr;
1906  UWORD childidx;
1907
1908  /* traverse children */
1909  sfxnode = (struct SfxNode *) &pp->pp_SfxNodes[pos];
1910  //sfxnode->sn_Parent |= RELOFFSETMASK;
1911  childidx = sfxnode->sn_Parent >> RELOFFSETBITS;
1912  while(childidx--)
1913  {
1914    childptr = sfxnode->sn_Children[childidx];
1915    if(childptr >> LEAFBIT)
1916    {
1917      /* update leaf counter */
1918      pp->pp_LevelStats[level+1].tls_LeafCount++;
1919    } else {
1920      /* this is a normal node pointer, recurse */
1921      GetTreeStatsLevelRec(pp, (childptr & RELOFFSETMASK) << 2, level+1);
1922      /* update node counter */
1923      pp->pp_LevelStats[level].tls_NodeCount++;
1924    }
1925  }
1926}
1927/* \\\ */
1928
1929/* /// "GetTreeStatsShortEdgesRec()" */
1930void GetTreeStatsShortEdgesRec(struct PTPanPartition *pp,
1931                    ULONG pos, ULONG level, ULONG elen)
1932{
1933  struct PTPanGlobal *pg = pp->pp_PTPanGlobal;
1934  ULONG edgelen;
1935  ULONG epos;
1936  ULONG prefix;
1937  ULONG *seqptr = pg->pg_MergedRawData;
1938  struct SfxNode *sfxnode;
1939  ULONG childptr;
1940  UWORD childidx;
1941
1942  sfxnode = (struct SfxNode *) &pp->pp_SfxNodes[pos];
1943  /* update short edge code histogram */
1944  epos = sfxnode->sn_StartPos + 1;
1945  edgelen = sfxnode->sn_EdgeLen;
1946  /* do we need to truncate the edge? */
1947  if(elen + edgelen > pp->pp_TreePruneLength)
1948  {
1949    edgelen = pp->pp_TreePruneLength - elen;
1950    sfxnode->sn_EdgeLen = edgelen;
1951  }
1952  elen += edgelen;
1953  if((edgelen > 1) && (edgelen <= SHORTEDGEMAX))
1954  {
1955    prefix = 0;
1956    while(--edgelen)
1957    {
1958      prefix *= pg->pg_AlphaSize;
1959      prefix += GetSeqCodeQuick(epos);
1960      epos++;
1961    }
1962    /* add stop bit */
1963    prefix |= 1UL << pg->pg_BitsUseTable[sfxnode->sn_EdgeLen - 1];
1964    pp->pp_ShortEdgeCode[prefix].hc_Weight++;
1965  }
1966  else if(edgelen == 1)
1967  {
1968    pp->pp_ShortEdgeCode[1].hc_Weight++; /* also count length==1 edges */
1969  }
1970
1971  /* increase edgecount */
1972  pp->pp_EdgeCount++;
1973
1974  /* check for maximum depth reached */
1975  if((++level >= pp->pp_TreePruneDepth) || (elen >= pp->pp_TreePruneLength))
1976  {
1977    //printf("SE Level %ld, Epos %ld\n", level, elen);
1978    return;
1979  }
1980
1981  /* traverse children */
1982  childidx = sfxnode->sn_Parent >> RELOFFSETBITS;
1983  while(childidx--)
1984  {
1985    childptr = sfxnode->sn_Children[childidx];
1986    if(!(childptr >> LEAFBIT))
1987    {
1988      /* this is a normal node pointer, recurse */
1989      GetTreeStatsShortEdgesRec(pp, (childptr & RELOFFSETMASK) << 2, level, elen);
1990    } else {
1991      /* implicit singular edge */
1992      childptr &= ~LEAFMASK;
1993      epos = childptr + 1;
1994      edgelen = pp->pp_TreePruneLength - elen;
1995      if((edgelen > 1) && (edgelen <= SHORTEDGEMAX))
1996      {
1997    prefix = 0;
1998    while(--edgelen)
1999    {
2000    prefix *= pg->pg_AlphaSize;
2001    prefix += GetSeqCodeQuick(epos);
2002    epos++;
2003    }
2004    /* add stop bit */
2005    prefix |= 1UL << pg->pg_BitsUseTable[pp->pp_TreePruneLength - elen - 1];
2006    pp->pp_ShortEdgeCode[prefix].hc_Weight++;
2007      }
2008      else if(edgelen == 1)
2009      {
2010    pp->pp_ShortEdgeCode[1].hc_Weight++; /* also count length==1 edges */
2011      }
2012      /* increase edgecount */
2013      pp->pp_EdgeCount++;
2014    }
2015  }
2016}
2017/* \\\ */
2018
2019/* /// "GetTreeStatsLongEdgesRec()" */
2020void GetTreeStatsLongEdgesRec(struct PTPanPartition *pp,
2021                ULONG pos, ULONG level, ULONG elen)
2022{
2023  struct PTPanGlobal *pg = pp->pp_PTPanGlobal;
2024  ULONG edgelen;
2025  ULONG epos;
2026  ULONG prefix;
2027  ULONG *seqptr = pg->pg_MergedRawData;
2028  struct SfxNode *sfxnode;
2029  ULONG childptr;
2030  UWORD childidx;
2031  UWORD seqcode;
2032
2033  sfxnode = (struct SfxNode *) &pp->pp_SfxNodes[pos];
2034  /* update short edge code histogram */
2035  epos = sfxnode->sn_StartPos + 1;
2036  edgelen = sfxnode->sn_EdgeLen;
2037  /* do we need to truncate the edge? */
2038#if 0 /* debug */
2039  if(elen + edgelen > pp->pp_TreePruneLength)
2040  {
2041    edgelen = pp->pp_TreePruneLength - elen;
2042    printf("DARF NICHT!");
2043    sfxnode->sn_EdgeLen = edgelen;
2044  }
2045#endif
2046  elen += edgelen;
2047  if(edgelen > 1)
2048  {
2049    if(edgelen <= SHORTEDGEMAX)
2050    {
2051      prefix = 0;
2052      while(--edgelen)
2053      {
2054    prefix *= pg->pg_AlphaSize;
2055    prefix += GetSeqCodeQuick(epos);
2056    epos++;
2057      }
2058
2059      /* add stop bit */
2060      prefix |= 1UL << pg->pg_BitsUseTable[sfxnode->sn_EdgeLen - 1];
2061      /* check, if this edge doesn't have a huffman code */
2062      if(!pp->pp_ShortEdgeCode[prefix].hc_CodeLength)
2063      {
2064    pp->pp_LongEdges[pp->pp_LongEdgeCount++] = sfxnode;
2065      } else {
2066    /* replace sn_StartPos by huffman code index */
2067    sfxnode->sn_StartPos = prefix | (1UL << 30);
2068      }
2069    } else {
2070      /* this is a long edge anyway and has no huffman code */
2071      pp->pp_LongEdges[pp->pp_LongEdgeCount++] = sfxnode;
2072    }
2073  }
2074  else if(edgelen == 1)
2075  {
2076    /* replace sn_StartPos by huffman code index */
2077    sfxnode->sn_StartPos = 1 | (1UL << 30);
2078  }
2079
2080  /* check for maximum depth reached */
2081  if((++level >= pp->pp_TreePruneDepth) || (elen >= pp->pp_TreePruneLength))
2082  {
2083    //printf("LE Level %ld, Epos %ld\n", level, elen);
2084    return;
2085  }
2086
2087  /* traverse children */
2088  childidx = sfxnode->sn_Parent >> RELOFFSETBITS;
2089  while(childidx--)
2090  {
2091    childptr = sfxnode->sn_Children[childidx];
2092#if 1 // extra leaves switch (see 4.3.4 of diploma thesis)
2093    if(childptr >> LEAFBIT)
2094    {
2095      /* implicit singular edge */
2096      struct SfxNode *tinynode;
2097      /* allocate a new branching node */
2098      pp->pp_Sfx2EdgeOffset -= sizeof(struct SfxNode2Edges) - sizeof(ULONG); /* only one child! */
2099      if(pp->pp_SfxNEdgeOffset >= pp->pp_Sfx2EdgeOffset)
2100      {
2101    printf("Node buffer was too small!\n");
2102    return;
2103      }
2104      childptr &= ~LEAFMASK;
2105      /* fill in node data */
2106      tinynode = (struct SfxNode *) &pp->pp_SfxNodes[pp->pp_Sfx2EdgeOffset];
2107      tinynode->sn_Parent = 1UL << RELOFFSETBITS; /* one edge */
2108      tinynode->sn_StartPos = childptr + elen;
2109      tinynode->sn_EdgeLen = pp->pp_TreePruneLength - elen;
2110      seqcode = GetSeqCodeQuick(childptr + elen + tinynode->sn_EdgeLen);
2111      tinynode->sn_AlphaMask = 1UL << SEQCODE_N; /* we don't need this branch code anymore */
2112      pp->pp_BranchCode[tinynode->sn_AlphaMask].hc_Weight++;
2113      tinynode->sn_Children[0] = childptr | LEAFMASK;
2114      /* fix link */
2115      seqcode = GetSeqCodeQuick(childptr + elen);
2116      childptr = pp->pp_Sfx2EdgeOffset >> 2;
2117      sfxnode->sn_Children[childidx] = childptr | (seqcode << RELOFFSETBITS);
2118    }
2119#endif
2120    if(!(childptr >> LEAFBIT))
2121    {
2122      /* this is a normal node pointer, recurse */
2123      GetTreeStatsLongEdgesRec(pp, (childptr & RELOFFSETMASK) << 2, level, elen);
2124    }
2125  }
2126}
2127/* \\\ */
2128
2129/* /// "GetTreeStatsBranchHistoRec()" */
2130void GetTreeStatsBranchHistoRec(struct PTPanPartition *pp,
2131                ULONG pos, ULONG level, ULONG elen)
2132{
2133  struct PTPanGlobal *pg = pp->pp_PTPanGlobal;
2134  struct SfxNode *sfxnode;
2135  ULONG *seqptr = pg->pg_MergedRawData;
2136  ULONG childptr;
2137  UWORD childidx;
2138  ULONG alphamask = 0;
2139  ULONG tmpmem[ALPHASIZE];
2140  UWORD seqcode;
2141
2142  /* traverse children */
2143  sfxnode = (struct SfxNode *) &pp->pp_SfxNodes[pos];
2144
2145  level++;
2146  elen += sfxnode->sn_EdgeLen;
2147
2148  childidx = sfxnode->sn_Parent >> RELOFFSETBITS;
2149  while(childidx--)
2150  {
2151    childptr = sfxnode->sn_Children[childidx];
2152    if(childptr >> LEAFBIT)
2153    {
2154      /* this is a leaf pointer and doesn't contain a seqcode */
2155      childptr &= ~LEAFMASK;
2156      seqcode = GetSeqCodeQuick(childptr + elen);
2157      tmpmem[seqcode] = sfxnode->sn_Children[childidx];
2158      alphamask |= 1UL << seqcode;
2159    } else {
2160      /* this is a normal node pointer, recurse */
2161      seqcode = childptr >> RELOFFSETBITS;
2162      tmpmem[seqcode] = sfxnode->sn_Children[childidx];
2163      alphamask |= 1UL << seqcode;
2164      /* check for maximum depth reached */
2165      if((level < pp->pp_TreePruneDepth) && (elen < pp->pp_TreePruneLength))
2166      {
2167    GetTreeStatsBranchHistoRec(pp, (childptr & RELOFFSETMASK) << 2, level, elen);
2168      }
2169    }
2170  }
2171  /* update branch histogramm */
2172  pp->pp_BranchCode[alphamask].hc_Weight++;
2173  sfxnode->sn_AlphaMask = alphamask;
2174
2175  /* sort branches and enter alphamask */
2176  childidx = 0;
2177  seqcode = 0;
2178  do
2179  {
2180    if(alphamask & (1UL << seqcode))
2181    {
2182      sfxnode->sn_Children[childidx++] = tmpmem[seqcode];
2183    }
2184  } while(++seqcode < pg->pg_AlphaSize);
2185}
2186/* \\\ */
2187
2188/* /// "GetTreeStatsVerifyRec()" */
2189void GetTreeStatsVerifyRec(struct PTPanPartition *pp, ULONG pos, ULONG treepos, ULONG hash)
2190{
2191  struct PTPanGlobal *pg = pp->pp_PTPanGlobal;
2192  struct SfxNode *sfxnode;
2193  ULONG *seqptr = pg->pg_MergedRawData;
2194  ULONG childptr;
2195  UWORD childidx;
2196  ULONG newhash;
2197
2198  /* traverse children */
2199  sfxnode = (struct SfxNode *) &pp->pp_SfxNodes[pos];
2200  childidx = sfxnode->sn_Parent >> RELOFFSETBITS;
2201  if(sfxnode->sn_EdgeLen)
2202  {
2203    treepos += sfxnode->sn_EdgeLen;
2204    hash = GetSeqHash(pg, sfxnode->sn_StartPos, sfxnode->sn_EdgeLen, hash);
2205  }
2206  while(childidx--)
2207  {
2208    childptr = sfxnode->sn_Children[childidx];
2209    if(childptr >> LEAFBIT)
2210    {
2211      /* this is a leaf pointer and doesn't contain a seqcode */
2212      childptr &= ~LEAFMASK;
2213      childptr += treepos;
2214      /* set bit to verify position */
2215      if(childptr < treepos)
2216      {
2217    printf("Childptr %ld < treepos %ld (pos %ld)\n", childptr, treepos, pos);
2218      }
2219      if(childptr >= pg->pg_TotalRawSize)
2220      {
2221    printf("Childptr %ld > total size %ld (pos %ld, treepos %ld, sn_EdgeLen %d)\n",
2222        childptr, pg->pg_TotalRawSize, pos, treepos, sfxnode->sn_EdgeLen);
2223      }
2224
2225      if(pp->pp_VerifyArray)
2226      {
2227    if(pp->pp_VerifyArray[(childptr-treepos) >> 3] & (1UL << ((childptr-treepos) & 7)))
2228    {
2229    printf("Clash at pos %ld, ptr %ld\n", pos, childptr - treepos);
2230    } else {
2231    pp->pp_VerifyArray[(childptr-treepos) >> 3] |= (1UL << ((childptr-treepos) & 7));
2232    }
2233      }
2234
2235      newhash = GetSeqHash(pg, childptr - treepos, treepos, 0);
2236      if(newhash != hash)
2237      {
2238        STRPTR tmpstr = (STRPTR) malloc(treepos+1);
2239        DecompressSequencePartTo(pg, seqptr,
2240                                 sfxnode->sn_StartPos + sfxnode->sn_EdgeLen - treepos,
2241                                 treepos, tmpstr);
2242
2243        printf("Hash mismatch for %s (%ld != %ld), treepos = %ld\n",
2244               tmpstr, newhash, hash, treepos);
2245        free(tmpstr);
2246      } else {
2247        //printf("Good");
2248      }
2249    } else {
2250      /* this is a normal node pointer, recurse */
2251      GetTreeStatsVerifyRec(pp, (childptr & RELOFFSETMASK) << 2, treepos, hash);
2252    }
2253  }
2254}
2255/* \\\ */
2256
2257/* /// "GetTreeStatsLeafCountRec()" */
2258ULONG GetTreeStatsLeafCountRec(struct PTPanPartition *pp, ULONG pos)
2259{
2260  struct SfxNode *sfxnode;
2261  ULONG childptr;
2262  UWORD childidx;
2263  ULONG cnt = 0;
2264
2265  /* traverse children */
2266  sfxnode = (struct SfxNode *) &pp->pp_SfxNodes[pos];
2267  childidx = sfxnode->sn_Parent >> RELOFFSETBITS;
2268  //printf("P=%08lx LC: %ld [%d]\n", pos, childidx, sfxnode->sn_AlphaMask);
2269  while(childidx--)
2270  {
2271    childptr = sfxnode->sn_Children[childidx];
2272    if(childptr >> LEAFBIT)
2273    {
2274      /* this is a leaf pointer and doesn't contain a seqcode */
2275      cnt++;
2276    } else {
2277      /* this is a normal node pointer, recurse */
2278      cnt += GetTreeStatsLeafCountRec(pp, (childptr & RELOFFSETMASK) << 2);
2279    }
2280  }
2281  return(cnt);
2282}
2283/* \\\ */
2284
2285/* /// "GetTreeStatsLeafCollectRec()" */
2286void GetTreeStatsLeafCollectRec(struct PTPanPartition *pp, ULONG pos)
2287{
2288  struct SfxNode *sfxnode;
2289  ULONG childptr;
2290  UWORD childidx;
2291
2292  /* traverse children */
2293  sfxnode = (struct SfxNode *) &pp->pp_SfxNodes[pos];
2294  childidx = sfxnode->sn_Parent >> RELOFFSETBITS;
2295  while(childidx--)
2296  {
2297    childptr = sfxnode->sn_Children[childidx];
2298    if(childptr >> LEAFBIT)
2299    {
2300      /* this is a leaf pointer and doesn't contain a seqcode */
2301      *pp->pp_LeafBufferPtr++ = childptr & ~LEAFMASK;
2302    } else {
2303      /* this is a normal node pointer, recurse */
2304      GetTreeStatsLeafCollectRec(pp, (childptr & RELOFFSETMASK) << 2);
2305    }
2306  }
2307}
2308/* \\\ */
2309
2310/* /// "LongEdgeLengthCompare()" */
2311LONG LongEdgeLengthCompare(const struct SfxNode **node1, const struct SfxNode **node2)
2312{
2313  return(((LONG) (*node2)->sn_EdgeLen) - ((LONG) (*node1)->sn_EdgeLen));
2314}
2315/* \\\ */
2316
2317/* /// "LongEdgePosCompare()" */
2318LONG LongEdgePosCompare(const struct SfxNode **node1, const struct SfxNode **node2)
2319{
2320  return(((LONG) (*node1)->sn_StartPos) - ((LONG) (*node2)->sn_StartPos));
2321}
2322/* \\\ */
2323
2324/* /// "LongEdgeLabelCompare()" */
2325LONG LongEdgeLabelCompare(struct SfxNode **node1, struct SfxNode **node2)
2326{
2327  struct PTPanGlobal *pg = PTPanGlobalPtr;
2328  ULONG *seqptr = pg->pg_MergedRawData;
2329  ULONG spos1 = (*node1)->sn_StartPos;
2330  ULONG spos2 = (*node2)->sn_StartPos;
2331  ULONG len = (*node1)->sn_EdgeLen;
2332  UBYTE seqcode1, seqcode2;
2333
2334  if(spos1 == spos2) /* no need to compare */
2335  {
2336    if(len < (*node2)->sn_EdgeLen)
2337    {
2338      return(-1);
2339    }
2340    else if(len > (*node2)->sn_EdgeLen)
2341    {
2342      return(1);
2343    }
2344    return(0); /* string exactly equal */
2345  }
2346
2347  if((*node2)->sn_EdgeLen < len)
2348  {
2349    len = (*node2)->sn_EdgeLen;
2350  }
2351
2352  /* compare sequences */
2353  do
2354  {
2355    seqcode1 = GetSeqCodeQuick(spos1);
2356    seqcode2 = GetSeqCodeQuick(spos2);
2357    if(seqcode1 < seqcode2)
2358    {
2359      return(-1);
2360    }
2361    else if(seqcode1 > seqcode2)
2362    {
2363      return(1);
2364    }
2365    spos1++;
2366    spos2++;
2367  } while(--len);
2368
2369  /* sequence prefixes were the same! */
2370  if((*node1)->sn_EdgeLen >= (*node2)->sn_EdgeLen)
2371  {
2372    /* move starting pos "down" */
2373    if((*node1)->sn_StartPos < (*node2)->sn_StartPos)
2374    {
2375      //printf("Moved %ld -> %ld\n", (*node2)->sn_StartPos, (*node1)->sn_StartPos);
2376      (*node2)->sn_StartPos = (*node1)->sn_StartPos;
2377    }
2378    if((*node1)->sn_EdgeLen == (*node2)->sn_EdgeLen)
2379    {
2380      return(0);
2381    } else {
2382      return(1);
2383    }
2384  } else {
2385    /* shorter sequence is "smaller" */
2386    return(-1);
2387  }
2388}
2389/* \\\ */
2390
2391/* /// "GetSeqHash()" */
2392ULONG GetSeqHash(struct PTPanGlobal *pg, ULONG seqpos, ULONG len, ULONG hash)
2393{
2394  ULONG *seqptr = &pg->pg_MergedRawData[seqpos / MAXCODEFITLONG];
2395  ULONG modval = MAXCODEFITLONG - (seqpos % MAXCODEFITLONG);
2396  ULONG pval = *seqptr++ >> pg->pg_BitsShiftTable[MAXCODEFITLONG];
2397
2398  /* calculate the hash value over the string */
2399  while(len--)
2400  {
2401    hash *= pg->pg_AlphaSize;
2402    if(--modval)
2403    {
2404      hash += (pval / pg->pg_PowerTable[modval]) % pg->pg_AlphaSize;
2405    } else {
2406      hash += pval % pg->pg_AlphaSize;
2407      pval = *seqptr++ >> pg->pg_BitsShiftTable[MAXCODEFITLONG];
2408      modval = MAXCODEFITLONG;
2409    }
2410    hash %= HASHPRIME;
2411  }
2412  return(hash);
2413}
2414/* \\\ */
2415
2416/* /// "GetSeqHashBackwards()" */
2417ULONG GetSeqHashBackwards(struct PTPanGlobal *pg, ULONG seqpos, ULONG len, ULONG hash)
2418{
2419  seqpos += len;
2420  {
2421    ULONG *seqptr = &pg->pg_MergedRawData[seqpos / MAXCODEFITLONG];
2422    ULONG modval = MAXCODEFITLONG - (seqpos % MAXCODEFITLONG);
2423    ULONG pval = *seqptr >> pg->pg_BitsShiftTable[MAXCODEFITLONG];
2424
2425    /* calculate the hash value over the string */
2426    while(len--)
2427    {
2428      hash *= pg->pg_AlphaSize;
2429      if(modval < MAXCODEFITLONG)
2430      {
2431    hash += (pval / pg->pg_PowerTable[modval++]) % pg->pg_AlphaSize;
2432      } else {
2433    modval = 1;
2434    pval = *(--seqptr) >> pg->pg_BitsShiftTable[MAXCODEFITLONG];
2435    hash += pval % pg->pg_AlphaSize;
2436      }
2437      hash %= HASHPRIME;
2438    }
2439  }
2440  return(hash);
2441}
2442/* \\\ */
2443
2444/* /// "CheckLongEdgeMatch()" */
2445BOOL CheckLongEdgeMatch(struct PTPanPartition *pp, ULONG seqpos, ULONG edgelen,
2446                        ULONG dictpos)
2447{
2448  struct PTPanGlobal *pg = pp->pp_PTPanGlobal;
2449  STRPTR dictptr = &pp->pp_LongDict[dictpos];
2450  ULONG *seqptr = &pg->pg_MergedRawData[seqpos / MAXCODEFITLONG];
2451  ULONG modval = MAXCODEFITLONG - (seqpos % MAXCODEFITLONG);
2452  ULONG pval = *seqptr++ >> pg->pg_BitsShiftTable[MAXCODEFITLONG];
2453
2454  while(edgelen--)
2455  {
2456    if(--modval)
2457    {
2458      if(pg->pg_DecompressTable[(pval / pg->pg_PowerTable[modval]) % pg->pg_AlphaSize] !=
2459         *dictptr++)
2460      {
2461        return(FALSE);
2462      }
2463    } else {
2464      if(pg->pg_DecompressTable[pval % pg->pg_AlphaSize] != *dictptr++)
2465      {
2466        return(FALSE);
2467      }
2468      pval = *seqptr++ >> pg->pg_BitsShiftTable[MAXCODEFITLONG];
2469      modval = MAXCODEFITLONG;
2470    }
2471  }
2472  return(TRUE);
2473}
2474/* \\\ */
2475
2476/* /// "BuildLongEdgeDictionary()" */
2477BOOL BuildLongEdgeDictionary(struct PTPanPartition *pp)
2478{
2479  struct PTPanGlobal *pg = pp->pp_PTPanGlobal;
2480  struct SfxNode *sfxnode;
2481  ULONG *seqptr = pg->pg_MergedRawData;
2482  ULONG cnt;
2483  ULONG apos;
2484  ULONG spos;
2485  STRPTR dictptr;
2486  ULONG edgelen;
2487  ULONG dictsize;
2488  ULONG hashval;
2489  struct HashEntry *hash;
2490  ULONG walkinghash;
2491  ULONG lastedgelen;
2492  ULONG subfact;
2493  BOOL notfound;
2494  ULONG hashhit, hashmiss, walkmiss, walkhit, stringcnt;
2495  BOOL hassweep;
2496  BOOL quicksweep;
2497  BOOL safeskip;
2498
2499  ULONG olddictsize;
2500
2501  BenchTimePassed(pg);
2502  /* allocate long edge array */
2503  pp->pp_LongEdges = (struct SfxNode **) calloc(pp->pp_EdgeCount,
2504                                                sizeof(struct SfxNode *));
2505  if(!pp->pp_LongEdges)
2506  {
2507    printf("Out of memory for Long Edges Array!\n");
2508    return(FALSE);
2509  }
2510  pp->pp_LongEdgeCount = 0;
2511  GetTreeStatsLongEdgesRec(pp, 0, 0, 0);
2512  printf("Long Edge Array filled with %ld entries ", pp->pp_LongEdgeCount);
2513  printf("(%ld KB unused)\n", (pp->pp_Sfx2EdgeOffset - pp->pp_SfxNEdgeOffset) >> 10);
2514
2515#if 0 /* debug */
2516  for(cnt = 0; cnt < pp->pp_LongEdgeCount; cnt++)
2517  {
2518    printf("%ld: %ld\n", cnt, pp->pp_LongEdges[cnt]->sn_EdgeLen);
2519  }
2520#endif
2521  /* now sort array */
2522  if(pp->pp_LongEdgeCount > 2)
2523  {
2524#if 1 /* disable this, if sorting takes too long, but
2525         this causes the dictionary to explode */
2526#if 0 // debug
2527    printf("Before sorting:\n");
2528    for(cnt = 0; cnt < pp->pp_LongEdgeCount; cnt++)
2529    {
2530      DecompressSequencePartTo(pg, seqptr,
2531                pp->pp_LongEdges[cnt]->sn_StartPos,
2532                pp->pp_LongEdges[cnt]->sn_EdgeLen,
2533                pg->pg_TempBuffer);
2534      printf("%6ld: %7ld: %s\n", cnt, pp->pp_LongEdges[cnt]->sn_StartPos,
2535        pg->pg_TempBuffer);
2536    }
2537#endif
2538    printf("Sorting (Pass 1)...\n");
2539    qsort(pp->pp_LongEdges, pp->pp_LongEdgeCount, sizeof(struct SfxNode *),
2540    (int (*)(const void *, const void *)) LongEdgeLabelCompare);
2541    /* some edges might now have been moved to the front after sorting,
2542       but due to the sorting, these will be alternating, with edges of the
2543       same length, so fix these in O(n) */
2544#if 0 // debug
2545    printf("Before prefix clustering:\n");
2546    for(cnt = 0; cnt < pp->pp_LongEdgeCount; cnt++)
2547    {
2548      DecompressSequencePartTo(pg, seqptr,
2549            pp->pp_LongEdges[cnt]->sn_StartPos,
2550            pp->pp_LongEdges[cnt]->sn_EdgeLen,
2551            pg->pg_TempBuffer);
2552            printf("%6ld: %7ld: %s\n", cnt, pp->pp_LongEdges[cnt]->sn_StartPos,
2553        pg->pg_TempBuffer);
2554    }
2555#endif
2556    for(cnt = pp->pp_LongEdgeCount-1; cnt > 0; cnt--)
2557    {
2558      ULONG spos1 = pp->pp_LongEdges[cnt-1]->sn_StartPos;
2559      ULONG spos2 = pp->pp_LongEdges[cnt]->sn_StartPos;
2560      ULONG len = pp->pp_LongEdges[cnt-1]->sn_EdgeLen;
2561
2562      if(len <= pp->pp_LongEdges[cnt]->sn_EdgeLen && (spos1 != spos2))
2563      {
2564    /* compare sequences */
2565    do
2566    {
2567    if(GetSeqCodeQuick(spos1) != GetSeqCodeQuick(spos2))
2568    {
2569        break;
2570    }
2571    spos1++;
2572    spos2++;
2573    } while(--len);
2574    if(!len)
2575    {
2576    /* was equal */
2577    pp->pp_LongEdges[cnt-1]->sn_StartPos = pp->pp_LongEdges[cnt]->sn_StartPos;
2578    }
2579      }
2580    }
2581#if 0 // debug
2582    printf("After prefix clustering:\n");
2583    for(cnt = 0; cnt < pp->pp_LongEdgeCount; cnt++)
2584    {
2585      DecompressSequencePartTo(pg, seqptr,
2586                pp->pp_LongEdges[cnt]->sn_StartPos,
2587                pp->pp_LongEdges[cnt]->sn_EdgeLen,
2588                pg->pg_TempBuffer);
2589      printf("%6ld: %7ld: %s\n", cnt, pp->pp_LongEdges[cnt]->sn_StartPos,
2590        pg->pg_TempBuffer);
2591    }
2592#endif
2593    printf("Sorting (Pass 2)...\n");
2594    qsort(pp->pp_LongEdges, pp->pp_LongEdgeCount, sizeof(struct SfxNode *),
2595    (int (*)(const void *, const void *)) LongEdgePosCompare);
2596#if 0 // debug
2597    printf("After offset sorting\n");
2598    for(cnt = 0; cnt < pp->pp_LongEdgeCount; cnt++)
2599    {
2600      DecompressSequencePartTo(pg, seqptr,
2601                pp->pp_LongEdges[cnt]->sn_StartPos,
2602                pp->pp_LongEdges[cnt]->sn_EdgeLen,
2603                pg->pg_TempBuffer);
2604      printf("%6ld: %7ld: %s\n", cnt, pp->pp_LongEdges[cnt]->sn_StartPos,
2605        pg->pg_TempBuffer);
2606    }
2607#endif
2608
2609    {
2610      ULONG ivalstart = 0;
2611      ULONG ivalend = 0;
2612      ULONG ivalcnt = 0;
2613      ULONG ivalsum = 0;
2614      ULONG ivalextend = 0;
2615      ULONG oldcnt = pp->pp_LongEdgeCount;
2616
2617      /* examine edges and create bigger intervals */
2618      for(cnt = 0; cnt < oldcnt; cnt++)
2619      {
2620    if(pp->pp_LongEdges[cnt]->sn_StartPos > ivalend)
2621    {
2622    /* create new interval (these thresholds were found out using lots
2623             of testing) */
2624    if((ivalextend > 5) &&
2625        (ivalend - ivalstart > 14))
2626    {
2627        //printf("Ival: %ld - %ld\n", ivalstart, ivalend);
2628        if((pp->pp_SfxNEdgeOffset < pp->pp_Sfx2EdgeOffset - sizeof(struct SfxNodeStub)) &&
2629        (pp->pp_LongEdgeCount < pp->pp_EdgeCount))
2630        {
2631        /* generate a small stub node, that will lead the array building */
2632        pp->pp_Sfx2EdgeOffset -= sizeof(struct SfxNodeStub); /* only a stub! */
2633        sfxnode = (struct SfxNode *) &pp->pp_SfxNodes[pp->pp_Sfx2EdgeOffset];
2634        sfxnode->sn_StartPos = ivalstart;
2635        sfxnode->sn_EdgeLen = ivalend - ivalstart + 1;
2636        sfxnode->sn_AlphaMask = 0xCAFE;
2637        pp->pp_LongEdges[pp->pp_LongEdgeCount++] = sfxnode;
2638        ivalcnt++;
2639        ivalsum += sfxnode->sn_EdgeLen;
2640        } else {
2641        printf("Out of mem!\n");
2642        cnt = oldcnt;
2643        }
2644        ivalextend = 0;
2645    }
2646    ivalstart = pp->pp_LongEdges[cnt]->sn_StartPos;
2647    ivalend = ivalstart + pp->pp_LongEdges[cnt]->sn_EdgeLen - 1;
2648    } else {
2649    /* check, if we have to enlarge this interval...
2650        (is this an attempt to compensate something? :) )*/
2651    if((pp->pp_LongEdges[cnt]->sn_StartPos +
2652        pp->pp_LongEdges[cnt]->sn_EdgeLen - 1 > ivalend))
2653        //(ivalstart + (pp->pp_LongEdges[cnt]->sn_EdgeLen * 2) > ivalend))
2654    {
2655        ivalend = pp->pp_LongEdges[cnt]->sn_StartPos +
2656        pp->pp_LongEdges[cnt]->sn_EdgeLen - 1;
2657        if(pp->pp_LongEdges[cnt]->sn_StartPos > ivalstart)
2658        {
2659        ivalextend++;
2660        }
2661    }
2662    }
2663      }
2664      printf("Additional intervals generated %ld (%ld KB)\n", ivalcnt, ivalsum >> 10);
2665    }
2666#endif
2667    printf("Sorting (Pass 3)...\n");
2668    qsort(pp->pp_LongEdges, pp->pp_LongEdgeCount, sizeof(struct SfxNode *),
2669    (int (*)(const void *, const void *)) LongEdgeLengthCompare);
2670#if 0 // debug
2671    printf("After length sorting\n");
2672    for(cnt = 0; cnt < pp->pp_LongEdgeCount; cnt++)
2673    {
2674      DecompressSequencePartTo(pg, seqptr,
2675                pp->pp_LongEdges[cnt]->sn_StartPos,
2676                pp->pp_LongEdges[cnt]->sn_EdgeLen,
2677                pg->pg_TempBuffer);
2678      printf("%6ld: %7ld: %s\n", cnt, pp->pp_LongEdges[cnt]->sn_StartPos,
2679        pg->pg_TempBuffer);
2680    }
2681#endif
2682  }
2683  pg->pg_Bench.ts_LongDictPre += BenchTimePassed(pg);
2684  if(!pp->pp_LongEdgeCount)
2685  {
2686    /* catch special case of no long edges */
2687    pp->pp_LongEdgeLenSize = 1;
2688    pp->pp_LongEdgeLenCode = (struct HuffCode *) calloc(pp->pp_LongEdgeLenSize,
2689                            sizeof(struct HuffCode));
2690    //pp->pp_LongEdgeLenCode[0].hc_CodeLength = 1;
2691    pp->pp_LongDictSize = 1;
2692    pp->pp_LongDict = (STRPTR) malloc(pp->pp_LongDictSize);
2693    *pp->pp_LongDict = 0;
2694    dictsize = 0;
2695  } else {
2696    pp->pp_LongEdgeLenSize = pp->pp_LongEdges[0]->sn_EdgeLen + 1;
2697    /* allocate long edge len histogram */
2698    pp->pp_LongEdgeLenCode = (struct HuffCode *) calloc(pp->pp_LongEdgeLenSize,
2699                            sizeof(struct HuffCode));
2700    if(!pp->pp_LongEdgeLenCode)
2701    {
2702      printf("Out of memory for Long Edge Length Histogram!\n");
2703      return(FALSE);
2704    }
2705
2706    /* count lengths */
2707    for(cnt = 0; cnt < pp->pp_LongEdgeCount; cnt++)
2708    {
2709      if(pp->pp_LongEdges[cnt]->sn_AlphaMask != 0xCAFE)
2710      {
2711    pp->pp_LongEdgeLenCode[pp->pp_LongEdges[cnt]->sn_EdgeLen].hc_Weight++;
2712      }
2713    }
2714
2715    printf("Longest edge: %ld\n", pp->pp_LongEdgeLenSize - 1);
2716
2717    /* generate huffman code for edge lengths */
2718    printf("Generating huffman code for long edges length\n");
2719    BuildHuffmanCode(pp->pp_LongEdgeLenCode, pp->pp_LongEdgeLenSize, 0);
2720
2721#if 0
2722    /* debug */
2723    for(cnt = 0; cnt < pp->pp_LongEdgeLenSize; cnt++)
2724    {
2725      WORD bitcnt;
2726      if(pp->pp_LongEdgeLenCode[cnt].hc_CodeLength)
2727      {
2728        printf("%6ld: %7ld -> %2d ", cnt, pp->pp_LongEdgeLenCode[cnt].hc_Weight,
2729        pp->pp_LongEdgeLenCode[cnt].hc_CodeLength);
2730    for(bitcnt = pp->pp_LongEdgeLenCode[cnt].hc_CodeLength - 1; bitcnt >= 0; bitcnt--)
2731    {
2732    printf("%s", pp->pp_LongEdgeLenCode[cnt].hc_Codec & (1UL << bitcnt) ? "1" : "0");
2733    }
2734    printf("\n");
2735      }
2736    }
2737#endif
2738
2739    /* now allocate a buffer for building the dictionary */
2740    pp->pp_LongDictSize = 16UL << 10; /* start with 16KB */
2741    pp->pp_LongDict = (STRPTR) malloc(pp->pp_LongDictSize);
2742    if(!pp->pp_LongDict)
2743    {
2744      printf("Out of memory for long edges dictionary!\n");
2745      free(pp->pp_LongEdges);
2746      return(FALSE);
2747    }
2748    pp->pp_LongDict[0] = 0;
2749    dictsize = 0;
2750
2751    /* allocate the hash table to speed up search */
2752    pp->pp_LongDictHashSize = 256UL << 10;
2753    pp->pp_LongDictHash = AllocHashArray(pp->pp_LongDictHashSize);
2754    if(!pp->pp_LongDictHash)
2755    {
2756      printf("Out of memory for long edges dictionary hash!\n");
2757      free(pp->pp_LongEdges);
2758      free(pp->pp_LongDict);
2759      return(FALSE);
2760    }
2761    printf("Allocated a hash for %ld entries...\n", pp->pp_LongDictHashSize);
2762
2763    /* some statistical data */
2764    hashhit   = 0; /* string found in hash */
2765    hashmiss  = 0; /* string found in hash, but was no match */
2766    walkmiss  = 0; /* wrong fingerprint matches */
2767    walkhit   = 0; /* string found during walk */
2768    stringcnt = 1; /* number of strings generated */
2769
2770    /* create special initial string to avoid dictionary exploding by fragmented truncated edges */
2771    sfxnode = pp->pp_LongEdges[0];
2772    spos = sfxnode->sn_StartPos + 1;
2773    edgelen = sfxnode->sn_EdgeLen - 1;
2774    dictptr = pp->pp_LongDict;
2775    dictsize = edgelen;
2776    while(edgelen--)
2777    {
2778      *dictptr++ = pg->pg_DecompressTable[GetSeqCodeQuick(spos)];
2779      spos++;
2780    }
2781    /* don't forget the termination char */
2782    *dictptr = 0;
2783
2784    /* insert all edges */
2785    lastedgelen = pp->pp_LongEdges[0]->sn_EdgeLen;
2786    hassweep = FALSE;
2787    olddictsize = 0;
2788    for(cnt = 0; cnt < pp->pp_LongEdgeCount; cnt++)
2789    {
2790#if 0 /* debug */
2791      if((dictsize > olddictsize + 40) || (cnt == pp->pp_LongEdgeCount-1))
2792      {
2793    fprintf(stderr, "%ld %ld\n", cnt, dictsize);
2794    olddictsize = dictsize;
2795      }
2796#endif
2797      if(((cnt+1) & 0x3ff) == 0)
2798      {
2799    if(((cnt+1) >> 10) % 50)
2800    {
2801    printf(".");
2802    fflush(stdout);
2803    } else {
2804    printf(". %2ld%% (%ld KB, %ld strings)\n",
2805        (cnt * 100) / pp->pp_LongEdgeCount, dictsize >> 10, stringcnt);
2806    }
2807      }
2808      sfxnode = pp->pp_LongEdges[cnt];
2809      /* note that we can skip the first base because it is part of the branch */
2810      spos = sfxnode->sn_StartPos + 1;
2811      edgelen = sfxnode->sn_EdgeLen - 1;
2812
2813      /*if(edgelen != lastedgelen)
2814    {
2815    printf("%6ld\n", edgelen);
2816    }*/
2817      /* if we have swept over the dictionary, it is safe to skip search */
2818      safeskip = hassweep;
2819
2820      /* calculate hash value */
2821      hashval = GetSeqHashBackwards(pg, spos, edgelen, 0);
2822      //printf("[%ld] Len %ld, Hashval = %ld, dictsize = %ld ", cnt, edgelen, hashval, dictsize);
2823      if((hash = GetHashEntry(pp->pp_LongDictHash, hashval)))
2824      {
2825    //printf("Hash ");
2826    /* we will check only atmost 12 characters. the probability, that the
2827    edge had the same hash value and more than 12 matching characters, is
2828    unbelievably low */
2829    if(CheckLongEdgeMatch(pp, spos, edgelen > 12 ? 12 : edgelen, hash->he_Data))
2830    {
2831    hashhit++;
2832    //printf("match\n");
2833    sfxnode->sn_StartPos = hash->he_Data | (1UL << 31);
2834    continue;
2835    }
2836    hashmiss++;
2837    //printf("miss\n");
2838    /* we had a hash miss, consider it not safe anymore to skip the search */
2839    safeskip = FALSE;
2840      }
2841
2842      if(edgelen < lastedgelen)
2843      {
2844    if(pp->pp_LongDictHash->ha_Used > (pp->pp_LongDictHash->ha_Size >> 3))
2845    {
2846    /* seems as if the hash full by more than 1/8 and needs some clearing */
2847    //printf("Clearing hash...\n");
2848          ClearHashArray(pp->pp_LongDictHash);
2849    }
2850    hassweep = FALSE;
2851    safeskip = FALSE;
2852      }
2853      /* sorry, have to do a linear search */
2854      notfound = TRUE;
2855
2856      /* if he have swept over the dictionary, generating all possible hash values
2857    and did not hit the hash, it is very impossible that we will find it
2858    anyway, so we just append it.
2859      */
2860      if(!safeskip)
2861      {
2862    /* first attempt was to walk through the dictionary from the
2863    beginning to the end. However, in some bright moment,
2864    I thought about traversing the dictionary the other
2865    way round and see if this works better */
2866    /* hash = \sum{i < m}{dict[pos+i]*5^(m-i)}
2867    to walk right:
2868    hash = oldhash * 5 + dict[pos] - dict[pos-m] * (5^(m+1))
2869    to walk left:
2870    hash = oldhash - dict[pos] / 5 + dict[pos] * (5^m) */
2871
2872    /* init walking hash/finger print value */
2873    apos = dictsize;
2874    dictptr = &pp->pp_LongDict[apos];
2875    subfact = 1;
2876    walkinghash = 0;
2877
2878    //printf("edgelen = %ld\n", edgelen);
2879    do
2880    {
2881    /* calculate character outshifting multiplicator */
2882    subfact *= pg->pg_AlphaSize;
2883    subfact %= HASHPRIME;
2884    /* calculate finger print */
2885    walkinghash *= pg->pg_AlphaSize;
2886    walkinghash += pg->pg_CompressTable[*(--dictptr)];
2887    walkinghash %= HASHPRIME;
2888    } while(--apos > dictsize - edgelen);
2889
2890    /* did the length shrink and does it pay to do a sweep? */
2891    if((edgelen < lastedgelen) &&
2892    (cnt + 40 < pp->pp_LongEdgeCount) &&
2893    (edgelen == (ULONG) pp->pp_LongEdges[cnt + 40]->sn_EdgeLen - 1))
2894    {
2895    //printf("Q[%ld<-%ld]", edgelen, pp->pp_LongEdges[cnt + 15]->sn_EdgeLen - 1);
2896    //printf("Quicksweep\n");
2897    quicksweep = TRUE;
2898    hassweep = TRUE;
2899    } else {
2900    quicksweep = FALSE;
2901    }
2902    //hassweep = quicksweep = FALSE; /* FIXME */
2903    /* loop until found or end of dictionary is reached */
2904    do
2905    {
2906    //printf("Apos = %ld, WH = %ld\n", apos, walkinghash);
2907    /* finger print value matches */
2908    if(quicksweep)
2909    {
2910        if(!(GetHashEntry(pp->pp_LongDictHash, walkinghash)))
2911        {
2912        InsertHashEntry(pp->pp_LongDictHash, walkinghash, apos);
2913        }
2914    }
2915    if(walkinghash == hashval)
2916    {
2917        //printf("Walk ");
2918        /* verify hit (well, to a high probability) */
2919        if(CheckLongEdgeMatch(pp, spos, edgelen > 12 ? 12 : edgelen, apos))
2920        {
2921        //printf("hit\n");
2922#if 0 /* debug */
2923        if(safeskip)
2924        {
2925        printf("We would have missed %ld [%ld != %08lx], %ld!\n",
2926            apos, hashval,
2927            (ULONG) GetHashEntry(pp->pp_LongDictHash, walkinghash),
2928            edgelen);
2929      }
2930#endif
2931        /* found it! */
2932        walkhit++;
2933        sfxnode->sn_StartPos = apos | (1UL << 31);
2934        //printf("Walk: %f\n", (double) ((double) apos / (double) dictsize));
2935        notfound = FALSE;
2936        /* we have to finish the scan in quicksweep mode */
2937        if(quicksweep)
2938        {
2939        hashval = ~0UL; /* make sure we won't find another hit */
2940        } else {
2941        /* insert hash entry */
2942        InsertHashEntry(pp->pp_LongDictHash, hashval, apos);
2943        break;
2944        }
2945        } else {
2946        walkmiss++;
2947        //printf("miss\n");
2948        }
2949    }
2950    if(apos)
2951    {
2952        /* calculate new hash value */
2953        walkinghash += HASHPRIME;
2954        walkinghash *= pg->pg_AlphaSize;
2955        walkinghash -= pg->pg_CompressTable[dictptr[edgelen-1]] * subfact;
2956        walkinghash += pg->pg_CompressTable[*(--dictptr)];
2957        walkinghash %= HASHPRIME;
2958        apos--;
2959    } else {
2960        break;
2961    }
2962    } while(TRUE);
2963      }
2964      lastedgelen = edgelen;
2965      /* check, if we already found it */
2966      if(notfound)
2967      {
2968    stringcnt++;
2969    apos = dictsize;
2970    dictsize += edgelen;
2971    //printf("add");
2972    //printf("Appending %ld at %ld...\n", edgelen, apos);
2973    if(dictsize >= pp->pp_LongDictSize)
2974    {
2975    STRPTR newptr;
2976    /* double the size of the buffer */
2977    pp->pp_LongDictSize <<= 1;
2978    if((newptr = (STRPTR) realloc(pp->pp_LongDict, pp->pp_LongDictSize)))
2979    {
2980        pp->pp_LongDict = newptr;
2981        //printf("Expanded Dictionary to %ld bytes.\n", pp->pp_LongDictSize);
2982    } else {
2983        printf("Out of memory while expanding long edges dictionary!\n");
2984        free(pp->pp_LongDict);
2985        free(pp->pp_LongEdges);
2986        return(FALSE);
2987    }
2988    }
2989    /* insert hash entry */
2990    InsertHashEntry(pp->pp_LongDictHash, hashval, apos);
2991    /* fix start pos */
2992    sfxnode->sn_StartPos = apos | (1UL << 31);
2993    dictptr = &pp->pp_LongDict[apos];
2994    while(edgelen--)
2995    {
2996    *dictptr++ = pg->pg_DecompressTable[GetSeqCodeQuick(spos)];
2997    spos++;
2998    }
2999    /* don't forget the termination char */
3000    *dictptr = 0;
3001      }
3002    }
3003    pp->pp_LongDictSize = dictsize;
3004    /* printf("\nHashhit %ld, Hashmiss %ld, Walkhit %ld, Walkmiss %ld\n",
3005       hashhit, hashmiss, walkhit, walkmiss);*/
3006    printf("\nDictionary size: %ld KB (%ld strings)\n", dictsize >> 10, stringcnt);
3007    //printf(pp->pp_LongDict);
3008  }
3009
3010  /* calculate bits usage */
3011  pp->pp_LongRelPtrBits = 1;
3012  while((1UL << pp->pp_LongRelPtrBits) < dictsize)
3013  {
3014    pp->pp_LongRelPtrBits++;
3015  }
3016
3017  /* compress sequence to save memory */
3018  if(!(pp->pp_LongDictRaw = CompressSequence(pg, pp->pp_LongDict)))
3019  {
3020    printf("Out of memory for compressed dictionary string!\n");
3021    FreeHashArray(pp->pp_LongDictHash);
3022    free(pp->pp_LongEdges);
3023    free(pp->pp_LongDict);
3024    return(FALSE);
3025  }
3026
3027  printf("Final compressed dictionary size: %ld bytes.\n",
3028    ((dictsize / MAXCODEFITLONG) + 1) * sizeof(ULONG));
3029
3030  /* free some memory */
3031  FreeHashArray(pp->pp_LongDictHash);
3032  free(pp->pp_LongEdges);
3033  free(pp->pp_LongDict);
3034
3035  pg->pg_Bench.ts_LongDictBuild += BenchTimePassed(pg);
3036
3037  return(TRUE);
3038}
3039/* \\\ */
3040
3041/* /// "WriteTreeToDisk()" */
3042BOOL WriteTreeToDisk(struct PTPanPartition *pp)
3043{
3044  struct PTPanGlobal *pg = pp->pp_PTPanGlobal;
3045  struct SfxNode *sfxnode;
3046  ULONG pos;
3047
3048  ULONG childptr;
3049  ULONG packsize;
3050  ULONG bytessaved;
3051  BOOL  freedisk = FALSE;
3052
3053  BenchTimePassed(pg);
3054  /* after we have prepared all the codecs and compression tables, we have to modify
3055     the tree to be able to store it on the disk. Bottom up approach.
3056     a) count the space required for the cut off leaves. They are stored as compressed
3057        arrays, each array holding at least two leaves. The leaves in the upper levels
3058        of the tree are not saved there, as a pointer to the leaf array containing only
3059        one leaf would be a waste of space (instead they are stored directly as a child
3060        ptr).
3061     b) traverse the tree in DFS order from lowest level to the root, from right to the
3062        left, enter relative pointers in the ChildPtr[] array, count the space
3063    consumption.
3064  */
3065
3066  /* calculate the relative pointers and the tree traversal */
3067  pp->pp_DiskOuterLeaves = 0;
3068  pp->pp_DiskTreeSize = 0;
3069  pp->pp_TraverseTreeRoot = ~0UL & RELOFFSETMASK;
3070 
3071  ULONG tempDiskTreeSize = FixRelativePointersRec(pp, 0, 0, 0);
3072  pp->pp_DiskTreeSize += tempDiskTreeSize;                     
3073  //printf("Total size on disk: %ld KB\n", pp->pp_DiskTreeSize >> 10);
3074
3075  pg->pg_Bench.ts_Reloc += BenchTimePassed(pg);
3076  /* now finally write it to disk */
3077  pp->pp_PartitionFile = fopen(pp->pp_PartitionName, "w");
3078  if(!pp->pp_PartitionFile)
3079  {
3080    printf("ERROR: Couldn't open partition file %s for writing!\n",
3081    pp->pp_PartitionName);
3082    return(FALSE);
3083  }
3084
3085  WriteTreeHeader(pp);
3086
3087  /* use unused node buffer for temporary write buffer */
3088  pp->pp_DiskBuffer = (UBYTE *) &pp->pp_SfxNodes[pp->pp_SfxNEdgeOffset];
3089  pp->pp_DiskBufferSize = pp->pp_Sfx2EdgeOffset - pp->pp_SfxNEdgeOffset;
3090  if(pp->pp_DiskBufferSize < (128UL << 10))
3091  {
3092    /* disk buffer was much too small! */
3093    pp->pp_DiskBufferSize = 128UL << 10;
3094    pp->pp_DiskBuffer = (UBYTE *) calloc(1, pp->pp_DiskBufferSize);
3095    freedisk = TRUE;
3096  } else {
3097    if(pp->pp_DiskBufferSize > (512UL << 10))
3098    {
3099      pp->pp_DiskBufferSize = 512UL << 10;
3100    }
3101  }
3102  //printf("Diskbuffer: %ld KB\n", pp->pp_DiskBufferSize >> 10);
3103  pp->pp_DiskPos = 0;
3104  bytessaved = 0;
3105
3106#if 0 /* debug */
3107  pp->pp_BranchTree = BuildHuffmanTreeFromTable(pp->pp_BranchCode, 1UL << pg->pg_AlphaSize);
3108  pp->pp_ShortEdgeTree = BuildHuffmanTreeFromTable(pp->pp_ShortEdgeCode, 1UL << (pg->pg_BitsUseTable[SHORTEDGEMAX]+1));
3109  pp->pp_LongEdgeLenTree = BuildHuffmanTreeFromTable(pp->pp_LongEdgeLenCode, pp->pp_LongEdgeLenSize);
3110#endif
3111
3112  printf("Writing (%ld KB)",pp->pp_DiskTreeSize >> 10);
3113  fflush(NULL);
3114  pp->pp_DiskNodeCount = 0;
3115  pp->pp_DiskNodeSpace = 0;
3116  pp->pp_DiskLeafCount = 0;
3117  pp->pp_DiskLeafSpace = 0;
3118
3119  pos = pp->pp_TraverseTreeRoot;
3120  while((pos & RELOFFSETMASK) != (~0UL & RELOFFSETMASK))
3121  {
3122    childptr = (pos & RELOFFSETMASK) << 2;
3123    //printf("Pos=%08lx [%ld]\n", childptr, bytessaved);
3124    sfxnode = (struct SfxNode *) &pp->pp_SfxNodes[childptr];
3125
3126    packsize = 0;
3127    if(sfxnode->sn_AlphaMask) /* is this a normal node or the end of the tree */
3128    {
3129      packsize = WritePackedNode(pp, childptr, &pp->pp_DiskBuffer[pp->pp_DiskPos]);
3130      pp->pp_DiskNodeCount++;
3131      pp->pp_DiskNodeSpace += packsize;
3132#if 0 /* debug */
3133      {
3134        struct TreeNode *tn;
3135        pp->pp_DiskTree = &pp->pp_DiskBuffer[pp->pp_DiskPos];
3136        tn = ReadPackedNode(pp, 0);
3137        if(tn)
3138        {
3139        if(packsize != tn->tn_Size)
3140        {
3141            ULONG bc;
3142            printf("ARGH! ARGH! ARRRRGH! NodePos %08lx [%ld != %ld]\n",
3143        pos, packsize, tn->tn_Size);
3144        for(bc = 0; bc < 32; bc++)
3145        {
3146        printf("[%02x] ", pp->pp_DiskTree[bc]);
3147        }
3148        printf("\n");
3149    }
3150    free(tn);
3151    }
3152      }
3153#endif
3154    } else {
3155      packsize = WritePackedLeaf(pp, childptr, &pp->pp_DiskBuffer[pp->pp_DiskPos]);
3156      pp->pp_DiskLeafCount++;
3157      pp->pp_DiskLeafSpace += packsize;
3158#if 0 /* debug */
3159      {
3160    struct TreeNode *tn;
3161    BOOL argh;
3162    ULONG bcnt;
3163
3164    pp->pp_DiskTree = &pp->pp_DiskBuffer[pp->pp_DiskPos];
3165    tn = ReadPackedLeaf(pp, 0);
3166    if(tn)
3167    {
3168    argh = (packsize != tn->tn_Size);
3169    for(bcnt = 0; bcnt < tn->tn_NumLeaves; bcnt++)
3170    {
3171        if(tn->tn_Leaves[bcnt] < 0)
3172        {
3173        argh = TRUE;
3174        }
3175    }
3176    if(argh)
3177    {
3178        ULONG bc;
3179        printf("ARGH! ARGH! ARRRRGH! LeafPos %08lx [%ld != %ld]\n",
3180        pos, packsize, tn->tn_Size);
3181        for(bc = 0; bc < 32; bc++)
3182        {
3183        printf("[%02x] ", pp->pp_DiskTree[bc]);
3184        }
3185        printf("\n");
3186    }
3187    free(tn);
3188    }
3189      }
3190#endif
3191    }
3192    pp->pp_DiskPos += packsize;
3193    bytessaved += packsize;
3194    /* check if disk buffer is full enough to write a new chunk */
3195    if(pp->pp_DiskPos > (pp->pp_DiskBufferSize >> 1))
3196    {
3197
3198      printf(".");
3199      fflush(NULL);
3200      fwrite(pp->pp_DiskBuffer, pp->pp_DiskPos, 1, pp->pp_PartitionFile);
3201      pp->pp_DiskPos = 0;
3202    }
3203    pos = sfxnode->sn_Parent;
3204  } // end while
3205
3206  if(pp->pp_DiskPos)
3207  {
3208    printf(".\n");
3209    fwrite(pp->pp_DiskBuffer, pp->pp_DiskPos, 1, pp->pp_PartitionFile);
3210  }
3211  pp->pp_DiskIdxSpace = ftell(pp->pp_PartitionFile);
3212  printf("%ld inner nodes     (%ld KB, %f b.p.n.)\n"
3213    "%ld leaf nodes      (%ld KB, %f b.p.n.)\n"
3214         "%ld leaves in array (%ld KB, %f b.p.l.)\n"
3215    "Overall %f bytes per base.\n",
3216    pp->pp_DiskNodeCount, pp->pp_DiskNodeSpace >> 10,
3217    (float) pp->pp_DiskNodeSpace / (float) pp->pp_DiskNodeCount,
3218    pp->pp_DiskLeafCount, pp->pp_DiskLeafSpace >> 10,
3219    (float) pp->pp_DiskLeafSpace / (float) pp->pp_DiskLeafCount,
3220    pp->pp_DiskOuterLeaves, pp->pp_DiskLeafSpace >> 10,
3221    (float) pp->pp_DiskLeafSpace  / (float) pp->pp_DiskOuterLeaves,
3222    (float) pp->pp_DiskIdxSpace / (float) pp->pp_Size);
3223  fclose(pp->pp_PartitionFile);
3224  if(freedisk)
3225  {
3226    free(pp->pp_DiskBuffer);
3227  }
3228
3229  pg->pg_Bench.ts_Writing += BenchTimePassed(pg);
3230
3231  if(bytessaved != pp->pp_DiskTreeSize)
3232  {
3233    printf("ERROR: Calculated tree size did not match written data (%ld != %ld)!\n",
3234    bytessaved, pp->pp_DiskTreeSize);
3235    return(FALSE);
3236  }
3237  return(TRUE);
3238}
3239/* \\\ */
3240
3241/* /// "CreatePartitionLookup()" */
3242BOOL CreatePartitionLookup(struct PTPanGlobal *pg)
3243{
3244  ULONG cnt;
3245  ULONG len;
3246  struct PTPanPartition *pp;
3247  UWORD range;
3248
3249  /* allocate memory for table */
3250  len = pg->pg_PowerTable[pg->pg_MaxPrefixLen];
3251  pg->pg_PartitionLookup = (struct PTPanPartition **) calloc(len, sizeof(struct PTPanPartition *));
3252  if(!pg->pg_PartitionLookup)
3253  {
3254    return(FALSE); /* out of memory */
3255  }
3256
3257  /* create lookup table */
3258  pp = (struct PTPanPartition *) pg->pg_Partitions.lh_Head;
3259  for(cnt = 0; cnt < len; cnt++)
3260  {
3261    if(!pp->pp_Node.ln_Succ)
3262    {
3263      break; /* end of partition list reached */
3264    }
3265    do
3266    {
3267      range = pg->pg_PowerTable[pg->pg_MaxPrefixLen - pp->pp_PrefixLen];
3268      if((cnt >= pp->pp_Prefix * range) && (cnt < (pp->pp_Prefix+1) * range))
3269      {
3270    //printf("Entry %ld = prefix %ld\n", cnt, pp->pp_Prefix);
3271    pg->pg_PartitionLookup[cnt] = pp;
3272    break;
3273      } else {
3274    if(cnt >= (pp->pp_Prefix+1) * range)
3275    {
3276    /* we're past this partition prefix, get next one */
3277    pp = (struct PTPanPartition *) pp->pp_Node.ln_Succ;
3278    if(!pp->pp_Node.ln_Succ)
3279    {
3280        break; /* end of partition list reached */
3281    }
3282    } else {
3283    break; /* not yet found */
3284    }
3285      }
3286    } while(TRUE);
3287  }
3288  return(TRUE);
3289}
3290/* \\\ */
Note: See TracBrowser for help on using the repository browser.