| 1 | #include <stdio.h> |
|---|
| 2 | #include <stdlib.h> |
|---|
| 3 | #include <string.h> |
|---|
| 4 | // #include <malloc.h> |
|---|
| 5 | #include <unistd.h> |
|---|
| 6 | #include <PT_server.h> |
|---|
| 7 | #include <PT_server_prototypes.h> |
|---|
| 8 | #include "ptpan.h" |
|---|
| 9 | #include "pt_prototypes.h" |
|---|
| 10 | |
|---|
| 11 | /* nice niffy macro to get a compressed sequence code */ |
|---|
| 12 | #define GetSeqCodeQuick(pos) \ |
|---|
| 13 | (((seqptr[(pos) / MAXCODEFITLONG] \ |
|---|
| 14 | >> pg->pg_BitsShiftTable[MAXCODEFITLONG]) \ |
|---|
| 15 | / pg->pg_PowerTable[MAXCODEFITLONG - ((pos) % MAXCODEFITLONG) - 1]) \ |
|---|
| 16 | % pg->pg_AlphaSize) |
|---|
| 17 | |
|---|
| 18 | #define GetSeqCode(pos) ((pos >= pg->pg_TotalRawSize) ? SEQCODE_N : GetSeqCodeQuick(pos)) |
|---|
| 19 | |
|---|
| 20 | /* /// "BuildStdSuffixTree()" */ |
|---|
| 21 | BOOL BuildStdSuffixTree(struct PTPanGlobal *pg) |
|---|
| 22 | { |
|---|
| 23 | STRPTR newtreename; |
|---|
| 24 | struct PTPanPartition *pp; |
|---|
| 25 | ULONG memfree; |
|---|
| 26 | |
|---|
| 27 | printf("********************************\n" |
|---|
| 28 | "* Building Std Suffix Index... *\n" |
|---|
| 29 | "********************************\n"); |
|---|
| 30 | // Delete old tree first (why, can't we just build a new one and |
|---|
| 31 | // then rename it? Needs some extra disk space then though) |
|---|
| 32 | if(unlink(pg->pg_IndexName)) |
|---|
| 33 | { |
|---|
| 34 | if(GB_size_of_file(pg->pg_IndexName) >= 0) |
|---|
| 35 | { |
|---|
| 36 | fprintf(stderr, "Cannot remove %s\n", pg->pg_IndexName); |
|---|
| 37 | return(FALSE); |
|---|
| 38 | } |
|---|
| 39 | } |
|---|
| 40 | |
|---|
| 41 | // allocate memory for a temporary filename |
|---|
| 42 | newtreename = (STRPTR) malloc(strlen(pg->pg_IndexName) + 2); |
|---|
| 43 | strcpy(newtreename, pg->pg_IndexName); |
|---|
| 44 | strcat(newtreename, "~"); |
|---|
| 45 | |
|---|
| 46 | pg->pg_IndexFile = fopen(newtreename, "w"); /* open file for output */ |
|---|
| 47 | if(!pg->pg_IndexFile) |
|---|
| 48 | { |
|---|
| 49 | fprintf(stderr, "Cannot open %s for output.\n", newtreename); |
|---|
| 50 | free(newtreename); |
|---|
| 51 | return(FALSE); |
|---|
| 52 | } |
|---|
| 53 | GB_set_mode_of_file(newtreename, 0666); |
|---|
| 54 | |
|---|
| 55 | //GB_begin_transaction(pg->pg_MainDB); |
|---|
| 56 | |
|---|
| 57 | /* build index */ |
|---|
| 58 | BuildMergedDatabase(pg); |
|---|
| 59 | |
|---|
| 60 | printf("Freeing alignment cache to save memory..."); |
|---|
| 61 | memfree = FlushCache(pg->pg_SpeciesCache); |
|---|
| 62 | printf("%ld KB freed.\n", memfree >> 10); |
|---|
| 63 | |
|---|
| 64 | /* everything has to fit into one partition */ |
|---|
| 65 | pp = (struct PTPanPartition *) calloc(1, sizeof(struct PTPanPartition)); |
|---|
| 66 | if(!pp) |
|---|
| 67 | { |
|---|
| 68 | return(FALSE); /* out of memory */ |
|---|
| 69 | } |
|---|
| 70 | |
|---|
| 71 | /* fill in sensible values */ |
|---|
| 72 | pp->pp_PTPanGlobal = pg; |
|---|
| 73 | pp->pp_ID = 0; |
|---|
| 74 | pp->pp_Prefix = 0; |
|---|
| 75 | pp->pp_PrefixLen = 0; |
|---|
| 76 | pp->pp_Size = pg->pg_TotalRawSize; |
|---|
| 77 | pp->pp_RawOffset = 0; |
|---|
| 78 | pp->pp_PartitionName = (STRPTR) calloc(strlen(pg->pg_IndexName) + 5, 1); |
|---|
| 79 | strncpy(pp->pp_PartitionName, pg->pg_IndexName, strlen(pg->pg_IndexName) - 3); |
|---|
| 80 | strcat(pp->pp_PartitionName, "sfx"); |
|---|
| 81 | AddTail(&pg->pg_Partitions, &pp->pp_Node); |
|---|
| 82 | pg->pg_NumPartitions = 1; |
|---|
| 83 | printf("Using only one partition for %ld leaves.\n", pp->pp_Size); |
|---|
| 84 | |
|---|
| 85 | WriteIndexHeader(pg); |
|---|
| 86 | fclose(pg->pg_IndexFile); |
|---|
| 87 | |
|---|
| 88 | BuildMemoryStdSuffixTree(pp); |
|---|
| 89 | |
|---|
| 90 | /* write out tree */ |
|---|
| 91 | printf(">>> Phase 3: Writing tree to secondary storage... <<<\n"); |
|---|
| 92 | WriteStdSuffixTreeToDisk(pp); |
|---|
| 93 | |
|---|
| 94 | printf(">>> Phase 4: Freeing memory and cleaning it up... <<<\n"); |
|---|
| 95 | |
|---|
| 96 | /* return some memory not used anymore */ |
|---|
| 97 | freeset(pp->pp_StdSfxNodes, NULL); |
|---|
| 98 | |
|---|
| 99 | if(GB_rename_file(newtreename, pg->pg_IndexName)) |
|---|
| 100 | { |
|---|
| 101 | GB_print_error(); |
|---|
| 102 | } |
|---|
| 103 | |
|---|
| 104 | if(GB_set_mode_of_file(pg->pg_IndexName, 0666)) |
|---|
| 105 | { |
|---|
| 106 | GB_print_error(); |
|---|
| 107 | } |
|---|
| 108 | free(newtreename); |
|---|
| 109 | return(TRUE); |
|---|
| 110 | } |
|---|
| 111 | /* \\\ */ |
|---|
| 112 | |
|---|
| 113 | /* /// "BuildMemoryStdSuffixTree()" */ |
|---|
| 114 | BOOL BuildMemoryStdSuffixTree(struct PTPanPartition *pp) |
|---|
| 115 | { |
|---|
| 116 | struct PTPanGlobal *pg = pp->pp_PTPanGlobal; |
|---|
| 117 | ULONG oldheadnum; |
|---|
| 118 | ULONG oldleafnum; |
|---|
| 119 | ULONG nodecnt; |
|---|
| 120 | //struct StdSfxNode *parnode; |
|---|
| 121 | struct StdSfxNode *vnode; |
|---|
| 122 | struct StdSfxNode *oldheadnode; |
|---|
| 123 | struct StdSfxNode *oldleafnode; |
|---|
| 124 | ULONG vnum, parnum; |
|---|
| 125 | |
|---|
| 126 | BenchTimePassed(pg); |
|---|
| 127 | |
|---|
| 128 | pp->pp_SfxMemorySize = pp->pp_Size * sizeof(struct StdSfxNode) * 2; |
|---|
| 129 | pp->pp_StdSfxNodes = (struct StdSfxNode *) calloc(pp->pp_Size * 2, sizeof(struct StdSfxNode)); |
|---|
| 130 | if(!pp->pp_StdSfxNodes) |
|---|
| 131 | { |
|---|
| 132 | printf("Couldn't allocate %ld KB for suffix nodes.\n", |
|---|
| 133 | pp->pp_SfxMemorySize >> 10); |
|---|
| 134 | return(FALSE); |
|---|
| 135 | } |
|---|
| 136 | /* init pointing offsets */ |
|---|
| 137 | pp->pp_NumBigNodes = 0; |
|---|
| 138 | |
|---|
| 139 | printf("Allocated %ld KB suffix nodes buffer.\n", pp->pp_SfxMemorySize >> 10); |
|---|
| 140 | |
|---|
| 141 | /* fill in root node */ |
|---|
| 142 | vnode = pp->pp_StdSfxNodes; |
|---|
| 143 | vnode->ssn_Parent = 0; |
|---|
| 144 | vnode->ssn_FirstChild = 1; |
|---|
| 145 | vnode++; |
|---|
| 146 | vnode->ssn_Parent = 0; |
|---|
| 147 | vnode->ssn_StartPos = 0; |
|---|
| 148 | vnode->ssn_EdgeLen = pg->pg_TotalRawSize; |
|---|
| 149 | vnode->ssn_FirstChild = 0; |
|---|
| 150 | vnode->ssn_NextSibling = 0; |
|---|
| 151 | vnum = 1; |
|---|
| 152 | oldheadnum = 0; |
|---|
| 153 | oldleafnum = 1; |
|---|
| 154 | pp->pp_NumBigNodes = 2; |
|---|
| 155 | |
|---|
| 156 | /* main loop to build up the tree */ |
|---|
| 157 | /* NOTE: as a special precaution, all longwords have MAXCODEFITLONG code length */ |
|---|
| 158 | |
|---|
| 159 | for(nodecnt = 1; nodecnt < pg->pg_TotalRawSize; nodecnt++) |
|---|
| 160 | { |
|---|
| 161 | ULONG gst, gend; // gamma |
|---|
| 162 | ULONG bst, bend; // beta |
|---|
| 163 | |
|---|
| 164 | oldleafnode = &pp->pp_StdSfxNodes[oldleafnum]; |
|---|
| 165 | oldheadnode = &pp->pp_StdSfxNodes[oldheadnum]; |
|---|
| 166 | gst = oldleafnode->ssn_StartPos; |
|---|
| 167 | gend = gst + oldleafnode->ssn_EdgeLen; |
|---|
| 168 | |
|---|
| 169 | //printf("%ld\n", nodecnt); |
|---|
| 170 | |
|---|
| 171 | //cerr << "Step " << i << ": oldleaf(" << source.substr(gst, gend - gst) << ")->" << source.substr(i, n - i) << endl; |
|---|
| 172 | //printTree(source, r, 0); |
|---|
| 173 | |
|---|
| 174 | if(!oldheadnum) // oldhead ist wurzel |
|---|
| 175 | { |
|---|
| 176 | //printf(" oldhead == root\n"); |
|---|
| 177 | //vnode = pp->pp_StdSfxNodes; |
|---|
| 178 | vnum = 0; |
|---|
| 179 | gst++; |
|---|
| 180 | } |
|---|
| 181 | else if((vnum = oldheadnode->ssn_Prime)) // oldhead ist nicht Wurzel hat aber einen Querlink |
|---|
| 182 | { |
|---|
| 183 | //printf(" oldhead has nodeprime\n"); |
|---|
| 184 | } else { // oldhead ist nicht Wurzel hat aber keinen Querlink |
|---|
| 185 | parnum = oldheadnode->ssn_Parent; |
|---|
| 186 | bst = oldheadnode->ssn_StartPos; |
|---|
| 187 | bend = bst + oldheadnode->ssn_EdgeLen; |
|---|
| 188 | //cerr << " oldhead(" << source.substr(bst, bend - bst) << ") has no nodeprime and is not root" << endl; |
|---|
| 189 | if(!parnum) |
|---|
| 190 | { |
|---|
| 191 | //printf(" p was root\n"); |
|---|
| 192 | bst++; |
|---|
| 193 | vnum = FastFindStdSfxNode(pp, 0, bst, bend); |
|---|
| 194 | } else { |
|---|
| 195 | vnum = FastFindStdSfxNode(pp, (pp->pp_StdSfxNodes[parnum]).ssn_Prime, bst, bend); |
|---|
| 196 | //printf(" p->nodeprime was leaf\n"); |
|---|
| 197 | /*cerr << " p->nodeprime was leaf(" |
|---|
| 198 | << source.substr(u->sfxstart, u->sfxend - u->sfxstart) << endl;*/ |
|---|
| 199 | } |
|---|
| 200 | /*cerr << " Call to fastFindNode(., ., " << bst << ", " << bend |
|---|
| 201 | << ", '" << source.substr(bst, bend - bst) << "')" << endl;*/ |
|---|
| 202 | oldheadnode->ssn_Prime = vnum; |
|---|
| 203 | } |
|---|
| 204 | /*cerr << " Call to findNode(., ., " << gst << ", " << gend |
|---|
| 205 | << ", '" << source.substr(gst, gend - gst) << "')" << endl;*/ |
|---|
| 206 | //printf("FindNode %ld, %ld-%ld\n", vnum, gst, gend); |
|---|
| 207 | oldheadnum = FindStdSfxNode(pp, vnum, gst, gend); |
|---|
| 208 | /*cerr << " Call to insertNode(., " << gst << ", " << gend |
|---|
| 209 | << ", '" << source.substr(gst, gend - gst) << "')" << endl;*/ |
|---|
| 210 | //printf("InsertNode %ld, %ld-%ld\n", oldheadnum, gst, gend); |
|---|
| 211 | oldleafnum = InsertStdSfxNode(pp, gst, gend, oldheadnum); |
|---|
| 212 | if((nodecnt & 0x3fff) == 0) |
|---|
| 213 | { |
|---|
| 214 | if((nodecnt >> 14) % 50) |
|---|
| 215 | { |
|---|
| 216 | printf("."); |
|---|
| 217 | fflush(stdout); |
|---|
| 218 | } else { |
|---|
| 219 | printf(". %2ld%%\n", nodecnt / (pg->pg_TotalRawSize / 100)); |
|---|
| 220 | } |
|---|
| 221 | } |
|---|
| 222 | } |
|---|
| 223 | printf("DONE! (%ld KB unused)\n", (pp->pp_Sfx2EdgeOffset - pp->pp_SfxNEdgeOffset) >> 10); |
|---|
| 224 | |
|---|
| 225 | printf("Nodes : %6ld\n", pp->pp_NumBigNodes); |
|---|
| 226 | pg->pg_Bench.ts_MemTree += BenchTimePassed(pg); |
|---|
| 227 | return(TRUE); |
|---|
| 228 | } |
|---|
| 229 | /* \\\ */ |
|---|
| 230 | |
|---|
| 231 | // Suche nach bestimmter Kindkante (konstante Zeit, da alphabetgroesse konstant). |
|---|
| 232 | /* /// "FindStdSfxChildNode()" */ |
|---|
| 233 | inline |
|---|
| 234 | ULONG FindStdSfxChildNode(struct PTPanPartition *pp, ULONG nodenum, ULONG pos) |
|---|
| 235 | { |
|---|
| 236 | struct PTPanGlobal *pg = pp->pp_PTPanGlobal; |
|---|
| 237 | ULONG *seqptr = pg->pg_MergedRawData; |
|---|
| 238 | ULONG c = GetSeqCodeQuick(pos); |
|---|
| 239 | struct StdSfxNode *node; |
|---|
| 240 | |
|---|
| 241 | //printf("Searching %ld at %ld in node %ld: ", c, pos, nodenum); |
|---|
| 242 | nodenum = (pp->pp_StdSfxNodes[nodenum]).ssn_FirstChild; |
|---|
| 243 | while(nodenum) |
|---|
| 244 | { |
|---|
| 245 | node = &pp->pp_StdSfxNodes[nodenum]; |
|---|
| 246 | if(GetSeqCodeQuick(node->ssn_StartPos) == c) |
|---|
| 247 | { |
|---|
| 248 | break; |
|---|
| 249 | } |
|---|
| 250 | nodenum = node->ssn_NextSibling; |
|---|
| 251 | } |
|---|
| 252 | //printf("%ld\n", nodenum); |
|---|
| 253 | return(nodenum); |
|---|
| 254 | } |
|---|
| 255 | /* \\\ */ |
|---|
| 256 | |
|---|
| 257 | /* /// "SplitStdSfxNode()" */ |
|---|
| 258 | // Neue Node einfuegen und initialisieren |
|---|
| 259 | ULONG SplitStdSfxNode(struct PTPanPartition *pp, ULONG leafnum) |
|---|
| 260 | { |
|---|
| 261 | struct StdSfxNode *leafnode = &pp->pp_StdSfxNodes[leafnum]; |
|---|
| 262 | ULONG parnum = leafnode->ssn_Parent; |
|---|
| 263 | struct StdSfxNode *parnode = &pp->pp_StdSfxNodes[parnum]; |
|---|
| 264 | ULONG inum = pp->pp_NumBigNodes++; |
|---|
| 265 | struct StdSfxNode *inode = &pp->pp_StdSfxNodes[inum]; |
|---|
| 266 | struct StdSfxNode *tmpnode; |
|---|
| 267 | |
|---|
| 268 | //printf("Split node %ld\n", leafnum); |
|---|
| 269 | if(parnode->ssn_FirstChild == leafnum) // case 1: leaf is first child |
|---|
| 270 | { |
|---|
| 271 | // correct all linkages |
|---|
| 272 | parnode->ssn_FirstChild = inum; |
|---|
| 273 | } else { // case 2 leaf is some other child |
|---|
| 274 | // find previous sibling of leaf |
|---|
| 275 | tmpnode = &pp->pp_StdSfxNodes[parnode->ssn_FirstChild]; |
|---|
| 276 | while(tmpnode->ssn_NextSibling != leafnum) |
|---|
| 277 | { |
|---|
| 278 | tmpnode = &pp->pp_StdSfxNodes[tmpnode->ssn_NextSibling]; |
|---|
| 279 | } |
|---|
| 280 | // correct all linkages |
|---|
| 281 | tmpnode->ssn_NextSibling = inum; |
|---|
| 282 | } |
|---|
| 283 | inode->ssn_FirstChild = leafnum; |
|---|
| 284 | inode->ssn_NextSibling = leafnode->ssn_NextSibling; |
|---|
| 285 | inode->ssn_Parent = parnum; |
|---|
| 286 | leafnode->ssn_NextSibling = 0; |
|---|
| 287 | leafnode->ssn_Parent = inum; |
|---|
| 288 | return(inum); |
|---|
| 289 | } |
|---|
| 290 | /* \\\ */ |
|---|
| 291 | |
|---|
| 292 | /* /// "FindStdSfxNode()" */ |
|---|
| 293 | // langsames Finden der naechsten Node (wir wissen nicht, ob diese existiert) |
|---|
| 294 | ULONG FindStdSfxNode(struct PTPanPartition *pp, ULONG snum, ULONG &sfxstart, ULONG sfxend) |
|---|
| 295 | { |
|---|
| 296 | struct PTPanGlobal *pg = pp->pp_PTPanGlobal; |
|---|
| 297 | ULONG *seqptr = pg->pg_MergedRawData; |
|---|
| 298 | //struct StdSfxNode *snode; |
|---|
| 299 | |
|---|
| 300 | //printf("FindStdSfxNode: %ld, (%ld-%ld)\n", snum, sfxstart, sfxend); |
|---|
| 301 | if(sfxstart == sfxend) // terminal character is always found |
|---|
| 302 | { |
|---|
| 303 | return(snum); |
|---|
| 304 | } |
|---|
| 305 | //snode = &pp->pp_StdSfxNodes[snum]; |
|---|
| 306 | do |
|---|
| 307 | { |
|---|
| 308 | ULONG i,ic; |
|---|
| 309 | ULONG fst, fend; |
|---|
| 310 | struct StdSfxNode *leafnode; |
|---|
| 311 | struct StdSfxNode *inode; |
|---|
| 312 | ULONG inum; |
|---|
| 313 | ULONG leafnum; |
|---|
| 314 | |
|---|
| 315 | leafnum = FindStdSfxChildNode(pp, snum, sfxstart); |
|---|
| 316 | if(!leafnum) |
|---|
| 317 | { |
|---|
| 318 | return(snum); |
|---|
| 319 | } |
|---|
| 320 | leafnode = &pp->pp_StdSfxNodes[leafnum]; |
|---|
| 321 | fst = leafnode->ssn_StartPos; |
|---|
| 322 | fend = fst + leafnode->ssn_EdgeLen; |
|---|
| 323 | |
|---|
| 324 | // Zeichen fuer Zeichen ueberpruefen |
|---|
| 325 | for(i = fst+1, ic = sfxstart+1; i < fend; i++, ic++) |
|---|
| 326 | { |
|---|
| 327 | if(i-fst+sfxstart >= sfxend) // Suchstringende erreicht -> "$" Blatt |
|---|
| 328 | break; |
|---|
| 329 | if(GetSeqCodeQuick(i) != GetSeqCodeQuick(ic)) // Zeichendifferenz? |
|---|
| 330 | break; |
|---|
| 331 | } |
|---|
| 332 | //printf("i: %ld\n", i); |
|---|
| 333 | if(i != fend) // Muss leaf gesplitted werden? |
|---|
| 334 | { |
|---|
| 335 | inum = SplitStdSfxNode(pp, leafnum); // Generate new node |
|---|
| 336 | inode = &pp->pp_StdSfxNodes[inum]; |
|---|
| 337 | inode->ssn_StartPos = fst; // neues Kind hat selben Suffixstart |
|---|
| 338 | inode->ssn_EdgeLen = i - fst; // aber Suffix endet zwischen den beiden |
|---|
| 339 | leafnode->ssn_StartPos = i; // leaf hat neuen Suffixstart bei i |
|---|
| 340 | leafnode->ssn_EdgeLen = fend - i; // leaf Suffixend bleibt unveraendert |
|---|
| 341 | /*printf("Inode %ld (%ld-%ld), leafnode %ld (%ld-%ld)\n", |
|---|
| 342 | inum, fst, i, leafnum, i, fend);*/ |
|---|
| 343 | sfxstart += i - fst; // Suffixstart fuer insertNode() korrigieren |
|---|
| 344 | return(inum); |
|---|
| 345 | } else { // Knoten vollstaendig gematcht |
|---|
| 346 | snum = leafnum; |
|---|
| 347 | sfxstart += fend - fst; |
|---|
| 348 | } |
|---|
| 349 | } while(TRUE); |
|---|
| 350 | } |
|---|
| 351 | /* \\\ */ |
|---|
| 352 | |
|---|
| 353 | /* /// "FastFindStdSfxNode()" */ |
|---|
| 354 | ULONG FastFindStdSfxNode(struct PTPanPartition *pp, ULONG snum, ULONG sfxstart, ULONG sfxend) |
|---|
| 355 | { |
|---|
| 356 | //struct StdSfxNode *snode; |
|---|
| 357 | |
|---|
| 358 | /* fast finding of next node (we know that it has to exist) */ |
|---|
| 359 | |
|---|
| 360 | if(sfxstart == sfxend) |
|---|
| 361 | { |
|---|
| 362 | return(snum); |
|---|
| 363 | } |
|---|
| 364 | do |
|---|
| 365 | { |
|---|
| 366 | ULONG fst, fend; |
|---|
| 367 | ULONG i; |
|---|
| 368 | struct StdSfxNode *leafnode; |
|---|
| 369 | struct StdSfxNode *inode; |
|---|
| 370 | ULONG inum; |
|---|
| 371 | ULONG leafnum; |
|---|
| 372 | |
|---|
| 373 | leafnum = FindStdSfxChildNode(pp, snum, sfxstart); |
|---|
| 374 | if(!leafnum) |
|---|
| 375 | { |
|---|
| 376 | printf("Shit!\n"); |
|---|
| 377 | exit(1); |
|---|
| 378 | } |
|---|
| 379 | leafnode = &pp->pp_StdSfxNodes[leafnum]; |
|---|
| 380 | i = leafnode->ssn_EdgeLen; |
|---|
| 381 | |
|---|
| 382 | if(sfxstart + i == sfxend) // do we terminate at a leaf? |
|---|
| 383 | { |
|---|
| 384 | return(leafnum); |
|---|
| 385 | } |
|---|
| 386 | fst = leafnode->ssn_StartPos; |
|---|
| 387 | fend = fst + i; |
|---|
| 388 | if(sfxstart + i > sfxend) // Bleiben wir innerhalb des Blattes haengen? |
|---|
| 389 | { |
|---|
| 390 | inum = SplitStdSfxNode(pp, leafnum); // Generate new node |
|---|
| 391 | inode = &pp->pp_StdSfxNodes[inum]; |
|---|
| 392 | inode->ssn_StartPos = fst; // neues Kind hat selben Suffixstart |
|---|
| 393 | inode->ssn_EdgeLen = sfxend - sfxstart; // aber Suffix endet zwischen den beiden |
|---|
| 394 | leafnode->ssn_StartPos = fst + (sfxend - sfxstart); // leaf hat neuen Suffixstart bei i |
|---|
| 395 | leafnode->ssn_EdgeLen = fend - leafnode->ssn_StartPos; // leaf Suffixend bleibt unveraendert |
|---|
| 396 | return(inum); |
|---|
| 397 | } |
|---|
| 398 | sfxstart += i; |
|---|
| 399 | snum = leafnum; |
|---|
| 400 | } while(TRUE); |
|---|
| 401 | } |
|---|
| 402 | /* \\\ */ |
|---|
| 403 | |
|---|
| 404 | /* /// "InsertStdSfxNode()" */ |
|---|
| 405 | ULONG InsertStdSfxNode(struct PTPanPartition *pp, ULONG sfxstart, ULONG sfxend, ULONG parnum) |
|---|
| 406 | { |
|---|
| 407 | struct StdSfxNode *parnode = &pp->pp_StdSfxNodes[parnum]; |
|---|
| 408 | ULONG inum = pp->pp_NumBigNodes++; |
|---|
| 409 | struct StdSfxNode *inode = &pp->pp_StdSfxNodes[inum]; |
|---|
| 410 | ULONG tmpnum; |
|---|
| 411 | struct StdSfxNode *tmpnode; |
|---|
| 412 | |
|---|
| 413 | inode->ssn_Parent = parnum; |
|---|
| 414 | inode->ssn_StartPos = sfxstart; |
|---|
| 415 | inode->ssn_EdgeLen = sfxend - sfxstart; |
|---|
| 416 | if((tmpnum = parnode->ssn_FirstChild)) |
|---|
| 417 | { |
|---|
| 418 | tmpnode = &pp->pp_StdSfxNodes[tmpnum]; |
|---|
| 419 | while((tmpnum = tmpnode->ssn_NextSibling)) |
|---|
| 420 | { |
|---|
| 421 | tmpnode = &pp->pp_StdSfxNodes[tmpnum]; |
|---|
| 422 | } |
|---|
| 423 | tmpnode->ssn_NextSibling = inum; |
|---|
| 424 | } else { |
|---|
| 425 | parnode->ssn_FirstChild = inum; |
|---|
| 426 | } |
|---|
| 427 | return(inum); |
|---|
| 428 | } |
|---|
| 429 | /* \\\ */ |
|---|
| 430 | |
|---|
| 431 | /* /// "WriteStdSuffixTreeToDisk()" */ |
|---|
| 432 | BOOL WriteStdSuffixTreeToDisk(struct PTPanPartition *pp) |
|---|
| 433 | { |
|---|
| 434 | struct PTPanGlobal *pg = pp->pp_PTPanGlobal; |
|---|
| 435 | struct StdSfxNode *ssn; |
|---|
| 436 | struct StdSfxNodeOnDisk *ssndisk; |
|---|
| 437 | ULONG cnt; |
|---|
| 438 | ULONG buffill; |
|---|
| 439 | ULONG chunksize = 1UL<<20; |
|---|
| 440 | ULONG pval; |
|---|
| 441 | ULONG *seqptr; |
|---|
| 442 | UWORD lcnt; |
|---|
| 443 | STRPTR tarseq; |
|---|
| 444 | |
|---|
| 445 | pg->pg_Bench.ts_Reloc += BenchTimePassed(pg); |
|---|
| 446 | /* now finally write it to disk */ |
|---|
| 447 | pp->pp_PartitionFile = fopen(pp->pp_PartitionName, "w"); |
|---|
| 448 | if(!pp->pp_PartitionFile) |
|---|
| 449 | { |
|---|
| 450 | printf("ERROR: Couldn't open partition file %s for writing!\n", |
|---|
| 451 | pp->pp_PartitionName); |
|---|
| 452 | return(FALSE); |
|---|
| 453 | } |
|---|
| 454 | |
|---|
| 455 | pp->pp_DiskTreeSize = pp->pp_NumBigNodes * sizeof(struct StdSfxNodeOnDisk); |
|---|
| 456 | pp->pp_DiskBufferSize = sizeof(StdSfxNodeOnDisk) * chunksize; |
|---|
| 457 | pp->pp_DiskBuffer = (UBYTE *) calloc(1, pp->pp_DiskBufferSize); |
|---|
| 458 | pp->pp_DiskPos = 0; |
|---|
| 459 | |
|---|
| 460 | WriteStdSuffixTreeHeader(pp); |
|---|
| 461 | |
|---|
| 462 | printf("Writing tree (%ld KB)",pp->pp_DiskTreeSize >> 10); |
|---|
| 463 | fflush(NULL); |
|---|
| 464 | cnt = pp->pp_NumBigNodes; |
|---|
| 465 | ssn = pp->pp_StdSfxNodes; |
|---|
| 466 | do |
|---|
| 467 | { |
|---|
| 468 | ssndisk = (struct StdSfxNodeOnDisk *) pp->pp_DiskBuffer; |
|---|
| 469 | buffill = 0; |
|---|
| 470 | do |
|---|
| 471 | { |
|---|
| 472 | ssndisk->ssn_StartPos = ssn->ssn_StartPos; |
|---|
| 473 | ssndisk->ssn_EdgeLen = ssn->ssn_EdgeLen; |
|---|
| 474 | ssndisk->ssn_FirstChild = ssn->ssn_FirstChild; |
|---|
| 475 | ssndisk->ssn_NextSibling = ssn->ssn_NextSibling; |
|---|
| 476 | ssndisk++; |
|---|
| 477 | ssn++; |
|---|
| 478 | cnt--; |
|---|
| 479 | } while((++buffill < chunksize) && cnt); |
|---|
| 480 | |
|---|
| 481 | printf("."); |
|---|
| 482 | fflush(NULL); |
|---|
| 483 | fwrite(pp->pp_DiskBuffer, sizeof(struct StdSfxNodeOnDisk) * buffill, 1, pp->pp_PartitionFile); |
|---|
| 484 | } while(cnt); |
|---|
| 485 | printf(".\n"); |
|---|
| 486 | printf("Writing raw text (%ld KB)",pp->pp_DiskTreeSize >> 10); |
|---|
| 487 | cnt = (pg->pg_TotalRawSize / MAXCODEFITLONG) + 1; |
|---|
| 488 | seqptr = pg->pg_MergedRawData; |
|---|
| 489 | tarseq = (STRPTR) pp->pp_DiskBuffer; |
|---|
| 490 | pp->pp_DiskPos = 0; |
|---|
| 491 | do |
|---|
| 492 | { |
|---|
| 493 | /* get next longword */ |
|---|
| 494 | pval = *seqptr++; |
|---|
| 495 | lcnt = MAXCODEFITLONG; |
|---|
| 496 | pval >>= pg->pg_BitsShiftTable[MAXCODEFITLONG]; |
|---|
| 497 | /* unpack compressed longword */ |
|---|
| 498 | do |
|---|
| 499 | { |
|---|
| 500 | *tarseq++ = (pval / pg->pg_PowerTable[--lcnt]) % pg->pg_AlphaSize; |
|---|
| 501 | } while(lcnt); |
|---|
| 502 | pp->pp_DiskPos += MAXCODEFITLONG; |
|---|
| 503 | if(pp->pp_DiskPos > pp->pp_DiskBufferSize - MAXCODEFITLONG) |
|---|
| 504 | { |
|---|
| 505 | fwrite(pp->pp_DiskBuffer, pp->pp_DiskPos, 1, pp->pp_PartitionFile); |
|---|
| 506 | tarseq = (STRPTR) pp->pp_DiskBuffer; |
|---|
| 507 | pp->pp_DiskPos = 0; |
|---|
| 508 | } |
|---|
| 509 | } while(--cnt); |
|---|
| 510 | pp->pp_DiskIdxSpace = ftell(pp->pp_PartitionFile); |
|---|
| 511 | fclose(pp->pp_PartitionFile); |
|---|
| 512 | free(pp->pp_DiskBuffer); |
|---|
| 513 | |
|---|
| 514 | pg->pg_Bench.ts_Writing += BenchTimePassed(pg); |
|---|
| 515 | |
|---|
| 516 | return(TRUE); |
|---|
| 517 | } |
|---|
| 518 | /* \\\ */ |
|---|
| 519 | |
|---|
| 520 | |
|---|
| 521 | /* /// "BuildPTPanIndex()" */ |
|---|
| 522 | /* build a whole new fresh and tidy index (main routine) */ |
|---|
| 523 | BOOL BuildPTPanIndex(struct PTPanGlobal *pg) |
|---|
| 524 | { |
|---|
| 525 | STRPTR newtreename; |
|---|
| 526 | struct PTPanPartition *pp; |
|---|
| 527 | ULONG memfree; |
|---|
| 528 | |
|---|
| 529 | printf("********************************\n" |
|---|
| 530 | "* Building new PT Pan Index... *\n" |
|---|
| 531 | "********************************\n"); |
|---|
| 532 | // Delete old tree first (why, can't we just build a new one and |
|---|
| 533 | // then rename it? Needs some extra disk space then though) |
|---|
| 534 | if(unlink(pg->pg_IndexName)) |
|---|
| 535 | { |
|---|
| 536 | if(GB_size_of_file(pg->pg_IndexName) >= 0) |
|---|
| 537 | { |
|---|
| 538 | fprintf(stderr, "Cannot remove %s\n", pg->pg_IndexName); |
|---|
| 539 | return(FALSE); |
|---|
| 540 | } |
|---|
| 541 | } |
|---|
| 542 | |
|---|
| 543 | // allocate memory for a temporary filename |
|---|
| 544 | newtreename = (STRPTR) malloc(strlen(pg->pg_IndexName) + 2); |
|---|
| 545 | strcpy(newtreename, pg->pg_IndexName); |
|---|
| 546 | strcat(newtreename, "~"); |
|---|
| 547 | |
|---|
| 548 | pg->pg_IndexFile = fopen(newtreename, "w"); /* open file for output */ |
|---|
| 549 | if(!pg->pg_IndexFile) |
|---|
| 550 | { |
|---|
| 551 | fprintf(stderr, "Cannot open %s for output.\n", newtreename); |
|---|
| 552 | free(newtreename); |
|---|
| 553 | return(FALSE); |
|---|
| 554 | } |
|---|
| 555 | GB_set_mode_of_file(newtreename, 0666); |
|---|
| 556 | |
|---|
| 557 | //GB_begin_transaction(pg->pg_MainDB); |
|---|
| 558 | |
|---|
| 559 | /* build index */ |
|---|
| 560 | BuildMergedDatabase(pg); |
|---|
| 561 | |
|---|
| 562 | printf("Freeing alignment cache to save memory..."); |
|---|
| 563 | memfree = FlushCache(pg->pg_SpeciesCache); |
|---|
| 564 | printf("%ld KB freed.\n", memfree >> 10); |
|---|
| 565 | |
|---|
| 566 | PartitionPrefixScan(pg); |
|---|
| 567 | |
|---|
| 568 | WriteIndexHeader(pg); |
|---|
| 569 | fclose(pg->pg_IndexFile); |
|---|
| 570 | |
|---|
| 571 | /* build tree for each partition */ |
|---|
| 572 | pp = (struct PTPanPartition *) pg->pg_Partitions.lh_Head; |
|---|
| 573 | while(pp->pp_Node.ln_Succ) |
|---|
| 574 | { |
|---|
| 575 | CreateTreeForPartition(pp); |
|---|
| 576 | pp = (struct PTPanPartition *) pp->pp_Node.ln_Succ; |
|---|
| 577 | } |
|---|
| 578 | //CreatePartitionLookup(pg); |
|---|
| 579 | |
|---|
| 580 | //GB_commit_transaction(pg->pg_MainDB); |
|---|
| 581 | |
|---|
| 582 | //if(GB_rename_file(newtreename, pg->pg_IndexName)) *** FIXME *** |
|---|
| 583 | if(GB_rename_file(newtreename, pg->pg_IndexName)) |
|---|
| 584 | { |
|---|
| 585 | GB_print_error(); |
|---|
| 586 | } |
|---|
| 587 | |
|---|
| 588 | if(GB_set_mode_of_file(pg->pg_IndexName, 0666)) |
|---|
| 589 | { |
|---|
| 590 | GB_print_error(); |
|---|
| 591 | } |
|---|
| 592 | free(newtreename); |
|---|
| 593 | return(TRUE); |
|---|
| 594 | } |
|---|
| 595 | /* \\\ */ |
|---|
| 596 | |
|---|
| 597 | /* /// "BuildMergedDatabase()" */ |
|---|
| 598 | BOOL BuildMergedDatabase(struct PTPanGlobal *pg) |
|---|
| 599 | { |
|---|
| 600 | struct PTPanSpecies *ps; |
|---|
| 601 | ULONG *seqptr; |
|---|
| 602 | ULONG seqcode; |
|---|
| 603 | ULONG cnt; |
|---|
| 604 | ULONG pval; |
|---|
| 605 | ULONG len; |
|---|
| 606 | BOOL dbopen = FALSE; |
|---|
| 607 | ULONG verlen; |
|---|
| 608 | ULONG abspos; |
|---|
| 609 | ULONG specabspos; |
|---|
| 610 | ULONG hash; |
|---|
| 611 | |
|---|
| 612 | BenchTimePassed(pg); |
|---|
| 613 | |
|---|
| 614 | /* allocate memory for compressed data */ |
|---|
| 615 | /* about the +3: 1 for rounding, 1 for MAXCODEFITLONG*SEQCODE_N |
|---|
| 616 | and 1 for terminal */ |
|---|
| 617 | pg->pg_MergedRawData = (ULONG *) malloc(((pg->pg_TotalRawSize / |
|---|
| 618 | MAXCODEFITLONG) + 3) * sizeof(ULONG)); |
|---|
| 619 | if(!pg->pg_MergedRawData) |
|---|
| 620 | { |
|---|
| 621 | printf("Sorry, couldn't allocate %ld KB of memory for the compressed DB!\n", |
|---|
| 622 | (((pg->pg_TotalRawSize / MAXCODEFITLONG) + 3) * sizeof(ULONG)) >> 10); |
|---|
| 623 | return(FALSE); |
|---|
| 624 | } |
|---|
| 625 | |
|---|
| 626 | /* init */ |
|---|
| 627 | seqptr = pg->pg_MergedRawData; |
|---|
| 628 | cnt = 0; |
|---|
| 629 | pval = 0; |
|---|
| 630 | len = 4; |
|---|
| 631 | abspos = 0; |
|---|
| 632 | |
|---|
| 633 | /* note: This has to be modified a bit to support compressed databases of >2GB |
|---|
| 634 | alignment data using pp->pp_RawPartitionOffset */ |
|---|
| 635 | |
|---|
| 636 | printf("Step 1: Building compressed database...\n"); |
|---|
| 637 | /* doing a linear scan -- caching is useless */ |
|---|
| 638 | DisableCache(pg->pg_SpeciesCache); |
|---|
| 639 | |
|---|
| 640 | /* traverse all species */ |
|---|
| 641 | ps = (struct PTPanSpecies *) pg->pg_Species.lh_Head; |
|---|
| 642 | while(ps->ps_Node.ln_Succ) |
|---|
| 643 | { |
|---|
| 644 | /* compress sequence */ |
|---|
| 645 | STRPTR srcstr; |
|---|
| 646 | UBYTE code; |
|---|
| 647 | |
|---|
| 648 | if(abspos != ps->ps_AbsOffset) |
|---|
| 649 | { |
|---|
| 650 | /* species seems to be corrupt! */ |
|---|
| 651 | printf("AbsPos %ld != %ld mismatch at %s\n", |
|---|
| 652 | abspos, ps->ps_AbsOffset, ps->ps_Name); |
|---|
| 653 | } |
|---|
| 654 | |
|---|
| 655 | |
|---|
| 656 | verlen = 0; |
|---|
| 657 | hash = 0; |
|---|
| 658 | specabspos = 0; |
|---|
| 659 | ULONG bitpos = 0; |
|---|
| 660 | ULONG count; |
|---|
| 661 | while((code = GetNextCharacter(pg, ps->ps_SeqDataCompressed, bitpos, count)) != 0xff) |
|---|
| 662 | { |
|---|
| 663 | #ifdef ALLOWDOTSINMATCH |
|---|
| 664 | if ((code == '.') && (count == 1)) |
|---|
| 665 | { |
|---|
| 666 | #if 1 // debug |
|---|
| 667 | ULONG tmpbitpos = bitpos; |
|---|
| 668 | ULONG tmpcount; |
|---|
| 669 | printf("Species: %s, abspos: %li, AbsOffset: %li >> %li*%c, ", |
|---|
| 670 | ps->ps_Name, abspos, ps->ps_AbsOffset, tmpcount, code); |
|---|
| 671 | for (int i = 0; i < 10; ++i) |
|---|
| 672 | { |
|---|
| 673 | code = GetNextCharacter(pg, ps->ps_SeqDataCompressed, tmpbitpos, tmpcount); |
|---|
| 674 | if (code == 0xff) break; |
|---|
| 675 | printf("%li*%c, ", tmpcount, code); |
|---|
| 676 | } |
|---|
| 677 | printf("\n"); |
|---|
| 678 | #endif |
|---|
| 679 | code = 'N'; |
|---|
| 680 | } |
|---|
| 681 | #endif |
|---|
| 682 | if(pg->pg_SeqCodeValidTable[code]) |
|---|
| 683 | { |
|---|
| 684 | /* add sequence code */ |
|---|
| 685 | if(verlen++ < ps->ps_RawDataSize) |
|---|
| 686 | { |
|---|
| 687 | abspos++; |
|---|
| 688 | specabspos++; |
|---|
| 689 | seqcode = pg->pg_CompressTable[code]; |
|---|
| 690 | pval *= pg->pg_AlphaSize; |
|---|
| 691 | pval += seqcode; |
|---|
| 692 | /* calculate hash */ |
|---|
| 693 | hash *= pg->pg_AlphaSize; |
|---|
| 694 | hash += seqcode; |
|---|
| 695 | hash %= HASHPRIME; |
|---|
| 696 | /* check, if storage capacity was reached? */ |
|---|
| 697 | if(++cnt == MAXCODEFITLONG) |
|---|
| 698 | { |
|---|
| 699 | /* write out compressed longword (with eof bit) */ |
|---|
| 700 | //printf("[%08lx]", pval | pg->pg_BitsMaskTable[cnt]); |
|---|
| 701 | *seqptr++ = (pval << pg->pg_BitsShiftTable[cnt]) | pg->pg_BitsMaskTable[cnt]; |
|---|
| 702 | cnt = 0; |
|---|
| 703 | pval = 0; |
|---|
| 704 | len += 4; |
|---|
| 705 | } |
|---|
| 706 | } |
|---|
| 707 | } |
|---|
| 708 | } |
|---|
| 709 | if(verlen != ps->ps_RawDataSize) |
|---|
| 710 | { |
|---|
| 711 | printf("Len %ld != %ld mismatch with %s\n", |
|---|
| 712 | verlen, ps->ps_RawDataSize, ps->ps_Name); |
|---|
| 713 | printf("Please check if this alignment is somehow corrupt!\n"); |
|---|
| 714 | } |
|---|
| 715 | //printf("\n"); |
|---|
| 716 | ps->ps_SeqHash = hash; |
|---|
| 717 | ps = (struct PTPanSpecies *) ps->ps_Node.ln_Succ; |
|---|
| 718 | } |
|---|
| 719 | |
|---|
| 720 | /* write pending bits (with eof bit) */ |
|---|
| 721 | /* after a lot of experimenting, padding with SEQCODE_N is the only sensible thing |
|---|
| 722 | to keep code size down and reduce cases */ |
|---|
| 723 | *seqptr++ = ((pval * pg->pg_PowerTable[MAXCODEFITLONG - cnt]) |
|---|
| 724 | << pg->pg_BitsShiftTable[MAXCODEFITLONG]) |
|---|
| 725 | | pg->pg_BitsMaskTable[MAXCODEFITLONG]; |
|---|
| 726 | //printf("[%08lx]\n", seqptr[-1]); |
|---|
| 727 | |
|---|
| 728 | /* add a final padding longword with SEQCODE_N */ |
|---|
| 729 | *seqptr++ = pg->pg_BitsMaskTable[MAXCODEFITLONG]; |
|---|
| 730 | |
|---|
| 731 | /* and a terminating bit */ |
|---|
| 732 | *seqptr = pg->pg_BitsMaskTable[0]; |
|---|
| 733 | |
|---|
| 734 | if(dbopen) /* close DB, if open */ |
|---|
| 735 | { |
|---|
| 736 | GB_commit_transaction(pg->pg_MainDB); |
|---|
| 737 | } |
|---|
| 738 | |
|---|
| 739 | /* Enable caching again */ |
|---|
| 740 | EnableCache(pg->pg_SpeciesCache); |
|---|
| 741 | |
|---|
| 742 | #if 0 /* debug */ |
|---|
| 743 | { |
|---|
| 744 | FILE *fh; |
|---|
| 745 | char tmpbuf[80]; |
|---|
| 746 | STRPTR str; |
|---|
| 747 | |
|---|
| 748 | sprintf(tmpbuf, "%s.raw", pg->pg_IndexName); |
|---|
| 749 | str = DecompressSequence(pg, pg->pg_MergedRawData); |
|---|
| 750 | fh = fopen(tmpbuf, "w"); |
|---|
| 751 | fputs(str, fh); |
|---|
| 752 | fclose(fh); |
|---|
| 753 | free(str); |
|---|
| 754 | } |
|---|
| 755 | #endif |
|---|
| 756 | |
|---|
| 757 | /* calculate hash sum over all data */ |
|---|
| 758 | //pg->pg_AllHashSum = GetSeqHash(pg, 0, pg->pg_TotalRawSize, 0); |
|---|
| 759 | /* formerly a global hash, now only a random key to check integrity |
|---|
| 760 | with other files. Database up2date state is checked with sequence |
|---|
| 761 | hashes */ |
|---|
| 762 | pg->pg_AllHashSum = rand(); |
|---|
| 763 | pg->pg_Bench.ts_MergeDB = BenchTimePassed(pg); |
|---|
| 764 | |
|---|
| 765 | printf("Merged compressed database size: %ld KB\n", len >> 10); |
|---|
| 766 | return(TRUE); |
|---|
| 767 | } |
|---|
| 768 | /* \\\ */ |
|---|
| 769 | |
|---|
| 770 | /* /// "PartitionPrefixScan() */ |
|---|
| 771 | BOOL PartitionPrefixScan(struct PTPanGlobal *pg) |
|---|
| 772 | { |
|---|
| 773 | struct PTPanPartition *pp; |
|---|
| 774 | ULONG cnt; |
|---|
| 775 | ULONG prefix; |
|---|
| 776 | ULONG seqpos; |
|---|
| 777 | ULONG *seqptr; |
|---|
| 778 | ULONG pval; |
|---|
| 779 | ULONG maxpartblock; |
|---|
| 780 | ULONG partsize; |
|---|
| 781 | ULONG leftpresize; |
|---|
| 782 | ULONG rightpresize; |
|---|
| 783 | ULONG prefixstart; |
|---|
| 784 | UWORD partcnt; |
|---|
| 785 | |
|---|
| 786 | BenchTimePassed(pg); |
|---|
| 787 | printf("Step 2: Partition calculation...\n"); |
|---|
| 788 | if(pg->pg_TotalRawSize < pg->pg_MaxPartitionSize) |
|---|
| 789 | { |
|---|
| 790 | /* everything fits into one partition */ |
|---|
| 791 | pp = (struct PTPanPartition *) calloc(1, sizeof(struct PTPanPartition)); |
|---|
| 792 | if(!pp) |
|---|
| 793 | { |
|---|
| 794 | return(FALSE); /* out of memory */ |
|---|
| 795 | } |
|---|
| 796 | |
|---|
| 797 | /* fill in sensible values */ |
|---|
| 798 | pp->pp_PTPanGlobal = pg; |
|---|
| 799 | pp->pp_ID = 0; |
|---|
| 800 | pp->pp_Prefix = 0; |
|---|
| 801 | pp->pp_PrefixLen = 0; |
|---|
| 802 | pp->pp_Size = pg->pg_TotalRawSize; |
|---|
| 803 | pp->pp_RawOffset = 0; |
|---|
| 804 | pp->pp_PartitionName = (STRPTR) calloc(strlen(pg->pg_IndexName) + 5, 1); |
|---|
| 805 | strncpy(pp->pp_PartitionName, pg->pg_IndexName, strlen(pg->pg_IndexName) - 2); |
|---|
| 806 | strcat(pp->pp_PartitionName, "t000"); |
|---|
| 807 | AddTail(&pg->pg_Partitions, &pp->pp_Node); |
|---|
| 808 | pg->pg_NumPartitions = 1; |
|---|
| 809 | printf("Using only one partition for %ld leaves.\n", pp->pp_Size); |
|---|
| 810 | return(TRUE); |
|---|
| 811 | } |
|---|
| 812 | if(!pg->pg_MergedRawData) /* safe checking */ |
|---|
| 813 | { |
|---|
| 814 | printf("Huh? No merged raw data. Blame your programmer NOW!\n"); |
|---|
| 815 | return(FALSE); |
|---|
| 816 | } |
|---|
| 817 | |
|---|
| 818 | /* make histogram */ |
|---|
| 819 | pg->pg_HistoTable = (ULONG *) calloc(pg->pg_PowerTable[MAXPREFIXSIZE], sizeof(ULONG)); |
|---|
| 820 | if(!pg->pg_HistoTable) |
|---|
| 821 | { |
|---|
| 822 | printf("Out of memory for histogram!\n"); |
|---|
| 823 | return(FALSE); |
|---|
| 824 | } |
|---|
| 825 | |
|---|
| 826 | /* NOTE: ordering for the index of a prefix in the table is: |
|---|
| 827 | c_{m} + c_{m-1} * 5 + ... + c_{1} * 5^{m-1} |
|---|
| 828 | This means that the last part of the prefix has the lowest |
|---|
| 829 | significance. */ |
|---|
| 830 | |
|---|
| 831 | /* scan through the compressed database */ |
|---|
| 832 | printf("Scanning through compact data...\n"); |
|---|
| 833 | seqptr = pg->pg_MergedRawData; |
|---|
| 834 | prefix = 0; |
|---|
| 835 | cnt = 0; |
|---|
| 836 | pval = 0; |
|---|
| 837 | for(seqpos = 0; seqpos < pg->pg_TotalRawSize + MAXPREFIXSIZE - 1; seqpos++) |
|---|
| 838 | { |
|---|
| 839 | /* get sequence code from packed data */ |
|---|
| 840 | if(!cnt) |
|---|
| 841 | { |
|---|
| 842 | pval = *seqptr++; |
|---|
| 843 | cnt = GetCompressedLongSize(pg, pval); |
|---|
| 844 | pval >>= pg->pg_BitsShiftTable[cnt]; |
|---|
| 845 | } |
|---|
| 846 | |
|---|
| 847 | /* generate new prefix code */ |
|---|
| 848 | prefix %= pg->pg_PowerTable[MAXPREFIXSIZE - 1]; |
|---|
| 849 | prefix *= pg->pg_AlphaSize; |
|---|
| 850 | prefix += (pval / pg->pg_PowerTable[--cnt]) % pg->pg_AlphaSize; |
|---|
| 851 | |
|---|
| 852 | /* increase histogram value */ |
|---|
| 853 | if(seqpos >= MAXPREFIXSIZE - 1) |
|---|
| 854 | { |
|---|
| 855 | pg->pg_HistoTable[prefix]++; |
|---|
| 856 | } |
|---|
| 857 | } |
|---|
| 858 | |
|---|
| 859 | /* generate partitions */ |
|---|
| 860 | cnt = 0; |
|---|
| 861 | prefixstart = 0; |
|---|
| 862 | maxpartblock = pg->pg_PowerTable[MAXPREFIXSIZE]; |
|---|
| 863 | partsize = 0; |
|---|
| 864 | partcnt = 0; |
|---|
| 865 | pg->pg_MaxPrefixLen = 0; |
|---|
| 866 | for(prefix = 0; prefix < pg->pg_PowerTable[MAXPREFIXSIZE];) |
|---|
| 867 | { |
|---|
| 868 | partsize++; |
|---|
| 869 | cnt += pg->pg_HistoTable[prefix]; |
|---|
| 870 | if((cnt > pg->pg_MaxPartitionSize) || (partsize >= maxpartblock)) |
|---|
| 871 | { |
|---|
| 872 | /* partition is full! */ |
|---|
| 873 | if(prefixstart == prefix) |
|---|
| 874 | { |
|---|
| 875 | printf("Warning: Partition overflow! Increase MAXPREFIXSIZE!\n"); |
|---|
| 876 | break; |
|---|
| 877 | } else { |
|---|
| 878 | ULONG ppos; |
|---|
| 879 | /* check first, if we had to partition the thing anyway */ |
|---|
| 880 | if(partsize < maxpartblock) |
|---|
| 881 | { |
|---|
| 882 | /* find out, how many leaves of the tree can be merged */ |
|---|
| 883 | for(leftpresize = 0; leftpresize < MAXPREFIXSIZE; leftpresize++) |
|---|
| 884 | { |
|---|
| 885 | if(((prefixstart / pg->pg_PowerTable[leftpresize]) % |
|---|
| 886 | pg->pg_AlphaSize) != 0) |
|---|
| 887 | { |
|---|
| 888 | break; |
|---|
| 889 | } |
|---|
| 890 | } |
|---|
| 891 | /* check, if we're still in the same block */ |
|---|
| 892 | if(prefix / pg->pg_PowerTable[leftpresize] == |
|---|
| 893 | prefixstart / pg->pg_PowerTable[leftpresize]) |
|---|
| 894 | { |
|---|
| 895 | for(rightpresize = 0; rightpresize <= leftpresize; rightpresize++) |
|---|
| 896 | { |
|---|
| 897 | if(prefixstart + pg->pg_PowerTable[rightpresize] > prefix) |
|---|
| 898 | { |
|---|
| 899 | break; |
|---|
| 900 | } |
|---|
| 901 | } |
|---|
| 902 | rightpresize--; |
|---|
| 903 | /* setup maxpartblock for all subpartions */ |
|---|
| 904 | maxpartblock = pg->pg_PowerTable[rightpresize]; |
|---|
| 905 | prefix = prefixstart + maxpartblock - 1; |
|---|
| 906 | } else { |
|---|
| 907 | /* we crossed the boundary, do last partition */ |
|---|
| 908 | prefix = prefixstart + pg->pg_PowerTable[leftpresize] - 1; |
|---|
| 909 | maxpartblock = pg->pg_PowerTable[MAXPREFIXSIZE] - prefix - 1; |
|---|
| 910 | } |
|---|
| 911 | } else { |
|---|
| 912 | /* we had to split the tree before, so this is just |
|---|
| 913 | another leaf */ |
|---|
| 914 | if((prefix + 1) % (maxpartblock * pg->pg_AlphaSize) == 0) |
|---|
| 915 | { |
|---|
| 916 | /* we have reached the last block, go back to the |
|---|
| 917 | normal search */ |
|---|
| 918 | maxpartblock = pg->pg_PowerTable[MAXPREFIXSIZE] - prefix - 1; |
|---|
| 919 | } |
|---|
| 920 | } |
|---|
| 921 | //printf("New range: %ld - %ld\n", prefixstart, prefix); |
|---|
| 922 | /* recalculate leaf count */ |
|---|
| 923 | cnt = 0; |
|---|
| 924 | for(ppos = prefixstart; ppos <= prefix; ppos++) |
|---|
| 925 | { |
|---|
| 926 | cnt += pg->pg_HistoTable[ppos]; |
|---|
| 927 | } |
|---|
| 928 | |
|---|
| 929 | /* don't create empty trees! */ |
|---|
| 930 | if(cnt) |
|---|
| 931 | { |
|---|
| 932 | /* get prefix length */ |
|---|
| 933 | for(leftpresize = 1; leftpresize < MAXPREFIXSIZE; leftpresize++) |
|---|
| 934 | { |
|---|
| 935 | if((prefix - prefixstart) < pg->pg_PowerTable[leftpresize]) |
|---|
| 936 | { |
|---|
| 937 | break; |
|---|
| 938 | } |
|---|
| 939 | } |
|---|
| 940 | pp = (struct PTPanPartition *) calloc(1, sizeof(struct PTPanPartition)); |
|---|
| 941 | if(!pp) |
|---|
| 942 | { |
|---|
| 943 | return(FALSE); /* out of memory */ |
|---|
| 944 | } |
|---|
| 945 | |
|---|
| 946 | /* fill in sensible values */ |
|---|
| 947 | pp->pp_PTPanGlobal = pg; |
|---|
| 948 | pp->pp_ID = partcnt++; |
|---|
| 949 | pp->pp_Prefix = prefix / pg->pg_PowerTable[leftpresize]; |
|---|
| 950 | pp->pp_PrefixLen = MAXPREFIXSIZE - leftpresize; |
|---|
| 951 | pp->pp_Size = cnt; |
|---|
| 952 | /* this is the point where you would change some things to support larger |
|---|
| 953 | raw datasets than 2 GB */ |
|---|
| 954 | pp->pp_RawOffset = 0; /* FIXME */ |
|---|
| 955 | pp->pp_PartitionName = (STRPTR) calloc(strlen(pg->pg_IndexName) + 5, 1); |
|---|
| 956 | strncpy(pp->pp_PartitionName, pg->pg_IndexName, strlen(pg->pg_IndexName) - 2); |
|---|
| 957 | sprintf(&pp->pp_PartitionName[strlen(pg->pg_IndexName) - 2], "t%03ld", |
|---|
| 958 | pp->pp_ID); |
|---|
| 959 | AddTail(&pg->pg_Partitions, &pp->pp_Node); |
|---|
| 960 | /* check, if we need a bigger prefix table */ |
|---|
| 961 | if(pp->pp_PrefixLen > pg->pg_MaxPrefixLen) |
|---|
| 962 | { |
|---|
| 963 | pg->pg_MaxPrefixLen = pp->pp_PrefixLen; |
|---|
| 964 | } |
|---|
| 965 | #ifdef DEBUG |
|---|
| 966 | { |
|---|
| 967 | STRPTR tarseq = pg->pg_TempBuffer; |
|---|
| 968 | ULONG ccnt = pp->pp_PrefixLen; |
|---|
| 969 | do |
|---|
| 970 | { |
|---|
| 971 | *tarseq++ = pg->pg_DecompressTable[(pp->pp_Prefix / pg->pg_PowerTable[--ccnt]) |
|---|
| 972 | % pg->pg_AlphaSize]; |
|---|
| 973 | } while(ccnt); |
|---|
| 974 | *tarseq = 0; |
|---|
| 975 | printf("New partition %ld - %ld, size %ld, prefix = %ld, prefixlen = %ld %s\n", |
|---|
| 976 | prefixstart, prefix, cnt, pp->pp_Prefix, pp->pp_PrefixLen, |
|---|
| 977 | pg->pg_TempBuffer); |
|---|
| 978 | } |
|---|
| 979 | #endif |
|---|
| 980 | } else { |
|---|
| 981 | #ifdef DEBUG |
|---|
| 982 | printf("Empty partition %ld - %ld\n", prefixstart, prefix); |
|---|
| 983 | #endif |
|---|
| 984 | } |
|---|
| 985 | prefix++; |
|---|
| 986 | /* restart */ |
|---|
| 987 | prefixstart = prefix; |
|---|
| 988 | cnt = 0; |
|---|
| 989 | partsize = 0; |
|---|
| 990 | } |
|---|
| 991 | } else { |
|---|
| 992 | prefix++; |
|---|
| 993 | } |
|---|
| 994 | } |
|---|
| 995 | free(pg->pg_HistoTable); |
|---|
| 996 | pg->pg_NumPartitions = partcnt; |
|---|
| 997 | printf("Using %d partitions.\n", partcnt); |
|---|
| 998 | pg->pg_Bench.ts_PrefixScan = BenchTimePassed(pg); |
|---|
| 999 | return(TRUE); |
|---|
| 1000 | } |
|---|
| 1001 | /* \\\ */ |
|---|
| 1002 | |
|---|
| 1003 | /* /// "CreateTreeForPartition()" */ |
|---|
| 1004 | BOOL CreateTreeForPartition(struct PTPanPartition *pp) |
|---|
| 1005 | { |
|---|
| 1006 | struct PTPanGlobal *pg = pp->pp_PTPanGlobal; |
|---|
| 1007 | |
|---|
| 1008 | printf("*** Partition %ld/%d: %ld nodes (%ld%%) ***\n", |
|---|
| 1009 | pp->pp_ID + 1, pg->pg_NumPartitions, |
|---|
| 1010 | pp->pp_Size, pp->pp_Size / (pg->pg_TotalRawSize / 100)); |
|---|
| 1011 | |
|---|
| 1012 | printf(">>> Phase 1: Building up suffix tree in memory... <<<\n"); |
|---|
| 1013 | BuildMemoryTree(pp); |
|---|
| 1014 | |
|---|
| 1015 | /* prepare tree building */ |
|---|
| 1016 | printf(">>> Phase 2: Calculating tree statistical data... <<<\n"); |
|---|
| 1017 | CalculateTreeStats(pp); |
|---|
| 1018 | |
|---|
| 1019 | /* write out tree */ |
|---|
| 1020 | printf(">>> Phase 3: Writing tree to secondary storage... <<<\n"); |
|---|
| 1021 | WriteTreeToDisk(pp); |
|---|
| 1022 | |
|---|
| 1023 | printf(">>> Phase 4: Freeing memory and cleaning it up... <<<\n"); |
|---|
| 1024 | |
|---|
| 1025 | /* return some memory not used anymore */ |
|---|
| 1026 | freeset(pp->pp_SfxNodes, NULL); |
|---|
| 1027 | freeset(pp->pp_BranchCode, NULL); |
|---|
| 1028 | freeset(pp->pp_ShortEdgeCode, NULL); |
|---|
| 1029 | freeset(pp->pp_LongEdgeLenCode, NULL); |
|---|
| 1030 | freeset(pp->pp_LongDictRaw, NULL); |
|---|
| 1031 | freeset(pp->pp_LeafBuffer, NULL); |
|---|
| 1032 | |
|---|
| 1033 | pp->pp_LevelStats = NULL; |
|---|
| 1034 | pp->pp_LeafBufferSize = 0; |
|---|
| 1035 | |
|---|
| 1036 | return(TRUE); |
|---|
| 1037 | } |
|---|
| 1038 | /* \\\ */ |
|---|
| 1039 | |
|---|
| 1040 | /* /// "BuildMemoryTree()" */ |
|---|
| 1041 | BOOL BuildMemoryTree(struct PTPanPartition *pp) |
|---|
| 1042 | { |
|---|
| 1043 | struct PTPanGlobal *pg = pp->pp_PTPanGlobal; |
|---|
| 1044 | ULONG *seqptr ; |
|---|
| 1045 | ULONG pval, pvalnext; |
|---|
| 1046 | ULONG cnt; |
|---|
| 1047 | ULONG nodecnt; |
|---|
| 1048 | ULONG pos; |
|---|
| 1049 | ULONG len; |
|---|
| 1050 | ULONG window; |
|---|
| 1051 | |
|---|
| 1052 | BenchTimePassed(pg); |
|---|
| 1053 | /* setup node buffer: |
|---|
| 1054 | some notes about the node organisation: nodes are kept in a partly |
|---|
| 1055 | compressed way in memory already: they only store the number of edges they |
|---|
| 1056 | need. Therefore the big node buffer is split into two parts: |
|---|
| 1057 | - Nodes with three to five edges starting from the first offset and increasing |
|---|
| 1058 | - Nodes with two edges starting from the very end and decreasing |
|---|
| 1059 | - Leaf nodes are stored directly as offset inside a node with the MSB indicating |
|---|
| 1060 | that this ptr is a leaf. |
|---|
| 1061 | */ |
|---|
| 1062 | /* this is based on empirical data: Big Nodes are approx 8-15%, small nodes 60-90% */ |
|---|
| 1063 | pp->pp_SfxMemorySize = ((((pp->pp_Size / 100) * SMALLNODESPERCENT) * |
|---|
| 1064 | sizeof(struct SfxNode2Edges)) + |
|---|
| 1065 | (((pp->pp_Size / 100) * BIGNODESPERCENT) * |
|---|
| 1066 | sizeof(struct SfxNodeNEdges)) + (4UL << 20)) & ~3; |
|---|
| 1067 | if(pp->pp_SfxMemorySize >= (1UL << 30)) |
|---|
| 1068 | { |
|---|
| 1069 | pp->pp_SfxMemorySize = (1UL << 30)-4; |
|---|
| 1070 | printf("Warning! Memory limited to 1 GB! Might run out!\n"); |
|---|
| 1071 | } |
|---|
| 1072 | pp->pp_SfxNodes = (UBYTE *) malloc(pp->pp_SfxMemorySize); |
|---|
| 1073 | if(!pp->pp_SfxNodes) |
|---|
| 1074 | { |
|---|
| 1075 | printf("Couldn't allocate %ld KB for suffix nodes.\n", |
|---|
| 1076 | pp->pp_SfxMemorySize >> 10); |
|---|
| 1077 | return(FALSE); |
|---|
| 1078 | } |
|---|
| 1079 | /* init pointing offsets */ |
|---|
| 1080 | pp->pp_SfxNEdgeOffset = 0; |
|---|
| 1081 | pp->pp_Sfx2EdgeOffset = pp->pp_SfxMemorySize; |
|---|
| 1082 | |
|---|
| 1083 | printf("Allocated %ld KB suffix nodes buffer.\n", pp->pp_SfxMemorySize >> 10); |
|---|
| 1084 | |
|---|
| 1085 | /* fill in root node */ |
|---|
| 1086 | pp->pp_SfxRoot = (struct SfxNode *) &pp->pp_SfxNodes[pp->pp_SfxNEdgeOffset]; |
|---|
| 1087 | pp->pp_SfxRoot->sn_Parent = 0; |
|---|
| 1088 | pp->pp_SfxRoot->sn_StartPos = 0;//pos; |
|---|
| 1089 | pp->pp_SfxRoot->sn_EdgeLen = 0;//pg->pg_TotalRawSize - len; |
|---|
| 1090 | memset(pp->pp_SfxRoot->sn_Children, 0, pg->pg_AlphaSize * sizeof(ULONG)); |
|---|
| 1091 | pp->pp_SfxNEdgeOffset += sizeof(struct SfxNodeNEdges); |
|---|
| 1092 | |
|---|
| 1093 | /* main loop to build up the tree */ |
|---|
| 1094 | /* NOTE: as a special precaution, all longwords have MAXCODEFITLONG code length */ |
|---|
| 1095 | |
|---|
| 1096 | /* allocate quick lookup table. This is used to speed up traversal of the |
|---|
| 1097 | MAXQPREFIXLOOKUPSIZE lowest levels */ |
|---|
| 1098 | pp->pp_QuickPrefixLookup = (ULONG *) calloc(pg->pg_PowerTable[MAXQPREFIXLOOKUPSIZE], |
|---|
| 1099 | sizeof(ULONG)); |
|---|
| 1100 | if(!pp->pp_QuickPrefixLookup) |
|---|
| 1101 | { |
|---|
| 1102 | printf("Out of memory for Quick Prefix Lookup!\n"); |
|---|
| 1103 | return(FALSE); |
|---|
| 1104 | } |
|---|
| 1105 | |
|---|
| 1106 | len = pg->pg_TotalRawSize; |
|---|
| 1107 | seqptr = pg->pg_MergedRawData; |
|---|
| 1108 | /* get first longword */ |
|---|
| 1109 | pval = *seqptr++; |
|---|
| 1110 | pval >>= pg->pg_BitsShiftTable[MAXCODEFITLONG]; |
|---|
| 1111 | cnt = MAXCODEFITLONG; |
|---|
| 1112 | /* get second longword */ |
|---|
| 1113 | pvalnext = *seqptr++; |
|---|
| 1114 | pvalnext >>= pg->pg_BitsShiftTable[MAXCODEFITLONG]; |
|---|
| 1115 | pos = 0; |
|---|
| 1116 | nodecnt = 0; |
|---|
| 1117 | pp->pp_QuickPrefixCount = 0; |
|---|
| 1118 | len -= MAXCODEFITLONG; |
|---|
| 1119 | do |
|---|
| 1120 | { |
|---|
| 1121 | BOOL takepos; |
|---|
| 1122 | |
|---|
| 1123 | window = ((pval % pg->pg_PowerTable[cnt]) * |
|---|
| 1124 | pg->pg_PowerTable[MAXCODEFITLONG - cnt]) + |
|---|
| 1125 | (pvalnext / pg->pg_PowerTable[cnt]); |
|---|
| 1126 | if(pp->pp_PrefixLen) |
|---|
| 1127 | { |
|---|
| 1128 | /* check, if the prefix matches */ |
|---|
| 1129 | takepos = (window / pg->pg_PowerTable[MAXCODEFITLONG - pp->pp_PrefixLen] == pp->pp_Prefix); |
|---|
| 1130 | } else { |
|---|
| 1131 | takepos = TRUE; /* it's all one big partition */ |
|---|
| 1132 | } |
|---|
| 1133 | |
|---|
| 1134 | if(takepos) /* only add this position, if it matches the prefix */ |
|---|
| 1135 | { |
|---|
| 1136 | if(!(InsertTreePos(pp, pos, window))) |
|---|
| 1137 | { |
|---|
| 1138 | break; |
|---|
| 1139 | } |
|---|
| 1140 | if((++nodecnt & 0x3fff) == 0) |
|---|
| 1141 | { |
|---|
| 1142 | if((nodecnt >> 14) % 50) |
|---|
| 1143 | { |
|---|
| 1144 | printf("."); |
|---|
| 1145 | fflush(stdout); |
|---|
| 1146 | } else { |
|---|
| 1147 | printf(". %2ld%% (%6ld KB free)\n", |
|---|
| 1148 | pos / (pg->pg_TotalRawSize / 100), |
|---|
| 1149 | (pp->pp_Sfx2EdgeOffset - pp->pp_SfxNEdgeOffset) >> 10); |
|---|
| 1150 | } |
|---|
| 1151 | } |
|---|
| 1152 | } |
|---|
| 1153 | |
|---|
| 1154 | /* get next byte */ |
|---|
| 1155 | cnt--; |
|---|
| 1156 | if(len) |
|---|
| 1157 | { |
|---|
| 1158 | if(!cnt) |
|---|
| 1159 | { |
|---|
| 1160 | /* get next position */ |
|---|
| 1161 | pval = pvalnext; |
|---|
| 1162 | pvalnext = *seqptr++; |
|---|
| 1163 | pvalnext >>= pg->pg_BitsShiftTable[MAXCODEFITLONG]; |
|---|
| 1164 | cnt = MAXCODEFITLONG; |
|---|
| 1165 | } |
|---|
| 1166 | len--; |
|---|
| 1167 | } else { |
|---|
| 1168 | if(!cnt) |
|---|
| 1169 | { |
|---|
| 1170 | pval = pvalnext; |
|---|
| 1171 | pvalnext = 0; |
|---|
| 1172 | cnt = MAXCODEFITLONG; |
|---|
| 1173 | } |
|---|
| 1174 | } |
|---|
| 1175 | } while(++pos < pg->pg_TotalRawSize); |
|---|
| 1176 | pp->pp_NumBigNodes = pp->pp_SfxNEdgeOffset / sizeof(struct SfxNodeNEdges); |
|---|
| 1177 | pp->pp_NumSmallNodes = (pp->pp_SfxMemorySize - pp->pp_Sfx2EdgeOffset) / |
|---|
| 1178 | sizeof(struct SfxNode2Edges); |
|---|
| 1179 | |
|---|
| 1180 | printf("DONE! (%ld KB unused)\n", (pp->pp_Sfx2EdgeOffset - pp->pp_SfxNEdgeOffset) >> 10); |
|---|
| 1181 | |
|---|
| 1182 | /* free some memory not required anymore */ |
|---|
| 1183 | freeset(pp->pp_QuickPrefixLookup, NULL); |
|---|
| 1184 | |
|---|
| 1185 | printf("Quick Prefix Lookup speedup: %ld%% (%ld)\n", |
|---|
| 1186 | (pp->pp_QuickPrefixCount * 100) / pp->pp_Size, pp->pp_QuickPrefixCount); |
|---|
| 1187 | |
|---|
| 1188 | if(pp->pp_Size != nodecnt) |
|---|
| 1189 | { |
|---|
| 1190 | printf("Something very bad has happened! Predicted partition size [%ld] didn't\n" |
|---|
| 1191 | "match the actual generated nodes [%ld].\n", |
|---|
| 1192 | pp->pp_Size, nodecnt); |
|---|
| 1193 | return(FALSE); |
|---|
| 1194 | } |
|---|
| 1195 | |
|---|
| 1196 | printf("Nodes : %6ld\n", nodecnt); |
|---|
| 1197 | printf("SmallNodes: %6ld (%ld%%)\n", |
|---|
| 1198 | pp->pp_NumSmallNodes, |
|---|
| 1199 | (pp->pp_NumSmallNodes * 100) / nodecnt); |
|---|
| 1200 | printf("BigNodes : %6ld (%ld%%)\n", |
|---|
| 1201 | pp->pp_NumBigNodes, |
|---|
| 1202 | (pp->pp_NumBigNodes * 100) / nodecnt); |
|---|
| 1203 | |
|---|
| 1204 | pg->pg_Bench.ts_MemTree += BenchTimePassed(pg); |
|---|
| 1205 | return(TRUE); |
|---|
| 1206 | } |
|---|
| 1207 | /* \\\ */ |
|---|
| 1208 | |
|---|
| 1209 | /* /// "CommonSequenceLength()" */ |
|---|
| 1210 | ULONG CommonSequenceLength(struct PTPanPartition *pp, |
|---|
| 1211 | ULONG spos1, ULONG spos2, ULONG maxlen) |
|---|
| 1212 | { |
|---|
| 1213 | struct PTPanGlobal *pg = pp->pp_PTPanGlobal; |
|---|
| 1214 | ULONG *seqptr = pg->pg_MergedRawData; |
|---|
| 1215 | ULONG off1 = spos1 / MAXCODEFITLONG; |
|---|
| 1216 | ULONG cnt1 = spos1 % MAXCODEFITLONG; |
|---|
| 1217 | ULONG off2 = spos2 / MAXCODEFITLONG; |
|---|
| 1218 | ULONG cnt2 = spos2 % MAXCODEFITLONG; |
|---|
| 1219 | ULONG len = 0; |
|---|
| 1220 | |
|---|
| 1221 | /* a note on the implementation: this routine will not work correctly, |
|---|
| 1222 | if the sequences are identical or completely of SEQCODE_N, as it |
|---|
| 1223 | doesn't detect the end of sequence bits, but assumes MAXCODEFITLONG |
|---|
| 1224 | entries for all longwords. */ |
|---|
| 1225 | /* compare one code */ |
|---|
| 1226 | while((((seqptr[off1] >> pg->pg_BitsShiftTable[MAXCODEFITLONG]) / |
|---|
| 1227 | pg->pg_PowerTable[MAXCODEFITLONG - cnt1 - 1]) % pg->pg_AlphaSize) == |
|---|
| 1228 | (((seqptr[off2] >> pg->pg_BitsShiftTable[MAXCODEFITLONG]) / |
|---|
| 1229 | pg->pg_PowerTable[MAXCODEFITLONG - cnt2 - 1]) % pg->pg_AlphaSize)) |
|---|
| 1230 | { |
|---|
| 1231 | /* loop while code is identical */ |
|---|
| 1232 | len++; |
|---|
| 1233 | if(len >= maxlen) |
|---|
| 1234 | { |
|---|
| 1235 | break; |
|---|
| 1236 | } |
|---|
| 1237 | if(++cnt1 >= MAXCODEFITLONG) |
|---|
| 1238 | { |
|---|
| 1239 | cnt1 = 0; |
|---|
| 1240 | off1++; |
|---|
| 1241 | } |
|---|
| 1242 | if(++cnt2 >= MAXCODEFITLONG) |
|---|
| 1243 | { |
|---|
| 1244 | cnt2 = 0; |
|---|
| 1245 | off2++; |
|---|
| 1246 | } |
|---|
| 1247 | } |
|---|
| 1248 | return(len); |
|---|
| 1249 | } |
|---|
| 1250 | /* \\\ */ |
|---|
| 1251 | |
|---|
| 1252 | /* /// "CompareCompressedSequence()" */ |
|---|
| 1253 | LONG CompareCompressedSequence(struct PTPanGlobal *pg, ULONG spos1, ULONG spos2) |
|---|
| 1254 | { |
|---|
| 1255 | ULONG *seqptr = pg->pg_MergedRawData; |
|---|
| 1256 | ULONG off1 = spos1 / MAXCODEFITLONG; |
|---|
| 1257 | ULONG cnt1 = spos1 % MAXCODEFITLONG; |
|---|
| 1258 | ULONG off2 = spos2 / MAXCODEFITLONG; |
|---|
| 1259 | ULONG cnt2 = spos2 % MAXCODEFITLONG; |
|---|
| 1260 | ULONG window1, window2; |
|---|
| 1261 | |
|---|
| 1262 | /* a note on the implementation: this routine will not work correctly, |
|---|
| 1263 | if the sequences are identical or completely of SEQCODE_N, as it |
|---|
| 1264 | doesn't detect the end of sequence bits, but assumes MAXCODEFITLONG |
|---|
| 1265 | entries for all longwords. */ |
|---|
| 1266 | do |
|---|
| 1267 | { |
|---|
| 1268 | /* get windows */ |
|---|
| 1269 | /* I know this looks like very obfusciated code, but take your time and you will |
|---|
| 1270 | understand: take a few codebits from the one longword, add a few bits from |
|---|
| 1271 | the other */ |
|---|
| 1272 | window1 = (((seqptr[off1] >> pg->pg_BitsShiftTable[MAXCODEFITLONG]) |
|---|
| 1273 | % pg->pg_PowerTable[MAXCODEFITLONG - cnt1]) * pg->pg_PowerTable[cnt1]) |
|---|
| 1274 | + ((seqptr[off1 + 1] >> pg->pg_BitsShiftTable[MAXCODEFITLONG]) / |
|---|
| 1275 | pg->pg_PowerTable[MAXCODEFITLONG - cnt1]); |
|---|
| 1276 | window2 = (((seqptr[off2] >> pg->pg_BitsShiftTable[MAXCODEFITLONG]) |
|---|
| 1277 | % pg->pg_PowerTable[MAXCODEFITLONG - cnt2]) * pg->pg_PowerTable[cnt2]) |
|---|
| 1278 | + ((seqptr[off2 + 1] >> pg->pg_BitsShiftTable[MAXCODEFITLONG]) / |
|---|
| 1279 | pg->pg_PowerTable[MAXCODEFITLONG - cnt2]); |
|---|
| 1280 | |
|---|
| 1281 | /* compare the windows */ |
|---|
| 1282 | if(window1 == window2) |
|---|
| 1283 | { |
|---|
| 1284 | /* was the same, look at next position */ |
|---|
| 1285 | if(++cnt1 >= MAXCODEFITLONG) |
|---|
| 1286 | { |
|---|
| 1287 | cnt1 = 0; |
|---|
| 1288 | off1++; |
|---|
| 1289 | } |
|---|
| 1290 | if(++cnt2 >= MAXCODEFITLONG) |
|---|
| 1291 | { |
|---|
| 1292 | cnt2 = 0; |
|---|
| 1293 | off2++; |
|---|
| 1294 | } |
|---|
| 1295 | } else { |
|---|
| 1296 | return(((LONG) window1) - ((LONG) window2)); |
|---|
| 1297 | } |
|---|
| 1298 | } while(TRUE); |
|---|
| 1299 | return(0); /* never reached */ |
|---|
| 1300 | } |
|---|
| 1301 | /* \\\ */ |
|---|
| 1302 | |
|---|
| 1303 | /* /// "InsertTreePos()" */ |
|---|
| 1304 | BOOL InsertTreePos(struct PTPanPartition *pp, ULONG pos, ULONG window) |
|---|
| 1305 | { |
|---|
| 1306 | struct PTPanGlobal *pg = pp->pp_PTPanGlobal; |
|---|
| 1307 | struct SfxNode *sfxnode = pp->pp_SfxRoot; |
|---|
| 1308 | struct SfxNode *prevnode; |
|---|
| 1309 | ULONG *seqptr = pg->pg_MergedRawData; |
|---|
| 1310 | ULONG relptr; |
|---|
| 1311 | ULONG len; |
|---|
| 1312 | UBYTE seqcode; |
|---|
| 1313 | ULONG childptr; |
|---|
| 1314 | UWORD childidx, childcnt; |
|---|
| 1315 | UWORD previdx; |
|---|
| 1316 | BOOL childisleaf; |
|---|
| 1317 | ULONG prefix; |
|---|
| 1318 | ULONG treepos; |
|---|
| 1319 | |
|---|
| 1320 | prefix = window / pg->pg_PowerTable[MAXCODEFITLONG - MAXQPREFIXLOOKUPSIZE]; |
|---|
| 1321 | /* see if we can use a quick lookup to skip the root levels of the tree */ |
|---|
| 1322 | if((childptr = pp->pp_QuickPrefixLookup[prefix]) && |
|---|
| 1323 | (pos + MAXQPREFIXLOOKUPSIZE < pg->pg_TotalRawSize)) |
|---|
| 1324 | { |
|---|
| 1325 | pp->pp_QuickPrefixCount++; |
|---|
| 1326 | sfxnode = (struct SfxNode *) &pp->pp_SfxNodes[childptr]; |
|---|
| 1327 | pos += MAXQPREFIXLOOKUPSIZE; |
|---|
| 1328 | treepos = MAXQPREFIXLOOKUPSIZE; |
|---|
| 1329 | } else { |
|---|
| 1330 | treepos = 0; |
|---|
| 1331 | } |
|---|
| 1332 | len = pg->pg_TotalRawSize - pos; |
|---|
| 1333 | while(len) |
|---|
| 1334 | { |
|---|
| 1335 | /* get first sequence code */ |
|---|
| 1336 | seqcode = GetSeqCodeQuick(pos); |
|---|
| 1337 | |
|---|
| 1338 | /*printf("[%ld%c] ", pos, //((ULONG) sfxnode) - ((ULONG) pp->pp_SfxNodes), |
|---|
| 1339 | pg->pg_DecompressTable[seqcode]);*/ |
|---|
| 1340 | |
|---|
| 1341 | /* check, if there's already a child */ |
|---|
| 1342 | relptr = 0; |
|---|
| 1343 | childidx = sfxnode->sn_Parent >> RELOFFSETBITS; |
|---|
| 1344 | |
|---|
| 1345 | while(childidx--) |
|---|
| 1346 | { |
|---|
| 1347 | childptr = sfxnode->sn_Children[childidx]; |
|---|
| 1348 | if(childptr >> LEAFBIT) |
|---|
| 1349 | { |
|---|
| 1350 | /* this is a leaf pointer and doesn't contain a seqcode */ |
|---|
| 1351 | childptr &= ~LEAFMASK; |
|---|
| 1352 | childptr += treepos; |
|---|
| 1353 | //printf("<%c>", pg->pg_DecompressTable[GetSeqCodeQuick(childptr)]); |
|---|
| 1354 | if(GetSeqCodeQuick(childptr) == seqcode) |
|---|
| 1355 | { |
|---|
| 1356 | /* fill in a dummy relptr -- we'll use childptr later */ |
|---|
| 1357 | relptr = ~0UL; |
|---|
| 1358 | childisleaf = TRUE; |
|---|
| 1359 | break; |
|---|
| 1360 | } |
|---|
| 1361 | } else { |
|---|
| 1362 | //printf("[%c]", pg->pg_DecompressTable[childptr >> RELOFFSETBITS]); |
|---|
| 1363 | if((childptr >> RELOFFSETBITS) == seqcode) |
|---|
| 1364 | { |
|---|
| 1365 | /* hey, we actually found the right child */ |
|---|
| 1366 | relptr = (childptr & RELOFFSETMASK) << 2; |
|---|
| 1367 | childisleaf = FALSE; |
|---|
| 1368 | break; |
|---|
| 1369 | } |
|---|
| 1370 | } |
|---|
| 1371 | } |
|---|
| 1372 | /* did we find a child? */ |
|---|
| 1373 | if(relptr) |
|---|
| 1374 | { |
|---|
| 1375 | ULONG matchsize; |
|---|
| 1376 | |
|---|
| 1377 | if(childisleaf) |
|---|
| 1378 | { |
|---|
| 1379 | struct SfxNode *splitnode; |
|---|
| 1380 | /* relptr is no pointer to a node, but a startpos instead */ |
|---|
| 1381 | matchsize = CommonSequenceLength(pp, pos, childptr, pg->pg_TotalRawSize - childptr); |
|---|
| 1382 | |
|---|
| 1383 | /* this will always lead to partial matches! */ |
|---|
| 1384 | |
|---|
| 1385 | /* allocate a new branching node */ |
|---|
| 1386 | pp->pp_Sfx2EdgeOffset -= sizeof(struct SfxNode2Edges); |
|---|
| 1387 | |
|---|
| 1388 | /*printf("Leaf split (pos %ld, len %ld): %d (%c != %c) -> [%ld]\n", |
|---|
| 1389 | pos, len, matchsize, |
|---|
| 1390 | pg->pg_DecompressTable[GetSeqCodeQuick(childptr + matchsize)], |
|---|
| 1391 | pg->pg_DecompressTable[GetSeqCodeQuick(pos + matchsize)], |
|---|
| 1392 | pp->pp_Sfx2EdgeOffset);*/ |
|---|
| 1393 | |
|---|
| 1394 | splitnode = (struct SfxNode *) &pp->pp_SfxNodes[pp->pp_Sfx2EdgeOffset]; |
|---|
| 1395 | /* fill in the node with the two leaves */ |
|---|
| 1396 | splitnode->sn_Parent = ((((ULONG) sfxnode) - ((ULONG) pp->pp_SfxNodes)) >> 2) | |
|---|
| 1397 | (2 << RELOFFSETBITS); |
|---|
| 1398 | splitnode->sn_StartPos = childptr; |
|---|
| 1399 | splitnode->sn_EdgeLen = matchsize; |
|---|
| 1400 | splitnode->sn_Children[0] = LEAFMASK | (childptr - treepos); |
|---|
| 1401 | splitnode->sn_Children[1] = LEAFMASK | (pos - treepos); |
|---|
| 1402 | /*printf("Child0 = %ld, Child1 = %ld\n", |
|---|
| 1403 | splitnode->sn_Children[0] & ~LEAFMASK, |
|---|
| 1404 | splitnode->sn_Children[1] & ~LEAFMASK);*/ |
|---|
| 1405 | #if 0 // debug |
|---|
| 1406 | if(GetSeqCodeQuick(childptr + matchsize) == GetSeqCodeQuick(pos + matchsize)) |
|---|
| 1407 | { |
|---|
| 1408 | printf("CIS: %ld<->%ld [%ld|%ld] (matchsize %ld)\n", |
|---|
| 1409 | GetSeqCodeQuick(childptr + matchsize), |
|---|
| 1410 | GetSeqCodeQuick(pos + matchsize), |
|---|
| 1411 | childptr, |
|---|
| 1412 | pos, |
|---|
| 1413 | matchsize); |
|---|
| 1414 | } |
|---|
| 1415 | #endif |
|---|
| 1416 | /* fix downlink (child) pointer */ |
|---|
| 1417 | sfxnode->sn_Children[childidx] = (seqcode << RELOFFSETBITS) | (pp->pp_Sfx2EdgeOffset >> 2); |
|---|
| 1418 | break; |
|---|
| 1419 | } |
|---|
| 1420 | //printf("->[%ld]", relptr); |
|---|
| 1421 | /* okay, there is a child, get it */ |
|---|
| 1422 | prevnode = sfxnode; |
|---|
| 1423 | previdx = childidx; /* is needed for correcting the downlink ptr */ |
|---|
| 1424 | sfxnode = (struct SfxNode *) &pp->pp_SfxNodes[relptr]; |
|---|
| 1425 | /* compare its edge */ |
|---|
| 1426 | if(sfxnode->sn_EdgeLen > 1) |
|---|
| 1427 | { |
|---|
| 1428 | matchsize = CommonSequenceLength(pp, pos, sfxnode->sn_StartPos, sfxnode->sn_EdgeLen); |
|---|
| 1429 | } else { |
|---|
| 1430 | matchsize = 1; |
|---|
| 1431 | } |
|---|
| 1432 | if(matchsize < sfxnode->sn_EdgeLen) /* did the whole edge match? */ |
|---|
| 1433 | { |
|---|
| 1434 | struct SfxNode *upnode; |
|---|
| 1435 | //printf("Partmatch(%ld, %ld): %d\n", pos, len, matchsize); |
|---|
| 1436 | /* we only had a partial match, we need to split the node */ |
|---|
| 1437 | |
|---|
| 1438 | /* allocate a new node */ |
|---|
| 1439 | pp->pp_Sfx2EdgeOffset -= sizeof(struct SfxNode2Edges); |
|---|
| 1440 | upnode = (struct SfxNode *) &pp->pp_SfxNodes[pp->pp_Sfx2EdgeOffset]; |
|---|
| 1441 | |
|---|
| 1442 | /* fix linkage of middle node and set two edges */ |
|---|
| 1443 | upnode->sn_Parent = ((((ULONG) prevnode) - ((ULONG) pp->pp_SfxNodes)) >> 2) | |
|---|
| 1444 | (2 << RELOFFSETBITS); |
|---|
| 1445 | upnode->sn_StartPos = sfxnode->sn_StartPos; |
|---|
| 1446 | upnode->sn_EdgeLen = matchsize; |
|---|
| 1447 | upnode->sn_Children[0] = ((((ULONG) sfxnode) - ((ULONG) pp->pp_SfxNodes)) >> 2) | |
|---|
| 1448 | (GetSeqCodeQuick(upnode->sn_StartPos + matchsize) << RELOFFSETBITS); |
|---|
| 1449 | /* enter child leaf node */ |
|---|
| 1450 | upnode->sn_Children[1] = LEAFMASK | (pos - treepos); |
|---|
| 1451 | |
|---|
| 1452 | #if 0 // debug |
|---|
| 1453 | if(GetSeqCodeQuick(upnode->sn_StartPos + matchsize) == GetSeqCodeQuick(pos + matchsize)) |
|---|
| 1454 | { |
|---|
| 1455 | printf("SN: %ld<->%ld\n", |
|---|
| 1456 | GetSeqCodeQuick(upnode->sn_StartPos + matchsize), |
|---|
| 1457 | GetSeqCodeQuick(pos + matchsize)); |
|---|
| 1458 | } |
|---|
| 1459 | #endif |
|---|
| 1460 | |
|---|
| 1461 | /* fix sfxnode linkage and edge */ |
|---|
| 1462 | sfxnode->sn_Parent = (pp->pp_Sfx2EdgeOffset >> 2) | |
|---|
| 1463 | (sfxnode->sn_Parent & ~RELOFFSETMASK); |
|---|
| 1464 | sfxnode->sn_StartPos += matchsize; |
|---|
| 1465 | sfxnode->sn_EdgeLen -= matchsize; |
|---|
| 1466 | |
|---|
| 1467 | /* fix prevnode */ |
|---|
| 1468 | prevnode->sn_Children[childidx] = (pp->pp_Sfx2EdgeOffset >> 2) | |
|---|
| 1469 | (prevnode->sn_Children[childidx] & ~RELOFFSETMASK); |
|---|
| 1470 | |
|---|
| 1471 | if(pp->pp_SfxNEdgeOffset >= pp->pp_Sfx2EdgeOffset) |
|---|
| 1472 | { |
|---|
| 1473 | printf("Node buffer was too small!\n"); |
|---|
| 1474 | return(FALSE); |
|---|
| 1475 | } |
|---|
| 1476 | break; |
|---|
| 1477 | } else { |
|---|
| 1478 | /* the whole edge matched, just follow the path */ |
|---|
| 1479 | /*printf("Wholematch(%ld, %ld): %d [%d] -> %ld\n", |
|---|
| 1480 | pos, len, matchsize, seqcode, (((ULONG) sfxnode) - ((ULONG) pp->pp_SfxNodes)));*/ |
|---|
| 1481 | pos += matchsize; |
|---|
| 1482 | len -= matchsize; |
|---|
| 1483 | treepos += matchsize; |
|---|
| 1484 | } |
|---|
| 1485 | } else { |
|---|
| 1486 | childcnt = (sfxnode->sn_Parent >> RELOFFSETBITS); |
|---|
| 1487 | /*printf("New leaf[%d]-(pos %ld, len %ld): (%c)\n", |
|---|
| 1488 | childcnt, pos, len, pg->pg_DecompressTable[seqcode]);*/ |
|---|
| 1489 | if((childcnt == 2) && (sfxnode != pp->pp_SfxRoot)) |
|---|
| 1490 | { |
|---|
| 1491 | struct SfxNode *bignode; |
|---|
| 1492 | struct SfxNode *lastnode; |
|---|
| 1493 | |
|---|
| 1494 | /* we need to expand this node from 2 to 5 branches first */ |
|---|
| 1495 | /*printf("2T5 [%ld]->[%ld] [%ld]->[%ld]\n", |
|---|
| 1496 | ((ULONG) sfxnode) - ((ULONG) pp->pp_SfxNodes), |
|---|
| 1497 | pp->pp_SfxNEdgeOffset, |
|---|
| 1498 | pp->pp_Sfx2EdgeOffset, |
|---|
| 1499 | ((ULONG) sfxnode) - ((ULONG) pp->pp_SfxNodes));*/ |
|---|
| 1500 | |
|---|
| 1501 | /* allocate a new big node */ |
|---|
| 1502 | bignode = (struct SfxNode *) &pp->pp_SfxNodes[pp->pp_SfxNEdgeOffset]; |
|---|
| 1503 | |
|---|
| 1504 | /* copy node */ |
|---|
| 1505 | memcpy(bignode, sfxnode, sizeof(struct SfxNode2Edges)); |
|---|
| 1506 | |
|---|
| 1507 | /* fix prevnode -> bignode child pointer */ |
|---|
| 1508 | prevnode->sn_Children[previdx] = (pp->pp_SfxNEdgeOffset >> 2) | |
|---|
| 1509 | (prevnode->sn_Children[previdx] & ~RELOFFSETMASK); |
|---|
| 1510 | |
|---|
| 1511 | /* fix children -> bignode parent pointers */ |
|---|
| 1512 | childidx = 2; //(bignode->sn_Parent >> RELOFFSETBITS); |
|---|
| 1513 | while(childidx--) |
|---|
| 1514 | { |
|---|
| 1515 | childptr = bignode->sn_Children[childidx]; |
|---|
| 1516 | if(!(childptr >> LEAFBIT)) /* only check for real nodes */ |
|---|
| 1517 | { |
|---|
| 1518 | relptr = (childptr & RELOFFSETMASK) << 2; |
|---|
| 1519 | //printf("Fixup childidx=%d from %ld\n", childidx, relptr); |
|---|
| 1520 | /* fix the pointer to new location */ |
|---|
| 1521 | prevnode = (struct SfxNode *) &pp->pp_SfxNodes[relptr]; |
|---|
| 1522 | prevnode->sn_Parent = (prevnode->sn_Parent & ~RELOFFSETMASK) | |
|---|
| 1523 | (((ULONG) bignode) - ((ULONG) pp->pp_SfxNodes) >> 2); |
|---|
| 1524 | } |
|---|
| 1525 | } |
|---|
| 1526 | |
|---|
| 1527 | /* avoid copy, if both are the same */ |
|---|
| 1528 | if(pp->pp_Sfx2EdgeOffset != |
|---|
| 1529 | ((ULONG) sfxnode) - ((ULONG) pp->pp_SfxNodes)) |
|---|
| 1530 | { |
|---|
| 1531 | /* regain memory by copying last two edge node into hole */ |
|---|
| 1532 | lastnode = (struct SfxNode *) &pp->pp_SfxNodes[pp->pp_Sfx2EdgeOffset]; |
|---|
| 1533 | |
|---|
| 1534 | /* copy node */ |
|---|
| 1535 | memcpy(sfxnode, lastnode, sizeof(struct SfxNode2Edges)); |
|---|
| 1536 | |
|---|
| 1537 | /* find lastnode->parent->lastnode downward pointer */ |
|---|
| 1538 | prevnode = (struct SfxNode *) &pp->pp_SfxNodes[(lastnode->sn_Parent & RELOFFSETMASK) << 2]; |
|---|
| 1539 | childidx = (prevnode->sn_Parent >> RELOFFSETBITS); |
|---|
| 1540 | while(childidx--) |
|---|
| 1541 | { |
|---|
| 1542 | childptr = prevnode->sn_Children[childidx]; |
|---|
| 1543 | if(!(childptr >> LEAFBIT)) /* only check for real nodes */ |
|---|
| 1544 | { |
|---|
| 1545 | if((childptr & RELOFFSETMASK) == (pp->pp_Sfx2EdgeOffset >> 2)) |
|---|
| 1546 | { |
|---|
| 1547 | /* fix the pointer to new location */ |
|---|
| 1548 | /*printf("Fixdown childidx=%d from %ld\n", |
|---|
| 1549 | childidx, lastnode->sn_Parent & RELOFFSETMASK);*/ |
|---|
| 1550 | prevnode->sn_Children[childidx] = (childptr & ~RELOFFSETMASK) | |
|---|
| 1551 | (((ULONG) sfxnode) - ((ULONG) pp->pp_SfxNodes) >> 2); |
|---|
| 1552 | break; |
|---|
| 1553 | } |
|---|
| 1554 | } |
|---|
| 1555 | } |
|---|
| 1556 | |
|---|
| 1557 | /* fix children->parent upward pointers */ |
|---|
| 1558 | childidx = 2;//(lastnode->sn_Parent >> RELOFFSETBITS); |
|---|
| 1559 | while(childidx--) |
|---|
| 1560 | { |
|---|
| 1561 | childptr = lastnode->sn_Children[childidx]; |
|---|
| 1562 | if(!(childptr >> LEAFBIT)) /* only check for real nodes */ |
|---|
| 1563 | { |
|---|
| 1564 | relptr = (childptr & RELOFFSETMASK) << 2; |
|---|
| 1565 | //printf("Fixup childidx=%d from %ld\n", childidx, relptr); |
|---|
| 1566 | /* fix the pointer to new location */ |
|---|
| 1567 | prevnode = (struct SfxNode *) &pp->pp_SfxNodes[relptr]; |
|---|
| 1568 | prevnode->sn_Parent = (prevnode->sn_Parent & ~RELOFFSETMASK) | |
|---|
| 1569 | (((ULONG) sfxnode) - ((ULONG) pp->pp_SfxNodes) >> 2); |
|---|
| 1570 | } |
|---|
| 1571 | } |
|---|
| 1572 | } |
|---|
| 1573 | /* we're done with the fixing, now correct memory usage */ |
|---|
| 1574 | sfxnode = bignode; |
|---|
| 1575 | pp->pp_Sfx2EdgeOffset += sizeof(struct SfxNode2Edges); |
|---|
| 1576 | pp->pp_SfxNEdgeOffset += sizeof(struct SfxNodeNEdges); |
|---|
| 1577 | if(treepos == MAXQPREFIXLOOKUPSIZE) |
|---|
| 1578 | { |
|---|
| 1579 | pp->pp_QuickPrefixLookup[prefix] = ((ULONG) sfxnode) - ((ULONG) pp->pp_SfxNodes); |
|---|
| 1580 | } |
|---|
| 1581 | } |
|---|
| 1582 | |
|---|
| 1583 | /* enter new child */ |
|---|
| 1584 | sfxnode->sn_Children[childcnt] = LEAFMASK | (pos - treepos); |
|---|
| 1585 | //printf("New child: %ld (%ld)\n", pos-treepos, treepos); |
|---|
| 1586 | /* increase edgecount */ |
|---|
| 1587 | sfxnode->sn_Parent += (1UL << RELOFFSETBITS); |
|---|
| 1588 | break; |
|---|
| 1589 | } |
|---|
| 1590 | } |
|---|
| 1591 | //printf("Done (2E: %ld NE: %ld)\n", pp->pp_Sfx2EdgeOffset, pp->pp_SfxNEdgeOffset); |
|---|
| 1592 | return(TRUE); |
|---|
| 1593 | } |
|---|
| 1594 | /* \\\ */ |
|---|
| 1595 | |
|---|
| 1596 | /* /// "CalculateTreeStats()" */ |
|---|
| 1597 | BOOL CalculateTreeStats(struct PTPanPartition *pp) |
|---|
| 1598 | { |
|---|
| 1599 | struct PTPanGlobal *pg = pp->pp_PTPanGlobal; |
|---|
| 1600 | ULONG cnt; |
|---|
| 1601 | ULONG edgetotal; |
|---|
| 1602 | ULONG bitsused; |
|---|
| 1603 | ULONG threshold; |
|---|
| 1604 | |
|---|
| 1605 | BenchTimePassed(pg); |
|---|
| 1606 | /* now we need some statistical data. this includes: |
|---|
| 1607 | a) the maximum depth of the tree |
|---|
| 1608 | b) the number of nodes and leaves on each level (i.e. population) |
|---|
| 1609 | out of b) we will generate the TREE PRUNING DEPTH |
|---|
| 1610 | c) the appearance of all branch combinations (branch combo mask) up to the pruning depth |
|---|
| 1611 | out of c) we will generate a HUFFMAN CODE for the branch ptr mask |
|---|
| 1612 | d) the occurrences of the short edges codes and the number of long edges |
|---|
| 1613 | out of d) we will generate a HUFFMAN CODE for the edge codes |
|---|
| 1614 | moreover, it replaces the short edges by the indexes to the huffman code (sn_StartPos |
|---|
| 1615 | is recycled, gets bit 30 set). |
|---|
| 1616 | e) generating a HUFFMAN CODE fo the long edge lengths |
|---|
| 1617 | f) generating a lookup DICTIONARY for the long edge codes |
|---|
| 1618 | f) will allow storing shorter label pointers and only a fraction of memory for lookup |
|---|
| 1619 | long edges will be replaced by the position in the dictionary (sn_StartPos is recycled, |
|---|
| 1620 | gets bit 31 set). |
|---|
| 1621 | */ |
|---|
| 1622 | //memset(pp->pp_EdgeBranchTable, 0, ALPHASIZE * sizeof(ULONG)); |
|---|
| 1623 | |
|---|
| 1624 | #if 0 /* debug */ |
|---|
| 1625 | pp->pp_VerifyArray = (UBYTE *) calloc(pg->pg_TotalRawSize+1, 1); |
|---|
| 1626 | |
|---|
| 1627 | /* verify the correctness of the tree */ |
|---|
| 1628 | printf("Verifying correctness of the tree...\n"); |
|---|
| 1629 | GetTreeStatsVerifyRec(pp, 0, 0, 0); |
|---|
| 1630 | |
|---|
| 1631 | for(cnt = 0; cnt < ((pg->pg_TotalRawSize + 7) >> 3); cnt++) |
|---|
| 1632 | { |
|---|
| 1633 | if(pp->pp_VerifyArray[cnt] != 0xff) |
|---|
| 1634 | { |
|---|
| 1635 | printf("Hole in Position %ld to %ld (%02x)\n", |
|---|
| 1636 | cnt << 3, (cnt << 3)+7, pp->pp_VerifyArray[cnt]); |
|---|
| 1637 | } else { |
|---|
| 1638 | /*printf("Good in Position %ld to %ld (%02x)\n", |
|---|
| 1639 | cnt << 3, (cnt << 3)+7, pp->pp_VerifyArray[cnt]);*/ |
|---|
| 1640 | } |
|---|
| 1641 | } |
|---|
| 1642 | free(pp->pp_VerifyArray); |
|---|
| 1643 | #endif |
|---|
| 1644 | |
|---|
| 1645 | /* calculate maximum depth of the tree */ |
|---|
| 1646 | pp->pp_MaxTreeDepth = 0; |
|---|
| 1647 | GetTreeStatsTreeDepthRec(pp, 0, 0); |
|---|
| 1648 | pp->pp_MaxTreeDepth++; /* increase by one due to leaf level */ |
|---|
| 1649 | |
|---|
| 1650 | printf("Max tree depth: %ld\n", pp->pp_MaxTreeDepth); |
|---|
| 1651 | |
|---|
| 1652 | /* generate statistical data, part 2: level population */ |
|---|
| 1653 | pp->pp_LevelStats = (struct TreeLevelStats *) calloc(pp->pp_MaxTreeDepth, |
|---|
| 1654 | sizeof(struct TreeLevelStats)); |
|---|
| 1655 | |
|---|
| 1656 | GetTreeStatsLevelRec(pp, 0, 0); |
|---|
| 1657 | |
|---|
| 1658 | /* calculate missing total values and pruning position */ |
|---|
| 1659 | if(pg->pg_PruneLength) |
|---|
| 1660 | { |
|---|
| 1661 | pp->pp_TreePruneDepth = pg->pg_PruneLength; |
|---|
| 1662 | pp->pp_TreePruneLength = pg->pg_PruneLength; |
|---|
| 1663 | } else { |
|---|
| 1664 | pp->pp_TreePruneDepth = 20; /* FIXME */ |
|---|
| 1665 | pp->pp_TreePruneLength = 20; /* FIXME */ |
|---|
| 1666 | } |
|---|
| 1667 | cnt = pp->pp_MaxTreeDepth; |
|---|
| 1668 | while(cnt--) |
|---|
| 1669 | { |
|---|
| 1670 | pp->pp_LevelStats[cnt].tls_TotalLeafCount = pp->pp_LevelStats[cnt].tls_LeafCount; |
|---|
| 1671 | pp->pp_LevelStats[cnt].tls_TotalNodeCount = pp->pp_LevelStats[cnt].tls_NodeCount; |
|---|
| 1672 | if(cnt < pp->pp_MaxTreeDepth-1) |
|---|
| 1673 | { |
|---|
| 1674 | pp->pp_LevelStats[cnt].tls_TotalLeafCount += pp->pp_LevelStats[cnt+1].tls_TotalLeafCount; |
|---|
| 1675 | pp->pp_LevelStats[cnt].tls_TotalNodeCount += pp->pp_LevelStats[cnt+1].tls_TotalNodeCount; |
|---|
| 1676 | |
|---|
| 1677 | /* calculate tree pruning depth. currently, will prune at 66% of the leaves |
|---|
| 1678 | covered */ |
|---|
| 1679 | if(!pp->pp_TreePruneDepth) |
|---|
| 1680 | { |
|---|
| 1681 | if(pp->pp_LevelStats[cnt].tls_TotalLeafCount > pp->pp_Size / 3) |
|---|
| 1682 | { |
|---|
| 1683 | pp->pp_TreePruneDepth = cnt; |
|---|
| 1684 | } |
|---|
| 1685 | } |
|---|
| 1686 | } |
|---|
| 1687 | } |
|---|
| 1688 | |
|---|
| 1689 | /* debug output */ |
|---|
| 1690 | #if 0 |
|---|
| 1691 | for(cnt = 0; cnt < pp->pp_MaxTreeDepth; cnt++) |
|---|
| 1692 | { |
|---|
| 1693 | printf("Level %3ld: Nodes=%6ld, Leaves=%6ld, TotalNodes=%6ld, TotalLeaves=%6ld\n", |
|---|
| 1694 | cnt, |
|---|
| 1695 | pp->pp_LevelStats[cnt].tls_NodeCount, |
|---|
| 1696 | pp->pp_LevelStats[cnt].tls_LeafCount, |
|---|
| 1697 | pp->pp_LevelStats[cnt].tls_TotalNodeCount, |
|---|
| 1698 | pp->pp_LevelStats[cnt].tls_TotalLeafCount); |
|---|
| 1699 | } |
|---|
| 1700 | #endif |
|---|
| 1701 | |
|---|
| 1702 | printf("Tree pruning at depth %ld, length %ld.\n", |
|---|
| 1703 | pp->pp_TreePruneDepth, |
|---|
| 1704 | pp->pp_TreePruneLength); |
|---|
| 1705 | |
|---|
| 1706 | freeset(pp->pp_LevelStats, NULL); |
|---|
| 1707 | |
|---|
| 1708 | /* allocate branch histogram */ |
|---|
| 1709 | pp->pp_BranchCode = (struct HuffCode *) calloc(1UL << pg->pg_AlphaSize, |
|---|
| 1710 | sizeof(struct HuffCode)); |
|---|
| 1711 | if(!pp->pp_BranchCode) |
|---|
| 1712 | { |
|---|
| 1713 | printf("Out of memory for Branch Histogram!\n"); |
|---|
| 1714 | return(FALSE); |
|---|
| 1715 | } |
|---|
| 1716 | GetTreeStatsBranchHistoRec(pp, 0, 0, 0); |
|---|
| 1717 | |
|---|
| 1718 | /* generate statistical data, part 3: edge lengths and combinations */ |
|---|
| 1719 | |
|---|
| 1720 | /* allocate short edge code histogram (for edges between of 2-7 base pairs) */ |
|---|
| 1721 | bitsused = pg->pg_BitsUseTable[SHORTEDGEMAX]+1; |
|---|
| 1722 | pp->pp_ShortEdgeCode = (struct HuffCode *) calloc((1UL << bitsused), |
|---|
| 1723 | sizeof(struct HuffCode)); |
|---|
| 1724 | if(!pp->pp_ShortEdgeCode) |
|---|
| 1725 | { |
|---|
| 1726 | printf("Out of memory for Short Edge Histogram!\n"); |
|---|
| 1727 | return(FALSE); |
|---|
| 1728 | } |
|---|
| 1729 | |
|---|
| 1730 | /* get short edge stats */ |
|---|
| 1731 | pp->pp_EdgeCount = 0; |
|---|
| 1732 | pp->pp_ShortEdgeCode[1].hc_Weight++; |
|---|
| 1733 | GetTreeStatsShortEdgesRec(pp, 0, 0, 0); |
|---|
| 1734 | |
|---|
| 1735 | /* define threshold for small edge optimization */ |
|---|
| 1736 | threshold = pp->pp_EdgeCount / 10000; |
|---|
| 1737 | |
|---|
| 1738 | /* calculate the number of longedges required */ |
|---|
| 1739 | edgetotal = pp->pp_EdgeCount; |
|---|
| 1740 | for(cnt = 1; cnt < (1UL << bitsused); cnt++) |
|---|
| 1741 | { |
|---|
| 1742 | if(pp->pp_ShortEdgeCode[cnt].hc_Weight > threshold) /* code will remain in the table */ |
|---|
| 1743 | { |
|---|
| 1744 | edgetotal -= pp->pp_ShortEdgeCode[cnt].hc_Weight; |
|---|
| 1745 | } |
|---|
| 1746 | } |
|---|
| 1747 | /* code 0 will be used for long edges */ |
|---|
| 1748 | pp->pp_ShortEdgeCode[0].hc_Weight = edgetotal; |
|---|
| 1749 | |
|---|
| 1750 | printf("Considering %ld (%ld+%ld) edges for the final tree.\n", |
|---|
| 1751 | pp->pp_EdgeCount, pp->pp_EdgeCount - edgetotal, edgetotal); |
|---|
| 1752 | |
|---|
| 1753 | /* generate huffman code for short edge, but only for codes that provide at least |
|---|
| 1754 | 1/10000th of the edges (other stuff goes into the long edges) */ |
|---|
| 1755 | printf("Generating huffman code for short edges\n"); |
|---|
| 1756 | BuildHuffmanCode(pp->pp_ShortEdgeCode, (1UL << bitsused), threshold); |
|---|
| 1757 | |
|---|
| 1758 | #if 0 |
|---|
| 1759 | /* debug */ |
|---|
| 1760 | for(cnt = 0; cnt < (1UL << bitsused); cnt++) |
|---|
| 1761 | { |
|---|
| 1762 | WORD bitcnt; |
|---|
| 1763 | if(pp->pp_ShortEdgeCode[cnt].hc_CodeLength) |
|---|
| 1764 | { |
|---|
| 1765 | printf("%6ld: %7ld -> %2d ", cnt, pp->pp_ShortEdgeCode[cnt].hc_Weight, |
|---|
| 1766 | pp->pp_ShortEdgeCode[cnt].hc_CodeLength); |
|---|
| 1767 | for(bitcnt = pp->pp_ShortEdgeCode[cnt].hc_CodeLength - 1; bitcnt >= 0; bitcnt--) |
|---|
| 1768 | { |
|---|
| 1769 | printf("%s", pp->pp_ShortEdgeCode[cnt].hc_Codec & (1UL << bitcnt) ? "1" : "0"); |
|---|
| 1770 | } |
|---|
| 1771 | printf("\n"); |
|---|
| 1772 | } |
|---|
| 1773 | } |
|---|
| 1774 | #endif |
|---|
| 1775 | |
|---|
| 1776 | /* now generate dictionary for the long edges. This is done by generating an array of |
|---|
| 1777 | all long edges and then sorting it according to the length. Then, the dictionary |
|---|
| 1778 | string is built up. It replaces the starting pos with the pos in the dictionary |
|---|
| 1779 | and sets bit 31 to indicate this. |
|---|
| 1780 | */ |
|---|
| 1781 | //GetTreeStatsDebugRec(pp, 0, 0); |
|---|
| 1782 | pg->pg_Bench.ts_TreeStats += BenchTimePassed(pg); |
|---|
| 1783 | |
|---|
| 1784 | BuildLongEdgeDictionary(pp); |
|---|
| 1785 | |
|---|
| 1786 | BenchTimePassed(pg); |
|---|
| 1787 | |
|---|
| 1788 | /* generate huffman code for edge bit mask, saves 1-2 bits per node */ |
|---|
| 1789 | printf("Generating huffman code for branch mask\n"); |
|---|
| 1790 | BuildHuffmanCode(pp->pp_BranchCode, (1UL << pg->pg_AlphaSize), 0); |
|---|
| 1791 | |
|---|
| 1792 | /* debug output */ |
|---|
| 1793 | #if 0 |
|---|
| 1794 | for(cnt = 0; cnt < (1UL << pg->pg_AlphaSize); cnt++) |
|---|
| 1795 | { |
|---|
| 1796 | WORD bitcnt; |
|---|
| 1797 | if(pp->pp_BranchCode[cnt].hc_Weight) |
|---|
| 1798 | { |
|---|
| 1799 | printf("%2ld: %6ld -> %2d ", cnt, pp->pp_BranchCode[cnt].hc_Weight, |
|---|
| 1800 | pp->pp_BranchCode[cnt].hc_CodeLength); |
|---|
| 1801 | for(bitcnt = 0; bitcnt < pg->pg_AlphaSize; bitcnt++) |
|---|
| 1802 | { |
|---|
| 1803 | printf("%c", (cnt & (1UL << bitcnt)) ? pg->pg_DecompressTable[bitcnt] : ' '); |
|---|
| 1804 | } |
|---|
| 1805 | printf(" "); |
|---|
| 1806 | for(bitcnt = pp->pp_BranchCode[cnt].hc_CodeLength - 1; bitcnt >= 0; bitcnt--) |
|---|
| 1807 | { |
|---|
| 1808 | printf("%s", pp->pp_BranchCode[cnt].hc_Codec & (1UL << bitcnt) ? "1" : "0"); |
|---|
| 1809 | } |
|---|
| 1810 | printf("\n"); |
|---|
| 1811 | } |
|---|
| 1812 | } |
|---|
| 1813 | #endif |
|---|
| 1814 | |
|---|
| 1815 | pg->pg_Bench.ts_TreeStats += BenchTimePassed(pg); |
|---|
| 1816 | //GetTreeStatsDebugRec(pp, 0, 0); |
|---|
| 1817 | |
|---|
| 1818 | return(TRUE); |
|---|
| 1819 | } |
|---|
| 1820 | /* \\\ */ |
|---|
| 1821 | |
|---|
| 1822 | /* /// "GetTreeStatsDebugRec()" */ |
|---|
| 1823 | void GetTreeStatsDebugRec(struct PTPanPartition *pp, ULONG pos, ULONG level) |
|---|
| 1824 | { |
|---|
| 1825 | struct PTPanGlobal *pg = pp->pp_PTPanGlobal; |
|---|
| 1826 | struct SfxNode *sfxnode; |
|---|
| 1827 | ULONG childptr; |
|---|
| 1828 | UWORD childidx; |
|---|
| 1829 | ULONG cnt; |
|---|
| 1830 | ULONG *seqptr = pg->pg_MergedRawData; |
|---|
| 1831 | |
|---|
| 1832 | level++; |
|---|
| 1833 | sfxnode = (struct SfxNode *) &pp->pp_SfxNodes[pos]; |
|---|
| 1834 | childidx = sfxnode->sn_Parent >> RELOFFSETBITS; |
|---|
| 1835 | printf("Pos: [%08lx], Level %ld, Edge [", |
|---|
| 1836 | pos, level); |
|---|
| 1837 | if(sfxnode->sn_StartPos & (3UL << 30)) |
|---|
| 1838 | { |
|---|
| 1839 | printf("%08lx", sfxnode->sn_StartPos); |
|---|
| 1840 | } else { |
|---|
| 1841 | for(cnt = 0; cnt < sfxnode->sn_EdgeLen; cnt++) |
|---|
| 1842 | { |
|---|
| 1843 | printf("%c", pg->pg_DecompressTable[GetSeqCodeQuick(cnt + sfxnode->sn_StartPos)]); |
|---|
| 1844 | } |
|---|
| 1845 | } |
|---|
| 1846 | printf("] EdgeStart %ld, EdgeLen %d children %d\nChildren: ", |
|---|
| 1847 | sfxnode->sn_StartPos, sfxnode->sn_EdgeLen, childidx); |
|---|
| 1848 | while(childidx--) |
|---|
| 1849 | { |
|---|
| 1850 | childptr = sfxnode->sn_Children[childidx]; |
|---|
| 1851 | printf("[%08lx] ", childptr); |
|---|
| 1852 | } |
|---|
| 1853 | printf("\n"); |
|---|
| 1854 | |
|---|
| 1855 | if(level < 3) |
|---|
| 1856 | { |
|---|
| 1857 | /* traverse children */ |
|---|
| 1858 | childidx = sfxnode->sn_Parent >> RELOFFSETBITS; |
|---|
| 1859 | while(childidx--) |
|---|
| 1860 | { |
|---|
| 1861 | childptr = sfxnode->sn_Children[childidx]; |
|---|
| 1862 | if(!(childptr >> LEAFBIT)) |
|---|
| 1863 | { |
|---|
| 1864 | /* this is a normal node pointer, recurse */ |
|---|
| 1865 | GetTreeStatsDebugRec(pp, (childptr & RELOFFSETMASK) << 2, level); |
|---|
| 1866 | } |
|---|
| 1867 | } |
|---|
| 1868 | } |
|---|
| 1869 | printf("End Level %ld\n", level); |
|---|
| 1870 | } |
|---|
| 1871 | /* \\\ */ |
|---|
| 1872 | |
|---|
| 1873 | /* /// "GetTreeStatsTreeDepthRec()" */ |
|---|
| 1874 | void GetTreeStatsTreeDepthRec(struct PTPanPartition *pp, ULONG pos, ULONG level) |
|---|
| 1875 | { |
|---|
| 1876 | struct SfxNode *sfxnode; |
|---|
| 1877 | ULONG childptr; |
|---|
| 1878 | UWORD childidx; |
|---|
| 1879 | |
|---|
| 1880 | /* calculate maximum tree depth */ |
|---|
| 1881 | if(++level > pp->pp_MaxTreeDepth) |
|---|
| 1882 | { |
|---|
| 1883 | pp->pp_MaxTreeDepth = level; |
|---|
| 1884 | } |
|---|
| 1885 | |
|---|
| 1886 | /* traverse children */ |
|---|
| 1887 | sfxnode = (struct SfxNode *) &pp->pp_SfxNodes[pos]; |
|---|
| 1888 | childidx = sfxnode->sn_Parent >> RELOFFSETBITS; |
|---|
| 1889 | while(childidx--) |
|---|
| 1890 | { |
|---|
| 1891 | childptr = sfxnode->sn_Children[childidx]; |
|---|
| 1892 | if(!(childptr >> LEAFBIT)) |
|---|
| 1893 | { |
|---|
| 1894 | /* this is a normal node pointer, recurse */ |
|---|
| 1895 | GetTreeStatsTreeDepthRec(pp, (childptr & RELOFFSETMASK) << 2, level); |
|---|
| 1896 | } |
|---|
| 1897 | } |
|---|
| 1898 | } |
|---|
| 1899 | /* \\\ */ |
|---|
| 1900 | |
|---|
| 1901 | /* /// "GetTreeStatsLevelRec()" */ |
|---|
| 1902 | void GetTreeStatsLevelRec(struct PTPanPartition *pp, ULONG pos, ULONG level) |
|---|
| 1903 | { |
|---|
| 1904 | struct SfxNode *sfxnode; |
|---|
| 1905 | ULONG childptr; |
|---|
| 1906 | UWORD childidx; |
|---|
| 1907 | |
|---|
| 1908 | /* traverse children */ |
|---|
| 1909 | sfxnode = (struct SfxNode *) &pp->pp_SfxNodes[pos]; |
|---|
| 1910 | //sfxnode->sn_Parent |= RELOFFSETMASK; |
|---|
| 1911 | childidx = sfxnode->sn_Parent >> RELOFFSETBITS; |
|---|
| 1912 | while(childidx--) |
|---|
| 1913 | { |
|---|
| 1914 | childptr = sfxnode->sn_Children[childidx]; |
|---|
| 1915 | if(childptr >> LEAFBIT) |
|---|
| 1916 | { |
|---|
| 1917 | /* update leaf counter */ |
|---|
| 1918 | pp->pp_LevelStats[level+1].tls_LeafCount++; |
|---|
| 1919 | } else { |
|---|
| 1920 | /* this is a normal node pointer, recurse */ |
|---|
| 1921 | GetTreeStatsLevelRec(pp, (childptr & RELOFFSETMASK) << 2, level+1); |
|---|
| 1922 | /* update node counter */ |
|---|
| 1923 | pp->pp_LevelStats[level].tls_NodeCount++; |
|---|
| 1924 | } |
|---|
| 1925 | } |
|---|
| 1926 | } |
|---|
| 1927 | /* \\\ */ |
|---|
| 1928 | |
|---|
| 1929 | /* /// "GetTreeStatsShortEdgesRec()" */ |
|---|
| 1930 | void GetTreeStatsShortEdgesRec(struct PTPanPartition *pp, |
|---|
| 1931 | ULONG pos, ULONG level, ULONG elen) |
|---|
| 1932 | { |
|---|
| 1933 | struct PTPanGlobal *pg = pp->pp_PTPanGlobal; |
|---|
| 1934 | ULONG edgelen; |
|---|
| 1935 | ULONG epos; |
|---|
| 1936 | ULONG prefix; |
|---|
| 1937 | ULONG *seqptr = pg->pg_MergedRawData; |
|---|
| 1938 | struct SfxNode *sfxnode; |
|---|
| 1939 | ULONG childptr; |
|---|
| 1940 | UWORD childidx; |
|---|
| 1941 | |
|---|
| 1942 | sfxnode = (struct SfxNode *) &pp->pp_SfxNodes[pos]; |
|---|
| 1943 | /* update short edge code histogram */ |
|---|
| 1944 | epos = sfxnode->sn_StartPos + 1; |
|---|
| 1945 | edgelen = sfxnode->sn_EdgeLen; |
|---|
| 1946 | /* do we need to truncate the edge? */ |
|---|
| 1947 | if(elen + edgelen > pp->pp_TreePruneLength) |
|---|
| 1948 | { |
|---|
| 1949 | edgelen = pp->pp_TreePruneLength - elen; |
|---|
| 1950 | sfxnode->sn_EdgeLen = edgelen; |
|---|
| 1951 | } |
|---|
| 1952 | elen += edgelen; |
|---|
| 1953 | if((edgelen > 1) && (edgelen <= SHORTEDGEMAX)) |
|---|
| 1954 | { |
|---|
| 1955 | prefix = 0; |
|---|
| 1956 | while(--edgelen) |
|---|
| 1957 | { |
|---|
| 1958 | prefix *= pg->pg_AlphaSize; |
|---|
| 1959 | prefix += GetSeqCodeQuick(epos); |
|---|
| 1960 | epos++; |
|---|
| 1961 | } |
|---|
| 1962 | /* add stop bit */ |
|---|
| 1963 | prefix |= 1UL << pg->pg_BitsUseTable[sfxnode->sn_EdgeLen - 1]; |
|---|
| 1964 | pp->pp_ShortEdgeCode[prefix].hc_Weight++; |
|---|
| 1965 | } |
|---|
| 1966 | else if(edgelen == 1) |
|---|
| 1967 | { |
|---|
| 1968 | pp->pp_ShortEdgeCode[1].hc_Weight++; /* also count length==1 edges */ |
|---|
| 1969 | } |
|---|
| 1970 | |
|---|
| 1971 | /* increase edgecount */ |
|---|
| 1972 | pp->pp_EdgeCount++; |
|---|
| 1973 | |
|---|
| 1974 | /* check for maximum depth reached */ |
|---|
| 1975 | if((++level >= pp->pp_TreePruneDepth) || (elen >= pp->pp_TreePruneLength)) |
|---|
| 1976 | { |
|---|
| 1977 | //printf("SE Level %ld, Epos %ld\n", level, elen); |
|---|
| 1978 | return; |
|---|
| 1979 | } |
|---|
| 1980 | |
|---|
| 1981 | /* traverse children */ |
|---|
| 1982 | childidx = sfxnode->sn_Parent >> RELOFFSETBITS; |
|---|
| 1983 | while(childidx--) |
|---|
| 1984 | { |
|---|
| 1985 | childptr = sfxnode->sn_Children[childidx]; |
|---|
| 1986 | if(!(childptr >> LEAFBIT)) |
|---|
| 1987 | { |
|---|
| 1988 | /* this is a normal node pointer, recurse */ |
|---|
| 1989 | GetTreeStatsShortEdgesRec(pp, (childptr & RELOFFSETMASK) << 2, level, elen); |
|---|
| 1990 | } else { |
|---|
| 1991 | /* implicit singular edge */ |
|---|
| 1992 | childptr &= ~LEAFMASK; |
|---|
| 1993 | epos = childptr + 1; |
|---|
| 1994 | edgelen = pp->pp_TreePruneLength - elen; |
|---|
| 1995 | if((edgelen > 1) && (edgelen <= SHORTEDGEMAX)) |
|---|
| 1996 | { |
|---|
| 1997 | prefix = 0; |
|---|
| 1998 | while(--edgelen) |
|---|
| 1999 | { |
|---|
| 2000 | prefix *= pg->pg_AlphaSize; |
|---|
| 2001 | prefix += GetSeqCodeQuick(epos); |
|---|
| 2002 | epos++; |
|---|
| 2003 | } |
|---|
| 2004 | /* add stop bit */ |
|---|
| 2005 | prefix |= 1UL << pg->pg_BitsUseTable[pp->pp_TreePruneLength - elen - 1]; |
|---|
| 2006 | pp->pp_ShortEdgeCode[prefix].hc_Weight++; |
|---|
| 2007 | } |
|---|
| 2008 | else if(edgelen == 1) |
|---|
| 2009 | { |
|---|
| 2010 | pp->pp_ShortEdgeCode[1].hc_Weight++; /* also count length==1 edges */ |
|---|
| 2011 | } |
|---|
| 2012 | /* increase edgecount */ |
|---|
| 2013 | pp->pp_EdgeCount++; |
|---|
| 2014 | } |
|---|
| 2015 | } |
|---|
| 2016 | } |
|---|
| 2017 | /* \\\ */ |
|---|
| 2018 | |
|---|
| 2019 | /* /// "GetTreeStatsLongEdgesRec()" */ |
|---|
| 2020 | void GetTreeStatsLongEdgesRec(struct PTPanPartition *pp, |
|---|
| 2021 | ULONG pos, ULONG level, ULONG elen) |
|---|
| 2022 | { |
|---|
| 2023 | struct PTPanGlobal *pg = pp->pp_PTPanGlobal; |
|---|
| 2024 | ULONG edgelen; |
|---|
| 2025 | ULONG epos; |
|---|
| 2026 | ULONG prefix; |
|---|
| 2027 | ULONG *seqptr = pg->pg_MergedRawData; |
|---|
| 2028 | struct SfxNode *sfxnode; |
|---|
| 2029 | ULONG childptr; |
|---|
| 2030 | UWORD childidx; |
|---|
| 2031 | UWORD seqcode; |
|---|
| 2032 | |
|---|
| 2033 | sfxnode = (struct SfxNode *) &pp->pp_SfxNodes[pos]; |
|---|
| 2034 | /* update short edge code histogram */ |
|---|
| 2035 | epos = sfxnode->sn_StartPos + 1; |
|---|
| 2036 | edgelen = sfxnode->sn_EdgeLen; |
|---|
| 2037 | /* do we need to truncate the edge? */ |
|---|
| 2038 | #if 0 /* debug */ |
|---|
| 2039 | if(elen + edgelen > pp->pp_TreePruneLength) |
|---|
| 2040 | { |
|---|
| 2041 | edgelen = pp->pp_TreePruneLength - elen; |
|---|
| 2042 | printf("DARF NICHT!"); |
|---|
| 2043 | sfxnode->sn_EdgeLen = edgelen; |
|---|
| 2044 | } |
|---|
| 2045 | #endif |
|---|
| 2046 | elen += edgelen; |
|---|
| 2047 | if(edgelen > 1) |
|---|
| 2048 | { |
|---|
| 2049 | if(edgelen <= SHORTEDGEMAX) |
|---|
| 2050 | { |
|---|
| 2051 | prefix = 0; |
|---|
| 2052 | while(--edgelen) |
|---|
| 2053 | { |
|---|
| 2054 | prefix *= pg->pg_AlphaSize; |
|---|
| 2055 | prefix += GetSeqCodeQuick(epos); |
|---|
| 2056 | epos++; |
|---|
| 2057 | } |
|---|
| 2058 | |
|---|
| 2059 | /* add stop bit */ |
|---|
| 2060 | prefix |= 1UL << pg->pg_BitsUseTable[sfxnode->sn_EdgeLen - 1]; |
|---|
| 2061 | /* check, if this edge doesn't have a huffman code */ |
|---|
| 2062 | if(!pp->pp_ShortEdgeCode[prefix].hc_CodeLength) |
|---|
| 2063 | { |
|---|
| 2064 | pp->pp_LongEdges[pp->pp_LongEdgeCount++] = sfxnode; |
|---|
| 2065 | } else { |
|---|
| 2066 | /* replace sn_StartPos by huffman code index */ |
|---|
| 2067 | sfxnode->sn_StartPos = prefix | (1UL << 30); |
|---|
| 2068 | } |
|---|
| 2069 | } else { |
|---|
| 2070 | /* this is a long edge anyway and has no huffman code */ |
|---|
| 2071 | pp->pp_LongEdges[pp->pp_LongEdgeCount++] = sfxnode; |
|---|
| 2072 | } |
|---|
| 2073 | } |
|---|
| 2074 | else if(edgelen == 1) |
|---|
| 2075 | { |
|---|
| 2076 | /* replace sn_StartPos by huffman code index */ |
|---|
| 2077 | sfxnode->sn_StartPos = 1 | (1UL << 30); |
|---|
| 2078 | } |
|---|
| 2079 | |
|---|
| 2080 | /* check for maximum depth reached */ |
|---|
| 2081 | if((++level >= pp->pp_TreePruneDepth) || (elen >= pp->pp_TreePruneLength)) |
|---|
| 2082 | { |
|---|
| 2083 | //printf("LE Level %ld, Epos %ld\n", level, elen); |
|---|
| 2084 | return; |
|---|
| 2085 | } |
|---|
| 2086 | |
|---|
| 2087 | /* traverse children */ |
|---|
| 2088 | childidx = sfxnode->sn_Parent >> RELOFFSETBITS; |
|---|
| 2089 | while(childidx--) |
|---|
| 2090 | { |
|---|
| 2091 | childptr = sfxnode->sn_Children[childidx]; |
|---|
| 2092 | #if 1 // extra leaves switch (see 4.3.4 of diploma thesis) |
|---|
| 2093 | if(childptr >> LEAFBIT) |
|---|
| 2094 | { |
|---|
| 2095 | /* implicit singular edge */ |
|---|
| 2096 | struct SfxNode *tinynode; |
|---|
| 2097 | /* allocate a new branching node */ |
|---|
| 2098 | pp->pp_Sfx2EdgeOffset -= sizeof(struct SfxNode2Edges) - sizeof(ULONG); /* only one child! */ |
|---|
| 2099 | if(pp->pp_SfxNEdgeOffset >= pp->pp_Sfx2EdgeOffset) |
|---|
| 2100 | { |
|---|
| 2101 | printf("Node buffer was too small!\n"); |
|---|
| 2102 | return; |
|---|
| 2103 | } |
|---|
| 2104 | childptr &= ~LEAFMASK; |
|---|
| 2105 | /* fill in node data */ |
|---|
| 2106 | tinynode = (struct SfxNode *) &pp->pp_SfxNodes[pp->pp_Sfx2EdgeOffset]; |
|---|
| 2107 | tinynode->sn_Parent = 1UL << RELOFFSETBITS; /* one edge */ |
|---|
| 2108 | tinynode->sn_StartPos = childptr + elen; |
|---|
| 2109 | tinynode->sn_EdgeLen = pp->pp_TreePruneLength - elen; |
|---|
| 2110 | seqcode = GetSeqCodeQuick(childptr + elen + tinynode->sn_EdgeLen); |
|---|
| 2111 | tinynode->sn_AlphaMask = 1UL << SEQCODE_N; /* we don't need this branch code anymore */ |
|---|
| 2112 | pp->pp_BranchCode[tinynode->sn_AlphaMask].hc_Weight++; |
|---|
| 2113 | tinynode->sn_Children[0] = childptr | LEAFMASK; |
|---|
| 2114 | /* fix link */ |
|---|
| 2115 | seqcode = GetSeqCodeQuick(childptr + elen); |
|---|
| 2116 | childptr = pp->pp_Sfx2EdgeOffset >> 2; |
|---|
| 2117 | sfxnode->sn_Children[childidx] = childptr | (seqcode << RELOFFSETBITS); |
|---|
| 2118 | } |
|---|
| 2119 | #endif |
|---|
| 2120 | if(!(childptr >> LEAFBIT)) |
|---|
| 2121 | { |
|---|
| 2122 | /* this is a normal node pointer, recurse */ |
|---|
| 2123 | GetTreeStatsLongEdgesRec(pp, (childptr & RELOFFSETMASK) << 2, level, elen); |
|---|
| 2124 | } |
|---|
| 2125 | } |
|---|
| 2126 | } |
|---|
| 2127 | /* \\\ */ |
|---|
| 2128 | |
|---|
| 2129 | /* /// "GetTreeStatsBranchHistoRec()" */ |
|---|
| 2130 | void GetTreeStatsBranchHistoRec(struct PTPanPartition *pp, |
|---|
| 2131 | ULONG pos, ULONG level, ULONG elen) |
|---|
| 2132 | { |
|---|
| 2133 | struct PTPanGlobal *pg = pp->pp_PTPanGlobal; |
|---|
| 2134 | struct SfxNode *sfxnode; |
|---|
| 2135 | ULONG *seqptr = pg->pg_MergedRawData; |
|---|
| 2136 | ULONG childptr; |
|---|
| 2137 | UWORD childidx; |
|---|
| 2138 | ULONG alphamask = 0; |
|---|
| 2139 | ULONG tmpmem[ALPHASIZE]; |
|---|
| 2140 | UWORD seqcode; |
|---|
| 2141 | |
|---|
| 2142 | /* traverse children */ |
|---|
| 2143 | sfxnode = (struct SfxNode *) &pp->pp_SfxNodes[pos]; |
|---|
| 2144 | |
|---|
| 2145 | level++; |
|---|
| 2146 | elen += sfxnode->sn_EdgeLen; |
|---|
| 2147 | |
|---|
| 2148 | childidx = sfxnode->sn_Parent >> RELOFFSETBITS; |
|---|
| 2149 | while(childidx--) |
|---|
| 2150 | { |
|---|
| 2151 | childptr = sfxnode->sn_Children[childidx]; |
|---|
| 2152 | if(childptr >> LEAFBIT) |
|---|
| 2153 | { |
|---|
| 2154 | /* this is a leaf pointer and doesn't contain a seqcode */ |
|---|
| 2155 | childptr &= ~LEAFMASK; |
|---|
| 2156 | seqcode = GetSeqCodeQuick(childptr + elen); |
|---|
| 2157 | tmpmem[seqcode] = sfxnode->sn_Children[childidx]; |
|---|
| 2158 | alphamask |= 1UL << seqcode; |
|---|
| 2159 | } else { |
|---|
| 2160 | /* this is a normal node pointer, recurse */ |
|---|
| 2161 | seqcode = childptr >> RELOFFSETBITS; |
|---|
| 2162 | tmpmem[seqcode] = sfxnode->sn_Children[childidx]; |
|---|
| 2163 | alphamask |= 1UL << seqcode; |
|---|
| 2164 | /* check for maximum depth reached */ |
|---|
| 2165 | if((level < pp->pp_TreePruneDepth) && (elen < pp->pp_TreePruneLength)) |
|---|
| 2166 | { |
|---|
| 2167 | GetTreeStatsBranchHistoRec(pp, (childptr & RELOFFSETMASK) << 2, level, elen); |
|---|
| 2168 | } |
|---|
| 2169 | } |
|---|
| 2170 | } |
|---|
| 2171 | /* update branch histogramm */ |
|---|
| 2172 | pp->pp_BranchCode[alphamask].hc_Weight++; |
|---|
| 2173 | sfxnode->sn_AlphaMask = alphamask; |
|---|
| 2174 | |
|---|
| 2175 | /* sort branches and enter alphamask */ |
|---|
| 2176 | childidx = 0; |
|---|
| 2177 | seqcode = 0; |
|---|
| 2178 | do |
|---|
| 2179 | { |
|---|
| 2180 | if(alphamask & (1UL << seqcode)) |
|---|
| 2181 | { |
|---|
| 2182 | sfxnode->sn_Children[childidx++] = tmpmem[seqcode]; |
|---|
| 2183 | } |
|---|
| 2184 | } while(++seqcode < pg->pg_AlphaSize); |
|---|
| 2185 | } |
|---|
| 2186 | /* \\\ */ |
|---|
| 2187 | |
|---|
| 2188 | /* /// "GetTreeStatsVerifyRec()" */ |
|---|
| 2189 | void GetTreeStatsVerifyRec(struct PTPanPartition *pp, ULONG pos, ULONG treepos, ULONG hash) |
|---|
| 2190 | { |
|---|
| 2191 | struct PTPanGlobal *pg = pp->pp_PTPanGlobal; |
|---|
| 2192 | struct SfxNode *sfxnode; |
|---|
| 2193 | ULONG *seqptr = pg->pg_MergedRawData; |
|---|
| 2194 | ULONG childptr; |
|---|
| 2195 | UWORD childidx; |
|---|
| 2196 | ULONG newhash; |
|---|
| 2197 | |
|---|
| 2198 | /* traverse children */ |
|---|
| 2199 | sfxnode = (struct SfxNode *) &pp->pp_SfxNodes[pos]; |
|---|
| 2200 | childidx = sfxnode->sn_Parent >> RELOFFSETBITS; |
|---|
| 2201 | if(sfxnode->sn_EdgeLen) |
|---|
| 2202 | { |
|---|
| 2203 | treepos += sfxnode->sn_EdgeLen; |
|---|
| 2204 | hash = GetSeqHash(pg, sfxnode->sn_StartPos, sfxnode->sn_EdgeLen, hash); |
|---|
| 2205 | } |
|---|
| 2206 | while(childidx--) |
|---|
| 2207 | { |
|---|
| 2208 | childptr = sfxnode->sn_Children[childidx]; |
|---|
| 2209 | if(childptr >> LEAFBIT) |
|---|
| 2210 | { |
|---|
| 2211 | /* this is a leaf pointer and doesn't contain a seqcode */ |
|---|
| 2212 | childptr &= ~LEAFMASK; |
|---|
| 2213 | childptr += treepos; |
|---|
| 2214 | /* set bit to verify position */ |
|---|
| 2215 | if(childptr < treepos) |
|---|
| 2216 | { |
|---|
| 2217 | printf("Childptr %ld < treepos %ld (pos %ld)\n", childptr, treepos, pos); |
|---|
| 2218 | } |
|---|
| 2219 | if(childptr >= pg->pg_TotalRawSize) |
|---|
| 2220 | { |
|---|
| 2221 | printf("Childptr %ld > total size %ld (pos %ld, treepos %ld, sn_EdgeLen %d)\n", |
|---|
| 2222 | childptr, pg->pg_TotalRawSize, pos, treepos, sfxnode->sn_EdgeLen); |
|---|
| 2223 | } |
|---|
| 2224 | |
|---|
| 2225 | if(pp->pp_VerifyArray) |
|---|
| 2226 | { |
|---|
| 2227 | if(pp->pp_VerifyArray[(childptr-treepos) >> 3] & (1UL << ((childptr-treepos) & 7))) |
|---|
| 2228 | { |
|---|
| 2229 | printf("Clash at pos %ld, ptr %ld\n", pos, childptr - treepos); |
|---|
| 2230 | } else { |
|---|
| 2231 | pp->pp_VerifyArray[(childptr-treepos) >> 3] |= (1UL << ((childptr-treepos) & 7)); |
|---|
| 2232 | } |
|---|
| 2233 | } |
|---|
| 2234 | |
|---|
| 2235 | newhash = GetSeqHash(pg, childptr - treepos, treepos, 0); |
|---|
| 2236 | if(newhash != hash) |
|---|
| 2237 | { |
|---|
| 2238 | STRPTR tmpstr = (STRPTR) malloc(treepos+1); |
|---|
| 2239 | DecompressSequencePartTo(pg, seqptr, |
|---|
| 2240 | sfxnode->sn_StartPos + sfxnode->sn_EdgeLen - treepos, |
|---|
| 2241 | treepos, tmpstr); |
|---|
| 2242 | |
|---|
| 2243 | printf("Hash mismatch for %s (%ld != %ld), treepos = %ld\n", |
|---|
| 2244 | tmpstr, newhash, hash, treepos); |
|---|
| 2245 | free(tmpstr); |
|---|
| 2246 | } else { |
|---|
| 2247 | //printf("Good"); |
|---|
| 2248 | } |
|---|
| 2249 | } else { |
|---|
| 2250 | /* this is a normal node pointer, recurse */ |
|---|
| 2251 | GetTreeStatsVerifyRec(pp, (childptr & RELOFFSETMASK) << 2, treepos, hash); |
|---|
| 2252 | } |
|---|
| 2253 | } |
|---|
| 2254 | } |
|---|
| 2255 | /* \\\ */ |
|---|
| 2256 | |
|---|
| 2257 | /* /// "GetTreeStatsLeafCountRec()" */ |
|---|
| 2258 | ULONG GetTreeStatsLeafCountRec(struct PTPanPartition *pp, ULONG pos) |
|---|
| 2259 | { |
|---|
| 2260 | struct SfxNode *sfxnode; |
|---|
| 2261 | ULONG childptr; |
|---|
| 2262 | UWORD childidx; |
|---|
| 2263 | ULONG cnt = 0; |
|---|
| 2264 | |
|---|
| 2265 | /* traverse children */ |
|---|
| 2266 | sfxnode = (struct SfxNode *) &pp->pp_SfxNodes[pos]; |
|---|
| 2267 | childidx = sfxnode->sn_Parent >> RELOFFSETBITS; |
|---|
| 2268 | //printf("P=%08lx LC: %ld [%d]\n", pos, childidx, sfxnode->sn_AlphaMask); |
|---|
| 2269 | while(childidx--) |
|---|
| 2270 | { |
|---|
| 2271 | childptr = sfxnode->sn_Children[childidx]; |
|---|
| 2272 | if(childptr >> LEAFBIT) |
|---|
| 2273 | { |
|---|
| 2274 | /* this is a leaf pointer and doesn't contain a seqcode */ |
|---|
| 2275 | cnt++; |
|---|
| 2276 | } else { |
|---|
| 2277 | /* this is a normal node pointer, recurse */ |
|---|
| 2278 | cnt += GetTreeStatsLeafCountRec(pp, (childptr & RELOFFSETMASK) << 2); |
|---|
| 2279 | } |
|---|
| 2280 | } |
|---|
| 2281 | return(cnt); |
|---|
| 2282 | } |
|---|
| 2283 | /* \\\ */ |
|---|
| 2284 | |
|---|
| 2285 | /* /// "GetTreeStatsLeafCollectRec()" */ |
|---|
| 2286 | void GetTreeStatsLeafCollectRec(struct PTPanPartition *pp, ULONG pos) |
|---|
| 2287 | { |
|---|
| 2288 | struct SfxNode *sfxnode; |
|---|
| 2289 | ULONG childptr; |
|---|
| 2290 | UWORD childidx; |
|---|
| 2291 | |
|---|
| 2292 | /* traverse children */ |
|---|
| 2293 | sfxnode = (struct SfxNode *) &pp->pp_SfxNodes[pos]; |
|---|
| 2294 | childidx = sfxnode->sn_Parent >> RELOFFSETBITS; |
|---|
| 2295 | while(childidx--) |
|---|
| 2296 | { |
|---|
| 2297 | childptr = sfxnode->sn_Children[childidx]; |
|---|
| 2298 | if(childptr >> LEAFBIT) |
|---|
| 2299 | { |
|---|
| 2300 | /* this is a leaf pointer and doesn't contain a seqcode */ |
|---|
| 2301 | *pp->pp_LeafBufferPtr++ = childptr & ~LEAFMASK; |
|---|
| 2302 | } else { |
|---|
| 2303 | /* this is a normal node pointer, recurse */ |
|---|
| 2304 | GetTreeStatsLeafCollectRec(pp, (childptr & RELOFFSETMASK) << 2); |
|---|
| 2305 | } |
|---|
| 2306 | } |
|---|
| 2307 | } |
|---|
| 2308 | /* \\\ */ |
|---|
| 2309 | |
|---|
| 2310 | /* /// "LongEdgeLengthCompare()" */ |
|---|
| 2311 | LONG LongEdgeLengthCompare(const struct SfxNode **node1, const struct SfxNode **node2) |
|---|
| 2312 | { |
|---|
| 2313 | return(((LONG) (*node2)->sn_EdgeLen) - ((LONG) (*node1)->sn_EdgeLen)); |
|---|
| 2314 | } |
|---|
| 2315 | /* \\\ */ |
|---|
| 2316 | |
|---|
| 2317 | /* /// "LongEdgePosCompare()" */ |
|---|
| 2318 | LONG LongEdgePosCompare(const struct SfxNode **node1, const struct SfxNode **node2) |
|---|
| 2319 | { |
|---|
| 2320 | return(((LONG) (*node1)->sn_StartPos) - ((LONG) (*node2)->sn_StartPos)); |
|---|
| 2321 | } |
|---|
| 2322 | /* \\\ */ |
|---|
| 2323 | |
|---|
| 2324 | /* /// "LongEdgeLabelCompare()" */ |
|---|
| 2325 | LONG LongEdgeLabelCompare(struct SfxNode **node1, struct SfxNode **node2) |
|---|
| 2326 | { |
|---|
| 2327 | struct PTPanGlobal *pg = PTPanGlobalPtr; |
|---|
| 2328 | ULONG *seqptr = pg->pg_MergedRawData; |
|---|
| 2329 | ULONG spos1 = (*node1)->sn_StartPos; |
|---|
| 2330 | ULONG spos2 = (*node2)->sn_StartPos; |
|---|
| 2331 | ULONG len = (*node1)->sn_EdgeLen; |
|---|
| 2332 | UBYTE seqcode1, seqcode2; |
|---|
| 2333 | |
|---|
| 2334 | if(spos1 == spos2) /* no need to compare */ |
|---|
| 2335 | { |
|---|
| 2336 | if(len < (*node2)->sn_EdgeLen) |
|---|
| 2337 | { |
|---|
| 2338 | return(-1); |
|---|
| 2339 | } |
|---|
| 2340 | else if(len > (*node2)->sn_EdgeLen) |
|---|
| 2341 | { |
|---|
| 2342 | return(1); |
|---|
| 2343 | } |
|---|
| 2344 | return(0); /* string exactly equal */ |
|---|
| 2345 | } |
|---|
| 2346 | |
|---|
| 2347 | if((*node2)->sn_EdgeLen < len) |
|---|
| 2348 | { |
|---|
| 2349 | len = (*node2)->sn_EdgeLen; |
|---|
| 2350 | } |
|---|
| 2351 | |
|---|
| 2352 | /* compare sequences */ |
|---|
| 2353 | do |
|---|
| 2354 | { |
|---|
| 2355 | seqcode1 = GetSeqCodeQuick(spos1); |
|---|
| 2356 | seqcode2 = GetSeqCodeQuick(spos2); |
|---|
| 2357 | if(seqcode1 < seqcode2) |
|---|
| 2358 | { |
|---|
| 2359 | return(-1); |
|---|
| 2360 | } |
|---|
| 2361 | else if(seqcode1 > seqcode2) |
|---|
| 2362 | { |
|---|
| 2363 | return(1); |
|---|
| 2364 | } |
|---|
| 2365 | spos1++; |
|---|
| 2366 | spos2++; |
|---|
| 2367 | } while(--len); |
|---|
| 2368 | |
|---|
| 2369 | /* sequence prefixes were the same! */ |
|---|
| 2370 | if((*node1)->sn_EdgeLen >= (*node2)->sn_EdgeLen) |
|---|
| 2371 | { |
|---|
| 2372 | /* move starting pos "down" */ |
|---|
| 2373 | if((*node1)->sn_StartPos < (*node2)->sn_StartPos) |
|---|
| 2374 | { |
|---|
| 2375 | //printf("Moved %ld -> %ld\n", (*node2)->sn_StartPos, (*node1)->sn_StartPos); |
|---|
| 2376 | (*node2)->sn_StartPos = (*node1)->sn_StartPos; |
|---|
| 2377 | } |
|---|
| 2378 | if((*node1)->sn_EdgeLen == (*node2)->sn_EdgeLen) |
|---|
| 2379 | { |
|---|
| 2380 | return(0); |
|---|
| 2381 | } else { |
|---|
| 2382 | return(1); |
|---|
| 2383 | } |
|---|
| 2384 | } else { |
|---|
| 2385 | /* shorter sequence is "smaller" */ |
|---|
| 2386 | return(-1); |
|---|
| 2387 | } |
|---|
| 2388 | } |
|---|
| 2389 | /* \\\ */ |
|---|
| 2390 | |
|---|
| 2391 | /* /// "GetSeqHash()" */ |
|---|
| 2392 | ULONG GetSeqHash(struct PTPanGlobal *pg, ULONG seqpos, ULONG len, ULONG hash) |
|---|
| 2393 | { |
|---|
| 2394 | ULONG *seqptr = &pg->pg_MergedRawData[seqpos / MAXCODEFITLONG]; |
|---|
| 2395 | ULONG modval = MAXCODEFITLONG - (seqpos % MAXCODEFITLONG); |
|---|
| 2396 | ULONG pval = *seqptr++ >> pg->pg_BitsShiftTable[MAXCODEFITLONG]; |
|---|
| 2397 | |
|---|
| 2398 | /* calculate the hash value over the string */ |
|---|
| 2399 | while(len--) |
|---|
| 2400 | { |
|---|
| 2401 | hash *= pg->pg_AlphaSize; |
|---|
| 2402 | if(--modval) |
|---|
| 2403 | { |
|---|
| 2404 | hash += (pval / pg->pg_PowerTable[modval]) % pg->pg_AlphaSize; |
|---|
| 2405 | } else { |
|---|
| 2406 | hash += pval % pg->pg_AlphaSize; |
|---|
| 2407 | pval = *seqptr++ >> pg->pg_BitsShiftTable[MAXCODEFITLONG]; |
|---|
| 2408 | modval = MAXCODEFITLONG; |
|---|
| 2409 | } |
|---|
| 2410 | hash %= HASHPRIME; |
|---|
| 2411 | } |
|---|
| 2412 | return(hash); |
|---|
| 2413 | } |
|---|
| 2414 | /* \\\ */ |
|---|
| 2415 | |
|---|
| 2416 | /* /// "GetSeqHashBackwards()" */ |
|---|
| 2417 | ULONG GetSeqHashBackwards(struct PTPanGlobal *pg, ULONG seqpos, ULONG len, ULONG hash) |
|---|
| 2418 | { |
|---|
| 2419 | seqpos += len; |
|---|
| 2420 | { |
|---|
| 2421 | ULONG *seqptr = &pg->pg_MergedRawData[seqpos / MAXCODEFITLONG]; |
|---|
| 2422 | ULONG modval = MAXCODEFITLONG - (seqpos % MAXCODEFITLONG); |
|---|
| 2423 | ULONG pval = *seqptr >> pg->pg_BitsShiftTable[MAXCODEFITLONG]; |
|---|
| 2424 | |
|---|
| 2425 | /* calculate the hash value over the string */ |
|---|
| 2426 | while(len--) |
|---|
| 2427 | { |
|---|
| 2428 | hash *= pg->pg_AlphaSize; |
|---|
| 2429 | if(modval < MAXCODEFITLONG) |
|---|
| 2430 | { |
|---|
| 2431 | hash += (pval / pg->pg_PowerTable[modval++]) % pg->pg_AlphaSize; |
|---|
| 2432 | } else { |
|---|
| 2433 | modval = 1; |
|---|
| 2434 | pval = *(--seqptr) >> pg->pg_BitsShiftTable[MAXCODEFITLONG]; |
|---|
| 2435 | hash += pval % pg->pg_AlphaSize; |
|---|
| 2436 | } |
|---|
| 2437 | hash %= HASHPRIME; |
|---|
| 2438 | } |
|---|
| 2439 | } |
|---|
| 2440 | return(hash); |
|---|
| 2441 | } |
|---|
| 2442 | /* \\\ */ |
|---|
| 2443 | |
|---|
| 2444 | /* /// "CheckLongEdgeMatch()" */ |
|---|
| 2445 | BOOL CheckLongEdgeMatch(struct PTPanPartition *pp, ULONG seqpos, ULONG edgelen, |
|---|
| 2446 | ULONG dictpos) |
|---|
| 2447 | { |
|---|
| 2448 | struct PTPanGlobal *pg = pp->pp_PTPanGlobal; |
|---|
| 2449 | STRPTR dictptr = &pp->pp_LongDict[dictpos]; |
|---|
| 2450 | ULONG *seqptr = &pg->pg_MergedRawData[seqpos / MAXCODEFITLONG]; |
|---|
| 2451 | ULONG modval = MAXCODEFITLONG - (seqpos % MAXCODEFITLONG); |
|---|
| 2452 | ULONG pval = *seqptr++ >> pg->pg_BitsShiftTable[MAXCODEFITLONG]; |
|---|
| 2453 | |
|---|
| 2454 | while(edgelen--) |
|---|
| 2455 | { |
|---|
| 2456 | if(--modval) |
|---|
| 2457 | { |
|---|
| 2458 | if(pg->pg_DecompressTable[(pval / pg->pg_PowerTable[modval]) % pg->pg_AlphaSize] != |
|---|
| 2459 | *dictptr++) |
|---|
| 2460 | { |
|---|
| 2461 | return(FALSE); |
|---|
| 2462 | } |
|---|
| 2463 | } else { |
|---|
| 2464 | if(pg->pg_DecompressTable[pval % pg->pg_AlphaSize] != *dictptr++) |
|---|
| 2465 | { |
|---|
| 2466 | return(FALSE); |
|---|
| 2467 | } |
|---|
| 2468 | pval = *seqptr++ >> pg->pg_BitsShiftTable[MAXCODEFITLONG]; |
|---|
| 2469 | modval = MAXCODEFITLONG; |
|---|
| 2470 | } |
|---|
| 2471 | } |
|---|
| 2472 | return(TRUE); |
|---|
| 2473 | } |
|---|
| 2474 | /* \\\ */ |
|---|
| 2475 | |
|---|
| 2476 | /* /// "BuildLongEdgeDictionary()" */ |
|---|
| 2477 | BOOL BuildLongEdgeDictionary(struct PTPanPartition *pp) |
|---|
| 2478 | { |
|---|
| 2479 | struct PTPanGlobal *pg = pp->pp_PTPanGlobal; |
|---|
| 2480 | struct SfxNode *sfxnode; |
|---|
| 2481 | ULONG *seqptr = pg->pg_MergedRawData; |
|---|
| 2482 | ULONG cnt; |
|---|
| 2483 | ULONG apos; |
|---|
| 2484 | ULONG spos; |
|---|
| 2485 | STRPTR dictptr; |
|---|
| 2486 | ULONG edgelen; |
|---|
| 2487 | ULONG dictsize; |
|---|
| 2488 | ULONG hashval; |
|---|
| 2489 | struct HashEntry *hash; |
|---|
| 2490 | ULONG walkinghash; |
|---|
| 2491 | ULONG lastedgelen; |
|---|
| 2492 | ULONG subfact; |
|---|
| 2493 | BOOL notfound; |
|---|
| 2494 | ULONG hashhit, hashmiss, walkmiss, walkhit, stringcnt; |
|---|
| 2495 | BOOL hassweep; |
|---|
| 2496 | BOOL quicksweep; |
|---|
| 2497 | BOOL safeskip; |
|---|
| 2498 | |
|---|
| 2499 | ULONG olddictsize; |
|---|
| 2500 | |
|---|
| 2501 | BenchTimePassed(pg); |
|---|
| 2502 | /* allocate long edge array */ |
|---|
| 2503 | pp->pp_LongEdges = (struct SfxNode **) calloc(pp->pp_EdgeCount, |
|---|
| 2504 | sizeof(struct SfxNode *)); |
|---|
| 2505 | if(!pp->pp_LongEdges) |
|---|
| 2506 | { |
|---|
| 2507 | printf("Out of memory for Long Edges Array!\n"); |
|---|
| 2508 | return(FALSE); |
|---|
| 2509 | } |
|---|
| 2510 | pp->pp_LongEdgeCount = 0; |
|---|
| 2511 | GetTreeStatsLongEdgesRec(pp, 0, 0, 0); |
|---|
| 2512 | printf("Long Edge Array filled with %ld entries ", pp->pp_LongEdgeCount); |
|---|
| 2513 | printf("(%ld KB unused)\n", (pp->pp_Sfx2EdgeOffset - pp->pp_SfxNEdgeOffset) >> 10); |
|---|
| 2514 | |
|---|
| 2515 | #if 0 /* debug */ |
|---|
| 2516 | for(cnt = 0; cnt < pp->pp_LongEdgeCount; cnt++) |
|---|
| 2517 | { |
|---|
| 2518 | printf("%ld: %ld\n", cnt, pp->pp_LongEdges[cnt]->sn_EdgeLen); |
|---|
| 2519 | } |
|---|
| 2520 | #endif |
|---|
| 2521 | /* now sort array */ |
|---|
| 2522 | if(pp->pp_LongEdgeCount > 2) |
|---|
| 2523 | { |
|---|
| 2524 | #if 1 /* disable this, if sorting takes too long, but |
|---|
| 2525 | this causes the dictionary to explode */ |
|---|
| 2526 | #if 0 // debug |
|---|
| 2527 | printf("Before sorting:\n"); |
|---|
| 2528 | for(cnt = 0; cnt < pp->pp_LongEdgeCount; cnt++) |
|---|
| 2529 | { |
|---|
| 2530 | DecompressSequencePartTo(pg, seqptr, |
|---|
| 2531 | pp->pp_LongEdges[cnt]->sn_StartPos, |
|---|
| 2532 | pp->pp_LongEdges[cnt]->sn_EdgeLen, |
|---|
| 2533 | pg->pg_TempBuffer); |
|---|
| 2534 | printf("%6ld: %7ld: %s\n", cnt, pp->pp_LongEdges[cnt]->sn_StartPos, |
|---|
| 2535 | pg->pg_TempBuffer); |
|---|
| 2536 | } |
|---|
| 2537 | #endif |
|---|
| 2538 | printf("Sorting (Pass 1)...\n"); |
|---|
| 2539 | qsort(pp->pp_LongEdges, pp->pp_LongEdgeCount, sizeof(struct SfxNode *), |
|---|
| 2540 | (int (*)(const void *, const void *)) LongEdgeLabelCompare); |
|---|
| 2541 | /* some edges might now have been moved to the front after sorting, |
|---|
| 2542 | but due to the sorting, these will be alternating, with edges of the |
|---|
| 2543 | same length, so fix these in O(n) */ |
|---|
| 2544 | #if 0 // debug |
|---|
| 2545 | printf("Before prefix clustering:\n"); |
|---|
| 2546 | for(cnt = 0; cnt < pp->pp_LongEdgeCount; cnt++) |
|---|
| 2547 | { |
|---|
| 2548 | DecompressSequencePartTo(pg, seqptr, |
|---|
| 2549 | pp->pp_LongEdges[cnt]->sn_StartPos, |
|---|
| 2550 | pp->pp_LongEdges[cnt]->sn_EdgeLen, |
|---|
| 2551 | pg->pg_TempBuffer); |
|---|
| 2552 | printf("%6ld: %7ld: %s\n", cnt, pp->pp_LongEdges[cnt]->sn_StartPos, |
|---|
| 2553 | pg->pg_TempBuffer); |
|---|
| 2554 | } |
|---|
| 2555 | #endif |
|---|
| 2556 | for(cnt = pp->pp_LongEdgeCount-1; cnt > 0; cnt--) |
|---|
| 2557 | { |
|---|
| 2558 | ULONG spos1 = pp->pp_LongEdges[cnt-1]->sn_StartPos; |
|---|
| 2559 | ULONG spos2 = pp->pp_LongEdges[cnt]->sn_StartPos; |
|---|
| 2560 | ULONG len = pp->pp_LongEdges[cnt-1]->sn_EdgeLen; |
|---|
| 2561 | |
|---|
| 2562 | if(len <= pp->pp_LongEdges[cnt]->sn_EdgeLen && (spos1 != spos2)) |
|---|
| 2563 | { |
|---|
| 2564 | /* compare sequences */ |
|---|
| 2565 | do |
|---|
| 2566 | { |
|---|
| 2567 | if(GetSeqCodeQuick(spos1) != GetSeqCodeQuick(spos2)) |
|---|
| 2568 | { |
|---|
| 2569 | break; |
|---|
| 2570 | } |
|---|
| 2571 | spos1++; |
|---|
| 2572 | spos2++; |
|---|
| 2573 | } while(--len); |
|---|
| 2574 | if(!len) |
|---|
| 2575 | { |
|---|
| 2576 | /* was equal */ |
|---|
| 2577 | pp->pp_LongEdges[cnt-1]->sn_StartPos = pp->pp_LongEdges[cnt]->sn_StartPos; |
|---|
| 2578 | } |
|---|
| 2579 | } |
|---|
| 2580 | } |
|---|
| 2581 | #if 0 // debug |
|---|
| 2582 | printf("After prefix clustering:\n"); |
|---|
| 2583 | for(cnt = 0; cnt < pp->pp_LongEdgeCount; cnt++) |
|---|
| 2584 | { |
|---|
| 2585 | DecompressSequencePartTo(pg, seqptr, |
|---|
| 2586 | pp->pp_LongEdges[cnt]->sn_StartPos, |
|---|
| 2587 | pp->pp_LongEdges[cnt]->sn_EdgeLen, |
|---|
| 2588 | pg->pg_TempBuffer); |
|---|
| 2589 | printf("%6ld: %7ld: %s\n", cnt, pp->pp_LongEdges[cnt]->sn_StartPos, |
|---|
| 2590 | pg->pg_TempBuffer); |
|---|
| 2591 | } |
|---|
| 2592 | #endif |
|---|
| 2593 | printf("Sorting (Pass 2)...\n"); |
|---|
| 2594 | qsort(pp->pp_LongEdges, pp->pp_LongEdgeCount, sizeof(struct SfxNode *), |
|---|
| 2595 | (int (*)(const void *, const void *)) LongEdgePosCompare); |
|---|
| 2596 | #if 0 // debug |
|---|
| 2597 | printf("After offset sorting\n"); |
|---|
| 2598 | for(cnt = 0; cnt < pp->pp_LongEdgeCount; cnt++) |
|---|
| 2599 | { |
|---|
| 2600 | DecompressSequencePartTo(pg, seqptr, |
|---|
| 2601 | pp->pp_LongEdges[cnt]->sn_StartPos, |
|---|
| 2602 | pp->pp_LongEdges[cnt]->sn_EdgeLen, |
|---|
| 2603 | pg->pg_TempBuffer); |
|---|
| 2604 | printf("%6ld: %7ld: %s\n", cnt, pp->pp_LongEdges[cnt]->sn_StartPos, |
|---|
| 2605 | pg->pg_TempBuffer); |
|---|
| 2606 | } |
|---|
| 2607 | #endif |
|---|
| 2608 | |
|---|
| 2609 | { |
|---|
| 2610 | ULONG ivalstart = 0; |
|---|
| 2611 | ULONG ivalend = 0; |
|---|
| 2612 | ULONG ivalcnt = 0; |
|---|
| 2613 | ULONG ivalsum = 0; |
|---|
| 2614 | ULONG ivalextend = 0; |
|---|
| 2615 | ULONG oldcnt = pp->pp_LongEdgeCount; |
|---|
| 2616 | |
|---|
| 2617 | /* examine edges and create bigger intervals */ |
|---|
| 2618 | for(cnt = 0; cnt < oldcnt; cnt++) |
|---|
| 2619 | { |
|---|
| 2620 | if(pp->pp_LongEdges[cnt]->sn_StartPos > ivalend) |
|---|
| 2621 | { |
|---|
| 2622 | /* create new interval (these thresholds were found out using lots |
|---|
| 2623 | of testing) */ |
|---|
| 2624 | if((ivalextend > 5) && |
|---|
| 2625 | (ivalend - ivalstart > 14)) |
|---|
| 2626 | { |
|---|
| 2627 | //printf("Ival: %ld - %ld\n", ivalstart, ivalend); |
|---|
| 2628 | if((pp->pp_SfxNEdgeOffset < pp->pp_Sfx2EdgeOffset - sizeof(struct SfxNodeStub)) && |
|---|
| 2629 | (pp->pp_LongEdgeCount < pp->pp_EdgeCount)) |
|---|
| 2630 | { |
|---|
| 2631 | /* generate a small stub node, that will lead the array building */ |
|---|
| 2632 | pp->pp_Sfx2EdgeOffset -= sizeof(struct SfxNodeStub); /* only a stub! */ |
|---|
| 2633 | sfxnode = (struct SfxNode *) &pp->pp_SfxNodes[pp->pp_Sfx2EdgeOffset]; |
|---|
| 2634 | sfxnode->sn_StartPos = ivalstart; |
|---|
| 2635 | sfxnode->sn_EdgeLen = ivalend - ivalstart + 1; |
|---|
| 2636 | sfxnode->sn_AlphaMask = 0xCAFE; |
|---|
| 2637 | pp->pp_LongEdges[pp->pp_LongEdgeCount++] = sfxnode; |
|---|
| 2638 | ivalcnt++; |
|---|
| 2639 | ivalsum += sfxnode->sn_EdgeLen; |
|---|
| 2640 | } else { |
|---|
| 2641 | printf("Out of mem!\n"); |
|---|
| 2642 | cnt = oldcnt; |
|---|
| 2643 | } |
|---|
| 2644 | ivalextend = 0; |
|---|
| 2645 | } |
|---|
| 2646 | ivalstart = pp->pp_LongEdges[cnt]->sn_StartPos; |
|---|
| 2647 | ivalend = ivalstart + pp->pp_LongEdges[cnt]->sn_EdgeLen - 1; |
|---|
| 2648 | } else { |
|---|
| 2649 | /* check, if we have to enlarge this interval... |
|---|
| 2650 | (is this an attempt to compensate something? :) )*/ |
|---|
| 2651 | if((pp->pp_LongEdges[cnt]->sn_StartPos + |
|---|
| 2652 | pp->pp_LongEdges[cnt]->sn_EdgeLen - 1 > ivalend)) |
|---|
| 2653 | //(ivalstart + (pp->pp_LongEdges[cnt]->sn_EdgeLen * 2) > ivalend)) |
|---|
| 2654 | { |
|---|
| 2655 | ivalend = pp->pp_LongEdges[cnt]->sn_StartPos + |
|---|
| 2656 | pp->pp_LongEdges[cnt]->sn_EdgeLen - 1; |
|---|
| 2657 | if(pp->pp_LongEdges[cnt]->sn_StartPos > ivalstart) |
|---|
| 2658 | { |
|---|
| 2659 | ivalextend++; |
|---|
| 2660 | } |
|---|
| 2661 | } |
|---|
| 2662 | } |
|---|
| 2663 | } |
|---|
| 2664 | printf("Additional intervals generated %ld (%ld KB)\n", ivalcnt, ivalsum >> 10); |
|---|
| 2665 | } |
|---|
| 2666 | #endif |
|---|
| 2667 | printf("Sorting (Pass 3)...\n"); |
|---|
| 2668 | qsort(pp->pp_LongEdges, pp->pp_LongEdgeCount, sizeof(struct SfxNode *), |
|---|
| 2669 | (int (*)(const void *, const void *)) LongEdgeLengthCompare); |
|---|
| 2670 | #if 0 // debug |
|---|
| 2671 | printf("After length sorting\n"); |
|---|
| 2672 | for(cnt = 0; cnt < pp->pp_LongEdgeCount; cnt++) |
|---|
| 2673 | { |
|---|
| 2674 | DecompressSequencePartTo(pg, seqptr, |
|---|
| 2675 | pp->pp_LongEdges[cnt]->sn_StartPos, |
|---|
| 2676 | pp->pp_LongEdges[cnt]->sn_EdgeLen, |
|---|
| 2677 | pg->pg_TempBuffer); |
|---|
| 2678 | printf("%6ld: %7ld: %s\n", cnt, pp->pp_LongEdges[cnt]->sn_StartPos, |
|---|
| 2679 | pg->pg_TempBuffer); |
|---|
| 2680 | } |
|---|
| 2681 | #endif |
|---|
| 2682 | } |
|---|
| 2683 | pg->pg_Bench.ts_LongDictPre += BenchTimePassed(pg); |
|---|
| 2684 | if(!pp->pp_LongEdgeCount) |
|---|
| 2685 | { |
|---|
| 2686 | /* catch special case of no long edges */ |
|---|
| 2687 | pp->pp_LongEdgeLenSize = 1; |
|---|
| 2688 | pp->pp_LongEdgeLenCode = (struct HuffCode *) calloc(pp->pp_LongEdgeLenSize, |
|---|
| 2689 | sizeof(struct HuffCode)); |
|---|
| 2690 | //pp->pp_LongEdgeLenCode[0].hc_CodeLength = 1; |
|---|
| 2691 | pp->pp_LongDictSize = 1; |
|---|
| 2692 | pp->pp_LongDict = (STRPTR) malloc(pp->pp_LongDictSize); |
|---|
| 2693 | *pp->pp_LongDict = 0; |
|---|
| 2694 | dictsize = 0; |
|---|
| 2695 | } else { |
|---|
| 2696 | pp->pp_LongEdgeLenSize = pp->pp_LongEdges[0]->sn_EdgeLen + 1; |
|---|
| 2697 | /* allocate long edge len histogram */ |
|---|
| 2698 | pp->pp_LongEdgeLenCode = (struct HuffCode *) calloc(pp->pp_LongEdgeLenSize, |
|---|
| 2699 | sizeof(struct HuffCode)); |
|---|
| 2700 | if(!pp->pp_LongEdgeLenCode) |
|---|
| 2701 | { |
|---|
| 2702 | printf("Out of memory for Long Edge Length Histogram!\n"); |
|---|
| 2703 | return(FALSE); |
|---|
| 2704 | } |
|---|
| 2705 | |
|---|
| 2706 | /* count lengths */ |
|---|
| 2707 | for(cnt = 0; cnt < pp->pp_LongEdgeCount; cnt++) |
|---|
| 2708 | { |
|---|
| 2709 | if(pp->pp_LongEdges[cnt]->sn_AlphaMask != 0xCAFE) |
|---|
| 2710 | { |
|---|
| 2711 | pp->pp_LongEdgeLenCode[pp->pp_LongEdges[cnt]->sn_EdgeLen].hc_Weight++; |
|---|
| 2712 | } |
|---|
| 2713 | } |
|---|
| 2714 | |
|---|
| 2715 | printf("Longest edge: %ld\n", pp->pp_LongEdgeLenSize - 1); |
|---|
| 2716 | |
|---|
| 2717 | /* generate huffman code for edge lengths */ |
|---|
| 2718 | printf("Generating huffman code for long edges length\n"); |
|---|
| 2719 | BuildHuffmanCode(pp->pp_LongEdgeLenCode, pp->pp_LongEdgeLenSize, 0); |
|---|
| 2720 | |
|---|
| 2721 | #if 0 |
|---|
| 2722 | /* debug */ |
|---|
| 2723 | for(cnt = 0; cnt < pp->pp_LongEdgeLenSize; cnt++) |
|---|
| 2724 | { |
|---|
| 2725 | WORD bitcnt; |
|---|
| 2726 | if(pp->pp_LongEdgeLenCode[cnt].hc_CodeLength) |
|---|
| 2727 | { |
|---|
| 2728 | printf("%6ld: %7ld -> %2d ", cnt, pp->pp_LongEdgeLenCode[cnt].hc_Weight, |
|---|
| 2729 | pp->pp_LongEdgeLenCode[cnt].hc_CodeLength); |
|---|
| 2730 | for(bitcnt = pp->pp_LongEdgeLenCode[cnt].hc_CodeLength - 1; bitcnt >= 0; bitcnt--) |
|---|
| 2731 | { |
|---|
| 2732 | printf("%s", pp->pp_LongEdgeLenCode[cnt].hc_Codec & (1UL << bitcnt) ? "1" : "0"); |
|---|
| 2733 | } |
|---|
| 2734 | printf("\n"); |
|---|
| 2735 | } |
|---|
| 2736 | } |
|---|
| 2737 | #endif |
|---|
| 2738 | |
|---|
| 2739 | /* now allocate a buffer for building the dictionary */ |
|---|
| 2740 | pp->pp_LongDictSize = 16UL << 10; /* start with 16KB */ |
|---|
| 2741 | pp->pp_LongDict = (STRPTR) malloc(pp->pp_LongDictSize); |
|---|
| 2742 | if(!pp->pp_LongDict) |
|---|
| 2743 | { |
|---|
| 2744 | printf("Out of memory for long edges dictionary!\n"); |
|---|
| 2745 | free(pp->pp_LongEdges); |
|---|
| 2746 | return(FALSE); |
|---|
| 2747 | } |
|---|
| 2748 | pp->pp_LongDict[0] = 0; |
|---|
| 2749 | dictsize = 0; |
|---|
| 2750 | |
|---|
| 2751 | /* allocate the hash table to speed up search */ |
|---|
| 2752 | pp->pp_LongDictHashSize = 256UL << 10; |
|---|
| 2753 | pp->pp_LongDictHash = AllocHashArray(pp->pp_LongDictHashSize); |
|---|
| 2754 | if(!pp->pp_LongDictHash) |
|---|
| 2755 | { |
|---|
| 2756 | printf("Out of memory for long edges dictionary hash!\n"); |
|---|
| 2757 | free(pp->pp_LongEdges); |
|---|
| 2758 | free(pp->pp_LongDict); |
|---|
| 2759 | return(FALSE); |
|---|
| 2760 | } |
|---|
| 2761 | printf("Allocated a hash for %ld entries...\n", pp->pp_LongDictHashSize); |
|---|
| 2762 | |
|---|
| 2763 | /* some statistical data */ |
|---|
| 2764 | hashhit = 0; /* string found in hash */ |
|---|
| 2765 | hashmiss = 0; /* string found in hash, but was no match */ |
|---|
| 2766 | walkmiss = 0; /* wrong fingerprint matches */ |
|---|
| 2767 | walkhit = 0; /* string found during walk */ |
|---|
| 2768 | stringcnt = 1; /* number of strings generated */ |
|---|
| 2769 | |
|---|
| 2770 | /* create special initial string to avoid dictionary exploding by fragmented truncated edges */ |
|---|
| 2771 | sfxnode = pp->pp_LongEdges[0]; |
|---|
| 2772 | spos = sfxnode->sn_StartPos + 1; |
|---|
| 2773 | edgelen = sfxnode->sn_EdgeLen - 1; |
|---|
| 2774 | dictptr = pp->pp_LongDict; |
|---|
| 2775 | dictsize = edgelen; |
|---|
| 2776 | while(edgelen--) |
|---|
| 2777 | { |
|---|
| 2778 | *dictptr++ = pg->pg_DecompressTable[GetSeqCodeQuick(spos)]; |
|---|
| 2779 | spos++; |
|---|
| 2780 | } |
|---|
| 2781 | /* don't forget the termination char */ |
|---|
| 2782 | *dictptr = 0; |
|---|
| 2783 | |
|---|
| 2784 | /* insert all edges */ |
|---|
| 2785 | lastedgelen = pp->pp_LongEdges[0]->sn_EdgeLen; |
|---|
| 2786 | hassweep = FALSE; |
|---|
| 2787 | olddictsize = 0; |
|---|
| 2788 | for(cnt = 0; cnt < pp->pp_LongEdgeCount; cnt++) |
|---|
| 2789 | { |
|---|
| 2790 | #if 0 /* debug */ |
|---|
| 2791 | if((dictsize > olddictsize + 40) || (cnt == pp->pp_LongEdgeCount-1)) |
|---|
| 2792 | { |
|---|
| 2793 | fprintf(stderr, "%ld %ld\n", cnt, dictsize); |
|---|
| 2794 | olddictsize = dictsize; |
|---|
| 2795 | } |
|---|
| 2796 | #endif |
|---|
| 2797 | if(((cnt+1) & 0x3ff) == 0) |
|---|
| 2798 | { |
|---|
| 2799 | if(((cnt+1) >> 10) % 50) |
|---|
| 2800 | { |
|---|
| 2801 | printf("."); |
|---|
| 2802 | fflush(stdout); |
|---|
| 2803 | } else { |
|---|
| 2804 | printf(". %2ld%% (%ld KB, %ld strings)\n", |
|---|
| 2805 | (cnt * 100) / pp->pp_LongEdgeCount, dictsize >> 10, stringcnt); |
|---|
| 2806 | } |
|---|
| 2807 | } |
|---|
| 2808 | sfxnode = pp->pp_LongEdges[cnt]; |
|---|
| 2809 | /* note that we can skip the first base because it is part of the branch */ |
|---|
| 2810 | spos = sfxnode->sn_StartPos + 1; |
|---|
| 2811 | edgelen = sfxnode->sn_EdgeLen - 1; |
|---|
| 2812 | |
|---|
| 2813 | /*if(edgelen != lastedgelen) |
|---|
| 2814 | { |
|---|
| 2815 | printf("%6ld\n", edgelen); |
|---|
| 2816 | }*/ |
|---|
| 2817 | /* if we have swept over the dictionary, it is safe to skip search */ |
|---|
| 2818 | safeskip = hassweep; |
|---|
| 2819 | |
|---|
| 2820 | /* calculate hash value */ |
|---|
| 2821 | hashval = GetSeqHashBackwards(pg, spos, edgelen, 0); |
|---|
| 2822 | //printf("[%ld] Len %ld, Hashval = %ld, dictsize = %ld ", cnt, edgelen, hashval, dictsize); |
|---|
| 2823 | if((hash = GetHashEntry(pp->pp_LongDictHash, hashval))) |
|---|
| 2824 | { |
|---|
| 2825 | //printf("Hash "); |
|---|
| 2826 | /* we will check only atmost 12 characters. the probability, that the |
|---|
| 2827 | edge had the same hash value and more than 12 matching characters, is |
|---|
| 2828 | unbelievably low */ |
|---|
| 2829 | if(CheckLongEdgeMatch(pp, spos, edgelen > 12 ? 12 : edgelen, hash->he_Data)) |
|---|
| 2830 | { |
|---|
| 2831 | hashhit++; |
|---|
| 2832 | //printf("match\n"); |
|---|
| 2833 | sfxnode->sn_StartPos = hash->he_Data | (1UL << 31); |
|---|
| 2834 | continue; |
|---|
| 2835 | } |
|---|
| 2836 | hashmiss++; |
|---|
| 2837 | //printf("miss\n"); |
|---|
| 2838 | /* we had a hash miss, consider it not safe anymore to skip the search */ |
|---|
| 2839 | safeskip = FALSE; |
|---|
| 2840 | } |
|---|
| 2841 | |
|---|
| 2842 | if(edgelen < lastedgelen) |
|---|
| 2843 | { |
|---|
| 2844 | if(pp->pp_LongDictHash->ha_Used > (pp->pp_LongDictHash->ha_Size >> 3)) |
|---|
| 2845 | { |
|---|
| 2846 | /* seems as if the hash full by more than 1/8 and needs some clearing */ |
|---|
| 2847 | //printf("Clearing hash...\n"); |
|---|
| 2848 | ClearHashArray(pp->pp_LongDictHash); |
|---|
| 2849 | } |
|---|
| 2850 | hassweep = FALSE; |
|---|
| 2851 | safeskip = FALSE; |
|---|
| 2852 | } |
|---|
| 2853 | /* sorry, have to do a linear search */ |
|---|
| 2854 | notfound = TRUE; |
|---|
| 2855 | |
|---|
| 2856 | /* if he have swept over the dictionary, generating all possible hash values |
|---|
| 2857 | and did not hit the hash, it is very impossible that we will find it |
|---|
| 2858 | anyway, so we just append it. |
|---|
| 2859 | */ |
|---|
| 2860 | if(!safeskip) |
|---|
| 2861 | { |
|---|
| 2862 | /* first attempt was to walk through the dictionary from the |
|---|
| 2863 | beginning to the end. However, in some bright moment, |
|---|
| 2864 | I thought about traversing the dictionary the other |
|---|
| 2865 | way round and see if this works better */ |
|---|
| 2866 | /* hash = \sum{i < m}{dict[pos+i]*5^(m-i)} |
|---|
| 2867 | to walk right: |
|---|
| 2868 | hash = oldhash * 5 + dict[pos] - dict[pos-m] * (5^(m+1)) |
|---|
| 2869 | to walk left: |
|---|
| 2870 | hash = oldhash - dict[pos] / 5 + dict[pos] * (5^m) */ |
|---|
| 2871 | |
|---|
| 2872 | /* init walking hash/finger print value */ |
|---|
| 2873 | apos = dictsize; |
|---|
| 2874 | dictptr = &pp->pp_LongDict[apos]; |
|---|
| 2875 | subfact = 1; |
|---|
| 2876 | walkinghash = 0; |
|---|
| 2877 | |
|---|
| 2878 | //printf("edgelen = %ld\n", edgelen); |
|---|
| 2879 | do |
|---|
| 2880 | { |
|---|
| 2881 | /* calculate character outshifting multiplicator */ |
|---|
| 2882 | subfact *= pg->pg_AlphaSize; |
|---|
| 2883 | subfact %= HASHPRIME; |
|---|
| 2884 | /* calculate finger print */ |
|---|
| 2885 | walkinghash *= pg->pg_AlphaSize; |
|---|
| 2886 | walkinghash += pg->pg_CompressTable[*(--dictptr)]; |
|---|
| 2887 | walkinghash %= HASHPRIME; |
|---|
| 2888 | } while(--apos > dictsize - edgelen); |
|---|
| 2889 | |
|---|
| 2890 | /* did the length shrink and does it pay to do a sweep? */ |
|---|
| 2891 | if((edgelen < lastedgelen) && |
|---|
| 2892 | (cnt + 40 < pp->pp_LongEdgeCount) && |
|---|
| 2893 | (edgelen == (ULONG) pp->pp_LongEdges[cnt + 40]->sn_EdgeLen - 1)) |
|---|
| 2894 | { |
|---|
| 2895 | //printf("Q[%ld<-%ld]", edgelen, pp->pp_LongEdges[cnt + 15]->sn_EdgeLen - 1); |
|---|
| 2896 | //printf("Quicksweep\n"); |
|---|
| 2897 | quicksweep = TRUE; |
|---|
| 2898 | hassweep = TRUE; |
|---|
| 2899 | } else { |
|---|
| 2900 | quicksweep = FALSE; |
|---|
| 2901 | } |
|---|
| 2902 | //hassweep = quicksweep = FALSE; /* FIXME */ |
|---|
| 2903 | /* loop until found or end of dictionary is reached */ |
|---|
| 2904 | do |
|---|
| 2905 | { |
|---|
| 2906 | //printf("Apos = %ld, WH = %ld\n", apos, walkinghash); |
|---|
| 2907 | /* finger print value matches */ |
|---|
| 2908 | if(quicksweep) |
|---|
| 2909 | { |
|---|
| 2910 | if(!(GetHashEntry(pp->pp_LongDictHash, walkinghash))) |
|---|
| 2911 | { |
|---|
| 2912 | InsertHashEntry(pp->pp_LongDictHash, walkinghash, apos); |
|---|
| 2913 | } |
|---|
| 2914 | } |
|---|
| 2915 | if(walkinghash == hashval) |
|---|
| 2916 | { |
|---|
| 2917 | //printf("Walk "); |
|---|
| 2918 | /* verify hit (well, to a high probability) */ |
|---|
| 2919 | if(CheckLongEdgeMatch(pp, spos, edgelen > 12 ? 12 : edgelen, apos)) |
|---|
| 2920 | { |
|---|
| 2921 | //printf("hit\n"); |
|---|
| 2922 | #if 0 /* debug */ |
|---|
| 2923 | if(safeskip) |
|---|
| 2924 | { |
|---|
| 2925 | printf("We would have missed %ld [%ld != %08lx], %ld!\n", |
|---|
| 2926 | apos, hashval, |
|---|
| 2927 | (ULONG) GetHashEntry(pp->pp_LongDictHash, walkinghash), |
|---|
| 2928 | edgelen); |
|---|
| 2929 | } |
|---|
| 2930 | #endif |
|---|
| 2931 | /* found it! */ |
|---|
| 2932 | walkhit++; |
|---|
| 2933 | sfxnode->sn_StartPos = apos | (1UL << 31); |
|---|
| 2934 | //printf("Walk: %f\n", (double) ((double) apos / (double) dictsize)); |
|---|
| 2935 | notfound = FALSE; |
|---|
| 2936 | /* we have to finish the scan in quicksweep mode */ |
|---|
| 2937 | if(quicksweep) |
|---|
| 2938 | { |
|---|
| 2939 | hashval = ~0UL; /* make sure we won't find another hit */ |
|---|
| 2940 | } else { |
|---|
| 2941 | /* insert hash entry */ |
|---|
| 2942 | InsertHashEntry(pp->pp_LongDictHash, hashval, apos); |
|---|
| 2943 | break; |
|---|
| 2944 | } |
|---|
| 2945 | } else { |
|---|
| 2946 | walkmiss++; |
|---|
| 2947 | //printf("miss\n"); |
|---|
| 2948 | } |
|---|
| 2949 | } |
|---|
| 2950 | if(apos) |
|---|
| 2951 | { |
|---|
| 2952 | /* calculate new hash value */ |
|---|
| 2953 | walkinghash += HASHPRIME; |
|---|
| 2954 | walkinghash *= pg->pg_AlphaSize; |
|---|
| 2955 | walkinghash -= pg->pg_CompressTable[dictptr[edgelen-1]] * subfact; |
|---|
| 2956 | walkinghash += pg->pg_CompressTable[*(--dictptr)]; |
|---|
| 2957 | walkinghash %= HASHPRIME; |
|---|
| 2958 | apos--; |
|---|
| 2959 | } else { |
|---|
| 2960 | break; |
|---|
| 2961 | } |
|---|
| 2962 | } while(TRUE); |
|---|
| 2963 | } |
|---|
| 2964 | lastedgelen = edgelen; |
|---|
| 2965 | /* check, if we already found it */ |
|---|
| 2966 | if(notfound) |
|---|
| 2967 | { |
|---|
| 2968 | stringcnt++; |
|---|
| 2969 | apos = dictsize; |
|---|
| 2970 | dictsize += edgelen; |
|---|
| 2971 | //printf("add"); |
|---|
| 2972 | //printf("Appending %ld at %ld...\n", edgelen, apos); |
|---|
| 2973 | if(dictsize >= pp->pp_LongDictSize) |
|---|
| 2974 | { |
|---|
| 2975 | STRPTR newptr; |
|---|
| 2976 | /* double the size of the buffer */ |
|---|
| 2977 | pp->pp_LongDictSize <<= 1; |
|---|
| 2978 | if((newptr = (STRPTR) realloc(pp->pp_LongDict, pp->pp_LongDictSize))) |
|---|
| 2979 | { |
|---|
| 2980 | pp->pp_LongDict = newptr; |
|---|
| 2981 | //printf("Expanded Dictionary to %ld bytes.\n", pp->pp_LongDictSize); |
|---|
| 2982 | } else { |
|---|
| 2983 | printf("Out of memory while expanding long edges dictionary!\n"); |
|---|
| 2984 | free(pp->pp_LongDict); |
|---|
| 2985 | free(pp->pp_LongEdges); |
|---|
| 2986 | return(FALSE); |
|---|
| 2987 | } |
|---|
| 2988 | } |
|---|
| 2989 | /* insert hash entry */ |
|---|
| 2990 | InsertHashEntry(pp->pp_LongDictHash, hashval, apos); |
|---|
| 2991 | /* fix start pos */ |
|---|
| 2992 | sfxnode->sn_StartPos = apos | (1UL << 31); |
|---|
| 2993 | dictptr = &pp->pp_LongDict[apos]; |
|---|
| 2994 | while(edgelen--) |
|---|
| 2995 | { |
|---|
| 2996 | *dictptr++ = pg->pg_DecompressTable[GetSeqCodeQuick(spos)]; |
|---|
| 2997 | spos++; |
|---|
| 2998 | } |
|---|
| 2999 | /* don't forget the termination char */ |
|---|
| 3000 | *dictptr = 0; |
|---|
| 3001 | } |
|---|
| 3002 | } |
|---|
| 3003 | pp->pp_LongDictSize = dictsize; |
|---|
| 3004 | /* printf("\nHashhit %ld, Hashmiss %ld, Walkhit %ld, Walkmiss %ld\n", |
|---|
| 3005 | hashhit, hashmiss, walkhit, walkmiss);*/ |
|---|
| 3006 | printf("\nDictionary size: %ld KB (%ld strings)\n", dictsize >> 10, stringcnt); |
|---|
| 3007 | //printf(pp->pp_LongDict); |
|---|
| 3008 | } |
|---|
| 3009 | |
|---|
| 3010 | /* calculate bits usage */ |
|---|
| 3011 | pp->pp_LongRelPtrBits = 1; |
|---|
| 3012 | while((1UL << pp->pp_LongRelPtrBits) < dictsize) |
|---|
| 3013 | { |
|---|
| 3014 | pp->pp_LongRelPtrBits++; |
|---|
| 3015 | } |
|---|
| 3016 | |
|---|
| 3017 | /* compress sequence to save memory */ |
|---|
| 3018 | if(!(pp->pp_LongDictRaw = CompressSequence(pg, pp->pp_LongDict))) |
|---|
| 3019 | { |
|---|
| 3020 | printf("Out of memory for compressed dictionary string!\n"); |
|---|
| 3021 | FreeHashArray(pp->pp_LongDictHash); |
|---|
| 3022 | free(pp->pp_LongEdges); |
|---|
| 3023 | free(pp->pp_LongDict); |
|---|
| 3024 | return(FALSE); |
|---|
| 3025 | } |
|---|
| 3026 | |
|---|
| 3027 | printf("Final compressed dictionary size: %ld bytes.\n", |
|---|
| 3028 | ((dictsize / MAXCODEFITLONG) + 1) * sizeof(ULONG)); |
|---|
| 3029 | |
|---|
| 3030 | /* free some memory */ |
|---|
| 3031 | FreeHashArray(pp->pp_LongDictHash); |
|---|
| 3032 | free(pp->pp_LongEdges); |
|---|
| 3033 | free(pp->pp_LongDict); |
|---|
| 3034 | |
|---|
| 3035 | pg->pg_Bench.ts_LongDictBuild += BenchTimePassed(pg); |
|---|
| 3036 | |
|---|
| 3037 | return(TRUE); |
|---|
| 3038 | } |
|---|
| 3039 | /* \\\ */ |
|---|
| 3040 | |
|---|
| 3041 | /* /// "WriteTreeToDisk()" */ |
|---|
| 3042 | BOOL WriteTreeToDisk(struct PTPanPartition *pp) |
|---|
| 3043 | { |
|---|
| 3044 | struct PTPanGlobal *pg = pp->pp_PTPanGlobal; |
|---|
| 3045 | struct SfxNode *sfxnode; |
|---|
| 3046 | ULONG pos; |
|---|
| 3047 | |
|---|
| 3048 | ULONG childptr; |
|---|
| 3049 | ULONG packsize; |
|---|
| 3050 | ULONG bytessaved; |
|---|
| 3051 | BOOL freedisk = FALSE; |
|---|
| 3052 | |
|---|
| 3053 | BenchTimePassed(pg); |
|---|
| 3054 | /* after we have prepared all the codecs and compression tables, we have to modify |
|---|
| 3055 | the tree to be able to store it on the disk. Bottom up approach. |
|---|
| 3056 | a) count the space required for the cut off leaves. They are stored as compressed |
|---|
| 3057 | arrays, each array holding at least two leaves. The leaves in the upper levels |
|---|
| 3058 | of the tree are not saved there, as a pointer to the leaf array containing only |
|---|
| 3059 | one leaf would be a waste of space (instead they are stored directly as a child |
|---|
| 3060 | ptr). |
|---|
| 3061 | b) traverse the tree in DFS order from lowest level to the root, from right to the |
|---|
| 3062 | left, enter relative pointers in the ChildPtr[] array, count the space |
|---|
| 3063 | consumption. |
|---|
| 3064 | */ |
|---|
| 3065 | |
|---|
| 3066 | /* calculate the relative pointers and the tree traversal */ |
|---|
| 3067 | pp->pp_DiskOuterLeaves = 0; |
|---|
| 3068 | pp->pp_DiskTreeSize = 0; |
|---|
| 3069 | pp->pp_TraverseTreeRoot = ~0UL & RELOFFSETMASK; |
|---|
| 3070 | |
|---|
| 3071 | ULONG tempDiskTreeSize = FixRelativePointersRec(pp, 0, 0, 0); |
|---|
| 3072 | pp->pp_DiskTreeSize += tempDiskTreeSize; |
|---|
| 3073 | //printf("Total size on disk: %ld KB\n", pp->pp_DiskTreeSize >> 10); |
|---|
| 3074 | |
|---|
| 3075 | pg->pg_Bench.ts_Reloc += BenchTimePassed(pg); |
|---|
| 3076 | /* now finally write it to disk */ |
|---|
| 3077 | pp->pp_PartitionFile = fopen(pp->pp_PartitionName, "w"); |
|---|
| 3078 | if(!pp->pp_PartitionFile) |
|---|
| 3079 | { |
|---|
| 3080 | printf("ERROR: Couldn't open partition file %s for writing!\n", |
|---|
| 3081 | pp->pp_PartitionName); |
|---|
| 3082 | return(FALSE); |
|---|
| 3083 | } |
|---|
| 3084 | |
|---|
| 3085 | WriteTreeHeader(pp); |
|---|
| 3086 | |
|---|
| 3087 | /* use unused node buffer for temporary write buffer */ |
|---|
| 3088 | pp->pp_DiskBuffer = (UBYTE *) &pp->pp_SfxNodes[pp->pp_SfxNEdgeOffset]; |
|---|
| 3089 | pp->pp_DiskBufferSize = pp->pp_Sfx2EdgeOffset - pp->pp_SfxNEdgeOffset; |
|---|
| 3090 | if(pp->pp_DiskBufferSize < (128UL << 10)) |
|---|
| 3091 | { |
|---|
| 3092 | /* disk buffer was much too small! */ |
|---|
| 3093 | pp->pp_DiskBufferSize = 128UL << 10; |
|---|
| 3094 | pp->pp_DiskBuffer = (UBYTE *) calloc(1, pp->pp_DiskBufferSize); |
|---|
| 3095 | freedisk = TRUE; |
|---|
| 3096 | } else { |
|---|
| 3097 | if(pp->pp_DiskBufferSize > (512UL << 10)) |
|---|
| 3098 | { |
|---|
| 3099 | pp->pp_DiskBufferSize = 512UL << 10; |
|---|
| 3100 | } |
|---|
| 3101 | } |
|---|
| 3102 | //printf("Diskbuffer: %ld KB\n", pp->pp_DiskBufferSize >> 10); |
|---|
| 3103 | pp->pp_DiskPos = 0; |
|---|
| 3104 | bytessaved = 0; |
|---|
| 3105 | |
|---|
| 3106 | #if 0 /* debug */ |
|---|
| 3107 | pp->pp_BranchTree = BuildHuffmanTreeFromTable(pp->pp_BranchCode, 1UL << pg->pg_AlphaSize); |
|---|
| 3108 | pp->pp_ShortEdgeTree = BuildHuffmanTreeFromTable(pp->pp_ShortEdgeCode, 1UL << (pg->pg_BitsUseTable[SHORTEDGEMAX]+1)); |
|---|
| 3109 | pp->pp_LongEdgeLenTree = BuildHuffmanTreeFromTable(pp->pp_LongEdgeLenCode, pp->pp_LongEdgeLenSize); |
|---|
| 3110 | #endif |
|---|
| 3111 | |
|---|
| 3112 | printf("Writing (%ld KB)",pp->pp_DiskTreeSize >> 10); |
|---|
| 3113 | fflush(NULL); |
|---|
| 3114 | pp->pp_DiskNodeCount = 0; |
|---|
| 3115 | pp->pp_DiskNodeSpace = 0; |
|---|
| 3116 | pp->pp_DiskLeafCount = 0; |
|---|
| 3117 | pp->pp_DiskLeafSpace = 0; |
|---|
| 3118 | |
|---|
| 3119 | pos = pp->pp_TraverseTreeRoot; |
|---|
| 3120 | while((pos & RELOFFSETMASK) != (~0UL & RELOFFSETMASK)) |
|---|
| 3121 | { |
|---|
| 3122 | childptr = (pos & RELOFFSETMASK) << 2; |
|---|
| 3123 | //printf("Pos=%08lx [%ld]\n", childptr, bytessaved); |
|---|
| 3124 | sfxnode = (struct SfxNode *) &pp->pp_SfxNodes[childptr]; |
|---|
| 3125 | |
|---|
| 3126 | packsize = 0; |
|---|
| 3127 | if(sfxnode->sn_AlphaMask) /* is this a normal node or the end of the tree */ |
|---|
| 3128 | { |
|---|
| 3129 | packsize = WritePackedNode(pp, childptr, &pp->pp_DiskBuffer[pp->pp_DiskPos]); |
|---|
| 3130 | pp->pp_DiskNodeCount++; |
|---|
| 3131 | pp->pp_DiskNodeSpace += packsize; |
|---|
| 3132 | #if 0 /* debug */ |
|---|
| 3133 | { |
|---|
| 3134 | struct TreeNode *tn; |
|---|
| 3135 | pp->pp_DiskTree = &pp->pp_DiskBuffer[pp->pp_DiskPos]; |
|---|
| 3136 | tn = ReadPackedNode(pp, 0); |
|---|
| 3137 | if(tn) |
|---|
| 3138 | { |
|---|
| 3139 | if(packsize != tn->tn_Size) |
|---|
| 3140 | { |
|---|
| 3141 | ULONG bc; |
|---|
| 3142 | printf("ARGH! ARGH! ARRRRGH! NodePos %08lx [%ld != %ld]\n", |
|---|
| 3143 | pos, packsize, tn->tn_Size); |
|---|
| 3144 | for(bc = 0; bc < 32; bc++) |
|---|
| 3145 | { |
|---|
| 3146 | printf("[%02x] ", pp->pp_DiskTree[bc]); |
|---|
| 3147 | } |
|---|
| 3148 | printf("\n"); |
|---|
| 3149 | } |
|---|
| 3150 | free(tn); |
|---|
| 3151 | } |
|---|
| 3152 | } |
|---|
| 3153 | #endif |
|---|
| 3154 | } else { |
|---|
| 3155 | packsize = WritePackedLeaf(pp, childptr, &pp->pp_DiskBuffer[pp->pp_DiskPos]); |
|---|
| 3156 | pp->pp_DiskLeafCount++; |
|---|
| 3157 | pp->pp_DiskLeafSpace += packsize; |
|---|
| 3158 | #if 0 /* debug */ |
|---|
| 3159 | { |
|---|
| 3160 | struct TreeNode *tn; |
|---|
| 3161 | BOOL argh; |
|---|
| 3162 | ULONG bcnt; |
|---|
| 3163 | |
|---|
| 3164 | pp->pp_DiskTree = &pp->pp_DiskBuffer[pp->pp_DiskPos]; |
|---|
| 3165 | tn = ReadPackedLeaf(pp, 0); |
|---|
| 3166 | if(tn) |
|---|
| 3167 | { |
|---|
| 3168 | argh = (packsize != tn->tn_Size); |
|---|
| 3169 | for(bcnt = 0; bcnt < tn->tn_NumLeaves; bcnt++) |
|---|
| 3170 | { |
|---|
| 3171 | if(tn->tn_Leaves[bcnt] < 0) |
|---|
| 3172 | { |
|---|
| 3173 | argh = TRUE; |
|---|
| 3174 | } |
|---|
| 3175 | } |
|---|
| 3176 | if(argh) |
|---|
| 3177 | { |
|---|
| 3178 | ULONG bc; |
|---|
| 3179 | printf("ARGH! ARGH! ARRRRGH! LeafPos %08lx [%ld != %ld]\n", |
|---|
| 3180 | pos, packsize, tn->tn_Size); |
|---|
| 3181 | for(bc = 0; bc < 32; bc++) |
|---|
| 3182 | { |
|---|
| 3183 | printf("[%02x] ", pp->pp_DiskTree[bc]); |
|---|
| 3184 | } |
|---|
| 3185 | printf("\n"); |
|---|
| 3186 | } |
|---|
| 3187 | free(tn); |
|---|
| 3188 | } |
|---|
| 3189 | } |
|---|
| 3190 | #endif |
|---|
| 3191 | } |
|---|
| 3192 | pp->pp_DiskPos += packsize; |
|---|
| 3193 | bytessaved += packsize; |
|---|
| 3194 | /* check if disk buffer is full enough to write a new chunk */ |
|---|
| 3195 | if(pp->pp_DiskPos > (pp->pp_DiskBufferSize >> 1)) |
|---|
| 3196 | { |
|---|
| 3197 | |
|---|
| 3198 | printf("."); |
|---|
| 3199 | fflush(NULL); |
|---|
| 3200 | fwrite(pp->pp_DiskBuffer, pp->pp_DiskPos, 1, pp->pp_PartitionFile); |
|---|
| 3201 | pp->pp_DiskPos = 0; |
|---|
| 3202 | } |
|---|
| 3203 | pos = sfxnode->sn_Parent; |
|---|
| 3204 | } // end while |
|---|
| 3205 | |
|---|
| 3206 | if(pp->pp_DiskPos) |
|---|
| 3207 | { |
|---|
| 3208 | printf(".\n"); |
|---|
| 3209 | fwrite(pp->pp_DiskBuffer, pp->pp_DiskPos, 1, pp->pp_PartitionFile); |
|---|
| 3210 | } |
|---|
| 3211 | pp->pp_DiskIdxSpace = ftell(pp->pp_PartitionFile); |
|---|
| 3212 | printf("%ld inner nodes (%ld KB, %f b.p.n.)\n" |
|---|
| 3213 | "%ld leaf nodes (%ld KB, %f b.p.n.)\n" |
|---|
| 3214 | "%ld leaves in array (%ld KB, %f b.p.l.)\n" |
|---|
| 3215 | "Overall %f bytes per base.\n", |
|---|
| 3216 | pp->pp_DiskNodeCount, pp->pp_DiskNodeSpace >> 10, |
|---|
| 3217 | (float) pp->pp_DiskNodeSpace / (float) pp->pp_DiskNodeCount, |
|---|
| 3218 | pp->pp_DiskLeafCount, pp->pp_DiskLeafSpace >> 10, |
|---|
| 3219 | (float) pp->pp_DiskLeafSpace / (float) pp->pp_DiskLeafCount, |
|---|
| 3220 | pp->pp_DiskOuterLeaves, pp->pp_DiskLeafSpace >> 10, |
|---|
| 3221 | (float) pp->pp_DiskLeafSpace / (float) pp->pp_DiskOuterLeaves, |
|---|
| 3222 | (float) pp->pp_DiskIdxSpace / (float) pp->pp_Size); |
|---|
| 3223 | fclose(pp->pp_PartitionFile); |
|---|
| 3224 | if(freedisk) |
|---|
| 3225 | { |
|---|
| 3226 | free(pp->pp_DiskBuffer); |
|---|
| 3227 | } |
|---|
| 3228 | |
|---|
| 3229 | pg->pg_Bench.ts_Writing += BenchTimePassed(pg); |
|---|
| 3230 | |
|---|
| 3231 | if(bytessaved != pp->pp_DiskTreeSize) |
|---|
| 3232 | { |
|---|
| 3233 | printf("ERROR: Calculated tree size did not match written data (%ld != %ld)!\n", |
|---|
| 3234 | bytessaved, pp->pp_DiskTreeSize); |
|---|
| 3235 | return(FALSE); |
|---|
| 3236 | } |
|---|
| 3237 | return(TRUE); |
|---|
| 3238 | } |
|---|
| 3239 | /* \\\ */ |
|---|
| 3240 | |
|---|
| 3241 | /* /// "CreatePartitionLookup()" */ |
|---|
| 3242 | BOOL CreatePartitionLookup(struct PTPanGlobal *pg) |
|---|
| 3243 | { |
|---|
| 3244 | ULONG cnt; |
|---|
| 3245 | ULONG len; |
|---|
| 3246 | struct PTPanPartition *pp; |
|---|
| 3247 | UWORD range; |
|---|
| 3248 | |
|---|
| 3249 | /* allocate memory for table */ |
|---|
| 3250 | len = pg->pg_PowerTable[pg->pg_MaxPrefixLen]; |
|---|
| 3251 | pg->pg_PartitionLookup = (struct PTPanPartition **) calloc(len, sizeof(struct PTPanPartition *)); |
|---|
| 3252 | if(!pg->pg_PartitionLookup) |
|---|
| 3253 | { |
|---|
| 3254 | return(FALSE); /* out of memory */ |
|---|
| 3255 | } |
|---|
| 3256 | |
|---|
| 3257 | /* create lookup table */ |
|---|
| 3258 | pp = (struct PTPanPartition *) pg->pg_Partitions.lh_Head; |
|---|
| 3259 | for(cnt = 0; cnt < len; cnt++) |
|---|
| 3260 | { |
|---|
| 3261 | if(!pp->pp_Node.ln_Succ) |
|---|
| 3262 | { |
|---|
| 3263 | break; /* end of partition list reached */ |
|---|
| 3264 | } |
|---|
| 3265 | do |
|---|
| 3266 | { |
|---|
| 3267 | range = pg->pg_PowerTable[pg->pg_MaxPrefixLen - pp->pp_PrefixLen]; |
|---|
| 3268 | if((cnt >= pp->pp_Prefix * range) && (cnt < (pp->pp_Prefix+1) * range)) |
|---|
| 3269 | { |
|---|
| 3270 | //printf("Entry %ld = prefix %ld\n", cnt, pp->pp_Prefix); |
|---|
| 3271 | pg->pg_PartitionLookup[cnt] = pp; |
|---|
| 3272 | break; |
|---|
| 3273 | } else { |
|---|
| 3274 | if(cnt >= (pp->pp_Prefix+1) * range) |
|---|
| 3275 | { |
|---|
| 3276 | /* we're past this partition prefix, get next one */ |
|---|
| 3277 | pp = (struct PTPanPartition *) pp->pp_Node.ln_Succ; |
|---|
| 3278 | if(!pp->pp_Node.ln_Succ) |
|---|
| 3279 | { |
|---|
| 3280 | break; /* end of partition list reached */ |
|---|
| 3281 | } |
|---|
| 3282 | } else { |
|---|
| 3283 | break; /* not yet found */ |
|---|
| 3284 | } |
|---|
| 3285 | } |
|---|
| 3286 | } while(TRUE); |
|---|
| 3287 | } |
|---|
| 3288 | return(TRUE); |
|---|
| 3289 | } |
|---|
| 3290 | /* \\\ */ |
|---|