1 | #include <stdio.h> |
---|
2 | #include <stdlib.h> |
---|
3 | #include <sys/time.h> |
---|
4 | // #include <malloc.h> |
---|
5 | #include <memory.h> |
---|
6 | #include <string.h> |
---|
7 | #include <math.h> |
---|
8 | #include <PT_server.h> |
---|
9 | #include "ptpan.h" |
---|
10 | #include "pt_prototypes.h" |
---|
11 | #include <arbdbt.h> |
---|
12 | #include <BI_helix.hxx> |
---|
13 | |
---|
14 | #ifdef BENCHMARK |
---|
15 | /* /// "BenchTimePassed()" */ |
---|
16 | ULONG BenchTimePassed(struct PTPanGlobal *pg) |
---|
17 | { |
---|
18 | ULONG ms; |
---|
19 | |
---|
20 | gettimeofday(&pg->pg_Bench.ts_Now, NULL); |
---|
21 | ms = (pg->pg_Bench.ts_Now.tv_sec - pg->pg_Bench.ts_Last.tv_sec) * 1000; |
---|
22 | if(pg->pg_Bench.ts_Now.tv_usec < pg->pg_Bench.ts_Last.tv_usec) |
---|
23 | { |
---|
24 | ms -= 1000 - (pg->pg_Bench.ts_Last.tv_usec - pg->pg_Bench.ts_Now.tv_usec) / 1000; |
---|
25 | } else { |
---|
26 | ms += (pg->pg_Bench.ts_Now.tv_usec - pg->pg_Bench.ts_Last.tv_usec) / 1000; |
---|
27 | } |
---|
28 | pg->pg_Bench.ts_Last = pg->pg_Bench.ts_Now; |
---|
29 | return(ms); |
---|
30 | } |
---|
31 | /* \\\ */ |
---|
32 | |
---|
33 | /* /// "BenchOutput()" */ |
---|
34 | void BenchOutput(struct PTPanGlobal *pg) |
---|
35 | { |
---|
36 | struct PTPanPartition *pp; |
---|
37 | ULONG diskidxspace = 0; |
---|
38 | ULONG disknodecount = 0; |
---|
39 | ULONG disknodespace = 0; |
---|
40 | ULONG diskleafcount = 0; |
---|
41 | ULONG diskleafspace = 0; |
---|
42 | ULONG diskouterleaves = 0; |
---|
43 | ULONG memused; |
---|
44 | ULONG memusedmax = 0; |
---|
45 | |
---|
46 | pg->pg_Bench.ts_Last = pg->pg_Bench.ts_Init; |
---|
47 | pg->pg_Bench.ts_TotalBuild = BenchTimePassed(pg); |
---|
48 | printf("pDAT: (id np fsize tsize bufmem used 2e 5e depth pdepth plen edges ledges dictsize node# nodespc leaf# leafcnt outl)\n"); |
---|
49 | pp = (struct PTPanPartition *) pg->pg_Partitions.lh_Head; |
---|
50 | while(pp->pp_Node.ln_Succ) |
---|
51 | { |
---|
52 | diskidxspace += pp->pp_DiskIdxSpace; |
---|
53 | disknodecount += pp->pp_DiskNodeCount; |
---|
54 | disknodespace += pp->pp_DiskNodeSpace; |
---|
55 | diskleafcount += pp->pp_DiskLeafCount; |
---|
56 | diskleafspace += pp->pp_DiskLeafSpace; |
---|
57 | diskouterleaves += pp->pp_DiskOuterLeaves; |
---|
58 | memused = pp->pp_SfxMemorySize - (pp->pp_Sfx2EdgeOffset - pp->pp_SfxNEdgeOffset); |
---|
59 | if(memused > memusedmax) |
---|
60 | { |
---|
61 | memusedmax = memused; |
---|
62 | } |
---|
63 | printf("%ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %s PDAT\n", |
---|
64 | pp->pp_ID, |
---|
65 | pp->pp_Size, |
---|
66 | pp->pp_DiskIdxSpace, |
---|
67 | pp->pp_DiskTreeSize, |
---|
68 | pp->pp_SfxMemorySize, |
---|
69 | memused, |
---|
70 | pp->pp_NumSmallNodes, |
---|
71 | pp->pp_NumBigNodes, |
---|
72 | pp->pp_MaxTreeDepth, |
---|
73 | pp->pp_TreePruneDepth, |
---|
74 | pp->pp_TreePruneLength, |
---|
75 | pp->pp_EdgeCount, |
---|
76 | pp->pp_LongEdgeCount, |
---|
77 | pp->pp_LongDictRawSize, |
---|
78 | pp->pp_DiskNodeCount, |
---|
79 | pp->pp_DiskNodeSpace, |
---|
80 | pp->pp_DiskLeafCount, |
---|
81 | pp->pp_DiskLeafSpace, |
---|
82 | pp->pp_DiskOuterLeaves, |
---|
83 | pg->pg_DBName); |
---|
84 | |
---|
85 | pp = (struct PTPanPartition *) pp->pp_Node.ln_Succ; |
---|
86 | } |
---|
87 | |
---|
88 | printf("gDAT: (n s lb np t idxsize memusedmax node# nodespc leaf# leafcnt outl Total CollDB MergeDB PScan MemTree Stats LDPre LDBuild Reloc Disk)\n" |
---|
89 | "%lld %ld %ld %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %s GDAT\n", |
---|
90 | pg->pg_TotalRawSize, |
---|
91 | pg->pg_NumSpecies, |
---|
92 | pg->pg_MaxBaseLength, |
---|
93 | pg->pg_NumPartitions, |
---|
94 | pg->pg_MaxPartitionSize, |
---|
95 | diskidxspace, |
---|
96 | memusedmax, |
---|
97 | disknodecount, |
---|
98 | disknodespace, |
---|
99 | diskleafcount, |
---|
100 | diskleafspace, |
---|
101 | diskouterleaves, |
---|
102 | pg->pg_Bench.ts_TotalBuild, |
---|
103 | pg->pg_Bench.ts_CollectDB, |
---|
104 | pg->pg_Bench.ts_MergeDB, |
---|
105 | pg->pg_Bench.ts_PrefixScan, |
---|
106 | pg->pg_Bench.ts_MemTree, |
---|
107 | pg->pg_Bench.ts_TreeStats, |
---|
108 | pg->pg_Bench.ts_LongDictPre, |
---|
109 | pg->pg_Bench.ts_LongDictBuild, |
---|
110 | pg->pg_Bench.ts_Reloc, |
---|
111 | pg->pg_Bench.ts_Writing, |
---|
112 | pg->pg_DBName); |
---|
113 | }; |
---|
114 | /* \\\ */ |
---|
115 | #endif |
---|
116 | |
---|
117 | /* /// "GetSequenceRelPos()" */ |
---|
118 | /* |
---|
119 | ULONG GetSequenceRelPos(struct PTPanGlobal *pg, STRPTR srcseq, ULONG abspos) |
---|
120 | { |
---|
121 | ULONG relpos = 0; |
---|
122 | // given an absolute sequence position, search for the relative one, |
---|
123 | // e.g. abspos 2 on "-----UU-C-C" will yield 8 |
---|
124 | while(*srcseq) |
---|
125 | { |
---|
126 | if(pg->pg_SeqCodeValidTable[*srcseq++]) |
---|
127 | { |
---|
128 | if(!(abspos--)) |
---|
129 | { |
---|
130 | break; // position found |
---|
131 | } |
---|
132 | } |
---|
133 | relpos++; |
---|
134 | } |
---|
135 | return(relpos); |
---|
136 | } |
---|
137 | */ |
---|
138 | /* \\\ */ |
---|
139 | |
---|
140 | /* /// "GetSequenceAbsPos()" */ |
---|
141 | /* |
---|
142 | ULONG GetSequenceAbsPos(struct PTPanGlobal *pg, STRPTR srcseq, ULONG relpos) |
---|
143 | { |
---|
144 | ULONG abspos = 0; |
---|
145 | // given an absolute sequence position, search for the relative one, |
---|
146 | // e.g. relpos 8 on "-----UU-C-C" will yield 3 |
---|
147 | while(*srcseq && relpos--) |
---|
148 | { |
---|
149 | if(pg->pg_SeqCodeValidTable[*srcseq++]) |
---|
150 | { |
---|
151 | abspos++; |
---|
152 | } |
---|
153 | } |
---|
154 | return(abspos); |
---|
155 | } |
---|
156 | */ |
---|
157 | /* \\\ */ |
---|
158 | |
---|
159 | /* /// "CalcLengthForFilteredSequence()" */ |
---|
160 | ULONG CalcLengthForFilteredSequence(struct PTPanGlobal *pg, STRPTR srcseq) |
---|
161 | { |
---|
162 | ULONG len = 0; |
---|
163 | /* calculate size of compressed sequence */ |
---|
164 | while(*srcseq) |
---|
165 | { |
---|
166 | len += pg->pg_SeqCodeValidTable[*srcseq++]; |
---|
167 | } |
---|
168 | return(len); |
---|
169 | } |
---|
170 | /* \\\ */ |
---|
171 | |
---|
172 | /* /// "FilterSequenceTo()" */ |
---|
173 | ULONG FilterSequenceTo(struct PTPanGlobal *pg, STRPTR srcstr, STRPTR filtptr) |
---|
174 | { |
---|
175 | ULONG len = 0; |
---|
176 | UBYTE code; |
---|
177 | |
---|
178 | /* now actually filter the sequence */ |
---|
179 | while((code = *srcstr++)) |
---|
180 | { |
---|
181 | if(pg->pg_SeqCodeValidTable[code]) |
---|
182 | { |
---|
183 | /* add sequence code */ |
---|
184 | *filtptr++ = pg->pg_DecompressTable[pg->pg_CompressTable[code]]; |
---|
185 | len++; |
---|
186 | } |
---|
187 | } |
---|
188 | *filtptr = 0; |
---|
189 | return(len); |
---|
190 | } |
---|
191 | /* \\\ */ |
---|
192 | |
---|
193 | /* /// "FilterSequence()" */ |
---|
194 | STRPTR FilterSequence(struct PTPanGlobal *pg, STRPTR srcseq) |
---|
195 | { |
---|
196 | ULONG len; |
---|
197 | STRPTR filtseq; |
---|
198 | |
---|
199 | len = CalcLengthForFilteredSequence(pg, srcseq); |
---|
200 | filtseq = (STRPTR) malloc(len + 1); |
---|
201 | if(!filtseq) |
---|
202 | { |
---|
203 | return(NULL); /* out of memory */ |
---|
204 | } |
---|
205 | /* now actually compress the sequence */ |
---|
206 | len = FilterSequenceTo(pg, srcseq, filtseq); |
---|
207 | //printf("%ld bytes used.\n", len); |
---|
208 | |
---|
209 | return(filtseq); |
---|
210 | } |
---|
211 | /* \\\ */ |
---|
212 | |
---|
213 | /* /// "CompressSequenceTo()" */ |
---|
214 | ULONG CompressSequenceTo(struct PTPanGlobal *pg, STRPTR srcseq, ULONG *seqptr) |
---|
215 | { |
---|
216 | ULONG len; |
---|
217 | ULONG seqcode; |
---|
218 | UWORD cnt; |
---|
219 | ULONG pval; |
---|
220 | UBYTE code; |
---|
221 | |
---|
222 | /* now actually compress the sequence */ |
---|
223 | len = 4; |
---|
224 | cnt = 0; |
---|
225 | pval = 0; |
---|
226 | while((code = *srcseq++)) |
---|
227 | { |
---|
228 | if(pg->pg_SeqCodeValidTable[code]) |
---|
229 | { |
---|
230 | /* add sequence code */ |
---|
231 | seqcode = pg->pg_CompressTable[code]; |
---|
232 | pval *= pg->pg_AlphaSize; |
---|
233 | pval += seqcode; |
---|
234 | /* check, if storage capacity was reached? */ |
---|
235 | if(++cnt == MAXCODEFITLONG) |
---|
236 | { |
---|
237 | /* write out compressed longword (with eof bit) */ |
---|
238 | //printf("[%08lx]", pval | pg->pg_BitsMaskTable[cnt]); |
---|
239 | *seqptr++ = (pval << pg->pg_BitsShiftTable[MAXCODEFITLONG]) | pg->pg_BitsMaskTable[MAXCODEFITLONG]; |
---|
240 | len += 4; |
---|
241 | cnt = 0; |
---|
242 | pval = 0; |
---|
243 | } |
---|
244 | } |
---|
245 | } |
---|
246 | |
---|
247 | /* write pending bits (with eof bit) */ |
---|
248 | *seqptr = (pval << pg->pg_BitsShiftTable[cnt]) | pg->pg_BitsMaskTable[cnt]; |
---|
249 | //printf("[%08lx]\n", *seqptr); |
---|
250 | return(len); |
---|
251 | } |
---|
252 | /* \\\ */ |
---|
253 | |
---|
254 | /* /// "CompressSequence()" */ |
---|
255 | ULONG * CompressSequence(struct PTPanGlobal *pg, STRPTR srcseq) |
---|
256 | { |
---|
257 | ULONG len; |
---|
258 | ULONG *compseq; |
---|
259 | |
---|
260 | len = CalcLengthForFilteredSequence(pg, srcseq); |
---|
261 | //printf("compressing %s (%ld/%ld)...", srcseq, len, (ULONG) strlen(srcseq)); |
---|
262 | |
---|
263 | /* that's all we need: ceil(len/MAXCODEFITLONG) longwords */ |
---|
264 | compseq = (ULONG *) malloc(((len / MAXCODEFITLONG) + 1) * sizeof(ULONG)); |
---|
265 | if(!compseq) |
---|
266 | { |
---|
267 | return(NULL); /* out of memory */ |
---|
268 | } |
---|
269 | /* now actually compress the sequence */ |
---|
270 | len = CompressSequenceTo(pg, srcseq, compseq); |
---|
271 | //printf("%ld bytes used.\n", len); |
---|
272 | |
---|
273 | return(compseq); |
---|
274 | } |
---|
275 | /* \\\ */ |
---|
276 | |
---|
277 | /* /// "GetLengthOfCompressedSequence() */ |
---|
278 | ULONG GetLengthOfCompressedSequence(struct PTPanGlobal *pg, ULONG *seqptr) |
---|
279 | { |
---|
280 | ULONG len = 0; |
---|
281 | UWORD cnt; |
---|
282 | ULONG mask = pg->pg_BitsMaskTable[MAXCODEFITLONG]; |
---|
283 | do |
---|
284 | { |
---|
285 | if(*seqptr++ & mask) /* check, if lowest bit is set */ |
---|
286 | { |
---|
287 | len += MAXCODEFITLONG; |
---|
288 | } else { |
---|
289 | /* okay, we seem to be at the end of the compressed sequence, |
---|
290 | and we need to find out the actual size */ |
---|
291 | --seqptr; |
---|
292 | cnt = MAXCODEFITLONG; |
---|
293 | while(--cnt) |
---|
294 | { |
---|
295 | if(*seqptr & pg->pg_BitsMaskTable[cnt]) /* seems like we found it */ |
---|
296 | { |
---|
297 | len += cnt; |
---|
298 | break; |
---|
299 | } |
---|
300 | } |
---|
301 | break; |
---|
302 | } |
---|
303 | } while(TRUE); |
---|
304 | return(len); |
---|
305 | } |
---|
306 | /* \\\ */ |
---|
307 | |
---|
308 | /* /// "GetCompressedLongSize()" */ |
---|
309 | UWORD GetCompressedLongSize(struct PTPanGlobal *pg, ULONG pval) |
---|
310 | { |
---|
311 | UWORD cnt = MAXCODEFITLONG; |
---|
312 | while(!(pval & pg->pg_BitsMaskTable[cnt])) /* check, if termination bit is set */ |
---|
313 | { |
---|
314 | cnt--; |
---|
315 | } |
---|
316 | return(cnt); |
---|
317 | } |
---|
318 | /* \\\ */ |
---|
319 | |
---|
320 | /* /// "DecompressSequenceTo() */ |
---|
321 | ULONG DecompressSequenceTo(struct PTPanGlobal *pg, ULONG *seqptr, STRPTR tarseq) |
---|
322 | { |
---|
323 | ULONG len = 0; |
---|
324 | BOOL lastlong; |
---|
325 | UWORD cnt; |
---|
326 | ULONG pval; |
---|
327 | do |
---|
328 | { |
---|
329 | /* get next longword */ |
---|
330 | pval = *seqptr++; |
---|
331 | cnt = GetCompressedLongSize(pg, pval); |
---|
332 | pval >>= pg->pg_BitsShiftTable[cnt]; |
---|
333 | lastlong = (cnt < MAXCODEFITLONG); /* last longword reached? */ |
---|
334 | |
---|
335 | /* unpack compressed longword */ |
---|
336 | if(cnt) |
---|
337 | { |
---|
338 | do |
---|
339 | { |
---|
340 | *tarseq++ = pg->pg_DecompressTable[(pval / pg->pg_PowerTable[--cnt]) |
---|
341 | % pg->pg_AlphaSize]; |
---|
342 | len++; |
---|
343 | } while(cnt); |
---|
344 | } |
---|
345 | } while(!lastlong); |
---|
346 | *tarseq = 0; /* null terminate string */ |
---|
347 | |
---|
348 | return(len); |
---|
349 | } |
---|
350 | /* \\\ */ |
---|
351 | |
---|
352 | /* /// "DecompressCompressedLongTo() */ |
---|
353 | ULONG DecompressCompressedLongTo(struct PTPanGlobal *pg, ULONG pval, STRPTR tarseq) |
---|
354 | { |
---|
355 | ULONG len; |
---|
356 | UWORD cnt; |
---|
357 | |
---|
358 | len = cnt = GetCompressedLongSize(pg, pval); |
---|
359 | pval >>= pg->pg_BitsShiftTable[cnt]; |
---|
360 | /* unpack compressed longword */ |
---|
361 | do |
---|
362 | { |
---|
363 | *tarseq++ = pg->pg_DecompressTable[(pval / pg->pg_PowerTable[--cnt]) |
---|
364 | % pg->pg_AlphaSize]; |
---|
365 | } while(cnt); |
---|
366 | *tarseq = 0; /* null terminate string */ |
---|
367 | return(len); |
---|
368 | } |
---|
369 | /* \\\ */ |
---|
370 | |
---|
371 | /* /// "DecompressSequence()" */ |
---|
372 | STRPTR DecompressSequence(struct PTPanGlobal *pg, ULONG *seqptr) |
---|
373 | { |
---|
374 | ULONG len; |
---|
375 | STRPTR tarseq; |
---|
376 | /* first get length */ |
---|
377 | len = GetLengthOfCompressedSequence(pg, seqptr); |
---|
378 | |
---|
379 | /* allocate memory for uncompressed sequence */ |
---|
380 | tarseq = (STRPTR) malloc(len + 1); |
---|
381 | if(!tarseq) |
---|
382 | { |
---|
383 | return(NULL); /* out of memory */ |
---|
384 | } |
---|
385 | |
---|
386 | /* decompress sequence */ |
---|
387 | DecompressSequenceTo(pg, seqptr, tarseq); |
---|
388 | //printf("Decompressed sequence '%s'\n", tarseq); |
---|
389 | return(tarseq); |
---|
390 | } |
---|
391 | /* \\\ */ |
---|
392 | |
---|
393 | /* /// "DecompressSequencePartTo()" */ |
---|
394 | LONG DecompressSequencePartTo(struct PTPanGlobal *pg, |
---|
395 | ULONG *seqptr, ULONG seqpos, ULONG length, |
---|
396 | STRPTR tarseq) |
---|
397 | { |
---|
398 | ULONG off = seqpos / MAXCODEFITLONG; |
---|
399 | UWORD codeoff = seqpos % MAXCODEFITLONG; |
---|
400 | UWORD cnt; |
---|
401 | ULONG len = 0; |
---|
402 | ULONG pval; |
---|
403 | BOOL lastlong; |
---|
404 | BOOL first; |
---|
405 | |
---|
406 | if(!length) /* empty sequence requested? */ |
---|
407 | { |
---|
408 | *tarseq = 0; |
---|
409 | return(0); |
---|
410 | } |
---|
411 | |
---|
412 | /* decompress sequence */ |
---|
413 | first = TRUE; |
---|
414 | seqptr += off; |
---|
415 | do |
---|
416 | { |
---|
417 | /* get next longword */ |
---|
418 | pval = *seqptr++; |
---|
419 | cnt = GetCompressedLongSize(pg, pval); |
---|
420 | pval >>= pg->pg_BitsShiftTable[cnt]; |
---|
421 | lastlong = (cnt < MAXCODEFITLONG); /* last longword reached? */ |
---|
422 | |
---|
423 | if(first) /* do we need to start at a certain offset? */ |
---|
424 | { |
---|
425 | if(codeoff > cnt) /* past end of sequence? */ |
---|
426 | { |
---|
427 | break; |
---|
428 | } |
---|
429 | cnt -= codeoff; |
---|
430 | first = FALSE; |
---|
431 | } |
---|
432 | /* unpack compressed longword */ |
---|
433 | do |
---|
434 | { |
---|
435 | *tarseq++ = pg->pg_DecompressTable[(pval / pg->pg_PowerTable[--cnt]) |
---|
436 | % pg->pg_AlphaSize]; |
---|
437 | len++; |
---|
438 | length--; |
---|
439 | } while(cnt && length); |
---|
440 | } while(length && !lastlong); |
---|
441 | *tarseq = 0; /* null terminate string */ |
---|
442 | |
---|
443 | return(len); |
---|
444 | } |
---|
445 | /* \\\ */ |
---|
446 | |
---|
447 | |
---|
448 | /* /// "GetNextCharacter()" */ |
---|
449 | UBYTE GetNextCharacter(struct PTPanGlobal *pg, UBYTE* buffer, ULONG &bitpos, ULONG &count) |
---|
450 | { |
---|
451 | UBYTE character = 0xff; // return the next character of |
---|
452 | UBYTE code; // sequence or 0xff if end flag found |
---|
453 | // increase bitpos by consumed bits |
---|
454 | code = ReadBits(buffer, bitpos, 3); // set count to the number of |
---|
455 | bitpos += 3; // found characters |
---|
456 | |
---|
457 | if (code == 0x07) // end flag |
---|
458 | { |
---|
459 | return 0xff; |
---|
460 | } else if (code <= SEQCODE_T) // valid character |
---|
461 | { |
---|
462 | character = pg->pg_DecompressTable[code]; |
---|
463 | count = 1; |
---|
464 | } else if ((code == SEQCODE_DOT) || (code == SEQCODE_HYPHEN)) // '.' or '-' |
---|
465 | { // skip ... chars |
---|
466 | if (code == SEQCODE_DOT) character = '.'; |
---|
467 | if (code == SEQCODE_HYPHEN) character = '-'; |
---|
468 | |
---|
469 | code = ReadBits(buffer, bitpos, 4); |
---|
470 | if ((code >> 3) == 0x01) // 1xxx skip one |
---|
471 | { |
---|
472 | count = 1; |
---|
473 | ++bitpos; |
---|
474 | } else if ((code >> 2) == 0x01) // 01xx skip two |
---|
475 | { |
---|
476 | count = 2; |
---|
477 | bitpos += 2; |
---|
478 | } else if ((code >> 1) == 0x01) // 001x skip up to 63 |
---|
479 | { |
---|
480 | bitpos += 3; |
---|
481 | count = ReadBits(buffer, bitpos, 6); |
---|
482 | bitpos += 6; |
---|
483 | } else if ((code) == 0x01) // 0001 skip up to 1023 |
---|
484 | { |
---|
485 | bitpos += 4; |
---|
486 | count = ReadBits(buffer, bitpos, 10); |
---|
487 | bitpos += 10; |
---|
488 | } else if ((code) == 0x00) // 0000 skip up to 8191 |
---|
489 | { |
---|
490 | bitpos += 4; |
---|
491 | count = ReadBits(buffer, bitpos, 13); |
---|
492 | bitpos += 13; |
---|
493 | |
---|
494 | ULONG tmpbitpos = bitpos; // test if next char is also the same. |
---|
495 | ULONG tmpcount = count; |
---|
496 | UBYTE tmpcode = GetNextCharacter(pg, buffer, tmpbitpos, tmpcount); |
---|
497 | if (character == tmpcode) // it is -> the number of same characters |
---|
498 | { // was splitted (i.e. >8191) |
---|
499 | arb_assert(count == 8191); |
---|
500 | bitpos = tmpbitpos; // consume bits... |
---|
501 | count += tmpcount; // ...and add count |
---|
502 | } |
---|
503 | } else // |
---|
504 | { |
---|
505 | arb_assert(false); // shouldn't be possible to get to this line |
---|
506 | } |
---|
507 | } else // neither end-flag nor valid char |
---|
508 | { // nor '.' nor '-' => something went wrong |
---|
509 | arb_assert(false); |
---|
510 | } |
---|
511 | return character; |
---|
512 | } |
---|
513 | /* \\\ */ |
---|
514 | |
---|
515 | |
---|
516 | ULONG WriteManyChars(UBYTE* buffer, ULONG bitpos, BYTE c, ULONG i) |
---|
517 | { |
---|
518 | arb_assert((c == SEQCODE_DOT) || (c == SEQCODE_HYPHEN)); // only '.' and '-' are allowed |
---|
519 | while (i > 0) |
---|
520 | { |
---|
521 | bitpos = WriteBits(buffer, bitpos, c, 3); // code for character |
---|
522 | if (i == 1) |
---|
523 | { |
---|
524 | bitpos = WriteBits(buffer, bitpos, 0x01, 1); // 1 |
---|
525 | return bitpos; |
---|
526 | } |
---|
527 | if (i == 2) |
---|
528 | { |
---|
529 | bitpos = WriteBits(buffer, bitpos, 0x01, 2); // 01 |
---|
530 | return bitpos; |
---|
531 | } |
---|
532 | if (i <= 63) |
---|
533 | { |
---|
534 | bitpos = WriteBits(buffer, bitpos, 0x01, 3); // 001 |
---|
535 | bitpos = WriteBits(buffer, bitpos, (i & 0x3f), 6); // 6 bit payload (up to 63) |
---|
536 | return bitpos; |
---|
537 | } |
---|
538 | if (i <= 1023) |
---|
539 | { |
---|
540 | bitpos = WriteBits(buffer, bitpos, 0x01, 4); // 0001 |
---|
541 | bitpos = WriteBits(buffer, bitpos, (i & 0x3ff), 10); // 10 bit payload (up to 1023) |
---|
542 | return bitpos; |
---|
543 | } |
---|
544 | if (i <= 8191) |
---|
545 | { |
---|
546 | bitpos = WriteBits(buffer, bitpos, 0x00, 4); // 0000 |
---|
547 | bitpos = WriteBits(buffer, bitpos, (i & 0x1fff), 13); // 13 bit payload (up to 8191) |
---|
548 | return bitpos; |
---|
549 | } |
---|
550 | bitpos = WriteBits(buffer, bitpos, 0x00, 4); // 0000 |
---|
551 | bitpos = WriteBits(buffer, bitpos, 8191, 13); // 13 bit payload (exactly 8191) |
---|
552 | i -= 8191; |
---|
553 | } |
---|
554 | return bitpos; |
---|
555 | } |
---|
556 | |
---|
557 | |
---|
558 | /* /// "CompressSequenceWithDotsAndHyphens()" */ |
---|
559 | ULONG CompressSequenceWithDotsAndHyphens(struct PTPanGlobal *pg, struct PTPanSpecies *ps) |
---|
560 | { |
---|
561 | ULONG len = 0; // len is the count of characters inserted into ps_RawData |
---|
562 | ULONG bitpos = 0; // ...ps_RawDataSize will be set to len later |
---|
563 | UBYTE* ptr = (UBYTE*) ps->ps_SeqData; |
---|
564 | UBYTE* buffer = (UBYTE*) malloc((ps->ps_SeqDataSize * 3 / 8) + 1); // TODO: look over it and find a good |
---|
565 | if (buffer == NULL) // estimation of needed size |
---|
566 | { // TODO: what is faster, precount or |
---|
567 | // estimate and realloc? |
---|
568 | printf("Error: Could not get enough memory to compress sequences with dots and hyphens\n"); |
---|
569 | return FALSE; |
---|
570 | } |
---|
571 | while (*ptr) |
---|
572 | { |
---|
573 | arb_assert(((bitpos >> 3) + 1 < ps->ps_SeqDataSize)); |
---|
574 | if (*ptr == '.') |
---|
575 | { // found a '.' |
---|
576 | ULONG count; |
---|
577 | for (count = 0; *ptr == '.'; ++count, ++ptr) { } // count all '.' |
---|
578 | #ifdef ALLOWDOTSINMATCH |
---|
579 | if (count <= MAXDOTSINMATCH) |
---|
580 | { |
---|
581 | len += count; |
---|
582 | while (count-- > 0) // write 'count' |
---|
583 | { // times one '.' |
---|
584 | bitpos = WriteManyChars(buffer, bitpos, SEQCODE_DOT, 1); |
---|
585 | } |
---|
586 | } else bitpos = WriteManyChars(buffer, bitpos, SEQCODE_DOT, count); // write all '.' |
---|
587 | #else |
---|
588 | bitpos = WriteManyChars(buffer, bitpos, SEQCODE_DOT, count); // write all '.' |
---|
589 | #endif |
---|
590 | } else if (*ptr == '-') |
---|
591 | { // found a '-' |
---|
592 | ULONG count; |
---|
593 | for (count = 0; *ptr == '-'; ++count, ++ptr) { } // count all '-' |
---|
594 | bitpos = WriteManyChars(buffer, bitpos, SEQCODE_HYPHEN, count); // write all '-' |
---|
595 | } else if (pg->pg_SeqCodeValidTable[*ptr]) |
---|
596 | { // found a valid character |
---|
597 | UBYTE seqcode = pg->pg_CompressTable[*ptr]; |
---|
598 | arb_assert(seqcode <= SEQCODE_T); |
---|
599 | bitpos = WriteBits(buffer, bitpos, seqcode, 3); // write valid char |
---|
600 | ++ptr; |
---|
601 | ++len; |
---|
602 | } else |
---|
603 | { // found an unknown char |
---|
604 | // printf("Found an unknown char in Species Sequence - ignoring\n"); |
---|
605 | bitpos = WriteManyChars(buffer, bitpos, SEQCODE_HYPHEN, 1); // write one '-' |
---|
606 | ++ptr; |
---|
607 | } |
---|
608 | } |
---|
609 | bitpos = WriteBits(buffer, bitpos, 0x07, 3); // write end flag (111) |
---|
610 | |
---|
611 | ps->ps_SeqDataCompressedSize = bitpos; |
---|
612 | ps->ps_SeqDataCompressed = (UBYTE*) realloc(buffer, (bitpos >> 3) + 1); |
---|
613 | if (ps->ps_SeqDataCompressed == NULL) |
---|
614 | { |
---|
615 | printf("Error: Could not get enough memory to compress sequences with dots and hyphens\n"); |
---|
616 | return -1; |
---|
617 | } |
---|
618 | if (ReadBits(buffer, bitpos - 3, 3) != 0x07) |
---|
619 | { |
---|
620 | printf("Error Compressing SeqData (with '.' and '-')\tSpecies: %s\n", ps->ps_Name); |
---|
621 | return -1; |
---|
622 | } |
---|
623 | |
---|
624 | pg->pg_TotalSeqCompressedSize += ((ps->ps_SeqDataCompressedSize >> 3) + 1); // convert from bit to byte |
---|
625 | return len; |
---|
626 | } |
---|
627 | |
---|
628 | |
---|
629 | /* /// "ComplementSequence()" */ |
---|
630 | void ComplementSequence(struct PTPanGlobal *pg, STRPTR seqstr) |
---|
631 | { |
---|
632 | UBYTE code; |
---|
633 | /* flip A<->T and C<->G */ |
---|
634 | while((code = *seqstr)) |
---|
635 | { |
---|
636 | *seqstr++ = pg->pg_DecompressTable[pg->pg_ComplementTable[pg->pg_CompressTable[code]]]; |
---|
637 | } |
---|
638 | } |
---|
639 | /* \\\ */ |
---|
640 | |
---|
641 | /* /// "ReverseSequence()" */ |
---|
642 | void ReverseSequence(struct PTPanGlobal *, STRPTR seqstr) |
---|
643 | { |
---|
644 | char code; |
---|
645 | STRPTR leftptr = seqstr; |
---|
646 | STRPTR rightptr = &seqstr[strlen(seqstr)]; |
---|
647 | |
---|
648 | /* reverse the sequence string */ |
---|
649 | while(leftptr < rightptr) |
---|
650 | { |
---|
651 | code = *leftptr; |
---|
652 | *leftptr++ = *--rightptr; |
---|
653 | *rightptr = code; |
---|
654 | } |
---|
655 | } |
---|
656 | /* \\\ */ |
---|
657 | |
---|
658 | /* /// "OpenDataBase()" |
---|
659 | initially open the database and read the species data. |
---|
660 | */ |
---|
661 | BOOL OpenDataBase(struct PTPanGlobal *pg) |
---|
662 | { |
---|
663 | GB_set_verbose(); |
---|
664 | /* open the database */ |
---|
665 | if(!(pg->pg_MainDB = GB_open(pg->pg_DBName, "r"))) |
---|
666 | { |
---|
667 | printf("Error reading file %s\n", pg->pg_DBName); |
---|
668 | return(FALSE); |
---|
669 | } |
---|
670 | GB_begin_transaction(pg->pg_MainDB); |
---|
671 | /* open the species data */ |
---|
672 | if(!(pg->pg_SpeciesData = GB_find(pg->pg_MainDB, "species_data", down_level))) |
---|
673 | { |
---|
674 | printf("Database %s is empty\n", pg->pg_DBName); |
---|
675 | return(FALSE); |
---|
676 | } |
---|
677 | /* add the extended data container */ |
---|
678 | pg->pg_SaiData = GBT_get_SAI_data(pg->pg_MainDB); |
---|
679 | pg->pg_AlignmentName = GBT_get_default_alignment(pg->pg_MainDB); |
---|
680 | |
---|
681 | printf("Building PT-Server for alignment '%s'...\n", pg->pg_AlignmentName); |
---|
682 | |
---|
683 | GB_commit_transaction(pg->pg_MainDB); |
---|
684 | |
---|
685 | return(TRUE); |
---|
686 | } |
---|
687 | /* \\\ */ |
---|
688 | |
---|
689 | /* /// "LoadEcoliSequence()" */ |
---|
690 | BOOL LoadEcoliSequence(struct PTPanGlobal *pg) |
---|
691 | { |
---|
692 | GBDATA *gb_extdata; |
---|
693 | STRPTR defaultref = GBT_get_default_ref(pg->pg_MainDB); |
---|
694 | |
---|
695 | gb_extdata = GBT_find_SAI_rel_SAI_data(pg->pg_SaiData, defaultref); |
---|
696 | free(defaultref); |
---|
697 | |
---|
698 | /* free memory if previously allocated */ |
---|
699 | freeset(pg->pg_EcoliSeq, NULL); |
---|
700 | freeset(pg->pg_EcoliBaseTable, NULL); |
---|
701 | |
---|
702 | /* prepare ecoli sequence */ |
---|
703 | if(gb_extdata) |
---|
704 | { |
---|
705 | GBDATA *gb_data; |
---|
706 | gb_data = GBT_read_sequence(gb_extdata, pg->pg_AlignmentName); |
---|
707 | if(gb_data) |
---|
708 | { |
---|
709 | ULONG abspos = 0; |
---|
710 | STRPTR srcseq; |
---|
711 | ULONG *posptr; |
---|
712 | |
---|
713 | /* load sequence */ |
---|
714 | pg->pg_EcoliSeqSize = GB_read_string_count(gb_data); |
---|
715 | pg->pg_EcoliSeq = GB_read_string(gb_data); |
---|
716 | |
---|
717 | /* calculate look up table to speed up ecoli position calculation */ |
---|
718 | pg->pg_EcoliBaseTable = (ULONG *) calloc(pg->pg_EcoliSeqSize + 1, sizeof(ULONG)); |
---|
719 | if(pg->pg_EcoliBaseTable) |
---|
720 | { |
---|
721 | srcseq = pg->pg_EcoliSeq; |
---|
722 | posptr = pg->pg_EcoliBaseTable; // TODO: check if this works well |
---|
723 | while(*srcseq) // with ALLOWDOTSINMATCH |
---|
724 | { |
---|
725 | *posptr++ = abspos; |
---|
726 | if(pg->pg_SeqCodeValidTable[*srcseq++]) |
---|
727 | { |
---|
728 | abspos++; |
---|
729 | } |
---|
730 | } |
---|
731 | *posptr = abspos; |
---|
732 | return(TRUE); |
---|
733 | } else { |
---|
734 | printf("Out of memory for ecoli position table!\n"); |
---|
735 | } |
---|
736 | } |
---|
737 | } |
---|
738 | return(FALSE); |
---|
739 | } |
---|
740 | /* \\\ */ |
---|
741 | |
---|
742 | /* /// "FreeAllSpecies()" */ |
---|
743 | void FreeAllSpecies(struct PTPanGlobal *pg) |
---|
744 | { |
---|
745 | struct PTPanSpecies *ps; |
---|
746 | FlushCache(pg->pg_SpeciesCache); |
---|
747 | ps = (struct PTPanSpecies *) pg->pg_Species.lh_Head; |
---|
748 | while(ps->ps_Node.ln_Succ) |
---|
749 | { |
---|
750 | FreeCacheNode(pg->pg_SpeciesCache, ps->ps_CacheNode); |
---|
751 | Remove(&ps->ps_Node); |
---|
752 | free(ps->ps_Name); |
---|
753 | free(ps->ps_FullName); |
---|
754 | freeset(ps, (struct PTPanSpecies *) pg->pg_Species.lh_Head); |
---|
755 | } |
---|
756 | FreeBinTree(pg->pg_SpeciesBinTree); |
---|
757 | pg->pg_SpeciesBinTree = NULL; |
---|
758 | pg->pg_NumSpecies = 0; |
---|
759 | pg->pg_TotalSeqSize = 0; |
---|
760 | pg->pg_TotalSeqCompressedSize = 0; |
---|
761 | pg->pg_TotalRawSize = 0; |
---|
762 | pg->pg_TotalRawBits = 0; |
---|
763 | } |
---|
764 | /* \\\ */ |
---|
765 | |
---|
766 | /* /// "CacheSpeciesLoad()" */ |
---|
767 | BOOL CacheSpeciesLoad(struct CacheHandler *, struct PTPanSpecies *ps) |
---|
768 | { |
---|
769 | //struct PTPanGlobal *pg = (struct PTPanGlobal *pg) ch->ch_UserData; |
---|
770 | |
---|
771 | if(!ps->ps_SeqData) |
---|
772 | { |
---|
773 | /* load alignment data */ |
---|
774 | ps->ps_SeqData = GB_read_string(ps->ps_SeqDataDB); |
---|
775 | return(TRUE); |
---|
776 | } |
---|
777 | return(FALSE); |
---|
778 | } |
---|
779 | /* \\\ */ |
---|
780 | |
---|
781 | /* /// "CacheSpeciesUnload()" */ |
---|
782 | BOOL CacheSpeciesUnload(struct CacheHandler *, struct PTPanSpecies *ps) |
---|
783 | { |
---|
784 | //struct PTPanGlobal *pg = (struct PTPanGlobal *pg) ch->ch_UserData; |
---|
785 | |
---|
786 | if(ps->ps_SeqData) |
---|
787 | { |
---|
788 | /* load alignment data */ |
---|
789 | freeset(ps->ps_SeqData, NULL); |
---|
790 | return(TRUE); |
---|
791 | } |
---|
792 | return(FALSE); |
---|
793 | } |
---|
794 | /* \\\ */ |
---|
795 | |
---|
796 | /* /// "CacheSpeciesSize()" */ |
---|
797 | ULONG CacheSpeciesSize(struct CacheHandler *, struct PTPanSpecies *ps) |
---|
798 | { |
---|
799 | //struct PTPanGlobal *pg = (struct PTPanGlobal *pg) ch->ch_UserData; |
---|
800 | return(ps->ps_SeqDataSize); |
---|
801 | } |
---|
802 | /* \\\ */ |
---|
803 | |
---|
804 | /* /// "LoadSpecies()" */ |
---|
805 | BOOL LoadSpecies(struct PTPanGlobal *pg) |
---|
806 | { |
---|
807 | GBDATA *gb_species; |
---|
808 | struct PTPanSpecies *ps; |
---|
809 | ULONG ignorecount; |
---|
810 | |
---|
811 | ULONG longestali = 0; |
---|
812 | |
---|
813 | /* NOTE: This database scan should avoided. We should store all the |
---|
814 | data that's built up here in a secondary file. That way we would |
---|
815 | get rid of the loading and scanning of the sequence data in low |
---|
816 | memory mode */ |
---|
817 | |
---|
818 | /* open data base */ |
---|
819 | if(!(OpenDataBase(pg))) |
---|
820 | { |
---|
821 | printf("Failed to open database %s!\n", pg->pg_DBName); |
---|
822 | exit(1); |
---|
823 | } |
---|
824 | |
---|
825 | GB_begin_transaction(pg->pg_MainDB); |
---|
826 | |
---|
827 | /* get the ecoli reference sequence */ |
---|
828 | LoadEcoliSequence(pg); |
---|
829 | |
---|
830 | if(pg->pg_TotalRawSize) /* seems like we've already have the list */ |
---|
831 | { |
---|
832 | /* only load in alignment data */ |
---|
833 | if(pg->pg_LowMemoryMode) |
---|
834 | { |
---|
835 | GB_commit_transaction(pg->pg_MainDB); |
---|
836 | return(TRUE); |
---|
837 | } |
---|
838 | printf("Reloading alignment data...\n"); |
---|
839 | ps = (struct PTPanSpecies *) pg->pg_Species.lh_Head; |
---|
840 | while(ps->ps_Node.ln_Succ) |
---|
841 | { |
---|
842 | ps->ps_CacheNode = CacheLoadData(pg->pg_SpeciesCache, ps->ps_CacheNode, ps); |
---|
843 | ps = (struct PTPanSpecies *) ps->ps_Node.ln_Succ; |
---|
844 | } |
---|
845 | GB_commit_transaction(pg->pg_MainDB); |
---|
846 | return(TRUE); |
---|
847 | } |
---|
848 | |
---|
849 | FreeAllSpecies(pg); |
---|
850 | |
---|
851 | /* add the species to the list */ |
---|
852 | pg->pg_MaxBaseLength = 0; |
---|
853 | pg->pg_TotalSeqSize = 0; |
---|
854 | pg->pg_TotalSeqCompressedSize = 0; |
---|
855 | pg->pg_TotalRawSize = 0; |
---|
856 | pg->pg_NumSpecies = 0; |
---|
857 | ignorecount = 0; |
---|
858 | for(gb_species = GBT_first_species_rel_species_data(pg->pg_SpeciesData); |
---|
859 | gb_species; |
---|
860 | gb_species = GBT_next_species(gb_species)) |
---|
861 | { |
---|
862 | GBDATA *gb_name; |
---|
863 | GBDATA *gb_ali; |
---|
864 | GBDATA *gb_data; |
---|
865 | STRPTR spname; |
---|
866 | |
---|
867 | /* get name */ |
---|
868 | gb_name = GB_find(gb_species, "name", down_level); |
---|
869 | if(!gb_name) |
---|
870 | { |
---|
871 | ignorecount++; |
---|
872 | continue; /* huh? couldn't find the name of the species? */ |
---|
873 | } |
---|
874 | spname = GB_read_string(gb_name); |
---|
875 | |
---|
876 | /* get alignments */ |
---|
877 | gb_ali = GB_find(gb_species, pg->pg_AlignmentName, down_level); |
---|
878 | if(!gb_ali) |
---|
879 | { |
---|
880 | ignorecount++; |
---|
881 | free(spname); |
---|
882 | continue; /* too bad, no alignment information found */ |
---|
883 | } |
---|
884 | gb_data = GB_find(gb_ali, "data", down_level); |
---|
885 | if(!gb_data) |
---|
886 | { |
---|
887 | ignorecount++; |
---|
888 | fprintf(stderr, "Species '%s' has no data in '%s'\n", |
---|
889 | spname, pg->pg_AlignmentName); |
---|
890 | free(spname); |
---|
891 | continue; |
---|
892 | } |
---|
893 | |
---|
894 | /* okay, cannot fail now anymore, allocate a PTPanSpecies structure */ |
---|
895 | ps = (struct PTPanSpecies *) calloc(1, sizeof(struct PTPanSpecies)); |
---|
896 | |
---|
897 | /* write name and long name into the structure */ |
---|
898 | ps->ps_SpeciesDB = gb_species; |
---|
899 | ps->ps_SeqDataDB = gb_data; |
---|
900 | ps->ps_IsGroup = TRUE; |
---|
901 | ps->ps_Name = spname; |
---|
902 | gb_name = GB_find(gb_species, "full_name", down_level); |
---|
903 | if(gb_name) |
---|
904 | { |
---|
905 | ps->ps_FullName = GB_read_string(gb_name); |
---|
906 | } else { |
---|
907 | ps->ps_FullName = strdup(ps->ps_Name); |
---|
908 | } |
---|
909 | |
---|
910 | /* (temporarily) load in the alignment and compress it */ |
---|
911 | ps->ps_SeqDataSize = GB_read_string_count(ps->ps_SeqDataDB); |
---|
912 | ps->ps_SeqData = GB_read_string(ps->ps_SeqDataDB); |
---|
913 | |
---|
914 | if(strlen(ps->ps_SeqData) != ps->ps_SeqDataSize) |
---|
915 | { |
---|
916 | printf("%s is corrupt, ignoring!\n", ps->ps_Name); |
---|
917 | ignorecount++; |
---|
918 | FreeCacheNode(pg->pg_SpeciesCache, ps->ps_CacheNode); |
---|
919 | free(ps->ps_SeqData); |
---|
920 | free(ps->ps_Name); |
---|
921 | free(ps->ps_FullName); |
---|
922 | free(ps); |
---|
923 | continue; /* too bad, alignment was somehow corrupt */ |
---|
924 | } |
---|
925 | |
---|
926 | #if 0 /* not required anymore */ |
---|
927 | if(pg->pg_LowMemoryMode) /* free memory in low memory case */ |
---|
928 | { |
---|
929 | CacheUnloadData(pg->pg_SpeciesCache, ps->ps_CacheNode); |
---|
930 | } |
---|
931 | #endif |
---|
932 | |
---|
933 | ps->ps_RawDataSize = CompressSequenceWithDotsAndHyphens(pg, ps); |
---|
934 | freeset(ps->ps_SeqData, NULL); |
---|
935 | if (ps->ps_RawDataSize < 0) // TODO: problem, ps_RawDataSize is unsigned... |
---|
936 | { |
---|
937 | printf("%s is corrupt, ignoring!\n", ps->ps_Name); |
---|
938 | ignorecount++; |
---|
939 | FreeCacheNode(pg->pg_SpeciesCache, ps->ps_CacheNode); |
---|
940 | free(ps->ps_Name); |
---|
941 | free(ps->ps_FullName); |
---|
942 | free(ps); |
---|
943 | continue; |
---|
944 | } |
---|
945 | |
---|
946 | /* enter global absolute offset in index */ |
---|
947 | ps->ps_AbsOffset = pg->pg_TotalRawSize; |
---|
948 | ps->ps_Node.ln_Pri = ps->ps_AbsOffset; |
---|
949 | pg->pg_TotalSeqSize += ps->ps_SeqDataSize; |
---|
950 | pg->pg_TotalRawSize += ps->ps_RawDataSize; |
---|
951 | if(ps->ps_RawDataSize > pg->pg_MaxBaseLength) |
---|
952 | { |
---|
953 | pg->pg_MaxBaseLength = ps->ps_RawDataSize; |
---|
954 | } |
---|
955 | if(ps->ps_SeqDataSize > longestali) |
---|
956 | { |
---|
957 | longestali = ps->ps_SeqDataSize; |
---|
958 | } |
---|
959 | /* Init complete, now add it to the list */ |
---|
960 | //printf("Added %s ('%s')...\n", ps->ps_Name, ps->ps_FullName); |
---|
961 | AddTail(&pg->pg_Species, &ps->ps_Node); |
---|
962 | pg->pg_NumSpecies++; |
---|
963 | |
---|
964 | /* visual feedback */ |
---|
965 | if((pg->pg_NumSpecies % 10) == 0) |
---|
966 | { |
---|
967 | if(pg->pg_NumSpecies % 500) |
---|
968 | { |
---|
969 | printf("."); |
---|
970 | fflush(stdout); |
---|
971 | } else { |
---|
972 | printf(".%6ld (%6lld KB)\n", pg->pg_NumSpecies, (ps->ps_AbsOffset >> 10)); |
---|
973 | } |
---|
974 | } |
---|
975 | } |
---|
976 | |
---|
977 | /* calculate bits usage */ |
---|
978 | pg->pg_TotalRawBits = 8; |
---|
979 | while((1UL << pg->pg_TotalRawBits) < pg->pg_TotalRawSize) |
---|
980 | { |
---|
981 | pg->pg_TotalRawBits++; |
---|
982 | } |
---|
983 | |
---|
984 | /* build tree to find species quicker by raw position */ |
---|
985 | pg->pg_SpeciesBinTree = BuildBinTree(&pg->pg_Species); |
---|
986 | |
---|
987 | printf("\nLongest sequence was %ld bases (alignment size %ld).\n\n", |
---|
988 | pg->pg_MaxBaseLength, longestali); |
---|
989 | printf("Database contains %ld valid species (%ld ignored).\n" |
---|
990 | "%lld bytes alignment data (%lld bases).\n", |
---|
991 | pg->pg_NumSpecies, ignorecount, pg->pg_TotalSeqSize, pg->pg_TotalRawSize); |
---|
992 | |
---|
993 | printf("Compressed sequence data (with dots and hyphens): %llu byte (%llu kb, %llu mb)\n", |
---|
994 | pg->pg_TotalSeqCompressedSize, pg->pg_TotalSeqCompressedSize >> 10, pg->pg_TotalSeqCompressedSize >> 20); |
---|
995 | |
---|
996 | |
---|
997 | pg->pg_Bench.ts_CollectDB = BenchTimePassed(pg); |
---|
998 | |
---|
999 | /* done! */ |
---|
1000 | GB_commit_transaction(pg->pg_MainDB); |
---|
1001 | return(TRUE); |
---|
1002 | } |
---|
1003 | /* \\\ */ |
---|
1004 | |
---|
1005 | /* /// "LoadIndexHeader()" */ |
---|
1006 | BOOL LoadIndexHeader(struct PTPanGlobal *pg) |
---|
1007 | { |
---|
1008 | FILE *fh; |
---|
1009 | struct PTPanSpecies *ps; |
---|
1010 | struct PTPanPartition *pp; |
---|
1011 | ULONG numspec; |
---|
1012 | ULONG ignorecount; |
---|
1013 | ULONG endian = 0; |
---|
1014 | UWORD version = 0; |
---|
1015 | UWORD cnt; |
---|
1016 | char idstr[16]; |
---|
1017 | |
---|
1018 | FreeAllSpecies(pg); |
---|
1019 | FreeAllPartitions(pg); |
---|
1020 | |
---|
1021 | /* Does similar things as LoadSpecies() */ |
---|
1022 | if(!(fh = fopen(pg->pg_IndexName, "r"))) |
---|
1023 | { |
---|
1024 | printf("ERROR: Couldn't open index %s!\n", pg->pg_IndexName); |
---|
1025 | return(FALSE); |
---|
1026 | } |
---|
1027 | |
---|
1028 | /* read id string */ |
---|
1029 | fread(idstr, 16, 1, fh); |
---|
1030 | if(strncmp("TUM PeTerPAN IDX", idstr, 16)) |
---|
1031 | { |
---|
1032 | printf("ERROR: This is no index file!\n"); |
---|
1033 | fclose(fh); |
---|
1034 | return(FALSE); |
---|
1035 | } |
---|
1036 | |
---|
1037 | /* check endianness */ |
---|
1038 | fread(&endian, sizeof(endian), 1, fh); |
---|
1039 | if(endian != 0x01020304) |
---|
1040 | { |
---|
1041 | printf("ERROR: Index was created on a different endian machine (%08lx)!\n", endian); |
---|
1042 | fclose(fh); |
---|
1043 | return(FALSE); |
---|
1044 | } |
---|
1045 | |
---|
1046 | /* check file structure version */ |
---|
1047 | fread(&version, sizeof(version), 1, fh); |
---|
1048 | if(version != FILESTRUCTVERSION) |
---|
1049 | { |
---|
1050 | printf("ERROR: Index (V%d.%d) does not match current file structure version (V%d.%d)!\n", |
---|
1051 | version >> 8, version & 0xff, |
---|
1052 | FILESTRUCTVERSION >> 8, FILESTRUCTVERSION & 0xff); |
---|
1053 | fclose(fh); |
---|
1054 | return(FALSE); |
---|
1055 | } |
---|
1056 | |
---|
1057 | /* read the rest of the important data */ |
---|
1058 | fread(&pg->pg_UseStdSfxTree, sizeof(pg->pg_UseStdSfxTree), 1, fh); |
---|
1059 | fread(&pg->pg_AlphaSize , sizeof(pg->pg_AlphaSize) , 1, fh); |
---|
1060 | fread(&pg->pg_TotalSeqSize , sizeof(pg->pg_TotalSeqSize) , 1, fh); |
---|
1061 | fread(&pg->pg_TotalSeqCompressedSize, sizeof(pg->pg_TotalSeqCompressedSize) , 1, fh); |
---|
1062 | fread(&pg->pg_TotalRawSize , sizeof(pg->pg_TotalRawSize) , 1, fh); |
---|
1063 | fread(&pg->pg_TotalRawBits , sizeof(pg->pg_TotalRawBits) , 1, fh); |
---|
1064 | fread(&pg->pg_AllHashSum , sizeof(pg->pg_AllHashSum) , 1, fh); |
---|
1065 | fread(&pg->pg_NumSpecies , sizeof(pg->pg_NumSpecies) , 1, fh); |
---|
1066 | fread(&pg->pg_NumPartitions, sizeof(pg->pg_NumPartitions), 1, fh); |
---|
1067 | fread(&pg->pg_MaxPrefixLen , sizeof(pg->pg_MaxPrefixLen) , 1, fh); |
---|
1068 | |
---|
1069 | // read Ecoli Sequence |
---|
1070 | /* free memory if previously allocated */ |
---|
1071 | freeset(pg->pg_EcoliSeq, NULL); |
---|
1072 | freeset(pg->pg_EcoliBaseTable, NULL); |
---|
1073 | |
---|
1074 | fread(&pg->pg_EcoliSeqSize, sizeof(pg->pg_EcoliSeqSize), 1, fh); |
---|
1075 | if (pg->pg_EcoliSeqSize > 0) |
---|
1076 | { // only read EcoliSeq and |
---|
1077 | pg->pg_EcoliSeq = (char*) malloc(pg->pg_EcoliSeqSize + 1); // EcoliBaseTable if we |
---|
1078 | if(!pg->pg_EcoliSeq) // fonud them earlier in |
---|
1079 | { // the build process... |
---|
1080 | printf("Out of memory allocating buffer for pg->pg_EcoliSeq!\n"); // aka if pg_EcoliSeqSize |
---|
1081 | return(FALSE); // is greater than zero |
---|
1082 | } |
---|
1083 | fread(pg->pg_EcoliSeq, 1, pg->pg_EcoliSeqSize + 1, fh); |
---|
1084 | |
---|
1085 | pg->pg_EcoliBaseTable = (ULONG *) calloc(pg->pg_EcoliSeqSize + 1, sizeof(ULONG)); |
---|
1086 | if(!pg->pg_EcoliBaseTable) |
---|
1087 | { |
---|
1088 | printf("Out of memory allocating buffer for pg->pg_EcoliBaseTable!\n"); |
---|
1089 | return(FALSE); |
---|
1090 | } |
---|
1091 | fread(pg->pg_EcoliBaseTable, sizeof(ULONG), pg->pg_EcoliSeqSize + 1, fh); |
---|
1092 | } |
---|
1093 | |
---|
1094 | /* fix partition loading routine for standard suffix tree */ |
---|
1095 | if(pg->pg_UseStdSfxTree) |
---|
1096 | { |
---|
1097 | pg->pg_PartitionCache->ch_LoadFunc = (BOOL (*)(struct CacheHandler *, APTR)) CacheStdSuffixPartitionLoad; |
---|
1098 | pg->pg_PartitionCache->ch_UnloadFunc = (BOOL (*)(struct CacheHandler *, APTR)) CacheStdSuffixPartitionUnload; |
---|
1099 | } |
---|
1100 | |
---|
1101 | /* add the species to the list */ |
---|
1102 | pg->pg_SpeciesMap = (struct PTPanSpecies **) calloc(sizeof(struct PTPanSpecies *), |
---|
1103 | pg->pg_NumSpecies); |
---|
1104 | ignorecount = 0; |
---|
1105 | numspec = 0; |
---|
1106 | while(numspec < pg->pg_NumSpecies) |
---|
1107 | { |
---|
1108 | STRPTR spname; |
---|
1109 | STRPTR filespname; |
---|
1110 | STRPTR fullname; |
---|
1111 | UWORD len; |
---|
1112 | BOOL obsolete; |
---|
1113 | |
---|
1114 | obsolete = FALSE; |
---|
1115 | fullname = NULL; |
---|
1116 | |
---|
1117 | /* get name of species on disk */ |
---|
1118 | fread(&len, sizeof(len), 1, fh); |
---|
1119 | filespname = (STRPTR) calloc(len+1, 1); |
---|
1120 | fread(filespname, len, 1, fh); |
---|
1121 | |
---|
1122 | fread(&len, sizeof(len), 1, fh); |
---|
1123 | fullname = (STRPTR) calloc(len+1, 1); |
---|
1124 | fread(fullname, len, 1, fh); |
---|
1125 | |
---|
1126 | /* okay, cannot fail now anymore, allocate a PTPanSpecies structure */ |
---|
1127 | ps = (struct PTPanSpecies *) calloc(1, sizeof(struct PTPanSpecies)); |
---|
1128 | pg->pg_SpeciesMap[numspec] = ps; |
---|
1129 | ps->ps_Num = numspec + ignorecount; |
---|
1130 | |
---|
1131 | /* write name and long name into the structure */ |
---|
1132 | ps->ps_SpeciesDB = NULL; |
---|
1133 | ps->ps_SeqDataDB = NULL; |
---|
1134 | ps->ps_IsGroup = FALSE; |
---|
1135 | ps->ps_Obsolete = obsolete; |
---|
1136 | ps->ps_Name = filespname; |
---|
1137 | ps->ps_FullName = fullname; |
---|
1138 | |
---|
1139 | /* load in the alignment information */ |
---|
1140 | fread(&ps->ps_SeqDataSize, sizeof(ps->ps_SeqDataSize), 1, fh); |
---|
1141 | fread(&ps->ps_RawDataSize, sizeof(ps->ps_RawDataSize), 1, fh); |
---|
1142 | fread(&ps->ps_AbsOffset, sizeof(ps->ps_AbsOffset), 1, fh); |
---|
1143 | fread(&ps->ps_SeqHash, sizeof(ps->ps_SeqHash), 1, fh); |
---|
1144 | fread(&ps->ps_SeqDataCompressedSize, sizeof(ps->ps_SeqDataCompressedSize), 1, fh); |
---|
1145 | ps->ps_SeqDataCompressed = (UBYTE*) malloc((ps->ps_SeqDataCompressedSize >> 3) + 1); |
---|
1146 | if(!ps->ps_SeqDataCompressed) |
---|
1147 | { |
---|
1148 | printf("Out of memory allocating buffer for compressed SeqData (with '.' and '-')!\n"); |
---|
1149 | return(FALSE); |
---|
1150 | } |
---|
1151 | fread(ps->ps_SeqDataCompressed, 1, ((ps->ps_SeqDataCompressedSize >> 3) + 1), fh); |
---|
1152 | ps->ps_Node.ln_Pri = ps->ps_AbsOffset; |
---|
1153 | |
---|
1154 | /* Init complete, now add it to the list */ |
---|
1155 | //printf("Added %s ('%s')...\n", ps->ps_Name, ps->ps_FullName); |
---|
1156 | AddTail(&pg->pg_Species, &ps->ps_Node); |
---|
1157 | numspec++; |
---|
1158 | |
---|
1159 | /* visual feedback */ |
---|
1160 | if((numspec % 20) == 0) |
---|
1161 | { |
---|
1162 | if(numspec % 1000) |
---|
1163 | { |
---|
1164 | printf("."); |
---|
1165 | fflush(stdout); |
---|
1166 | } else { |
---|
1167 | printf(".%6ld (%6lld KB)\n", numspec, (ps->ps_AbsOffset>>10)); |
---|
1168 | } |
---|
1169 | } |
---|
1170 | } |
---|
1171 | |
---|
1172 | if(numspec != pg->pg_NumSpecies) |
---|
1173 | { |
---|
1174 | printf("ERROR: Number of species has changed!\n"); |
---|
1175 | fclose(fh); |
---|
1176 | return(FALSE); |
---|
1177 | } |
---|
1178 | |
---|
1179 | /* build tree to find species quicker by raw position */ |
---|
1180 | pg->pg_SpeciesBinTree = BuildBinTree(&pg->pg_Species); |
---|
1181 | |
---|
1182 | /* build a species name hash to mark species in groups */ |
---|
1183 | pg->pg_SpeciesNameHash = GBS_create_hash(SPECIESNAMEHASHSIZE, GB_IGNORE_CASE); // TODO: JB: check if GB_IGNORE_CASE |
---|
1184 | // is right (was 0) |
---|
1185 | ps = (struct PTPanSpecies *) pg->pg_Species.lh_Head; |
---|
1186 | while(ps->ps_Node.ln_Succ) |
---|
1187 | { |
---|
1188 | GBS_write_hash(pg->pg_SpeciesNameHash, ps->ps_Name, ps->ps_Num + 1); |
---|
1189 | ps = (struct PTPanSpecies *) ps->ps_Node.ln_Succ; |
---|
1190 | } |
---|
1191 | |
---|
1192 | printf("\n\nDatabase contains %ld valid species (%ld ignored).\n" |
---|
1193 | "%lld bytes alignment data (%lld bases).\n", |
---|
1194 | pg->pg_NumSpecies, ignorecount, pg->pg_TotalSeqSize, pg->pg_TotalRawSize); |
---|
1195 | printf("Compressed sequence data (with dots and hyphens): %llu byte (%llu kb, %llu mb)\n", |
---|
1196 | pg->pg_TotalSeqCompressedSize, pg->pg_TotalSeqCompressedSize >> 10, pg->pg_TotalSeqCompressedSize >> 20); |
---|
1197 | |
---|
1198 | printf("Number of partitions: %d\n", pg->pg_NumPartitions); |
---|
1199 | |
---|
1200 | for(cnt = 0; cnt < pg->pg_NumPartitions; cnt++) |
---|
1201 | { |
---|
1202 | ULONG pcnt; |
---|
1203 | |
---|
1204 | pp = (struct PTPanPartition *) calloc(1, sizeof(struct PTPanPartition)); |
---|
1205 | if(!pp) |
---|
1206 | { |
---|
1207 | fclose(fh); |
---|
1208 | return(FALSE); /* out of memory */ |
---|
1209 | } |
---|
1210 | pp->pp_PTPanGlobal = pg; |
---|
1211 | fread(&pp->pp_ID, sizeof(pp->pp_ID), 1, fh); |
---|
1212 | fread(&pp->pp_Prefix, sizeof(pp->pp_Prefix), 1, fh); |
---|
1213 | fread(&pp->pp_PrefixLen, sizeof(pp->pp_PrefixLen), 1, fh); |
---|
1214 | fread(&pp->pp_Size, sizeof(pp->pp_Size), 1, fh); |
---|
1215 | fread(&pp->pp_RawOffset, sizeof(pp->pp_RawOffset), 1, fh); |
---|
1216 | |
---|
1217 | pp->pp_PartitionName = (STRPTR) calloc(strlen(pg->pg_IndexName) + 5, 1); |
---|
1218 | if(pg->pg_UseStdSfxTree) |
---|
1219 | { |
---|
1220 | strncpy(pp->pp_PartitionName, pg->pg_IndexName, strlen(pg->pg_IndexName) - 3); |
---|
1221 | sprintf(&pp->pp_PartitionName[strlen(pg->pg_IndexName) - 3], "sfx", |
---|
1222 | pp->pp_ID); |
---|
1223 | } else { |
---|
1224 | strncpy(pp->pp_PartitionName, pg->pg_IndexName, strlen(pg->pg_IndexName) - 2); |
---|
1225 | sprintf(&pp->pp_PartitionName[strlen(pg->pg_IndexName) - 2], "t%03ld", |
---|
1226 | pp->pp_ID); |
---|
1227 | } |
---|
1228 | /* generate partition string */ |
---|
1229 | pcnt = pp->pp_PrefixLen; |
---|
1230 | pp->pp_PrefixSeq = (STRPTR) malloc(pcnt + 1); |
---|
1231 | while(pcnt) |
---|
1232 | { |
---|
1233 | pp->pp_PrefixSeq[pp->pp_PrefixLen - pcnt] = pg->pg_DecompressTable[(pp->pp_Prefix / |
---|
1234 | pg->pg_PowerTable[pcnt - 1]) % pg->pg_AlphaSize]; |
---|
1235 | pcnt--; |
---|
1236 | } |
---|
1237 | pp->pp_PrefixSeq[pp->pp_PrefixLen] = 0; |
---|
1238 | //printf("Part %ld (%ld = '%s')\n", pp->pp_ID, pp->pp_Prefix, pp->pp_PrefixSeq); |
---|
1239 | /* partition ready, add it */ |
---|
1240 | AddTail(&pg->pg_Partitions, &pp->pp_Node); |
---|
1241 | } |
---|
1242 | fclose(fh); |
---|
1243 | |
---|
1244 | /* done! */ |
---|
1245 | return(TRUE); |
---|
1246 | } |
---|
1247 | /* \\\ */ |
---|
1248 | |
---|
1249 | /* /// "LoadAllPartitions()" */ |
---|
1250 | BOOL LoadAllPartitions(struct PTPanGlobal *pg) |
---|
1251 | { |
---|
1252 | struct PTPanPartition *pp; |
---|
1253 | |
---|
1254 | /* load in each partition */ |
---|
1255 | pp = (struct PTPanPartition *) pg->pg_Partitions.lh_Head; |
---|
1256 | while(pp->pp_Node.ln_Succ) |
---|
1257 | { |
---|
1258 | if(!(pp->pp_CacheNode = CacheLoadData(pg->pg_PartitionCache, pp->pp_CacheNode, pp))) |
---|
1259 | { |
---|
1260 | return(FALSE); |
---|
1261 | } |
---|
1262 | pp = (struct PTPanPartition *) pp->pp_Node.ln_Succ; |
---|
1263 | } |
---|
1264 | return(TRUE); |
---|
1265 | } |
---|
1266 | /* \\\ */ |
---|
1267 | |
---|
1268 | /* /// "FreeAllPartitions()" */ |
---|
1269 | void FreeAllPartitions(struct PTPanGlobal *pg) |
---|
1270 | { |
---|
1271 | struct PTPanPartition *pp; |
---|
1272 | FlushCache(pg->pg_PartitionCache); |
---|
1273 | pp = (struct PTPanPartition *) pg->pg_Partitions.lh_Head; |
---|
1274 | while(pp->pp_Node.ln_Succ) |
---|
1275 | { |
---|
1276 | FreeCacheNode(pg->pg_PartitionCache, pp->pp_CacheNode); |
---|
1277 | Remove(&pp->pp_Node); |
---|
1278 | free(pp->pp_PrefixSeq); |
---|
1279 | FreeHuffmanTree(pp->pp_BranchTree); |
---|
1280 | FreeHuffmanTree(pp->pp_ShortEdgeTree); |
---|
1281 | FreeHuffmanTree(pp->pp_LongEdgeLenTree); |
---|
1282 | freeset(pp, (struct PTPanPartition *) pg->pg_Partitions.lh_Head); |
---|
1283 | } |
---|
1284 | pg->pg_NumPartitions = 0; |
---|
1285 | } |
---|
1286 | /* \\\ */ |
---|
1287 | |
---|