1 | #include <stdio.h> |
---|
2 | #include <stdlib.h> |
---|
3 | #include <string.h> |
---|
4 | |
---|
5 | #include <adlocal.h> |
---|
6 | #include <arbdbt.h> |
---|
7 | |
---|
8 | /* -------------------------------------------------------------------------------- */ |
---|
9 | |
---|
10 | #define MAX_SEQUENCE_PER_MASTER 50 /* was 18 till May 2008 */ |
---|
11 | |
---|
12 | #if defined(DEBUG) |
---|
13 | /* don't do optimize, only create tree and save to DB */ |
---|
14 | /* #define SAVE_COMPRESSION_TREE_TO_DB */ |
---|
15 | #endif /* DEBUG */ |
---|
16 | |
---|
17 | /* -------------------------------------------------------------------------------- */ |
---|
18 | |
---|
19 | typedef struct gb_seq_compr_tree { |
---|
20 | #ifdef FAKE_VTAB_PTR |
---|
21 | virtualTable *dummy_virtual; /* simulate pointer to virtual-table used in AP_tree */ |
---|
22 | #endif |
---|
23 | GBT_TREE_ELEMENTS(struct gb_seq_compr_tree); |
---|
24 | |
---|
25 | int index; /* master(inner nodes) or sequence(leaf nodes) index */ |
---|
26 | int sons; /* sons with sequence or masters (in subtree) */ |
---|
27 | } GB_CTREE; |
---|
28 | |
---|
29 | typedef struct { |
---|
30 | int len; |
---|
31 | char used[256]; |
---|
32 | unsigned char *con[256]; |
---|
33 | } GB_Consensus; |
---|
34 | |
---|
35 | typedef struct { |
---|
36 | GBDATA *gbd; |
---|
37 | int master; |
---|
38 | } GB_Sequence; |
---|
39 | |
---|
40 | typedef struct { |
---|
41 | GBDATA *gbd; |
---|
42 | int master; |
---|
43 | } GB_Master; |
---|
44 | |
---|
45 | /* -------------------------------------------------------------------------------- */ |
---|
46 | |
---|
47 | GB_Consensus *g_b_new_Consensus(long len){ |
---|
48 | GB_Consensus *gcon = (GB_Consensus *)GB_calloc(sizeof(GB_Consensus),1); |
---|
49 | int i; |
---|
50 | unsigned char *data = (unsigned char *)GB_calloc(sizeof(char)*256,len); |
---|
51 | gcon->len = len; |
---|
52 | |
---|
53 | for (i=0;i<256;i++){ |
---|
54 | gcon->con[i] = data + len*i; |
---|
55 | } |
---|
56 | return gcon; |
---|
57 | } |
---|
58 | |
---|
59 | |
---|
60 | void g_b_delete_Consensus(GB_Consensus *gcon){ |
---|
61 | free((char *)gcon->con[0]); |
---|
62 | free((char *)gcon); |
---|
63 | } |
---|
64 | |
---|
65 | |
---|
66 | void g_b_Consensus_add(GB_Consensus *gcon, unsigned char *seq, long seq_len){ |
---|
67 | int i; |
---|
68 | int li; |
---|
69 | int c; |
---|
70 | unsigned char *s; |
---|
71 | int last; |
---|
72 | unsigned char *p; |
---|
73 | int eq_count; |
---|
74 | const int max_priority = 255/MAX_SEQUENCE_PER_MASTER; /* No overflow possible */ |
---|
75 | |
---|
76 | ad_assert(max_priority >= 1); |
---|
77 | |
---|
78 | if (seq_len > gcon->len) seq_len = gcon->len; |
---|
79 | |
---|
80 | /* Search for runs */ |
---|
81 | s = seq; |
---|
82 | last = 0; |
---|
83 | eq_count = 0; |
---|
84 | for (li = i = 0; i < seq_len; i++ ){ |
---|
85 | c = *(s++); |
---|
86 | if (c == last){ |
---|
87 | continue; |
---|
88 | }else{ |
---|
89 | inc_hits: |
---|
90 | eq_count = i-li; |
---|
91 | gcon->used[c] = 1; |
---|
92 | p = gcon->con[last]; |
---|
93 | last = c; |
---|
94 | if (eq_count <= GB_RUNLENGTH_SIZE) { |
---|
95 | c = max_priority; |
---|
96 | while (li < i) p[li++] += c; |
---|
97 | }else{ |
---|
98 | c = max_priority * ( GB_RUNLENGTH_SIZE ) / eq_count; |
---|
99 | if (c){ |
---|
100 | while (li < i) p[li++] += c; |
---|
101 | }else{ |
---|
102 | while (li < i) p[li++] |= 1; |
---|
103 | } |
---|
104 | } |
---|
105 | } |
---|
106 | } |
---|
107 | if (li<seq_len){ |
---|
108 | c = last; |
---|
109 | i = seq_len; |
---|
110 | goto inc_hits; |
---|
111 | } |
---|
112 | } |
---|
113 | |
---|
114 | char *g_b_Consensus_get_sequence(GB_Consensus *gcon){ |
---|
115 | int pos; |
---|
116 | unsigned char *s; |
---|
117 | unsigned char *max = (unsigned char *)GB_calloc(sizeof(char), gcon->len); |
---|
118 | int c; |
---|
119 | char *seq = (char *)GB_calloc(sizeof(char),gcon->len+1); |
---|
120 | |
---|
121 | memset(seq,'@',gcon->len); |
---|
122 | |
---|
123 | for (c = 1; c<256;c++){ /* Find maximum frequency of non run */ |
---|
124 | if (!gcon->used[c]) continue; |
---|
125 | s = gcon->con[c]; |
---|
126 | for (pos= 0;pos<gcon->len;pos++){ |
---|
127 | if (*s > max[pos]) { |
---|
128 | max[pos] = *s; |
---|
129 | seq[pos] = c; |
---|
130 | } |
---|
131 | s++; |
---|
132 | } |
---|
133 | } |
---|
134 | free((char *)max); |
---|
135 | return seq; |
---|
136 | } |
---|
137 | |
---|
138 | |
---|
139 | int g_b_count_leafs(GB_CTREE *node){ |
---|
140 | if (node->is_leaf) return 1; |
---|
141 | node->gb_node = 0; |
---|
142 | return (g_b_count_leafs(node->leftson) + g_b_count_leafs(node->rightson)); |
---|
143 | } |
---|
144 | |
---|
145 | void g_b_put_sequences_in_container(GB_CTREE *ctree,GB_Sequence *seqs,GB_Master **masters,GB_Consensus *gcon){ |
---|
146 | if (ctree->is_leaf){ |
---|
147 | if (ctree->index >= 0) { |
---|
148 | GB_CSTR data = GB_read_char_pntr(seqs[ctree->index].gbd); |
---|
149 | long len = GB_read_string_count(seqs[ctree->index].gbd); |
---|
150 | g_b_Consensus_add(gcon,(unsigned char *)data,len); |
---|
151 | } |
---|
152 | } |
---|
153 | else if (ctree->index<0){ |
---|
154 | g_b_put_sequences_in_container(ctree->leftson,seqs,masters,gcon); |
---|
155 | g_b_put_sequences_in_container(ctree->rightson,seqs,masters,gcon); |
---|
156 | } |
---|
157 | else { |
---|
158 | GB_CSTR data = GB_read_char_pntr(masters[ctree->index]->gbd); |
---|
159 | long len = GB_read_string_count(masters[ctree->index]->gbd); |
---|
160 | g_b_Consensus_add(gcon,(unsigned char *)data,len); |
---|
161 | } |
---|
162 | } |
---|
163 | |
---|
164 | void g_b_create_master(GB_CTREE *node, GB_Sequence *seqs, GB_Master **masters, |
---|
165 | int masterCount, int *builtMasters, int my_master, |
---|
166 | const char *ali_name, long seq_len, int *aborted) |
---|
167 | { |
---|
168 | if (*aborted) { |
---|
169 | return; |
---|
170 | } |
---|
171 | |
---|
172 | if (node->is_leaf){ |
---|
173 | if (node->index >= 0) { |
---|
174 | GBDATA *gb_data = GBT_read_sequence(node->gb_node, ali_name); |
---|
175 | |
---|
176 | seqs[node->index].gbd = gb_data; |
---|
177 | seqs[node->index].master = my_master; |
---|
178 | } |
---|
179 | } |
---|
180 | else { |
---|
181 | if (node->index>=0){ |
---|
182 | masters[node->index]->master = my_master; |
---|
183 | my_master = node->index; |
---|
184 | } |
---|
185 | g_b_create_master(node->leftson,seqs,masters,masterCount,builtMasters,my_master,ali_name,seq_len, aborted); |
---|
186 | g_b_create_master(node->rightson,seqs,masters,masterCount,builtMasters,my_master,ali_name,seq_len, aborted); |
---|
187 | if (node->index>=0 && !*aborted) { /* build me */ |
---|
188 | char *data; |
---|
189 | GB_Consensus *gcon = g_b_new_Consensus(seq_len); |
---|
190 | |
---|
191 | g_b_put_sequences_in_container(node->leftson,seqs,masters,gcon); |
---|
192 | g_b_put_sequences_in_container(node->rightson,seqs,masters,gcon); |
---|
193 | |
---|
194 | data = g_b_Consensus_get_sequence(gcon); |
---|
195 | |
---|
196 | GB_write_string(masters[node->index]->gbd,data); |
---|
197 | GB_write_security_write(masters[node->index]->gbd,7); |
---|
198 | |
---|
199 | g_b_delete_Consensus(gcon); |
---|
200 | free(data); |
---|
201 | |
---|
202 | (*builtMasters)++; |
---|
203 | *aborted |= GB_status(*builtMasters/(double)masterCount); |
---|
204 | } |
---|
205 | } |
---|
206 | } |
---|
207 | |
---|
208 | /* ------------------------------------- */ |
---|
209 | /* distribute master sequences */ |
---|
210 | /* ------------------------------------- */ |
---|
211 | |
---|
212 | static void subtract_sons_from_tree(GB_CTREE *node, int subtract) { |
---|
213 | while (node) { |
---|
214 | node->sons -= subtract; |
---|
215 | node = node->father; |
---|
216 | } |
---|
217 | } |
---|
218 | |
---|
219 | static int set_masters_with_sons(GB_CTREE *node, int wantedSons, int *mcount) { |
---|
220 | if (!node->is_leaf) { |
---|
221 | if (node->sons == wantedSons) { |
---|
222 | /* insert new master */ |
---|
223 | ad_assert(node->index == -1); |
---|
224 | node->index = *mcount; |
---|
225 | (*mcount)++; |
---|
226 | |
---|
227 | subtract_sons_from_tree(node->father, node->sons-1); |
---|
228 | node->sons = 1; |
---|
229 | } |
---|
230 | else if (node->sons>wantedSons) { |
---|
231 | int lMax = set_masters_with_sons(node->leftson, wantedSons, mcount); |
---|
232 | int rMax = set_masters_with_sons(node->rightson, wantedSons, mcount); |
---|
233 | |
---|
234 | int maxSons = lMax<rMax ? rMax : lMax; |
---|
235 | if (node->sons <= MAX_SEQUENCE_PER_MASTER && node->sons>maxSons) { |
---|
236 | maxSons = node->sons; |
---|
237 | } |
---|
238 | return maxSons; |
---|
239 | } |
---|
240 | } |
---|
241 | return node->sons <= MAX_SEQUENCE_PER_MASTER ? node->sons : 0; |
---|
242 | } |
---|
243 | |
---|
244 | static int maxCompressionSteps(GB_CTREE *node) { |
---|
245 | if (node->is_leaf) { |
---|
246 | return 0; |
---|
247 | } |
---|
248 | |
---|
249 | int left = maxCompressionSteps(node->leftson); |
---|
250 | int right = maxCompressionSteps(node->rightson); |
---|
251 | |
---|
252 | #if defined(SAVE_COMPRESSION_TREE_TO_DB) |
---|
253 | freeset(node->name, 0); |
---|
254 | if (node->index2 != -1) { |
---|
255 | node->name = GBS_global_string_copy("master_%03i", node->index2); |
---|
256 | } |
---|
257 | #endif /* SAVE_COMPRESSION_TREE_TO_DB */ |
---|
258 | |
---|
259 | return (left>right ? left : right) + (node->index == -1 ? 0 : 1); |
---|
260 | } |
---|
261 | |
---|
262 | static int init_indices_and_count_sons(GB_CTREE *node, int *scount, const char *ali_name) { |
---|
263 | if (node->is_leaf){ |
---|
264 | if (node->gb_node ==0 || !GBT_read_sequence(node->gb_node,(char *)ali_name)) { |
---|
265 | node->index = -1; |
---|
266 | node->sons = 0; |
---|
267 | } |
---|
268 | else { |
---|
269 | node->index = *scount; |
---|
270 | node->sons = 1; |
---|
271 | (*scount)++; |
---|
272 | } |
---|
273 | } |
---|
274 | else { |
---|
275 | node->index = -1; |
---|
276 | node->sons = |
---|
277 | init_indices_and_count_sons(node->leftson, scount, ali_name) + |
---|
278 | init_indices_and_count_sons(node->rightson, scount, ali_name); |
---|
279 | } |
---|
280 | return node->sons; |
---|
281 | } |
---|
282 | |
---|
283 | static void distribute_masters(GB_CTREE *tree, int *mcount, int *max_masters) { |
---|
284 | int wantedSons = MAX_SEQUENCE_PER_MASTER; |
---|
285 | while (wantedSons >= 2) { |
---|
286 | int maxSons = set_masters_with_sons(tree, wantedSons, mcount); |
---|
287 | wantedSons = maxSons; |
---|
288 | } |
---|
289 | ad_assert(tree->sons == 1); |
---|
290 | |
---|
291 | ad_assert(tree->index != -1); |
---|
292 | *max_masters = maxCompressionSteps(tree); |
---|
293 | } |
---|
294 | |
---|
295 | /* -------------------------------------------------------------------------------- */ |
---|
296 | |
---|
297 | static GB_INLINE long g_b_read_number2(const unsigned char **s) { |
---|
298 | unsigned int c0,c1,c2,c3,c4; |
---|
299 | c0 = (*((*s)++)); |
---|
300 | if (c0 & 0x80){ |
---|
301 | c1 = (*((*s)++)); |
---|
302 | if (c0 & 0x40) { |
---|
303 | c2 = (*((*s)++)); |
---|
304 | if (c0 & 0x20) { |
---|
305 | c3 = (*((*s)++)); |
---|
306 | if (c0 &0x10) { |
---|
307 | c4 = (*((*s)++)); |
---|
308 | return c4 | (c3<<8) | (c2<<16) | (c1<<8); |
---|
309 | }else{ |
---|
310 | return (c3) | (c2<<8 ) | (c1<<16) | ((c0 & 0x0f)<<24); |
---|
311 | } |
---|
312 | }else{ |
---|
313 | return (c2) | (c1<<8) | ((c0 & 0x1f)<<16); |
---|
314 | } |
---|
315 | }else{ |
---|
316 | return (c1) | ((c0 & 0x3f)<<8); |
---|
317 | } |
---|
318 | }else{ |
---|
319 | return c0; |
---|
320 | } |
---|
321 | } |
---|
322 | |
---|
323 | static GB_INLINE void g_b_put_number2(int i, unsigned char **s) { |
---|
324 | int j; |
---|
325 | if (i< 0x80 ){ *((*s)++)=i;return; } |
---|
326 | if (i<0x4000) { |
---|
327 | j = (i>>8) | 0x80; |
---|
328 | *((*s)++)=j; |
---|
329 | *((*s)++)=i; |
---|
330 | }else if (i<0x200000) { |
---|
331 | j = (i>>16) | 0xC0; |
---|
332 | *((*s)++)=j; |
---|
333 | j = (i>>8); |
---|
334 | *((*s)++)=j; |
---|
335 | *((*s)++)=i; |
---|
336 | } else if (i<0x10000000) { |
---|
337 | j = (i>>24) | 0xE0; |
---|
338 | *((*s)++)=j; |
---|
339 | j = (i>>16); |
---|
340 | *((*s)++)=j; |
---|
341 | j = (i>>8); |
---|
342 | *((*s)++)=j; |
---|
343 | *((*s)++)=i; |
---|
344 | } |
---|
345 | } |
---|
346 | |
---|
347 | |
---|
348 | char *gb_compress_seq_by_master(const char *master,int master_len,int master_index, |
---|
349 | GBQUARK q, const char *seq, long seq_len, |
---|
350 | long *memsize, int old_flag){ |
---|
351 | unsigned char *buffer; |
---|
352 | int rest = 0; |
---|
353 | unsigned char *d; |
---|
354 | int i,cs,cm; |
---|
355 | int last; |
---|
356 | long len = seq_len; |
---|
357 | |
---|
358 | d = buffer = (unsigned char *)GB_give_other_buffer(seq,seq_len); |
---|
359 | |
---|
360 | if (seq_len > master_len){ |
---|
361 | rest = seq_len - master_len; |
---|
362 | len = master_len; |
---|
363 | } |
---|
364 | |
---|
365 | last = -1000; /* Convert Sequence relative to Master */ |
---|
366 | for( i = len; i>0; i--){ |
---|
367 | cm = *(master++); |
---|
368 | cs = *(seq++); |
---|
369 | if (cm==cs && cs != last){ |
---|
370 | *(d++) = 0; |
---|
371 | last = 1000; |
---|
372 | }else{ |
---|
373 | *(d++) = cs; |
---|
374 | last = cs; |
---|
375 | } |
---|
376 | } |
---|
377 | for( i = rest; i>0; i--){ |
---|
378 | *(d++) = *(seq++); |
---|
379 | } |
---|
380 | |
---|
381 | { /* Append run length compression method */ |
---|
382 | unsigned char *buffer2; |
---|
383 | unsigned char *dest2; |
---|
384 | buffer2 = dest2 = (unsigned char *)GB_give_other_buffer((char *)buffer,seq_len+100); |
---|
385 | *(dest2++) = GB_COMPRESSION_SEQUENCE | old_flag; |
---|
386 | |
---|
387 | g_b_put_number2(master_index,&dest2); /* Tags */ |
---|
388 | g_b_put_number2(q,&dest2); |
---|
389 | |
---|
390 | gb_compress_equal_bytes_2((char *)buffer,seq_len,memsize,(char *)dest2); /* append runlength compressed sequences to tags */ |
---|
391 | |
---|
392 | *memsize = *memsize + (dest2-buffer2); |
---|
393 | return (char *)buffer2; |
---|
394 | } |
---|
395 | } |
---|
396 | |
---|
397 | char *gb_compress_sequence_by_master(GBDATA *gbd,const char *master,int master_len,int master_index, |
---|
398 | GBQUARK q, const char *seq, int seq_len, long *memsize) |
---|
399 | { |
---|
400 | long size; |
---|
401 | char *is = gb_compress_seq_by_master(master,master_len,master_index,q,seq,seq_len,&size,GB_COMPRESSION_LAST); |
---|
402 | char *res = gb_compress_data(gbd,0,is,size,memsize, ~(GB_COMPRESSION_DICTIONARY|GB_COMPRESSION_SORTBYTES|GB_COMPRESSION_RUNLENGTH),GB_TRUE); |
---|
403 | return res; |
---|
404 | } |
---|
405 | |
---|
406 | static GB_ERROR compress_sequence_tree(GBDATA *gb_main, GB_CTREE *tree, const char *ali_name){ |
---|
407 | GB_ERROR error = 0; |
---|
408 | long ali_len = GBT_get_alignment_len(gb_main, ali_name); |
---|
409 | int main_clock = GB_read_clock(gb_main); |
---|
410 | |
---|
411 | GB_ERROR warning = NULL; |
---|
412 | |
---|
413 | if (ali_len<0){ |
---|
414 | warning = GBS_global_string("Skipping alignment '%s' (not a valid alignment; len=%li).", ali_name, ali_len); |
---|
415 | } |
---|
416 | else { |
---|
417 | int leafcount = g_b_count_leafs(tree); |
---|
418 | if (!leafcount) { |
---|
419 | error = "Tree is empty"; |
---|
420 | } |
---|
421 | else { |
---|
422 | /* Distribute masters in tree */ |
---|
423 | int mastercount = 0; |
---|
424 | int max_compSteps = 0; // in one branch |
---|
425 | int seqcount = 0; |
---|
426 | |
---|
427 | GB_status2("Create master sequences"); |
---|
428 | GB_status(0); |
---|
429 | |
---|
430 | init_indices_and_count_sons(tree, &seqcount, ali_name); |
---|
431 | if (!seqcount) { |
---|
432 | warning = GBS_global_string("Tree contains no sequences with data in '%s'\n" |
---|
433 | "Skipping compression for this alignment", |
---|
434 | ali_name); |
---|
435 | } |
---|
436 | else { |
---|
437 | distribute_masters(tree, &mastercount, &max_compSteps); |
---|
438 | |
---|
439 | #if defined(SAVE_COMPRESSION_TREE_TO_DB) |
---|
440 | { |
---|
441 | error = GBT_link_tree((GBT_TREE*)tree, gb_main, 0, NULL, NULL); |
---|
442 | if (!error) error = GBT_write_tree(gb_main,0,"tree_compression_new",(GBT_TREE*)tree); |
---|
443 | GB_information("Only generated compression tree (do NOT save DB anymore)"); |
---|
444 | return error; |
---|
445 | } |
---|
446 | #endif /* SAVE_COMPRESSION_TREE_TO_DB */ |
---|
447 | |
---|
448 | // detect degenerated trees |
---|
449 | { |
---|
450 | int min_masters = ((seqcount-1)/MAX_SEQUENCE_PER_MASTER)+1; |
---|
451 | int min_compSteps = 1; |
---|
452 | { |
---|
453 | int m = min_masters; |
---|
454 | while (m>1) { |
---|
455 | m = ((m-1)/MAX_SEQUENCE_PER_MASTER)+1; |
---|
456 | min_masters += m; |
---|
457 | min_compSteps++; |
---|
458 | } |
---|
459 | } |
---|
460 | |
---|
461 | int acceptable_masters = (3*min_masters)/2; // accept 50% overhead |
---|
462 | int acceptable_compSteps = 11*min_compSteps; // accept 1000% overhead |
---|
463 | |
---|
464 | if (mastercount>acceptable_masters || max_compSteps>acceptable_compSteps) { |
---|
465 | GB_warningf("Tree is ill-suited for compression (cause of deep branches)\n" |
---|
466 | " Used tree Optimal tree Overhead\n" |
---|
467 | "Compression steps %5i %5i %4i%% (speed)\n" |
---|
468 | "Master sequences %5i %5i %4i%% (size)\n" |
---|
469 | "If you like to restart with a better tree,\n" |
---|
470 | "press 'Abort' to stop compression", |
---|
471 | max_compSteps, min_compSteps, (100*max_compSteps)/min_compSteps-100, |
---|
472 | mastercount, min_masters, (100*mastercount)/min_masters-100); |
---|
473 | } |
---|
474 | } |
---|
475 | |
---|
476 | ad_assert(mastercount>0); |
---|
477 | } |
---|
478 | |
---|
479 | if (!warning) { |
---|
480 | GBDATA *gb_master_ali = 0; |
---|
481 | GBDATA *old_gb_master_ali = 0; |
---|
482 | GB_Sequence *seqs = 0; |
---|
483 | GB_MAIN_TYPE *Main = GB_MAIN(gb_main); |
---|
484 | GBQUARK ali_quark = gb_key_2_quark(Main,ali_name); |
---|
485 | unsigned long long sumorg = 0; |
---|
486 | unsigned long long sumold = 0; |
---|
487 | unsigned long long sumnew = 0; |
---|
488 | GB_Master **masters = (GB_Master **)GB_calloc(sizeof(*masters),leafcount); |
---|
489 | int si; |
---|
490 | |
---|
491 | { |
---|
492 | char *masterfoldername = GBS_global_string_copy("%s/@master_data/@%s",GB_SYSTEM_FOLDER,ali_name); |
---|
493 | old_gb_master_ali = GB_search(gb_main, masterfoldername,GB_FIND); |
---|
494 | free(masterfoldername); |
---|
495 | } |
---|
496 | |
---|
497 | // create masters |
---|
498 | if (!error) { |
---|
499 | { |
---|
500 | char *master_data_name = GBS_global_string_copy("%s/@master_data",GB_SYSTEM_FOLDER); |
---|
501 | char *master_name = GBS_global_string_copy("@%s",ali_name); |
---|
502 | GBDATA *gb_master_data = gb_search(gb_main, master_data_name,GB_CREATE_CONTAINER,1); |
---|
503 | |
---|
504 | /* create a master container, the old is deleted as soon as all sequences are compressed by the new method*/ |
---|
505 | gb_master_ali = gb_create_container(gb_master_data, master_name); |
---|
506 | GB_write_security_delete(gb_master_ali,7); |
---|
507 | |
---|
508 | free(master_name); |
---|
509 | free(master_data_name); |
---|
510 | } |
---|
511 | for (si = 0; si<mastercount; si++) { |
---|
512 | masters[si] = (GB_Master *)GB_calloc(sizeof(GB_Master),1); |
---|
513 | masters[si]->gbd = gb_create(gb_master_ali,"@master",GB_STRING); |
---|
514 | } |
---|
515 | seqs = (GB_Sequence *)GB_calloc(sizeof(*seqs),leafcount); |
---|
516 | { |
---|
517 | int builtMasters = 0; |
---|
518 | int aborted = 0; |
---|
519 | GB_status2("Building %i master sequences", mastercount); |
---|
520 | g_b_create_master(tree,seqs,masters,mastercount,&builtMasters,-1,ali_name,ali_len, &aborted); |
---|
521 | if (aborted) error = "User abort"; |
---|
522 | } |
---|
523 | } |
---|
524 | |
---|
525 | /* Compress sequences in tree */ |
---|
526 | if (!error) { |
---|
527 | GB_status2("Compressing %i sequences in tree", seqcount); |
---|
528 | GB_status(0); |
---|
529 | |
---|
530 | for (si=0;si<seqcount;si++){ |
---|
531 | int mi = seqs[si].master; |
---|
532 | GB_Master *master = masters[mi]; |
---|
533 | GBDATA *gbd = seqs[si].gbd; |
---|
534 | |
---|
535 | if (GB_read_clock(gbd) >= main_clock){ |
---|
536 | GB_warning("A species seems to be more than once in the tree"); |
---|
537 | } |
---|
538 | else { |
---|
539 | char *seq = GB_read_string(gbd); |
---|
540 | int seq_len = GB_read_string_count(gbd); |
---|
541 | long sizen = GB_read_memuse(gbd); |
---|
542 | char *seqm = GB_read_string(master->gbd); |
---|
543 | int master_len = GB_read_string_count(master->gbd); |
---|
544 | long sizes; |
---|
545 | char *ss = gb_compress_sequence_by_master(gbd,seqm,master_len,mi,ali_quark,seq,seq_len,&sizes); |
---|
546 | |
---|
547 | gb_write_compressed_pntr(gbd,ss,sizes,seq_len); |
---|
548 | sizes = GB_read_memuse(gbd); // check real usage |
---|
549 | |
---|
550 | sumnew += sizes; |
---|
551 | sumold += sizen; |
---|
552 | sumorg += seq_len; |
---|
553 | |
---|
554 | free(seqm); |
---|
555 | free(seq); |
---|
556 | } |
---|
557 | |
---|
558 | if (GB_status((si+1)/(double)seqcount)) { |
---|
559 | error = "User abort"; |
---|
560 | break; |
---|
561 | } |
---|
562 | } |
---|
563 | } |
---|
564 | |
---|
565 | /* Compress rest of sequences */ |
---|
566 | if (!error) { |
---|
567 | int pass; /* pass 1 : count species to compress, pass 2 : compress species */ |
---|
568 | int speciesNotInTree = 0; |
---|
569 | |
---|
570 | GB_status2("Compressing sequences NOT in tree"); |
---|
571 | GB_status(0); |
---|
572 | |
---|
573 | for (pass = 1; pass <= 2; ++pass) { |
---|
574 | GBDATA *gb_species_data = GB_search(gb_main,"species_data",GB_CREATE_CONTAINER); |
---|
575 | GBDATA *gb_species; |
---|
576 | int count = 0; |
---|
577 | |
---|
578 | for (gb_species = GBT_first_species_rel_species_data(gb_species_data); |
---|
579 | gb_species; |
---|
580 | gb_species = GBT_next_species(gb_species)) |
---|
581 | { |
---|
582 | GBDATA *gbd = GBT_read_sequence(gb_species,ali_name); |
---|
583 | |
---|
584 | if(!gbd) continue; |
---|
585 | if (GB_read_clock(gbd) >= main_clock) continue; /* Compress only those which are not compressed by masters */ |
---|
586 | count++; |
---|
587 | if (pass == 2) { |
---|
588 | char *data = GB_read_string(gbd); |
---|
589 | long seq_len = GB_read_string_count(gbd); |
---|
590 | long size = GB_read_memuse(gbd); |
---|
591 | |
---|
592 | GB_write_string(gbd,""); |
---|
593 | GB_write_string(gbd,data); |
---|
594 | free(data); |
---|
595 | |
---|
596 | sumold += size; |
---|
597 | |
---|
598 | size = GB_read_memuse(gbd); |
---|
599 | sumnew += size; |
---|
600 | sumorg += seq_len; |
---|
601 | |
---|
602 | if (GB_status(count/(double)speciesNotInTree)) { |
---|
603 | error = "User abort"; |
---|
604 | break; |
---|
605 | } |
---|
606 | } |
---|
607 | } |
---|
608 | if (pass == 1) { |
---|
609 | speciesNotInTree = count; |
---|
610 | if (GB_status2("Compressing %i sequences NOT in tree", speciesNotInTree)) { |
---|
611 | error = "User abort"; |
---|
612 | break; |
---|
613 | } |
---|
614 | } |
---|
615 | } |
---|
616 | } |
---|
617 | |
---|
618 | if (!error) { |
---|
619 | GB_status2("Compressing %i master-sequences", mastercount); |
---|
620 | GB_status(0); |
---|
621 | |
---|
622 | /* Compress all masters */ |
---|
623 | for (si=0;si<mastercount;si++){ |
---|
624 | int mi = masters[si]->master; |
---|
625 | |
---|
626 | if (mi>0) { /* master available */ |
---|
627 | GBDATA *gbd = masters[si]->gbd; |
---|
628 | |
---|
629 | ad_assert(mi>si); /* we don't want a recursion, because we cannot uncompress sequence compressed masters, Main->gb_master_data is wrong */ |
---|
630 | |
---|
631 | if (gb_read_nr(gbd) != si) { /* Check database */ |
---|
632 | GB_internal_error("Sequence Compression: Master Index Conflict"); |
---|
633 | error = GB_export_error("Sequence Compression: Master Index Conflict"); |
---|
634 | break; |
---|
635 | } |
---|
636 | |
---|
637 | { |
---|
638 | GB_Master *master = masters[mi]; |
---|
639 | char *seqm = GB_read_string(master->gbd); |
---|
640 | int master_len = GB_read_string_count(master->gbd); |
---|
641 | char *seq = GB_read_string(gbd); |
---|
642 | int seq_len = GB_read_string_count(gbd); |
---|
643 | long sizes; |
---|
644 | char *ss = gb_compress_sequence_by_master(gbd,seqm,master_len,mi,ali_quark,seq,seq_len,&sizes); |
---|
645 | |
---|
646 | gb_write_compressed_pntr(gbd,ss,sizes,seq_len); |
---|
647 | sumnew += sizes; |
---|
648 | |
---|
649 | free(seq); |
---|
650 | free(seqm); |
---|
651 | } |
---|
652 | |
---|
653 | if (GB_status((si+1)/(double)mastercount)) { |
---|
654 | error = "User abort"; |
---|
655 | break; |
---|
656 | } |
---|
657 | } |
---|
658 | else { // count size of top master |
---|
659 | GBDATA *gbd = masters[si]->gbd; |
---|
660 | sumnew += GB_read_memuse(gbd); |
---|
661 | } |
---|
662 | } |
---|
663 | |
---|
664 | // count size of old master data |
---|
665 | if (!error) { |
---|
666 | GBDATA *gb_omaster; |
---|
667 | for (gb_omaster = GB_entry(old_gb_master_ali, "@master"); |
---|
668 | gb_omaster; |
---|
669 | gb_omaster = GB_nextEntry(gb_omaster)) |
---|
670 | { |
---|
671 | long size = GB_read_memuse(gb_omaster); |
---|
672 | sumold += size; |
---|
673 | } |
---|
674 | } |
---|
675 | |
---|
676 | if (!error) { |
---|
677 | char *sizeOrg = strdup(GBS_readable_size(sumorg)); |
---|
678 | char *sizeOld = strdup(GBS_readable_size(sumold)); |
---|
679 | char *sizeNew = strdup(GBS_readable_size(sumnew)); |
---|
680 | |
---|
681 | GB_warningf("Alignment '%s':\n" |
---|
682 | " Uncompressed data: %7s\n" |
---|
683 | " Old compressed data: %7s = %6.2f%%\n" |
---|
684 | " New compressed data: %7s = %6.2f%%", |
---|
685 | ali_name, sizeOrg, |
---|
686 | sizeOld, (100.0*sumold)/sumorg, |
---|
687 | sizeNew, (100.0*sumnew)/sumorg); |
---|
688 | |
---|
689 | free(sizeNew); |
---|
690 | free(sizeOld); |
---|
691 | free(sizeOrg); |
---|
692 | } |
---|
693 | } |
---|
694 | |
---|
695 | |
---|
696 | if (!error) { |
---|
697 | if (old_gb_master_ali){ |
---|
698 | error = GB_delete(old_gb_master_ali); |
---|
699 | } |
---|
700 | Main->keys[ali_quark].gb_master_ali = gb_master_ali; |
---|
701 | } |
---|
702 | |
---|
703 | // free data |
---|
704 | free(seqs); |
---|
705 | for (si=0;si<mastercount;si++) free(masters[si]); |
---|
706 | free(masters); |
---|
707 | } |
---|
708 | } |
---|
709 | } |
---|
710 | |
---|
711 | if (warning) GB_information(warning); |
---|
712 | |
---|
713 | return error; |
---|
714 | } |
---|
715 | |
---|
716 | /* Compress sequences, call only outside a transaction */ |
---|
717 | GB_ERROR GBT_compress_sequence_tree2(GBDATA *gb_main, const char *tree_name, const char *ali_name){ |
---|
718 | GB_ERROR error = 0; |
---|
719 | GB_CTREE *ctree; |
---|
720 | GB_UNDO_TYPE undo_type = GB_get_requested_undo_type(gb_main); |
---|
721 | GB_MAIN_TYPE *Main = GB_MAIN(gb_main); |
---|
722 | char *to_free = 0; |
---|
723 | |
---|
724 | if (Main->transaction>0){ |
---|
725 | GB_internal_error("Internal Error: Compress Sequences called during a running transaction"); |
---|
726 | return GB_export_error("Internal Error: Compress Sequences called during a running transaction"); |
---|
727 | } |
---|
728 | |
---|
729 | GB_request_undo_type(gb_main,GB_UNDO_KILL); |
---|
730 | GB_begin_transaction(gb_main); |
---|
731 | GB_push_my_security(gb_main); |
---|
732 | |
---|
733 | if (!tree_name || !strlen(tree_name)) { |
---|
734 | to_free = GBT_find_largest_tree(gb_main); |
---|
735 | tree_name = to_free; |
---|
736 | } |
---|
737 | ctree = (GB_CTREE *)GBT_read_tree(gb_main,(char *)tree_name,-sizeof(GB_CTREE)); |
---|
738 | if (!ctree) { |
---|
739 | error = GB_export_errorf("Tree %s not found in database",tree_name); |
---|
740 | } |
---|
741 | else { |
---|
742 | error = GBT_link_tree((GBT_TREE *)ctree, gb_main, GB_FALSE, 0, 0); |
---|
743 | if (!error) error = compress_sequence_tree(gb_main,ctree,ali_name); |
---|
744 | GBT_delete_tree((GBT_TREE *)ctree); |
---|
745 | } |
---|
746 | |
---|
747 | GB_pop_my_security(gb_main); |
---|
748 | if (error) { |
---|
749 | GB_abort_transaction(gb_main); |
---|
750 | } |
---|
751 | else { |
---|
752 | GB_commit_transaction(gb_main); |
---|
753 | GB_disable_quicksave(gb_main,"Database optimized"); |
---|
754 | } |
---|
755 | GB_request_undo_type(gb_main,undo_type); |
---|
756 | |
---|
757 | if (to_free) free(to_free); |
---|
758 | |
---|
759 | #if defined(SAVE_COMPRESSION_TREE_TO_DB) |
---|
760 | error = "fake error"; |
---|
761 | #endif /* SAVE_COMPRESSION_TREE_TO_DB */ |
---|
762 | |
---|
763 | return error; |
---|
764 | } |
---|
765 | |
---|
766 | void GBT_compression_test(void *dummy, GBDATA *gb_main) { |
---|
767 | GB_ERROR error = GB_begin_transaction(gb_main); |
---|
768 | char *ali_name = GBT_get_default_alignment(gb_main); |
---|
769 | char *tree_name = GBT_read_string(gb_main, "focus/tree_name"); |
---|
770 | |
---|
771 | GBUSE(dummy); |
---|
772 | if (!ali_name || !tree_name) error = GB_await_error(); |
---|
773 | |
---|
774 | error = GB_end_transaction(gb_main, error); |
---|
775 | |
---|
776 | if (!error) { |
---|
777 | printf("Recompression data in alignment '%s' using tree '%s'\n", ali_name, tree_name); |
---|
778 | error = GBT_compress_sequence_tree2(gb_main, tree_name, ali_name); |
---|
779 | } |
---|
780 | |
---|
781 | if (error) GB_warning(error); |
---|
782 | free(tree_name); |
---|
783 | free(ali_name); |
---|
784 | } |
---|
785 | |
---|
786 | |
---|
787 | |
---|
788 | /* ******************** Decompress Sequences ******************** */ |
---|
789 | |
---|
790 | char *g_b_uncompress_single_sequence_by_master(const char *s, const char *master, long size, long *new_size) { |
---|
791 | const signed char *source = (signed char *)s; |
---|
792 | char *dest; |
---|
793 | const char *m = master; |
---|
794 | unsigned int c; |
---|
795 | int j; |
---|
796 | int i; |
---|
797 | char *buffer; |
---|
798 | |
---|
799 | dest = buffer = GB_give_other_buffer((char *)source,size); |
---|
800 | |
---|
801 | for (i=size;i;) { |
---|
802 | j = *(source++); |
---|
803 | if (j>0) { /* uncompressed data block */ |
---|
804 | if (j>i) j=i; |
---|
805 | i -= j; |
---|
806 | for (;j;j--) { |
---|
807 | c = *(source++); |
---|
808 | if (!c) c = *m; |
---|
809 | *(dest++) = c; |
---|
810 | m++; |
---|
811 | } |
---|
812 | }else{ /* equal bytes compressed */ |
---|
813 | if (!j) break; /* end symbol */ |
---|
814 | if (j== -122) { |
---|
815 | j = *(source++) & 0xff; |
---|
816 | j |= ((*(source++)) <<8) &0xff00; |
---|
817 | j = -j; |
---|
818 | } |
---|
819 | c = *(source++); |
---|
820 | i += j; |
---|
821 | if (i<0) { |
---|
822 | GB_internal_error("Internal Error: Missing end in data"); |
---|
823 | j += -i; |
---|
824 | i = 0; |
---|
825 | } |
---|
826 | if (c==0) { |
---|
827 | memcpy(dest, m, -j); |
---|
828 | dest+= -j; |
---|
829 | m+= -j; |
---|
830 | } else { |
---|
831 | memset(dest, c, -j); |
---|
832 | dest+= -j; |
---|
833 | m+= -j; |
---|
834 | } |
---|
835 | } |
---|
836 | } |
---|
837 | *(dest++) = 0; /* NULL of NULL terminated string */ |
---|
838 | |
---|
839 | *new_size = dest-buffer; |
---|
840 | ad_assert(size == *new_size); // buffer overflow |
---|
841 | |
---|
842 | return buffer; |
---|
843 | } |
---|
844 | |
---|
845 | char *gb_uncompress_by_sequence(GBDATA *gbd, const char *ss,long size, GB_ERROR *error, long *new_size) { |
---|
846 | char *dest = 0; |
---|
847 | |
---|
848 | *error = 0; |
---|
849 | if (!GB_FATHER(gbd)) { |
---|
850 | *error = "Cannot uncompress this sequence: Sequence has no father"; |
---|
851 | } |
---|
852 | else { |
---|
853 | GB_MAIN_TYPE *Main = GB_MAIN(gbd); |
---|
854 | GBDATA *gb_main = (GBDATA*)Main->data; |
---|
855 | char *to_free = GB_check_out_buffer(ss); /* Remove 'ss' from memory management, otherwise load_single_key_data() may destroy it */ |
---|
856 | int index; |
---|
857 | GBQUARK quark; |
---|
858 | |
---|
859 | { |
---|
860 | const unsigned char *s = (const unsigned char *)ss; |
---|
861 | |
---|
862 | index = g_b_read_number2(&s); |
---|
863 | quark = g_b_read_number2(&s); |
---|
864 | |
---|
865 | ss = (const char *)s; |
---|
866 | } |
---|
867 | |
---|
868 | if (!Main->keys[quark].gb_master_ali){ |
---|
869 | gb_load_single_key_data(gb_main,quark); |
---|
870 | } |
---|
871 | |
---|
872 | if (!Main->keys[quark].gb_master_ali){ |
---|
873 | *error = "Cannot uncompress this sequence: Cannot find a master sequence"; |
---|
874 | } |
---|
875 | else { |
---|
876 | GBDATA *gb_master = gb_find_by_nr(Main->keys[quark].gb_master_ali,index); |
---|
877 | if (gb_master){ |
---|
878 | const char *master = GB_read_char_pntr(gb_master); /* make sure that this is not a buffer !!! */ |
---|
879 | |
---|
880 | ad_assert((GB_read_string_count(gb_master)+1) == size); // size mismatch between master and slave |
---|
881 | dest = g_b_uncompress_single_sequence_by_master(ss, master, size, new_size); |
---|
882 | } |
---|
883 | else { |
---|
884 | *error = GB_await_error(); |
---|
885 | } |
---|
886 | } |
---|
887 | free(to_free); |
---|
888 | } |
---|
889 | |
---|
890 | return dest; |
---|
891 | } |
---|