1 | // =============================================================== // |
---|
2 | // // |
---|
3 | // File : PT_buildtree.cxx // |
---|
4 | // Purpose : // |
---|
5 | // // |
---|
6 | // Institute of Microbiology (Technical University Munich) // |
---|
7 | // http://www.arb-home.de/ // |
---|
8 | // // |
---|
9 | // =============================================================== // |
---|
10 | |
---|
11 | #include "probe.h" |
---|
12 | #include <PT_server_prototypes.h> |
---|
13 | #include "probe_tree.h" |
---|
14 | #include "pt_prototypes.h" |
---|
15 | #include <arb_defs.h> |
---|
16 | #include <arb_file.h> |
---|
17 | |
---|
18 | #include <arb_progress.h> |
---|
19 | |
---|
20 | #include <unistd.h> |
---|
21 | |
---|
22 | // AISC_MKPT_PROMOTE: class DataLoc; |
---|
23 | |
---|
24 | static POS_TREE *build_pos_tree(POS_TREE *const root, const DataLoc& loc) { |
---|
25 | POS_TREE *at = root; |
---|
26 | int height = 0; |
---|
27 | |
---|
28 | while (PT_read_type(at) == PT_NT_NODE) { // now we got an inner node |
---|
29 | POS_TREE *pt_next = PT_read_son_stage_1(at, loc[height]); |
---|
30 | if (!pt_next) { // there is no son of that type -> simply add the new son to that path // |
---|
31 | bool atRoot = (at == root); |
---|
32 | PT_create_leaf(&at, loc[height], loc); |
---|
33 | return atRoot ? at : root; // inside tree return old root, otherwise new root has been created |
---|
34 | } |
---|
35 | else { // go down the tree |
---|
36 | at = pt_next; |
---|
37 | height++; |
---|
38 | |
---|
39 | if (loc.is_shorther_than(height)) { |
---|
40 | // end of sequence reached -> change node to chain and add |
---|
41 | // should never be reached, because of the terminal symbol of each sequence (@@@ this IS reached - even with unittestdb) |
---|
42 | if (PT_read_type(at) == PT_NT_CHAIN) { |
---|
43 | PT_add_to_chain(at, loc); |
---|
44 | } |
---|
45 | // if type == node then forget it |
---|
46 | return root; |
---|
47 | } |
---|
48 | } |
---|
49 | } |
---|
50 | |
---|
51 | // type == leaf or chain |
---|
52 | if (PT_read_type(at) == PT_NT_CHAIN) { // old chain reached |
---|
53 | PT_add_to_chain(at, loc); |
---|
54 | return root; |
---|
55 | } |
---|
56 | |
---|
57 | // change leave to node and create two sons |
---|
58 | |
---|
59 | const DataLoc loc_ref(at); |
---|
60 | |
---|
61 | while (loc[height] == loc_ref[height]) { // creates nodes until sequences are different |
---|
62 | // type != nt_node |
---|
63 | if (PT_read_type(at) == PT_NT_CHAIN) { |
---|
64 | PT_add_to_chain(at, loc); |
---|
65 | return root; |
---|
66 | } |
---|
67 | if (height >= PT_POS_TREE_HEIGHT) { |
---|
68 | if (PT_read_type(at) == PT_NT_LEAF) { |
---|
69 | at = PT_leaf_to_chain(at); |
---|
70 | } |
---|
71 | PT_add_to_chain(at, loc); |
---|
72 | return root; |
---|
73 | } |
---|
74 | |
---|
75 | bool loc_done = loc.is_shorther_than(height+1); |
---|
76 | bool ref_done = loc_ref.is_shorther_than(height+1); |
---|
77 | |
---|
78 | if (ref_done && loc_done) return root; // end of both sequences |
---|
79 | |
---|
80 | at = PT_change_leaf_to_node(at); // change tip to node and append two new leafs |
---|
81 | if (loc_done) { // end of source sequence reached |
---|
82 | PT_create_leaf(&at, loc_ref[height], loc_ref); |
---|
83 | return root; |
---|
84 | } |
---|
85 | if (ref_done) { // end of reference sequence |
---|
86 | PT_create_leaf(&at, loc[height], loc); |
---|
87 | return root; |
---|
88 | } |
---|
89 | at = PT_create_leaf(&at, loc[height], loc_ref); // dummy leaf just to create a new node; may become a chain |
---|
90 | height++; |
---|
91 | } |
---|
92 | |
---|
93 | |
---|
94 | |
---|
95 | if (height >= PT_POS_TREE_HEIGHT) { |
---|
96 | if (PT_read_type(at) == PT_NT_LEAF) at = PT_leaf_to_chain(at); |
---|
97 | PT_add_to_chain(at, loc); |
---|
98 | return root; |
---|
99 | } |
---|
100 | if (PT_read_type(at) == PT_NT_CHAIN) { |
---|
101 | // not covered by test - but looks similar to case in top-loop |
---|
102 | PT_add_to_chain(at, loc); |
---|
103 | } |
---|
104 | else { |
---|
105 | at = PT_change_leaf_to_node(at); // delete leaf |
---|
106 | PT_create_leaf(&at, loc[height], loc); // two new leafs |
---|
107 | PT_create_leaf(&at, loc_ref[height], loc_ref); |
---|
108 | } |
---|
109 | return root; |
---|
110 | } |
---|
111 | |
---|
112 | |
---|
113 | inline void get_abs_align_pos(char *seq, int &pos) |
---|
114 | { |
---|
115 | // get the absolute alignment position |
---|
116 | int q_exists = 0; |
---|
117 | if (pos > 3) { |
---|
118 | pos-=3; |
---|
119 | while (pos > 0) { |
---|
120 | uint32_t c = *((uint32_t*)(seq+pos)); |
---|
121 | if (c == 0x2E2E2E2E) { |
---|
122 | q_exists = 1; |
---|
123 | pos-=4; |
---|
124 | continue; |
---|
125 | } |
---|
126 | if (c == 0x2D2D2D2D) { |
---|
127 | pos-=4; |
---|
128 | continue; |
---|
129 | } |
---|
130 | break; |
---|
131 | } |
---|
132 | pos+=3; |
---|
133 | } |
---|
134 | while (pos) { |
---|
135 | unsigned char c = seq[pos]; |
---|
136 | if (c == '.') { |
---|
137 | q_exists = 1; |
---|
138 | pos--; |
---|
139 | continue; |
---|
140 | } |
---|
141 | if (c == '-') { |
---|
142 | pos--; |
---|
143 | continue; |
---|
144 | } |
---|
145 | break; |
---|
146 | } |
---|
147 | pos+=q_exists; |
---|
148 | } |
---|
149 | |
---|
150 | static long PTD_save_partial_tree(FILE *out, POS_TREE * node, char *partstring, int partsize, long pos, long *ppos, ARB_ERROR& error) { |
---|
151 | if (partsize) { |
---|
152 | POS_TREE *son = PT_read_son(node, (PT_BASES)partstring[0]); |
---|
153 | if (son) { |
---|
154 | pos = PTD_save_partial_tree(out, son, partstring+1, partsize-1, pos, ppos, error); |
---|
155 | } |
---|
156 | } |
---|
157 | else { |
---|
158 | PTD_clear_fathers(node); |
---|
159 | long r_pos; |
---|
160 | int blocked; |
---|
161 | blocked = 1; |
---|
162 | while (blocked && !error) { |
---|
163 | blocked = 0; |
---|
164 | #if defined(PTM_DEBUG) |
---|
165 | printf("flushing to disk [%li]\n", pos); |
---|
166 | fflush(stdout); |
---|
167 | #endif |
---|
168 | r_pos = PTD_write_leafs_to_disk(out, node, pos, ppos, &blocked, error); |
---|
169 | if (r_pos > pos) pos = r_pos; |
---|
170 | } |
---|
171 | } |
---|
172 | return pos; |
---|
173 | } |
---|
174 | |
---|
175 | inline int ptd_string_shorter_than(const char *s, int len) { |
---|
176 | int i; |
---|
177 | for (i=0; i<len; i++) { |
---|
178 | if (!s[i]) { |
---|
179 | return 1; |
---|
180 | } |
---|
181 | } |
---|
182 | return 0; |
---|
183 | } |
---|
184 | |
---|
185 | ARB_ERROR enter_stage_1_build_tree(PT_main * , char *tname) { // __ATTR__USERESULT |
---|
186 | // initialize tree and call the build pos tree procedure |
---|
187 | |
---|
188 | ARB_ERROR error; |
---|
189 | |
---|
190 | if (unlink(tname)) { |
---|
191 | if (GB_size_of_file(tname) >= 0) { |
---|
192 | error = GBS_global_string("Cannot remove %s\n", tname); |
---|
193 | } |
---|
194 | } |
---|
195 | |
---|
196 | if (!error) { |
---|
197 | char *t2name = (char *)calloc(sizeof(char), strlen(tname) + 2); |
---|
198 | sprintf(t2name, "%s%%", tname); |
---|
199 | |
---|
200 | FILE *out = fopen(t2name, "w"); |
---|
201 | if (!out) { |
---|
202 | error = GBS_global_string("Cannot open %s\n", t2name); |
---|
203 | } |
---|
204 | else { |
---|
205 | POS_TREE *pt = NULL; |
---|
206 | |
---|
207 | { |
---|
208 | GB_ERROR sm_error = GB_set_mode_of_file(t2name, 0666); |
---|
209 | if (sm_error) { |
---|
210 | GB_warningf("%s\nOther users might get problems when they try to access this file.", sm_error); |
---|
211 | } |
---|
212 | } |
---|
213 | |
---|
214 | putc(0, out); // disable zero father |
---|
215 | long pos = 1; |
---|
216 | |
---|
217 | // now temp file exists -> trigger ptserver-selectionlist-update in all |
---|
218 | // ARB applications by writing to log |
---|
219 | GBS_add_ptserver_logentry(GBS_global_string("Calculating probe tree (%s)", tname)); |
---|
220 | |
---|
221 | psg.ptmain = PT_init(); |
---|
222 | psg.ptmain->stage1 = 1; // enter stage 1 |
---|
223 | |
---|
224 | pt = PT_create_leaf(NULL, PT_N, DataLoc(0, 0, 0)); // create main node |
---|
225 | pt = PT_change_leaf_to_node(pt); |
---|
226 | psg.stat.cut_offs = 0; // statistic information |
---|
227 | GB_begin_transaction(psg.gb_main); |
---|
228 | |
---|
229 | long last_obj = 0; |
---|
230 | char partstring[256]; |
---|
231 | int partsize = 0; |
---|
232 | int pass0 = 0; |
---|
233 | int passes = 1; |
---|
234 | |
---|
235 | { |
---|
236 | ULONG total_size = psg.char_count; |
---|
237 | |
---|
238 | printf("Overall bases: %li\n", total_size); |
---|
239 | |
---|
240 | while (1) { |
---|
241 | #ifdef ARB_64 |
---|
242 | ULONG estimated_kb = (total_size/1024)*55; // value by try and error (for gene server) |
---|
243 | // TODO: estimated_kb depends on 32/64 bit... |
---|
244 | #else |
---|
245 | ULONG estimated_kb = (total_size/1024)*35; // value by try and error; 35 bytes per base |
---|
246 | #endif |
---|
247 | printf("Estimated memory usage for %i passes: %lu k\n", passes, estimated_kb); |
---|
248 | |
---|
249 | if (estimated_kb <= physical_memory) break; |
---|
250 | |
---|
251 | total_size /= 4; |
---|
252 | partsize ++; |
---|
253 | passes *= 5; |
---|
254 | } |
---|
255 | } |
---|
256 | |
---|
257 | PT_init_base_string_counter(partstring, PT_N, partsize); |
---|
258 | arb_progress pass_progress(GBS_global_string ("Tree Build: %s in %i passes\n", GBS_readable_size(psg.char_count, "bp"), passes), |
---|
259 | passes); |
---|
260 | |
---|
261 | while (!PT_base_string_counter_eof(partstring)) { |
---|
262 | ++pass0; |
---|
263 | arb_progress data_progress(GBS_global_string("pass %i/%i", pass0, passes), psg.data_count); |
---|
264 | |
---|
265 | for (int i = 0; i < psg.data_count; i++) { |
---|
266 | int psize; |
---|
267 | char *align_abs = probe_read_alignment(i, &psize); |
---|
268 | |
---|
269 | int abs_align_pos = psize-1; |
---|
270 | for (int j = psg.data[i].get_size() - 1; j >= 0; j--, abs_align_pos--) { |
---|
271 | get_abs_align_pos(align_abs, abs_align_pos); // may result in neg. abs_align_pos (seems to happen if sequences are short < 214bp ) |
---|
272 | if (abs_align_pos < 0) break; // -> in this case abort |
---|
273 | |
---|
274 | if (partsize && (*partstring != psg.data[i].get_data()[j] || strncmp(partstring, psg.data[i].get_data()+j, partsize))) continue; |
---|
275 | if (ptd_string_shorter_than(psg.data[i].get_data()+j, 9)) continue; |
---|
276 | |
---|
277 | pt = build_pos_tree(pt, DataLoc(i, abs_align_pos, j)); |
---|
278 | } |
---|
279 | free(align_abs); |
---|
280 | |
---|
281 | ++data_progress; |
---|
282 | } |
---|
283 | pos = PTD_save_partial_tree(out, pt, partstring, partsize, pos, &last_obj, error); |
---|
284 | if (error) break; |
---|
285 | |
---|
286 | #ifdef PTM_DEBUG |
---|
287 | PTM_debug_mem(); |
---|
288 | PTD_debug_nodes(); |
---|
289 | #endif |
---|
290 | PT_inc_base_string_count(partstring, PT_N, PT_B_MAX, partsize); |
---|
291 | } |
---|
292 | |
---|
293 | if (!error) { |
---|
294 | if (partsize) { |
---|
295 | pos = PTD_save_partial_tree(out, pt, NULL, 0, pos, &last_obj, error); |
---|
296 | #ifdef PTM_DEBUG |
---|
297 | PTM_debug_mem(); |
---|
298 | PTD_debug_nodes(); |
---|
299 | #endif |
---|
300 | } |
---|
301 | } |
---|
302 | if (!error) { |
---|
303 | bool need64bit = false; // does created db need a 64bit ptserver ? |
---|
304 | #ifdef ARB_64 |
---|
305 | if (last_obj >= 0xffffffff) need64bit = true; // last_obj is bigger than int |
---|
306 | #else |
---|
307 | if (last_obj <= 0) { // overflow ? |
---|
308 | GBK_terminate("Overflow - out of memory"); |
---|
309 | } |
---|
310 | #endif |
---|
311 | |
---|
312 | // write information about database |
---|
313 | long info_pos = pos; |
---|
314 | PTD_put_int(out, PT_SERVER_MAGIC); // marker to identify PT-Server file |
---|
315 | fputc(PT_SERVER_VERSION, out); // version of PT-Server file |
---|
316 | pos += 4+1; |
---|
317 | |
---|
318 | // as last element of info block, write it's size (2byte) |
---|
319 | long info_size = pos-info_pos; |
---|
320 | PTD_put_short(out, info_size); |
---|
321 | pos += 2; |
---|
322 | |
---|
323 | // save DB footer (which is the entry point on load) |
---|
324 | |
---|
325 | if (need64bit) { // last_obj is bigger than int |
---|
326 | #ifdef ARB_64 |
---|
327 | PTD_put_longlong(out, last_obj); // write last_obj as long long (64 bit) |
---|
328 | PTD_put_int(out, 0xffffffff); // write 0xffffffff at the end to signalize 64bit ptserver is needed |
---|
329 | #else |
---|
330 | pt_assert(0); |
---|
331 | #endif |
---|
332 | } |
---|
333 | else { |
---|
334 | PTD_put_int(out, last_obj); // last_obj fits into an int -> store it as usual (compatible to old unversioned format) |
---|
335 | } |
---|
336 | } |
---|
337 | if (error) { |
---|
338 | GB_abort_transaction(psg.gb_main); |
---|
339 | fclose(out); |
---|
340 | |
---|
341 | int res = GB_unlink(t2name); |
---|
342 | if (res == -1) fputs(GB_await_error(), stderr); |
---|
343 | } |
---|
344 | else { |
---|
345 | GB_commit_transaction(psg.gb_main); |
---|
346 | fclose(out); |
---|
347 | |
---|
348 | error = GB_rename_file(t2name, tname); |
---|
349 | if (!error) { |
---|
350 | GB_ERROR sm_error = GB_set_mode_of_file(tname, 00666); |
---|
351 | if (sm_error) GB_warning(sm_error); |
---|
352 | |
---|
353 | psg.pt = pt; |
---|
354 | } |
---|
355 | } |
---|
356 | } |
---|
357 | free(t2name); |
---|
358 | } |
---|
359 | return error; |
---|
360 | } |
---|
361 | |
---|
362 | ARB_ERROR enter_stage_3_load_tree(PT_main *, const char *tname) { // __ATTR__USERESULT |
---|
363 | // load tree from disk |
---|
364 | ARB_ERROR error; |
---|
365 | |
---|
366 | psg.ptmain = PT_init(); |
---|
367 | psg.ptmain->stage3 = 1; // enter stage 3 |
---|
368 | |
---|
369 | long size = GB_size_of_file(tname); |
---|
370 | if (size<0) { |
---|
371 | error = GB_IO_error("stat", tname); |
---|
372 | } |
---|
373 | else { |
---|
374 | printf("- reading Tree %s of size %li from disk\n", tname, size); |
---|
375 | FILE *in = fopen(tname, "r"); |
---|
376 | if (!in) { |
---|
377 | error = GB_IO_error("read", tname); |
---|
378 | } |
---|
379 | else { |
---|
380 | error = PTD_read_leafs_from_disk(tname, &psg.pt); |
---|
381 | fclose(in); |
---|
382 | } |
---|
383 | } |
---|
384 | |
---|
385 | return error; |
---|
386 | } |
---|
387 | |
---|
388 | // -------------------------------------------------------------------------------- |
---|
389 | |
---|
390 | #ifdef UNIT_TESTS |
---|
391 | #ifndef TEST_UNIT_H |
---|
392 | #include <test_unit.h> |
---|
393 | #endif |
---|
394 | |
---|
395 | void NOTEST_SLOW_maybe_build_tree() { |
---|
396 | // does only test sth if DB is present. |
---|
397 | |
---|
398 | const char *dbarg = "-D" "extra_pt_src.arb"; |
---|
399 | const char *testDB = dbarg+2; |
---|
400 | const char *resultPT = "extra_pt_src.arb.pt"; |
---|
401 | const char *expectedPT = "extra_pt_src.arb_expected.pt"; |
---|
402 | bool exists = GB_is_regularfile(testDB); |
---|
403 | |
---|
404 | if (exists) { |
---|
405 | const char *argv[] = { |
---|
406 | "fake_pt_server", |
---|
407 | "-build", |
---|
408 | dbarg, |
---|
409 | }; |
---|
410 | |
---|
411 | #if 1 |
---|
412 | // build |
---|
413 | int res = ARB_main(ARRAY_ELEMS(argv), argv); |
---|
414 | TEST_ASSERT_EQUAL(res, EXIT_SUCCESS); |
---|
415 | #endif |
---|
416 | |
---|
417 | // #define TEST_AUTO_UPDATE |
---|
418 | #if defined(TEST_AUTO_UPDATE) |
---|
419 | TEST_COPY_FILE(resultPT, expectedPT); |
---|
420 | #else // !defined(TEST_AUTO_UPDATE) |
---|
421 | TEST_ASSERT_FILES_EQUAL(resultPT, expectedPT); |
---|
422 | #endif |
---|
423 | } |
---|
424 | } |
---|
425 | |
---|
426 | #endif // UNIT_TESTS |
---|
427 | |
---|
428 | // -------------------------------------------------------------------------------- |
---|