1 | // =============================================================== // |
---|
2 | // // |
---|
3 | // File : PT_partition.h // |
---|
4 | // Purpose : // |
---|
5 | // // |
---|
6 | // Coded by Ralf Westram (coder@reallysoft.de) in October 2012 // |
---|
7 | // Institute of Microbiology (Technical University Munich) // |
---|
8 | // http://www.arb-home.de/ // |
---|
9 | // // |
---|
10 | // =============================================================== // |
---|
11 | |
---|
12 | #ifndef PT_PARTITION_H |
---|
13 | #define PT_PARTITION_H |
---|
14 | |
---|
15 | #ifndef PT_PREFIXITER_H |
---|
16 | #include "PT_prefixIter.h" |
---|
17 | #endif |
---|
18 | #ifndef _GLIBCXX_CMATH |
---|
19 | #include <cmath> |
---|
20 | #endif |
---|
21 | |
---|
22 | // -------------------------------------------------------------------------------- |
---|
23 | // central settings for memory estimations |
---|
24 | // |
---|
25 | |
---|
26 | #ifdef ARB_64 |
---|
27 | |
---|
28 | # define STAGE1_INDEX_BYTES_PER_PASS_OLIGO 3.7 // size of index (for each oligo inserted in one pass) |
---|
29 | # define STAGE1_INDEX_BYTES_PER_BASE 0.1 // size of index (for each bp in database) |
---|
30 | # define STAGE1_OTHER_BYTES_PER_PASS_OLIGO 1.0 // non-index data (for each oligo inserted in one pass) |
---|
31 | # define STAGE1_OTHER_BYTES_PER_BASE 1.15 // non-index data (for each bp in database) |
---|
32 | |
---|
33 | # define STAGE1_INDEX_EXTRA_MB 350 // additional constant memory used by index (+ a bit safety) |
---|
34 | # define STAGE1_OTHER_EXTRA_MB 80 // additional constant memory used elsewhere (+ a bit safety) |
---|
35 | |
---|
36 | #else |
---|
37 | |
---|
38 | # define STAGE1_INDEX_BYTES_PER_PASS_OLIGO 3.2 // size of index (for each oligo inserted in one pass) |
---|
39 | # define STAGE1_INDEX_BYTES_PER_BASE 0 // size of index (for each bp in database) |
---|
40 | # define STAGE1_OTHER_BYTES_PER_PASS_OLIGO 0.7 // non-index data (for each oligo inserted in one pass) |
---|
41 | # define STAGE1_OTHER_BYTES_PER_BASE 1 // non-index data (for each bp in database) |
---|
42 | |
---|
43 | # define STAGE1_INDEX_EXTRA_MB 150 // additional constant memory used by index (+ a bit safety) |
---|
44 | # define STAGE1_OTHER_EXTRA_MB 50 // additional constant memory used elsewhere (+ a bit safety) |
---|
45 | |
---|
46 | #endif |
---|
47 | |
---|
48 | # define PTSERVER_BIN_MB 20 // binary mem footprint of ptserver (incl. libs, w/o DB) detected using pmap |
---|
49 | |
---|
50 | #define STAGE1_BYTES_PER_PASS_OLIGO (STAGE1_INDEX_BYTES_PER_PASS_OLIGO + STAGE1_OTHER_BYTES_PER_PASS_OLIGO) |
---|
51 | #define STAGE1_BYTES_PER_BASE (STAGE1_INDEX_BYTES_PER_BASE + STAGE1_OTHER_BYTES_PER_BASE) |
---|
52 | #define STAGE1_EXTRA_MB (STAGE1_INDEX_EXTRA_MB + STAGE1_OTHER_EXTRA_MB) |
---|
53 | |
---|
54 | CONSTEXPR_INLINE ULONG estimate_stage1_memusage_kb(ULONG all_bp, ULONG partition_bp) { |
---|
55 | return ULONG(STAGE1_BYTES_PER_PASS_OLIGO * partition_bp / 1024.0 + |
---|
56 | STAGE1_BYTES_PER_BASE * all_bp / 1024.0 + |
---|
57 | STAGE1_EXTRA_MB * 1024 + |
---|
58 | 0.5); |
---|
59 | } |
---|
60 | |
---|
61 | static double base_probability(char base) { |
---|
62 | pt_assert(base >= PT_QU && base < PT_BASES); |
---|
63 | static double bprob[PT_BASES] = { |
---|
64 | // probabilities are taken from silva-108-SSU-ref |
---|
65 | 0.0014, // PT_QU |
---|
66 | 0.0003, // PT_N |
---|
67 | 0.2543, // PT_A |
---|
68 | 0.2268, // PT_C |
---|
69 | 0.3074, // PT_G |
---|
70 | 0.2098, // PT_T |
---|
71 | }; |
---|
72 | return bprob[safeCharIndex(base)]; |
---|
73 | } |
---|
74 | |
---|
75 | inline double calc_probability(const char *prefix, size_t length) { |
---|
76 | double prob = 1.0; |
---|
77 | for (size_t i = 0; i<length; ++i) { |
---|
78 | prob *= base_probability(prefix[i]); |
---|
79 | } |
---|
80 | return prob; |
---|
81 | } |
---|
82 | |
---|
83 | class PrefixProbabilities { |
---|
84 | int depth; |
---|
85 | int prefixes; |
---|
86 | double *left_prob; |
---|
87 | |
---|
88 | public: |
---|
89 | PrefixProbabilities(int depth_) |
---|
90 | : depth(depth_) |
---|
91 | { |
---|
92 | PrefixIterator prefix(PT_QU, PT_T, depth); |
---|
93 | prefixes = prefix.steps(); |
---|
94 | |
---|
95 | left_prob = new double[prefixes+1]; |
---|
96 | left_prob[0] = 0.0; |
---|
97 | |
---|
98 | double prob[prefixes]; |
---|
99 | |
---|
100 | int i = 0; |
---|
101 | while (!prefix.done()) { |
---|
102 | pt_assert(i<prefixes); |
---|
103 | prob[i] = calc_probability(prefix.prefix(), prefix.length()); |
---|
104 | left_prob[i+1] = left_prob[i]+prob[i]; |
---|
105 | ++i; |
---|
106 | ++prefix; |
---|
107 | } |
---|
108 | } |
---|
109 | PrefixProbabilities(const PrefixProbabilities& other) |
---|
110 | : depth(other.depth), |
---|
111 | prefixes(other.prefixes) |
---|
112 | { |
---|
113 | left_prob = new double[prefixes+1]; |
---|
114 | memcpy(left_prob, other.left_prob, sizeof(*left_prob)*(prefixes+1)); |
---|
115 | } |
---|
116 | DECLARE_ASSIGNMENT_OPERATOR(PrefixProbabilities); |
---|
117 | ~PrefixProbabilities() { delete [] left_prob; } |
---|
118 | |
---|
119 | int get_prefix_count() const { return prefixes; } |
---|
120 | int get_depth() const { return depth; } |
---|
121 | |
---|
122 | double left_of(int prefix_idx) const { |
---|
123 | pt_assert(prefix_idx >= 0 && prefix_idx <= prefixes); // index must be in range [0 ... prefixes] |
---|
124 | return left_prob[prefix_idx]; |
---|
125 | } |
---|
126 | double of_range(int first_idx, int last_idx) const { |
---|
127 | pt_assert(first_idx <= last_idx); |
---|
128 | pt_assert(first_idx >= 0 && last_idx<prefixes); // index must be in range [0 ... prefixes-1] |
---|
129 | |
---|
130 | return left_of(last_idx+1) - left_of(first_idx); |
---|
131 | } |
---|
132 | double of(int prefix_idx) { |
---|
133 | pt_assert(prefix_idx >= 0 && prefix_idx<prefixes); // index must be in range [0 ... prefixes-1] |
---|
134 | return of_range(prefix_idx, prefix_idx); |
---|
135 | } |
---|
136 | |
---|
137 | int find_index_near_leftsum(double wanted) const { |
---|
138 | // returned index is in range [0 ... prefixes] |
---|
139 | |
---|
140 | int best_idx; |
---|
141 | if (prefixes == 0) best_idx = 0; |
---|
142 | else { |
---|
143 | int low = 0; |
---|
144 | int high = prefixes; |
---|
145 | |
---|
146 | double lol = left_of(low); |
---|
147 | if (lol >= wanted) best_idx = low; |
---|
148 | else { |
---|
149 | double loh = left_of(high); |
---|
150 | if (loh<wanted) best_idx = high; |
---|
151 | else { |
---|
152 | while ((low+1)<high) { |
---|
153 | pt_assert(lol<wanted && wanted<=loh); |
---|
154 | |
---|
155 | int mid = (low+high)/2; |
---|
156 | pt_assert(mid != low && mid != high); |
---|
157 | |
---|
158 | double left_of_mid = left_of(mid); |
---|
159 | if (left_of_mid<wanted) { |
---|
160 | low = mid; |
---|
161 | lol = left_of_mid; |
---|
162 | } |
---|
163 | else { |
---|
164 | high = mid; |
---|
165 | loh = left_of_mid; |
---|
166 | } |
---|
167 | } |
---|
168 | best_idx = fabs(lol-wanted) < fabs(loh-wanted) ? low : high; |
---|
169 | } |
---|
170 | } |
---|
171 | } |
---|
172 | return best_idx; |
---|
173 | } |
---|
174 | }; |
---|
175 | |
---|
176 | class DecisionTree : virtual Noncopyable { |
---|
177 | // associate bool-values with probe-prefixes |
---|
178 | |
---|
179 | bool decision; |
---|
180 | bool decided; |
---|
181 | DecisionTree *child[PT_BASES]; // all NULp if decided |
---|
182 | |
---|
183 | void forgetChilds() { |
---|
184 | for (int i = PT_QU; i<PT_BASES; ++i) { |
---|
185 | delete child[i]; |
---|
186 | child[i] = NULp; |
---|
187 | } |
---|
188 | } |
---|
189 | |
---|
190 | public: |
---|
191 | DecisionTree() |
---|
192 | : decided(false) |
---|
193 | { |
---|
194 | for (int i = PT_QU; i<PT_BASES; ++i) { |
---|
195 | child[i] = NULp; |
---|
196 | } |
---|
197 | } |
---|
198 | ~DecisionTree() { |
---|
199 | forgetChilds(); |
---|
200 | } |
---|
201 | |
---|
202 | void setValue(const char *probe, size_t len, bool wanted) { |
---|
203 | if (len>0) { |
---|
204 | DecisionTree*& probe_child = child[safeCharIndex(*probe)]; |
---|
205 | if (!probe_child) probe_child = new DecisionTree; |
---|
206 | probe_child->setValue(probe+1, len-1, wanted); |
---|
207 | } |
---|
208 | else { |
---|
209 | decision = wanted; |
---|
210 | decided = true; |
---|
211 | } |
---|
212 | } |
---|
213 | |
---|
214 | bool getValue(const char *probe) const { |
---|
215 | if (decided) return decision; |
---|
216 | return child[safeCharIndex(*probe)]->getValue(probe+1); |
---|
217 | } |
---|
218 | |
---|
219 | void optimize() { |
---|
220 | if (!decided) { |
---|
221 | child[PT_QU]->optimize(); |
---|
222 | bool concordant = child[PT_QU]->decided; |
---|
223 | bool common_decision = concordant ? child[PT_QU]->decision : false; |
---|
224 | |
---|
225 | for (int i = PT_QU+1; i<PT_BASES; ++i) { |
---|
226 | child[i]->optimize(); |
---|
227 | if (concordant) { |
---|
228 | if (child[i]->decided) { |
---|
229 | if (child[i]->decision != common_decision) { |
---|
230 | concordant = false; |
---|
231 | } |
---|
232 | } |
---|
233 | else { |
---|
234 | concordant = false; |
---|
235 | } |
---|
236 | } |
---|
237 | } |
---|
238 | |
---|
239 | if (concordant) { |
---|
240 | forgetChilds(); |
---|
241 | decided = true; |
---|
242 | decision = common_decision; |
---|
243 | } |
---|
244 | } |
---|
245 | } |
---|
246 | }; |
---|
247 | |
---|
248 | class MarkedPrefixes { |
---|
249 | int depth; |
---|
250 | int max_partitions; |
---|
251 | bool *marked; |
---|
252 | |
---|
253 | DecisionTree *decision; |
---|
254 | |
---|
255 | void forget_decision_tree() { |
---|
256 | delete decision; |
---|
257 | decision = NULp; |
---|
258 | } |
---|
259 | |
---|
260 | public: |
---|
261 | MarkedPrefixes(int depth_) |
---|
262 | : depth(depth_), |
---|
263 | max_partitions(PrefixIterator(PT_QU, PT_T, depth).steps()), |
---|
264 | marked(new bool[max_partitions]), |
---|
265 | decision(NULp) |
---|
266 | { |
---|
267 | clearMarks(); |
---|
268 | } |
---|
269 | MarkedPrefixes(const MarkedPrefixes& other) |
---|
270 | : depth(other.depth), |
---|
271 | max_partitions(other.max_partitions), |
---|
272 | marked(new bool[max_partitions]), |
---|
273 | decision(NULp) |
---|
274 | { |
---|
275 | memcpy(marked, other.marked, max_partitions*sizeof(*marked)); |
---|
276 | } |
---|
277 | DECLARE_ASSIGNMENT_OPERATOR(MarkedPrefixes); |
---|
278 | ~MarkedPrefixes() { |
---|
279 | delete [] marked; |
---|
280 | delete decision; |
---|
281 | } |
---|
282 | |
---|
283 | void mark(int idx1, int idx2) { |
---|
284 | pt_assert(idx1 <= idx2); |
---|
285 | pt_assert(idx1 >= 0); |
---|
286 | pt_assert(idx2 < max_partitions); |
---|
287 | |
---|
288 | for (int p = idx1; p <= idx2; ++p) { |
---|
289 | marked[p] = true; |
---|
290 | } |
---|
291 | } |
---|
292 | void clearMarks() { |
---|
293 | for (int p = 0; p<max_partitions; ++p) { |
---|
294 | marked[p] = false; |
---|
295 | } |
---|
296 | } |
---|
297 | |
---|
298 | void predecide() { |
---|
299 | forget_decision_tree(); |
---|
300 | decision = new DecisionTree; |
---|
301 | |
---|
302 | PrefixIterator iter(PT_QU, PT_T, depth); |
---|
303 | int idx = 0; |
---|
304 | |
---|
305 | while (!iter.done()) { |
---|
306 | pt_assert(idx<max_partitions); |
---|
307 | decision->setValue(iter.prefix(), iter.length(), marked[idx]); |
---|
308 | ++iter; |
---|
309 | ++idx; |
---|
310 | } |
---|
311 | decision->optimize(); |
---|
312 | } |
---|
313 | |
---|
314 | |
---|
315 | bool isMarked(const char *probe) const { |
---|
316 | pt_assert(decision); // predecide() not called |
---|
317 | return decision->getValue(probe); |
---|
318 | } |
---|
319 | }; |
---|
320 | |
---|
321 | class Partition { |
---|
322 | PrefixProbabilities prob; |
---|
323 | |
---|
324 | int passes; |
---|
325 | int current_pass; |
---|
326 | |
---|
327 | MarkedPrefixes prefix; |
---|
328 | |
---|
329 | int *start_of_pass; // contains prefix index |
---|
330 | |
---|
331 | mutable size_t max_probes_in_any_pass; |
---|
332 | |
---|
333 | int first_index_of_pass(int pass) const { |
---|
334 | pt_assert(legal_pass(pass)); |
---|
335 | return start_of_pass[pass-1]; |
---|
336 | } |
---|
337 | int last_index_of_pass(int pass) const { |
---|
338 | pt_assert(legal_pass(pass)); |
---|
339 | return start_of_pass[pass]-1; |
---|
340 | } |
---|
341 | |
---|
342 | bool have_zero_prob_passes() { |
---|
343 | for (int p = 1; p <= passes; ++p) { |
---|
344 | if (pass_probability(p) <= 0.0) return true; |
---|
345 | } |
---|
346 | return false; |
---|
347 | } |
---|
348 | |
---|
349 | void calculate_pass_starts() { |
---|
350 | pt_assert(passes <= max_allowed_passes()); |
---|
351 | delete [] start_of_pass; |
---|
352 | start_of_pass = new int[passes+1]; |
---|
353 | |
---|
354 | double prob_per_pass = 1.0/passes; |
---|
355 | |
---|
356 | start_of_pass[0] = 0; |
---|
357 | for (int p = 1; p < passes; ++p) { |
---|
358 | double pass_prob = p*prob_per_pass; |
---|
359 | start_of_pass[p] = prob.find_index_near_leftsum(pass_prob); |
---|
360 | } |
---|
361 | start_of_pass[passes] = max_allowed_passes(); |
---|
362 | |
---|
363 | // ensure there are no empty passes: |
---|
364 | for (int p = 0; p<passes; ++p) { |
---|
365 | if (start_of_pass[p] >= start_of_pass[p+1]) { |
---|
366 | int nidx = start_of_pass[p]+1; |
---|
367 | if (nidx<max_allowed_passes()) { |
---|
368 | start_of_pass[p+1] = nidx; |
---|
369 | } |
---|
370 | } |
---|
371 | } |
---|
372 | for (int p = passes-1; p >= 0; --p) { |
---|
373 | if (start_of_pass[p] >= start_of_pass[p+1]) { |
---|
374 | start_of_pass[p] = start_of_pass[p+1]-1; |
---|
375 | pt_assert(start_of_pass[p] >= 0); |
---|
376 | } |
---|
377 | } |
---|
378 | |
---|
379 | pt_assert(!have_zero_prob_passes()); |
---|
380 | } |
---|
381 | |
---|
382 | void markPrefixes() { |
---|
383 | pt_assert(!done()); |
---|
384 | |
---|
385 | prefix.clearMarks(); |
---|
386 | prefix.mark(first_index_of_pass(current_pass), last_index_of_pass(current_pass)); |
---|
387 | prefix.predecide(); |
---|
388 | } |
---|
389 | |
---|
390 | bool legal_pass(int pass) const { return pass >= 1 && pass <= passes; } |
---|
391 | |
---|
392 | public: |
---|
393 | Partition(const PrefixProbabilities& prob_, int passes_) |
---|
394 | : prob(prob_), |
---|
395 | passes(passes_), |
---|
396 | prefix(prob.get_depth()), |
---|
397 | start_of_pass(new int[passes+1]), |
---|
398 | max_probes_in_any_pass(0) |
---|
399 | { |
---|
400 | pt_assert(passes >= 1 && passes <= max_allowed_passes()); |
---|
401 | calculate_pass_starts(); |
---|
402 | reset(); |
---|
403 | } |
---|
404 | Partition(const Partition& other) |
---|
405 | : prob(other.prob), |
---|
406 | passes(other.passes), |
---|
407 | current_pass(other.current_pass), |
---|
408 | prefix(other.prefix), |
---|
409 | start_of_pass(new int[passes+1]), |
---|
410 | max_probes_in_any_pass(other.max_probes_in_any_pass) |
---|
411 | { |
---|
412 | memcpy(start_of_pass, other.start_of_pass, sizeof(*start_of_pass)*(passes+1)); |
---|
413 | prefix.predecide(); |
---|
414 | } |
---|
415 | DECLARE_ASSIGNMENT_OPERATOR(Partition); |
---|
416 | ~Partition() { |
---|
417 | delete [] start_of_pass; |
---|
418 | } |
---|
419 | |
---|
420 | double pass_probability(int pass) const { |
---|
421 | return prob.of_range(first_index_of_pass(pass), last_index_of_pass(pass)); |
---|
422 | } |
---|
423 | |
---|
424 | int max_allowed_passes() const { return prob.get_prefix_count(); } |
---|
425 | int number_of_passes() const { return passes; } |
---|
426 | int split_depth() const { return prob.get_depth(); } |
---|
427 | |
---|
428 | bool done() const { return !legal_pass(current_pass); } |
---|
429 | bool next() { |
---|
430 | ++current_pass; |
---|
431 | if (done()) return false; |
---|
432 | markPrefixes(); |
---|
433 | return true; |
---|
434 | } |
---|
435 | |
---|
436 | void reset() { |
---|
437 | current_pass = 1; |
---|
438 | markPrefixes(); |
---|
439 | } |
---|
440 | |
---|
441 | size_t estimate_probes_for_pass(int pass, size_t overall_base_count) const { |
---|
442 | pt_assert(legal_pass(pass)); |
---|
443 | return size_t(pass_probability(pass)*overall_base_count+0.5); |
---|
444 | } |
---|
445 | size_t estimate_max_probes_for_any_pass(size_t overall_base_count) const { |
---|
446 | if (max_probes_in_any_pass == 0) { // lazy eval |
---|
447 | for (int p = 1; p <= passes; ++p) { |
---|
448 | size_t probes = estimate_probes_for_pass(p, overall_base_count); |
---|
449 | if (probes>max_probes_in_any_pass) max_probes_in_any_pass = probes; |
---|
450 | } |
---|
451 | pt_assert(max_probes_in_any_pass); |
---|
452 | } |
---|
453 | return max_probes_in_any_pass; |
---|
454 | } |
---|
455 | size_t estimate_max_kb_for_any_pass(size_t overall_base_count) const { |
---|
456 | return estimate_stage1_memusage_kb(overall_base_count, estimate_max_probes_for_any_pass(overall_base_count)); |
---|
457 | } |
---|
458 | |
---|
459 | bool contains(const char *probe) const { |
---|
460 | return prefix.isMarked(probe); |
---|
461 | } |
---|
462 | }; |
---|
463 | |
---|
464 | inline size_t max_probes_for_passes(const PrefixProbabilities& prob, int passes_wanted, size_t overall_base_count) { |
---|
465 | return Partition(prob, passes_wanted).estimate_max_probes_for_any_pass(overall_base_count); |
---|
466 | } |
---|
467 | inline size_t max_kb_for_passes(const PrefixProbabilities& prob, int passes_wanted, size_t overall_base_count) { |
---|
468 | return estimate_stage1_memusage_kb(overall_base_count, max_probes_for_passes(prob, passes_wanted, overall_base_count)); |
---|
469 | } |
---|
470 | |
---|
471 | #else |
---|
472 | #error PT_partition.h included twice |
---|
473 | #endif // PT_PARTITION_H |
---|