1 | // =============================================================== // |
---|
2 | // // |
---|
3 | // File : ps_find_probes.cxx // |
---|
4 | // Purpose : // |
---|
5 | // // |
---|
6 | // Coded by Wolfram Foerster in October 2002 // |
---|
7 | // Institute of Microbiology (Technical University Munich) // |
---|
8 | // http://www.arb-home.de/ // |
---|
9 | // // |
---|
10 | // =============================================================== // |
---|
11 | |
---|
12 | #include "ps_tools.hxx" |
---|
13 | #include "ps_database.hxx" |
---|
14 | #include "ps_candidate.hxx" |
---|
15 | |
---|
16 | #include <cmath> |
---|
17 | #include <iostream> |
---|
18 | #include <sys/times.h> |
---|
19 | |
---|
20 | using namespace std; |
---|
21 | |
---|
22 | template<class T> |
---|
23 | void PS_print_set_ranges(const char *_set_name, const set<T> &_set, const bool _cr_at_end = true) { |
---|
24 | fflush(stdout); |
---|
25 | printf("%s size (%3zu) : ", _set_name, _set.size()); |
---|
26 | if (_set.empty()) { |
---|
27 | printf("(empty)"); |
---|
28 | } |
---|
29 | else { |
---|
30 | typename set<T>::const_iterator range_begin = _set.begin(); |
---|
31 | typename set<T>::const_iterator range_end = range_begin; |
---|
32 | typename set<T>::const_iterator it = _set.begin(); |
---|
33 | for (++it; |
---|
34 | it != _set.end(); |
---|
35 | ++it) { |
---|
36 | if (*it == (*range_end)+1) { |
---|
37 | range_end = it; |
---|
38 | } |
---|
39 | else { |
---|
40 | if (range_end == range_begin) { |
---|
41 | cout << *range_begin << " "; |
---|
42 | } |
---|
43 | else { |
---|
44 | cout << *range_begin << "-" << *range_end << " "; |
---|
45 | } |
---|
46 | range_begin = it; |
---|
47 | range_end = it; |
---|
48 | } |
---|
49 | } |
---|
50 | cout << *range_begin; |
---|
51 | if (range_end != range_begin) cout << "-" << *range_end; |
---|
52 | } |
---|
53 | if (_cr_at_end) cout << "\n"; |
---|
54 | fflush(stdout); |
---|
55 | } |
---|
56 | |
---|
57 | |
---|
58 | template<class T1, class T2> |
---|
59 | void PS_print_map_ranges(const char *_map_name, const map<T1, T2> &_map, const bool _compare_keys = true, const bool _cr_at_end = true) { |
---|
60 | fflush(stdout); |
---|
61 | if (_map.empty()) { |
---|
62 | printf("%s size ( 0) : (empty)", _map_name); |
---|
63 | } |
---|
64 | else { |
---|
65 | if (_compare_keys) { |
---|
66 | printf("%s size (%3i) : ", _map_name, _map.size()); |
---|
67 | typename map<T1, T2>::const_iterator range_begin = _map.begin(); |
---|
68 | typename map<T1, T2>::const_iterator range_end = range_begin; |
---|
69 | typename map<T1, T2>::const_iterator it = _map.begin(); |
---|
70 | for (++it; |
---|
71 | it != _map.end(); |
---|
72 | ++it) { |
---|
73 | if (it->first == range_end->first+1) { |
---|
74 | range_end = it; |
---|
75 | } |
---|
76 | else { |
---|
77 | if (range_end == range_begin) { |
---|
78 | cout << "(" << range_begin->first << "," << range_begin->second << ") "; |
---|
79 | } |
---|
80 | else { |
---|
81 | cout << "(" << range_begin->first << "-" << range_end->first << "," << range_begin->second << ") "; |
---|
82 | } |
---|
83 | range_begin = it; |
---|
84 | range_end = it; |
---|
85 | } |
---|
86 | } |
---|
87 | if (range_end == range_begin) { |
---|
88 | cout << "(" << range_begin->first << "," << range_begin->second << ") "; |
---|
89 | } |
---|
90 | else { |
---|
91 | cout << "(" << range_begin->first << "-" << range_end->first << "," << range_begin->second << ") "; |
---|
92 | } |
---|
93 | } |
---|
94 | else { |
---|
95 | map<T2, set<pair<T1, T1> > > value2indices; |
---|
96 | map<T2, unsigned long int> value2count; |
---|
97 | pair<T1, T1> range; |
---|
98 | range.first = _map.begin()->first; |
---|
99 | range.second = range.first; |
---|
100 | T2 cur_value = _map.begin()->second; |
---|
101 | typename map<T1, T2>::const_iterator it = _map.begin(); |
---|
102 | for (++it; |
---|
103 | it != _map.end(); |
---|
104 | ++it) { |
---|
105 | if (it->second == cur_value) { |
---|
106 | if (it->first == range.second+1) { |
---|
107 | range.second = it->first; |
---|
108 | } |
---|
109 | else { |
---|
110 | value2indices[cur_value].insert(range); |
---|
111 | value2count[cur_value] += range.second - range.first + 1; |
---|
112 | range.first = it->first; |
---|
113 | range.second = it->first; |
---|
114 | } |
---|
115 | } |
---|
116 | else { |
---|
117 | value2indices[cur_value].insert(range); |
---|
118 | value2count[cur_value] += range.second - range.first + 1; |
---|
119 | range.first = it->first; |
---|
120 | range.second = it->first; |
---|
121 | cur_value = it->second; |
---|
122 | } |
---|
123 | } |
---|
124 | value2indices[cur_value].insert(range); |
---|
125 | value2count[cur_value] += range.second - range.first + 1; |
---|
126 | for (typename map<T2, set<pair<T1, T1> > >::const_iterator value = value2indices.begin(); |
---|
127 | value != value2indices.end(); |
---|
128 | ++value) { |
---|
129 | if (value != value2indices.begin()) cout << "\n"; |
---|
130 | printf("%s size (%3i) : value (", _map_name, _map.size()); |
---|
131 | cout << value->first << ") count (" << value2count[value->first] << ") ["; |
---|
132 | for (typename set<pair<T1, T1> >::const_iterator indices = value->second.begin(); |
---|
133 | indices != value->second.end(); |
---|
134 | ++indices) { |
---|
135 | if (indices != value->second.begin()) cout << " "; |
---|
136 | cout << indices->first; |
---|
137 | if (indices->first != indices->second) cout << "-" << indices->second; |
---|
138 | } |
---|
139 | cout << "]," << value->first << ")"; |
---|
140 | } |
---|
141 | } |
---|
142 | } |
---|
143 | if (_cr_at_end) cout << "\n"; |
---|
144 | fflush(stdout); |
---|
145 | } |
---|
146 | |
---|
147 | // |
---|
148 | // common globals |
---|
149 | // |
---|
150 | |
---|
151 | static bool __VERBOSE = false; |
---|
152 | static SpeciesID __MAX_ID; |
---|
153 | static SpeciesID __MIN_ID; |
---|
154 | static SpeciesID __SPECIES_COUNT; |
---|
155 | static unsigned long __BITS_IN_MAP; |
---|
156 | static PS_BitMap_Counted *__MAP; |
---|
157 | static PS_BitMap_Counted *__PRESET_MAP; |
---|
158 | static IDSet __SOURCE_ID_SET; |
---|
159 | static PS_BitSet::IndexSet __TARGET_ID_SET; |
---|
160 | |
---|
161 | // |
---|
162 | // globals for PS_calc_next_speciesid_sets, |
---|
163 | // PS_calc_next_speciesid_sets_for_candidate |
---|
164 | // |
---|
165 | |
---|
166 | // count of trues for a source_id may be max 5% |
---|
167 | // (of the difference between __SPECIES_COUNT and the lowest |
---|
168 | // count of trues per species) higher than the lowest count |
---|
169 | // of trues for the species in __SOURCE_ID_SET |
---|
170 | #define __THRESHOLD_PERCENTAGE_NEXT_SOURCE_ID_SET 5 |
---|
171 | // target_id must be at least in 95% of total target_id_sets |
---|
172 | // to be in __TARGET_ID_SET |
---|
173 | #define __THRESHOLD_PERCENTAGE_NEXT_TARGET_ID_SET 95 |
---|
174 | |
---|
175 | static SpeciesID __MIN_SETS_ID; |
---|
176 | static SpeciesID __MAX_SETS_ID; |
---|
177 | |
---|
178 | // |
---|
179 | // globals for PS_find_probes |
---|
180 | // |
---|
181 | |
---|
182 | // initially a probe must match 40-60% of species |
---|
183 | // in the ID-sets to be a candidate |
---|
184 | // if PS_find_probes fails to find candidates within this range |
---|
185 | // it is repeated with a broadened range (by 5% each border) |
---|
186 | #define __MIN_PERCENTAGE_SET_MATCH 40 |
---|
187 | #define __MAX_PERCENTAGE_SET_MATCH 60 |
---|
188 | |
---|
189 | static float __SOURCE_MIN_MATCH_COUNT; |
---|
190 | static float __SOURCE_MAX_MATCH_COUNT; |
---|
191 | static float __TARGET_MIN_MATCH_COUNT; |
---|
192 | static float __TARGET_MAX_MATCH_COUNT; |
---|
193 | static float __SOURCE_PERFECT_MATCH_COUNT; |
---|
194 | static float __TARGET_PERFECT_MATCH_COUNT; |
---|
195 | static unsigned long __PROBES_COUNTER; |
---|
196 | static unsigned long __PROBES_REMOVED; |
---|
197 | static PS_CandidatePtr __CANDIDATES_ROOT; |
---|
198 | static unsigned long __CANDIDATES_COUNTER; |
---|
199 | static unsigned long __CANDIDATES_TODO; |
---|
200 | static unsigned long __CANDIDATES_FINISHED; |
---|
201 | static unsigned long __MAX_DEPTH; |
---|
202 | |
---|
203 | // |
---|
204 | // globals for PS_find_probe_for_sets |
---|
205 | // PS_test_candidate_on_bitmap |
---|
206 | // |
---|
207 | |
---|
208 | static IDSet __PATH; |
---|
209 | |
---|
210 | // |
---|
211 | // globals for PS_descend |
---|
212 | // |
---|
213 | static char *__PATH_IN_CANDIDATES; |
---|
214 | |
---|
215 | |
---|
216 | static unsigned long int PS_test_candidate_on_bitmap(float *_filling_level = NULp, PS_BitMap_Counted *_map = NULp) { |
---|
217 | // returns number of locations in __MAP that would be switched |
---|
218 | // from false to true by __PATH |
---|
219 | |
---|
220 | // iterate over all IDs except path |
---|
221 | IDSetCIter path_iter = __PATH.begin(); |
---|
222 | SpeciesID next_path_ID = *path_iter; |
---|
223 | unsigned long int gain = 0; |
---|
224 | if (_map) { |
---|
225 | for (SpeciesID not_in_path_ID = __MIN_ID; |
---|
226 | not_in_path_ID <= __MAX_ID; |
---|
227 | ++not_in_path_ID) { |
---|
228 | if (not_in_path_ID == next_path_ID) { // if i run into a ID in path |
---|
229 | ++path_iter; // advance to next path ID |
---|
230 | next_path_ID = (path_iter == __PATH.end()) ? -1 : *path_iter; |
---|
231 | continue; // skip this ID |
---|
232 | } |
---|
233 | // iterate over path IDs |
---|
234 | for (IDSetCIter path_ID = __PATH.begin(); |
---|
235 | path_ID != __PATH.end(); |
---|
236 | ++path_ID) { |
---|
237 | if (not_in_path_ID == *path_ID) |
---|
238 | continue; // obviously a probe cant differ a species from itself |
---|
239 | if (not_in_path_ID > *path_ID) { |
---|
240 | gain += !_map->get(not_in_path_ID, *path_ID); |
---|
241 | } |
---|
242 | else { |
---|
243 | gain += !_map->get(*path_ID, not_in_path_ID); |
---|
244 | } |
---|
245 | } |
---|
246 | } |
---|
247 | } |
---|
248 | else { |
---|
249 | for (SpeciesID not_in_path_ID = __MIN_ID; |
---|
250 | not_in_path_ID <= __MAX_ID; |
---|
251 | ++not_in_path_ID) { |
---|
252 | if (not_in_path_ID == next_path_ID) { // if i run into a ID in path |
---|
253 | ++path_iter; // advance to next path ID |
---|
254 | next_path_ID = (path_iter == __PATH.end()) ? -1 : *path_iter; |
---|
255 | continue; // skip this ID |
---|
256 | } |
---|
257 | // iterate over path IDs |
---|
258 | for (IDSetCIter path_ID = __PATH.begin(); |
---|
259 | path_ID != __PATH.end(); |
---|
260 | ++path_ID) { |
---|
261 | if (not_in_path_ID == *path_ID) |
---|
262 | continue; // obviously a probe cant differ a species from itself |
---|
263 | if (not_in_path_ID > *path_ID) { |
---|
264 | gain += !__MAP->get(not_in_path_ID, *path_ID); |
---|
265 | } |
---|
266 | else { |
---|
267 | gain += !__MAP->get(*path_ID, not_in_path_ID); |
---|
268 | } |
---|
269 | } |
---|
270 | } |
---|
271 | } |
---|
272 | if (_filling_level) { |
---|
273 | unsigned long int trues = (_map) ? _map->getCountOfTrues() + gain : __MAP->getCountOfTrues() + gain; |
---|
274 | *_filling_level = ((float)trues / __BITS_IN_MAP)*100.0; |
---|
275 | } |
---|
276 | return gain; |
---|
277 | } |
---|
278 | |
---|
279 | |
---|
280 | static bool PS_test_sets_on_path(float &_distance) { |
---|
281 | long count_matched_source = 0; |
---|
282 | long count_matched_target = 0; |
---|
283 | SpeciesID path_id; |
---|
284 | IDSetCIter path_end = __PATH.end(); |
---|
285 | IDSetCIter path = __PATH.begin(); |
---|
286 | IDSetCIter source_end = __SOURCE_ID_SET.end(); |
---|
287 | IDSetCIter source = __SOURCE_ID_SET.begin(); |
---|
288 | PS_BitSet::IndexSet::const_iterator target_end = __TARGET_ID_SET.end(); |
---|
289 | PS_BitSet::IndexSet::const_iterator target = __TARGET_ID_SET.begin(); |
---|
290 | |
---|
291 | while (path != path_end) { |
---|
292 | path_id = *path; |
---|
293 | while ((*target < path_id) && (target != target_end)) { |
---|
294 | ++target; |
---|
295 | } |
---|
296 | if ((target != target_end) && (*target == path_id)) |
---|
297 | ++count_matched_target; |
---|
298 | while ((*source < path_id) && (source != source_end)) { |
---|
299 | ++source; |
---|
300 | } |
---|
301 | if ((source != source_end) && (*source == path_id)) |
---|
302 | ++count_matched_source; |
---|
303 | ++path; |
---|
304 | } |
---|
305 | |
---|
306 | if ((count_matched_source > __SOURCE_MIN_MATCH_COUNT) && |
---|
307 | (count_matched_source < __SOURCE_MAX_MATCH_COUNT) && |
---|
308 | (count_matched_target > __TARGET_MIN_MATCH_COUNT) && |
---|
309 | (count_matched_target < __TARGET_MAX_MATCH_COUNT)) { |
---|
310 | _distance = fabs(__SOURCE_PERFECT_MATCH_COUNT - count_matched_source) + fabs(__TARGET_PERFECT_MATCH_COUNT - count_matched_target); |
---|
311 | return true; |
---|
312 | } |
---|
313 | return false; |
---|
314 | } |
---|
315 | |
---|
316 | static void PS_find_probe_for_sets(const PS_NodePtr& _ps_node, PS_CandidatePtr _candidate_parent) { |
---|
317 | // scan subtree starting with _ps_node for candidates |
---|
318 | |
---|
319 | SpeciesID id = _ps_node->getNum(); |
---|
320 | bool has_probes = _ps_node->hasProbes(); |
---|
321 | |
---|
322 | // |
---|
323 | // append ID to path |
---|
324 | // |
---|
325 | __PATH.insert(id); |
---|
326 | |
---|
327 | // |
---|
328 | // don't look at path until ID is greater than lowest ID in the sets of IDs |
---|
329 | // also don't use a node if its already used as candidate |
---|
330 | // |
---|
331 | if ((id >= __MIN_SETS_ID) && has_probes && !_candidate_parent->alreadyUsedNode(_ps_node)) { |
---|
332 | ++__PROBES_COUNTER; |
---|
333 | if (__VERBOSE && (__PROBES_COUNTER % 100 == 0)) { |
---|
334 | printf("%8lup %8luc\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b", __PROBES_COUNTER, __CANDIDATES_COUNTER); |
---|
335 | fflush(stdout); |
---|
336 | } |
---|
337 | // make intersections of __PATH with __SOURCE_ID_SET and __TARGET_ID_SET |
---|
338 | float distance_to_perfect_match; |
---|
339 | if (PS_test_sets_on_path(distance_to_perfect_match)) { |
---|
340 | unsigned long gain = PS_test_candidate_on_bitmap(); // no -> calc gain and make new candidate node |
---|
341 | int status = (gain < (unsigned)__SPECIES_COUNT / 100) |
---|
342 | ? 0 |
---|
343 | : _candidate_parent->addChild(static_cast<unsigned long>(distance_to_perfect_match), gain, _ps_node, __PATH); |
---|
344 | if (status > 0) { |
---|
345 | if (status == 2) { |
---|
346 | ++__CANDIDATES_COUNTER; |
---|
347 | if (__VERBOSE) { |
---|
348 | printf("%8lup %8luc\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b", __PROBES_COUNTER, __CANDIDATES_COUNTER); |
---|
349 | fflush(stdout); |
---|
350 | } |
---|
351 | } |
---|
352 | fflush(stdout); |
---|
353 | } |
---|
354 | } |
---|
355 | } |
---|
356 | |
---|
357 | // |
---|
358 | // step down the children if either ID is lower than highest |
---|
359 | // ID in the set of ID-pairs or the node has no probes |
---|
360 | // |
---|
361 | if ((id < __MAX_SETS_ID) || (! has_probes)) { |
---|
362 | for (PS_NodeMapConstIterator i = _ps_node->getChildrenBegin(); |
---|
363 | i != _ps_node->getChildrenEnd(); |
---|
364 | ++i) { |
---|
365 | PS_find_probe_for_sets(i->second, _candidate_parent); |
---|
366 | } |
---|
367 | } |
---|
368 | |
---|
369 | // |
---|
370 | // remove ID from path |
---|
371 | // |
---|
372 | __PATH.erase(id); |
---|
373 | } |
---|
374 | |
---|
375 | static void PS_find_probes(const PS_NodePtr& _root_node, const int _round, PS_CandidatePtr _candidate_parent, const float _filling_level) { |
---|
376 | // scan PS_Node-tree for candidates to raise filling level of __MAP |
---|
377 | |
---|
378 | __PROBES_COUNTER = 0; |
---|
379 | __CANDIDATES_COUNTER = 0; |
---|
380 | __SOURCE_MIN_MATCH_COUNT = (__SOURCE_ID_SET.size() * (__MIN_PERCENTAGE_SET_MATCH - (_round * 5))) / 100.0; |
---|
381 | __SOURCE_MAX_MATCH_COUNT = (__SOURCE_ID_SET.size() * (__MAX_PERCENTAGE_SET_MATCH + (_round * 5))) / 100.0; |
---|
382 | __TARGET_MIN_MATCH_COUNT = (__TARGET_ID_SET.size() * (__MIN_PERCENTAGE_SET_MATCH - (_round * 5))) / 100.0; |
---|
383 | __TARGET_MAX_MATCH_COUNT = (__TARGET_ID_SET.size() * (__MAX_PERCENTAGE_SET_MATCH + (_round * 5))) / 100.0; |
---|
384 | __SOURCE_PERFECT_MATCH_COUNT = __SOURCE_ID_SET.size() / 2.0; |
---|
385 | __TARGET_PERFECT_MATCH_COUNT = __TARGET_ID_SET.size() / 2.0; |
---|
386 | __MIN_SETS_ID = (*__SOURCE_ID_SET.begin() < *__TARGET_ID_SET.begin()) ? *__SOURCE_ID_SET.begin() : *__TARGET_ID_SET.begin(); |
---|
387 | __MAX_SETS_ID = (*__SOURCE_ID_SET.rbegin() < *__TARGET_ID_SET.rbegin()) ? *__SOURCE_ID_SET.rbegin() : *__TARGET_ID_SET.rbegin(); |
---|
388 | printf("PS_find_probes(%i) : source match %10.3f .. %10.3f target match %10.3f .. %10.3f\n", _round, __SOURCE_MIN_MATCH_COUNT, __SOURCE_MAX_MATCH_COUNT, __TARGET_MIN_MATCH_COUNT, __TARGET_MAX_MATCH_COUNT); fflush(stdout); |
---|
389 | struct tms before; |
---|
390 | times(&before); |
---|
391 | for (PS_NodeMapConstIterator i = _root_node->getChildrenBegin(); |
---|
392 | (i != _root_node->getChildrenEnd()) && (i->first < __MAX_SETS_ID); |
---|
393 | ++i) |
---|
394 | { |
---|
395 | __PATH.clear(); |
---|
396 | PS_find_probe_for_sets(i->second, _candidate_parent); |
---|
397 | } |
---|
398 | printf("PS_find_probes(%i) : %li candidates of %li probes visited ", _round, __CANDIDATES_COUNTER, __PROBES_COUNTER); |
---|
399 | PS_print_time_diff(&before); |
---|
400 | // reduce candidates to best,worst and middle gain |
---|
401 | _candidate_parent->reduceChildren(_filling_level); |
---|
402 | for (PS_CandidateByGainMapCIter c = _candidate_parent->children.begin(); |
---|
403 | c != _candidate_parent->children.end(); |
---|
404 | ++c) { |
---|
405 | printf("PS_find_probes(%i) : ", _round); |
---|
406 | c->second->print(); |
---|
407 | ++__CANDIDATES_TODO; |
---|
408 | } |
---|
409 | fflush(stdout); |
---|
410 | } |
---|
411 | |
---|
412 | static void PS_calc_next_speciesid_sets() { |
---|
413 | // scan __MAP to find sets of IDs that need differentiation most |
---|
414 | |
---|
415 | // |
---|
416 | // 1. __SOURCE_ID_SET |
---|
417 | // scan bitmap for species that need more matches |
---|
418 | // |
---|
419 | SpeciesID highest_count; |
---|
420 | SpeciesID lowest_count; |
---|
421 | float threshold; |
---|
422 | SpeciesID count; |
---|
423 | |
---|
424 | // first pass -- get lowest count of trues and calc threshold |
---|
425 | lowest_count = __SPECIES_COUNT; |
---|
426 | highest_count = 0; |
---|
427 | for (SpeciesID id = __MIN_ID; |
---|
428 | id <= __MAX_ID; |
---|
429 | ++id) { |
---|
430 | count = __MAP->getCountFor(id); |
---|
431 | if (count == __SPECIES_COUNT-__MIN_ID-1) continue; // i cannot improve species that can be differed from all others |
---|
432 | if (count > highest_count) highest_count = count; |
---|
433 | if (count < lowest_count) lowest_count = count; |
---|
434 | } |
---|
435 | threshold = (((__SPECIES_COUNT-lowest_count) * __THRESHOLD_PERCENTAGE_NEXT_SOURCE_ID_SET) / 100.0) + lowest_count; |
---|
436 | printf("PS_calc_next_speciesid_sets() : SOURCE count 1's [%i..%i] threshold (%.3f)", lowest_count, highest_count, threshold); |
---|
437 | |
---|
438 | // second pass -- get IDs where count is below or equal threshold |
---|
439 | __SOURCE_ID_SET.clear(); |
---|
440 | for (SpeciesID id = __MIN_ID; |
---|
441 | id <= __MAX_ID; |
---|
442 | ++id) { |
---|
443 | if (__MAP->getCountFor(id) <= threshold) __SOURCE_ID_SET.insert(id); |
---|
444 | } |
---|
445 | PS_print_set_ranges(" __SOURCE_ID_SET", __SOURCE_ID_SET); |
---|
446 | |
---|
447 | // |
---|
448 | // 2. __TARGET_ID_SET |
---|
449 | // scan bitmap for species IDs that need to be distinguished from MOST species in __SOURCE_ID_SET |
---|
450 | // |
---|
451 | ID2IDMap count_falses_per_id; |
---|
452 | |
---|
453 | // first -- get the IDs that need differentiation from __SOURCE_ID_SET |
---|
454 | unsigned long count_iterations = 0; |
---|
455 | for (IDSetCIter source_id = __SOURCE_ID_SET.begin(); |
---|
456 | source_id != __SOURCE_ID_SET.end(); |
---|
457 | ++source_id, ++count_iterations) { |
---|
458 | // get 'next_target_set' |
---|
459 | __MAP->getFalseIndicesFor(*source_id, __TARGET_ID_SET); |
---|
460 | // count falses per SpeciesID |
---|
461 | for (PS_BitSet::IndexSet::iterator target_id = __TARGET_ID_SET.begin(); |
---|
462 | target_id != __TARGET_ID_SET.end(); |
---|
463 | ++target_id) { |
---|
464 | if ((*target_id < __MIN_ID) || (*target_id > __MAX_ID)) continue; // skip ID's that are outside DB-IDs-range (like zero if __MIN_ID is one) |
---|
465 | if (*target_id != *source_id) ++count_falses_per_id[*target_id]; |
---|
466 | } |
---|
467 | } |
---|
468 | |
---|
469 | // second -- get highest count of falses and calc threshold |
---|
470 | lowest_count = __SPECIES_COUNT; |
---|
471 | highest_count = 0; |
---|
472 | for (ID2IDMapCIter count_per_id = count_falses_per_id.begin(); |
---|
473 | count_per_id != count_falses_per_id.end(); |
---|
474 | ++count_per_id) { |
---|
475 | if (count_per_id->second > highest_count) highest_count = count_per_id->second; |
---|
476 | if (count_per_id->second < lowest_count) lowest_count = count_per_id->second; |
---|
477 | } |
---|
478 | printf("PS_calc_next_speciesid_sets() : TARGET count 0's [%i..%i]", lowest_count, highest_count); |
---|
479 | |
---|
480 | // third -- put all IDs in __TARGET_ID_SET that are needed most by species from __SOURCE_ID_SET |
---|
481 | __TARGET_ID_SET.clear(); |
---|
482 | for (ID2IDMapCIter count_per_id = count_falses_per_id.begin(); |
---|
483 | count_per_id != count_falses_per_id.end(); |
---|
484 | ++count_per_id) { |
---|
485 | if (count_per_id->second == highest_count) __TARGET_ID_SET.insert(count_per_id->first); |
---|
486 | } |
---|
487 | PS_print_set_ranges(" __TARGET_ID_SET", __TARGET_ID_SET); |
---|
488 | } |
---|
489 | |
---|
490 | void PS_apply_path_to_bitmap(IDSet &_path, const bool _silent = false, PS_BitMap_Counted *_map = NULp) { |
---|
491 | // set true in __MAP for all combinations of IDs (in _path , not in _path) |
---|
492 | |
---|
493 | // iterate over all IDs except path |
---|
494 | IDSetCIter path_iter = _path.begin(); |
---|
495 | SpeciesID next_path_ID = *path_iter; |
---|
496 | unsigned long int gain = 0; |
---|
497 | if (_map) { // called for a 'private' map ? |
---|
498 | for (SpeciesID not_in_path_ID = __MIN_ID; |
---|
499 | not_in_path_ID <= __MAX_ID; |
---|
500 | ++not_in_path_ID) { |
---|
501 | if (not_in_path_ID == next_path_ID) { // if i run into a ID in path |
---|
502 | ++path_iter; // advance to next path ID |
---|
503 | next_path_ID = (path_iter == _path.end()) ? -1 : *path_iter; |
---|
504 | continue; // skip this ID |
---|
505 | } |
---|
506 | // iterate over path IDs |
---|
507 | for (IDSetCIter path_ID = _path.begin(); |
---|
508 | path_ID != _path.end(); |
---|
509 | ++path_ID) { |
---|
510 | if (not_in_path_ID == *path_ID) continue; // obviously a probe cant differ a species from itself |
---|
511 | if (not_in_path_ID > *path_ID) { |
---|
512 | gain += !_map->set(not_in_path_ID, *path_ID, true); |
---|
513 | } |
---|
514 | else { |
---|
515 | gain += !_map->set(*path_ID, not_in_path_ID, true); |
---|
516 | } |
---|
517 | } |
---|
518 | } |
---|
519 | } |
---|
520 | else { // called for __MAP |
---|
521 | for (SpeciesID not_in_path_ID = __MIN_ID; |
---|
522 | not_in_path_ID <= __MAX_ID; |
---|
523 | ++not_in_path_ID) { |
---|
524 | if (not_in_path_ID == next_path_ID) { // if i run into a ID in path |
---|
525 | ++path_iter; // advance to next path ID |
---|
526 | next_path_ID = (path_iter == _path.end()) ? -1 : *path_iter; |
---|
527 | continue; // skip this ID |
---|
528 | } |
---|
529 | // iterate over path IDs |
---|
530 | for (IDSetCIter path_ID = _path.begin(); |
---|
531 | path_ID != _path.end(); |
---|
532 | ++path_ID) { |
---|
533 | if (not_in_path_ID == *path_ID) continue; // obviously a probe cant differ a species from itself |
---|
534 | if (not_in_path_ID > *path_ID) { |
---|
535 | gain += !__MAP->set(not_in_path_ID, *path_ID, true); |
---|
536 | } |
---|
537 | else { |
---|
538 | gain += !__MAP->set(*path_ID, not_in_path_ID, true); |
---|
539 | } |
---|
540 | } |
---|
541 | } |
---|
542 | } |
---|
543 | unsigned long int sets = (__SPECIES_COUNT-_path.size())*_path.size(); |
---|
544 | if (!_silent) printf("PS_apply_path_to_bitmap() : gain %lu of %lu -- %.2f%% -> wasted %lu\n", gain, sets, ((float)gain/sets)*100.0, sets-gain); |
---|
545 | } |
---|
546 | |
---|
547 | static float PS_filling_level(PS_CandidatePtr _candidate = NULp) { |
---|
548 | // returns filling level of __MAP |
---|
549 | |
---|
550 | unsigned long trues = __MAP->getCountOfTrues(); |
---|
551 | float percentage = ((float)trues / __BITS_IN_MAP)*100.0; |
---|
552 | if (_candidate) { |
---|
553 | _candidate->filling_level = percentage; |
---|
554 | _candidate->false_IDs = __BITS_IN_MAP - trues; |
---|
555 | } |
---|
556 | else { |
---|
557 | printf("PS_filling_level() : bitmap (%lu) now has %lu trues and %lu falses -- %.5f%% filled\n", __BITS_IN_MAP, trues, __BITS_IN_MAP-trues, percentage); |
---|
558 | } |
---|
559 | return percentage; |
---|
560 | } |
---|
561 | |
---|
562 | void PS_GNUPlot(const char *_out_prefix, const long _iteration, const IDSet &_path, const ID2IDSet &_noMatches) { |
---|
563 | // generate data- and commandfiles for GNUPlot to display __MAP and its count_trues_per_id |
---|
564 | |
---|
565 | char *buffer = (char *)malloc(4096); |
---|
566 | // open data file |
---|
567 | sprintf(buffer, "%s.%06li", _out_prefix, _iteration); |
---|
568 | char *data_name = strdup(buffer); |
---|
569 | PS_FileBuffer *file = new PS_FileBuffer(data_name, PS_FileBuffer::WRITEONLY); |
---|
570 | // output data |
---|
571 | long size = sprintf(buffer, "# CANDIDATE #%li path (%zu)\n", _iteration, _path.size()); |
---|
572 | file->put(buffer, size); |
---|
573 | for (IDSetCIter i = _path.begin(); |
---|
574 | i != _path.end(); |
---|
575 | ++i) { |
---|
576 | size = sprintf(buffer, "# %i\n", *i); |
---|
577 | file->put(buffer, size); |
---|
578 | } |
---|
579 | size = sprintf(buffer, "#noMatches (%zu)\n", _noMatches.size()); |
---|
580 | file->put(buffer, size); |
---|
581 | for (ID2IDSetCIter p = _noMatches.begin(); |
---|
582 | p != _noMatches.end(); |
---|
583 | ++p) { |
---|
584 | size = sprintf(buffer, "%i %i\n", p->first, p->second); |
---|
585 | file->put(buffer, size); |
---|
586 | } |
---|
587 | size = sprintf(buffer, "\n\n"); |
---|
588 | file->put(buffer, size); |
---|
589 | sprintf(buffer, "Bitmap after %li. candidate", _iteration); |
---|
590 | char *title = strdup(buffer); |
---|
591 | __MAP->printGNUplot(title, buffer, file); |
---|
592 | // open gnuplot command file |
---|
593 | sprintf(buffer, "%s.%06li.gnuplot", _out_prefix, _iteration); |
---|
594 | file->reinit(buffer, PS_FileBuffer::WRITEONLY); |
---|
595 | // output gnuplot commands |
---|
596 | size = sprintf(buffer, "set terminal png color\nset output '%06li.bitmap.png\n", _iteration); |
---|
597 | file->put(buffer, size); |
---|
598 | size = sprintf(buffer, "set yrange [%i:%i] reverse\nset xrange [%i:%i]\n", __MIN_ID, __MAX_ID, __MIN_ID, __MAX_ID); |
---|
599 | file->put(buffer, size); |
---|
600 | size = sprintf(buffer, "set size square\nset title '%li. iteration'\nset pointsize 0.5\n", _iteration); |
---|
601 | file->put(buffer, size); |
---|
602 | size = sprintf(buffer, "plot '%s' index 1 title 'match ()' with dots 2,'%s' index 0 title 'no match (30)' with points 1\n", data_name, data_name); |
---|
603 | file->put(buffer, size); |
---|
604 | size = sprintf(buffer, "set terminal png color\nset output '%06li.counters.png\n", _iteration); |
---|
605 | file->put(buffer, size); |
---|
606 | size = sprintf(buffer, "set yrange [] noreverse\nset size nosquare\n"); |
---|
607 | file->put(buffer, size); |
---|
608 | size = sprintf(buffer, "plot '%s' index 2 title 'count/species' with impulses 2,'%s' index 3 title 'species/count' with points 1", data_name, data_name); |
---|
609 | file->put(buffer, size); |
---|
610 | |
---|
611 | delete file; |
---|
612 | free(title); |
---|
613 | free(data_name); |
---|
614 | free(buffer); |
---|
615 | } |
---|
616 | |
---|
617 | static PS_CandidatePtr PS_ascend(PS_CandidatePtr _last_candidate) { |
---|
618 | // 'remove' _last_candidate from __MAP by applying its parent's |
---|
619 | // paths to a fresh __MAP |
---|
620 | |
---|
621 | // |
---|
622 | // make fresh __MAP |
---|
623 | // |
---|
624 | __MAP->copy(__PRESET_MAP); |
---|
625 | // |
---|
626 | // apply paths of parent-candidates to __MAP |
---|
627 | // |
---|
628 | PS_CandidatePtr parent = _last_candidate->parent; |
---|
629 | while (parent && !parent->path.empty()) { |
---|
630 | PS_apply_path_to_bitmap(parent->path, true); |
---|
631 | parent = parent->parent; |
---|
632 | } |
---|
633 | printf("ASCEND "); |
---|
634 | PS_filling_level(); |
---|
635 | // |
---|
636 | // return the parent of the given candidate |
---|
637 | // |
---|
638 | return _last_candidate->parent; |
---|
639 | } |
---|
640 | |
---|
641 | |
---|
642 | static void PS_descend(PS_CandidatePtr _candidate_parent, const PS_NodePtr& _root_node, unsigned long _depth, const float _filling_level) { |
---|
643 | struct tms before; |
---|
644 | times(&before); |
---|
645 | |
---|
646 | printf("\nDESCEND ==================== depth (%lu / %lu) candidates (%lu / %lu -> %lu left) ====================\n", |
---|
647 | _depth, |
---|
648 | __MAX_DEPTH, |
---|
649 | __CANDIDATES_FINISHED, |
---|
650 | __CANDIDATES_TODO, |
---|
651 | __CANDIDATES_TODO-__CANDIDATES_FINISHED); fflush(stdout); |
---|
652 | printf("DESCEND PATH : (%s)\n\n", __PATH_IN_CANDIDATES); |
---|
653 | |
---|
654 | // |
---|
655 | // calc source- and target-set of IDs |
---|
656 | // |
---|
657 | PS_calc_next_speciesid_sets(); |
---|
658 | printf("\nDESCEND ---------- calculated next speciesid-sets ----------\n\n"); fflush(stdout); |
---|
659 | |
---|
660 | // |
---|
661 | // search for candidates to improve filling level of __MAP using the source- and target-set of IDs |
---|
662 | // max. 3 rounds |
---|
663 | // |
---|
664 | int round = 0; |
---|
665 | for (; round<3 && _candidate_parent->children.empty(); ++round) { |
---|
666 | PS_find_probes(_root_node, round, _candidate_parent, _filling_level); |
---|
667 | } |
---|
668 | printf("\nDESCEND ---------- searched probe for speciesid-sets [rounds : %i] candidates (%lu / %lu -> %lu left) ----------\n\n", |
---|
669 | round, |
---|
670 | __CANDIDATES_FINISHED, |
---|
671 | __CANDIDATES_TODO, |
---|
672 | __CANDIDATES_TODO-__CANDIDATES_FINISHED); fflush(stdout); |
---|
673 | |
---|
674 | // |
---|
675 | // descend down each (of the max 3) candidate(s) |
---|
676 | // |
---|
677 | char count = '0'+_candidate_parent->children.size(); |
---|
678 | for (PS_CandidateByGainMapRIter next_candidate_it = _candidate_parent->children.rbegin(); |
---|
679 | next_candidate_it != _candidate_parent->children.rend(); |
---|
680 | ++next_candidate_it, --count) { |
---|
681 | PS_CandidatePtr next_candidate = &(*next_candidate_it->second); |
---|
682 | ++__CANDIDATES_FINISHED; |
---|
683 | // |
---|
684 | // apply candidate to __MAP |
---|
685 | // |
---|
686 | printf("[%p] ", next_candidate); |
---|
687 | PS_apply_path_to_bitmap(next_candidate->path); |
---|
688 | printf("[%p] ", next_candidate); |
---|
689 | PS_filling_level(next_candidate); |
---|
690 | next_candidate->depth = _depth+1; |
---|
691 | next_candidate->print(0, false, false); |
---|
692 | // |
---|
693 | // step down |
---|
694 | // |
---|
695 | if ((next_candidate->filling_level < 75.0) && (_depth+1 < __MAX_DEPTH)) { |
---|
696 | __PATH_IN_CANDIDATES[_depth] = count; |
---|
697 | __PATH_IN_CANDIDATES[_depth+1] = '\x00'; |
---|
698 | PS_descend(next_candidate, _root_node, _depth+1, next_candidate->filling_level); |
---|
699 | } |
---|
700 | // |
---|
701 | // remove candidate from __MAP |
---|
702 | // |
---|
703 | PS_ascend(next_candidate); |
---|
704 | } |
---|
705 | |
---|
706 | printf("\nDESCEND ~~~~~~~~~~~~~~~~~~~~ depth (%li) max depth (%li) candidates (%lu / %lu -> %lu left) ~~~~~~~~~~~~~~~~~~~~ ", |
---|
707 | _depth, |
---|
708 | __MAX_DEPTH, |
---|
709 | __CANDIDATES_FINISHED, |
---|
710 | __CANDIDATES_TODO, |
---|
711 | __CANDIDATES_TODO-__CANDIDATES_FINISHED); |
---|
712 | PS_print_time_diff(&before); |
---|
713 | printf("\n"); fflush(stdout); |
---|
714 | } |
---|
715 | |
---|
716 | static void PS_make_map_for_candidate(PS_CandidatePtr _candidate) { |
---|
717 | // make __MAP for _candidate |
---|
718 | |
---|
719 | if (_candidate->map) return; // if candidate already has its map return |
---|
720 | _candidate->map = new PS_BitMap_Counted(false, __MAX_ID+1); |
---|
721 | _candidate->map->copy(__PRESET_MAP); |
---|
722 | PS_apply_path_to_bitmap(_candidate->path, true, _candidate->map); // apply _candidate's path |
---|
723 | PS_CandidatePtr parent = _candidate->parent; |
---|
724 | while (parent && !parent->path.empty()) { // apply parent's paths |
---|
725 | PS_apply_path_to_bitmap(parent->path, true, _candidate->map); |
---|
726 | parent = parent->parent; |
---|
727 | } |
---|
728 | } |
---|
729 | |
---|
730 | |
---|
731 | void PS_get_leaf_candidates(PS_CandidatePtr _candidate_parent, |
---|
732 | PS_CandidateSet &_leaf_candidates, |
---|
733 | const bool _ignore_passes_left = false) { |
---|
734 | |
---|
735 | for (PS_CandidateByGainMapIter next_candidate = _candidate_parent->children.begin(); |
---|
736 | next_candidate != _candidate_parent->children.end(); |
---|
737 | ++next_candidate) { |
---|
738 | |
---|
739 | if ((next_candidate->second->children.size() == 0) && |
---|
740 | ((next_candidate->second->passes_left > 0) || _ignore_passes_left)) { |
---|
741 | _leaf_candidates.insert(next_candidate->second); |
---|
742 | } |
---|
743 | PS_get_leaf_candidates(&(*next_candidate->second), _leaf_candidates, _ignore_passes_left); |
---|
744 | } |
---|
745 | } |
---|
746 | |
---|
747 | void PS_get_next_candidates_descend(PS_NodePtr _ps_node, PS_CandidateSet &_leaf_candidates) { |
---|
748 | // scan PS_node-tree for next candidates for each of _leaf_candidates |
---|
749 | |
---|
750 | SpeciesID id = _ps_node->getNum(); |
---|
751 | bool has_probes = _ps_node->hasProbes(); |
---|
752 | |
---|
753 | // |
---|
754 | // append ID to path |
---|
755 | // |
---|
756 | __PATH.insert(id); |
---|
757 | |
---|
758 | // |
---|
759 | // don't look at path until ID is greater than lowest ID in the sets of IDs |
---|
760 | // also don't use a node if its already used as candidate |
---|
761 | // |
---|
762 | if ((id >= __MIN_SETS_ID) && has_probes) { |
---|
763 | unsigned long total_gain_of_node = 0; |
---|
764 | unsigned long total_tests_per_node = 0; |
---|
765 | ++__PROBES_COUNTER; |
---|
766 | // iterate over _leaf_candidates |
---|
767 | for (PS_CandidateSetIter candidate_iter = _leaf_candidates.begin(); |
---|
768 | candidate_iter != _leaf_candidates.end(); |
---|
769 | ++candidate_iter) { |
---|
770 | PS_CandidateSPtr candidate = *candidate_iter; |
---|
771 | |
---|
772 | // next leaf-candidate if the probe was already used for current candidate |
---|
773 | if (candidate->alreadyUsedNode(_ps_node)) continue; |
---|
774 | |
---|
775 | ++__CANDIDATES_COUNTER; // possible candidates |
---|
776 | if (__VERBOSE) printf("%8lup %8lur %8luc %8lug %8luu\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b", |
---|
777 | __PROBES_COUNTER, |
---|
778 | __PROBES_REMOVED, |
---|
779 | __CANDIDATES_COUNTER, |
---|
780 | __CANDIDATES_TODO, |
---|
781 | __CANDIDATES_FINISHED); fflush(stdout); |
---|
782 | |
---|
783 | // next leaf-candidate if __PATH doesn't fulfill matching criteria |
---|
784 | unsigned long matches = candidate->matchPathOnOneFalseIDs(__PATH); |
---|
785 | if (matches < candidate->one_false_IDs_matches) continue; |
---|
786 | |
---|
787 | ++__CANDIDATES_TODO; // good candidates |
---|
788 | if (__VERBOSE) printf("%8lup %8lur %8luc %8lug %8luu\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b", |
---|
789 | __PROBES_COUNTER, |
---|
790 | __PROBES_REMOVED, |
---|
791 | __CANDIDATES_COUNTER, |
---|
792 | __CANDIDATES_TODO, |
---|
793 | __CANDIDATES_FINISHED); fflush(stdout); |
---|
794 | |
---|
795 | // test node on candidate's bitmap |
---|
796 | float filling_level; |
---|
797 | ++total_tests_per_node; |
---|
798 | unsigned long gain = PS_test_candidate_on_bitmap(&filling_level, candidate->map); |
---|
799 | total_gain_of_node += gain; |
---|
800 | if (candidate->updateBestChild(gain, matches, filling_level, _ps_node, __PATH)) { |
---|
801 | ++__CANDIDATES_FINISHED; // used candiates |
---|
802 | if (__VERBOSE) printf("%8lup %8lur %8luc %8lug %8luu\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b", |
---|
803 | __PROBES_COUNTER, |
---|
804 | __PROBES_REMOVED, |
---|
805 | __CANDIDATES_COUNTER, |
---|
806 | __CANDIDATES_TODO, |
---|
807 | __CANDIDATES_FINISHED); fflush(stdout); |
---|
808 | } |
---|
809 | } |
---|
810 | if ((total_tests_per_node == _leaf_candidates.size()) && |
---|
811 | (total_gain_of_node == 0)) { |
---|
812 | ++__PROBES_REMOVED; |
---|
813 | if (__VERBOSE) printf("%8lup %8lur %8luc %8lug %8luu\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b", |
---|
814 | __PROBES_COUNTER, |
---|
815 | __PROBES_REMOVED, |
---|
816 | __CANDIDATES_COUNTER, |
---|
817 | __CANDIDATES_TODO, |
---|
818 | __CANDIDATES_FINISHED); fflush(stdout); |
---|
819 | _ps_node->removeProbes(); |
---|
820 | } |
---|
821 | } |
---|
822 | |
---|
823 | // |
---|
824 | // step down the children if either ID is lower than highest |
---|
825 | // ID in the set of ID-pairs or the node has no probes |
---|
826 | // |
---|
827 | if ((id < __MAX_SETS_ID) || (! has_probes)) { |
---|
828 | for (PS_NodeMapConstIterator i = _ps_node->getChildrenBegin(); |
---|
829 | i != _ps_node->getChildrenEnd(); |
---|
830 | ++i) { |
---|
831 | PS_get_next_candidates_descend(i->second, _leaf_candidates); |
---|
832 | } |
---|
833 | } |
---|
834 | |
---|
835 | // |
---|
836 | // remove ID from path |
---|
837 | // |
---|
838 | __PATH.erase(id); |
---|
839 | } |
---|
840 | |
---|
841 | |
---|
842 | void PS_get_next_candidates(const PS_NodePtr& _root_node, PS_CandidateSet& _leaf_candidates) { |
---|
843 | struct tms before; |
---|
844 | times(&before); |
---|
845 | // first calc source/target/false_IDs sets |
---|
846 | printf("PS_get_next_candidates() : initializing %zu candidates..\n", _leaf_candidates.size()); fflush(stdout); |
---|
847 | __PROBES_COUNTER = 0; |
---|
848 | __PROBES_REMOVED = 0; |
---|
849 | __CANDIDATES_COUNTER = 0; |
---|
850 | __CANDIDATES_TODO = 0; |
---|
851 | __CANDIDATES_FINISHED = 0; |
---|
852 | __MIN_SETS_ID = __MAX_ID; |
---|
853 | __MAX_SETS_ID = __MIN_ID; |
---|
854 | for (PS_CandidateSetIter iter = _leaf_candidates.begin(); |
---|
855 | iter != _leaf_candidates.end(); |
---|
856 | ++iter) { |
---|
857 | PS_CandidateSPtr candidate = *iter; |
---|
858 | if (!candidate->map) { // if candidate has no map yet |
---|
859 | candidate->getParentMap(); // try to get its parent's map |
---|
860 | if (candidate->map) { // if parent had a map (if not a new map is created in PS_calc_next_speciesid_sets_for_candidate |
---|
861 | PS_apply_path_to_bitmap(candidate->path, true, candidate->map); // apply candidate's path |
---|
862 | } |
---|
863 | } |
---|
864 | PS_make_map_for_candidate(&(*candidate)); |
---|
865 | candidate->initFalseIDs(__MIN_ID, __MAX_ID, __MIN_SETS_ID, __MAX_SETS_ID); |
---|
866 | } |
---|
867 | // scan tree |
---|
868 | printf("PS_get_next_candidates() : "); fflush(stdout); |
---|
869 | for (PS_NodeMapConstIterator node_iter = _root_node->getChildrenBegin(); |
---|
870 | (node_iter != _root_node->getChildrenEnd()) && (node_iter->first < __MAX_SETS_ID); |
---|
871 | ++node_iter) { |
---|
872 | __PATH.clear(); |
---|
873 | PS_get_next_candidates_descend(node_iter->second, _leaf_candidates); |
---|
874 | } |
---|
875 | printf("looked at probes (%lu) -> removed probes (%lu) possible candidates (%lu) -> good candidates (%lu) -> used candidates (%lu)\nPS_get_next_candidates() : ", |
---|
876 | __PROBES_COUNTER, |
---|
877 | __PROBES_REMOVED, |
---|
878 | __CANDIDATES_COUNTER, |
---|
879 | __CANDIDATES_TODO, |
---|
880 | __CANDIDATES_FINISHED); fflush(stdout); |
---|
881 | PS_print_time_diff(&before); |
---|
882 | |
---|
883 | // 'descend' candidates |
---|
884 | for (PS_CandidateSetIter iter = _leaf_candidates.begin(); |
---|
885 | iter != _leaf_candidates.end(); |
---|
886 | ++iter) { |
---|
887 | PS_CandidateSPtr candidate = *iter; |
---|
888 | candidate->print(); |
---|
889 | candidate->decreasePasses(); |
---|
890 | } |
---|
891 | |
---|
892 | _leaf_candidates.clear(); |
---|
893 | PS_get_leaf_candidates(__CANDIDATES_ROOT, _leaf_candidates); |
---|
894 | } |
---|
895 | |
---|
896 | // ==================================================== |
---|
897 | // ==================================================== |
---|
898 | int main(int argc, |
---|
899 | char *argv[]) { |
---|
900 | // |
---|
901 | // check arguments |
---|
902 | // |
---|
903 | if (argc < 5) { |
---|
904 | printf("Missing argument\n Usage %s <database name> <result filename> <[-]candidates filename> <final candidates filename> [--verbose] [result filename prefix for output files]\n", argv[0]); |
---|
905 | exit(1); |
---|
906 | } |
---|
907 | |
---|
908 | const char *input_DB_name = argv[1]; |
---|
909 | const char *result_in_filename = argv[2]; |
---|
910 | const char *candidates_filename = argv[3]; |
---|
911 | const char *final_candidates_filename = argv[4]; |
---|
912 | const char *result_out_prefix = (argc > 5) ? argv[5] : NULp; |
---|
913 | if (argc > 5) { |
---|
914 | if (strcmp(result_out_prefix, "--verbose") == 0) { |
---|
915 | __VERBOSE = true; |
---|
916 | result_out_prefix = (argc > 6) ? argv[6] : NULp; |
---|
917 | } |
---|
918 | } |
---|
919 | // |
---|
920 | // open probe-set-database |
---|
921 | // |
---|
922 | printf("Opening probe-set-database '%s'..\n", input_DB_name); |
---|
923 | PS_Database *db = new PS_Database(input_DB_name, PS_Database::READONLY); |
---|
924 | db->load(); |
---|
925 | __MAX_ID = db->getMaxID(); |
---|
926 | __MIN_ID = db->getMinID(); |
---|
927 | __SPECIES_COUNT = db->getSpeciesCount(); |
---|
928 | printf("min ID (%i) max ID (%i) count of species (%i)\n", __MIN_ID, __MAX_ID, __SPECIES_COUNT); |
---|
929 | printf("..loaded database (enter to continue)\n"); fflush(stdout); |
---|
930 | |
---|
931 | // |
---|
932 | // open result file for preset bitmap |
---|
933 | // |
---|
934 | printf("Opening result file %s..\n", result_in_filename); |
---|
935 | PS_FileBuffer *result_file = new PS_FileBuffer(result_in_filename, PS_FileBuffer::READONLY); |
---|
936 | long size; |
---|
937 | ID2IDSet noMatches; |
---|
938 | SpeciesID id1, id2; |
---|
939 | printf("reading no matches : "); |
---|
940 | result_file->get_long(size); |
---|
941 | printf("(%li)\n", size); fflush(stdout); |
---|
942 | for (long i=0; |
---|
943 | i < size; |
---|
944 | ++i) { |
---|
945 | result_file->get_int(id1); |
---|
946 | result_file->get_int(id2); |
---|
947 | printf(" %i,%i", id1, id2); |
---|
948 | if (id1 < id2) { |
---|
949 | noMatches.insert(ID2IDPair(id1, id2)); |
---|
950 | } |
---|
951 | else { |
---|
952 | noMatches.insert(ID2IDPair(id2, id1)); |
---|
953 | } |
---|
954 | } |
---|
955 | printf("\nreading one matches : "); |
---|
956 | result_file->get_long(size); |
---|
957 | printf("(%li)\n", size); fflush(stdout); |
---|
958 | long path_length; |
---|
959 | SpeciesID path_id; |
---|
960 | IDSet path; |
---|
961 | // IDID2IDSetMap oneMatchesMap; |
---|
962 | for (long i=0; |
---|
963 | i < size; |
---|
964 | ++i) { |
---|
965 | path.clear(); |
---|
966 | result_file->get_int(id1); |
---|
967 | result_file->get_int(id2); |
---|
968 | result_file->get_long(path_length); |
---|
969 | printf("%i,%i path(%li)", id1, id2, path_length); |
---|
970 | while (path_length-- > 0) { |
---|
971 | result_file->get_int(path_id); |
---|
972 | printf(" %i", path_id); |
---|
973 | path.insert(path_id); |
---|
974 | } |
---|
975 | printf("\n"); |
---|
976 | // oneMatchesMap[ID2IDPair(id1, id2)] = path; |
---|
977 | } |
---|
978 | printf("loading preset bitmap..\n"); fflush(stdout); |
---|
979 | __BITS_IN_MAP = ((__MAX_ID+1)*__MAX_ID)/2 + __MAX_ID+1; |
---|
980 | __MAP = new PS_BitMap_Counted(result_file); |
---|
981 | __PRESET_MAP = new PS_BitMap_Counted(false, __MAX_ID+1); |
---|
982 | for (id1 = 0; id1 < __MIN_ID; ++id1) { |
---|
983 | for (id2 = 0; id2 <= __MAX_ID; ++id2) { |
---|
984 | if (id1 > id2) { |
---|
985 | __MAP->setTrue(id1, id2); |
---|
986 | } |
---|
987 | else { |
---|
988 | __MAP->setTrue(id2, id1); |
---|
989 | } |
---|
990 | } |
---|
991 | } |
---|
992 | for (SpeciesID id = 0; id <= __MAX_ID; ++id) { |
---|
993 | __MAP->setTrue(id, id); |
---|
994 | } |
---|
995 | __MAP->recalcCounters(); |
---|
996 | __PRESET_MAP->copy(__MAP); |
---|
997 | printf("..loaded result file (enter to continue)\n"); |
---|
998 | printf("cleaning up... result file\n"); fflush(stdout); |
---|
999 | delete result_file; |
---|
1000 | |
---|
1001 | // |
---|
1002 | // create or read candidates |
---|
1003 | // |
---|
1004 | __CANDIDATES_ROOT = new PS_Candidate(0.0); |
---|
1005 | PS_FileBuffer *candidates_file; |
---|
1006 | struct tms before; |
---|
1007 | if (candidates_filename[0] != '-') { |
---|
1008 | printf("searching candidates..\n"); |
---|
1009 | // |
---|
1010 | // recursively build a tree of candidates |
---|
1011 | // |
---|
1012 | __MAX_DEPTH = 0; |
---|
1013 | for (long species_count = __SPECIES_COUNT; species_count > 0; species_count >>= 1) { |
---|
1014 | ++__MAX_DEPTH; |
---|
1015 | } |
---|
1016 | __MAX_DEPTH = (__MAX_DEPTH * 3) >> 1; |
---|
1017 | __PATH_IN_CANDIDATES = (char *)calloc(__MAX_DEPTH+1, sizeof(char)); |
---|
1018 | __CANDIDATES_TODO = 0; |
---|
1019 | __CANDIDATES_FINISHED = 0; |
---|
1020 | times(&before); |
---|
1021 | PS_descend(__CANDIDATES_ROOT, db->getConstRootNode(), 0, 0.0); |
---|
1022 | printf("PS_descend : total "); |
---|
1023 | PS_print_time_diff(&before); |
---|
1024 | |
---|
1025 | // |
---|
1026 | // save candidates |
---|
1027 | // |
---|
1028 | printf("saving candidates to %s..\n", candidates_filename); |
---|
1029 | candidates_file = new PS_FileBuffer(candidates_filename, PS_FileBuffer::WRITEONLY); |
---|
1030 | __CANDIDATES_ROOT->false_IDs = __BITS_IN_MAP; |
---|
1031 | __CANDIDATES_ROOT->save(candidates_file, __BITS_IN_MAP); |
---|
1032 | } |
---|
1033 | else { |
---|
1034 | printf("loading candidates..\n"); |
---|
1035 | candidates_file = new PS_FileBuffer(candidates_filename+1, PS_FileBuffer::READONLY); |
---|
1036 | __CANDIDATES_ROOT->load(candidates_file, __BITS_IN_MAP, db->getConstRootNode()); |
---|
1037 | printf("..loaded candidates file (enter to continue)\n"); |
---|
1038 | } |
---|
1039 | printf("cleaning up... candidates file\n"); fflush(stdout); |
---|
1040 | delete candidates_file; |
---|
1041 | |
---|
1042 | // |
---|
1043 | // scan candidates-tree for leaf candidates |
---|
1044 | // |
---|
1045 | printf("CANDIDATES :\n"); |
---|
1046 | __CANDIDATES_ROOT->print(); |
---|
1047 | printf("\nsearching leaf candidates.. "); |
---|
1048 | PS_CandidateSet leaf_candidates; |
---|
1049 | PS_get_leaf_candidates(__CANDIDATES_ROOT, leaf_candidates); |
---|
1050 | printf("%zu\n", leaf_candidates.size()); |
---|
1051 | |
---|
1052 | // |
---|
1053 | // scan tree for next candidates (below the ones in leaf_candidates) |
---|
1054 | // |
---|
1055 | times(&before); |
---|
1056 | long round = 0; |
---|
1057 | while (!leaf_candidates.empty() && round<200) { |
---|
1058 | printf("\nsearching next candidates [round #%li]..\n", ++round); |
---|
1059 | PS_get_next_candidates(db->getConstRootNode(), leaf_candidates); |
---|
1060 | |
---|
1061 | printf("rounds %li total ", round); |
---|
1062 | PS_print_time_diff(&before); |
---|
1063 | } |
---|
1064 | |
---|
1065 | printf("\nFINAL CANDIDATES:\n"); |
---|
1066 | __CANDIDATES_ROOT->print(0, true); |
---|
1067 | printf("\nfinal leaf candidates.. (%zu)\n", leaf_candidates.size()); |
---|
1068 | PS_get_leaf_candidates(__CANDIDATES_ROOT, leaf_candidates, true); |
---|
1069 | for (PS_CandidateSetIter c = leaf_candidates.begin(); |
---|
1070 | c != leaf_candidates.end(); |
---|
1071 | ++c) { |
---|
1072 | (*(*c)).print(); |
---|
1073 | } |
---|
1074 | |
---|
1075 | // |
---|
1076 | // save final candidates |
---|
1077 | // |
---|
1078 | printf("saving final candidates to %s..\n", final_candidates_filename); |
---|
1079 | candidates_file = new PS_FileBuffer(final_candidates_filename, PS_FileBuffer::WRITEONLY); |
---|
1080 | __CANDIDATES_ROOT->save(candidates_file, __BITS_IN_MAP); |
---|
1081 | printf("cleaning up... candidates file\n"); fflush(stdout); |
---|
1082 | delete candidates_file; |
---|
1083 | |
---|
1084 | // |
---|
1085 | // starting with the best candidate print the __MAP for each |
---|
1086 | // depth walking up to root of candidates-tree |
---|
1087 | // |
---|
1088 | if (result_out_prefix) { |
---|
1089 | // put __MAP in the state 'after applying best_candidate and its parents' |
---|
1090 | PS_CandidateSPtr best_candidate_smart = *leaf_candidates.begin(); |
---|
1091 | PS_CandidatePtr best_candidate = &(*best_candidate_smart); |
---|
1092 | PS_ascend(best_candidate); |
---|
1093 | PS_apply_path_to_bitmap(best_candidate->path); |
---|
1094 | while (best_candidate) { |
---|
1095 | PS_GNUPlot(result_out_prefix, __MAX_DEPTH--, best_candidate->path, noMatches); |
---|
1096 | best_candidate = PS_ascend(best_candidate); |
---|
1097 | } |
---|
1098 | } |
---|
1099 | |
---|
1100 | // |
---|
1101 | // clean up |
---|
1102 | // |
---|
1103 | printf("cleaning up... candidates\n"); fflush(stdout); |
---|
1104 | delete __CANDIDATES_ROOT; |
---|
1105 | free(__PATH_IN_CANDIDATES); |
---|
1106 | printf("cleaning up... database\n"); fflush(stdout); |
---|
1107 | delete db; |
---|
1108 | printf("cleaning up... bitmaps\n"); fflush(stdout); |
---|
1109 | delete __PRESET_MAP; |
---|
1110 | delete __MAP; |
---|
1111 | |
---|
1112 | return 0; |
---|
1113 | } |
---|