1 | // =============================================================== // |
---|
2 | // // |
---|
3 | // File : ps_find_probes.cxx // |
---|
4 | // Purpose : // |
---|
5 | // // |
---|
6 | // Coded by Wolfram Foerster in October 2002 // |
---|
7 | // Institute of Microbiology (Technical University Munich) // |
---|
8 | // http://www.arb-home.de/ // |
---|
9 | // // |
---|
10 | // =============================================================== // |
---|
11 | |
---|
12 | #include "ps_tools.hxx" |
---|
13 | #include "ps_database.hxx" |
---|
14 | #include "ps_candidate.hxx" |
---|
15 | |
---|
16 | #include <cmath> |
---|
17 | #include <iostream> |
---|
18 | #include <sys/times.h> |
---|
19 | |
---|
20 | using namespace std; |
---|
21 | |
---|
22 | template<class T> |
---|
23 | void PS_print_set_ranges(const char *_set_name, const set<T> &_set, const bool _cr_at_end = true) { |
---|
24 | fflush(stdout); |
---|
25 | printf("%s size (%3zu) : ", _set_name, _set.size()); |
---|
26 | if (_set.empty()) { |
---|
27 | printf("(empty)"); |
---|
28 | } |
---|
29 | else { |
---|
30 | typename set<T>::const_iterator range_begin = _set.begin(); |
---|
31 | typename set<T>::const_iterator range_end = range_begin; |
---|
32 | typename set<T>::const_iterator it = _set.begin(); |
---|
33 | for (++it; |
---|
34 | it != _set.end(); |
---|
35 | ++it) { |
---|
36 | if (*it == (*range_end)+1) { |
---|
37 | range_end = it; |
---|
38 | } |
---|
39 | else { |
---|
40 | if (range_end == range_begin) { |
---|
41 | cout << *range_begin << " "; |
---|
42 | } |
---|
43 | else { |
---|
44 | cout << *range_begin << "-" << *range_end << " "; |
---|
45 | } |
---|
46 | range_begin = it; |
---|
47 | range_end = it; |
---|
48 | } |
---|
49 | } |
---|
50 | cout << *range_begin; |
---|
51 | if (range_end != range_begin) cout << "-" << *range_end; |
---|
52 | } |
---|
53 | if (_cr_at_end) cout << "\n"; |
---|
54 | fflush(stdout); |
---|
55 | } |
---|
56 | |
---|
57 | |
---|
58 | template<class T1, class T2> |
---|
59 | void PS_print_map_ranges(const char *_map_name, const map<T1, T2> &_map, const bool _compare_keys = true, const bool _cr_at_end = true) { |
---|
60 | fflush(stdout); |
---|
61 | if (_map.empty()) { |
---|
62 | printf("%s size ( 0) : (empty)", _map_name); |
---|
63 | } |
---|
64 | else { |
---|
65 | if (_compare_keys) { |
---|
66 | printf("%s size (%3i) : ", _map_name, _map.size()); |
---|
67 | typename map<T1, T2>::const_iterator range_begin = _map.begin(); |
---|
68 | typename map<T1, T2>::const_iterator range_end = range_begin; |
---|
69 | typename map<T1, T2>::const_iterator it = _map.begin(); |
---|
70 | for (++it; |
---|
71 | it != _map.end(); |
---|
72 | ++it) { |
---|
73 | if (it->first == range_end->first+1) { |
---|
74 | range_end = it; |
---|
75 | } |
---|
76 | else { |
---|
77 | if (range_end == range_begin) { |
---|
78 | cout << "(" << range_begin->first << "," << range_begin->second << ") "; |
---|
79 | } |
---|
80 | else { |
---|
81 | cout << "(" << range_begin->first << "-" << range_end->first << "," << range_begin->second << ") "; |
---|
82 | } |
---|
83 | range_begin = it; |
---|
84 | range_end = it; |
---|
85 | } |
---|
86 | } |
---|
87 | if (range_end == range_begin) { |
---|
88 | cout << "(" << range_begin->first << "," << range_begin->second << ") "; |
---|
89 | } |
---|
90 | else { |
---|
91 | cout << "(" << range_begin->first << "-" << range_end->first << "," << range_begin->second << ") "; |
---|
92 | } |
---|
93 | } |
---|
94 | else { |
---|
95 | map<T2, set<pair<T1, T1> > > value2indices; |
---|
96 | map<T2, unsigned long int> value2count; |
---|
97 | pair<T1, T1> range; |
---|
98 | range.first = _map.begin()->first; |
---|
99 | range.second = range.first; |
---|
100 | T2 cur_value = _map.begin()->second; |
---|
101 | typename map<T1, T2>::const_iterator it = _map.begin(); |
---|
102 | for (++it; |
---|
103 | it != _map.end(); |
---|
104 | ++it) { |
---|
105 | if (it->second == cur_value) { |
---|
106 | if (it->first == range.second+1) { |
---|
107 | range.second = it->first; |
---|
108 | } |
---|
109 | else { |
---|
110 | value2indices[cur_value].insert(range); |
---|
111 | value2count[cur_value] += range.second - range.first + 1; |
---|
112 | range.first = it->first; |
---|
113 | range.second = it->first; |
---|
114 | } |
---|
115 | } |
---|
116 | else { |
---|
117 | value2indices[cur_value].insert(range); |
---|
118 | value2count[cur_value] += range.second - range.first + 1; |
---|
119 | range.first = it->first; |
---|
120 | range.second = it->first; |
---|
121 | cur_value = it->second; |
---|
122 | } |
---|
123 | } |
---|
124 | value2indices[cur_value].insert(range); |
---|
125 | value2count[cur_value] += range.second - range.first + 1; |
---|
126 | for (typename map<T2, set<pair<T1, T1> > >::const_iterator value = value2indices.begin(); |
---|
127 | value != value2indices.end(); |
---|
128 | ++value) { |
---|
129 | if (value != value2indices.begin()) cout << "\n"; |
---|
130 | printf("%s size (%3i) : value (", _map_name, _map.size()); |
---|
131 | cout << value->first << ") count (" << value2count[value->first] << ") ["; |
---|
132 | for (typename set<pair<T1, T1> >::const_iterator indices = value->second.begin(); |
---|
133 | indices != value->second.end(); |
---|
134 | ++indices) { |
---|
135 | if (indices != value->second.begin()) cout << " "; |
---|
136 | cout << indices->first; |
---|
137 | if (indices->first != indices->second) cout << "-" << indices->second; |
---|
138 | } |
---|
139 | cout << "]," << value->first << ")"; |
---|
140 | } |
---|
141 | } |
---|
142 | } |
---|
143 | if (_cr_at_end) cout << "\n"; |
---|
144 | fflush(stdout); |
---|
145 | } |
---|
146 | |
---|
147 | // |
---|
148 | // common globals |
---|
149 | // |
---|
150 | |
---|
151 | static bool __VERBOSE = false; |
---|
152 | static SpeciesID __MAX_ID; |
---|
153 | static SpeciesID __MIN_ID; |
---|
154 | static SpeciesID __SPECIES_COUNT; |
---|
155 | static unsigned long __BITS_IN_MAP; |
---|
156 | static PS_BitMap_Counted *__MAP; |
---|
157 | static PS_BitMap_Counted *__PRESET_MAP; |
---|
158 | static IDSet __SOURCE_ID_SET; |
---|
159 | static PS_BitSet::IndexSet __TARGET_ID_SET; |
---|
160 | |
---|
161 | // |
---|
162 | // globals for PS_calc_next_speciesid_sets, |
---|
163 | // PS_calc_next_speciesid_sets_for_candidate |
---|
164 | // |
---|
165 | |
---|
166 | // count of trues for a source_id may be max 5% |
---|
167 | // (of the difference between __SPECIES_COUNT and the lowest |
---|
168 | // count of trues per species) higher than the lowest count |
---|
169 | // of trues for the species in __SOURCE_ID_SET |
---|
170 | #define __THRESHOLD_PERCENTAGE_NEXT_SOURCE_ID_SET 5 |
---|
171 | // target_id must be at least in 95% of total target_id_sets |
---|
172 | // to be in __TARGET_ID_SET |
---|
173 | #define __THRESHOLD_PERCENTAGE_NEXT_TARGET_ID_SET 95 |
---|
174 | |
---|
175 | static SpeciesID __MIN_SETS_ID; |
---|
176 | static SpeciesID __MAX_SETS_ID; |
---|
177 | |
---|
178 | // |
---|
179 | // globals for PS_find_probes |
---|
180 | // |
---|
181 | |
---|
182 | // initially a probe must match 40-60% of species |
---|
183 | // in the ID-sets to be a candidate |
---|
184 | // if PS_find_probes fails to find candidates within this range |
---|
185 | // it is repeated with a broadened range (by 5% each border) |
---|
186 | #define __MIN_PERCENTAGE_SET_MATCH 40 |
---|
187 | #define __MAX_PERCENTAGE_SET_MATCH 60 |
---|
188 | |
---|
189 | static float __SOURCE_MIN_MATCH_COUNT; |
---|
190 | static float __SOURCE_MAX_MATCH_COUNT; |
---|
191 | static float __TARGET_MIN_MATCH_COUNT; |
---|
192 | static float __TARGET_MAX_MATCH_COUNT; |
---|
193 | static float __SOURCE_PERFECT_MATCH_COUNT; |
---|
194 | static float __TARGET_PERFECT_MATCH_COUNT; |
---|
195 | static unsigned long __PROBES_COUNTER; |
---|
196 | static unsigned long __PROBES_REMOVED; |
---|
197 | static PS_CandidatePtr __CANDIDATES_ROOT; |
---|
198 | static unsigned long __CANDIDATES_COUNTER; |
---|
199 | static unsigned long __CANDIDATES_TODO; |
---|
200 | static unsigned long __CANDIDATES_FINISHED; |
---|
201 | static unsigned long __MAX_DEPTH; |
---|
202 | |
---|
203 | // |
---|
204 | // globals for PS_find_probe_for_sets |
---|
205 | // PS_test_candidate_on_bitmap |
---|
206 | // |
---|
207 | |
---|
208 | static IDSet __PATH; |
---|
209 | |
---|
210 | // |
---|
211 | // globals for PS_descend |
---|
212 | // |
---|
213 | static char *__PATH_IN_CANDIDATES; |
---|
214 | |
---|
215 | |
---|
216 | static unsigned long int PS_test_candidate_on_bitmap(float *_filling_level = 0, |
---|
217 | PS_BitMap_Counted *_map = 0) { |
---|
218 | // returns number of locations in __MAP that would be switched |
---|
219 | // from false to true by __PATH |
---|
220 | |
---|
221 | // iterate over all IDs except path |
---|
222 | IDSetCIter path_iter = __PATH.begin(); |
---|
223 | SpeciesID next_path_ID = *path_iter; |
---|
224 | unsigned long int gain = 0; |
---|
225 | if (_map) { |
---|
226 | for (SpeciesID not_in_path_ID = __MIN_ID; |
---|
227 | not_in_path_ID <= __MAX_ID; |
---|
228 | ++not_in_path_ID) { |
---|
229 | if (not_in_path_ID == next_path_ID) { // if i run into a ID in path |
---|
230 | ++path_iter; // advance to next path ID |
---|
231 | next_path_ID = (path_iter == __PATH.end()) ? -1 : *path_iter; |
---|
232 | continue; // skip this ID |
---|
233 | } |
---|
234 | // iterate over path IDs |
---|
235 | for (IDSetCIter path_ID = __PATH.begin(); |
---|
236 | path_ID != __PATH.end(); |
---|
237 | ++path_ID) { |
---|
238 | if (not_in_path_ID == *path_ID) |
---|
239 | continue; // obviously a probe cant differ a species from itself |
---|
240 | if (not_in_path_ID > *path_ID) { |
---|
241 | gain += !_map->get(not_in_path_ID, *path_ID); |
---|
242 | } |
---|
243 | else { |
---|
244 | gain += !_map->get(*path_ID, not_in_path_ID); |
---|
245 | } |
---|
246 | } |
---|
247 | } |
---|
248 | } |
---|
249 | else { |
---|
250 | for (SpeciesID not_in_path_ID = __MIN_ID; |
---|
251 | not_in_path_ID <= __MAX_ID; |
---|
252 | ++not_in_path_ID) { |
---|
253 | if (not_in_path_ID == next_path_ID) { // if i run into a ID in path |
---|
254 | ++path_iter; // advance to next path ID |
---|
255 | next_path_ID = (path_iter == __PATH.end()) ? -1 : *path_iter; |
---|
256 | continue; // skip this ID |
---|
257 | } |
---|
258 | // iterate over path IDs |
---|
259 | for (IDSetCIter path_ID = __PATH.begin(); |
---|
260 | path_ID != __PATH.end(); |
---|
261 | ++path_ID) { |
---|
262 | if (not_in_path_ID == *path_ID) |
---|
263 | continue; // obviously a probe cant differ a species from itself |
---|
264 | if (not_in_path_ID > *path_ID) { |
---|
265 | gain += !__MAP->get(not_in_path_ID, *path_ID); |
---|
266 | } |
---|
267 | else { |
---|
268 | gain += !__MAP->get(*path_ID, not_in_path_ID); |
---|
269 | } |
---|
270 | } |
---|
271 | } |
---|
272 | } |
---|
273 | if (_filling_level) { |
---|
274 | unsigned long int trues = (_map) ? _map->getCountOfTrues() + gain : __MAP->getCountOfTrues() + gain; |
---|
275 | *_filling_level = ((float)trues / __BITS_IN_MAP)*100.0; |
---|
276 | } |
---|
277 | return gain; |
---|
278 | } |
---|
279 | |
---|
280 | |
---|
281 | static bool PS_test_sets_on_path(float &_distance) { |
---|
282 | long count_matched_source = 0; |
---|
283 | long count_matched_target = 0; |
---|
284 | SpeciesID path_id; |
---|
285 | IDSetCIter path_end = __PATH.end(); |
---|
286 | IDSetCIter path = __PATH.begin(); |
---|
287 | IDSetCIter source_end = __SOURCE_ID_SET.end(); |
---|
288 | IDSetCIter source = __SOURCE_ID_SET.begin(); |
---|
289 | PS_BitSet::IndexSet::const_iterator target_end = __TARGET_ID_SET.end(); |
---|
290 | PS_BitSet::IndexSet::const_iterator target = __TARGET_ID_SET.begin(); |
---|
291 | |
---|
292 | while (path != path_end) { |
---|
293 | path_id = *path; |
---|
294 | while ((*target < path_id) && (target != target_end)) { |
---|
295 | ++target; |
---|
296 | } |
---|
297 | if ((target != target_end) && (*target == path_id)) |
---|
298 | ++count_matched_target; |
---|
299 | while ((*source < path_id) && (source != source_end)) { |
---|
300 | ++source; |
---|
301 | } |
---|
302 | if ((source != source_end) && (*source == path_id)) |
---|
303 | ++count_matched_source; |
---|
304 | ++path; |
---|
305 | } |
---|
306 | |
---|
307 | if ((count_matched_source > __SOURCE_MIN_MATCH_COUNT) && |
---|
308 | (count_matched_source < __SOURCE_MAX_MATCH_COUNT) && |
---|
309 | (count_matched_target > __TARGET_MIN_MATCH_COUNT) && |
---|
310 | (count_matched_target < __TARGET_MAX_MATCH_COUNT)) { |
---|
311 | _distance = fabs(__SOURCE_PERFECT_MATCH_COUNT - count_matched_source) + fabs(__TARGET_PERFECT_MATCH_COUNT - count_matched_target); |
---|
312 | return true; |
---|
313 | } |
---|
314 | return false; |
---|
315 | } |
---|
316 | |
---|
317 | static void PS_find_probe_for_sets(const PS_NodePtr& _ps_node, PS_CandidatePtr _candidate_parent) { |
---|
318 | // scan subtree starting with _ps_node for candidates |
---|
319 | |
---|
320 | SpeciesID id = _ps_node->getNum(); |
---|
321 | bool has_probes = _ps_node->hasProbes(); |
---|
322 | |
---|
323 | // |
---|
324 | // append ID to path |
---|
325 | // |
---|
326 | __PATH.insert(id); |
---|
327 | |
---|
328 | // |
---|
329 | // don't look at path until ID is greater than lowest ID in the sets of IDs |
---|
330 | // also don't use a node if its already used as candidate |
---|
331 | // |
---|
332 | if ((id >= __MIN_SETS_ID) && has_probes && !_candidate_parent->alreadyUsedNode(_ps_node)) { |
---|
333 | ++__PROBES_COUNTER; |
---|
334 | if (__VERBOSE && (__PROBES_COUNTER % 100 == 0)) { |
---|
335 | printf("%8lup %8luc\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b", __PROBES_COUNTER, __CANDIDATES_COUNTER); |
---|
336 | fflush(stdout); |
---|
337 | } |
---|
338 | // make intersections of __PATH with __SOURCE_ID_SET and __TARGET_ID_SET |
---|
339 | float distance_to_perfect_match; |
---|
340 | if (PS_test_sets_on_path(distance_to_perfect_match)) { |
---|
341 | unsigned long gain = PS_test_candidate_on_bitmap(); // no -> calc gain and make new candidate node |
---|
342 | int status = (gain < (unsigned)__SPECIES_COUNT / 100) |
---|
343 | ? 0 |
---|
344 | : _candidate_parent->addChild(static_cast<unsigned long>(distance_to_perfect_match), gain, _ps_node, __PATH); |
---|
345 | if (status > 0) { |
---|
346 | if (status == 2) { |
---|
347 | ++__CANDIDATES_COUNTER; |
---|
348 | if (__VERBOSE) { |
---|
349 | printf("%8lup %8luc\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b", __PROBES_COUNTER, __CANDIDATES_COUNTER); |
---|
350 | fflush(stdout); |
---|
351 | } |
---|
352 | } |
---|
353 | fflush(stdout); |
---|
354 | } |
---|
355 | } |
---|
356 | } |
---|
357 | |
---|
358 | // |
---|
359 | // step down the children if either ID is lower than highest |
---|
360 | // ID in the set of ID-pairs or the node has no probes |
---|
361 | // |
---|
362 | if ((id < __MAX_SETS_ID) || (! has_probes)) { |
---|
363 | for (PS_NodeMapConstIterator i = _ps_node->getChildrenBegin(); |
---|
364 | i != _ps_node->getChildrenEnd(); |
---|
365 | ++i) { |
---|
366 | PS_find_probe_for_sets(i->second, _candidate_parent); |
---|
367 | } |
---|
368 | } |
---|
369 | |
---|
370 | // |
---|
371 | // remove ID from path |
---|
372 | // |
---|
373 | __PATH.erase(id); |
---|
374 | } |
---|
375 | |
---|
376 | static void PS_find_probes(const PS_NodePtr& _root_node, const int _round, PS_CandidatePtr _candidate_parent, const float _filling_level) { |
---|
377 | // scan PS_Node-tree for candidates to raise filling level of __MAP |
---|
378 | |
---|
379 | __PROBES_COUNTER = 0; |
---|
380 | __CANDIDATES_COUNTER = 0; |
---|
381 | __SOURCE_MIN_MATCH_COUNT = (__SOURCE_ID_SET.size() * (__MIN_PERCENTAGE_SET_MATCH - (_round * 5))) / 100.0; |
---|
382 | __SOURCE_MAX_MATCH_COUNT = (__SOURCE_ID_SET.size() * (__MAX_PERCENTAGE_SET_MATCH + (_round * 5))) / 100.0; |
---|
383 | __TARGET_MIN_MATCH_COUNT = (__TARGET_ID_SET.size() * (__MIN_PERCENTAGE_SET_MATCH - (_round * 5))) / 100.0; |
---|
384 | __TARGET_MAX_MATCH_COUNT = (__TARGET_ID_SET.size() * (__MAX_PERCENTAGE_SET_MATCH + (_round * 5))) / 100.0; |
---|
385 | __SOURCE_PERFECT_MATCH_COUNT = __SOURCE_ID_SET.size() / 2.0; |
---|
386 | __TARGET_PERFECT_MATCH_COUNT = __TARGET_ID_SET.size() / 2.0; |
---|
387 | __MIN_SETS_ID = (*__SOURCE_ID_SET.begin() < *__TARGET_ID_SET.begin()) ? *__SOURCE_ID_SET.begin() : *__TARGET_ID_SET.begin(); |
---|
388 | __MAX_SETS_ID = (*__SOURCE_ID_SET.rbegin() < *__TARGET_ID_SET.rbegin()) ? *__SOURCE_ID_SET.rbegin() : *__TARGET_ID_SET.rbegin(); |
---|
389 | printf("PS_find_probes(%i) : source match %10.3f .. %10.3f target match %10.3f .. %10.3f\n", _round, __SOURCE_MIN_MATCH_COUNT, __SOURCE_MAX_MATCH_COUNT, __TARGET_MIN_MATCH_COUNT, __TARGET_MAX_MATCH_COUNT); fflush(stdout); |
---|
390 | struct tms before; |
---|
391 | times(&before); |
---|
392 | for (PS_NodeMapConstIterator i = _root_node->getChildrenBegin(); |
---|
393 | (i != _root_node->getChildrenEnd()) && (i->first < __MAX_SETS_ID); |
---|
394 | ++i) |
---|
395 | { |
---|
396 | __PATH.clear(); |
---|
397 | PS_find_probe_for_sets(i->second, _candidate_parent); |
---|
398 | } |
---|
399 | printf("PS_find_probes(%i) : %li candidates of %li probes visited ", _round, __CANDIDATES_COUNTER, __PROBES_COUNTER); |
---|
400 | PS_print_time_diff(&before); |
---|
401 | // reduce candidates to best,worst and middle gain |
---|
402 | _candidate_parent->reduceChildren(_filling_level); |
---|
403 | for (PS_CandidateByGainMapCIter c = _candidate_parent->children.begin(); |
---|
404 | c != _candidate_parent->children.end(); |
---|
405 | ++c) { |
---|
406 | printf("PS_find_probes(%i) : ", _round); |
---|
407 | c->second->print(); |
---|
408 | ++__CANDIDATES_TODO; |
---|
409 | } |
---|
410 | fflush(stdout); |
---|
411 | } |
---|
412 | |
---|
413 | static void PS_calc_next_speciesid_sets() { |
---|
414 | // scan __MAP to find sets of IDs that need differentiation most |
---|
415 | |
---|
416 | // |
---|
417 | // 1. __SOURCE_ID_SET |
---|
418 | // scan bitmap for species that need more matches |
---|
419 | // |
---|
420 | SpeciesID highest_count; |
---|
421 | SpeciesID lowest_count; |
---|
422 | float threshold; |
---|
423 | SpeciesID count; |
---|
424 | |
---|
425 | // first pass -- get lowest count of trues and calc threshold |
---|
426 | lowest_count = __SPECIES_COUNT; |
---|
427 | highest_count = 0; |
---|
428 | for (SpeciesID id = __MIN_ID; |
---|
429 | id <= __MAX_ID; |
---|
430 | ++id) { |
---|
431 | count = __MAP->getCountFor(id); |
---|
432 | if (count == __SPECIES_COUNT-__MIN_ID-1) continue; // i cannot improve species that can be differed from all others |
---|
433 | if (count > highest_count) highest_count = count; |
---|
434 | if (count < lowest_count) lowest_count = count; |
---|
435 | } |
---|
436 | threshold = (((__SPECIES_COUNT-lowest_count) * __THRESHOLD_PERCENTAGE_NEXT_SOURCE_ID_SET) / 100.0) + lowest_count; |
---|
437 | printf("PS_calc_next_speciesid_sets() : SOURCE count 1's [%i..%i] threshold (%.3f)", lowest_count, highest_count, threshold); |
---|
438 | |
---|
439 | // second pass -- get IDs where count is below or equal threshold |
---|
440 | __SOURCE_ID_SET.clear(); |
---|
441 | for (SpeciesID id = __MIN_ID; |
---|
442 | id <= __MAX_ID; |
---|
443 | ++id) { |
---|
444 | if (__MAP->getCountFor(id) <= threshold) __SOURCE_ID_SET.insert(id); |
---|
445 | } |
---|
446 | PS_print_set_ranges(" __SOURCE_ID_SET", __SOURCE_ID_SET); |
---|
447 | |
---|
448 | // |
---|
449 | // 2. __TARGET_ID_SET |
---|
450 | // scan bitmap for species IDs that need to be distinguished from MOST species in __SOURCE_ID_SET |
---|
451 | // |
---|
452 | ID2IDMap count_falses_per_id; |
---|
453 | |
---|
454 | // first -- get the IDs that need differentiation from __SOURCE_ID_SET |
---|
455 | unsigned long count_iterations = 0; |
---|
456 | for (IDSetCIter source_id = __SOURCE_ID_SET.begin(); |
---|
457 | source_id != __SOURCE_ID_SET.end(); |
---|
458 | ++source_id, ++count_iterations) { |
---|
459 | // get 'next_target_set' |
---|
460 | __MAP->getFalseIndicesFor(*source_id, __TARGET_ID_SET); |
---|
461 | // count falses per SpeciesID |
---|
462 | for (PS_BitSet::IndexSet::iterator target_id = __TARGET_ID_SET.begin(); |
---|
463 | target_id != __TARGET_ID_SET.end(); |
---|
464 | ++target_id) { |
---|
465 | if ((*target_id < __MIN_ID) || (*target_id > __MAX_ID)) continue; // skip ID's that are outside DB-IDs-range (like zero if __MIN_ID is one) |
---|
466 | if (*target_id != *source_id) ++count_falses_per_id[*target_id]; |
---|
467 | } |
---|
468 | } |
---|
469 | |
---|
470 | // second -- get highest count of falses and calc threshold |
---|
471 | lowest_count = __SPECIES_COUNT; |
---|
472 | highest_count = 0; |
---|
473 | for (ID2IDMapCIter count_per_id = count_falses_per_id.begin(); |
---|
474 | count_per_id != count_falses_per_id.end(); |
---|
475 | ++count_per_id) { |
---|
476 | if (count_per_id->second > highest_count) highest_count = count_per_id->second; |
---|
477 | if (count_per_id->second < lowest_count) lowest_count = count_per_id->second; |
---|
478 | } |
---|
479 | printf("PS_calc_next_speciesid_sets() : TARGET count 0's [%i..%i]", lowest_count, highest_count); |
---|
480 | |
---|
481 | // third -- put all IDs in __TARGET_ID_SET that are needed most by species from __SOURCE_ID_SET |
---|
482 | __TARGET_ID_SET.clear(); |
---|
483 | for (ID2IDMapCIter count_per_id = count_falses_per_id.begin(); |
---|
484 | count_per_id != count_falses_per_id.end(); |
---|
485 | ++count_per_id) { |
---|
486 | if (count_per_id->second == highest_count) __TARGET_ID_SET.insert(count_per_id->first); |
---|
487 | } |
---|
488 | PS_print_set_ranges(" __TARGET_ID_SET", __TARGET_ID_SET); |
---|
489 | } |
---|
490 | |
---|
491 | void PS_apply_path_to_bitmap(IDSet &_path, const bool _silent = false, PS_BitMap_Counted *_map = 0) { |
---|
492 | // set true in __MAP for all combinations of IDs (in _path , not in _path) |
---|
493 | |
---|
494 | // iterate over all IDs except path |
---|
495 | IDSetCIter path_iter = _path.begin(); |
---|
496 | SpeciesID next_path_ID = *path_iter; |
---|
497 | unsigned long int gain = 0; |
---|
498 | if (_map) { // called for a 'private' map ? |
---|
499 | for (SpeciesID not_in_path_ID = __MIN_ID; |
---|
500 | not_in_path_ID <= __MAX_ID; |
---|
501 | ++not_in_path_ID) { |
---|
502 | if (not_in_path_ID == next_path_ID) { // if i run into a ID in path |
---|
503 | ++path_iter; // advance to next path ID |
---|
504 | next_path_ID = (path_iter == _path.end()) ? -1 : *path_iter; |
---|
505 | continue; // skip this ID |
---|
506 | } |
---|
507 | // iterate over path IDs |
---|
508 | for (IDSetCIter path_ID = _path.begin(); |
---|
509 | path_ID != _path.end(); |
---|
510 | ++path_ID) { |
---|
511 | if (not_in_path_ID == *path_ID) continue; // obviously a probe cant differ a species from itself |
---|
512 | if (not_in_path_ID > *path_ID) { |
---|
513 | gain += !_map->set(not_in_path_ID, *path_ID, true); |
---|
514 | } |
---|
515 | else { |
---|
516 | gain += !_map->set(*path_ID, not_in_path_ID, true); |
---|
517 | } |
---|
518 | } |
---|
519 | } |
---|
520 | } |
---|
521 | else { // called for __MAP |
---|
522 | for (SpeciesID not_in_path_ID = __MIN_ID; |
---|
523 | not_in_path_ID <= __MAX_ID; |
---|
524 | ++not_in_path_ID) { |
---|
525 | if (not_in_path_ID == next_path_ID) { // if i run into a ID in path |
---|
526 | ++path_iter; // advance to next path ID |
---|
527 | next_path_ID = (path_iter == _path.end()) ? -1 : *path_iter; |
---|
528 | continue; // skip this ID |
---|
529 | } |
---|
530 | // iterate over path IDs |
---|
531 | for (IDSetCIter path_ID = _path.begin(); |
---|
532 | path_ID != _path.end(); |
---|
533 | ++path_ID) { |
---|
534 | if (not_in_path_ID == *path_ID) continue; // obviously a probe cant differ a species from itself |
---|
535 | if (not_in_path_ID > *path_ID) { |
---|
536 | gain += !__MAP->set(not_in_path_ID, *path_ID, true); |
---|
537 | } |
---|
538 | else { |
---|
539 | gain += !__MAP->set(*path_ID, not_in_path_ID, true); |
---|
540 | } |
---|
541 | } |
---|
542 | } |
---|
543 | } |
---|
544 | unsigned long int sets = (__SPECIES_COUNT-_path.size())*_path.size(); |
---|
545 | if (!_silent) printf("PS_apply_path_to_bitmap() : gain %lu of %lu -- %.2f%% -> wasted %lu\n", gain, sets, ((float)gain/sets)*100.0, sets-gain); |
---|
546 | } |
---|
547 | |
---|
548 | static float PS_filling_level(PS_CandidatePtr _candidate = 0) { |
---|
549 | // returns filling level of __MAP |
---|
550 | |
---|
551 | unsigned long trues = __MAP->getCountOfTrues(); |
---|
552 | float percentage = ((float)trues / __BITS_IN_MAP)*100.0; |
---|
553 | if (_candidate) { |
---|
554 | _candidate->filling_level = percentage; |
---|
555 | _candidate->false_IDs = __BITS_IN_MAP - trues; |
---|
556 | } |
---|
557 | else { |
---|
558 | printf("PS_filling_level() : bitmap (%lu) now has %lu trues and %lu falses -- %.5f%% filled\n", __BITS_IN_MAP, trues, __BITS_IN_MAP-trues, percentage); |
---|
559 | } |
---|
560 | return percentage; |
---|
561 | } |
---|
562 | |
---|
563 | void PS_GNUPlot(const char *_out_prefix, const long _iteration, const IDSet &_path, const ID2IDSet &_noMatches) { |
---|
564 | // generate data- and commandfiles for GNUPlot to display __MAP and its count_trues_per_id |
---|
565 | |
---|
566 | char *buffer = (char *)malloc(4096); |
---|
567 | // open data file |
---|
568 | sprintf(buffer, "%s.%06li", _out_prefix, _iteration); |
---|
569 | char *data_name = strdup(buffer); |
---|
570 | PS_FileBuffer *file = new PS_FileBuffer(data_name, PS_FileBuffer::WRITEONLY); |
---|
571 | // output data |
---|
572 | long size = sprintf(buffer, "# CANDIDATE #%li path (%zu)\n", _iteration, _path.size()); |
---|
573 | file->put(buffer, size); |
---|
574 | for (IDSetCIter i = _path.begin(); |
---|
575 | i != _path.end(); |
---|
576 | ++i) { |
---|
577 | size = sprintf(buffer, "# %i\n", *i); |
---|
578 | file->put(buffer, size); |
---|
579 | } |
---|
580 | size = sprintf(buffer, "#noMatches (%zu)\n", _noMatches.size()); |
---|
581 | file->put(buffer, size); |
---|
582 | for (ID2IDSetCIter p = _noMatches.begin(); |
---|
583 | p != _noMatches.end(); |
---|
584 | ++p) { |
---|
585 | size = sprintf(buffer, "%i %i\n", p->first, p->second); |
---|
586 | file->put(buffer, size); |
---|
587 | } |
---|
588 | size = sprintf(buffer, "\n\n"); |
---|
589 | file->put(buffer, size); |
---|
590 | sprintf(buffer, "Bitmap after %li. candidate", _iteration); |
---|
591 | char *title = strdup(buffer); |
---|
592 | __MAP->printGNUplot(title, buffer, file); |
---|
593 | // open gnuplot command file |
---|
594 | sprintf(buffer, "%s.%06li.gnuplot", _out_prefix, _iteration); |
---|
595 | file->reinit(buffer, PS_FileBuffer::WRITEONLY); |
---|
596 | // output gnuplot commands |
---|
597 | size = sprintf(buffer, "set terminal png color\nset output '%06li.bitmap.png\n", _iteration); |
---|
598 | file->put(buffer, size); |
---|
599 | size = sprintf(buffer, "set yrange [%i:%i] reverse\nset xrange [%i:%i]\n", __MIN_ID, __MAX_ID, __MIN_ID, __MAX_ID); |
---|
600 | file->put(buffer, size); |
---|
601 | size = sprintf(buffer, "set size square\nset title '%li. iteration'\nset pointsize 0.5\n", _iteration); |
---|
602 | file->put(buffer, size); |
---|
603 | size = sprintf(buffer, "plot '%s' index 1 title 'match ()' with dots 2,'%s' index 0 title 'no match (30)' with points 1\n", data_name, data_name); |
---|
604 | file->put(buffer, size); |
---|
605 | size = sprintf(buffer, "set terminal png color\nset output '%06li.counters.png\n", _iteration); |
---|
606 | file->put(buffer, size); |
---|
607 | size = sprintf(buffer, "set yrange [] noreverse\nset size nosquare\n"); |
---|
608 | file->put(buffer, size); |
---|
609 | size = sprintf(buffer, "plot '%s' index 2 title 'count/species' with impulses 2,'%s' index 3 title 'species/count' with points 1", data_name, data_name); |
---|
610 | file->put(buffer, size); |
---|
611 | |
---|
612 | delete file; |
---|
613 | free(title); |
---|
614 | free(data_name); |
---|
615 | free(buffer); |
---|
616 | } |
---|
617 | |
---|
618 | static PS_CandidatePtr PS_ascend(PS_CandidatePtr _last_candidate) { |
---|
619 | // 'remove' _last_candidate from __MAP by applying its parent's |
---|
620 | // paths to a fresh __MAP |
---|
621 | |
---|
622 | // |
---|
623 | // make fresh __MAP |
---|
624 | // |
---|
625 | __MAP->copy(__PRESET_MAP); |
---|
626 | // |
---|
627 | // apply paths of parent-candidates to __MAP |
---|
628 | // |
---|
629 | PS_CandidatePtr parent = _last_candidate->parent; |
---|
630 | while (parent && !parent->path.empty()) { |
---|
631 | PS_apply_path_to_bitmap(parent->path, true); |
---|
632 | parent = parent->parent; |
---|
633 | } |
---|
634 | printf("ASCEND "); |
---|
635 | PS_filling_level(); |
---|
636 | // |
---|
637 | // return the parent of the given candidate |
---|
638 | // |
---|
639 | return _last_candidate->parent; |
---|
640 | } |
---|
641 | |
---|
642 | |
---|
643 | static void PS_descend(PS_CandidatePtr _candidate_parent, const PS_NodePtr& _root_node, unsigned long _depth, const float _filling_level) { |
---|
644 | struct tms before; |
---|
645 | times(&before); |
---|
646 | |
---|
647 | printf("\nDESCEND ==================== depth (%lu / %lu) candidates (%lu / %lu -> %lu left) ====================\n", |
---|
648 | _depth, |
---|
649 | __MAX_DEPTH, |
---|
650 | __CANDIDATES_FINISHED, |
---|
651 | __CANDIDATES_TODO, |
---|
652 | __CANDIDATES_TODO-__CANDIDATES_FINISHED); fflush(stdout); |
---|
653 | printf("DESCEND PATH : (%s)\n\n", __PATH_IN_CANDIDATES); |
---|
654 | |
---|
655 | // |
---|
656 | // calc source- and target-set of IDs |
---|
657 | // |
---|
658 | PS_calc_next_speciesid_sets(); |
---|
659 | printf("\nDESCEND ---------- calculated next speciesid-sets ----------\n\n"); fflush(stdout); |
---|
660 | |
---|
661 | // |
---|
662 | // search for candidates to improve filling level of __MAP using the source- and target-set of IDs |
---|
663 | // max. 3 rounds |
---|
664 | // |
---|
665 | int round = 0; |
---|
666 | for (; round<3 && _candidate_parent->children.empty(); ++round) { |
---|
667 | PS_find_probes(_root_node, round, _candidate_parent, _filling_level); |
---|
668 | } |
---|
669 | printf("\nDESCEND ---------- searched probe for speciesid-sets [rounds : %i] candidates (%lu / %lu -> %lu left) ----------\n\n", |
---|
670 | round, |
---|
671 | __CANDIDATES_FINISHED, |
---|
672 | __CANDIDATES_TODO, |
---|
673 | __CANDIDATES_TODO-__CANDIDATES_FINISHED); fflush(stdout); |
---|
674 | |
---|
675 | // |
---|
676 | // descend down each (of the max 3) candidate(s) |
---|
677 | // |
---|
678 | char count = '0'+_candidate_parent->children.size(); |
---|
679 | for (PS_CandidateByGainMapRIter next_candidate_it = _candidate_parent->children.rbegin(); |
---|
680 | next_candidate_it != _candidate_parent->children.rend(); |
---|
681 | ++next_candidate_it, --count) { |
---|
682 | PS_CandidatePtr next_candidate = &(*next_candidate_it->second); |
---|
683 | ++__CANDIDATES_FINISHED; |
---|
684 | // |
---|
685 | // apply candidate to __MAP |
---|
686 | // |
---|
687 | printf("[%p] ", next_candidate); |
---|
688 | PS_apply_path_to_bitmap(next_candidate->path); |
---|
689 | printf("[%p] ", next_candidate); |
---|
690 | PS_filling_level(next_candidate); |
---|
691 | next_candidate->depth = _depth+1; |
---|
692 | next_candidate->print(0, false, false); |
---|
693 | // |
---|
694 | // step down |
---|
695 | // |
---|
696 | if ((next_candidate->filling_level < 75.0) && (_depth+1 < __MAX_DEPTH)) { |
---|
697 | __PATH_IN_CANDIDATES[_depth] = count; |
---|
698 | __PATH_IN_CANDIDATES[_depth+1] = '\x00'; |
---|
699 | PS_descend(next_candidate, _root_node, _depth+1, next_candidate->filling_level); |
---|
700 | } |
---|
701 | // |
---|
702 | // remove candidate from __MAP |
---|
703 | // |
---|
704 | PS_ascend(next_candidate); |
---|
705 | } |
---|
706 | |
---|
707 | printf("\nDESCEND ~~~~~~~~~~~~~~~~~~~~ depth (%li) max depth (%li) candidates (%lu / %lu -> %lu left) ~~~~~~~~~~~~~~~~~~~~ ", |
---|
708 | _depth, |
---|
709 | __MAX_DEPTH, |
---|
710 | __CANDIDATES_FINISHED, |
---|
711 | __CANDIDATES_TODO, |
---|
712 | __CANDIDATES_TODO-__CANDIDATES_FINISHED); |
---|
713 | PS_print_time_diff(&before); |
---|
714 | printf("\n"); fflush(stdout); |
---|
715 | } |
---|
716 | |
---|
717 | static void PS_make_map_for_candidate(PS_CandidatePtr _candidate) { |
---|
718 | // make __MAP for _candidate |
---|
719 | |
---|
720 | if (_candidate->map) return; // if candidate already has its map return |
---|
721 | _candidate->map = new PS_BitMap_Counted(false, __MAX_ID+1); |
---|
722 | _candidate->map->copy(__PRESET_MAP); |
---|
723 | PS_apply_path_to_bitmap(_candidate->path, true, _candidate->map); // apply _candidate's path |
---|
724 | PS_CandidatePtr parent = _candidate->parent; |
---|
725 | while (parent && !parent->path.empty()) { // apply parent's paths |
---|
726 | PS_apply_path_to_bitmap(parent->path, true, _candidate->map); |
---|
727 | parent = parent->parent; |
---|
728 | } |
---|
729 | } |
---|
730 | |
---|
731 | |
---|
732 | void PS_get_leaf_candidates(PS_CandidatePtr _candidate_parent, |
---|
733 | PS_CandidateSet &_leaf_candidates, |
---|
734 | const bool _ignore_passes_left = false) { |
---|
735 | |
---|
736 | for (PS_CandidateByGainMapIter next_candidate = _candidate_parent->children.begin(); |
---|
737 | next_candidate != _candidate_parent->children.end(); |
---|
738 | ++next_candidate) { |
---|
739 | |
---|
740 | if ((next_candidate->second->children.size() == 0) && |
---|
741 | ((next_candidate->second->passes_left > 0) || _ignore_passes_left)) { |
---|
742 | _leaf_candidates.insert(next_candidate->second); |
---|
743 | } |
---|
744 | PS_get_leaf_candidates(&(*next_candidate->second), _leaf_candidates, _ignore_passes_left); |
---|
745 | } |
---|
746 | } |
---|
747 | |
---|
748 | void PS_get_next_candidates_descend(PS_NodePtr _ps_node, PS_CandidateSet &_leaf_candidates) { |
---|
749 | // scan PS_node-tree for next candidates for each of _leaf_candidates |
---|
750 | |
---|
751 | SpeciesID id = _ps_node->getNum(); |
---|
752 | bool has_probes = _ps_node->hasProbes(); |
---|
753 | |
---|
754 | // |
---|
755 | // append ID to path |
---|
756 | // |
---|
757 | __PATH.insert(id); |
---|
758 | |
---|
759 | // |
---|
760 | // don't look at path until ID is greater than lowest ID in the sets of IDs |
---|
761 | // also don't use a node if its already used as candidate |
---|
762 | // |
---|
763 | if ((id >= __MIN_SETS_ID) && has_probes) { |
---|
764 | unsigned long total_gain_of_node = 0; |
---|
765 | unsigned long total_tests_per_node = 0; |
---|
766 | ++__PROBES_COUNTER; |
---|
767 | // iterate over _leaf_candidates |
---|
768 | for (PS_CandidateSetIter candidate_iter = _leaf_candidates.begin(); |
---|
769 | candidate_iter != _leaf_candidates.end(); |
---|
770 | ++candidate_iter) { |
---|
771 | PS_CandidateSPtr candidate = *candidate_iter; |
---|
772 | |
---|
773 | // next leaf-candidate if the probe was already used for current candidate |
---|
774 | if (candidate->alreadyUsedNode(_ps_node)) continue; |
---|
775 | |
---|
776 | ++__CANDIDATES_COUNTER; // possible candidates |
---|
777 | if (__VERBOSE) printf("%8lup %8lur %8luc %8lug %8luu\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b", |
---|
778 | __PROBES_COUNTER, |
---|
779 | __PROBES_REMOVED, |
---|
780 | __CANDIDATES_COUNTER, |
---|
781 | __CANDIDATES_TODO, |
---|
782 | __CANDIDATES_FINISHED); fflush(stdout); |
---|
783 | |
---|
784 | // next leaf-candidate if __PATH doesn't fulfill matching criteria |
---|
785 | unsigned long matches = candidate->matchPathOnOneFalseIDs(__PATH); |
---|
786 | if (matches < candidate->one_false_IDs_matches) continue; |
---|
787 | |
---|
788 | ++__CANDIDATES_TODO; // good candidates |
---|
789 | if (__VERBOSE) printf("%8lup %8lur %8luc %8lug %8luu\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b", |
---|
790 | __PROBES_COUNTER, |
---|
791 | __PROBES_REMOVED, |
---|
792 | __CANDIDATES_COUNTER, |
---|
793 | __CANDIDATES_TODO, |
---|
794 | __CANDIDATES_FINISHED); fflush(stdout); |
---|
795 | |
---|
796 | // test node on candidate's bitmap |
---|
797 | float filling_level; |
---|
798 | ++total_tests_per_node; |
---|
799 | unsigned long gain = PS_test_candidate_on_bitmap(&filling_level, candidate->map); |
---|
800 | total_gain_of_node += gain; |
---|
801 | if (candidate->updateBestChild(gain, matches, filling_level, _ps_node, __PATH)) { |
---|
802 | ++__CANDIDATES_FINISHED; // used candiates |
---|
803 | if (__VERBOSE) printf("%8lup %8lur %8luc %8lug %8luu\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b", |
---|
804 | __PROBES_COUNTER, |
---|
805 | __PROBES_REMOVED, |
---|
806 | __CANDIDATES_COUNTER, |
---|
807 | __CANDIDATES_TODO, |
---|
808 | __CANDIDATES_FINISHED); fflush(stdout); |
---|
809 | } |
---|
810 | } |
---|
811 | if ((total_tests_per_node == _leaf_candidates.size()) && |
---|
812 | (total_gain_of_node == 0)) { |
---|
813 | ++__PROBES_REMOVED; |
---|
814 | if (__VERBOSE) printf("%8lup %8lur %8luc %8lug %8luu\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b", |
---|
815 | __PROBES_COUNTER, |
---|
816 | __PROBES_REMOVED, |
---|
817 | __CANDIDATES_COUNTER, |
---|
818 | __CANDIDATES_TODO, |
---|
819 | __CANDIDATES_FINISHED); fflush(stdout); |
---|
820 | _ps_node->removeProbes(); |
---|
821 | } |
---|
822 | } |
---|
823 | |
---|
824 | // |
---|
825 | // step down the children if either ID is lower than highest |
---|
826 | // ID in the set of ID-pairs or the node has no probes |
---|
827 | // |
---|
828 | if ((id < __MAX_SETS_ID) || (! has_probes)) { |
---|
829 | for (PS_NodeMapConstIterator i = _ps_node->getChildrenBegin(); |
---|
830 | i != _ps_node->getChildrenEnd(); |
---|
831 | ++i) { |
---|
832 | PS_get_next_candidates_descend(i->second, _leaf_candidates); |
---|
833 | } |
---|
834 | } |
---|
835 | |
---|
836 | // |
---|
837 | // remove ID from path |
---|
838 | // |
---|
839 | __PATH.erase(id); |
---|
840 | } |
---|
841 | |
---|
842 | |
---|
843 | void PS_get_next_candidates(const PS_NodePtr& _root_node, PS_CandidateSet& _leaf_candidates) { |
---|
844 | struct tms before; |
---|
845 | times(&before); |
---|
846 | // first calc source/target/false_IDs sets |
---|
847 | printf("PS_get_next_candidates() : initializing %zu candidates..\n", _leaf_candidates.size()); fflush(stdout); |
---|
848 | __PROBES_COUNTER = 0; |
---|
849 | __PROBES_REMOVED = 0; |
---|
850 | __CANDIDATES_COUNTER = 0; |
---|
851 | __CANDIDATES_TODO = 0; |
---|
852 | __CANDIDATES_FINISHED = 0; |
---|
853 | __MIN_SETS_ID = __MAX_ID; |
---|
854 | __MAX_SETS_ID = __MIN_ID; |
---|
855 | for (PS_CandidateSetIter iter = _leaf_candidates.begin(); |
---|
856 | iter != _leaf_candidates.end(); |
---|
857 | ++iter) { |
---|
858 | PS_CandidateSPtr candidate = *iter; |
---|
859 | if (!candidate->map) { // if candidate has no map yet |
---|
860 | candidate->getParentMap(); // try to get its parent's map |
---|
861 | if (candidate->map) { // if parent had a map (if not a new map is created in PS_calc_next_speciesid_sets_for_candidate |
---|
862 | PS_apply_path_to_bitmap(candidate->path, true, candidate->map); // apply candidate's path |
---|
863 | } |
---|
864 | } |
---|
865 | PS_make_map_for_candidate(&(*candidate)); |
---|
866 | candidate->initFalseIDs(__MIN_ID, __MAX_ID, __MIN_SETS_ID, __MAX_SETS_ID); |
---|
867 | } |
---|
868 | // scan tree |
---|
869 | printf("PS_get_next_candidates() : "); fflush(stdout); |
---|
870 | for (PS_NodeMapConstIterator node_iter = _root_node->getChildrenBegin(); |
---|
871 | (node_iter != _root_node->getChildrenEnd()) && (node_iter->first < __MAX_SETS_ID); |
---|
872 | ++node_iter) { |
---|
873 | __PATH.clear(); |
---|
874 | PS_get_next_candidates_descend(node_iter->second, _leaf_candidates); |
---|
875 | } |
---|
876 | printf("looked at probes (%lu) -> removed probes (%lu) possible candidates (%lu) -> good candidates (%lu) -> used candidates (%lu)\nPS_get_next_candidates() : ", |
---|
877 | __PROBES_COUNTER, |
---|
878 | __PROBES_REMOVED, |
---|
879 | __CANDIDATES_COUNTER, |
---|
880 | __CANDIDATES_TODO, |
---|
881 | __CANDIDATES_FINISHED); fflush(stdout); |
---|
882 | PS_print_time_diff(&before); |
---|
883 | |
---|
884 | // 'descend' candidates |
---|
885 | for (PS_CandidateSetIter iter = _leaf_candidates.begin(); |
---|
886 | iter != _leaf_candidates.end(); |
---|
887 | ++iter) { |
---|
888 | PS_CandidateSPtr candidate = *iter; |
---|
889 | candidate->print(); |
---|
890 | candidate->decreasePasses(); |
---|
891 | } |
---|
892 | |
---|
893 | _leaf_candidates.clear(); |
---|
894 | PS_get_leaf_candidates(__CANDIDATES_ROOT, _leaf_candidates); |
---|
895 | } |
---|
896 | |
---|
897 | // ==================================================== |
---|
898 | // ==================================================== |
---|
899 | int main(int argc, |
---|
900 | char *argv[]) { |
---|
901 | // |
---|
902 | // check arguments |
---|
903 | // |
---|
904 | if (argc < 5) { |
---|
905 | printf("Missing argument\n Usage %s <database name> <result filename> <[-]candidates filename> <final candidates filename> [--verbose] [result filename prefix for output files]\n", argv[0]); |
---|
906 | exit(1); |
---|
907 | } |
---|
908 | |
---|
909 | const char *input_DB_name = argv[1]; |
---|
910 | const char *result_in_filename = argv[2]; |
---|
911 | const char *candidates_filename = argv[3]; |
---|
912 | const char *final_candidates_filename = argv[4]; |
---|
913 | const char *result_out_prefix = (argc > 5) ? argv[5] : 0; |
---|
914 | if (argc > 5) { |
---|
915 | if (strcmp(result_out_prefix, "--verbose") == 0) { |
---|
916 | __VERBOSE = true; |
---|
917 | result_out_prefix = (argc > 6) ? argv[6] : 0; |
---|
918 | } |
---|
919 | } |
---|
920 | // |
---|
921 | // open probe-set-database |
---|
922 | // |
---|
923 | printf("Opening probe-set-database '%s'..\n", input_DB_name); |
---|
924 | PS_Database *db = new PS_Database(input_DB_name, PS_Database::READONLY); |
---|
925 | db->load(); |
---|
926 | __MAX_ID = db->getMaxID(); |
---|
927 | __MIN_ID = db->getMinID(); |
---|
928 | __SPECIES_COUNT = db->getSpeciesCount(); |
---|
929 | printf("min ID (%i) max ID (%i) count of species (%i)\n", __MIN_ID, __MAX_ID, __SPECIES_COUNT); |
---|
930 | printf("..loaded database (enter to continue)\n"); fflush(stdout); |
---|
931 | |
---|
932 | // |
---|
933 | // open result file for preset bitmap |
---|
934 | // |
---|
935 | printf("Opening result file %s..\n", result_in_filename); |
---|
936 | PS_FileBuffer *result_file = new PS_FileBuffer(result_in_filename, PS_FileBuffer::READONLY); |
---|
937 | long size; |
---|
938 | ID2IDSet noMatches; |
---|
939 | SpeciesID id1, id2; |
---|
940 | printf("reading no matches : "); |
---|
941 | result_file->get_long(size); |
---|
942 | printf("(%li)\n", size); fflush(stdout); |
---|
943 | for (long i=0; |
---|
944 | i < size; |
---|
945 | ++i) { |
---|
946 | result_file->get_int(id1); |
---|
947 | result_file->get_int(id2); |
---|
948 | printf(" %i,%i", id1, id2); |
---|
949 | if (id1 < id2) { |
---|
950 | noMatches.insert(ID2IDPair(id1, id2)); |
---|
951 | } |
---|
952 | else { |
---|
953 | noMatches.insert(ID2IDPair(id2, id1)); |
---|
954 | } |
---|
955 | } |
---|
956 | printf("\nreading one matches : "); |
---|
957 | result_file->get_long(size); |
---|
958 | printf("(%li)\n", size); fflush(stdout); |
---|
959 | long path_length; |
---|
960 | SpeciesID path_id; |
---|
961 | IDSet path; |
---|
962 | // IDID2IDSetMap oneMatchesMap; |
---|
963 | for (long i=0; |
---|
964 | i < size; |
---|
965 | ++i) { |
---|
966 | path.clear(); |
---|
967 | result_file->get_int(id1); |
---|
968 | result_file->get_int(id2); |
---|
969 | result_file->get_long(path_length); |
---|
970 | printf("%i,%i path(%li)", id1, id2, path_length); |
---|
971 | while (path_length-- > 0) { |
---|
972 | result_file->get_int(path_id); |
---|
973 | printf(" %i", path_id); |
---|
974 | path.insert(path_id); |
---|
975 | } |
---|
976 | printf("\n"); |
---|
977 | // oneMatchesMap[ID2IDPair(id1, id2)] = path; |
---|
978 | } |
---|
979 | printf("loading preset bitmap..\n"); fflush(stdout); |
---|
980 | __BITS_IN_MAP = ((__MAX_ID+1)*__MAX_ID)/2 + __MAX_ID+1; |
---|
981 | __MAP = new PS_BitMap_Counted(result_file); |
---|
982 | __PRESET_MAP = new PS_BitMap_Counted(false, __MAX_ID+1); |
---|
983 | for (id1 = 0; id1 < __MIN_ID; ++id1) { |
---|
984 | for (id2 = 0; id2 <= __MAX_ID; ++id2) { |
---|
985 | if (id1 > id2) { |
---|
986 | __MAP->setTrue(id1, id2); |
---|
987 | } |
---|
988 | else { |
---|
989 | __MAP->setTrue(id2, id1); |
---|
990 | } |
---|
991 | } |
---|
992 | } |
---|
993 | for (SpeciesID id = 0; id <= __MAX_ID; ++id) { |
---|
994 | __MAP->setTrue(id, id); |
---|
995 | } |
---|
996 | __MAP->recalcCounters(); |
---|
997 | __PRESET_MAP->copy(__MAP); |
---|
998 | printf("..loaded result file (enter to continue)\n"); |
---|
999 | printf("cleaning up... result file\n"); fflush(stdout); |
---|
1000 | delete result_file; |
---|
1001 | |
---|
1002 | // |
---|
1003 | // create or read candidates |
---|
1004 | // |
---|
1005 | __CANDIDATES_ROOT = new PS_Candidate(0.0); |
---|
1006 | PS_FileBuffer *candidates_file; |
---|
1007 | struct tms before; |
---|
1008 | if (candidates_filename[0] != '-') { |
---|
1009 | printf("searching candidates..\n"); |
---|
1010 | // |
---|
1011 | // recursively build a tree of candidates |
---|
1012 | // |
---|
1013 | __MAX_DEPTH = 0; |
---|
1014 | for (long species_count = __SPECIES_COUNT; species_count > 0; species_count >>= 1) { |
---|
1015 | ++__MAX_DEPTH; |
---|
1016 | } |
---|
1017 | __MAX_DEPTH = (__MAX_DEPTH * 3) >> 1; |
---|
1018 | __PATH_IN_CANDIDATES = (char *)calloc(__MAX_DEPTH+1, sizeof(char)); |
---|
1019 | __CANDIDATES_TODO = 0; |
---|
1020 | __CANDIDATES_FINISHED = 0; |
---|
1021 | times(&before); |
---|
1022 | PS_descend(__CANDIDATES_ROOT, db->getConstRootNode(), 0, 0.0); |
---|
1023 | printf("PS_descend : total "); |
---|
1024 | PS_print_time_diff(&before); |
---|
1025 | |
---|
1026 | // |
---|
1027 | // save candidates |
---|
1028 | // |
---|
1029 | printf("saving candidates to %s..\n", candidates_filename); |
---|
1030 | candidates_file = new PS_FileBuffer(candidates_filename, PS_FileBuffer::WRITEONLY); |
---|
1031 | __CANDIDATES_ROOT->false_IDs = __BITS_IN_MAP; |
---|
1032 | __CANDIDATES_ROOT->save(candidates_file, __BITS_IN_MAP); |
---|
1033 | } |
---|
1034 | else { |
---|
1035 | printf("loading candidates..\n"); |
---|
1036 | candidates_file = new PS_FileBuffer(candidates_filename+1, PS_FileBuffer::READONLY); |
---|
1037 | __CANDIDATES_ROOT->load(candidates_file, __BITS_IN_MAP, db->getConstRootNode()); |
---|
1038 | printf("..loaded candidates file (enter to continue)\n"); |
---|
1039 | } |
---|
1040 | printf("cleaning up... candidates file\n"); fflush(stdout); |
---|
1041 | delete candidates_file; |
---|
1042 | |
---|
1043 | // |
---|
1044 | // scan candidates-tree for leaf candidates |
---|
1045 | // |
---|
1046 | printf("CANDIDATES :\n"); |
---|
1047 | __CANDIDATES_ROOT->print(); |
---|
1048 | printf("\nsearching leaf candidates.. "); |
---|
1049 | PS_CandidateSet leaf_candidates; |
---|
1050 | PS_get_leaf_candidates(__CANDIDATES_ROOT, leaf_candidates); |
---|
1051 | printf("%zu\n", leaf_candidates.size()); |
---|
1052 | |
---|
1053 | // |
---|
1054 | // scan tree for next candidates (below the ones in leaf_candidates) |
---|
1055 | // |
---|
1056 | times(&before); |
---|
1057 | long round = 0; |
---|
1058 | while (!leaf_candidates.empty() && round<200) { |
---|
1059 | printf("\nsearching next candidates [round #%li]..\n", ++round); |
---|
1060 | PS_get_next_candidates(db->getConstRootNode(), leaf_candidates); |
---|
1061 | |
---|
1062 | printf("rounds %li total ", round); |
---|
1063 | PS_print_time_diff(&before); |
---|
1064 | } |
---|
1065 | |
---|
1066 | printf("\nFINAL CANDIDATES:\n"); |
---|
1067 | __CANDIDATES_ROOT->print(0, true); |
---|
1068 | printf("\nfinal leaf candidates.. (%zu)\n", leaf_candidates.size()); |
---|
1069 | PS_get_leaf_candidates(__CANDIDATES_ROOT, leaf_candidates, true); |
---|
1070 | for (PS_CandidateSetIter c = leaf_candidates.begin(); |
---|
1071 | c != leaf_candidates.end(); |
---|
1072 | ++c) { |
---|
1073 | (*(*c)).print(); |
---|
1074 | } |
---|
1075 | |
---|
1076 | // |
---|
1077 | // save final candidates |
---|
1078 | // |
---|
1079 | printf("saving final candidates to %s..\n", final_candidates_filename); |
---|
1080 | candidates_file = new PS_FileBuffer(final_candidates_filename, PS_FileBuffer::WRITEONLY); |
---|
1081 | __CANDIDATES_ROOT->save(candidates_file, __BITS_IN_MAP); |
---|
1082 | printf("cleaning up... candidates file\n"); fflush(stdout); |
---|
1083 | delete candidates_file; |
---|
1084 | |
---|
1085 | // |
---|
1086 | // starting with the best candidate print the __MAP for each |
---|
1087 | // depth walking up to root of candidates-tree |
---|
1088 | // |
---|
1089 | if (result_out_prefix) { |
---|
1090 | // put __MAP in the state 'after applying best_candidate and its parents' |
---|
1091 | PS_CandidateSPtr best_candidate_smart = *leaf_candidates.begin(); |
---|
1092 | PS_CandidatePtr best_candidate = &(*best_candidate_smart); |
---|
1093 | PS_ascend(best_candidate); |
---|
1094 | PS_apply_path_to_bitmap(best_candidate->path); |
---|
1095 | while (best_candidate) { |
---|
1096 | PS_GNUPlot(result_out_prefix, __MAX_DEPTH--, best_candidate->path, noMatches); |
---|
1097 | best_candidate = PS_ascend(best_candidate); |
---|
1098 | } |
---|
1099 | } |
---|
1100 | |
---|
1101 | // |
---|
1102 | // clean up |
---|
1103 | // |
---|
1104 | printf("cleaning up... candidates\n"); fflush(stdout); |
---|
1105 | delete __CANDIDATES_ROOT; |
---|
1106 | free(__PATH_IN_CANDIDATES); |
---|
1107 | printf("cleaning up... database\n"); fflush(stdout); |
---|
1108 | delete db; |
---|
1109 | printf("cleaning up... bitmaps\n"); fflush(stdout); |
---|
1110 | delete __PRESET_MAP; |
---|
1111 | delete __MAP; |
---|
1112 | |
---|
1113 | return 0; |
---|
1114 | } |
---|