| 1 | // =============================================================== // |
|---|
| 2 | // // |
|---|
| 3 | // File : ps_find_probes.cxx // |
|---|
| 4 | // Purpose : // |
|---|
| 5 | // // |
|---|
| 6 | // Coded by Wolfram Foerster in October 2002 // |
|---|
| 7 | // Institute of Microbiology (Technical University Munich) // |
|---|
| 8 | // http://www.arb-home.de/ // |
|---|
| 9 | // // |
|---|
| 10 | // =============================================================== // |
|---|
| 11 | |
|---|
| 12 | #include "ps_tools.hxx" |
|---|
| 13 | #include "ps_database.hxx" |
|---|
| 14 | #include "ps_candidate.hxx" |
|---|
| 15 | |
|---|
| 16 | #include <cmath> |
|---|
| 17 | #include <iostream> |
|---|
| 18 | #include <sys/times.h> |
|---|
| 19 | |
|---|
| 20 | using namespace std; |
|---|
| 21 | |
|---|
| 22 | template<class T> |
|---|
| 23 | void PS_print_set_ranges(const char *_set_name, const set<T> &_set, const bool _cr_at_end = true) { |
|---|
| 24 | fflush(stdout); |
|---|
| 25 | printf("%s size (%3zu) : ", _set_name, _set.size()); |
|---|
| 26 | if (_set.empty()) { |
|---|
| 27 | printf("(empty)"); |
|---|
| 28 | } |
|---|
| 29 | else { |
|---|
| 30 | typename set<T>::const_iterator range_begin = _set.begin(); |
|---|
| 31 | typename set<T>::const_iterator range_end = range_begin; |
|---|
| 32 | typename set<T>::const_iterator it = _set.begin(); |
|---|
| 33 | for (++it; |
|---|
| 34 | it != _set.end(); |
|---|
| 35 | ++it) { |
|---|
| 36 | if (*it == (*range_end)+1) { |
|---|
| 37 | range_end = it; |
|---|
| 38 | } |
|---|
| 39 | else { |
|---|
| 40 | if (range_end == range_begin) { |
|---|
| 41 | cout << *range_begin << " "; |
|---|
| 42 | } |
|---|
| 43 | else { |
|---|
| 44 | cout << *range_begin << "-" << *range_end << " "; |
|---|
| 45 | } |
|---|
| 46 | range_begin = it; |
|---|
| 47 | range_end = it; |
|---|
| 48 | } |
|---|
| 49 | } |
|---|
| 50 | cout << *range_begin; |
|---|
| 51 | if (range_end != range_begin) cout << "-" << *range_end; |
|---|
| 52 | } |
|---|
| 53 | if (_cr_at_end) cout << "\n"; |
|---|
| 54 | fflush(stdout); |
|---|
| 55 | } |
|---|
| 56 | |
|---|
| 57 | |
|---|
| 58 | template<class T1, class T2> |
|---|
| 59 | void PS_print_map_ranges(const char *_map_name, const map<T1, T2> &_map, const bool _compare_keys = true, const bool _cr_at_end = true) { |
|---|
| 60 | fflush(stdout); |
|---|
| 61 | if (_map.empty()) { |
|---|
| 62 | printf("%s size ( 0) : (empty)", _map_name); |
|---|
| 63 | } |
|---|
| 64 | else { |
|---|
| 65 | if (_compare_keys) { |
|---|
| 66 | printf("%s size (%3i) : ", _map_name, _map.size()); |
|---|
| 67 | typename map<T1, T2>::const_iterator range_begin = _map.begin(); |
|---|
| 68 | typename map<T1, T2>::const_iterator range_end = range_begin; |
|---|
| 69 | typename map<T1, T2>::const_iterator it = _map.begin(); |
|---|
| 70 | for (++it; |
|---|
| 71 | it != _map.end(); |
|---|
| 72 | ++it) { |
|---|
| 73 | if (it->first == range_end->first+1) { |
|---|
| 74 | range_end = it; |
|---|
| 75 | } |
|---|
| 76 | else { |
|---|
| 77 | if (range_end == range_begin) { |
|---|
| 78 | cout << "(" << range_begin->first << "," << range_begin->second << ") "; |
|---|
| 79 | } |
|---|
| 80 | else { |
|---|
| 81 | cout << "(" << range_begin->first << "-" << range_end->first << "," << range_begin->second << ") "; |
|---|
| 82 | } |
|---|
| 83 | range_begin = it; |
|---|
| 84 | range_end = it; |
|---|
| 85 | } |
|---|
| 86 | } |
|---|
| 87 | if (range_end == range_begin) { |
|---|
| 88 | cout << "(" << range_begin->first << "," << range_begin->second << ") "; |
|---|
| 89 | } |
|---|
| 90 | else { |
|---|
| 91 | cout << "(" << range_begin->first << "-" << range_end->first << "," << range_begin->second << ") "; |
|---|
| 92 | } |
|---|
| 93 | } |
|---|
| 94 | else { |
|---|
| 95 | map<T2, set<pair<T1, T1> > > value2indices; |
|---|
| 96 | map<T2, unsigned long int> value2count; |
|---|
| 97 | pair<T1, T1> range; |
|---|
| 98 | range.first = _map.begin()->first; |
|---|
| 99 | range.second = range.first; |
|---|
| 100 | T2 cur_value = _map.begin()->second; |
|---|
| 101 | typename map<T1, T2>::const_iterator it = _map.begin(); |
|---|
| 102 | for (++it; |
|---|
| 103 | it != _map.end(); |
|---|
| 104 | ++it) { |
|---|
| 105 | if (it->second == cur_value) { |
|---|
| 106 | if (it->first == range.second+1) { |
|---|
| 107 | range.second = it->first; |
|---|
| 108 | } |
|---|
| 109 | else { |
|---|
| 110 | value2indices[cur_value].insert(range); |
|---|
| 111 | value2count[cur_value] += range.second - range.first + 1; |
|---|
| 112 | range.first = it->first; |
|---|
| 113 | range.second = it->first; |
|---|
| 114 | } |
|---|
| 115 | } |
|---|
| 116 | else { |
|---|
| 117 | value2indices[cur_value].insert(range); |
|---|
| 118 | value2count[cur_value] += range.second - range.first + 1; |
|---|
| 119 | range.first = it->first; |
|---|
| 120 | range.second = it->first; |
|---|
| 121 | cur_value = it->second; |
|---|
| 122 | } |
|---|
| 123 | } |
|---|
| 124 | value2indices[cur_value].insert(range); |
|---|
| 125 | value2count[cur_value] += range.second - range.first + 1; |
|---|
| 126 | for (typename map<T2, set<pair<T1, T1> > >::const_iterator value = value2indices.begin(); |
|---|
| 127 | value != value2indices.end(); |
|---|
| 128 | ++value) { |
|---|
| 129 | if (value != value2indices.begin()) cout << "\n"; |
|---|
| 130 | printf("%s size (%3i) : value (", _map_name, _map.size()); |
|---|
| 131 | cout << value->first << ") count (" << value2count[value->first] << ") ["; |
|---|
| 132 | for (typename set<pair<T1, T1> >::const_iterator indices = value->second.begin(); |
|---|
| 133 | indices != value->second.end(); |
|---|
| 134 | ++indices) { |
|---|
| 135 | if (indices != value->second.begin()) cout << " "; |
|---|
| 136 | cout << indices->first; |
|---|
| 137 | if (indices->first != indices->second) cout << "-" << indices->second; |
|---|
| 138 | } |
|---|
| 139 | cout << "]," << value->first << ")"; |
|---|
| 140 | } |
|---|
| 141 | } |
|---|
| 142 | } |
|---|
| 143 | if (_cr_at_end) cout << "\n"; |
|---|
| 144 | fflush(stdout); |
|---|
| 145 | } |
|---|
| 146 | |
|---|
| 147 | // |
|---|
| 148 | // common globals |
|---|
| 149 | // |
|---|
| 150 | |
|---|
| 151 | static bool __VERBOSE = false; |
|---|
| 152 | static SpeciesID __MAX_ID; |
|---|
| 153 | static SpeciesID __MIN_ID; |
|---|
| 154 | static SpeciesID __SPECIES_COUNT; |
|---|
| 155 | static unsigned long __BITS_IN_MAP; |
|---|
| 156 | static PS_BitMap_Counted *__MAP; |
|---|
| 157 | static PS_BitMap_Counted *__PRESET_MAP; |
|---|
| 158 | static IDSet __SOURCE_ID_SET; |
|---|
| 159 | static PS_BitSet::IndexSet __TARGET_ID_SET; |
|---|
| 160 | |
|---|
| 161 | // |
|---|
| 162 | // globals for PS_calc_next_speciesid_sets, |
|---|
| 163 | // PS_calc_next_speciesid_sets_for_candidate |
|---|
| 164 | // |
|---|
| 165 | |
|---|
| 166 | // count of trues for a source_id may be max 5% |
|---|
| 167 | // (of the difference between __SPECIES_COUNT and the lowest |
|---|
| 168 | // count of trues per species) higher than the lowest count |
|---|
| 169 | // of trues for the species in __SOURCE_ID_SET |
|---|
| 170 | #define __THRESHOLD_PERCENTAGE_NEXT_SOURCE_ID_SET 5 |
|---|
| 171 | // target_id must be at least in 95% of total target_id_sets |
|---|
| 172 | // to be in __TARGET_ID_SET |
|---|
| 173 | #define __THRESHOLD_PERCENTAGE_NEXT_TARGET_ID_SET 95 |
|---|
| 174 | |
|---|
| 175 | static SpeciesID __MIN_SETS_ID; |
|---|
| 176 | static SpeciesID __MAX_SETS_ID; |
|---|
| 177 | |
|---|
| 178 | // |
|---|
| 179 | // globals for PS_find_probes |
|---|
| 180 | // |
|---|
| 181 | |
|---|
| 182 | // initially a probe must match 40-60% of species |
|---|
| 183 | // in the ID-sets to be a candidate |
|---|
| 184 | // if PS_find_probes fails to find candidates within this range |
|---|
| 185 | // it is repeated with a broadened range (by 5% each border) |
|---|
| 186 | #define __MIN_PERCENTAGE_SET_MATCH 40 |
|---|
| 187 | #define __MAX_PERCENTAGE_SET_MATCH 60 |
|---|
| 188 | |
|---|
| 189 | static float __SOURCE_MIN_MATCH_COUNT; |
|---|
| 190 | static float __SOURCE_MAX_MATCH_COUNT; |
|---|
| 191 | static float __TARGET_MIN_MATCH_COUNT; |
|---|
| 192 | static float __TARGET_MAX_MATCH_COUNT; |
|---|
| 193 | static float __SOURCE_PERFECT_MATCH_COUNT; |
|---|
| 194 | static float __TARGET_PERFECT_MATCH_COUNT; |
|---|
| 195 | static unsigned long __PROBES_COUNTER; |
|---|
| 196 | static unsigned long __PROBES_REMOVED; |
|---|
| 197 | static PS_CandidatePtr __CANDIDATES_ROOT; |
|---|
| 198 | static unsigned long __CANDIDATES_COUNTER; |
|---|
| 199 | static unsigned long __CANDIDATES_TODO; |
|---|
| 200 | static unsigned long __CANDIDATES_FINISHED; |
|---|
| 201 | static unsigned long __MAX_DEPTH; |
|---|
| 202 | |
|---|
| 203 | // |
|---|
| 204 | // globals for PS_find_probe_for_sets |
|---|
| 205 | // PS_test_candidate_on_bitmap |
|---|
| 206 | // |
|---|
| 207 | |
|---|
| 208 | static IDSet __PATH; |
|---|
| 209 | |
|---|
| 210 | // |
|---|
| 211 | // globals for PS_descend |
|---|
| 212 | // |
|---|
| 213 | static char *__PATH_IN_CANDIDATES; |
|---|
| 214 | |
|---|
| 215 | |
|---|
| 216 | static unsigned long int PS_test_candidate_on_bitmap(float *_filling_level = NULp, PS_BitMap_Counted *_map = NULp) { |
|---|
| 217 | // returns number of locations in __MAP that would be switched |
|---|
| 218 | // from false to true by __PATH |
|---|
| 219 | |
|---|
| 220 | // iterate over all IDs except path |
|---|
| 221 | IDSetCIter path_iter = __PATH.begin(); |
|---|
| 222 | SpeciesID next_path_ID = *path_iter; |
|---|
| 223 | unsigned long int gain = 0; |
|---|
| 224 | if (_map) { |
|---|
| 225 | for (SpeciesID not_in_path_ID = __MIN_ID; |
|---|
| 226 | not_in_path_ID <= __MAX_ID; |
|---|
| 227 | ++not_in_path_ID) { |
|---|
| 228 | if (not_in_path_ID == next_path_ID) { // if i run into a ID in path |
|---|
| 229 | ++path_iter; // advance to next path ID |
|---|
| 230 | next_path_ID = (path_iter == __PATH.end()) ? -1 : *path_iter; |
|---|
| 231 | continue; // skip this ID |
|---|
| 232 | } |
|---|
| 233 | // iterate over path IDs |
|---|
| 234 | for (IDSetCIter path_ID = __PATH.begin(); |
|---|
| 235 | path_ID != __PATH.end(); |
|---|
| 236 | ++path_ID) { |
|---|
| 237 | if (not_in_path_ID == *path_ID) |
|---|
| 238 | continue; // obviously a probe cant differ a species from itself |
|---|
| 239 | if (not_in_path_ID > *path_ID) { |
|---|
| 240 | gain += !_map->get(not_in_path_ID, *path_ID); |
|---|
| 241 | } |
|---|
| 242 | else { |
|---|
| 243 | gain += !_map->get(*path_ID, not_in_path_ID); |
|---|
| 244 | } |
|---|
| 245 | } |
|---|
| 246 | } |
|---|
| 247 | } |
|---|
| 248 | else { |
|---|
| 249 | for (SpeciesID not_in_path_ID = __MIN_ID; |
|---|
| 250 | not_in_path_ID <= __MAX_ID; |
|---|
| 251 | ++not_in_path_ID) { |
|---|
| 252 | if (not_in_path_ID == next_path_ID) { // if i run into a ID in path |
|---|
| 253 | ++path_iter; // advance to next path ID |
|---|
| 254 | next_path_ID = (path_iter == __PATH.end()) ? -1 : *path_iter; |
|---|
| 255 | continue; // skip this ID |
|---|
| 256 | } |
|---|
| 257 | // iterate over path IDs |
|---|
| 258 | for (IDSetCIter path_ID = __PATH.begin(); |
|---|
| 259 | path_ID != __PATH.end(); |
|---|
| 260 | ++path_ID) { |
|---|
| 261 | if (not_in_path_ID == *path_ID) |
|---|
| 262 | continue; // obviously a probe cant differ a species from itself |
|---|
| 263 | if (not_in_path_ID > *path_ID) { |
|---|
| 264 | gain += !__MAP->get(not_in_path_ID, *path_ID); |
|---|
| 265 | } |
|---|
| 266 | else { |
|---|
| 267 | gain += !__MAP->get(*path_ID, not_in_path_ID); |
|---|
| 268 | } |
|---|
| 269 | } |
|---|
| 270 | } |
|---|
| 271 | } |
|---|
| 272 | if (_filling_level) { |
|---|
| 273 | unsigned long int trues = (_map) ? _map->getCountOfTrues() + gain : __MAP->getCountOfTrues() + gain; |
|---|
| 274 | *_filling_level = ((float)trues / __BITS_IN_MAP)*100.0; |
|---|
| 275 | } |
|---|
| 276 | return gain; |
|---|
| 277 | } |
|---|
| 278 | |
|---|
| 279 | |
|---|
| 280 | static bool PS_test_sets_on_path(float &_distance) { |
|---|
| 281 | long count_matched_source = 0; |
|---|
| 282 | long count_matched_target = 0; |
|---|
| 283 | SpeciesID path_id; |
|---|
| 284 | IDSetCIter path_end = __PATH.end(); |
|---|
| 285 | IDSetCIter path = __PATH.begin(); |
|---|
| 286 | IDSetCIter source_end = __SOURCE_ID_SET.end(); |
|---|
| 287 | IDSetCIter source = __SOURCE_ID_SET.begin(); |
|---|
| 288 | PS_BitSet::IndexSet::const_iterator target_end = __TARGET_ID_SET.end(); |
|---|
| 289 | PS_BitSet::IndexSet::const_iterator target = __TARGET_ID_SET.begin(); |
|---|
| 290 | |
|---|
| 291 | while (path != path_end) { |
|---|
| 292 | path_id = *path; |
|---|
| 293 | while ((*target < path_id) && (target != target_end)) { |
|---|
| 294 | ++target; |
|---|
| 295 | } |
|---|
| 296 | if ((target != target_end) && (*target == path_id)) |
|---|
| 297 | ++count_matched_target; |
|---|
| 298 | while ((*source < path_id) && (source != source_end)) { |
|---|
| 299 | ++source; |
|---|
| 300 | } |
|---|
| 301 | if ((source != source_end) && (*source == path_id)) |
|---|
| 302 | ++count_matched_source; |
|---|
| 303 | ++path; |
|---|
| 304 | } |
|---|
| 305 | |
|---|
| 306 | if ((count_matched_source > __SOURCE_MIN_MATCH_COUNT) && |
|---|
| 307 | (count_matched_source < __SOURCE_MAX_MATCH_COUNT) && |
|---|
| 308 | (count_matched_target > __TARGET_MIN_MATCH_COUNT) && |
|---|
| 309 | (count_matched_target < __TARGET_MAX_MATCH_COUNT)) { |
|---|
| 310 | _distance = fabs(__SOURCE_PERFECT_MATCH_COUNT - count_matched_source) + fabs(__TARGET_PERFECT_MATCH_COUNT - count_matched_target); |
|---|
| 311 | return true; |
|---|
| 312 | } |
|---|
| 313 | return false; |
|---|
| 314 | } |
|---|
| 315 | |
|---|
| 316 | static void PS_find_probe_for_sets(const PS_NodePtr& _ps_node, PS_CandidatePtr _candidate_parent) { |
|---|
| 317 | // scan subtree starting with _ps_node for candidates |
|---|
| 318 | |
|---|
| 319 | SpeciesID id = _ps_node->getNum(); |
|---|
| 320 | bool has_probes = _ps_node->hasProbes(); |
|---|
| 321 | |
|---|
| 322 | // |
|---|
| 323 | // append ID to path |
|---|
| 324 | // |
|---|
| 325 | __PATH.insert(id); |
|---|
| 326 | |
|---|
| 327 | // |
|---|
| 328 | // don't look at path until ID is greater than lowest ID in the sets of IDs |
|---|
| 329 | // also don't use a node if its already used as candidate |
|---|
| 330 | // |
|---|
| 331 | if ((id >= __MIN_SETS_ID) && has_probes && !_candidate_parent->alreadyUsedNode(_ps_node)) { |
|---|
| 332 | ++__PROBES_COUNTER; |
|---|
| 333 | if (__VERBOSE && (__PROBES_COUNTER % 100 == 0)) { |
|---|
| 334 | printf("%8lup %8luc\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b", __PROBES_COUNTER, __CANDIDATES_COUNTER); |
|---|
| 335 | fflush(stdout); |
|---|
| 336 | } |
|---|
| 337 | // make intersections of __PATH with __SOURCE_ID_SET and __TARGET_ID_SET |
|---|
| 338 | float distance_to_perfect_match; |
|---|
| 339 | if (PS_test_sets_on_path(distance_to_perfect_match)) { |
|---|
| 340 | unsigned long gain = PS_test_candidate_on_bitmap(); // no -> calc gain and make new candidate node |
|---|
| 341 | int status = (gain < (unsigned)__SPECIES_COUNT / 100) |
|---|
| 342 | ? 0 |
|---|
| 343 | : _candidate_parent->addChild(static_cast<unsigned long>(distance_to_perfect_match), gain, _ps_node, __PATH); |
|---|
| 344 | if (status > 0) { |
|---|
| 345 | if (status == 2) { |
|---|
| 346 | ++__CANDIDATES_COUNTER; |
|---|
| 347 | if (__VERBOSE) { |
|---|
| 348 | printf("%8lup %8luc\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b", __PROBES_COUNTER, __CANDIDATES_COUNTER); |
|---|
| 349 | fflush(stdout); |
|---|
| 350 | } |
|---|
| 351 | } |
|---|
| 352 | fflush(stdout); |
|---|
| 353 | } |
|---|
| 354 | } |
|---|
| 355 | } |
|---|
| 356 | |
|---|
| 357 | // |
|---|
| 358 | // step down the children if either ID is lower than highest |
|---|
| 359 | // ID in the set of ID-pairs or the node has no probes |
|---|
| 360 | // |
|---|
| 361 | if ((id < __MAX_SETS_ID) || (! has_probes)) { |
|---|
| 362 | for (PS_NodeMapConstIterator i = _ps_node->getChildrenBegin(); |
|---|
| 363 | i != _ps_node->getChildrenEnd(); |
|---|
| 364 | ++i) { |
|---|
| 365 | PS_find_probe_for_sets(i->second, _candidate_parent); |
|---|
| 366 | } |
|---|
| 367 | } |
|---|
| 368 | |
|---|
| 369 | // |
|---|
| 370 | // remove ID from path |
|---|
| 371 | // |
|---|
| 372 | __PATH.erase(id); |
|---|
| 373 | } |
|---|
| 374 | |
|---|
| 375 | static void PS_find_probes(const PS_NodePtr& _root_node, const int _round, PS_CandidatePtr _candidate_parent, const float _filling_level) { |
|---|
| 376 | // scan PS_Node-tree for candidates to raise filling level of __MAP |
|---|
| 377 | |
|---|
| 378 | __PROBES_COUNTER = 0; |
|---|
| 379 | __CANDIDATES_COUNTER = 0; |
|---|
| 380 | __SOURCE_MIN_MATCH_COUNT = (__SOURCE_ID_SET.size() * (__MIN_PERCENTAGE_SET_MATCH - (_round * 5))) / 100.0; |
|---|
| 381 | __SOURCE_MAX_MATCH_COUNT = (__SOURCE_ID_SET.size() * (__MAX_PERCENTAGE_SET_MATCH + (_round * 5))) / 100.0; |
|---|
| 382 | __TARGET_MIN_MATCH_COUNT = (__TARGET_ID_SET.size() * (__MIN_PERCENTAGE_SET_MATCH - (_round * 5))) / 100.0; |
|---|
| 383 | __TARGET_MAX_MATCH_COUNT = (__TARGET_ID_SET.size() * (__MAX_PERCENTAGE_SET_MATCH + (_round * 5))) / 100.0; |
|---|
| 384 | __SOURCE_PERFECT_MATCH_COUNT = __SOURCE_ID_SET.size() / 2.0; |
|---|
| 385 | __TARGET_PERFECT_MATCH_COUNT = __TARGET_ID_SET.size() / 2.0; |
|---|
| 386 | __MIN_SETS_ID = (*__SOURCE_ID_SET.begin() < *__TARGET_ID_SET.begin()) ? *__SOURCE_ID_SET.begin() : *__TARGET_ID_SET.begin(); |
|---|
| 387 | __MAX_SETS_ID = (*__SOURCE_ID_SET.rbegin() < *__TARGET_ID_SET.rbegin()) ? *__SOURCE_ID_SET.rbegin() : *__TARGET_ID_SET.rbegin(); |
|---|
| 388 | printf("PS_find_probes(%i) : source match %10.3f .. %10.3f target match %10.3f .. %10.3f\n", _round, __SOURCE_MIN_MATCH_COUNT, __SOURCE_MAX_MATCH_COUNT, __TARGET_MIN_MATCH_COUNT, __TARGET_MAX_MATCH_COUNT); fflush(stdout); |
|---|
| 389 | struct tms before; |
|---|
| 390 | times(&before); |
|---|
| 391 | for (PS_NodeMapConstIterator i = _root_node->getChildrenBegin(); |
|---|
| 392 | (i != _root_node->getChildrenEnd()) && (i->first < __MAX_SETS_ID); |
|---|
| 393 | ++i) |
|---|
| 394 | { |
|---|
| 395 | __PATH.clear(); |
|---|
| 396 | PS_find_probe_for_sets(i->second, _candidate_parent); |
|---|
| 397 | } |
|---|
| 398 | printf("PS_find_probes(%i) : %li candidates of %li probes visited ", _round, __CANDIDATES_COUNTER, __PROBES_COUNTER); |
|---|
| 399 | PS_print_time_diff(&before); |
|---|
| 400 | // reduce candidates to best,worst and middle gain |
|---|
| 401 | _candidate_parent->reduceChildren(_filling_level); |
|---|
| 402 | for (PS_CandidateByGainMapCIter c = _candidate_parent->children.begin(); |
|---|
| 403 | c != _candidate_parent->children.end(); |
|---|
| 404 | ++c) { |
|---|
| 405 | printf("PS_find_probes(%i) : ", _round); |
|---|
| 406 | c->second->print(); |
|---|
| 407 | ++__CANDIDATES_TODO; |
|---|
| 408 | } |
|---|
| 409 | fflush(stdout); |
|---|
| 410 | } |
|---|
| 411 | |
|---|
| 412 | static void PS_calc_next_speciesid_sets() { |
|---|
| 413 | // scan __MAP to find sets of IDs that need differentiation most |
|---|
| 414 | |
|---|
| 415 | // |
|---|
| 416 | // 1. __SOURCE_ID_SET |
|---|
| 417 | // scan bitmap for species that need more matches |
|---|
| 418 | // |
|---|
| 419 | SpeciesID highest_count; |
|---|
| 420 | SpeciesID lowest_count; |
|---|
| 421 | float threshold; |
|---|
| 422 | SpeciesID count; |
|---|
| 423 | |
|---|
| 424 | // first pass -- get lowest count of trues and calc threshold |
|---|
| 425 | lowest_count = __SPECIES_COUNT; |
|---|
| 426 | highest_count = 0; |
|---|
| 427 | for (SpeciesID id = __MIN_ID; |
|---|
| 428 | id <= __MAX_ID; |
|---|
| 429 | ++id) { |
|---|
| 430 | count = __MAP->getCountFor(id); |
|---|
| 431 | if (count == __SPECIES_COUNT-__MIN_ID-1) continue; // i cannot improve species that can be differed from all others |
|---|
| 432 | if (count > highest_count) highest_count = count; |
|---|
| 433 | if (count < lowest_count) lowest_count = count; |
|---|
| 434 | } |
|---|
| 435 | threshold = (((__SPECIES_COUNT-lowest_count) * __THRESHOLD_PERCENTAGE_NEXT_SOURCE_ID_SET) / 100.0) + lowest_count; |
|---|
| 436 | printf("PS_calc_next_speciesid_sets() : SOURCE count 1's [%i..%i] threshold (%.3f)", lowest_count, highest_count, threshold); |
|---|
| 437 | |
|---|
| 438 | // second pass -- get IDs where count is below or equal threshold |
|---|
| 439 | __SOURCE_ID_SET.clear(); |
|---|
| 440 | for (SpeciesID id = __MIN_ID; |
|---|
| 441 | id <= __MAX_ID; |
|---|
| 442 | ++id) { |
|---|
| 443 | if (__MAP->getCountFor(id) <= threshold) __SOURCE_ID_SET.insert(id); |
|---|
| 444 | } |
|---|
| 445 | PS_print_set_ranges(" __SOURCE_ID_SET", __SOURCE_ID_SET); |
|---|
| 446 | |
|---|
| 447 | // |
|---|
| 448 | // 2. __TARGET_ID_SET |
|---|
| 449 | // scan bitmap for species IDs that need to be distinguished from MOST species in __SOURCE_ID_SET |
|---|
| 450 | // |
|---|
| 451 | ID2IDMap count_falses_per_id; |
|---|
| 452 | |
|---|
| 453 | // first -- get the IDs that need differentiation from __SOURCE_ID_SET |
|---|
| 454 | unsigned long count_iterations = 0; |
|---|
| 455 | for (IDSetCIter source_id = __SOURCE_ID_SET.begin(); |
|---|
| 456 | source_id != __SOURCE_ID_SET.end(); |
|---|
| 457 | ++source_id, ++count_iterations) { |
|---|
| 458 | // get 'next_target_set' |
|---|
| 459 | __MAP->getFalseIndicesFor(*source_id, __TARGET_ID_SET); |
|---|
| 460 | // count falses per SpeciesID |
|---|
| 461 | for (PS_BitSet::IndexSet::iterator target_id = __TARGET_ID_SET.begin(); |
|---|
| 462 | target_id != __TARGET_ID_SET.end(); |
|---|
| 463 | ++target_id) { |
|---|
| 464 | if ((*target_id < __MIN_ID) || (*target_id > __MAX_ID)) continue; // skip ID's that are outside DB-IDs-range (like zero if __MIN_ID is one) |
|---|
| 465 | if (*target_id != *source_id) ++count_falses_per_id[*target_id]; |
|---|
| 466 | } |
|---|
| 467 | } |
|---|
| 468 | |
|---|
| 469 | // second -- get highest count of falses and calc threshold |
|---|
| 470 | lowest_count = __SPECIES_COUNT; |
|---|
| 471 | highest_count = 0; |
|---|
| 472 | for (ID2IDMapCIter count_per_id = count_falses_per_id.begin(); |
|---|
| 473 | count_per_id != count_falses_per_id.end(); |
|---|
| 474 | ++count_per_id) { |
|---|
| 475 | if (count_per_id->second > highest_count) highest_count = count_per_id->second; |
|---|
| 476 | if (count_per_id->second < lowest_count) lowest_count = count_per_id->second; |
|---|
| 477 | } |
|---|
| 478 | printf("PS_calc_next_speciesid_sets() : TARGET count 0's [%i..%i]", lowest_count, highest_count); |
|---|
| 479 | |
|---|
| 480 | // third -- put all IDs in __TARGET_ID_SET that are needed most by species from __SOURCE_ID_SET |
|---|
| 481 | __TARGET_ID_SET.clear(); |
|---|
| 482 | for (ID2IDMapCIter count_per_id = count_falses_per_id.begin(); |
|---|
| 483 | count_per_id != count_falses_per_id.end(); |
|---|
| 484 | ++count_per_id) { |
|---|
| 485 | if (count_per_id->second == highest_count) __TARGET_ID_SET.insert(count_per_id->first); |
|---|
| 486 | } |
|---|
| 487 | PS_print_set_ranges(" __TARGET_ID_SET", __TARGET_ID_SET); |
|---|
| 488 | } |
|---|
| 489 | |
|---|
| 490 | void PS_apply_path_to_bitmap(IDSet &_path, const bool _silent = false, PS_BitMap_Counted *_map = NULp) { |
|---|
| 491 | // set true in __MAP for all combinations of IDs (in _path , not in _path) |
|---|
| 492 | |
|---|
| 493 | // iterate over all IDs except path |
|---|
| 494 | IDSetCIter path_iter = _path.begin(); |
|---|
| 495 | SpeciesID next_path_ID = *path_iter; |
|---|
| 496 | unsigned long int gain = 0; |
|---|
| 497 | if (_map) { // called for a 'private' map ? |
|---|
| 498 | for (SpeciesID not_in_path_ID = __MIN_ID; |
|---|
| 499 | not_in_path_ID <= __MAX_ID; |
|---|
| 500 | ++not_in_path_ID) { |
|---|
| 501 | if (not_in_path_ID == next_path_ID) { // if i run into a ID in path |
|---|
| 502 | ++path_iter; // advance to next path ID |
|---|
| 503 | next_path_ID = (path_iter == _path.end()) ? -1 : *path_iter; |
|---|
| 504 | continue; // skip this ID |
|---|
| 505 | } |
|---|
| 506 | // iterate over path IDs |
|---|
| 507 | for (IDSetCIter path_ID = _path.begin(); |
|---|
| 508 | path_ID != _path.end(); |
|---|
| 509 | ++path_ID) { |
|---|
| 510 | if (not_in_path_ID == *path_ID) continue; // obviously a probe cant differ a species from itself |
|---|
| 511 | if (not_in_path_ID > *path_ID) { |
|---|
| 512 | gain += !_map->set(not_in_path_ID, *path_ID, true); |
|---|
| 513 | } |
|---|
| 514 | else { |
|---|
| 515 | gain += !_map->set(*path_ID, not_in_path_ID, true); |
|---|
| 516 | } |
|---|
| 517 | } |
|---|
| 518 | } |
|---|
| 519 | } |
|---|
| 520 | else { // called for __MAP |
|---|
| 521 | for (SpeciesID not_in_path_ID = __MIN_ID; |
|---|
| 522 | not_in_path_ID <= __MAX_ID; |
|---|
| 523 | ++not_in_path_ID) { |
|---|
| 524 | if (not_in_path_ID == next_path_ID) { // if i run into a ID in path |
|---|
| 525 | ++path_iter; // advance to next path ID |
|---|
| 526 | next_path_ID = (path_iter == _path.end()) ? -1 : *path_iter; |
|---|
| 527 | continue; // skip this ID |
|---|
| 528 | } |
|---|
| 529 | // iterate over path IDs |
|---|
| 530 | for (IDSetCIter path_ID = _path.begin(); |
|---|
| 531 | path_ID != _path.end(); |
|---|
| 532 | ++path_ID) { |
|---|
| 533 | if (not_in_path_ID == *path_ID) continue; // obviously a probe cant differ a species from itself |
|---|
| 534 | if (not_in_path_ID > *path_ID) { |
|---|
| 535 | gain += !__MAP->set(not_in_path_ID, *path_ID, true); |
|---|
| 536 | } |
|---|
| 537 | else { |
|---|
| 538 | gain += !__MAP->set(*path_ID, not_in_path_ID, true); |
|---|
| 539 | } |
|---|
| 540 | } |
|---|
| 541 | } |
|---|
| 542 | } |
|---|
| 543 | unsigned long int sets = (__SPECIES_COUNT-_path.size())*_path.size(); |
|---|
| 544 | if (!_silent) printf("PS_apply_path_to_bitmap() : gain %lu of %lu -- %.2f%% -> wasted %lu\n", gain, sets, ((float)gain/sets)*100.0, sets-gain); |
|---|
| 545 | } |
|---|
| 546 | |
|---|
| 547 | static float PS_filling_level(PS_CandidatePtr _candidate = NULp) { |
|---|
| 548 | // returns filling level of __MAP |
|---|
| 549 | |
|---|
| 550 | unsigned long trues = __MAP->getCountOfTrues(); |
|---|
| 551 | float percentage = ((float)trues / __BITS_IN_MAP)*100.0; |
|---|
| 552 | if (_candidate) { |
|---|
| 553 | _candidate->filling_level = percentage; |
|---|
| 554 | _candidate->false_IDs = __BITS_IN_MAP - trues; |
|---|
| 555 | } |
|---|
| 556 | else { |
|---|
| 557 | printf("PS_filling_level() : bitmap (%lu) now has %lu trues and %lu falses -- %.5f%% filled\n", __BITS_IN_MAP, trues, __BITS_IN_MAP-trues, percentage); |
|---|
| 558 | } |
|---|
| 559 | return percentage; |
|---|
| 560 | } |
|---|
| 561 | |
|---|
| 562 | void PS_GNUPlot(const char *_out_prefix, const long _iteration, const IDSet &_path, const ID2IDSet &_noMatches) { |
|---|
| 563 | // generate data- and commandfiles for GNUPlot to display __MAP and its count_trues_per_id |
|---|
| 564 | |
|---|
| 565 | char *buffer = (char *)malloc(4096); |
|---|
| 566 | // open data file |
|---|
| 567 | sprintf(buffer, "%s.%06li", _out_prefix, _iteration); |
|---|
| 568 | char *data_name = strdup(buffer); |
|---|
| 569 | PS_FileBuffer *file = new PS_FileBuffer(data_name, PS_FileBuffer::WRITEONLY); |
|---|
| 570 | // output data |
|---|
| 571 | long size = sprintf(buffer, "# CANDIDATE #%li path (%zu)\n", _iteration, _path.size()); |
|---|
| 572 | file->put(buffer, size); |
|---|
| 573 | for (IDSetCIter i = _path.begin(); |
|---|
| 574 | i != _path.end(); |
|---|
| 575 | ++i) { |
|---|
| 576 | size = sprintf(buffer, "# %i\n", *i); |
|---|
| 577 | file->put(buffer, size); |
|---|
| 578 | } |
|---|
| 579 | size = sprintf(buffer, "#noMatches (%zu)\n", _noMatches.size()); |
|---|
| 580 | file->put(buffer, size); |
|---|
| 581 | for (ID2IDSetCIter p = _noMatches.begin(); |
|---|
| 582 | p != _noMatches.end(); |
|---|
| 583 | ++p) { |
|---|
| 584 | size = sprintf(buffer, "%i %i\n", p->first, p->second); |
|---|
| 585 | file->put(buffer, size); |
|---|
| 586 | } |
|---|
| 587 | size = sprintf(buffer, "\n\n"); |
|---|
| 588 | file->put(buffer, size); |
|---|
| 589 | sprintf(buffer, "Bitmap after %li. candidate", _iteration); |
|---|
| 590 | char *title = strdup(buffer); |
|---|
| 591 | __MAP->printGNUplot(title, buffer, file); |
|---|
| 592 | // open gnuplot command file |
|---|
| 593 | sprintf(buffer, "%s.%06li.gnuplot", _out_prefix, _iteration); |
|---|
| 594 | file->reinit(buffer, PS_FileBuffer::WRITEONLY); |
|---|
| 595 | // output gnuplot commands |
|---|
| 596 | size = sprintf(buffer, "set terminal png color\nset output '%06li.bitmap.png\n", _iteration); |
|---|
| 597 | file->put(buffer, size); |
|---|
| 598 | size = sprintf(buffer, "set yrange [%i:%i] reverse\nset xrange [%i:%i]\n", __MIN_ID, __MAX_ID, __MIN_ID, __MAX_ID); |
|---|
| 599 | file->put(buffer, size); |
|---|
| 600 | size = sprintf(buffer, "set size square\nset title '%li. iteration'\nset pointsize 0.5\n", _iteration); |
|---|
| 601 | file->put(buffer, size); |
|---|
| 602 | size = sprintf(buffer, "plot '%s' index 1 title 'match ()' with dots 2,'%s' index 0 title 'no match (30)' with points 1\n", data_name, data_name); |
|---|
| 603 | file->put(buffer, size); |
|---|
| 604 | size = sprintf(buffer, "set terminal png color\nset output '%06li.counters.png\n", _iteration); |
|---|
| 605 | file->put(buffer, size); |
|---|
| 606 | size = sprintf(buffer, "set yrange [] noreverse\nset size nosquare\n"); |
|---|
| 607 | file->put(buffer, size); |
|---|
| 608 | size = sprintf(buffer, "plot '%s' index 2 title 'count/species' with impulses 2,'%s' index 3 title 'species/count' with points 1", data_name, data_name); |
|---|
| 609 | file->put(buffer, size); |
|---|
| 610 | |
|---|
| 611 | delete file; |
|---|
| 612 | free(title); |
|---|
| 613 | free(data_name); |
|---|
| 614 | free(buffer); |
|---|
| 615 | } |
|---|
| 616 | |
|---|
| 617 | static PS_CandidatePtr PS_ascend(PS_CandidatePtr _last_candidate) { |
|---|
| 618 | // 'remove' _last_candidate from __MAP by applying its parent's |
|---|
| 619 | // paths to a fresh __MAP |
|---|
| 620 | |
|---|
| 621 | // |
|---|
| 622 | // make fresh __MAP |
|---|
| 623 | // |
|---|
| 624 | __MAP->copy(__PRESET_MAP); |
|---|
| 625 | // |
|---|
| 626 | // apply paths of parent-candidates to __MAP |
|---|
| 627 | // |
|---|
| 628 | PS_CandidatePtr parent = _last_candidate->parent; |
|---|
| 629 | while (parent && !parent->path.empty()) { |
|---|
| 630 | PS_apply_path_to_bitmap(parent->path, true); |
|---|
| 631 | parent = parent->parent; |
|---|
| 632 | } |
|---|
| 633 | printf("ASCEND "); |
|---|
| 634 | PS_filling_level(); |
|---|
| 635 | // |
|---|
| 636 | // return the parent of the given candidate |
|---|
| 637 | // |
|---|
| 638 | return _last_candidate->parent; |
|---|
| 639 | } |
|---|
| 640 | |
|---|
| 641 | |
|---|
| 642 | static void PS_descend(PS_CandidatePtr _candidate_parent, const PS_NodePtr& _root_node, unsigned long _depth, const float _filling_level) { |
|---|
| 643 | struct tms before; |
|---|
| 644 | times(&before); |
|---|
| 645 | |
|---|
| 646 | printf("\nDESCEND ==================== depth (%lu / %lu) candidates (%lu / %lu -> %lu left) ====================\n", |
|---|
| 647 | _depth, |
|---|
| 648 | __MAX_DEPTH, |
|---|
| 649 | __CANDIDATES_FINISHED, |
|---|
| 650 | __CANDIDATES_TODO, |
|---|
| 651 | __CANDIDATES_TODO-__CANDIDATES_FINISHED); fflush(stdout); |
|---|
| 652 | printf("DESCEND PATH : (%s)\n\n", __PATH_IN_CANDIDATES); |
|---|
| 653 | |
|---|
| 654 | // |
|---|
| 655 | // calc source- and target-set of IDs |
|---|
| 656 | // |
|---|
| 657 | PS_calc_next_speciesid_sets(); |
|---|
| 658 | printf("\nDESCEND ---------- calculated next speciesid-sets ----------\n\n"); fflush(stdout); |
|---|
| 659 | |
|---|
| 660 | // |
|---|
| 661 | // search for candidates to improve filling level of __MAP using the source- and target-set of IDs |
|---|
| 662 | // max. 3 rounds |
|---|
| 663 | // |
|---|
| 664 | int round = 0; |
|---|
| 665 | for (; round<3 && _candidate_parent->children.empty(); ++round) { |
|---|
| 666 | PS_find_probes(_root_node, round, _candidate_parent, _filling_level); |
|---|
| 667 | } |
|---|
| 668 | printf("\nDESCEND ---------- searched probe for speciesid-sets [rounds : %i] candidates (%lu / %lu -> %lu left) ----------\n\n", |
|---|
| 669 | round, |
|---|
| 670 | __CANDIDATES_FINISHED, |
|---|
| 671 | __CANDIDATES_TODO, |
|---|
| 672 | __CANDIDATES_TODO-__CANDIDATES_FINISHED); fflush(stdout); |
|---|
| 673 | |
|---|
| 674 | // |
|---|
| 675 | // descend down each (of the max 3) candidate(s) |
|---|
| 676 | // |
|---|
| 677 | char count = '0'+_candidate_parent->children.size(); |
|---|
| 678 | for (PS_CandidateByGainMapRIter next_candidate_it = _candidate_parent->children.rbegin(); |
|---|
| 679 | next_candidate_it != _candidate_parent->children.rend(); |
|---|
| 680 | ++next_candidate_it, --count) { |
|---|
| 681 | PS_CandidatePtr next_candidate = &(*next_candidate_it->second); |
|---|
| 682 | ++__CANDIDATES_FINISHED; |
|---|
| 683 | // |
|---|
| 684 | // apply candidate to __MAP |
|---|
| 685 | // |
|---|
| 686 | printf("[%p] ", next_candidate); |
|---|
| 687 | PS_apply_path_to_bitmap(next_candidate->path); |
|---|
| 688 | printf("[%p] ", next_candidate); |
|---|
| 689 | PS_filling_level(next_candidate); |
|---|
| 690 | next_candidate->depth = _depth+1; |
|---|
| 691 | next_candidate->print(0, false, false); |
|---|
| 692 | // |
|---|
| 693 | // step down |
|---|
| 694 | // |
|---|
| 695 | if ((next_candidate->filling_level < 75.0) && (_depth+1 < __MAX_DEPTH)) { |
|---|
| 696 | __PATH_IN_CANDIDATES[_depth] = count; |
|---|
| 697 | __PATH_IN_CANDIDATES[_depth+1] = '\x00'; |
|---|
| 698 | PS_descend(next_candidate, _root_node, _depth+1, next_candidate->filling_level); |
|---|
| 699 | } |
|---|
| 700 | // |
|---|
| 701 | // remove candidate from __MAP |
|---|
| 702 | // |
|---|
| 703 | PS_ascend(next_candidate); |
|---|
| 704 | } |
|---|
| 705 | |
|---|
| 706 | printf("\nDESCEND ~~~~~~~~~~~~~~~~~~~~ depth (%li) max depth (%li) candidates (%lu / %lu -> %lu left) ~~~~~~~~~~~~~~~~~~~~ ", |
|---|
| 707 | _depth, |
|---|
| 708 | __MAX_DEPTH, |
|---|
| 709 | __CANDIDATES_FINISHED, |
|---|
| 710 | __CANDIDATES_TODO, |
|---|
| 711 | __CANDIDATES_TODO-__CANDIDATES_FINISHED); |
|---|
| 712 | PS_print_time_diff(&before); |
|---|
| 713 | printf("\n"); fflush(stdout); |
|---|
| 714 | } |
|---|
| 715 | |
|---|
| 716 | static void PS_make_map_for_candidate(PS_CandidatePtr _candidate) { |
|---|
| 717 | // make __MAP for _candidate |
|---|
| 718 | |
|---|
| 719 | if (_candidate->map) return; // if candidate already has its map return |
|---|
| 720 | _candidate->map = new PS_BitMap_Counted(false, __MAX_ID+1); |
|---|
| 721 | _candidate->map->copy(__PRESET_MAP); |
|---|
| 722 | PS_apply_path_to_bitmap(_candidate->path, true, _candidate->map); // apply _candidate's path |
|---|
| 723 | PS_CandidatePtr parent = _candidate->parent; |
|---|
| 724 | while (parent && !parent->path.empty()) { // apply parent's paths |
|---|
| 725 | PS_apply_path_to_bitmap(parent->path, true, _candidate->map); |
|---|
| 726 | parent = parent->parent; |
|---|
| 727 | } |
|---|
| 728 | } |
|---|
| 729 | |
|---|
| 730 | |
|---|
| 731 | void PS_get_leaf_candidates(PS_CandidatePtr _candidate_parent, |
|---|
| 732 | PS_CandidateSet &_leaf_candidates, |
|---|
| 733 | const bool _ignore_passes_left = false) { |
|---|
| 734 | |
|---|
| 735 | for (PS_CandidateByGainMapIter next_candidate = _candidate_parent->children.begin(); |
|---|
| 736 | next_candidate != _candidate_parent->children.end(); |
|---|
| 737 | ++next_candidate) { |
|---|
| 738 | |
|---|
| 739 | if ((next_candidate->second->children.size() == 0) && |
|---|
| 740 | ((next_candidate->second->passes_left > 0) || _ignore_passes_left)) { |
|---|
| 741 | _leaf_candidates.insert(next_candidate->second); |
|---|
| 742 | } |
|---|
| 743 | PS_get_leaf_candidates(&(*next_candidate->second), _leaf_candidates, _ignore_passes_left); |
|---|
| 744 | } |
|---|
| 745 | } |
|---|
| 746 | |
|---|
| 747 | void PS_get_next_candidates_descend(PS_NodePtr _ps_node, PS_CandidateSet &_leaf_candidates) { |
|---|
| 748 | // scan PS_node-tree for next candidates for each of _leaf_candidates |
|---|
| 749 | |
|---|
| 750 | SpeciesID id = _ps_node->getNum(); |
|---|
| 751 | bool has_probes = _ps_node->hasProbes(); |
|---|
| 752 | |
|---|
| 753 | // |
|---|
| 754 | // append ID to path |
|---|
| 755 | // |
|---|
| 756 | __PATH.insert(id); |
|---|
| 757 | |
|---|
| 758 | // |
|---|
| 759 | // don't look at path until ID is greater than lowest ID in the sets of IDs |
|---|
| 760 | // also don't use a node if its already used as candidate |
|---|
| 761 | // |
|---|
| 762 | if ((id >= __MIN_SETS_ID) && has_probes) { |
|---|
| 763 | unsigned long total_gain_of_node = 0; |
|---|
| 764 | unsigned long total_tests_per_node = 0; |
|---|
| 765 | ++__PROBES_COUNTER; |
|---|
| 766 | // iterate over _leaf_candidates |
|---|
| 767 | for (PS_CandidateSetIter candidate_iter = _leaf_candidates.begin(); |
|---|
| 768 | candidate_iter != _leaf_candidates.end(); |
|---|
| 769 | ++candidate_iter) { |
|---|
| 770 | PS_CandidateSPtr candidate = *candidate_iter; |
|---|
| 771 | |
|---|
| 772 | // next leaf-candidate if the probe was already used for current candidate |
|---|
| 773 | if (candidate->alreadyUsedNode(_ps_node)) continue; |
|---|
| 774 | |
|---|
| 775 | ++__CANDIDATES_COUNTER; // possible candidates |
|---|
| 776 | if (__VERBOSE) printf("%8lup %8lur %8luc %8lug %8luu\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b", |
|---|
| 777 | __PROBES_COUNTER, |
|---|
| 778 | __PROBES_REMOVED, |
|---|
| 779 | __CANDIDATES_COUNTER, |
|---|
| 780 | __CANDIDATES_TODO, |
|---|
| 781 | __CANDIDATES_FINISHED); fflush(stdout); |
|---|
| 782 | |
|---|
| 783 | // next leaf-candidate if __PATH doesn't fulfill matching criteria |
|---|
| 784 | unsigned long matches = candidate->matchPathOnOneFalseIDs(__PATH); |
|---|
| 785 | if (matches < candidate->one_false_IDs_matches) continue; |
|---|
| 786 | |
|---|
| 787 | ++__CANDIDATES_TODO; // good candidates |
|---|
| 788 | if (__VERBOSE) printf("%8lup %8lur %8luc %8lug %8luu\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b", |
|---|
| 789 | __PROBES_COUNTER, |
|---|
| 790 | __PROBES_REMOVED, |
|---|
| 791 | __CANDIDATES_COUNTER, |
|---|
| 792 | __CANDIDATES_TODO, |
|---|
| 793 | __CANDIDATES_FINISHED); fflush(stdout); |
|---|
| 794 | |
|---|
| 795 | // test node on candidate's bitmap |
|---|
| 796 | float filling_level; |
|---|
| 797 | ++total_tests_per_node; |
|---|
| 798 | unsigned long gain = PS_test_candidate_on_bitmap(&filling_level, candidate->map); |
|---|
| 799 | total_gain_of_node += gain; |
|---|
| 800 | if (candidate->updateBestChild(gain, matches, filling_level, _ps_node, __PATH)) { |
|---|
| 801 | ++__CANDIDATES_FINISHED; // used candiates |
|---|
| 802 | if (__VERBOSE) printf("%8lup %8lur %8luc %8lug %8luu\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b", |
|---|
| 803 | __PROBES_COUNTER, |
|---|
| 804 | __PROBES_REMOVED, |
|---|
| 805 | __CANDIDATES_COUNTER, |
|---|
| 806 | __CANDIDATES_TODO, |
|---|
| 807 | __CANDIDATES_FINISHED); fflush(stdout); |
|---|
| 808 | } |
|---|
| 809 | } |
|---|
| 810 | if ((total_tests_per_node == _leaf_candidates.size()) && |
|---|
| 811 | (total_gain_of_node == 0)) { |
|---|
| 812 | ++__PROBES_REMOVED; |
|---|
| 813 | if (__VERBOSE) printf("%8lup %8lur %8luc %8lug %8luu\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b", |
|---|
| 814 | __PROBES_COUNTER, |
|---|
| 815 | __PROBES_REMOVED, |
|---|
| 816 | __CANDIDATES_COUNTER, |
|---|
| 817 | __CANDIDATES_TODO, |
|---|
| 818 | __CANDIDATES_FINISHED); fflush(stdout); |
|---|
| 819 | _ps_node->removeProbes(); |
|---|
| 820 | } |
|---|
| 821 | } |
|---|
| 822 | |
|---|
| 823 | // |
|---|
| 824 | // step down the children if either ID is lower than highest |
|---|
| 825 | // ID in the set of ID-pairs or the node has no probes |
|---|
| 826 | // |
|---|
| 827 | if ((id < __MAX_SETS_ID) || (! has_probes)) { |
|---|
| 828 | for (PS_NodeMapConstIterator i = _ps_node->getChildrenBegin(); |
|---|
| 829 | i != _ps_node->getChildrenEnd(); |
|---|
| 830 | ++i) { |
|---|
| 831 | PS_get_next_candidates_descend(i->second, _leaf_candidates); |
|---|
| 832 | } |
|---|
| 833 | } |
|---|
| 834 | |
|---|
| 835 | // |
|---|
| 836 | // remove ID from path |
|---|
| 837 | // |
|---|
| 838 | __PATH.erase(id); |
|---|
| 839 | } |
|---|
| 840 | |
|---|
| 841 | |
|---|
| 842 | void PS_get_next_candidates(const PS_NodePtr& _root_node, PS_CandidateSet& _leaf_candidates) { |
|---|
| 843 | struct tms before; |
|---|
| 844 | times(&before); |
|---|
| 845 | // first calc source/target/false_IDs sets |
|---|
| 846 | printf("PS_get_next_candidates() : initializing %zu candidates..\n", _leaf_candidates.size()); fflush(stdout); |
|---|
| 847 | __PROBES_COUNTER = 0; |
|---|
| 848 | __PROBES_REMOVED = 0; |
|---|
| 849 | __CANDIDATES_COUNTER = 0; |
|---|
| 850 | __CANDIDATES_TODO = 0; |
|---|
| 851 | __CANDIDATES_FINISHED = 0; |
|---|
| 852 | __MIN_SETS_ID = __MAX_ID; |
|---|
| 853 | __MAX_SETS_ID = __MIN_ID; |
|---|
| 854 | for (PS_CandidateSetIter iter = _leaf_candidates.begin(); |
|---|
| 855 | iter != _leaf_candidates.end(); |
|---|
| 856 | ++iter) { |
|---|
| 857 | PS_CandidateSPtr candidate = *iter; |
|---|
| 858 | if (!candidate->map) { // if candidate has no map yet |
|---|
| 859 | candidate->getParentMap(); // try to get its parent's map |
|---|
| 860 | if (candidate->map) { // if parent had a map (if not a new map is created in PS_calc_next_speciesid_sets_for_candidate |
|---|
| 861 | PS_apply_path_to_bitmap(candidate->path, true, candidate->map); // apply candidate's path |
|---|
| 862 | } |
|---|
| 863 | } |
|---|
| 864 | PS_make_map_for_candidate(&(*candidate)); |
|---|
| 865 | candidate->initFalseIDs(__MIN_ID, __MAX_ID, __MIN_SETS_ID, __MAX_SETS_ID); |
|---|
| 866 | } |
|---|
| 867 | // scan tree |
|---|
| 868 | printf("PS_get_next_candidates() : "); fflush(stdout); |
|---|
| 869 | for (PS_NodeMapConstIterator node_iter = _root_node->getChildrenBegin(); |
|---|
| 870 | (node_iter != _root_node->getChildrenEnd()) && (node_iter->first < __MAX_SETS_ID); |
|---|
| 871 | ++node_iter) { |
|---|
| 872 | __PATH.clear(); |
|---|
| 873 | PS_get_next_candidates_descend(node_iter->second, _leaf_candidates); |
|---|
| 874 | } |
|---|
| 875 | printf("looked at probes (%lu) -> removed probes (%lu) possible candidates (%lu) -> good candidates (%lu) -> used candidates (%lu)\nPS_get_next_candidates() : ", |
|---|
| 876 | __PROBES_COUNTER, |
|---|
| 877 | __PROBES_REMOVED, |
|---|
| 878 | __CANDIDATES_COUNTER, |
|---|
| 879 | __CANDIDATES_TODO, |
|---|
| 880 | __CANDIDATES_FINISHED); fflush(stdout); |
|---|
| 881 | PS_print_time_diff(&before); |
|---|
| 882 | |
|---|
| 883 | // 'descend' candidates |
|---|
| 884 | for (PS_CandidateSetIter iter = _leaf_candidates.begin(); |
|---|
| 885 | iter != _leaf_candidates.end(); |
|---|
| 886 | ++iter) { |
|---|
| 887 | PS_CandidateSPtr candidate = *iter; |
|---|
| 888 | candidate->print(); |
|---|
| 889 | candidate->decreasePasses(); |
|---|
| 890 | } |
|---|
| 891 | |
|---|
| 892 | _leaf_candidates.clear(); |
|---|
| 893 | PS_get_leaf_candidates(__CANDIDATES_ROOT, _leaf_candidates); |
|---|
| 894 | } |
|---|
| 895 | |
|---|
| 896 | // ==================================================== |
|---|
| 897 | // ==================================================== |
|---|
| 898 | int main(int argc, |
|---|
| 899 | char *argv[]) { |
|---|
| 900 | // |
|---|
| 901 | // check arguments |
|---|
| 902 | // |
|---|
| 903 | if (argc < 5) { |
|---|
| 904 | printf("Missing argument\n Usage %s <database name> <result filename> <[-]candidates filename> <final candidates filename> [--verbose] [result filename prefix for output files]\n", argv[0]); |
|---|
| 905 | exit(1); |
|---|
| 906 | } |
|---|
| 907 | |
|---|
| 908 | const char *input_DB_name = argv[1]; |
|---|
| 909 | const char *result_in_filename = argv[2]; |
|---|
| 910 | const char *candidates_filename = argv[3]; |
|---|
| 911 | const char *final_candidates_filename = argv[4]; |
|---|
| 912 | const char *result_out_prefix = (argc > 5) ? argv[5] : NULp; |
|---|
| 913 | if (argc > 5) { |
|---|
| 914 | if (strcmp(result_out_prefix, "--verbose") == 0) { |
|---|
| 915 | __VERBOSE = true; |
|---|
| 916 | result_out_prefix = (argc > 6) ? argv[6] : NULp; |
|---|
| 917 | } |
|---|
| 918 | } |
|---|
| 919 | // |
|---|
| 920 | // open probe-set-database |
|---|
| 921 | // |
|---|
| 922 | printf("Opening probe-set-database '%s'..\n", input_DB_name); |
|---|
| 923 | PS_Database *db = new PS_Database(input_DB_name, PS_Database::READONLY); |
|---|
| 924 | db->load(); |
|---|
| 925 | __MAX_ID = db->getMaxID(); |
|---|
| 926 | __MIN_ID = db->getMinID(); |
|---|
| 927 | __SPECIES_COUNT = db->getSpeciesCount(); |
|---|
| 928 | printf("min ID (%i) max ID (%i) count of species (%i)\n", __MIN_ID, __MAX_ID, __SPECIES_COUNT); |
|---|
| 929 | printf("..loaded database (enter to continue)\n"); fflush(stdout); |
|---|
| 930 | |
|---|
| 931 | // |
|---|
| 932 | // open result file for preset bitmap |
|---|
| 933 | // |
|---|
| 934 | printf("Opening result file %s..\n", result_in_filename); |
|---|
| 935 | PS_FileBuffer *result_file = new PS_FileBuffer(result_in_filename, PS_FileBuffer::READONLY); |
|---|
| 936 | long size; |
|---|
| 937 | ID2IDSet noMatches; |
|---|
| 938 | SpeciesID id1, id2; |
|---|
| 939 | printf("reading no matches : "); |
|---|
| 940 | result_file->get_long(size); |
|---|
| 941 | printf("(%li)\n", size); fflush(stdout); |
|---|
| 942 | for (long i=0; |
|---|
| 943 | i < size; |
|---|
| 944 | ++i) { |
|---|
| 945 | result_file->get_int(id1); |
|---|
| 946 | result_file->get_int(id2); |
|---|
| 947 | printf(" %i,%i", id1, id2); |
|---|
| 948 | if (id1 < id2) { |
|---|
| 949 | noMatches.insert(ID2IDPair(id1, id2)); |
|---|
| 950 | } |
|---|
| 951 | else { |
|---|
| 952 | noMatches.insert(ID2IDPair(id2, id1)); |
|---|
| 953 | } |
|---|
| 954 | } |
|---|
| 955 | printf("\nreading one matches : "); |
|---|
| 956 | result_file->get_long(size); |
|---|
| 957 | printf("(%li)\n", size); fflush(stdout); |
|---|
| 958 | long path_length; |
|---|
| 959 | SpeciesID path_id; |
|---|
| 960 | IDSet path; |
|---|
| 961 | // IDID2IDSetMap oneMatchesMap; |
|---|
| 962 | for (long i=0; |
|---|
| 963 | i < size; |
|---|
| 964 | ++i) { |
|---|
| 965 | path.clear(); |
|---|
| 966 | result_file->get_int(id1); |
|---|
| 967 | result_file->get_int(id2); |
|---|
| 968 | result_file->get_long(path_length); |
|---|
| 969 | printf("%i,%i path(%li)", id1, id2, path_length); |
|---|
| 970 | while (path_length-- > 0) { |
|---|
| 971 | result_file->get_int(path_id); |
|---|
| 972 | printf(" %i", path_id); |
|---|
| 973 | path.insert(path_id); |
|---|
| 974 | } |
|---|
| 975 | printf("\n"); |
|---|
| 976 | // oneMatchesMap[ID2IDPair(id1, id2)] = path; |
|---|
| 977 | } |
|---|
| 978 | printf("loading preset bitmap..\n"); fflush(stdout); |
|---|
| 979 | __BITS_IN_MAP = ((__MAX_ID+1)*__MAX_ID)/2 + __MAX_ID+1; |
|---|
| 980 | __MAP = new PS_BitMap_Counted(result_file); |
|---|
| 981 | __PRESET_MAP = new PS_BitMap_Counted(false, __MAX_ID+1); |
|---|
| 982 | for (id1 = 0; id1 < __MIN_ID; ++id1) { |
|---|
| 983 | for (id2 = 0; id2 <= __MAX_ID; ++id2) { |
|---|
| 984 | if (id1 > id2) { |
|---|
| 985 | __MAP->setTrue(id1, id2); |
|---|
| 986 | } |
|---|
| 987 | else { |
|---|
| 988 | __MAP->setTrue(id2, id1); |
|---|
| 989 | } |
|---|
| 990 | } |
|---|
| 991 | } |
|---|
| 992 | for (SpeciesID id = 0; id <= __MAX_ID; ++id) { |
|---|
| 993 | __MAP->setTrue(id, id); |
|---|
| 994 | } |
|---|
| 995 | __MAP->recalcCounters(); |
|---|
| 996 | __PRESET_MAP->copy(__MAP); |
|---|
| 997 | printf("..loaded result file (enter to continue)\n"); |
|---|
| 998 | printf("cleaning up... result file\n"); fflush(stdout); |
|---|
| 999 | delete result_file; |
|---|
| 1000 | |
|---|
| 1001 | // |
|---|
| 1002 | // create or read candidates |
|---|
| 1003 | // |
|---|
| 1004 | __CANDIDATES_ROOT = new PS_Candidate(0.0); |
|---|
| 1005 | PS_FileBuffer *candidates_file; |
|---|
| 1006 | struct tms before; |
|---|
| 1007 | if (candidates_filename[0] != '-') { |
|---|
| 1008 | printf("searching candidates..\n"); |
|---|
| 1009 | // |
|---|
| 1010 | // recursively build a tree of candidates |
|---|
| 1011 | // |
|---|
| 1012 | __MAX_DEPTH = 0; |
|---|
| 1013 | for (long species_count = __SPECIES_COUNT; species_count > 0; species_count >>= 1) { |
|---|
| 1014 | ++__MAX_DEPTH; |
|---|
| 1015 | } |
|---|
| 1016 | __MAX_DEPTH = (__MAX_DEPTH * 3) >> 1; |
|---|
| 1017 | __PATH_IN_CANDIDATES = (char *)calloc(__MAX_DEPTH+1, sizeof(char)); |
|---|
| 1018 | __CANDIDATES_TODO = 0; |
|---|
| 1019 | __CANDIDATES_FINISHED = 0; |
|---|
| 1020 | times(&before); |
|---|
| 1021 | PS_descend(__CANDIDATES_ROOT, db->getConstRootNode(), 0, 0.0); |
|---|
| 1022 | printf("PS_descend : total "); |
|---|
| 1023 | PS_print_time_diff(&before); |
|---|
| 1024 | |
|---|
| 1025 | // |
|---|
| 1026 | // save candidates |
|---|
| 1027 | // |
|---|
| 1028 | printf("saving candidates to %s..\n", candidates_filename); |
|---|
| 1029 | candidates_file = new PS_FileBuffer(candidates_filename, PS_FileBuffer::WRITEONLY); |
|---|
| 1030 | __CANDIDATES_ROOT->false_IDs = __BITS_IN_MAP; |
|---|
| 1031 | __CANDIDATES_ROOT->save(candidates_file, __BITS_IN_MAP); |
|---|
| 1032 | } |
|---|
| 1033 | else { |
|---|
| 1034 | printf("loading candidates..\n"); |
|---|
| 1035 | candidates_file = new PS_FileBuffer(candidates_filename+1, PS_FileBuffer::READONLY); |
|---|
| 1036 | __CANDIDATES_ROOT->load(candidates_file, __BITS_IN_MAP, db->getConstRootNode()); |
|---|
| 1037 | printf("..loaded candidates file (enter to continue)\n"); |
|---|
| 1038 | } |
|---|
| 1039 | printf("cleaning up... candidates file\n"); fflush(stdout); |
|---|
| 1040 | delete candidates_file; |
|---|
| 1041 | |
|---|
| 1042 | // |
|---|
| 1043 | // scan candidates-tree for leaf candidates |
|---|
| 1044 | // |
|---|
| 1045 | printf("CANDIDATES :\n"); |
|---|
| 1046 | __CANDIDATES_ROOT->print(); |
|---|
| 1047 | printf("\nsearching leaf candidates.. "); |
|---|
| 1048 | PS_CandidateSet leaf_candidates; |
|---|
| 1049 | PS_get_leaf_candidates(__CANDIDATES_ROOT, leaf_candidates); |
|---|
| 1050 | printf("%zu\n", leaf_candidates.size()); |
|---|
| 1051 | |
|---|
| 1052 | // |
|---|
| 1053 | // scan tree for next candidates (below the ones in leaf_candidates) |
|---|
| 1054 | // |
|---|
| 1055 | times(&before); |
|---|
| 1056 | long round = 0; |
|---|
| 1057 | while (!leaf_candidates.empty() && round<200) { |
|---|
| 1058 | printf("\nsearching next candidates [round #%li]..\n", ++round); |
|---|
| 1059 | PS_get_next_candidates(db->getConstRootNode(), leaf_candidates); |
|---|
| 1060 | |
|---|
| 1061 | printf("rounds %li total ", round); |
|---|
| 1062 | PS_print_time_diff(&before); |
|---|
| 1063 | } |
|---|
| 1064 | |
|---|
| 1065 | printf("\nFINAL CANDIDATES:\n"); |
|---|
| 1066 | __CANDIDATES_ROOT->print(0, true); |
|---|
| 1067 | printf("\nfinal leaf candidates.. (%zu)\n", leaf_candidates.size()); |
|---|
| 1068 | PS_get_leaf_candidates(__CANDIDATES_ROOT, leaf_candidates, true); |
|---|
| 1069 | for (PS_CandidateSetIter c = leaf_candidates.begin(); |
|---|
| 1070 | c != leaf_candidates.end(); |
|---|
| 1071 | ++c) { |
|---|
| 1072 | (*(*c)).print(); |
|---|
| 1073 | } |
|---|
| 1074 | |
|---|
| 1075 | // |
|---|
| 1076 | // save final candidates |
|---|
| 1077 | // |
|---|
| 1078 | printf("saving final candidates to %s..\n", final_candidates_filename); |
|---|
| 1079 | candidates_file = new PS_FileBuffer(final_candidates_filename, PS_FileBuffer::WRITEONLY); |
|---|
| 1080 | __CANDIDATES_ROOT->save(candidates_file, __BITS_IN_MAP); |
|---|
| 1081 | printf("cleaning up... candidates file\n"); fflush(stdout); |
|---|
| 1082 | delete candidates_file; |
|---|
| 1083 | |
|---|
| 1084 | // |
|---|
| 1085 | // starting with the best candidate print the __MAP for each |
|---|
| 1086 | // depth walking up to root of candidates-tree |
|---|
| 1087 | // |
|---|
| 1088 | if (result_out_prefix) { |
|---|
| 1089 | // put __MAP in the state 'after applying best_candidate and its parents' |
|---|
| 1090 | PS_CandidateSPtr best_candidate_smart = *leaf_candidates.begin(); |
|---|
| 1091 | PS_CandidatePtr best_candidate = &(*best_candidate_smart); |
|---|
| 1092 | PS_ascend(best_candidate); |
|---|
| 1093 | PS_apply_path_to_bitmap(best_candidate->path); |
|---|
| 1094 | while (best_candidate) { |
|---|
| 1095 | PS_GNUPlot(result_out_prefix, __MAX_DEPTH--, best_candidate->path, noMatches); |
|---|
| 1096 | best_candidate = PS_ascend(best_candidate); |
|---|
| 1097 | } |
|---|
| 1098 | } |
|---|
| 1099 | |
|---|
| 1100 | // |
|---|
| 1101 | // clean up |
|---|
| 1102 | // |
|---|
| 1103 | printf("cleaning up... candidates\n"); fflush(stdout); |
|---|
| 1104 | delete __CANDIDATES_ROOT; |
|---|
| 1105 | free(__PATH_IN_CANDIDATES); |
|---|
| 1106 | printf("cleaning up... database\n"); fflush(stdout); |
|---|
| 1107 | delete db; |
|---|
| 1108 | printf("cleaning up... bitmaps\n"); fflush(stdout); |
|---|
| 1109 | delete __PRESET_MAP; |
|---|
| 1110 | delete __MAP; |
|---|
| 1111 | |
|---|
| 1112 | return 0; |
|---|
| 1113 | } |
|---|