| 1 | // ============================================================= // | 
|---|
| 2 | //                                                               // | 
|---|
| 3 | //   File      : group_search.cxx                                // | 
|---|
| 4 | //   Purpose   : provides group search functionality             // | 
|---|
| 5 | //                                                               // | 
|---|
| 6 | //   Coded by Ralf Westram (coder@reallysoft.de) in April 2017   // | 
|---|
| 7 | //   http://www.arb-home.de/                                     // | 
|---|
| 8 | //                                                               // | 
|---|
| 9 | // ============================================================= // | 
|---|
| 10 |  | 
|---|
| 11 | #include "group_search.h" | 
|---|
| 12 |  | 
|---|
| 13 | #include <arb_strarray.h> | 
|---|
| 14 | #include <arb_progress.h> | 
|---|
| 15 | #include <arb_sort.h> | 
|---|
| 16 | #include <arb_strbuf.h> | 
|---|
| 17 | #include <arb_defs.h> | 
|---|
| 18 |  | 
|---|
| 19 | #include <gb_aci_impl.h> | 
|---|
| 20 |  | 
|---|
| 21 | #include <ad_cb.h> | 
|---|
| 22 | #include <TreeNode.h> | 
|---|
| 23 |  | 
|---|
| 24 | #include <map> | 
|---|
| 25 | #include <stack> | 
|---|
| 26 | #include <arb_misc.h> | 
|---|
| 27 | #include <arb_msg_nospam.h> | 
|---|
| 28 |  | 
|---|
| 29 | using namespace std; | 
|---|
| 30 |  | 
|---|
| 31 | class GroupSearchTree; | 
|---|
| 32 |  | 
|---|
| 33 | class GroupSearchRoot FINAL_TYPE : public TreeRoot { | 
|---|
| 34 | public: | 
|---|
| 35 | GroupSearchRoot() : | 
|---|
| 36 | TreeRoot(false) | 
|---|
| 37 | {} | 
|---|
| 38 | ~GroupSearchRoot() FINAL_OVERRIDE { predelete(); } | 
|---|
| 39 |  | 
|---|
| 40 | DEFINE_TREE_ROOT_ACCESSORS(GroupSearchRoot, GroupSearchTree); | 
|---|
| 41 |  | 
|---|
| 42 | // TreeRoot interface | 
|---|
| 43 | inline TreeNode *makeNode() const OVERRIDE; | 
|---|
| 44 | inline void destroyNode(TreeNode *node) const OVERRIDE; | 
|---|
| 45 | }; | 
|---|
| 46 |  | 
|---|
| 47 | class GroupSearchTree FINAL_TYPE : public TreeNode { | 
|---|
| 48 | mutable Lazy<int,-1>      size;    // number of leafs (=zombies+species); -1 -> need update | 
|---|
| 49 | mutable Lazy<int,-1>      marked;  // number of marked species; -1 -> need update | 
|---|
| 50 | mutable Lazy<int,-1>      zombies; // number of zombies | 
|---|
| 51 | mutable LazyFloat<double> aid;     // average ingroup distance | 
|---|
| 52 |  | 
|---|
| 53 | enum UpdateWhat { | 
|---|
| 54 | UPDATE_SIZE,   // quick (update 'size' only) | 
|---|
| 55 | UPDATE_MARKED, // slow  (update all) | 
|---|
| 56 | }; | 
|---|
| 57 |  | 
|---|
| 58 | void update_info(UpdateWhat what) const; | 
|---|
| 59 | void calc_average_ingroup_distance(int group_size) const; | 
|---|
| 60 | double weighted_branchlength_sum(int group_size) const; | 
|---|
| 61 |  | 
|---|
| 62 | static GBDATA *gb_species_data; | 
|---|
| 63 |  | 
|---|
| 64 | public: | 
|---|
| 65 | GroupSearchTree(GroupSearchRoot *root) : | 
|---|
| 66 | TreeNode(root) | 
|---|
| 67 | {} | 
|---|
| 68 |  | 
|---|
| 69 | DEFINE_TREE_RELATIVES_ACCESSORS(GroupSearchTree); | 
|---|
| 70 |  | 
|---|
| 71 | static void set_species_data(GBDATA *gb_species_data_) { gb_species_data = gb_species_data_; } | 
|---|
| 72 |  | 
|---|
| 73 | // TreeNode interface | 
|---|
| 74 | unsigned get_leaf_count() const FINAL_OVERRIDE { | 
|---|
| 75 | if (size.needs_eval()) update_info(UPDATE_SIZE); | 
|---|
| 76 | return size; | 
|---|
| 77 | } | 
|---|
| 78 | void compute_tree() OVERRIDE { | 
|---|
| 79 | gs_assert(0); // should be unused | 
|---|
| 80 | } | 
|---|
| 81 |  | 
|---|
| 82 | unsigned get_marked_count() const { | 
|---|
| 83 | if (marked.needs_eval()) update_info(UPDATE_MARKED); | 
|---|
| 84 | return marked; | 
|---|
| 85 | } | 
|---|
| 86 | unsigned get_zombie_count() const { | 
|---|
| 87 | if (zombies.needs_eval()) update_info(UPDATE_MARKED); | 
|---|
| 88 | return zombies; | 
|---|
| 89 | } | 
|---|
| 90 |  | 
|---|
| 91 | double get_average_ingroup_distance() const { | 
|---|
| 92 | if (aid.needs_eval()) calc_average_ingroup_distance(get_leaf_count()); | 
|---|
| 93 | return aid; | 
|---|
| 94 | } | 
|---|
| 95 | }; | 
|---|
| 96 |  | 
|---|
| 97 | GBDATA *GroupSearchTree::gb_species_data = NULp; | 
|---|
| 98 |  | 
|---|
| 99 | inline TreeNode *GroupSearchRoot::makeNode() const { return new GroupSearchTree(const_cast<GroupSearchRoot*>(this)); } | 
|---|
| 100 | inline void GroupSearchRoot::destroyNode(TreeNode *node) const { delete DOWNCAST(GroupSearchTree*,node); } | 
|---|
| 101 |  | 
|---|
| 102 | void GroupSearchTree::update_info(UpdateWhat what) const { | 
|---|
| 103 | if (is_leaf()) { | 
|---|
| 104 | size = 1; | 
|---|
| 105 | if (what == UPDATE_MARKED) { | 
|---|
| 106 | gs_assert(gb_species_data); | 
|---|
| 107 |  | 
|---|
| 108 | GBDATA *gb_species = GBT_find_species_rel_species_data(gb_species_data, name); | 
|---|
| 109 | if (gb_species) { | 
|---|
| 110 | marked  = GB_read_flag(gb_species); | 
|---|
| 111 | zombies = 0; | 
|---|
| 112 | } | 
|---|
| 113 | else { | 
|---|
| 114 | marked  = 0; | 
|---|
| 115 | zombies = 1; | 
|---|
| 116 | } | 
|---|
| 117 | } | 
|---|
| 118 | } | 
|---|
| 119 | else { | 
|---|
| 120 | switch (what) { | 
|---|
| 121 | case UPDATE_MARKED: | 
|---|
| 122 | marked  = get_leftson()->get_marked_count() + get_rightson()->get_marked_count(); // triggers lazy-update (UPDATE_MARKED) | 
|---|
| 123 | zombies = get_leftson()->get_zombie_count() + get_rightson()->get_zombie_count(); | 
|---|
| 124 | // fall-through | 
|---|
| 125 | case UPDATE_SIZE: | 
|---|
| 126 | size    = get_leftson()->get_leaf_count() + get_rightson()->get_leaf_count();    // triggers lazy-update (UPDATE_SIZE) | 
|---|
| 127 | break; | 
|---|
| 128 | } | 
|---|
| 129 | } | 
|---|
| 130 | } | 
|---|
| 131 |  | 
|---|
| 132 | typedef SmartPtr<GroupSearchRoot> GroupSearchRootPtr; | 
|---|
| 133 |  | 
|---|
| 134 | class SearchedTree { | 
|---|
| 135 | string         name; | 
|---|
| 136 | RefPtr<GBDATA> gb_tree; | 
|---|
| 137 | long           inner_nodes; // number of inner nodes in binary tree (i.e. ROOTED) | 
|---|
| 138 | // (Note: corrupted trees in existing DBs sometimes contain zero nodes | 
|---|
| 139 | //        (caused by older bugs?)) | 
|---|
| 140 |  | 
|---|
| 141 | GroupSearchRootPtr troot; // (optional) loaded tree | 
|---|
| 142 | string             load_error; | 
|---|
| 143 |  | 
|---|
| 144 | void load_tree() { | 
|---|
| 145 | gs_assert(!tree_is_loaded() && !failed_to_load()); | 
|---|
| 146 | troot              = new GroupSearchRoot; | 
|---|
| 147 | TreeNode *rootNode = GBT_read_tree(GB_get_root(gb_tree), get_name(), &*troot); | 
|---|
| 148 | gs_assert(implicated(rootNode, !rootNode->is_normal_group())); // otherwise parent caching will get confused | 
|---|
| 149 | if (!rootNode) { | 
|---|
| 150 | load_error = GB_await_error(); | 
|---|
| 151 | } | 
|---|
| 152 | else { | 
|---|
| 153 | gs_assert(rootNode == troot->get_root_node()); | 
|---|
| 154 | } | 
|---|
| 155 | } | 
|---|
| 156 |  | 
|---|
| 157 | public: | 
|---|
| 158 | SearchedTree(const char *name_, GBDATA *gb_main) : | 
|---|
| 159 | name(name_), | 
|---|
| 160 | gb_tree(GBT_find_tree(gb_main, name_)), | 
|---|
| 161 | inner_nodes(-1) | 
|---|
| 162 | { | 
|---|
| 163 | gs_assert(gb_tree); | 
|---|
| 164 | GBDATA *gb_nnodes     = GB_entry(gb_tree, "nnodes"); | 
|---|
| 165 | if (gb_nnodes) inner_nodes = GB_read_int(gb_nnodes); // see GBT_size_of_tree | 
|---|
| 166 | } | 
|---|
| 167 |  | 
|---|
| 168 | GBDATA *get_tree_data() { return gb_tree; } | 
|---|
| 169 | const char *get_name() const { return name.c_str(); } | 
|---|
| 170 |  | 
|---|
| 171 | int get_leaf_count() const { return inner_nodes+1; } | 
|---|
| 172 | int get_edge_iteration_count() const { return ARB_edge::iteration_count(get_leaf_count()); } | 
|---|
| 173 |  | 
|---|
| 174 | bool tree_is_loaded() const { return troot.isSet(); } | 
|---|
| 175 | bool failed_to_load() const { return !load_error.empty(); } | 
|---|
| 176 | const char *get_load_error() const { | 
|---|
| 177 | gs_assert(failed_to_load()); | 
|---|
| 178 | return load_error.c_str(); | 
|---|
| 179 | } | 
|---|
| 180 | GroupSearchRoot *get_tree_root() { | 
|---|
| 181 | if (!tree_is_loaded()) load_tree(); | 
|---|
| 182 | return failed_to_load() ? NULp : &*troot; | 
|---|
| 183 | } | 
|---|
| 184 | void flush_loaded_tree() { troot.setNull(); } | 
|---|
| 185 | }; | 
|---|
| 186 |  | 
|---|
| 187 | typedef vector<SearchedTree>            SearchedTreeContainer; | 
|---|
| 188 | typedef SearchedTreeContainer::iterator SearchedTreeIter; | 
|---|
| 189 |  | 
|---|
| 190 | const char *FoundGroup::get_name() const { | 
|---|
| 191 | GBDATA *gb_name = GB_search(gb_group, "group_name", GB_STRING); | 
|---|
| 192 | return gb_name ? GB_read_char_pntr(gb_name) : NULp; | 
|---|
| 193 | } | 
|---|
| 194 | int FoundGroup::get_name_length() const { | 
|---|
| 195 | GB_transaction ta(gb_group); | 
|---|
| 196 | GBDATA *gb_name = GB_search(gb_group, "group_name", GB_STRING); | 
|---|
| 197 | return GB_read_string_count(gb_name); | 
|---|
| 198 | } | 
|---|
| 199 |  | 
|---|
| 200 | GBDATA *FoundGroup::get_tree_data() const { | 
|---|
| 201 | return GB_get_father(gb_group); | 
|---|
| 202 | } | 
|---|
| 203 |  | 
|---|
| 204 | const char *FoundGroup::get_tree_name() const { | 
|---|
| 205 | GBDATA *gb_tree = get_tree_data(); | 
|---|
| 206 | return gb_tree ? GB_read_key_pntr(gb_tree) : NULp; | 
|---|
| 207 | } | 
|---|
| 208 |  | 
|---|
| 209 | int FoundGroup::get_tree_order() const { | 
|---|
| 210 | GBDATA *gb_tree = GB_get_father(gb_group); | 
|---|
| 211 | int     order   = -1; | 
|---|
| 212 | if (gb_tree) { | 
|---|
| 213 | GBDATA *gb_order = GB_entry(gb_tree, "order"); | 
|---|
| 214 | if (gb_order) { | 
|---|
| 215 | order = GB_read_int(gb_order); | 
|---|
| 216 | } | 
|---|
| 217 | } | 
|---|
| 218 | return order; | 
|---|
| 219 | } | 
|---|
| 220 |  | 
|---|
| 221 | GB_ERROR FoundGroup::delete_from_DB() { | 
|---|
| 222 | GB_ERROR       error = NULp; | 
|---|
| 223 | GB_transaction ta(gb_group); | 
|---|
| 224 |  | 
|---|
| 225 | GBDATA *gb_gname    = GB_entry(gb_group, "group_name"); | 
|---|
| 226 | gs_assert(gb_gname); // groups shall always have a name | 
|---|
| 227 | if (gb_gname) error = GB_delete(gb_gname); | 
|---|
| 228 |  | 
|---|
| 229 | if (!error) { | 
|---|
| 230 | GBDATA *gb_grouped    = GB_entry(gb_group, "grouped"); | 
|---|
| 231 | if (gb_grouped) error = GB_delete(gb_grouped); | 
|---|
| 232 | } | 
|---|
| 233 |  | 
|---|
| 234 | if (!error) { | 
|---|
| 235 | bool    keep_node = false; | 
|---|
| 236 | GBQUARK qid       = GB_find_existing_quark(gb_group, "id"); | 
|---|
| 237 | for (GBDATA *gb_child = GB_child(gb_group); gb_child && !keep_node; gb_child = GB_nextChild(gb_child)) { | 
|---|
| 238 | if (GB_get_quark(gb_child) != qid) { | 
|---|
| 239 | keep_node = true; | 
|---|
| 240 | } | 
|---|
| 241 | } | 
|---|
| 242 | if (!keep_node) { // no child beside "id" left -> delete node | 
|---|
| 243 | error = GB_delete(gb_group.pointer_ref()); | 
|---|
| 244 | } | 
|---|
| 245 | } | 
|---|
| 246 |  | 
|---|
| 247 | return error; | 
|---|
| 248 | } | 
|---|
| 249 |  | 
|---|
| 250 | ARB_ERROR FoundGroup::rename_by_ACI(const char *acisrt, const QueriedGroups& results, int hit_idx) { | 
|---|
| 251 | ARB_ERROR      error; | 
|---|
| 252 | GB_transaction ta(gb_group); | 
|---|
| 253 |  | 
|---|
| 254 | GBDATA *gb_gname = GB_entry(gb_group, "group_name"); | 
|---|
| 255 | if (!gb_gname) { | 
|---|
| 256 | gs_assert(0); // groups shall always have a name | 
|---|
| 257 | error = "FATAL: unnamed group detected"; | 
|---|
| 258 | } | 
|---|
| 259 | else { | 
|---|
| 260 | char *old_name = GB_read_string(gb_gname); | 
|---|
| 261 | char *new_name = GS_calc_resulting_groupname(gb_group, results, hit_idx, old_name, acisrt, error); | 
|---|
| 262 |  | 
|---|
| 263 | if (!error && new_name[0]) { // if ACI produces empty result -> skip rename | 
|---|
| 264 | error = GBT_write_group_name(gb_gname, new_name, true); | 
|---|
| 265 | } | 
|---|
| 266 |  | 
|---|
| 267 | free(new_name); | 
|---|
| 268 | free(old_name); | 
|---|
| 269 | } | 
|---|
| 270 |  | 
|---|
| 271 | return error; | 
|---|
| 272 | } | 
|---|
| 273 |  | 
|---|
| 274 | inline bool group_is_folded(GBDATA *gb_group) { | 
|---|
| 275 | if (!gb_group) return false; | 
|---|
| 276 | GBDATA *gb_grouped = GB_entry(gb_group, "grouped"); | 
|---|
| 277 | return gb_grouped && GB_read_byte(gb_grouped) != 0; | 
|---|
| 278 | } | 
|---|
| 279 | inline ARB_ERROR group_set_folded(GBDATA *gb_group, bool folded) { | 
|---|
| 280 | gs_assert(gb_group); | 
|---|
| 281 |  | 
|---|
| 282 | ARB_ERROR  error; | 
|---|
| 283 | GBDATA    *gb_grouped = GB_entry(gb_group, "grouped"); | 
|---|
| 284 |  | 
|---|
| 285 | if (!gb_grouped && folded) { | 
|---|
| 286 | gb_grouped = GB_create(gb_group, "grouped", GB_BYTE); | 
|---|
| 287 | if (!gb_grouped) error = GB_await_error(); | 
|---|
| 288 | } | 
|---|
| 289 | if (gb_grouped) { | 
|---|
| 290 | gs_assert(!error); | 
|---|
| 291 | error = GB_write_byte(gb_grouped, folded); | 
|---|
| 292 | } | 
|---|
| 293 | #if defined(ASSERTION_USED) | 
|---|
| 294 | else gs_assert(!folded); | 
|---|
| 295 | #endif | 
|---|
| 296 | return error; | 
|---|
| 297 | } | 
|---|
| 298 |  | 
|---|
| 299 | bool FoundGroup::overlap_is_folded() const { | 
|---|
| 300 | return group_is_folded(get_overlap_group()); | 
|---|
| 301 | } | 
|---|
| 302 | bool FoundGroup::is_folded() const { | 
|---|
| 303 | return group_is_folded(gb_group); | 
|---|
| 304 | } | 
|---|
| 305 |  | 
|---|
| 306 | ARB_ERROR FoundGroup::set_folded(bool folded) { | 
|---|
| 307 | return group_set_folded(gb_group, folded); | 
|---|
| 308 | } | 
|---|
| 309 | ARB_ERROR FoundGroup::set_overlap_folded(bool folded) { | 
|---|
| 310 | return group_set_folded(get_overlap_group(), folded); | 
|---|
| 311 | } | 
|---|
| 312 |  | 
|---|
| 313 | ARB_ERROR FoundGroup::change_folding(GroupFoldingMode mode) { | 
|---|
| 314 | GB_transaction ta(gb_group); | 
|---|
| 315 |  | 
|---|
| 316 | ARB_ERROR error; | 
|---|
| 317 |  | 
|---|
| 318 | bool was_folded         = is_folded(); | 
|---|
| 319 | bool knows_overlap      = knows_details(); // may be false when called by fold_found_groups(); acceptable | 
|---|
| 320 | bool overlap_was_folded = knows_overlap && overlap_is_folded(); | 
|---|
| 321 | bool want_folded        = was_folded; | 
|---|
| 322 |  | 
|---|
| 323 | switch (mode) { | 
|---|
| 324 | case GFM_TOGGLE:   want_folded = !(was_folded || overlap_was_folded); break; | 
|---|
| 325 | case GFM_COLLAPSE: want_folded = true; break; | 
|---|
| 326 | case GFM_EXPAND:   want_folded = false; break; | 
|---|
| 327 | default: error = "invalid collapse mode"; gs_assert(0); break; | 
|---|
| 328 | } | 
|---|
| 329 |  | 
|---|
| 330 | if (!error && want_folded != was_folded) { | 
|---|
| 331 | error = set_folded(want_folded); | 
|---|
| 332 | } | 
|---|
| 333 | if (!error && want_folded != overlap_was_folded && knows_overlap && gb_overlap_group) { | 
|---|
| 334 | error = set_overlap_folded(want_folded); | 
|---|
| 335 | } | 
|---|
| 336 |  | 
|---|
| 337 | return error; | 
|---|
| 338 | } | 
|---|
| 339 |  | 
|---|
| 340 | void ColumnWidths::track(int wName, int wReason, int nesting, int size, int marked, int clusID, double aid, bool keeled) { | 
|---|
| 341 | seen_keeled = seen_keeled || keeled; | 
|---|
| 342 |  | 
|---|
| 343 | // track max. width: | 
|---|
| 344 | name   = std::max(name, wName); | 
|---|
| 345 | reason = std::max(reason, wReason); | 
|---|
| 346 |  | 
|---|
| 347 | // track max. value: | 
|---|
| 348 | max_nesting    = std::max(max_nesting, nesting); | 
|---|
| 349 | max_size       = std::max(max_size, size); | 
|---|
| 350 | max_marked     = std::max(max_marked, marked); | 
|---|
| 351 | max_marked_pc  = std::max(max_marked_pc, percent(marked, size)); | 
|---|
| 352 | max_cluster_id = std::max(max_cluster_id, clusID); | 
|---|
| 353 | max_aid        = std::max(max_aid, int(aid)); | 
|---|
| 354 | } | 
|---|
| 355 | void FoundGroup::track_max_widths(ColumnWidths& widths) const { | 
|---|
| 356 | gs_assert(knows_details()); | 
|---|
| 357 | widths.track(get_name_length(), | 
|---|
| 358 | get_hit_reason().length(), | 
|---|
| 359 | nesting, | 
|---|
| 360 | size, | 
|---|
| 361 | marked, | 
|---|
| 362 | clusterID, | 
|---|
| 363 | aid, | 
|---|
| 364 | keeled); | 
|---|
| 365 | } | 
|---|
| 366 |  | 
|---|
| 367 | // --------------------- | 
|---|
| 368 | //      ParentCache | 
|---|
| 369 |  | 
|---|
| 370 | class ParentCache : virtual Noncopyable { | 
|---|
| 371 | typedef map<GBDATA*,GBDATA*> Cache; | 
|---|
| 372 | Cache cache; | 
|---|
| 373 |  | 
|---|
| 374 | public: | 
|---|
| 375 | void defineParentOf(GBDATA *gb_child_group, GBDATA *gb_parent_group) { | 
|---|
| 376 | // gb_parent_group may be NULp | 
|---|
| 377 | gs_assert(gb_child_group); | 
|---|
| 378 | cache[gb_child_group] = gb_parent_group; | 
|---|
| 379 | } | 
|---|
| 380 | GBDATA *lookupParent(GBDATA *gb_child_group) const { | 
|---|
| 381 | Cache::const_iterator  found  = cache.find(gb_child_group); | 
|---|
| 382 | return found == cache.end() ? NULp : found->second; | 
|---|
| 383 | } | 
|---|
| 384 |  | 
|---|
| 385 | void fix_deleted_groups(const GBDATAset& deleted_groups) { | 
|---|
| 386 | ParentCache translate; // translation table: oldDelParent -> newExistingParent (or NULp at top-level) | 
|---|
| 387 | for (GBDATAset::const_iterator del = deleted_groups.begin(); del != deleted_groups.end(); ++del) { | 
|---|
| 388 | GBDATA *gb_remaining_father = lookupParent(*del); | 
|---|
| 389 | if (gb_remaining_father) { // otherwise 'del' point to sth unkown (see comment in GroupSearchCommon) | 
|---|
| 390 | while (gb_remaining_father) { | 
|---|
| 391 | if (deleted_groups.find(gb_remaining_father) == deleted_groups.end()) { | 
|---|
| 392 | break; // not deleted -> use as replacement | 
|---|
| 393 | } | 
|---|
| 394 | gb_remaining_father = lookupParent(gb_remaining_father); | 
|---|
| 395 | } | 
|---|
| 396 | translate.defineParentOf(*del, gb_remaining_father); | 
|---|
| 397 | } | 
|---|
| 398 | } | 
|---|
| 399 |  | 
|---|
| 400 | // erase deleted nodes from cache | 
|---|
| 401 | for (GBDATAset::const_iterator del = deleted_groups.begin(); del != deleted_groups.end(); ++del) { | 
|---|
| 402 | cache.erase(*del); | 
|---|
| 403 | } | 
|---|
| 404 |  | 
|---|
| 405 | // translate remaining entries | 
|---|
| 406 | for (Cache::iterator c = cache.begin(); c != cache.end(); ++c) { | 
|---|
| 407 | GBDATA *gb_child  = c->first; | 
|---|
| 408 | GBDATA *gb_parent = c->second; | 
|---|
| 409 | if (deleted_groups.find(gb_parent) != deleted_groups.end()) { | 
|---|
| 410 | defineParentOf(gb_child, translate.lookupParent(gb_parent)); | 
|---|
| 411 | } | 
|---|
| 412 | } | 
|---|
| 413 | } | 
|---|
| 414 | }; | 
|---|
| 415 |  | 
|---|
| 416 | // --------------------------- | 
|---|
| 417 | //      GroupSearchCommon | 
|---|
| 418 |  | 
|---|
| 419 | #define TRIGGER_UPDATE_GROUP_RESULTS "/tmp/trigger/group_result_update" | 
|---|
| 420 |  | 
|---|
| 421 | class GroupSearchCommon : virtual Noncopyable { | 
|---|
| 422 | // controls and maintains validity of existing group-search-results | 
|---|
| 423 |  | 
|---|
| 424 | typedef set<GroupSearch*> GroupSearchSet; | 
|---|
| 425 |  | 
|---|
| 426 | GroupSearchSet searches; // all existing searches (normally only one) | 
|---|
| 427 |  | 
|---|
| 428 | bool    cbs_installed; | 
|---|
| 429 | GBDATA *gb_trigger; // TRIGGER_UPDATE_GROUP_RESULTS (triggers ONCE for multiple DB changes) | 
|---|
| 430 |  | 
|---|
| 431 | // The following two sets may also contain "node" entries from | 
|---|
| 432 | // completely different parts of the DB -> do not make assumptions! | 
|---|
| 433 | GBDATAset deleted_groups;  // entries are "deleted", i.e. access is invalid! Only comparing pointers is defined! | 
|---|
| 434 | GBDATAset modified_groups; | 
|---|
| 435 |  | 
|---|
| 436 | ParentCache pcache; | 
|---|
| 437 |  | 
|---|
| 438 | void add_callbacks(GBDATA *gb_main); | 
|---|
| 439 | void remove_callbacks(GBDATA *gb_main); | 
|---|
| 440 |  | 
|---|
| 441 | void trigger_group_search_update() { GB_touch(gb_trigger); } | 
|---|
| 442 |  | 
|---|
| 443 | public: | 
|---|
| 444 | GroupSearchCommon() : | 
|---|
| 445 | cbs_installed(false), | 
|---|
| 446 | gb_trigger(NULp) | 
|---|
| 447 | {} | 
|---|
| 448 | ~GroupSearchCommon() { | 
|---|
| 449 | gs_assert(!cbs_installed); | 
|---|
| 450 | } | 
|---|
| 451 |  | 
|---|
| 452 | ParentCache& get_parent_cache() { return pcache; } | 
|---|
| 453 |  | 
|---|
| 454 | void notify_deleted(GBDATA *gb_node)  { deleted_groups.insert(gb_node);  trigger_group_search_update(); } | 
|---|
| 455 | void notify_modified(GBDATA *gb_node) { modified_groups.insert(gb_node); trigger_group_search_update(); } | 
|---|
| 456 |  | 
|---|
| 457 | bool has_been_deleted(GBDATA *gb_node) { return deleted_groups.find(gb_node) != deleted_groups.end(); } | 
|---|
| 458 | bool has_been_modified(GBDATA *gb_node) { return modified_groups.find(gb_node) != modified_groups.end(); } | 
|---|
| 459 |  | 
|---|
| 460 | void add(GroupSearch *gs) { | 
|---|
| 461 | if (empty()) { | 
|---|
| 462 | GBDATA *gb_main = gs->get_gb_main(); | 
|---|
| 463 | add_callbacks(gb_main); | 
|---|
| 464 | } | 
|---|
| 465 | searches.insert(gs); | 
|---|
| 466 | } | 
|---|
| 467 | void remove(GroupSearch *gs) { | 
|---|
| 468 | searches.erase(gs); | 
|---|
| 469 | if (empty()) { | 
|---|
| 470 | GBDATA *gb_main = gs->get_gb_main(); | 
|---|
| 471 | remove_callbacks(gb_main); | 
|---|
| 472 | } | 
|---|
| 473 | } | 
|---|
| 474 | bool empty() const { return searches.empty(); } | 
|---|
| 475 |  | 
|---|
| 476 | void clear_notifications() { | 
|---|
| 477 | deleted_groups.clear(); | 
|---|
| 478 | modified_groups.clear(); | 
|---|
| 479 | } | 
|---|
| 480 | bool has_notifications() { | 
|---|
| 481 | return !(deleted_groups.empty() && modified_groups.empty()); | 
|---|
| 482 | } | 
|---|
| 483 |  | 
|---|
| 484 | void refresh_all_results() { | 
|---|
| 485 | if (has_notifications()) { | 
|---|
| 486 | pcache.fix_deleted_groups(deleted_groups); | 
|---|
| 487 | for (GroupSearchSet::iterator gs = searches.begin(); gs != searches.end(); ++gs) { | 
|---|
| 488 | GroupSearch *gr_search = *gs; | 
|---|
| 489 | gr_search->refresh_results_after_DBchanges(); | 
|---|
| 490 | } | 
|---|
| 491 | clear_notifications(); | 
|---|
| 492 | } | 
|---|
| 493 | } | 
|---|
| 494 | }; | 
|---|
| 495 |  | 
|---|
| 496 | static void tree_node_deleted_cb(GBDATA *gb_node, GroupSearchCommon *common, GB_CB_TYPE cbtype) { | 
|---|
| 497 | bool mark_as_deleted = cbtype == GB_CB_DELETE; | 
|---|
| 498 |  | 
|---|
| 499 | if (!mark_as_deleted) { | 
|---|
| 500 | if (!GB_entry(gb_node, "group_name")) { // if group_name disappeared | 
|---|
| 501 | mark_as_deleted = true; | 
|---|
| 502 | } | 
|---|
| 503 | } | 
|---|
| 504 |  | 
|---|
| 505 | if (mark_as_deleted) { | 
|---|
| 506 | common->notify_deleted(gb_node); | 
|---|
| 507 | } | 
|---|
| 508 | else { | 
|---|
| 509 | common->notify_modified(gb_node); | 
|---|
| 510 | } | 
|---|
| 511 | } | 
|---|
| 512 | static void group_name_changed_cb(GBDATA *gb_group_name, GroupSearchCommon *common) { | 
|---|
| 513 | GBDATA *gb_node = GB_get_father(gb_group_name); | 
|---|
| 514 | if (gb_node) { | 
|---|
| 515 | common->notify_modified(gb_node); | 
|---|
| 516 | } | 
|---|
| 517 | } | 
|---|
| 518 | static void result_update_cb(GBDATA*, GroupSearchCommon *common) { | 
|---|
| 519 | // is called once after DB changes that might affect validity of group-search-results | 
|---|
| 520 | common->refresh_all_results(); | 
|---|
| 521 | } | 
|---|
| 522 |  | 
|---|
| 523 | void GroupSearchCommon::add_callbacks(GBDATA *gb_main) { | 
|---|
| 524 | gs_assert(!cbs_installed); | 
|---|
| 525 |  | 
|---|
| 526 | GB_transaction ta(gb_main); | 
|---|
| 527 | gb_trigger = GB_search(gb_main, TRIGGER_UPDATE_GROUP_RESULTS, GB_INT); | 
|---|
| 528 |  | 
|---|
| 529 | GB_ERROR error       = GB_add_hierarchy_callback(gb_main, "node",            GB_CB_CHANGED_OR_DELETED, makeDatabaseCallback(tree_node_deleted_cb, this)); | 
|---|
| 530 | if (!error)    error = GB_add_hierarchy_callback(gb_main, "node/group_name", GB_CB_CHANGED,            makeDatabaseCallback(group_name_changed_cb, this)); | 
|---|
| 531 | if (!error)    error = GB_add_callback(gb_trigger, GB_CB_CHANGED, makeDatabaseCallback(result_update_cb, this)); | 
|---|
| 532 |  | 
|---|
| 533 | if (error) GBT_message(gb_main, GBS_global_string("Failed to bind callback (Reason: %s)", error)); | 
|---|
| 534 | else cbs_installed = true; | 
|---|
| 535 | } | 
|---|
| 536 |  | 
|---|
| 537 | void GroupSearchCommon::remove_callbacks(GBDATA *gb_main) { | 
|---|
| 538 | if (cbs_installed) { | 
|---|
| 539 | GB_transaction ta(gb_main); | 
|---|
| 540 | GB_ERROR       error = GB_remove_hierarchy_callback(gb_main, "node",            GB_CB_CHANGED_OR_DELETED, makeDatabaseCallback(tree_node_deleted_cb, this)); | 
|---|
| 541 | if (!error)    error = GB_remove_hierarchy_callback(gb_main, "node/group_name", GB_CB_CHANGED,            makeDatabaseCallback(group_name_changed_cb, this)); | 
|---|
| 542 | GB_remove_callback(gb_trigger, GB_CB_CHANGED, makeDatabaseCallback(result_update_cb, this)); | 
|---|
| 543 |  | 
|---|
| 544 | if (error) GBT_message(gb_main, GBS_global_string("Failed to remove callback (Reason: %s)", error)); | 
|---|
| 545 | else cbs_installed = false; | 
|---|
| 546 | } | 
|---|
| 547 | } | 
|---|
| 548 |  | 
|---|
| 549 | // --------------------- | 
|---|
| 550 | //      GroupSearch | 
|---|
| 551 |  | 
|---|
| 552 | GroupSearchCommon *GroupSearch::common = NULp; | 
|---|
| 553 |  | 
|---|
| 554 | GroupSearch::GroupSearch(GBDATA *gb_main_, const GroupSearchCallback& redisplay_results_cb) : | 
|---|
| 555 | gb_main(gb_main_), | 
|---|
| 556 | redisplay_cb(redisplay_results_cb), | 
|---|
| 557 | sortedByOrder(false) | 
|---|
| 558 | { | 
|---|
| 559 | if (!common) common = new GroupSearchCommon; | 
|---|
| 560 | common->add(this); | 
|---|
| 561 | } | 
|---|
| 562 |  | 
|---|
| 563 | GroupSearch::~GroupSearch() { | 
|---|
| 564 | common->remove(this); | 
|---|
| 565 | if (common->empty()) { | 
|---|
| 566 | delete common; | 
|---|
| 567 | common = NULp; | 
|---|
| 568 | } | 
|---|
| 569 | } | 
|---|
| 570 |  | 
|---|
| 571 | static void collect_searched_trees(GBDATA *gb_main, const TreeNameSet& trees_to_search, SearchedTreeContainer& searched_tree) { | 
|---|
| 572 | ConstStrArray tree_names; | 
|---|
| 573 | GBT_get_tree_names(tree_names, gb_main, false); | 
|---|
| 574 |  | 
|---|
| 575 | { | 
|---|
| 576 | bool search_all = trees_to_search.empty(); | 
|---|
| 577 | for (int t = 0; tree_names[t]; ++t) { | 
|---|
| 578 | if (search_all || trees_to_search.find(tree_names[t]) != trees_to_search.end()) { | 
|---|
| 579 | searched_tree.push_back(SearchedTree(tree_names[t], gb_main)); | 
|---|
| 580 | } | 
|---|
| 581 | } | 
|---|
| 582 | } | 
|---|
| 583 | } | 
|---|
| 584 |  | 
|---|
| 585 | class Candidate : public FoundGroup { | 
|---|
| 586 | // candidate for a search result | 
|---|
| 587 | // - able to retrieve values (have tree to examine) | 
|---|
| 588 | RefPtr<GroupSearchTree> node; | 
|---|
| 589 |  | 
|---|
| 590 | public: | 
|---|
| 591 | Candidate(const FoundGroup& group_, GroupSearchTree *node_) : | 
|---|
| 592 | FoundGroup(group_), | 
|---|
| 593 | node(node_) | 
|---|
| 594 | {} | 
|---|
| 595 | Candidate(GBDATA *gb_group_, GroupSearchTree *node_) : | 
|---|
| 596 | FoundGroup(gb_group_), | 
|---|
| 597 | node(node_) | 
|---|
| 598 | {} | 
|---|
| 599 |  | 
|---|
| 600 | FoundGroup& get_group() { return *this; } | 
|---|
| 601 | const FoundGroup& get_group() const { return *this; } | 
|---|
| 602 |  | 
|---|
| 603 | GroupSearchTree *get_clade() { // return node where clade is shown (differs from get_node for keeled groups) | 
|---|
| 604 | TreeNode *keeld = node->keelTarget(); | 
|---|
| 605 | return keeld ? DOWNCAST(GroupSearchTree*, keeld) : &*node; | 
|---|
| 606 | } | 
|---|
| 607 | const GroupSearchTree *get_clade() const { | 
|---|
| 608 | return const_cast<Candidate*>(this)->get_clade(); | 
|---|
| 609 | } | 
|---|
| 610 |  | 
|---|
| 611 | int get_keeledStateInfo() const { return node->keeledStateInfo(); } | 
|---|
| 612 |  | 
|---|
| 613 | void inform_group(const GroupSearch& group_search, const string& hitReason) { | 
|---|
| 614 | // retrieve/store all information needed later (e.g. for sorting): | 
|---|
| 615 | hit_reason = hitReason; | 
|---|
| 616 |  | 
|---|
| 617 | GroupSearchTree *clade = get_clade(); | 
|---|
| 618 |  | 
|---|
| 619 | if (nesting.needs_eval()) nesting = group_search.calc_nesting_level(get_pointer()); | 
|---|
| 620 | if (size.needs_eval())    size    = clade->get_leaf_count(); | 
|---|
| 621 | if (marked.needs_eval())  marked  = clade->get_marked_count(); | 
|---|
| 622 | if (aid.needs_eval())     aid     = clade->get_average_ingroup_distance(); | 
|---|
| 623 |  | 
|---|
| 624 | if (keeled.needs_eval())  { | 
|---|
| 625 | keeled = get_keeledStateInfo(); | 
|---|
| 626 |  | 
|---|
| 627 | // set info needed for clade-overlap | 
|---|
| 628 | if (keeled) { | 
|---|
| 629 | if (!clade->is_leaf() && clade->is_normal_group()) { // got overlap | 
|---|
| 630 | gb_overlap_group = clade->gb_node; | 
|---|
| 631 | gs_assert(gb_overlap_group); | 
|---|
| 632 | } | 
|---|
| 633 | } | 
|---|
| 634 | else { | 
|---|
| 635 | if (node->is_keeled_group()) { // got overlap | 
|---|
| 636 | gb_overlap_group = node->father->gb_node; | 
|---|
| 637 | gs_assert(gb_overlap_group); | 
|---|
| 638 | } | 
|---|
| 639 | } | 
|---|
| 640 |  | 
|---|
| 641 | } | 
|---|
| 642 |  | 
|---|
| 643 | gs_assert(knows_details()); | 
|---|
| 644 | } | 
|---|
| 645 | }; | 
|---|
| 646 |  | 
|---|
| 647 | class TargetGroup: public QueryTarget, virtual Noncopyable { | 
|---|
| 648 | // wrapper to use Candidate as QueryTarget | 
|---|
| 649 | SmartPtr<Candidate> cand; | 
|---|
| 650 |  | 
|---|
| 651 | public: | 
|---|
| 652 | TargetGroup(GBDATA *gb_main_, const char *treename_) : | 
|---|
| 653 | QueryTarget(gb_main_, treename_) | 
|---|
| 654 | {} | 
|---|
| 655 | ~TargetGroup() OVERRIDE {} | 
|---|
| 656 |  | 
|---|
| 657 | void aimTo(const Candidate& c) { cand = new Candidate(c); } | 
|---|
| 658 | void unAim() { cand.setNull(); } | 
|---|
| 659 |  | 
|---|
| 660 | const FoundGroup& get_group() const { gs_assert(cand.isSet()); return cand->get_group(); } | 
|---|
| 661 | const GroupSearchTree *get_clade() const { gs_assert(cand.isSet() && cand->get_clade()); return cand->get_clade(); } | 
|---|
| 662 |  | 
|---|
| 663 | const char *get_group_name() const { return get_group().get_name(); } | 
|---|
| 664 | unsigned get_group_size() const { return get_clade()->get_leaf_count(); } | 
|---|
| 665 | unsigned get_marked_count() const { return get_clade()->get_marked_count(); } | 
|---|
| 666 | unsigned get_zombie_count() const { return get_clade()->get_zombie_count(); } | 
|---|
| 667 | double get_average_ingroup_distance() const { return get_clade()->get_average_ingroup_distance(); } | 
|---|
| 668 | int get_keeledStateInfo() const { gs_assert(cand.isSet()); return cand->get_keeledStateInfo(); } | 
|---|
| 669 |  | 
|---|
| 670 | // virtual QueryTarget interface: | 
|---|
| 671 | GBDATA *get_ACI_item() const { return get_group().get_pointer(); } | 
|---|
| 672 | }; | 
|---|
| 673 |  | 
|---|
| 674 | typedef list<Candidate> CandidateList; | 
|---|
| 675 |  | 
|---|
#if defined(ASSERTION_USED)
inline bool isCorrectParent(TreeNode *node, GBDATA *gb_group, GBDATA *gb_parent_group) {
    /*! check correctness of parent (caching)
     * @param node            the TreeNode where clade is shown in tree
     * @param gb_group        the group data related to node (at node for normal groups; at parent-node for keeled groups)
     * @param gb_parent_group the parent group data (may be NULp)
     * @return true if gb_parent_group is the correct parent
     */

    gs_assert(node && gb_group);

    TreeNode *pnode = node->find_parent_with_groupInfo(true);
    if (!pnode) { // no parent with group info -> no parent group expected
        gs_assert(node->gb_node == gb_group);
        return gb_parent_group == NULp;
    }

    if (node->gb_node == gb_group) { // = node is not keeled
        gs_assert(node->is_normal_group());
        return pnode->gb_node == gb_parent_group;
    }

    gs_assert(node->is_keeled_group());     // node is keeled
    gs_assert(pnode->keelTarget() == node); // pnode is node storing that keeled node
    gs_assert(pnode->gb_node == gb_group);  // groupdata is attached at pnode

    TreeNode *ppnode = pnode->find_parent_with_groupInfo(true); // continue with next parent
    if (!ppnode) {
        return gb_parent_group == NULp;
    }
    return ppnode->gb_node == gb_parent_group;
}
#endif
|---|
| 712 |  | 
|---|
| 713 | double GroupSearchTree::weighted_branchlength_sum(int group_size) const { | 
|---|
| 714 | int    leafs = get_leaf_count(); | 
|---|
| 715 | double sum   = father ? get_branchlength() * leafs * (group_size-leafs) : 0.0; | 
|---|
| 716 |  | 
|---|
| 717 | if (!is_leaf()) { | 
|---|
| 718 | sum += get_leftson()->weighted_branchlength_sum(group_size); | 
|---|
| 719 | sum += get_rightson()->weighted_branchlength_sum(group_size); | 
|---|
| 720 | } | 
|---|
| 721 |  | 
|---|
| 722 | return sum; | 
|---|
| 723 | } | 
|---|
| 724 |  | 
|---|
| 725 | void GroupSearchTree::calc_average_ingroup_distance(int group_size) const { | 
|---|
| 726 | long pairs = long(group_size)*(group_size-1)/2; // warning: int-overflow with SSURef_NR99_128_SILVA_07_09_16_opt.arb | 
|---|
| 727 |  | 
|---|
| 728 | if (pairs) { | 
|---|
| 729 | double wbranchsum = weighted_branchlength_sum(group_size); | 
|---|
| 730 | aid               = wbranchsum / pairs; | 
|---|
| 731 |  | 
|---|
| 732 | gs_assert(aid>=0); | 
|---|
| 733 | } | 
|---|
| 734 | else { | 
|---|
| 735 | aid = 0; | 
|---|
| 736 | } | 
|---|
| 737 | } | 
|---|
| 738 |  | 
|---|
/*! searches all trees listed in 'trees_to_search' for groups and updates the result list 'found'
 *
 * @param mode bitmask; the following flags are evaluated here:
 *             - GSM_FORGET_EXISTING: forget results of last search before searching
 *             - GSM_ADD:             test groups which are NOT yet listed (matching groups get added).
 *                                    Without that flag only listed groups are tested
 *                                    (groups which no longer match get removed).
 *             - GSM_MISMATCH:        negate the query expression before testing
 *
 * If no query expression was defined, a default expression (name matches "*") is used.
 * Any error (e.g. reported by the progress counter or by clusterDuplicates()) is
 * reported via GBT_message() and clears the result list.
 */
void GroupSearch::perform_search(GroupSearchMode mode) {
    typedef set< RefPtr<GBDATA> > ExistingHits;

    // remember the groups listed by previous search (unless results shall be forgotten)
    ExistingHits existing_hits;
    if (mode & GSM_FORGET_EXISTING) forget_results(); // from last search
    else {
        for (FoundGroupCIter prev = found->begin(); prev != found->end(); ++prev) {
            existing_hits.insert(prev->get_pointer());
        }
    }

    // true -> test unlisted groups (and add hits); false -> test listed groups (and keep only hits)
    bool match_unlisted = mode&GSM_ADD;

    if (query_expr.isNull()) addQueryExpression(CO_OR, CT_NAME, CM_MATCH, "*"); // default

    if (mode&GSM_MISMATCH) {
        // NOTE(review): this modifies the stored query expression; whether it gets reverted elsewhere is not visible here
        query_expr->negate();
    }

    GB_ERROR error = NULp;
    {
        GB_transaction        ta(gb_main);
        SearchedTreeContainer searched_tree;

        GroupSearchTree::set_species_data(GBT_get_species_data(gb_main));

        collect_searched_trees(gb_main, trees_to_search, searched_tree);

        // calc overall iteration count (for progress)
        long overall_iter_count = 0;
        for (SearchedTreeIter st = searched_tree.begin(); st != searched_tree.end(); ++st) { // LOOP_VECTORIZED[!<6.0]
            overall_iter_count += st->get_edge_iteration_count();
        }

        // iterate over all trees
        arb_progress progress("Searching groups", overall_iter_count);

        bool load_failures = false;
        for (SearchedTreeIter st = searched_tree.begin(); !error && st != searched_tree.end(); ++st) {
            GroupSearchRoot *troot = st->get_tree_root();

            TargetGroup target_group(gb_main, st->get_name());

            if (!troot) {
                // tree could not be provided -> report, skip its share of the progress and continue with next tree
                GBT_message(gb_main, GBS_global_string("Tree skipped: %s", st->get_load_error()));
                progress.inc_by(st->get_edge_iteration_count());
                load_failures = true;
            }
            else {
                CandidateList candidate;
                {
                    // search candidate groups (and populate parent-group cache on-the-fly)

                    GBDATA       *gb_parent_group = NULp; // last traversed parent group
                    ParentCache&  pcache          = common->get_parent_cache();
                    ARB_edge      start           = rootEdge(troot);
                    ARB_edge      e               = start;

                    // walk along the edges of the tree until the start edge is reached again (or an error occurs)
                    do {
                        switch (e.get_type()) {
                            case ROOT_EDGE:
                                gb_parent_group = NULp;
                                // fall-through
                            case EDGE_TO_LEAF: { // descent (store parents; perform match)
                                TreeNode *node = e.dest();
                                // [Note: order of if-tests is important, when keeled and normal group fall to same location]
                                if (node->is_keeled_group()) {
                                    TreeNode *parent = e.source();
                                    gs_assert(parent == node->get_father());

                                    // group data of a keeled group is taken from the father node
                                    GBDATA *gb_group = parent->gb_node;
                                    pcache.defineParentOf(gb_group, gb_parent_group);
                                    gs_assert(isCorrectParent(node, gb_group, gb_parent_group));
                                    gb_parent_group = gb_group;
                                }
                                if (!node->is_leaf() && node->has_group_info()) {
                                    GBDATA *gb_group = node->gb_node;

                                    if (node->is_normal_group()) {
                                        pcache.defineParentOf(gb_group, gb_parent_group);
                                        gs_assert(isCorrectParent(node, gb_group, gb_parent_group));
                                        gb_parent_group = gb_group;
                                    }

                                    ExistingHits::iterator prev_hit = existing_hits.find(gb_group);

                                    // only groups with the wanted listing state become candidates
                                    bool was_listed = prev_hit != existing_hits.end();
                                    bool test_match = !was_listed == match_unlisted;

                                    if (test_match) { // store candidates
                                        candidate.push_back(Candidate(gb_group, DOWNCAST(GroupSearchTree*, node)));
                                    }
                                }
                                break;
                            }
                            case EDGE_TO_ROOT: { // ascent (restore parents)
                                TreeNode *node = e.source();
                                // [Note: order of if-tests is important, when keeled and normal group fall to same location]
                                if (!node->is_leaf() && node->is_normal_group()) {
                                    GBDATA *gb_group = node->gb_node;
                                    gb_parent_group  = pcache.lookupParent(gb_group); // restore parent group
                                    gs_assert(isCorrectParent(node, gb_group, gb_parent_group));
                                }
                                if (node->is_keeled_group()) {
                                    TreeNode *parent = e.dest();
                                    gs_assert(parent == node->get_father());

                                    GBDATA *gb_group = parent->gb_node;
                                    gb_parent_group  = pcache.lookupParent(gb_group); // restore parent group
                                    gs_assert(isCorrectParent(node, gb_group, gb_parent_group));
                                }
                                break;
                            }
                        }

                        error = progress.inc_and_error_if_aborted();
                        e     = e.next();
                    }
                    while (e != start && !error);
                }

                // now run queries for all candidates:
                bool was_listed = !match_unlisted; // all candidates share the same listing state (see 'test_match' above)
                for (CandidateList::iterator cand = candidate.begin(); !error && cand != candidate.end(); ++cand) {
                    target_group.aimTo(*cand);

                    string hit_reason;
                    if (query_expr->matches(target_group, hit_reason)) {
                        if (!was_listed) {
                            found->add_candidate(*this, *cand, hit_reason);
                        }
                    }
                    else {
                        if (was_listed) {
                            // listed group does no longer match -> drop it from 'existing_hits' (evaluated below)
                            ExistingHits::iterator prev_hit = existing_hits.find(cand->get_group().get_pointer());
                            gs_assert(prev_hit != existing_hits.end()); // internal logic error
                            existing_hits.erase(prev_hit);
                        }
                    }
                }
                target_group.unAim();
                st->flush_loaded_tree();
            }
        }

        if (load_failures) {
            // remove failed trees from 'searched_tree'
            SearchedTreeContainer reduced;
            for (unsigned t = 0; t<searched_tree.size(); ++t) {
                if (!searched_tree[t].failed_to_load()) {
                    reduced.push_back(searched_tree[t]);
                }
            }
            int failed_trees = searched_tree.size()-reduced.size();
            GBT_message(gb_main, GBS_global_string("%i tree(s) failed to load (will operate on rest)", failed_trees));
            swap(reduced, searched_tree);
        }

        if (!match_unlisted && !error) { // keep only hits still listed in existing_hits
            QueriedGroups *kept = new QueriedGroups;

            for (FoundGroupCIter prev = found->begin(); prev != found->end(); ++prev) {
                if (existing_hits.find(prev->get_pointer()) != existing_hits.end()) {
                    kept->add_informed_group(*prev);
                }
            }
            found = kept;
        }
    }

    if (dups.isSet() && !error) {
        // if elements were kept from last search, they have an outdated clusterID -> reset
        for (FoundGroupIter g = found->begin(); g != found->end(); ++g) g->forget_cluster_id();

        error = clusterDuplicates();
    }

    if (error) {
        GBT_message(gb_main, error);
        found = new QueriedGroups; // clear results
    }

    sortedByOrder = false;
}
|---|
| 923 |  | 
|---|
| 924 | // ----------------------------------------- | 
|---|
| 925 | //      code for dupe-cluster detection | 
|---|
| 926 |  | 
|---|
| 927 | inline bool contains(const WordSet& ws, const string& w) { return ws.find(w) != ws.end(); } | 
|---|
| 928 | inline bool contains(const WordSet& ws, const char *w) { string W(w); return contains(ws, W); } | 
|---|
| 929 |  | 
|---|
| 930 | static void string2WordSet(const char *name, WordSet& words, const char *wordSeparators, const WordSet& ignored_words) { | 
|---|
| 931 | char *namedup = strdup(name); | 
|---|
| 932 |  | 
|---|
| 933 | gs_assert(wordSeparators); | 
|---|
| 934 |  | 
|---|
| 935 | ConstStrArray w; | 
|---|
| 936 | GBT_splitNdestroy_string(w, namedup, wordSeparators, SPLIT_DROPEMPTY); | 
|---|
| 937 | for (int i = 0; w[i]; ++i) { | 
|---|
| 938 | if (!contains(ignored_words, w[i])) words.insert(w[i]); | 
|---|
| 939 | } | 
|---|
| 940 | } | 
|---|
inline void string_to_lower(std::string& s) {
    /*! converts all characters of 's' to lowercase (in place)
     * @param s the string to convert (may be empty)
     *
     * Each character is passed to tolower() as 'unsigned char':
     * calling tolower() with a negative value other than EOF (which happens for
     * non-ASCII bytes where plain 'char' is signed) is undefined behavior.
     */
    for (std::string::iterator c = s.begin(); c != s.end(); ++c) {
        *c = static_cast<char>(tolower(static_cast<unsigned char>(*c)));
    }
}
|---|
| 946 |  | 
|---|
| 947 | struct GroupInfo {       // helper class for Clusterer::calc_matches | 
|---|
| 948 | string            name; // groupname (lowercase if constructed with sens==GB_IGNORE_CASE) | 
|---|
| 949 | RefPtr<GBDATA>    tree; | 
|---|
| 950 | SmartPtr<WordSet> words; // single words (if groupname consists of multiple words and 'prep_wordwise' was true) | 
|---|
| 951 |  | 
|---|
| 952 | GroupInfo(const FoundGroup& g, bool prep_wordwise, GB_CASE sens, const char *wordSeparators, const WordSet& ignored_words) : | 
|---|
| 953 | name(g.get_name()), | 
|---|
| 954 | tree(g.get_tree_data()) | 
|---|
| 955 | { | 
|---|
| 956 | if (sens == GB_IGNORE_CASE) string_to_lower(name); | 
|---|
| 957 |  | 
|---|
| 958 | if (prep_wordwise) { | 
|---|
| 959 | words = new WordSet; | 
|---|
| 960 | string2WordSet(name.c_str(), *words, wordSeparators, ignored_words); | 
|---|
| 961 | } | 
|---|
| 962 | } | 
|---|
| 963 |  | 
|---|
| 964 | size_t get_word_count() const { | 
|---|
| 965 | // may return zero (if group name only contains ignored words!) | 
|---|
| 966 | return words.isNull() ? 1 : words->size(); | 
|---|
| 967 | } | 
|---|
| 968 | }; | 
|---|
| 969 | typedef vector<GroupInfo> GroupInfoVec; | 
|---|
| 970 |  | 
|---|
| 971 | class DupNameCriterion { | 
|---|
| 972 | DupNameCriterionType type; | 
|---|
| 973 | GB_CASE              sens; | 
|---|
| 974 |  | 
|---|
| 975 | int     min_words;     // only used by DNC_WORDWISE | 
|---|
| 976 | WordSet ignored_words; // only used by DNC_WORDWISE | 
|---|
| 977 |  | 
|---|
| 978 | string wordSeparators; | 
|---|
| 979 |  | 
|---|
| 980 | public: | 
|---|
| 981 | explicit DupNameCriterion(DupNameCriterionType exact, GB_CASE sens_) : | 
|---|
| 982 | type(exact), | 
|---|
| 983 | sens(sens_), | 
|---|
| 984 | min_words(1) | 
|---|
| 985 | { | 
|---|
| 986 | gs_assert(exact == DNC_WHOLENAME); | 
|---|
| 987 | } | 
|---|
| 988 |  | 
|---|
| 989 | DupNameCriterion(DupNameCriterionType wordwise, GB_CASE sens_, int min_words_, const WordSet& ignored_words_, const char *wordSeparators_) : | 
|---|
| 990 | type(wordwise), | 
|---|
| 991 | sens(sens_), | 
|---|
| 992 | min_words(min_words_), | 
|---|
| 993 | wordSeparators(wordSeparators_) | 
|---|
| 994 | { | 
|---|
| 995 | gs_assert(wordwise == DNC_WORDWISE); | 
|---|
| 996 | gs_assert(min_words>0); | 
|---|
| 997 |  | 
|---|
| 998 | for (WordSet::const_iterator wi = ignored_words_.begin(); wi != ignored_words_.end(); ++wi) { | 
|---|
| 999 | string word = *wi; | 
|---|
| 1000 | if (sens == GB_IGNORE_CASE) string_to_lower(word); | 
|---|
| 1001 | ignored_words.insert(word); | 
|---|
| 1002 | } | 
|---|
| 1003 | } | 
|---|
| 1004 |  | 
|---|
| 1005 | DupNameCriterionType get_name_type() const { return type; } | 
|---|
| 1006 | bool wordwise_name_matching() const { return get_name_type() == DNC_WORDWISE; } | 
|---|
| 1007 |  | 
|---|
| 1008 | GB_CASE get_sensitivity() const { return sens; } | 
|---|
| 1009 | const char *get_word_separators() const { return wordSeparators.c_str(); } | 
|---|
| 1010 |  | 
|---|
| 1011 | const WordSet& get_ignored_words() const { return ignored_words; } | 
|---|
| 1012 |  | 
|---|
| 1013 | int get_min_wanted_words() const { return min_words; } | 
|---|
| 1014 | void set_min_wanted_words(int words) { min_words = words; } | 
|---|
| 1015 |  | 
|---|
| 1016 | int name_matches_wordwise(const GroupInfo& gi1, const GroupInfo& gi2) const { | 
|---|
| 1017 | int max_possible_word_matches = min(gi1.get_word_count(), gi2.get_word_count()); | 
|---|
| 1018 | if (max_possible_word_matches<min_words) return false; | 
|---|
| 1019 |  | 
|---|
| 1020 | if (gi1.words.isNull()) { | 
|---|
| 1021 | if (gi2.words.isNull()) { | 
|---|
| 1022 | gs_assert(min_words<=1); | 
|---|
| 1023 | gs_assert(!contains(ignored_words, gi1.name)); | 
|---|
| 1024 | gs_assert(!contains(ignored_words, gi2.name)); | 
|---|
| 1025 | return gi1.name.compare(gi2.name) == 0; | 
|---|
| 1026 | } | 
|---|
| 1027 | return name_matches_wordwise(gi2, gi1); | 
|---|
| 1028 | } | 
|---|
| 1029 |  | 
|---|
| 1030 | if (gi2.words.isNull()) { | 
|---|
| 1031 | gs_assert(min_words<=1); | 
|---|
| 1032 | gs_assert(!contains(ignored_words, gi2.name)); | 
|---|
| 1033 | return contains(*gi1.words, gi2.name); | 
|---|
| 1034 | } | 
|---|
| 1035 |  | 
|---|
| 1036 | int matched_words = 0; | 
|---|
| 1037 | for (WordSet::const_iterator wi = gi1.words->begin(); wi != gi1.words->end(); ++wi) { | 
|---|
| 1038 | if (contains(*gi2.words, *wi)) ++matched_words; | 
|---|
| 1039 | } | 
|---|
| 1040 |  | 
|---|
| 1041 | return matched_words>=min_words ? matched_words : false; | 
|---|
| 1042 | } | 
|---|
| 1043 |  | 
|---|
| 1044 | int name_matches(const GroupInfo& gi1, const GroupInfo& gi2) const { | 
|---|
| 1045 | return type == DNC_WHOLENAME | 
|---|
| 1046 | ? gi1.name.compare(gi2.name) == 0 | 
|---|
| 1047 | : name_matches_wordwise(gi1, gi2); | 
|---|
| 1048 | } | 
|---|
| 1049 | }; | 
|---|
| 1050 |  | 
|---|
| 1051 | typedef set<int>                        GroupClusterSet; | 
|---|
| 1052 | typedef GroupClusterSet::const_iterator GroupClusterCIter; | 
|---|
| 1053 |  | 
|---|
| 1054 | class GroupCluster { | 
|---|
| 1055 | GroupClusterSet members;    // contains indices into Clusterer::groups | 
|---|
| 1056 | int             num_groups; // size of Clusterer::groups | 
|---|
| 1057 |  | 
|---|
| 1058 | mutable vector<uint8_t> lookup; // when non-empty: contains true for members | 
|---|
| 1059 |  | 
|---|
| 1060 | inline bool valid(int i) const { return i >= 0 && i<num_groups; } | 
|---|
| 1061 | inline bool have_lookup() const { return !lookup.empty(); } | 
|---|
| 1062 |  | 
|---|
| 1063 | public: | 
|---|
| 1064 | GroupCluster(int num_of_groups) | 
|---|
| 1065 | : num_groups(num_of_groups) | 
|---|
| 1066 | {} | 
|---|
| 1067 | ~GroupCluster() {} | 
|---|
| 1068 |  | 
|---|
| 1069 | GroupCluster(const GroupCluster& other) : // does NOT copy lookup table | 
|---|
| 1070 | members(other.members), | 
|---|
| 1071 | num_groups(other.num_groups) | 
|---|
| 1072 | {} | 
|---|
| 1073 | DECLARE_ASSIGNMENT_OPERATOR(GroupCluster); | 
|---|
| 1074 |  | 
|---|
| 1075 | void allow_lookup() const { // create lookup table -> allows to run 'contains()' | 
|---|
| 1076 | if (!have_lookup()) { | 
|---|
| 1077 | lookup.resize(num_groups, int(false)); | 
|---|
| 1078 | for (GroupClusterCIter ci = begin(); ci != end(); ++ci) { | 
|---|
| 1079 | lookup[*ci] = true; | 
|---|
| 1080 | } | 
|---|
| 1081 | gs_assert(have_lookup()); | 
|---|
| 1082 | } | 
|---|
| 1083 | } | 
|---|
| 1084 | void forget_lookup() const { lookup.clear(); } | 
|---|
| 1085 |  | 
|---|
| 1086 | void clear() { | 
|---|
| 1087 | if (have_lookup()) { | 
|---|
| 1088 | for (GroupClusterCIter ci = begin(); ci != end(); ++ci) lookup[*ci] = false; | 
|---|
| 1089 | } | 
|---|
| 1090 | members.clear(); | 
|---|
| 1091 | } | 
|---|
| 1092 |  | 
|---|
| 1093 | void insert(int i) { | 
|---|
| 1094 | gs_assert(valid(i)); | 
|---|
| 1095 | members.insert(i); | 
|---|
| 1096 | if (have_lookup()) lookup[i] = true; | 
|---|
| 1097 | } | 
|---|
| 1098 | void erase(int i) { | 
|---|
| 1099 | gs_assert(valid(i)); | 
|---|
| 1100 | members.erase(i); | 
|---|
| 1101 | if (have_lookup()) lookup[i] = false; | 
|---|
| 1102 | } | 
|---|
| 1103 |  | 
|---|
| 1104 | bool contains(int i) const { | 
|---|
| 1105 | gs_assert(valid(i)); | 
|---|
| 1106 | gs_assert(have_lookup()); | 
|---|
| 1107 | return lookup[i]; | 
|---|
| 1108 | } | 
|---|
| 1109 |  | 
|---|
| 1110 | bool empty() const { return members.empty(); } | 
|---|
| 1111 | size_t size() const { return members.size(); } | 
|---|
| 1112 |  | 
|---|
| 1113 | GroupClusterCIter begin() const { return members.begin(); } | 
|---|
| 1114 | GroupClusterCIter end() const { return members.end(); } | 
|---|
| 1115 | }; | 
|---|
| 1116 |  | 
|---|
| 1117 |  | 
|---|
| 1118 | class DupCriteria : public DupNameCriterion { | 
|---|
| 1119 | bool                 listDups; // true->list duplicate groups; false->list "unique" groups (non-duplicate groups) | 
|---|
| 1120 | DupTreeCriterionType ttype; | 
|---|
| 1121 | int                  minSize;  // minimum cluster size (for DLC_DIFF_TREE: minimum number of different trees per cluster) | 
|---|
| 1122 |  | 
|---|
| 1123 | public: | 
|---|
| 1124 | DupCriteria(bool listDups_, const DupNameCriterion& nameCrit_, DupTreeCriterionType ttype_, int minSize_) : | 
|---|
| 1125 | DupNameCriterion(nameCrit_), | 
|---|
| 1126 | listDups(listDups_), | 
|---|
| 1127 | ttype(ttype_), | 
|---|
| 1128 | minSize(minSize_) | 
|---|
| 1129 | { | 
|---|
| 1130 | gs_assert(minSize>=2); | 
|---|
| 1131 | } | 
|---|
| 1132 |  | 
|---|
| 1133 | DupTreeCriterionType get_tree_type() const { return ttype; } | 
|---|
| 1134 | bool want_unique_groups() const { return !listDups; } | 
|---|
| 1135 |  | 
|---|
| 1136 | bool is_inferable() const { | 
|---|
| 1137 | // An inferable criteria has to allow the following deduction: | 
|---|
| 1138 | // (A == B) and (B == C) -> (A == C) | 
|---|
| 1139 | // | 
|---|
| 1140 | // For comparing group names, | 
|---|
| 1141 | // - whole name comparison is an inferable criteria | 
|---|
| 1142 | // - wordwise comparison isnt! | 
|---|
| 1143 |  | 
|---|
| 1144 | // Note: comparing trees for equality is inferable, | 
|---|
| 1145 | //       comparing trees for difference isnt. | 
|---|
| 1146 |  | 
|---|
| 1147 | return !wordwise_name_matching(); | 
|---|
| 1148 | } | 
|---|
| 1149 |  | 
|---|
| 1150 | bool tree_matches(const GBDATA *data1, const GBDATA *data2) const { | 
|---|
| 1151 | bool did_match; | 
|---|
| 1152 | switch (ttype) { | 
|---|
| 1153 | case DLC_SAME_TREE: | 
|---|
| 1154 | did_match = data1 == data2; | 
|---|
| 1155 | break; | 
|---|
| 1156 |  | 
|---|
| 1157 | case DLC_DIFF_TREE: | 
|---|
| 1158 | did_match = data1 != data2; | 
|---|
| 1159 | break; | 
|---|
| 1160 |  | 
|---|
| 1161 | case DLC_ANYWHERE: | 
|---|
| 1162 | did_match = true; // ignore tree membership | 
|---|
| 1163 | break; | 
|---|
| 1164 | } | 
|---|
| 1165 | return did_match; | 
|---|
| 1166 | } | 
|---|
| 1167 |  | 
|---|
| 1168 | int min_cluster_size() const { return minSize; } | 
|---|
| 1169 | bool big_enough(const GroupCluster& cluster) const { return !cluster.empty() && int(cluster.size())>=minSize; } | 
|---|
| 1170 | }; | 
|---|
| 1171 |  | 
|---|
| 1172 | class SymmetricMatrixMapper : virtual Noncopyable { | 
|---|
| 1173 | // maps matrix indices to linear indices and vv. | 
|---|
| 1174 | // | 
|---|
| 1175 | // For each x/y-pair of matrix indices the following assumptions are made: | 
|---|
| 1176 | // - x!=y (i.e. never used) | 
|---|
| 1177 | // - value(x,y)==value(y,x) | 
|---|
| 1178 |  | 
|---|
| 1179 | int size; // matrix size (x and y) | 
|---|
| 1180 | int lin_size; | 
|---|
| 1181 |  | 
|---|
| 1182 | int *firstIndexOfRow; | 
|---|
| 1183 | void init_firstIndexOfRow() { | 
|---|
| 1184 | firstIndexOfRow[0] = 0; | 
|---|
| 1185 | for (int y = 1; y<size; ++y) { | 
|---|
| 1186 | firstIndexOfRow[y] = firstIndexOfRow[y-1]+(y-1); | 
|---|
| 1187 | } | 
|---|
| 1188 | } | 
|---|
| 1189 |  | 
|---|
| 1190 | public: | 
|---|
| 1191 | SymmetricMatrixMapper(int elements) : | 
|---|
| 1192 | size(elements), | 
|---|
| 1193 | lin_size(size*(size-1)/2), | 
|---|
| 1194 | firstIndexOfRow(new int[size]) | 
|---|
| 1195 | { | 
|---|
| 1196 | gs_assert(elements>=2); // smaller is useless | 
|---|
| 1197 | init_firstIndexOfRow(); | 
|---|
| 1198 | } | 
|---|
| 1199 | ~SymmetricMatrixMapper() { | 
|---|
| 1200 | delete [] firstIndexOfRow; | 
|---|
| 1201 | } | 
|---|
| 1202 |  | 
|---|
| 1203 | int linear_size() const { return lin_size; } | 
|---|
| 1204 | int linear_index(int x, int y) const { | 
|---|
| 1205 | if (x>y) swap(x, y); | 
|---|
| 1206 |  | 
|---|
| 1207 | gs_assert(x<y); // equal indices not allowed | 
|---|
| 1208 | gs_assert(y<size); | 
|---|
| 1209 | gs_assert(x>=0); | 
|---|
| 1210 |  | 
|---|
| 1211 | return firstIndexOfRow[y]+x; | 
|---|
| 1212 | } | 
|---|
| 1213 |  | 
|---|
| 1214 | #if defined(UNIT_TESTS) | 
|---|
| 1215 | void to_xy(int lin, int& x, int& y) const {      // Note: only used in test-code | 
|---|
| 1216 | for (y = 1; y<size && lin>=y; ++y) lin -= y; // if needed in production code: maybe use table for speedup | 
|---|
| 1217 | x = lin; | 
|---|
| 1218 | } | 
|---|
| 1219 | #endif | 
|---|
| 1220 | }; | 
|---|
| 1221 |  | 
|---|
| 1222 | class Clusterer { | 
|---|
| 1223 | SmartPtr<QueriedGroups> groups; | 
|---|
| 1224 | SmartPtr<DupCriteria>   criteria; | 
|---|
| 1225 | SymmetricMatrixMapper   symmap; | 
|---|
| 1226 |  | 
|---|
| 1227 | vector<uint8_t> name_matches; | 
|---|
| 1228 | vector<bool>    tree_matches; | 
|---|
| 1229 |  | 
|---|
| 1230 | vector<uint8_t> words; // stores number of words for each group (indices into 'groups'; only valid when wordwise_name_matching) | 
|---|
| 1231 |  | 
|---|
| 1232 | int          next_id;   // used for next cluster | 
|---|
| 1233 | GroupCluster delivered; // stores indices (into 'groups') of all delivered groups | 
|---|
| 1234 |  | 
|---|
| 1235 | int pairIdx(int i, int j) const { return symmap.linear_index(i, j); } | 
|---|
| 1236 | void calc_matches(GBDATA *gb_main); | 
|---|
| 1237 |  | 
|---|
| 1238 | int fits_into_cluster(int idx, const GroupCluster& cluster, bool strong_fit) const { | 
|---|
| 1239 | const int min_words    = criteria->get_min_wanted_words(); | 
|---|
| 1240 | bool      enough_words = min_words<2 || words[idx] >= min_words; | 
|---|
| 1241 |  | 
|---|
| 1242 | gs_assert(min_words>0); | 
|---|
| 1243 |  | 
|---|
| 1244 | int fitting = 0; | 
|---|
| 1245 | if (enough_words && !already_delivered(idx) && !cluster.contains(idx)) { | 
|---|
| 1246 | bool fitsAll    = true; | 
|---|
| 1247 | bool weakFitAny = true; | 
|---|
| 1248 |  | 
|---|
| 1249 | for (GroupClusterCIter ci = cluster.begin(); fitsAll && ci != cluster.end(); ++ci) { | 
|---|
| 1250 | const int pi      = pairIdx(idx, *ci); | 
|---|
| 1251 | bool      fitWeak = name_matches[pi] >= min_words; | 
|---|
| 1252 |  | 
|---|
| 1253 | fitsAll    = fitWeak && tree_matches[pi]; | 
|---|
| 1254 | weakFitAny = weakFitAny || fitWeak; | 
|---|
| 1255 | } | 
|---|
| 1256 |  | 
|---|
| 1257 | if      (fitsAll)                   fitting = idx; | 
|---|
| 1258 | else if (weakFitAny && !strong_fit) fitting = -idx; | 
|---|
| 1259 | } | 
|---|
| 1260 | return fitting; | 
|---|
| 1261 | } | 
|---|
| 1262 |  | 
|---|
| 1263 | int find_next_group_fitting_into(const GroupCluster& cluster, int behind_idx, bool strong_fit) const { | 
|---|
| 1264 | // searches for the next group (with an index > 'behind_idx') fitting into 'cluster'. | 
|---|
| 1265 | // | 
|---|
| 1266 | // returns: | 
|---|
| 1267 | // 0   = no such group found | 
|---|
| 1268 | // >0  = index of first fitting group | 
|---|
| 1269 | // <0  = index of candidate group (for cluster extension). not reported if 'strong_fit' is true | 
|---|
| 1270 |  | 
|---|
| 1271 | gs_assert(!cluster.empty()); | 
|---|
| 1272 | gs_assert(behind_idx>=0); | 
|---|
| 1273 |  | 
|---|
| 1274 | const int gcount  = groups->size(); | 
|---|
| 1275 | int       fitting = 0; | 
|---|
| 1276 |  | 
|---|
| 1277 | for (int idx = behind_idx+1; idx<gcount && !fitting; ++idx) { | 
|---|
| 1278 | fitting = fits_into_cluster(idx, cluster, strong_fit); | 
|---|
| 1279 | } | 
|---|
| 1280 |  | 
|---|
| 1281 | gs_assert(implicated(fitting>0, !cluster.contains(fitting))); | 
|---|
| 1282 | gs_assert(implicated(strong_fit, fitting>=0)); | 
|---|
| 1283 |  | 
|---|
| 1284 | return fitting; | 
|---|
| 1285 | } | 
|---|
| 1286 |  | 
|---|
| 1287 | int find_next_candidate_group_fitting_into(const GroupCluster& cluster, const vector<int>& candidates, int& cand_idx, bool strong_fit) const { | 
|---|
| 1288 | // similar to find_next_group_fitting_into(), but only considers indices listed in 'candidates' (instead of all) | 
|---|
| 1289 | // (they can be retrieved using find_next_group_fitting_into before) | 
|---|
| 1290 | // | 
|---|
| 1291 | // additionally 'cand_idx' is set to the index corresponding with result | 
|---|
| 1292 |  | 
|---|
| 1293 | gs_assert(!cluster.empty()); | 
|---|
| 1294 | gs_assert(cand_idx>=-1); | 
|---|
| 1295 |  | 
|---|
| 1296 | const int cand_size = candidates.size(); | 
|---|
| 1297 | int       fitting   = 0; | 
|---|
| 1298 |  | 
|---|
| 1299 | for (int cidx = cand_idx+1; cidx<cand_size; ++cidx) { | 
|---|
| 1300 | int idx = candidates[cidx]; | 
|---|
| 1301 |  | 
|---|
| 1302 | fitting = fits_into_cluster(idx, cluster, strong_fit); | 
|---|
| 1303 | if (fitting) { | 
|---|
| 1304 | cand_idx = cidx; | 
|---|
| 1305 | break; | 
|---|
| 1306 | } | 
|---|
| 1307 | } | 
|---|
| 1308 |  | 
|---|
| 1309 | gs_assert(implicated(fitting>0, !cluster.contains(fitting))); | 
|---|
| 1310 | gs_assert(implicated(strong_fit, fitting>=0)); | 
|---|
| 1311 |  | 
|---|
| 1312 | return fitting; | 
|---|
| 1313 | } | 
|---|
| 1314 |  | 
|---|
| 1315 | void extendClusterToBiggest(GroupCluster& curr, int next_idx, GroupCluster& best, arb_progress& progress_cluster, double done_low, double done_high); | 
|---|
| 1316 |  | 
|---|
| 1317 | public: | 
|---|
| 1318 | Clusterer(GBDATA *gb_main, SmartPtr<QueriedGroups> groups_, SmartPtr<DupCriteria> criteria_) : | 
|---|
| 1319 | groups(groups_), | 
|---|
| 1320 | criteria(criteria_), | 
|---|
| 1321 | symmap(groups->size()), | 
|---|
| 1322 | next_id(1), | 
|---|
| 1323 | delivered(groups->size()) | 
|---|
| 1324 | { | 
|---|
| 1325 | calc_matches(gb_main); | 
|---|
| 1326 | } | 
|---|
| 1327 |  | 
|---|
| 1328 | int max_cluster_start_index() const { return groups->size() - criteria->min_cluster_size(); } | 
|---|
| 1329 |  | 
|---|
| 1330 | void buildInferableClusterStartingWith(int start_idx, GroupCluster& cluster); | 
|---|
| 1331 | void findBestClusterBasedOnWords(int wanted_words, GroupCluster& best, arb_progress& progress_cluster, int& first_cluster_found_from_index); | 
|---|
| 1332 |  | 
|---|
| 1333 | bool already_delivered(int idx) const { return delivered.contains(idx); } | 
|---|
| 1334 | void deliverCluster(const GroupCluster& ofCluster, QueriedGroups& toResult) { | 
|---|
| 1335 | int this_id = next_id++; | 
|---|
| 1336 | for (GroupClusterCIter ci = ofCluster.begin(); ci != ofCluster.end(); ++ci) { | 
|---|
| 1337 | int idx = *ci; | 
|---|
| 1338 |  | 
|---|
| 1339 | // avoid duplication of groups in result list | 
|---|
| 1340 | gs_assert(!already_delivered(idx)); | 
|---|
| 1341 | delivered.insert(idx); | 
|---|
| 1342 |  | 
|---|
| 1343 | FoundGroup& g = (*groups)[idx]; | 
|---|
| 1344 | g.set_cluster_id(this_id); | 
|---|
| 1345 | toResult.add_informed_group(g); | 
|---|
| 1346 | } | 
|---|
| 1347 | } | 
|---|
| 1348 |  | 
|---|
| 1349 | void find_and_deliverTo(QueriedGroups& toResult); | 
|---|
| 1350 | void deliverRest(QueriedGroups& toResult) { | 
|---|
| 1351 | int idx = 0; | 
|---|
| 1352 | for (FoundGroupCIter g = groups->begin(); g != groups->end(); ++g, ++idx) { | 
|---|
| 1353 | if (!already_delivered(idx)) { | 
|---|
| 1354 | toResult.add_informed_group(*g); | 
|---|
| 1355 | } | 
|---|
| 1356 | } | 
|---|
| 1357 | } | 
|---|
| 1358 |  | 
|---|
| 1359 | int calc_max_used_words(bool ignore_delivered) { | 
|---|
| 1360 | gs_assert(criteria->wordwise_name_matching()); // otherwise words array contains nothing | 
|---|
| 1361 |  | 
|---|
| 1362 | int       maxWords = 0; | 
|---|
| 1363 | const int maxidx   = groups->size(); | 
|---|
| 1364 |  | 
|---|
| 1365 | for (int idx = 0; idx<maxidx; ++idx) { | 
|---|
| 1366 | int thisWords = words[idx]; | 
|---|
| 1367 |  | 
|---|
| 1368 | if (thisWords>maxWords && (ignore_delivered ? !already_delivered(idx) : true)) { | 
|---|
| 1369 | maxWords = thisWords; | 
|---|
| 1370 | } | 
|---|
| 1371 | } | 
|---|
| 1372 |  | 
|---|
| 1373 | return maxWords; | 
|---|
| 1374 | } | 
|---|
| 1375 |  | 
|---|
| 1376 | }; | 
|---|
| 1377 |  | 
|---|
| 1378 | void Clusterer::calc_matches(GBDATA *gb_main) { | 
|---|
| 1379 | const int  gcount    = groups->size(); | 
|---|
| 1380 | const int  lin_range = symmap.linear_size(); | 
|---|
| 1381 | const long way_to_go = long(gcount) + lin_range; | 
|---|
| 1382 |  | 
|---|
| 1383 | arb_progress progress(GBS_global_string("[pass 1/2: duplicity matrix (%s)]", GBS_readable_size(lin_range, "b")), way_to_go); | 
|---|
| 1384 |  | 
|---|
| 1385 | name_matches.reserve(lin_range); | 
|---|
| 1386 | tree_matches.reserve(lin_range); | 
|---|
| 1387 |  | 
|---|
| 1388 | GroupInfoVec info; | 
|---|
| 1389 | info.reserve(gcount); | 
|---|
| 1390 |  | 
|---|
| 1391 | { // fetch info to speed up calculation below | 
|---|
| 1392 | GB_transaction ta(gb_main); | 
|---|
| 1393 |  | 
|---|
| 1394 | bool            prep_wordwise  = criteria->wordwise_name_matching(); | 
|---|
| 1395 | GB_CASE         sens           = criteria->get_sensitivity(); | 
|---|
| 1396 | const char     *wordSeparators = criteria->get_word_separators(); | 
|---|
| 1397 | const WordSet&  ignoredWords   = criteria->get_ignored_words(); | 
|---|
| 1398 |  | 
|---|
| 1399 | for (FoundGroupCIter g = groups->begin(); g != groups->end() && !progress.aborted(); ++g) { | 
|---|
| 1400 | info.push_back(GroupInfo(*g, prep_wordwise, sens, wordSeparators, ignoredWords)); | 
|---|
| 1401 | if (prep_wordwise) { | 
|---|
| 1402 | const GroupInfo& ginfo = info.back(); | 
|---|
| 1403 | words.push_back(ginfo.get_word_count()); | 
|---|
| 1404 | } | 
|---|
| 1405 | ++progress; | 
|---|
| 1406 | } | 
|---|
| 1407 | } | 
|---|
| 1408 |  | 
|---|
| 1409 | for (int i1 = 0; i1<gcount && !progress.aborted(); ++i1) { // calculate pairwise group matches | 
|---|
| 1410 | for (int i2 = i1+1; i2<gcount && !progress.aborted(); ++i2) { | 
|---|
| 1411 | const int li = symmap.linear_index(i1, i2); | 
|---|
| 1412 |  | 
|---|
| 1413 | name_matches[li] = criteria->name_matches(info[i1], info[i2]); | 
|---|
| 1414 | tree_matches[li] = criteria->tree_matches(info[i1].tree, info[i2].tree); | 
|---|
| 1415 |  | 
|---|
| 1416 | ++progress; | 
|---|
| 1417 | } | 
|---|
| 1418 | } | 
|---|
| 1419 | } | 
|---|
| 1420 |  | 
|---|
| 1421 | void Clusterer::buildInferableClusterStartingWith(const int start_idx, GroupCluster& cluster) { | 
|---|
| 1422 | gs_assert(criteria->is_inferable()); // works only for inferable compare criteria | 
|---|
| 1423 |  | 
|---|
| 1424 | int          gcount = groups->size(); | 
|---|
| 1425 | arb_progress progress_build(long(gcount-start_idx-1)); | 
|---|
| 1426 |  | 
|---|
| 1427 | gs_assert(cluster.empty()); | 
|---|
| 1428 | gs_assert(!already_delivered(start_idx)); | 
|---|
| 1429 | cluster.insert(start_idx); // always add group at 'start_idx' | 
|---|
| 1430 |  | 
|---|
| 1431 | GroupCluster weakCand(gcount); // collects non-strong, possible weak matches | 
|---|
| 1432 |  | 
|---|
| 1433 | { | 
|---|
| 1434 | int pcount   = start_idx; | 
|---|
| 1435 | int curr_idx = start_idx; | 
|---|
| 1436 | while (!progress_build.aborted()) { | 
|---|
| 1437 | const int addable = find_next_group_fitting_into(cluster, curr_idx, false); | 
|---|
| 1438 | if (!addable) break; | 
|---|
| 1439 |  | 
|---|
| 1440 | if (addable>0) { // found a strong match | 
|---|
| 1441 | cluster.insert(addable); | 
|---|
| 1442 | curr_idx = addable; | 
|---|
| 1443 | } | 
|---|
| 1444 | else { | 
|---|
| 1445 | gs_assert(addable<0); // found a weak match | 
|---|
| 1446 | weakCand.insert(-addable); | 
|---|
| 1447 | curr_idx = -addable; | 
|---|
| 1448 | } | 
|---|
| 1449 |  | 
|---|
| 1450 | gs_assert(curr_idx>pcount); | 
|---|
| 1451 | progress_build.inc_by(curr_idx-pcount); | 
|---|
| 1452 | pcount = curr_idx; | 
|---|
| 1453 | } | 
|---|
| 1454 | } | 
|---|
| 1455 |  | 
|---|
| 1456 | if (criteria->big_enough(cluster) && !progress_build.aborted()) { | 
|---|
| 1457 | // extent cluster (by adding groups that match weak) | 
|---|
| 1458 | // - e.g. add groups from same tree when searching for different trees | 
|---|
| 1459 |  | 
|---|
| 1460 | if (!weakCand.empty()) { | 
|---|
| 1461 | GroupCluster toAdd(gcount); | 
|---|
| 1462 |  | 
|---|
| 1463 | if (criteria->get_tree_type() == DLC_DIFF_TREE) { | 
|---|
| 1464 | for (GroupClusterCIter w = weakCand.begin(); w != weakCand.end(); ++w) { | 
|---|
| 1465 | int nameFitsAll = true; | 
|---|
| 1466 | for (GroupClusterCIter ci = cluster.begin(); nameFitsAll && ci != cluster.end(); ++ci) { | 
|---|
| 1467 | int pi      = pairIdx(*w, *ci); | 
|---|
| 1468 | nameFitsAll = name_matches[pi]; | 
|---|
| 1469 | } | 
|---|
| 1470 | if (nameFitsAll) toAdd.insert(*w); | 
|---|
| 1471 | } | 
|---|
| 1472 | } | 
|---|
| 1473 | for (GroupClusterCIter a = toAdd.begin(); a != toAdd.end(); ++a) cluster.insert(*a); | 
|---|
| 1474 | } | 
|---|
| 1475 | } | 
|---|
| 1476 | else { // forget if too small | 
|---|
| 1477 | cluster.clear(); | 
|---|
| 1478 | } | 
|---|
| 1479 |  | 
|---|
| 1480 | progress_build.done(); | 
|---|
| 1481 |  | 
|---|
| 1482 | gs_assert(contradicted(cluster.empty(), criteria->big_enough(cluster))); | 
|---|
| 1483 | } | 
|---|
| 1484 |  | 
|---|
// Returns elems*elems/2 - elems (integer division), converted to unsigned long.
// The value is used as a rough amount-of-work estimate for progress display.
//
// The calculation is done in 64 bit: squaring 'elems' as plain int overflows
// (undefined behavior) as soon as elems exceeds 46340.
//
// Note: for elems == 1 the term is -1 and therefore wraps to the maximum unsigned value.
inline unsigned long permutations(int elems) {
    const long long n = elems;
    return static_cast<unsigned long>(n*n/2 - n);
}
|---|
| 1488 |  | 
|---|
| 1489 | void Clusterer::extendClusterToBiggest(GroupCluster& curr, int next_idx, GroupCluster& best, arb_progress& progress_cluster, double done_low, double done_high) { | 
|---|
| 1490 | // extends cluster 'curr' (using all possible combinations starting at 'next_idx' = index into 'groups') | 
|---|
| 1491 | // stores best (=biggest) cluster in 'best' | 
|---|
| 1492 |  | 
|---|
| 1493 | vector<int> candidates; // collect all possible groups | 
|---|
| 1494 | { | 
|---|
| 1495 | int idx = next_idx; | 
|---|
| 1496 | while (1) { | 
|---|
| 1497 | const int addable = find_next_group_fitting_into(curr, idx, true); | 
|---|
| 1498 | if (!addable) break; | 
|---|
| 1499 |  | 
|---|
| 1500 | candidates.push_back(addable); | 
|---|
| 1501 | idx = addable; | 
|---|
| 1502 | } | 
|---|
| 1503 | } | 
|---|
| 1504 |  | 
|---|
| 1505 | if ((candidates.size()+curr.size()) > best.size()) { // any chance to find bigger cluster? | 
|---|
| 1506 | stack<int> previous;      // previously added indices (into candidates) | 
|---|
| 1507 | int        curr_idx = -1; // last added (i.e. start with candidates[0]) | 
|---|
| 1508 |  | 
|---|
| 1509 | const int           del_size          = delivered.size(); | 
|---|
| 1510 | const unsigned long permutation_count = permutations(candidates.size()); | 
|---|
| 1511 |  | 
|---|
| 1512 | while (!progress_cluster.aborted()) { | 
|---|
| 1513 | int addable = find_next_candidate_group_fitting_into(curr, candidates, curr_idx, true); | 
|---|
| 1514 | gs_assert(addable>=0); | 
|---|
| 1515 | if (addable) { | 
|---|
| 1516 | curr.insert(addable); | 
|---|
| 1517 | previous.push(curr_idx); | 
|---|
| 1518 | } | 
|---|
| 1519 | else { | 
|---|
| 1520 | if (curr.size() > best.size() && criteria->big_enough(curr)) { // store 'curr' cluster if better | 
|---|
| 1521 | best = curr; | 
|---|
| 1522 |  | 
|---|
| 1523 | const unsigned long permutations_left    = permutations(candidates.size()-best.size()); | 
|---|
| 1524 | const double        done_percent         = (permutation_count-permutations_left) / double(permutation_count); | 
|---|
| 1525 | const double        overall_done_percent = done_low + (done_high-done_low)*done_percent; | 
|---|
| 1526 |  | 
|---|
| 1527 | progress_cluster.inc_to_avoid_overflow(del_size + best.size() * overall_done_percent); // @@@ calculation seems to be wrong (overflows) | 
|---|
| 1528 | } | 
|---|
| 1529 | if (previous.empty()) break; // end iteration | 
|---|
| 1530 |  | 
|---|
| 1531 | const int last_cidx = previous.top(); | 
|---|
| 1532 | const int last_add  = candidates[last_cidx]; | 
|---|
| 1533 |  | 
|---|
| 1534 | curr.erase(last_add); | 
|---|
| 1535 | previous.pop(); | 
|---|
| 1536 | curr_idx = last_cidx; | 
|---|
| 1537 |  | 
|---|
| 1538 | const int    rest_cand = candidates.size() - (curr_idx+1); | 
|---|
| 1539 | const size_t poss_size = rest_cand + curr.size(); | 
|---|
| 1540 | if (poss_size<best.size()) break; // end iteration (impossible to collect enough groups to form a bigger cluster) | 
|---|
| 1541 | } | 
|---|
| 1542 | } | 
|---|
| 1543 |  | 
|---|
| 1544 | progress_cluster.inc_to_avoid_overflow(del_size + best.size() * done_high); // @@@ calculation seems to be wrong (overflows) | 
|---|
| 1545 | } | 
|---|
| 1546 | } | 
|---|
| 1547 |  | 
|---|
| 1548 | void Clusterer::findBestClusterBasedOnWords(int wanted_words, GroupCluster& best, arb_progress& progress_cluster, int& first_cluster_found_from_index) { | 
|---|
| 1549 | gs_assert(!criteria->is_inferable()); // thorough search not required | 
|---|
| 1550 | gs_assert(best.empty()); | 
|---|
| 1551 |  | 
|---|
| 1552 | { | 
|---|
| 1553 | const int old_min_words = criteria->get_min_wanted_words(); | 
|---|
| 1554 | criteria->set_min_wanted_words(wanted_words); | 
|---|
| 1555 |  | 
|---|
| 1556 | const int gcount        = groups->size(); | 
|---|
| 1557 | const int max_start_idx = gcount - criteria->min_cluster_size(); | 
|---|
| 1558 |  | 
|---|
| 1559 | GroupCluster curr(gcount); | 
|---|
| 1560 | curr.allow_lookup(); | 
|---|
| 1561 |  | 
|---|
| 1562 | const int    extension_count    = 1+(wanted_words-1-old_min_words); | 
|---|
| 1563 | const double done_per_extension = 1.0/extension_count; | 
|---|
| 1564 |  | 
|---|
| 1565 | int first_index = 0; | 
|---|
| 1566 |  | 
|---|
| 1567 | for (int start_idx = first_cluster_found_from_index; start_idx<max_start_idx && !progress_cluster.aborted(); ++start_idx) { | 
|---|
| 1568 | if (words[start_idx]>=wanted_words && !already_delivered(start_idx)) { | 
|---|
| 1569 | curr.clear(); | 
|---|
| 1570 | curr.insert(start_idx); | 
|---|
| 1571 |  | 
|---|
| 1572 | extendClusterToBiggest(curr, start_idx, best, progress_cluster, 0.0, done_per_extension); | 
|---|
| 1573 | if (!first_index && !best.empty()) { | 
|---|
| 1574 | first_cluster_found_from_index = first_index = start_idx; | 
|---|
| 1575 | } | 
|---|
| 1576 | } | 
|---|
| 1577 | } | 
|---|
| 1578 |  | 
|---|
| 1579 | if (wanted_words>old_min_words && !best.empty() && !progress_cluster.aborted()) { // may less words be accepted? | 
|---|
| 1580 | // extend cluster with "weaker" matches: | 
|---|
| 1581 |  | 
|---|
| 1582 | int ext_done = 1; | 
|---|
| 1583 | for (int fewer_words = wanted_words-1; fewer_words>=old_min_words && !progress_cluster.aborted(); --fewer_words, ++ext_done) { | 
|---|
| 1584 | criteria->set_min_wanted_words(fewer_words); | 
|---|
| 1585 |  | 
|---|
| 1586 | curr = best; | 
|---|
| 1587 | curr.allow_lookup(); | 
|---|
| 1588 |  | 
|---|
| 1589 | const double done_start = ext_done*done_per_extension; | 
|---|
| 1590 | extendClusterToBiggest(curr, 0, best, progress_cluster, done_start, done_start+done_per_extension); | 
|---|
| 1591 | } | 
|---|
| 1592 | } | 
|---|
| 1593 |  | 
|---|
| 1594 | criteria->set_min_wanted_words(old_min_words); | 
|---|
| 1595 | } | 
|---|
| 1596 |  | 
|---|
| 1597 | gs_assert(contradicted(best.empty(), criteria->big_enough(best))); | 
|---|
| 1598 | } | 
|---|
| 1599 |  | 
|---|
| 1600 |  | 
|---|
| 1601 | void Clusterer::find_and_deliverTo(QueriedGroups& toResult) { | 
|---|
| 1602 | int          gcount = groups->size(); | 
|---|
| 1603 | GroupCluster curr(gcount); | 
|---|
| 1604 |  | 
|---|
| 1605 | delivered.allow_lookup(); | 
|---|
| 1606 | curr.allow_lookup(); | 
|---|
| 1607 |  | 
|---|
| 1608 | if (criteria->is_inferable()) { // possible to use "fast" clustering? | 
|---|
| 1609 | const int max_i = max_cluster_start_index(); | 
|---|
| 1610 | gs_assert(max_i>0); | 
|---|
| 1611 |  | 
|---|
| 1612 | arb_progress progress_cluster("[pass 2/2: fast duplicate search]", long(max_i)); | 
|---|
| 1613 | for (int i = 0; i<max_i && !progress_cluster.aborted(); ++i) { | 
|---|
| 1614 | if (!already_delivered(i)) { | 
|---|
| 1615 | curr.clear(); | 
|---|
| 1616 | buildInferableClusterStartingWith(i, curr); | 
|---|
| 1617 | if (!curr.empty()) { // found a cluster | 
|---|
| 1618 | deliverCluster(curr, toResult); | 
|---|
| 1619 | } | 
|---|
| 1620 | } | 
|---|
| 1621 | ++progress_cluster; | 
|---|
| 1622 | } | 
|---|
| 1623 | } | 
|---|
| 1624 | else { // use thorough cluster search | 
|---|
| 1625 | int       max_words = calc_max_used_words(true); | 
|---|
| 1626 | const int min_words = criteria->get_min_wanted_words(); | 
|---|
| 1627 |  | 
|---|
| 1628 | long groups_with_min_words = 0; | 
|---|
| 1629 | for (int gidx = 0; gidx<gcount; ++gidx) { // LOOP_VECTORIZED [!<5.0] | 
|---|
| 1630 | if (words[gidx]>=min_words) ++groups_with_min_words; | 
|---|
| 1631 | } | 
|---|
| 1632 |  | 
|---|
| 1633 | arb_progress progress_cluster("[pass 2/2: thorough duplicate search]", groups_with_min_words); | 
|---|
| 1634 |  | 
|---|
| 1635 | int first_cluster_found_from_index = 0; | 
|---|
| 1636 | while (max_words >= min_words && !progress_cluster.aborted()) { | 
|---|
| 1637 | curr.clear(); | 
|---|
| 1638 | findBestClusterBasedOnWords(max_words, curr, progress_cluster, first_cluster_found_from_index); | 
|---|
| 1639 |  | 
|---|
| 1640 | if (curr.empty()) { | 
|---|
| 1641 | --max_words; | 
|---|
| 1642 | first_cluster_found_from_index = 0; | 
|---|
| 1643 | } | 
|---|
| 1644 | else { | 
|---|
| 1645 | deliverCluster(curr, toResult); | 
|---|
| 1646 | progress_cluster.inc_to(delivered.size()); | 
|---|
| 1647 | } | 
|---|
| 1648 | } | 
|---|
| 1649 | progress_cluster.done(); | 
|---|
| 1650 | } | 
|---|
| 1651 | } | 
|---|
| 1652 |  | 
|---|
| 1653 | GB_ERROR GroupSearch::clusterDuplicates() { | 
|---|
| 1654 | GB_ERROR error       = NULp; | 
|---|
| 1655 | bool     enough_hits = found->size()>=2; | 
|---|
| 1656 |  | 
|---|
| 1657 | if (enough_hits) { | 
|---|
| 1658 | arb_progress progress("Restricting to duplicate groups", 2L); | 
|---|
| 1659 | Clusterer    clusterer(gb_main, found, dups); | 
|---|
| 1660 |  | 
|---|
| 1661 | if (clusterer.max_cluster_start_index()<0) { | 
|---|
| 1662 | enough_hits = false; // e.g. 2 hits, but min. cluster-size==3 | 
|---|
| 1663 | progress.done(); | 
|---|
| 1664 | } | 
|---|
| 1665 | else { | 
|---|
| 1666 | found = new QueriedGroups;            // clear result list | 
|---|
| 1667 | clusterer.find_and_deliverTo(*found); // detect clusters of duplicates and add them to the result list | 
|---|
| 1668 |  | 
|---|
| 1669 | if (dups->want_unique_groups() && !progress.aborted()) { | 
|---|
| 1670 | QueriedGroups *nonDupGroups = new QueriedGroups; | 
|---|
| 1671 |  | 
|---|
| 1672 | clusterer.deliverRest(*nonDupGroups); | 
|---|
| 1673 | found = nonDupGroups; | 
|---|
| 1674 | } | 
|---|
| 1675 | } | 
|---|
| 1676 |  | 
|---|
| 1677 | if (!error) error = progress.error_if_aborted(); | 
|---|
| 1678 | } | 
|---|
| 1679 |  | 
|---|
| 1680 | if (!enough_hits && !error) { | 
|---|
| 1681 | error = GBS_global_string("Not enough hits (%zu) to find duplicates", found->size()); | 
|---|
| 1682 | } | 
|---|
| 1683 |  | 
|---|
| 1684 | return error; | 
|---|
| 1685 | } | 
|---|
| 1686 |  | 
|---|
| 1687 | const QueriedGroups& GroupSearch::get_results() { | 
|---|
| 1688 | if (found.isNull()) found = new QueriedGroups; | 
|---|
| 1689 | if (!sortedByOrder) sort_results(); | 
|---|
| 1690 | return *found; | 
|---|
| 1691 | } | 
|---|
| 1692 |  | 
|---|
| 1693 | struct has_been_deleted { | 
|---|
| 1694 | GroupSearchCommon *common; | 
|---|
| 1695 | has_been_deleted(GroupSearchCommon *common_) : common(common_) {} | 
|---|
| 1696 | bool operator()(const FoundGroup& g) { return common->has_been_deleted(g.get_pointer()); } | 
|---|
| 1697 | }; | 
|---|
| 1698 | struct was_modified { | 
|---|
| 1699 | GroupSearchCommon *common; | 
|---|
| 1700 | was_modified(GroupSearchCommon *common_) : common(common_) {} | 
|---|
| 1701 | bool operator()(const FoundGroup& g) { return common->has_been_modified(g.get_pointer()); } | 
|---|
| 1702 | }; | 
|---|
| 1703 |  | 
|---|
| 1704 | bool QueriedGroups::erase_deleted(GroupSearchCommon *common) { | 
|---|
| 1705 | FoundGroupIter first_removed = remove_if(found.begin(), found.end(), has_been_deleted(common)); | 
|---|
| 1706 | bool           erased        = first_removed != found.end(); | 
|---|
| 1707 |  | 
|---|
| 1708 | found.erase(first_removed, found.end()); | 
|---|
| 1709 | invalidate_widths(); | 
|---|
| 1710 | return erased; | 
|---|
| 1711 | } | 
|---|
| 1712 | bool QueriedGroups::contains_changed(GroupSearchCommon *common) const { | 
|---|
| 1713 | FoundGroupCIter modified  = find_if(found.begin(), found.end(), was_modified(common)); | 
|---|
| 1714 | return modified          != found.end(); | 
|---|
| 1715 | } | 
|---|
| 1716 |  | 
|---|
| 1717 | struct compare_by_criteria { | 
|---|
| 1718 | const SortCriteria& by; | 
|---|
| 1719 | compare_by_criteria(const SortCriteria& by_) : by(by_) {} | 
|---|
| 1720 | bool operator()(const FoundGroup& g1, const FoundGroup& g2) const { | 
|---|
| 1721 | int  cmp               = 0; | 
|---|
| 1722 | bool last_was_modifier = false; | 
|---|
| 1723 | bool reversed          = false; | 
|---|
| 1724 |  | 
|---|
| 1725 | SortCriteria::const_iterator crit = by.begin(); | 
|---|
| 1726 | while ((!cmp || last_was_modifier) && crit != by.end()) { | 
|---|
| 1727 | last_was_modifier = (*crit == GSC_REVERSE); | 
|---|
| 1728 | switch (*crit) { | 
|---|
| 1729 | case GSC_NONE:    gs_assert(0); break; // should not occur | 
|---|
| 1730 | case GSC_REVERSE: reversed = !reversed; break; | 
|---|
| 1731 |  | 
|---|
| 1732 | // alphabetically: | 
|---|
| 1733 | case GSC_NAME:     cmp = strcmp(g1.get_name(),      g2.get_name());      break; | 
|---|
| 1734 | case GSC_TREENAME: cmp = strcmp(g1.get_tree_name(), g2.get_tree_name()); break; | 
|---|
| 1735 |  | 
|---|
| 1736 | case GSC_HIT_REASON: cmp = g1.get_hit_reason().compare(g2.get_hit_reason()); break; | 
|---|
| 1737 |  | 
|---|
| 1738 | // small first: | 
|---|
| 1739 | case GSC_TREEORDER: cmp = g1.get_tree_order() - g2.get_tree_order(); break; | 
|---|
| 1740 | case GSC_NESTING:   cmp = g1.get_nesting()    - g2.get_nesting(); break; | 
|---|
| 1741 | case GSC_CLUSTER:   cmp = g1.get_cluster_id() - g2.get_cluster_id(); break; | 
|---|
| 1742 | case GSC_AID:       cmp = double_cmp(g1.get_aid(), g2.get_aid()); break; | 
|---|
| 1743 |  | 
|---|
| 1744 | // big first: | 
|---|
| 1745 | case GSC_SIZE:      cmp = g2.get_size()       - g1.get_size(); break; | 
|---|
| 1746 | case GSC_MARKED:    cmp = g2.get_marked()     - g1.get_marked(); break; | 
|---|
| 1747 | case GSC_MARKED_PC: cmp = g2.get_marked_pc()  - g1.get_marked_pc(); break; | 
|---|
| 1748 | case GSC_KEELED:    cmp = g2.get_keeled()     - g1.get_keeled(); break; | 
|---|
| 1749 | } | 
|---|
| 1750 | ++crit; | 
|---|
| 1751 | } | 
|---|
| 1752 | return reversed ? cmp>0 : cmp<0; | 
|---|
| 1753 | } | 
|---|
| 1754 | }; | 
|---|
| 1755 |  | 
|---|
| 1756 | void QueriedGroups::sort_by(const SortCriteria& by) { | 
|---|
| 1757 | stable_sort(found.begin(), found.end(), compare_by_criteria(by)); | 
|---|
| 1758 | sorted_by = &by; | 
|---|
| 1759 | } | 
|---|
| 1760 |  | 
|---|
| 1761 | void QueriedGroups::remove_hit(size_t idx) { | 
|---|
| 1762 | if (idx<size()) { | 
|---|
| 1763 | FoundGroupContainer::iterator del = found.begin(); | 
|---|
| 1764 | advance(del, idx); | 
|---|
| 1765 | found.erase(del); | 
|---|
| 1766 | invalidate_widths(); | 
|---|
| 1767 | } | 
|---|
| 1768 | } | 
|---|
| 1769 |  | 
|---|
| 1770 | const ColumnWidths& QueriedGroups::get_column_widths() const { | 
|---|
| 1771 | if (widths.isNull()) { | 
|---|
| 1772 | widths          = new ColumnWidths; | 
|---|
| 1773 | ColumnWidths& w = *widths; | 
|---|
| 1774 | for (FoundGroupCIter g = begin(); g != end(); ++g) { | 
|---|
| 1775 | g->track_max_widths(w); | 
|---|
| 1776 | } | 
|---|
| 1777 | } | 
|---|
| 1778 | return *widths; | 
|---|
| 1779 | } | 
|---|
| 1780 | const char *QueriedGroups::get_group_display(const FoundGroup& g, bool show_tree_name) const { | 
|---|
| 1781 | const ColumnWidths& width = get_column_widths(); // updates width information (if outdated) | 
|---|
| 1782 |  | 
|---|
| 1783 | static GBS_strstruct display; | 
|---|
| 1784 |  | 
|---|
| 1785 | display.erase(); | 
|---|
| 1786 |  | 
|---|
| 1787 | if (width.seen_keeled) display.put(g.get_keeled() ? KEELED_INDICATOR : ' '); | 
|---|
| 1788 | display.nprintf(width.name+1, "%-*s", width.name, g.get_name()); // insert name as 1st column | 
|---|
| 1789 |  | 
|---|
| 1790 | if (sorted_by) { | 
|---|
| 1791 | // generate display-string depending on active SortCriteria: | 
|---|
| 1792 | for (SortCriteria::const_iterator sc = sorted_by->begin(); sc != sorted_by->end(); ++sc) { | 
|---|
| 1793 | switch (*sc) { | 
|---|
| 1794 | case GSC_NONE: gs_assert(0); break; // invalid | 
|---|
| 1795 |  | 
|---|
| 1796 | case GSC_TREENAME:  // ignored (either already shown or only have one tree) | 
|---|
| 1797 | case GSC_TREEORDER: // dito | 
|---|
| 1798 | case GSC_REVERSE: | 
|---|
| 1799 | case GSC_NAME: | 
|---|
| 1800 | break;          // ignored for display | 
|---|
| 1801 |  | 
|---|
| 1802 | case GSC_HIT_REASON: | 
|---|
| 1803 | display.nprintf(width.reason+1, " %-*s", width.reason, g.get_hit_reason().c_str()); | 
|---|
| 1804 | break; | 
|---|
| 1805 |  | 
|---|
| 1806 | case GSC_NESTING: { | 
|---|
| 1807 | int nesting_width = ColumnWidths::max2width(width.max_nesting); | 
|---|
| 1808 | display.nprintf(nesting_width+1, " %*i", nesting_width, g.get_nesting()); | 
|---|
| 1809 | break; | 
|---|
| 1810 | } | 
|---|
| 1811 | case GSC_SIZE: { | 
|---|
| 1812 | int size_width = ColumnWidths::max2width(width.max_size); | 
|---|
| 1813 | display.nprintf(size_width+1, " %*i", size_width, g.get_size()); | 
|---|
| 1814 | break; | 
|---|
| 1815 | } | 
|---|
| 1816 | case GSC_MARKED: { | 
|---|
| 1817 | int marked_width = ColumnWidths::max2width(width.max_marked); | 
|---|
| 1818 | display.nprintf(marked_width+1, " %*i", marked_width, g.get_marked()); | 
|---|
| 1819 | break; | 
|---|
| 1820 | } | 
|---|
| 1821 | case GSC_MARKED_PC: { | 
|---|
| 1822 | int marked_width = ColumnWidths::max2width(width.max_marked_pc); | 
|---|
| 1823 | display.nprintf(marked_width+2, " %*i%%", marked_width, g.get_marked_pc()); | 
|---|
| 1824 | break; | 
|---|
| 1825 | } | 
|---|
| 1826 | case GSC_CLUSTER: { | 
|---|
| 1827 | int cluster_width = ColumnWidths::max2width(width.max_cluster_id); | 
|---|
| 1828 | display.nprintf(cluster_width+2, " %*ic", cluster_width, g.get_cluster_id()); | 
|---|
| 1829 | break; | 
|---|
| 1830 | } | 
|---|
| 1831 | case GSC_AID: { | 
|---|
| 1832 | int aid_width = ColumnWidths::max2width(width.max_aid); | 
|---|
| 1833 | display.nprintf(aid_width+6, " %*.4f", aid_width, g.get_aid()); | 
|---|
| 1834 | break; | 
|---|
| 1835 | } | 
|---|
| 1836 | case GSC_KEELED: { | 
|---|
| 1837 | display.nprintf(2, " %i", g.get_keeled()); | 
|---|
| 1838 | break; | 
|---|
| 1839 | } | 
|---|
| 1840 | } | 
|---|
| 1841 | } | 
|---|
| 1842 | } | 
|---|
| 1843 |  | 
|---|
| 1844 | if (show_tree_name) { | 
|---|
| 1845 | display.put(' '); | 
|---|
| 1846 | display.cat(g.get_tree_name()); | 
|---|
| 1847 | } | 
|---|
| 1848 |  | 
|---|
| 1849 | return display.get_data(); | 
|---|
| 1850 | } | 
|---|
| 1851 |  | 
|---|
| 1852 | void QueriedGroups::add_candidate(const GroupSearch& group_search, Candidate& cand, const std::string& hit_reason) { | 
|---|
| 1853 | cand.inform_group(group_search, hit_reason); | 
|---|
| 1854 | add_informed_group(cand.get_group()); | 
|---|
| 1855 | } | 
|---|
| 1856 |  | 
|---|
| 1857 |  | 
|---|
| 1858 | void GroupSearch::refresh_results_after_DBchanges() { | 
|---|
| 1859 | if (!found.isNull() && !found->empty()) { | 
|---|
| 1860 | bool erased  = found->erase_deleted(common); | 
|---|
| 1861 | bool changed = false; | 
|---|
| 1862 | if (!erased) { | 
|---|
| 1863 | changed = found->contains_changed(common); | 
|---|
| 1864 | } | 
|---|
| 1865 | if (erased || changed) { | 
|---|
| 1866 | redisplay_cb(this); | 
|---|
| 1867 | } | 
|---|
| 1868 | } | 
|---|
| 1869 | } | 
|---|
| 1870 |  | 
|---|
| 1871 | void GroupSearch::addSortCriterion(GroupSortCriterion gsc) { | 
|---|
| 1872 | /*! add new primary sort criterion | 
|---|
| 1873 | * previously added (different) criteria remain active, but become secondary, tertiary, ... | 
|---|
| 1874 | */ | 
|---|
| 1875 |  | 
|---|
| 1876 | if (gsc == GSC_NONE) { | 
|---|
| 1877 | forgetSortCriteria(); | 
|---|
| 1878 | } | 
|---|
| 1879 | else { | 
|---|
| 1880 | bool add = true; | 
|---|
| 1881 |  | 
|---|
| 1882 | if (!order.empty() && order.front() == gsc) { | 
|---|
| 1883 | add = false; | 
|---|
| 1884 | if (gsc == GSC_REVERSE) { | 
|---|
| 1885 | order.pop_front(); // eliminate duplicate reverse | 
|---|
| 1886 | sortedByOrder = false; | 
|---|
| 1887 | } | 
|---|
| 1888 | } | 
|---|
| 1889 |  | 
|---|
| 1890 | if (add) { | 
|---|
| 1891 | if (gsc != GSC_REVERSE) { | 
|---|
| 1892 | // remove duplicated search criterion from order | 
|---|
| 1893 | SortCriteria::iterator dup = find(order.begin(), order.end(), gsc); | 
|---|
| 1894 | if (dup != order.end()) { | 
|---|
| 1895 | SortCriteria::iterator pre = dup; | 
|---|
| 1896 | do --pre; while (pre != order.end() && *pre == GSC_REVERSE); | 
|---|
| 1897 |  | 
|---|
| 1898 | if (pre == order.end()) pre = order.begin(); // erase from start | 
|---|
| 1899 | else ++pre;                                  // step back to 1st GSC_REVERSE | 
|---|
| 1900 |  | 
|---|
| 1901 | ++dup; // point behind duplicate | 
|---|
| 1902 | order.erase(pre,dup); | 
|---|
| 1903 | } | 
|---|
| 1904 | } | 
|---|
| 1905 |  | 
|---|
| 1906 | order.push_front(gsc); | 
|---|
| 1907 | sortedByOrder = false; | 
|---|
| 1908 | } | 
|---|
| 1909 | } | 
|---|
| 1910 | } | 
|---|
| 1911 |  | 
|---|
| 1912 | void GroupSearch::sort_results() { | 
|---|
| 1913 | if (!order.empty()) { | 
|---|
| 1914 | GB_transaction ta(gb_main); | 
|---|
| 1915 | found->sort_by(order); | 
|---|
| 1916 | sortedByOrder = true; | 
|---|
| 1917 | } | 
|---|
| 1918 | } | 
|---|
| 1919 |  | 
|---|
| 1920 | void GroupSearch::setDupCriteria(bool listDups, DupNameCriterionType ntype, GB_CASE sens, DupTreeCriterionType ttype, int min_cluster_size) { | 
|---|
| 1921 | gs_assert(ntype != DNC_WORDWISE); // use flavor below | 
|---|
| 1922 | dups = new DupCriteria(listDups, DupNameCriterion(ntype, sens), ttype, min_cluster_size); | 
|---|
| 1923 | } | 
|---|
| 1924 | void GroupSearch::setDupCriteria(bool listDups, DupNameCriterionType ntype, GB_CASE sens, int min_words, const WordSet& ignored_words, const char *wordSeparators, DupTreeCriterionType ttype, int min_cluster_size) { | 
|---|
| 1925 | gs_assert(ntype == DNC_WORDWISE); // use flavor above | 
|---|
| 1926 | dups = new DupCriteria(listDups, DupNameCriterion(ntype, sens, min_words, ignored_words, wordSeparators), ttype, min_cluster_size); | 
|---|
| 1927 | } | 
|---|
| 1928 | void GroupSearch::setDupCriteria(bool listDups, DupNameCriterionType ntype, GB_CASE sens, int min_words, const char *ignored_words, const char *wordSeparators, DupTreeCriterionType ttype, int min_cluster_size) { | 
|---|
| 1929 | WordSet ignoredWordsSet; | 
|---|
| 1930 | WordSet none; // no words ignored in ignoredWordsSet | 
|---|
| 1931 | string2WordSet(ignored_words, ignoredWordsSet, wordSeparators, none); | 
|---|
| 1932 | setDupCriteria(listDups, ntype, sens, min_words, ignoredWordsSet, wordSeparators, ttype, min_cluster_size); | 
|---|
| 1933 | } | 
|---|
| 1934 |  | 
|---|
| 1935 |  | 
|---|
| 1936 | void GroupSearch::forgetDupCriteria() { | 
|---|
| 1937 | dups.setNull(); | 
|---|
| 1938 | } | 
|---|
| 1939 |  | 
|---|
| 1940 | GB_ERROR GroupSearch::delete_group(size_t idx) { | 
|---|
| 1941 | if (idx<found->size()) return (*found)[idx].delete_from_DB(); | 
|---|
| 1942 | return "index out-of-bounds"; | 
|---|
| 1943 | } | 
|---|
| 1944 |  | 
|---|
| 1945 | GB_ERROR GroupSearch::delete_found_groups() { | 
|---|
| 1946 | GB_ERROR error = NULp; // @@@ use ARB_ERROR instead (whole module + callers) | 
|---|
| 1947 | if (has_results()) { | 
|---|
| 1948 | GB_transaction ta(gb_main); | 
|---|
| 1949 |  | 
|---|
| 1950 | for (FoundGroupIter group = found->begin(); !error && group != found->end(); ++group) { | 
|---|
| 1951 | error = group->delete_from_DB(); | 
|---|
| 1952 | } | 
|---|
| 1953 | error = ta.close(error); | 
|---|
| 1954 | } | 
|---|
| 1955 | return error; | 
|---|
| 1956 | } | 
|---|
| 1957 |  | 
|---|
| 1958 | // ------------------------------------------ | 
|---|
| 1959 | //      ACI extension for group renaming | 
|---|
| 1960 |  | 
|---|
| 1961 | using namespace GBL_IMPL; | 
|---|
| 1962 |  | 
|---|
| 1963 | struct GroupRename_callenv : public GBL_call_env { | 
|---|
| 1964 | const QueriedGroups& queried; | 
|---|
| 1965 | int                  hit_idx; | 
|---|
| 1966 |  | 
|---|
| 1967 | GroupRename_callenv(const QueriedGroups& queried_, int hit_idx_, const GBL_env& env_) : | 
|---|
| 1968 | GBL_call_env(NULp, env_), | 
|---|
| 1969 | queried(queried_), | 
|---|
| 1970 | hit_idx(hit_idx_) | 
|---|
| 1971 | {} | 
|---|
| 1972 |  | 
|---|
| 1973 | bool legal_hit_index() const { return hit_idx>=0 && unsigned(hit_idx)<queried.size(); } | 
|---|
| 1974 |  | 
|---|
| 1975 | const FoundGroup *get_hit_group() const { | 
|---|
| 1976 | if (legal_hit_index()) return &queried[hit_idx]; | 
|---|
| 1977 | return NULp; | 
|---|
| 1978 | } | 
|---|
| 1979 | }; | 
|---|
| 1980 |  | 
|---|
| 1981 | inline const GroupRename_callenv& custom_env(GBL_command_arguments *args) { | 
|---|
| 1982 | return DOWNCAST_REFERENCE(const GroupRename_callenv, args->get_callEnv()); | 
|---|
| 1983 | } | 
|---|
| 1984 |  | 
|---|
| 1985 | static GB_ERROR grl_hitidx(GBL_command_arguments *args) { | 
|---|
| 1986 | COMMAND_DROPS_INPUT_STREAMS(args); | 
|---|
| 1987 | GB_ERROR error = check_no_parameter(args); | 
|---|
| 1988 | if (!error) { | 
|---|
| 1989 | const GroupRename_callenv& callEnv = custom_env(args); | 
|---|
| 1990 | if (callEnv.legal_hit_index()) { | 
|---|
| 1991 | FORMAT_2_OUT(args, "%i", info2bio(callEnv.hit_idx)); | 
|---|
| 1992 | } | 
|---|
| 1993 | else { | 
|---|
| 1994 | error = "no hit"; | 
|---|
| 1995 | } | 
|---|
| 1996 | } | 
|---|
| 1997 |  | 
|---|
| 1998 | return error; | 
|---|
| 1999 | } | 
|---|
| 2000 | static GB_ERROR grl_hitcount(GBL_command_arguments *args) { | 
|---|
| 2001 | COMMAND_DROPS_INPUT_STREAMS(args); | 
|---|
| 2002 | GB_ERROR error = check_no_parameter(args); | 
|---|
| 2003 | if (!error) { | 
|---|
| 2004 | const GroupRename_callenv& callEnv = custom_env(args); | 
|---|
| 2005 | FORMAT_2_OUT(args, "%zu", callEnv.queried.size()); | 
|---|
| 2006 | } | 
|---|
| 2007 | return error; | 
|---|
| 2008 | } | 
|---|
| 2009 | static GB_ERROR grl_groupsize(GBL_command_arguments *args) { | 
|---|
| 2010 | COMMAND_DROPS_INPUT_STREAMS(args); | 
|---|
| 2011 | GB_ERROR error = check_no_parameter(args); | 
|---|
| 2012 | if (!error) { | 
|---|
| 2013 | const FoundGroup *hit = custom_env(args).get_hit_group(); | 
|---|
| 2014 | if (hit) { | 
|---|
| 2015 | FORMAT_2_OUT(args, "%i", hit->get_size()); | 
|---|
| 2016 | } | 
|---|
| 2017 | else { | 
|---|
| 2018 | error = "no hit"; | 
|---|
| 2019 | } | 
|---|
| 2020 | } | 
|---|
| 2021 | return error; | 
|---|
| 2022 | } | 
|---|
| 2023 | static GB_ERROR grl_markedingroup(GBL_command_arguments *args) { | 
|---|
| 2024 | COMMAND_DROPS_INPUT_STREAMS(args); | 
|---|
| 2025 | GB_ERROR error = check_no_parameter(args); | 
|---|
| 2026 | if (!error) { | 
|---|
| 2027 | const FoundGroup *hit = custom_env(args).get_hit_group(); | 
|---|
| 2028 | if (hit) { | 
|---|
| 2029 | FORMAT_2_OUT(args, "%i", hit->get_marked()); | 
|---|
| 2030 | } | 
|---|
| 2031 | else { | 
|---|
| 2032 | error = "no hit"; | 
|---|
| 2033 | } | 
|---|
| 2034 | } | 
|---|
| 2035 | return error; | 
|---|
| 2036 | } | 
|---|
| 2037 | static GB_ERROR grl_aid(GBL_command_arguments *args) { | 
|---|
| 2038 | COMMAND_DROPS_INPUT_STREAMS(args); | 
|---|
| 2039 | GB_ERROR error = check_no_parameter(args); | 
|---|
| 2040 | if (!error) { | 
|---|
| 2041 | const FoundGroup *hit = custom_env(args).get_hit_group(); | 
|---|
| 2042 | if (hit) { | 
|---|
| 2043 | FORMAT_2_OUT(args, "%f", hit->get_aid()); | 
|---|
| 2044 | } | 
|---|
| 2045 | else { | 
|---|
| 2046 | error = "no hit"; | 
|---|
| 2047 | } | 
|---|
| 2048 | } | 
|---|
| 2049 | return error; | 
|---|
| 2050 | } | 
|---|
| 2051 | static GB_ERROR grl_nesting(GBL_command_arguments *args) { | 
|---|
| 2052 | COMMAND_DROPS_INPUT_STREAMS(args); | 
|---|
| 2053 | GB_ERROR error = check_no_parameter(args); | 
|---|
| 2054 | if (!error) { | 
|---|
| 2055 | const FoundGroup *hit = custom_env(args).get_hit_group(); | 
|---|
| 2056 | if (hit) { | 
|---|
| 2057 | FORMAT_2_OUT(args, "%i", hit->get_nesting()); | 
|---|
| 2058 | } | 
|---|
| 2059 | else { | 
|---|
| 2060 | error = "no hit"; | 
|---|
| 2061 | } | 
|---|
| 2062 | } | 
|---|
| 2063 | return error; | 
|---|
| 2064 | } | 
|---|
| 2065 |  | 
|---|
| 2066 |  | 
|---|
| 2067 | static GBL_command_definition groupRename_command_table[] = { | 
|---|
| 2068 | { "hitidx",        grl_hitidx }, | 
|---|
| 2069 | { "hitcount",      grl_hitcount }, | 
|---|
| 2070 | { "groupSize",     grl_groupsize }, | 
|---|
| 2071 | { "markedInGroup", grl_markedingroup }, | 
|---|
| 2072 | { "aid",           grl_aid }, | 
|---|
| 2073 | { "nesting",       grl_nesting }, | 
|---|
| 2074 |  | 
|---|
| 2075 | { NULp, NULp } | 
|---|
| 2076 | }; | 
|---|
| 2077 |  | 
|---|
| 2078 | static const GBL_command_lookup_table& get_GroupRename_customized_ACI_commands() { | 
|---|
| 2079 | static GBL_custom_command_lookup_table clt(groupRename_command_table, | 
|---|
| 2080 | ARRAY_ELEMS(groupRename_command_table)-1, | 
|---|
| 2081 | ACI_get_standard_commands()); | 
|---|
| 2082 | return clt; | 
|---|
| 2083 | } | 
|---|
| 2084 |  | 
|---|
| 2085 | char *GS_calc_resulting_groupname(GBDATA *gb_main, const QueriedGroups& queried, int hit_idx, const char *input_name, const char *acisrt, ARB_ERROR& error) { | 
|---|
| 2086 | char *result = NULp; | 
|---|
| 2087 | if (!input_name || !input_name[0]) { | 
|---|
| 2088 | error = "Error: empty input groupname"; | 
|---|
| 2089 | } | 
|---|
| 2090 | else { | 
|---|
| 2091 | GB_transaction    ta(gb_main); | 
|---|
| 2092 | bool              know_hit = hit_idx>=0 && unsigned(hit_idx)<queried.size(); | 
|---|
| 2093 | const FoundGroup *hit      = know_hit ? &queried[hit_idx] : NULp; | 
|---|
| 2094 |  | 
|---|
| 2095 | GBL_env             env(gb_main, hit ? hit->get_tree_name() : NULp, get_GroupRename_customized_ACI_commands()); | 
|---|
| 2096 | GroupRename_callenv callEnv(queried, hit_idx, env); | 
|---|
| 2097 |  | 
|---|
| 2098 | result = GB_command_interpreter_in_env(input_name, acisrt, callEnv); | 
|---|
| 2099 | if (!result) { | 
|---|
| 2100 | error = GBS_global_string("Error: %s", GB_await_error()); | 
|---|
| 2101 | } | 
|---|
| 2102 | else { | 
|---|
| 2103 | freeset(result, GBS_trim(result)); // trim whitespace | 
|---|
| 2104 | } | 
|---|
| 2105 | } | 
|---|
| 2106 | return result; | 
|---|
| 2107 | } | 
|---|
| 2108 |  | 
|---|
| 2109 | ARB_ERROR GroupSearch::rename_group(size_t idx, const char *acisrt) { | 
|---|
| 2110 | if (idx<found->size()) { | 
|---|
| 2111 | return (*found)[idx].rename_by_ACI(acisrt, *found, idx); | 
|---|
| 2112 | } | 
|---|
| 2113 | return "index out-of-bounds"; | 
|---|
| 2114 | } | 
|---|
| 2115 |  | 
|---|
| 2116 | ARB_ERROR GroupSearch::rename_found_groups(const char *acisrt) { | 
|---|
| 2117 | ARB_ERROR error; | 
|---|
| 2118 | if (has_results()) { | 
|---|
| 2119 | GB_transaction ta(gb_main); | 
|---|
| 2120 |  | 
|---|
| 2121 | MessageSpamFilter suppress("problematic group names"); | 
|---|
| 2122 |  | 
|---|
| 2123 | int idx = 0; | 
|---|
| 2124 | for (FoundGroupIter group = found->begin(); !error && group != found->end(); ++group, ++idx) { | 
|---|
| 2125 | error = group->rename_by_ACI(acisrt, *found, idx); | 
|---|
| 2126 | } | 
|---|
| 2127 | error = ta.close(error); | 
|---|
| 2128 | } | 
|---|
| 2129 | return error; | 
|---|
| 2130 | } | 
|---|
| 2131 |  | 
|---|
| 2132 | ARB_ERROR GroupSearch::fold_group(size_t idx, GroupFoldingMode mode) { | 
|---|
| 2133 | if (idx<found->size()) { | 
|---|
| 2134 | return (*found)[idx].change_folding(mode); | 
|---|
| 2135 | } | 
|---|
| 2136 | return "index out-of-bounds"; | 
|---|
| 2137 | } | 
|---|
| 2138 |  | 
|---|
| 2139 | GBDATA *GroupSearch::get_parent_group(GBDATA *gb_group) const { | 
|---|
| 2140 | // works for groups which are members of one of the searched tree | 
|---|
| 2141 | return common->get_parent_cache().lookupParent(gb_group); | 
|---|
| 2142 | } | 
|---|
| 2143 |  | 
|---|
| 2144 | int GroupSearch::calc_nesting_level(GBDATA *gb_group) const { | 
|---|
| 2145 | int nesting = 0; | 
|---|
| 2146 | while (gb_group) { | 
|---|
| 2147 | gb_group = get_parent_group(gb_group); | 
|---|
| 2148 | if (gb_group) ++nesting; | 
|---|
| 2149 | } | 
|---|
| 2150 | return nesting; | 
|---|
| 2151 | } | 
|---|
| 2152 |  | 
|---|
| 2153 |  | 
|---|
| 2154 | ARB_ERROR GroupSearch::fold_found_groups(GroupFoldingMode mode) { | 
|---|
| 2155 | ARB_ERROR      error; | 
|---|
| 2156 | GB_transaction ta(gb_main); | 
|---|
| 2157 |  | 
|---|
| 2158 | GBDATAset modifiedTrees; | 
|---|
| 2159 |  | 
|---|
| 2160 | // create a set of affected groups | 
|---|
| 2161 | GBDATAset targetGroups; | 
|---|
| 2162 | for (FoundGroupCIter g = found->begin(); g != found->end(); ++g) { | 
|---|
| 2163 | GBDATA *gb_group = g->get_pointer(); | 
|---|
| 2164 | targetGroups.insert(gb_group); | 
|---|
| 2165 | } | 
|---|
| 2166 |  | 
|---|
| 2167 | if (mode & GFM_RECURSE) { // also operate on parents | 
|---|
| 2168 | GBDATAset testParentsOf = targetGroups; | 
|---|
| 2169 | if (mode & GFM_PARENTS_ONLY) targetGroups.clear(); | 
|---|
| 2170 | while (!testParentsOf.empty()) { // redo until no more parents get added | 
|---|
| 2171 | GBDATAset addedParents; | 
|---|
| 2172 | for (GBDATAset::iterator t = testParentsOf.begin(); t != testParentsOf.end(); ++t) { | 
|---|
| 2173 | GBDATA *gb_parent_group = get_parent_group(*t); | 
|---|
| 2174 | if (gb_parent_group && targetGroups.find(gb_parent_group) == targetGroups.end()) { | 
|---|
| 2175 | addedParents.insert(gb_parent_group); | 
|---|
| 2176 | targetGroups.insert(gb_parent_group); | 
|---|
| 2177 | } | 
|---|
| 2178 | } | 
|---|
| 2179 | testParentsOf = addedParents; | 
|---|
| 2180 | } | 
|---|
| 2181 | } | 
|---|
| 2182 |  | 
|---|
| 2183 | GroupFoldingMode basicMode = GroupFoldingMode(mode & (GFM_EXPAND|GFM_TOGGLE)); | 
|---|
| 2184 | for (GBDATAset::iterator n = targetGroups.begin(); n != targetGroups.end() && !error; ++n) { | 
|---|
| 2185 | error = FoundGroup(*n).change_folding(basicMode); | 
|---|
| 2186 | } | 
|---|
| 2187 |  | 
|---|
| 2188 | if (!error && (mode & GFM_COLLAPSE_REST)) { // collapse everything else | 
|---|
| 2189 | SearchedTreeContainer searched_tree; | 
|---|
| 2190 | collect_searched_trees(gb_main, trees_to_search, searched_tree); | 
|---|
| 2191 |  | 
|---|
| 2192 | for (SearchedTreeIter t = searched_tree.begin(); t != searched_tree.end() && !error; ++t) { | 
|---|
| 2193 | GBDATA *gb_tree_data = t->get_tree_data(); | 
|---|
| 2194 | for (GBDATA *gb_node = GB_entry(gb_tree_data, "node"); gb_node && !error; gb_node = GB_nextEntry(gb_node)) { | 
|---|
| 2195 | GBDATA *gb_name = GB_entry(gb_node, "group_name"); | 
|---|
| 2196 | if (gb_name) { // named node (aka group) | 
|---|
| 2197 | if (targetGroups.find(gb_node) == targetGroups.end()) { // not already handled before | 
|---|
| 2198 | error = FoundGroup(gb_node).change_folding(GFM_COLLAPSE); | 
|---|
| 2199 | } | 
|---|
| 2200 | } | 
|---|
| 2201 | } | 
|---|
| 2202 | } | 
|---|
| 2203 | } | 
|---|
| 2204 |  | 
|---|
| 2205 | return ta.close(error); | 
|---|
| 2206 | } | 
|---|
| 2207 |  | 
|---|
| 2208 | ARB_ERROR GroupSearch::collectSpecies(const QueriedGroups& groups, CollectMode cmode, SpeciesNames& species) { | 
|---|
| 2209 | SearchedTreeContainer searched_tree; | 
|---|
| 2210 | collect_searched_trees(gb_main, trees_to_search, searched_tree); | 
|---|
| 2211 |  | 
|---|
| 2212 | ARB_ERROR error; | 
|---|
| 2213 | for (SearchedTreeIter t = searched_tree.begin(); t != searched_tree.end() && !error; ++t) { | 
|---|
| 2214 | GBDATAset groupsFoundInTree; | 
|---|
| 2215 | for (FoundGroupCIter g = groups.begin(); g != groups.end(); ++g) { | 
|---|
| 2216 | if (t->get_tree_data() == g->get_tree_data()) { | 
|---|
| 2217 | groupsFoundInTree.insert(g->get_pointer()); | 
|---|
| 2218 | } | 
|---|
| 2219 | } | 
|---|
| 2220 |  | 
|---|
| 2221 | if (!groupsFoundInTree.empty()) { | 
|---|
| 2222 | // iterate over tree and insert or intersect species from each group with set | 
|---|
| 2223 | GroupSearchRoot *troot = t->get_tree_root(); | 
|---|
| 2224 |  | 
|---|
| 2225 | ARB_edge start = rootEdge(troot); | 
|---|
| 2226 | ARB_edge e     = start; | 
|---|
| 2227 | do { | 
|---|
| 2228 | if (e.is_inner_edge() && e.get_type() != EDGE_TO_ROOT) { | 
|---|
| 2229 | TreeNode *node = e.dest(); | 
|---|
| 2230 | if (node->is_normal_group()) { | 
|---|
| 2231 | if (groupsFoundInTree.find(node->gb_node) != groupsFoundInTree.end()) { | 
|---|
| 2232 | // iterate all leafs in subtree and store in 'speciesInGroup' | 
|---|
| 2233 | SpeciesNames speciesInGroup; | 
|---|
| 2234 | ARB_edge     sub  = e; | 
|---|
| 2235 | ARB_edge     stop = sub.inverse(); | 
|---|
| 2236 |  | 
|---|
| 2237 | while (sub != stop) { | 
|---|
| 2238 | if (sub.is_edge_to_leaf()) { | 
|---|
| 2239 | TreeNode *leaf = sub.dest(); | 
|---|
| 2240 | if (leaf->name) speciesInGroup.insert(leaf->name); | 
|---|
| 2241 | } | 
|---|
| 2242 | sub = sub.next(); | 
|---|
| 2243 | } | 
|---|
| 2244 |  | 
|---|
| 2245 | if (species.empty()) { // simply add first group | 
|---|
| 2246 | gs_assert(!speciesInGroup.empty()); // tree broken? | 
|---|
| 2247 | species = speciesInGroup; | 
|---|
| 2248 | } | 
|---|
| 2249 | else { // intersect or unite two groups | 
|---|
| 2250 | SpeciesNames combined; | 
|---|
| 2251 | if (cmode == INTERSECT) { | 
|---|
| 2252 | set_intersection( | 
|---|
| 2253 | speciesInGroup.begin(), speciesInGroup.end(), | 
|---|
| 2254 | species.begin(), species.end(), | 
|---|
| 2255 | // combined.begin() | 
|---|
| 2256 | inserter(combined, combined.begin()) | 
|---|
| 2257 | ); | 
|---|
| 2258 |  | 
|---|
| 2259 | if (combined.empty()) { | 
|---|
| 2260 | error = "No species is member of ALL groups"; | 
|---|
| 2261 | } | 
|---|
| 2262 | } | 
|---|
| 2263 | else { | 
|---|
| 2264 | gs_assert(cmode == UNITE); | 
|---|
| 2265 | set_union( | 
|---|
| 2266 | speciesInGroup.begin(), speciesInGroup.end(), | 
|---|
| 2267 | species.begin(), species.end(), | 
|---|
| 2268 | // combined.begin() | 
|---|
| 2269 | inserter(combined, combined.begin()) | 
|---|
| 2270 | ); | 
|---|
| 2271 | } | 
|---|
| 2272 | species = combined; | 
|---|
| 2273 | } | 
|---|
| 2274 | } | 
|---|
| 2275 | } | 
|---|
| 2276 | } | 
|---|
| 2277 | e = e.next(); | 
|---|
| 2278 | } | 
|---|
| 2279 | while (e != start && !error); | 
|---|
| 2280 | } | 
|---|
| 2281 | } | 
|---|
| 2282 | return error; | 
|---|
| 2283 | } | 
|---|
| 2284 |  | 
|---|
| 2285 | static void set_marks_of(const SpeciesNames& targetSpecies, GBDATA *gb_main, GroupMarkMode mode) { | 
|---|
| 2286 | if (!targetSpecies.empty()) { | 
|---|
| 2287 | size_t found    = 0; | 
|---|
| 2288 | for (GBDATA *gb_species = GBT_first_species(gb_main); | 
|---|
| 2289 | gb_species; | 
|---|
| 2290 | gb_species = GBT_next_species(gb_species)) | 
|---|
| 2291 | { | 
|---|
| 2292 | const char *name = GBT_get_name_or_description(gb_species); | 
|---|
| 2293 | if (targetSpecies.find(name) != targetSpecies.end()) { | 
|---|
| 2294 | ++found; | 
|---|
| 2295 | if (mode == GMM_INVERT) { | 
|---|
| 2296 | UNCOVERED(); | 
|---|
| 2297 | GB_write_flag(gb_species, !GB_read_flag(gb_species)); | 
|---|
| 2298 | } | 
|---|
| 2299 | else { | 
|---|
| 2300 | UNCOVERED(); | 
|---|
| 2301 | GB_write_flag(gb_species, mode == GMM_MARK); | 
|---|
| 2302 | } | 
|---|
| 2303 | } | 
|---|
| 2304 | } | 
|---|
| 2305 | size_t targetted = targetSpecies.size(); | 
|---|
| 2306 | if (found<targetted) { | 
|---|
| 2307 | size_t zombies = targetted-found; | 
|---|
| 2308 | GBT_message(gb_main, GBS_global_string("Warning: Refused to touch %zu zombies", zombies)); | 
|---|
| 2309 | } | 
|---|
| 2310 | } | 
|---|
| 2311 | } | 
|---|
| 2312 |  | 
|---|
| 2313 | ARB_ERROR GroupSearch::set_marks_in_group(size_t idx, GroupMarkMode mode) { | 
|---|
| 2314 | ARB_ERROR error; | 
|---|
| 2315 | if (idx<found->size()) { | 
|---|
| 2316 | QueriedGroups groups; | 
|---|
| 2317 | groups.add_informed_group((*found)[idx]); | 
|---|
| 2318 |  | 
|---|
| 2319 | SpeciesNames targetSpecies; | 
|---|
| 2320 | error = collectSpecies(groups, UNITE, targetSpecies); | 
|---|
| 2321 | if (!error) set_marks_of(targetSpecies, gb_main, mode); | 
|---|
| 2322 | } | 
|---|
| 2323 | return error; | 
|---|
| 2324 | } | 
|---|
| 2325 | ARB_ERROR GroupSearch::set_marks_in_found_groups(GroupMarkMode mode, CollectMode cmode) { | 
|---|
| 2326 | // intersect == true -> affect only species which are members of ALL found groups | 
|---|
| 2327 | ARB_ERROR error; | 
|---|
| 2328 | if (has_results()) { | 
|---|
| 2329 | SpeciesNames targetSpecies; | 
|---|
| 2330 | error = collectSpecies(*found, cmode, targetSpecies); | 
|---|
| 2331 | if (!error) set_marks_of(targetSpecies, gb_main, mode); | 
|---|
| 2332 | } | 
|---|
| 2333 | return error; | 
|---|
| 2334 | } | 
|---|
| 2335 |  | 
|---|
| 2336 | struct GroupNameQueryKey : public ExplicitQueryKey { | 
|---|
| 2337 | char *get_target_data(const QueryTarget& target, GB_ERROR& /*error*/) const OVERRIDE { | 
|---|
| 2338 | const TargetGroup& target_group = DOWNCAST_REFERENCE(const TargetGroup, target); | 
|---|
| 2339 | return strdup(target_group.get_group_name()); // retrieve group name | 
|---|
| 2340 | } | 
|---|
| 2341 | const char *get_name() const OVERRIDE { return "name"; } | 
|---|
| 2342 | }; | 
|---|
| 2343 |  | 
|---|
| 2344 | struct GroupFoldedKey : public ExplicitQueryKey { | 
|---|
| 2345 | char *get_target_data(const QueryTarget& target, GB_ERROR& /*error*/) const OVERRIDE { | 
|---|
| 2346 | const TargetGroup& target_group = DOWNCAST_REFERENCE(const TargetGroup, target); | 
|---|
| 2347 | const FoundGroup&  group        = target_group.get_group(); | 
|---|
| 2348 |  | 
|---|
| 2349 | return GBS_global_string_copy("%i", int(group.is_folded())); | 
|---|
| 2350 | } | 
|---|
| 2351 | const char *get_name() const OVERRIDE { return "folded"; } | 
|---|
| 2352 | }; | 
|---|
| 2353 |  | 
|---|
| 2354 | struct GroupAIDkey : public ExplicitQueryKey { | 
|---|
| 2355 | char *get_target_data(const QueryTarget& target, GB_ERROR& /*error*/) const OVERRIDE { | 
|---|
| 2356 | const TargetGroup& target_group = DOWNCAST_REFERENCE(const TargetGroup, target); | 
|---|
| 2357 | return GBS_global_string_copy("%e", target_group.get_average_ingroup_distance()); | 
|---|
| 2358 | } | 
|---|
| 2359 | const char *get_name() const OVERRIDE { return "AID"; } | 
|---|
| 2360 | }; | 
|---|
| 2361 |  | 
|---|
| 2362 | struct GroupSizeKey : public ExplicitQueryKey { | 
|---|
| 2363 | char *get_target_data(const QueryTarget& target, GB_ERROR& /*error*/) const OVERRIDE { | 
|---|
| 2364 | const TargetGroup& target_group = DOWNCAST_REFERENCE(const TargetGroup, target); | 
|---|
| 2365 | return GBS_global_string_copy("%u", target_group.get_group_size()); | 
|---|
| 2366 | } | 
|---|
| 2367 | const char *get_name() const OVERRIDE { return "size"; } | 
|---|
| 2368 | }; | 
|---|
| 2369 | struct GroupKeeledKey : public ExplicitQueryKey { | 
|---|
| 2370 | char *get_target_data(const QueryTarget& target, GB_ERROR& /*error*/) const OVERRIDE { | 
|---|
| 2371 | const TargetGroup& target_group = DOWNCAST_REFERENCE(const TargetGroup, target); | 
|---|
| 2372 | return GBS_global_string_copy("%i", target_group.get_keeledStateInfo()); | 
|---|
| 2373 | } | 
|---|
| 2374 | const char *get_name() const OVERRIDE { return "keeled"; } | 
|---|
| 2375 | }; | 
|---|
| 2376 | struct GroupZombiesKey : public ExplicitQueryKey { | 
|---|
| 2377 | char *get_target_data(const QueryTarget& target, GB_ERROR& /*error*/) const OVERRIDE { | 
|---|
| 2378 | const TargetGroup& target_group = DOWNCAST_REFERENCE(const TargetGroup, target); | 
|---|
| 2379 | return GBS_global_string_copy("%u", target_group.get_zombie_count()); | 
|---|
| 2380 | } | 
|---|
| 2381 | const char *get_name() const OVERRIDE { return "zombies"; } | 
|---|
| 2382 | }; | 
|---|
| 2383 | class GroupMarkedKey : public ExplicitQueryKey { | 
|---|
| 2384 | bool percent; | 
|---|
| 2385 | public: | 
|---|
| 2386 | GroupMarkedKey(bool percent_) : | 
|---|
| 2387 | percent(percent_) | 
|---|
| 2388 | {} | 
|---|
| 2389 | char *get_target_data(const QueryTarget& target, GB_ERROR& /*error*/) const OVERRIDE { | 
|---|
| 2390 | const TargetGroup& target_group = DOWNCAST_REFERENCE(const TargetGroup, target); | 
|---|
| 2391 |  | 
|---|
| 2392 | int marked = target_group.get_marked_count(); | 
|---|
| 2393 | if (percent) { | 
|---|
| 2394 | int    size = target_group.get_group_size(); | 
|---|
| 2395 | double pc   = 100.0*marked/size; | 
|---|
| 2396 | return GBS_global_string_copy("%5.2f", pc); | 
|---|
| 2397 | } | 
|---|
| 2398 |  | 
|---|
| 2399 | return GBS_global_string_copy("%u", marked); | 
|---|
| 2400 | } | 
|---|
| 2401 | const char *get_name() const OVERRIDE { return "marked"; } | 
|---|
| 2402 | }; | 
|---|
| 2403 |  | 
|---|
| 2404 | class NestingLevelKey : public ExplicitQueryKey { | 
|---|
| 2405 | const GroupSearch& group_search; | 
|---|
| 2406 | public: | 
|---|
| 2407 | NestingLevelKey(const GroupSearch& group_search_) : | 
|---|
| 2408 | group_search(group_search_) | 
|---|
| 2409 | {} | 
|---|
| 2410 | char *get_target_data(const QueryTarget& target, GB_ERROR& /*error*/) const OVERRIDE { | 
|---|
| 2411 | const TargetGroup& target_group = DOWNCAST_REFERENCE(const TargetGroup, target); | 
|---|
| 2412 | const FoundGroup&  group        = target_group.get_group(); | 
|---|
| 2413 |  | 
|---|
| 2414 | return GBS_global_string_copy("%i", group_search.calc_nesting_level(group.get_pointer())); | 
|---|
| 2415 | } | 
|---|
| 2416 | const char *get_name() const OVERRIDE { return "nesting"; } | 
|---|
| 2417 | }; | 
|---|
| 2418 |  | 
|---|
| 2419 | class ParentGroupNameQueryKey: public QueryKey, virtual Noncopyable { | 
|---|
| 2420 | const GroupSearch& group_search; | 
|---|
| 2421 | bool               directParentOnly; // true -> direct parent; false -> any parent (iterates) | 
|---|
| 2422 |  | 
|---|
| 2423 | mutable GBDATA *gb_parent; | 
|---|
| 2424 | mutable int     distance; // 1=direct parent, 2=parent of direct parent,  ... | 
|---|
| 2425 |  | 
|---|
| 2426 | static inline query_key_type detectKeyType(CriterionType ctype) { | 
|---|
| 2427 | query_key_type qkt; | 
|---|
| 2428 | switch (ctype) { | 
|---|
| 2429 | case CT_PARENT_DIRECT: qkt = QKEY_EXPLICIT; break; | 
|---|
| 2430 | case CT_PARENT_ANY:    qkt = QKEY_ANY;      break; | 
|---|
| 2431 | case CT_PARENT_ALL:    qkt = QKEY_ALL;      break; | 
|---|
| 2432 | default: gs_assert(0); break; | 
|---|
| 2433 | } | 
|---|
| 2434 | return qkt; | 
|---|
| 2435 | } | 
|---|
| 2436 |  | 
|---|
| 2437 | public: | 
|---|
| 2438 | ParentGroupNameQueryKey(const GroupSearch& group_search_, CriterionType ctype) : | 
|---|
| 2439 | QueryKey(detectKeyType(ctype)), | 
|---|
| 2440 | group_search(group_search_), | 
|---|
| 2441 | directParentOnly(ctype == CT_PARENT_DIRECT), | 
|---|
| 2442 | gb_parent(NULp), | 
|---|
| 2443 | distance(0) | 
|---|
| 2444 | { | 
|---|
| 2445 | gs_assert(ctype == CT_PARENT_DIRECT || ctype == CT_PARENT_ANY || ctype == CT_PARENT_ALL); | 
|---|
| 2446 | } | 
|---|
| 2447 | ~ParentGroupNameQueryKey() OVERRIDE {} | 
|---|
| 2448 |  | 
|---|
| 2449 | char *get_target_data(const QueryTarget& target, GB_ERROR& /*error*/) const OVERRIDE { | 
|---|
| 2450 | // retrieve name of parent group | 
|---|
| 2451 | if (!gb_parent) { // search first (direct) parent | 
|---|
| 2452 | const TargetGroup& target_group = DOWNCAST_REFERENCE(const TargetGroup, target); | 
|---|
| 2453 | const FoundGroup&  group        = target_group.get_group(); | 
|---|
| 2454 |  | 
|---|
| 2455 | gb_parent = group_search.get_parent_group(group.get_pointer()); | 
|---|
| 2456 | ++distance; | 
|---|
| 2457 | if (!gb_parent) return strdup(""); // does not match "*" | 
|---|
| 2458 | } | 
|---|
| 2459 |  | 
|---|
| 2460 | FoundGroup parent(gb_parent); | 
|---|
| 2461 | return strdup(parent.get_name()); | 
|---|
| 2462 | } | 
|---|
| 2463 | const char *get_name() const OVERRIDE { | 
|---|
| 2464 | // name of target (e.g. for reports) | 
|---|
| 2465 | if (get_type() == QKEY_EXPLICIT) { // direct parent | 
|---|
| 2466 | return "parent-name"; | 
|---|
| 2467 | } | 
|---|
| 2468 |  | 
|---|
| 2469 | return GBS_global_string("parent-%i-name", distance); | 
|---|
| 2470 | } | 
|---|
| 2471 | bool iterate() const OVERRIDE { | 
|---|
| 2472 | // iterate key to next entry (not for QKEY_EXPLICIT) | 
|---|
| 2473 | if (gb_parent && get_type() != QKEY_EXPLICIT) { | 
|---|
| 2474 | gb_parent = group_search.get_parent_group(gb_parent); | 
|---|
| 2475 | ++distance; | 
|---|
| 2476 | return gb_parent; | 
|---|
| 2477 | } | 
|---|
| 2478 | return false; | 
|---|
| 2479 | } | 
|---|
| 2480 | void reset() const OVERRIDE { | 
|---|
| 2481 | // reset iteration | 
|---|
| 2482 | gb_parent = NULp; | 
|---|
| 2483 | distance  = 0; | 
|---|
| 2484 | } | 
|---|
| 2485 |  | 
|---|
| 2486 | }; | 
|---|
| 2487 |  | 
|---|
| 2488 | void GroupSearch::addQueryExpression(CriterionOperator op, CriterionType type, CriterionMatch mtype, const char *expression) { | 
|---|
| 2489 | query_operator aqo = ILLEGAL; | 
|---|
| 2490 |  | 
|---|
| 2491 | if (query_expr.isNull()) { | 
|---|
| 2492 | aqo = OR; // first is always OR | 
|---|
| 2493 | } | 
|---|
| 2494 | else { | 
|---|
| 2495 | switch (op) { | 
|---|
| 2496 | case CO_AND: aqo = AND; break; | 
|---|
| 2497 | case CO_OR: aqo  = OR; break; | 
|---|
| 2498 | case CO_IGNORE: | 
|---|
| 2499 | return; // ignore this expression | 
|---|
| 2500 | } | 
|---|
| 2501 | } | 
|---|
| 2502 |  | 
|---|
| 2503 | QueryKeyPtr key; | 
|---|
| 2504 | switch (type) { | 
|---|
| 2505 | case CT_NAME:          key = new GroupNameQueryKey; break; | 
|---|
| 2506 | case CT_FOLDED:        key = new GroupFoldedKey; break; | 
|---|
| 2507 | case CT_NESTING_LEVEL: key = new NestingLevelKey(*this); break; | 
|---|
| 2508 | case CT_SIZE:          key = new GroupSizeKey; break; | 
|---|
| 2509 | case CT_MARKED:        key = new GroupMarkedKey(false); break; | 
|---|
| 2510 | case CT_MARKED_PC:     key = new GroupMarkedKey(true); break; | 
|---|
| 2511 | case CT_ZOMBIES:       key = new GroupZombiesKey; break; | 
|---|
| 2512 |  | 
|---|
| 2513 | case CT_PARENT_DIRECT: | 
|---|
| 2514 | case CT_PARENT_ANY: | 
|---|
| 2515 | case CT_PARENT_ALL:    key = new ParentGroupNameQueryKey(*this, type); break; | 
|---|
| 2516 |  | 
|---|
| 2517 | case CT_AID:           key = new GroupAIDkey; break; | 
|---|
| 2518 | case CT_KEELED:        key = new GroupKeeledKey; break; | 
|---|
| 2519 | } | 
|---|
| 2520 |  | 
|---|
| 2521 | QueryExpr *qe = new QueryExpr(aqo, key, mtype == CM_MISMATCH, expression); | 
|---|
| 2522 | if (query_expr.isNull()) { // store 1st | 
|---|
| 2523 | query_expr = qe; | 
|---|
| 2524 | } | 
|---|
| 2525 | else { // append others | 
|---|
| 2526 | query_expr->append(qe); | 
|---|
| 2527 | } | 
|---|
| 2528 | } | 
|---|
| 2529 | void GroupSearch::forgetQExpressions() { | 
|---|
| 2530 | query_expr.setNull(); | 
|---|
| 2531 | } | 
|---|
| 2532 |  | 
|---|
| 2533 |  | 
|---|
| 2534 | // -------------------------------------------------------------------------------- | 
|---|
| 2535 |  | 
|---|
| 2536 | #ifdef UNIT_TESTS | 
|---|
| 2537 | #ifndef TEST_UNIT_H | 
|---|
| 2538 | #include <test_unit.h> | 
|---|
| 2539 | #endif | 
|---|
| 2540 |  | 
|---|
// Selects how groupListingIs() renders each listed group.
enum GroupListType {
    GLT_NAME,            // name only
    GLT_NAME_TREE,       // name + tree
    GLT_NAME_SIZE,       // name + size
    GLT_NAME_AID,        // name + AID
    GLT_CLUST_NT,        // cluster, name + tree
    GLT_NAME_FOLD,       // shows folding state
    GLT_NAME_AND_PARENT, // shows parent relation (using ParentCache)
    GLT_KNAME_NEST,      // shows keeled state and nesting
};
|---|
| 2551 |  | 
|---|
| 2552 | static arb_test::match_expectation groupListingIs(const QueriedGroups& foundGroups, GroupListType type, const char *expected_entries) { | 
|---|
| 2553 | using namespace arb_test; | 
|---|
| 2554 |  | 
|---|
| 2555 | ParentCache& pcache = GroupSearch::get_common()->get_parent_cache(); | 
|---|
| 2556 |  | 
|---|
| 2557 | StrArray entries; | 
|---|
| 2558 | for (FoundGroupCIter g = foundGroups.begin(); g != foundGroups.end(); ++g) { | 
|---|
| 2559 | switch (type) { | 
|---|
| 2560 | case GLT_NAME: | 
|---|
| 2561 | entries.put(strdup(g->get_name())); | 
|---|
| 2562 | break; | 
|---|
| 2563 |  | 
|---|
| 2564 | case GLT_NAME_TREE: | 
|---|
| 2565 | entries.put(GBS_global_string_copy("%s/%s", g->get_name(), g->get_tree_name())); | 
|---|
| 2566 | break; | 
|---|
| 2567 |  | 
|---|
| 2568 | case GLT_NAME_SIZE: | 
|---|
| 2569 | entries.put(GBS_global_string_copy("%s(%i)", g->get_name(), g->get_size())); | 
|---|
| 2570 | break; | 
|---|
| 2571 |  | 
|---|
| 2572 | case GLT_NAME_AID: | 
|---|
| 2573 | entries.put(GBS_global_string_copy("%s(%.4f)", g->get_name(), g->get_aid())); | 
|---|
| 2574 | break; | 
|---|
| 2575 |  | 
|---|
| 2576 | case GLT_CLUST_NT: | 
|---|
| 2577 | entries.put(GBS_global_string_copy("%i/%s/%s", g->get_cluster_id(), g->get_name(), g->get_tree_name())); | 
|---|
| 2578 | break; | 
|---|
| 2579 |  | 
|---|
| 2580 | case GLT_NAME_FOLD: { | 
|---|
| 2581 | const char *format = g->is_folded() ? "[%s]" : "%s"; | 
|---|
| 2582 | entries.put(GBS_global_string_copy(format, g->get_name())); | 
|---|
| 2583 | break; | 
|---|
| 2584 | } | 
|---|
| 2585 | case GLT_NAME_AND_PARENT: { | 
|---|
| 2586 | GBDATA *gb_parent = pcache.lookupParent(g->get_pointer()); | 
|---|
| 2587 | if (gb_parent) { | 
|---|
| 2588 | entries.put(GBS_global_string_copy("%s<%s>", FoundGroup(gb_parent).get_name(), g->get_name())); | 
|---|
| 2589 | } | 
|---|
| 2590 | else { | 
|---|
| 2591 | entries.put(strdup(g->get_name())); | 
|---|
| 2592 | } | 
|---|
| 2593 | break; | 
|---|
| 2594 | } | 
|---|
| 2595 | case GLT_KNAME_NEST: { | 
|---|
| 2596 | int         kstate  = g->get_keeled(); | 
|---|
| 2597 | const char *kprefix = kstate ? (kstate == 1 ? "!" : "?") : ""; | 
|---|
| 2598 | entries.put(GBS_global_string_copy("%s%s(L%i)", kprefix, g->get_name(), g->get_nesting())); | 
|---|
| 2599 | break; | 
|---|
| 2600 | } | 
|---|
| 2601 | } | 
|---|
| 2602 | } | 
|---|
| 2603 |  | 
|---|
| 2604 | SmartCharPtr  found_entriesP = GBT_join_strings(entries, '*'); | 
|---|
| 2605 | const char   *found_entries = &*found_entriesP; | 
|---|
| 2606 | return that(found_entries).is_equal_to(expected_entries); | 
|---|
| 2607 | } | 
|---|
| 2608 |  | 
|---|
| 2609 | static arb_test::match_expectation speciesInGroupsAre(GroupSearch& gs, CollectMode cmode, const char *expected_species) { | 
|---|
| 2610 | using namespace   arb_test; | 
|---|
| 2611 | expectation_group fulfilled; | 
|---|
| 2612 |  | 
|---|
| 2613 | SpeciesNames species; | 
|---|
| 2614 | { | 
|---|
| 2615 | const QueriedGroups& groups = gs.get_results(); | 
|---|
| 2616 | ARB_ERROR            error  = gs.collectSpecies(groups, cmode, species); | 
|---|
| 2617 | fulfilled.add(doesnt_report_error(error)); | 
|---|
| 2618 | } | 
|---|
| 2619 |  | 
|---|
| 2620 | ConstStrArray entries; | 
|---|
| 2621 | for (SpeciesNames::const_iterator n = species.begin(); n != species.end(); ++n) { | 
|---|
| 2622 | entries.put(n->c_str()); | 
|---|
| 2623 | } | 
|---|
| 2624 | entries.sort(GB_string_comparator, NULp); | 
|---|
| 2625 |  | 
|---|
| 2626 | SmartCharPtr  contained_speciesP = GBT_join_strings(entries, ','); | 
|---|
| 2627 | const char   *contained_species  = &*contained_speciesP; | 
|---|
| 2628 | fulfilled.add(that(contained_species).is_equal_to(expected_species)); | 
|---|
| 2629 |  | 
|---|
| 2630 | return all().ofgroup(fulfilled); | 
|---|
| 2631 | } | 
|---|
| 2632 |  | 
|---|
| 2633 | static arb_test::match_expectation resultListingIs(GroupSearch& gs, GroupListType type, const char *expected_entries) { | 
|---|
| 2634 | using namespace arb_test; | 
|---|
| 2635 |  | 
|---|
| 2636 | const QueriedGroups& results = gs.get_results(); | 
|---|
| 2637 | GB_transaction       ta(gs.get_gb_main()); | 
|---|
| 2638 |  | 
|---|
| 2639 | return groupListingIs(results, type, expected_entries); | 
|---|
| 2640 | } | 
|---|
| 2641 |  | 
|---|
| 2642 | static arb_test::match_expectation hasOrder(const GroupSearch& gs, const char *expected_order) { | 
|---|
| 2643 | using namespace arb_test; | 
|---|
| 2644 |  | 
|---|
| 2645 | const int MAX_ORDER = 20; | 
|---|
| 2646 | char      found_order[MAX_ORDER]; | 
|---|
| 2647 | int       off       = 0; | 
|---|
| 2648 |  | 
|---|
| 2649 | const SortCriteria& order = gs.inspect_order(); | 
|---|
| 2650 | for (SortCriteria::const_iterator i = order.begin(); i != order.end(); ++i) { | 
|---|
| 2651 | char c = '?'; | 
|---|
| 2652 | switch (*i) { | 
|---|
| 2653 | case GSC_NONE:       c = '_'; break; | 
|---|
| 2654 | case GSC_NAME:       c = 'N'; break; | 
|---|
| 2655 | case GSC_TREENAME:   c = 'T'; break; | 
|---|
| 2656 | case GSC_TREEORDER:  c = 'O'; break; | 
|---|
| 2657 | case GSC_REVERSE:    c = '!'; break; | 
|---|
| 2658 | case GSC_HIT_REASON: c = 'R'; break; // @@@ untested | 
|---|
| 2659 | case GSC_NESTING:    c = 'G'; break; // --- dito --- | 
|---|
| 2660 | case GSC_SIZE:       c = 'S'; break; // --- dito --- | 
|---|
| 2661 | case GSC_MARKED:     c = 'M'; break; // --- dito --- | 
|---|
| 2662 | case GSC_MARKED_PC:  c = '%'; break; // --- dito --- | 
|---|
| 2663 | case GSC_CLUSTER:    c = 'C'; break; | 
|---|
| 2664 | case GSC_AID:        c = 'A'; break; | 
|---|
| 2665 | case GSC_KEELED:     c = 'k'; break; | 
|---|
| 2666 | } | 
|---|
| 2667 | found_order[off++] = c; | 
|---|
| 2668 | } | 
|---|
| 2669 | gs_assert(off<MAX_ORDER); | 
|---|
| 2670 | found_order[off] = 0; | 
|---|
| 2671 | return that(found_order).is_equal_to(expected_order); | 
|---|
| 2672 | } | 
|---|
| 2673 |  | 
|---|
| 2674 | static arb_test::match_expectation addingCriterionProduces(GroupSearch& gs, GroupSortCriterion crit, const char *expected_order, const char *expected_entries) { | 
|---|
| 2675 | using namespace   arb_test; | 
|---|
| 2676 | expectation_group fulfilled; | 
|---|
| 2677 |  | 
|---|
| 2678 | gs.addSortCriterion(crit); | 
|---|
| 2679 |  | 
|---|
| 2680 | fulfilled.add(hasOrder(gs, expected_order)); | 
|---|
| 2681 | fulfilled.add(resultListingIs(gs, GLT_NAME_TREE, expected_entries)); | 
|---|
| 2682 |  | 
|---|
| 2683 | return all().ofgroup(fulfilled); | 
|---|
| 2684 | } | 
|---|
| 2685 |  | 
|---|
// Number of times trace_refresh_cb() has been called (reset by the tests as needed).
static int refreshes_traced = 0;

// Callback which only counts how often it gets called.
static void trace_refresh_cb() {
    ++refreshes_traced;
}
|---|
| 2688 |  | 
|---|
| 2689 | void TEST_group_search() { | 
|---|
| 2690 | GB_shell  shell; | 
|---|
| 2691 | GBDATA   *gb_main = GB_open("../../demo.arb", "r"); | 
|---|
| 2692 |  | 
|---|
| 2693 | GroupSearchCallback traceRefresh_cb = makeGroupSearchCallback(trace_refresh_cb); | 
|---|
| 2694 | refreshes_traced = 0; | 
|---|
| 2695 |  | 
|---|
| 2696 | { | 
|---|
| 2697 | GroupSearch allGroups(gb_main, traceRefresh_cb); | 
|---|
| 2698 | TEST_EXPECT(allGroups.get_results().empty()); | 
|---|
| 2699 |  | 
|---|
| 2700 | allGroups.perform_search(GSM_FIND); | 
|---|
| 2701 | TEST_EXPECT(!allGroups.get_results().empty()); | 
|---|
| 2702 | TEST_EXPECT_EQUAL(allGroups.get_results().size(), 28); | 
|---|
| 2703 | TEST_EXPECTATION(resultListingIs(allGroups, GLT_NAME_TREE, | 
|---|
| 2704 | "last/tree_test*another group/tree_test*outer/tree_test*inner/tree_test*test/tree_test*outer/tree_test*test/tree_test*xx/tree_test*" | 
|---|
| 2705 | "outer/tree_tree2*g2/tree_tree2*xx/tree_tree2*test/tree_tree2*outer/tree_tree2*inner/tree_tree2*test/tree_tree2*" | 
|---|
| 2706 | "zombsub/tree_zomb*zomb/tree_zomb*ZOMB/tree_zomb*dup/tree_zomb*inner outer group/tree_zomb*inner group/tree_zomb*outer group/tree_zomb*g4/tree_zomb*g3/tree_zomb*g2/tree_zomb*xx/tree_zomb*yy/tree_zomb*eee/tree_zomb" | 
|---|
| 2707 | )); | 
|---|
| 2708 |  | 
|---|
| 2709 | TEST_EXPECTATION(hasOrder(allGroups, "")); | 
|---|
| 2710 | allGroups.addSortCriterion(GSC_NAME); // sort by name | 
|---|
| 2711 | TEST_EXPECTATION(hasOrder(allGroups, "N")); | 
|---|
| 2712 | TEST_EXPECTATION(resultListingIs(allGroups, GLT_NAME_TREE, | 
|---|
| 2713 | "ZOMB/tree_zomb*" // @@@ should be sorted case insensitive | 
|---|
| 2714 | "another group/tree_test*dup/tree_zomb*eee/tree_zomb*" | 
|---|
| 2715 | "g2/tree_tree2*g2/tree_zomb*" | 
|---|
| 2716 | "g3/tree_zomb*g4/tree_zomb*" | 
|---|
| 2717 | "inner/tree_test*inner/tree_tree2*"                                  // order is stable | 
|---|
| 2718 | "inner group/tree_zomb*inner outer group/tree_zomb*last/tree_test*" | 
|---|
| 2719 | "outer/tree_test*outer/tree_test*outer/tree_tree2*outer/tree_tree2*" // order is stable | 
|---|
| 2720 | "outer group/tree_zomb*" | 
|---|
| 2721 | "test/tree_test*test/tree_test*test/tree_tree2*test/tree_tree2*"     // order is stable | 
|---|
| 2722 | "xx/tree_test*xx/tree_tree2*xx/tree_zomb*"                           // order is stable | 
|---|
| 2723 | "yy/tree_zomb*zomb/tree_zomb*zombsub/tree_zomb" | 
|---|
| 2724 | )); | 
|---|
| 2725 |  | 
|---|
| 2726 | // search only in tree_tree2 | 
|---|
| 2727 | TreeNameSet tree2; | 
|---|
| 2728 | tree2.insert("tree_tree2"); | 
|---|
| 2729 | allGroups.setSearchRange(tree2); | 
|---|
| 2730 | allGroups.perform_search(GSM_FIND); | 
|---|
| 2731 | TEST_EXPECT_EQUAL(allGroups.get_results().size(), 7); | 
|---|
| 2732 | TEST_EXPECTATION(hasOrder(allGroups, "N")); // results still sorted by name (sort criteria are not reset by new search) | 
|---|
| 2733 | TEST_EXPECTATION(resultListingIs(allGroups, GLT_NAME_TREE, "g2/tree_tree2*inner/tree_tree2*outer/tree_tree2*outer/tree_tree2*test/tree_tree2*test/tree_tree2*xx/tree_tree2")); | 
|---|
| 2734 | } | 
|---|
| 2735 |  | 
|---|
| 2736 | { | 
|---|
| 2737 | GroupSearch some(gb_main, traceRefresh_cb); | 
|---|
| 2738 |  | 
|---|
| 2739 | some.addQueryExpression(CO_OR, CT_NAME, CM_MATCH, "*ou*"); | 
|---|
| 2740 |  | 
|---|
| 2741 | some.perform_search(GSM_FIND); | 
|---|
| 2742 | TEST_EXPECTATION(resultListingIs(some, GLT_NAME, "another group*outer*outer*outer*outer*inner outer group*inner group*outer group")); | 
|---|
| 2743 | TEST_EXPECT_EQUAL(some.get_results().get_column_widths().name, 17); | 
|---|
| 2744 |  | 
|---|
| 2745 | // test 2nd filter | 
|---|
| 2746 | some.forgetQExpressions(); | 
|---|
| 2747 | some.addQueryExpression(CO_OR, CT_NAME, CM_MATCH, "*er"); | 
|---|
| 2748 | some.perform_search(GSM_FIND); | 
|---|
| 2749 | TEST_EXPECTATION(resultListingIs(some, GLT_NAME_TREE, "outer/tree_test*inner/tree_test*outer/tree_test*outer/tree_tree2*outer/tree_tree2*inner/tree_tree2")); | 
|---|
| 2750 | TEST_EXPECT_EQUAL(some.get_results().get_column_widths().name, 5); | 
|---|
| 2751 |  | 
|---|
| 2752 | { | 
|---|
| 2753 | // test order | 
|---|
| 2754 | const char *BY_NAME_FWD = "inner/tree_test*inner/tree_tree2*outer/tree_test*outer/tree_test*outer/tree_tree2*outer/tree_tree2"; | 
|---|
| 2755 | const char *BY_NAME_REV = "outer/tree_test*outer/tree_test*outer/tree_tree2*outer/tree_tree2*inner/tree_test*inner/tree_tree2"; | 
|---|
| 2756 |  | 
|---|
| 2757 | TEST_EXPECTATION(addingCriterionProduces(some, GSC_NAME,    "N",  BY_NAME_FWD)); | 
|---|
| 2758 | TEST_EXPECTATION(addingCriterionProduces(some, GSC_REVERSE, "!N", BY_NAME_REV)); | 
|---|
| 2759 | TEST_EXPECTATION(addingCriterionProduces(some, GSC_NAME,    "N",  BY_NAME_FWD)); | 
|---|
| 2760 |  | 
|---|
| 2761 | // test multiple "reverse" criteria | 
|---|
| 2762 | TEST_EXPECTATION(addingCriterionProduces(some, GSC_REVERSE, "!N", BY_NAME_REV)); | 
|---|
| 2763 | TEST_EXPECTATION(addingCriterionProduces(some, GSC_REVERSE, "N",  BY_NAME_FWD)); | 
|---|
| 2764 | TEST_EXPECTATION(addingCriterionProduces(some, GSC_REVERSE, "!N", BY_NAME_REV)); | 
|---|
| 2765 |  | 
|---|
| 2766 | // test sort by treename | 
|---|
| 2767 | TEST_EXPECTATION(addingCriterionProduces(some, GSC_TREENAME, "T!N",  "outer/tree_test*outer/tree_test*inner/tree_test*outer/tree_tree2*outer/tree_tree2*inner/tree_tree2")); | 
|---|
| 2768 | TEST_EXPECTATION(addingCriterionProduces(some, GSC_REVERSE,  "!T!N", "inner/tree_tree2*outer/tree_tree2*outer/tree_tree2*inner/tree_test*outer/tree_test*outer/tree_test")); | 
|---|
| 2769 |  | 
|---|
| 2770 | // test sort by tree-order (as specified in tree-admin) | 
|---|
| 2771 | TEST_EXPECTATION(addingCriterionProduces(some, GSC_TREEORDER, "O!T!N",  "inner/tree_test*outer/tree_test*outer/tree_test*inner/tree_tree2*outer/tree_tree2*outer/tree_tree2")); | 
|---|
| 2772 | TEST_EXPECTATION(addingCriterionProduces(some, GSC_REVERSE,   "!O!T!N", "outer/tree_tree2*outer/tree_tree2*inner/tree_tree2*outer/tree_test*outer/tree_test*inner/tree_test")); | 
|---|
| 2773 |  | 
|---|
| 2774 | some.forgetSortCriteria(); | 
|---|
| 2775 | } | 
|---|
| 2776 |  | 
|---|
| 2777 | // combine both filters (conjunction will only report 'outer') | 
|---|
| 2778 | some.addQueryExpression(CO_AND, CT_NAME, CM_MATCH, "*ou*"); | 
|---|
| 2779 | some.perform_search(GSM_FIND); | 
|---|
| 2780 | TEST_EXPECTATION(resultListingIs(some, GLT_NAME_TREE, "outer/tree_test*outer/tree_test*outer/tree_tree2*outer/tree_tree2")); | 
|---|
| 2781 |  | 
|---|
| 2782 | // test adding results | 
|---|
| 2783 | some.forgetQExpressions(); | 
|---|
| 2784 | some.addQueryExpression(CO_OR, CT_NAME, CM_MATCH, "*xx*"); | 
|---|
| 2785 | some.perform_search(GSM_ADD); | 
|---|
| 2786 | TEST_EXPECTATION(resultListingIs(some, GLT_NAME, "outer*outer*outer*outer*xx*xx*xx")); | 
|---|
| 2787 |  | 
|---|
| 2788 | some.forgetQExpressions(); | 
|---|
| 2789 | some.addQueryExpression(CO_OR, CT_NAME, CM_MATCH, "*er*"); | 
|---|
| 2790 | some.perform_search(GSM_ADD); // check no duplicates are reported (filter also matches 'outer') | 
|---|
| 2791 | TEST_EXPECTATION(resultListingIs(some, GLT_NAME, "outer*outer*outer*outer*xx*xx*xx*another group*inner*inner*inner outer group*inner group*outer group")); | 
|---|
| 2792 |  | 
|---|
| 2793 | // test removing a single result | 
|---|
| 2794 | { | 
|---|
| 2795 | some.addSortCriterion(GSC_TREEORDER); // first change order to make removal comprehensible | 
|---|
| 2796 | TEST_EXPECTATION(resultListingIs(some, GLT_NAME, "outer*outer*xx*another group*inner*outer*outer*xx*inner*xx*inner outer group*inner group*outer group")); | 
|---|
| 2797 |  | 
|---|
| 2798 | const char *FIRST_XX_REMOVED = "outer*outer*another group*inner*outer*outer*xx*inner*xx*inner outer group*inner group*outer group"; | 
|---|
| 2799 | some.remove_hit(2); // remove first 'xx' | 
|---|
| 2800 | TEST_EXPECTATION(resultListingIs(some, GLT_NAME, FIRST_XX_REMOVED)); | 
|---|
| 2801 | // test that out-of-bounds removals are NOOPs: | 
|---|
| 2802 | some.remove_hit(-10); TEST_EXPECTATION(resultListingIs(some, GLT_NAME, FIRST_XX_REMOVED)); | 
|---|
| 2803 | some.remove_hit(100); TEST_EXPECTATION(resultListingIs(some, GLT_NAME, FIRST_XX_REMOVED)); | 
|---|
| 2804 | } | 
|---|
| 2805 |  | 
|---|
| 2806 | // test keeping results | 
|---|
| 2807 | some.forgetQExpressions(); | 
|---|
| 2808 | some.addQueryExpression(CO_OR, CT_NAME, CM_MATCH, "*ou*"); | 
|---|
| 2809 | some.perform_search(GSM_KEEP); | 
|---|
| 2810 | TEST_EXPECTATION(resultListingIs(some, GLT_NAME, "outer*outer*another group*outer*outer*inner outer group*inner group*outer group")); | 
|---|
| 2811 |  | 
|---|
| 2812 | // test removing results (also tests "mismatch") | 
|---|
| 2813 | some.forgetQExpressions(); | 
|---|
| 2814 | some.addQueryExpression(CO_OR, CT_NAME, CM_MATCH, "outer"); | 
|---|
| 2815 | some.perform_search(GSM_REMOVE); | 
|---|
| 2816 | TEST_EXPECTATION(resultListingIs(some, GLT_NAME, "another group*inner outer group*inner group*outer group")); | 
|---|
| 2817 | } | 
|---|
| 2818 |  | 
|---|
| 2819 | // test different search keys | 
|---|
| 2820 | { | 
|---|
| 2821 | GroupSearch keyed(gb_main, traceRefresh_cb); | 
|---|
| 2822 | const char *TOP_GROUPS = "last*another group*outer*test*outer*outer*zombsub*dup*inner outer group"; | 
|---|
| 2823 |  | 
|---|
| 2824 | // CT_PARENT_DIRECT (direct parent group name) | 
|---|
| 2825 | keyed.addQueryExpression(CO_OR, CT_PARENT_DIRECT, CM_MATCH, ""); // direct parent w/o name (=no direct parent) | 
|---|
| 2826 | keyed.perform_search(GSM_FIND); | 
|---|
| 2827 | TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME_AND_PARENT, TOP_GROUPS));  // -> TOP_GROUPS | 
|---|
| 2828 |  | 
|---|
| 2829 | keyed.forgetQExpressions(); | 
|---|
| 2830 | keyed.addQueryExpression(CO_OR, CT_PARENT_DIRECT, CM_MATCH, "/^[^ ]*ou[^ ]*$/"); // uses regular expression query | 
|---|
| 2831 | keyed.perform_search(GSM_FIND); | 
|---|
| 2832 | TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME_AND_PARENT, "outer<inner>*outer<test>*outer<xx>*outer<g2>*outer<test>*outer<inner>*outer<test>")); | 
|---|
| 2833 |  | 
|---|
| 2834 | // CT_PARENT_ANY | 
|---|
| 2835 | keyed.forgetQExpressions(); | 
|---|
| 2836 | keyed.addQueryExpression(CO_OR,  CT_PARENT_ANY, CM_MATCH,    "|contains(\"ou\");contains(\" \")|equals(0)|minus"); | 
|---|
| 2837 | keyed.perform_search(GSM_FIND); | 
|---|
| 2838 | TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME_AND_PARENT, "outer<inner>*outer<test>*outer<xx>*outer<g2>*g2<xx>*outer<test>*test<outer>*outer<inner>*outer<test>")); | 
|---|
| 2839 |  | 
|---|
| 2840 | // CT_PARENT_ALL | 
|---|
| 2841 | keyed.forgetQExpressions(); | 
|---|
| 2842 | keyed.addQueryExpression(CO_OR,  CT_PARENT_ALL, CM_MISMATCH, "/ou/"); // not inside group containing 'ou' | 
|---|
| 2843 | keyed.addQueryExpression(CO_AND, CT_NAME,       CM_MISMATCH, "/ou/"); // and not containing 'ou' itself | 
|---|
| 2844 | keyed.perform_search(GSM_FIND); | 
|---|
| 2845 | TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME_AND_PARENT, "last*test*zombsub*zombsub<zomb>*zombsub<ZOMB>*dup")); | 
|---|
| 2846 |  | 
|---|
| 2847 | // CT_NESTING_LEVEL | 
|---|
| 2848 | keyed.forgetQExpressions(); | 
|---|
| 2849 | keyed.addQueryExpression(CO_OR, CT_NESTING_LEVEL, CM_MATCH, "<1");         // nesting level less than 1 | 
|---|
| 2850 | keyed.perform_search(GSM_FIND); | 
|---|
| 2851 | TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME_AND_PARENT, TOP_GROUPS)); // -> TOP_GROUPS | 
|---|
| 2852 |  | 
|---|
| 2853 | keyed.forgetQExpressions(); | 
|---|
| 2854 | keyed.addQueryExpression(CO_OR, CT_NESTING_LEVEL, CM_MISMATCH, ">0");      // nesting level not above 0 | 
|---|
| 2855 | keyed.perform_search(GSM_FIND); | 
|---|
| 2856 | TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME_AND_PARENT, TOP_GROUPS)); // -> TOP_GROUPS | 
|---|
| 2857 |  | 
|---|
| 2858 | keyed.forgetQExpressions(); | 
|---|
| 2859 | keyed.addQueryExpression(CO_OR, CT_NESTING_LEVEL, CM_MATCH, ">4"); // too high nesting level | 
|---|
| 2860 | keyed.perform_search(GSM_FIND); | 
|---|
| 2861 | TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME_AND_PARENT, "")); | 
|---|
| 2862 |  | 
|---|
| 2863 | keyed.forgetQExpressions(); | 
|---|
| 2864 | keyed.addQueryExpression(CO_OR, CT_NESTING_LEVEL, CM_MATCH, ">3"); // highest occurring nesting level | 
|---|
| 2865 | keyed.perform_search(GSM_FIND); | 
|---|
| 2866 | TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME_AND_PARENT, "yy<eee>")); // one group with nesting level 4 | 
|---|
| 2867 |  | 
|---|
| 2868 | keyed.forgetQExpressions(); | 
|---|
| 2869 | keyed.addQueryExpression(CO_OR, CT_NESTING_LEVEL, CM_MATCH, ">2"); | 
|---|
| 2870 | keyed.perform_search(GSM_FIND); | 
|---|
| 2871 | TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME_AND_PARENT, "outer<inner>*g2<xx>*g2<yy>*yy<eee>")); // 1xL4 + 3xL3 | 
|---|
| 2872 |  | 
|---|
| 2873 | keyed.forgetQExpressions(); | 
|---|
| 2874 | keyed.addQueryExpression(CO_OR,  CT_NESTING_LEVEL, CM_MATCH, ">1"); | 
|---|
| 2875 | keyed.addQueryExpression(CO_AND, CT_NESTING_LEVEL, CM_MATCH, "<4"); | 
|---|
| 2876 | keyed.perform_search(GSM_FIND); | 
|---|
| 2877 | TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME_AND_PARENT, "g2<xx>*test<outer>*outer<inner>*outer group<g4>*outer group<g3>*outer group<g2>*g2<xx>*g2<yy>")); // 5x L2 + 3x L3 | 
|---|
| 2878 |  | 
|---|
| 2879 | keyed.forgetQExpressions(); | 
|---|
| 2880 | keyed.addQueryExpression(CO_OR, CT_NESTING_LEVEL, CM_MATCH, "2"); | 
|---|
| 2881 | keyed.perform_search(GSM_FIND); | 
|---|
| 2882 | TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME_AND_PARENT, "g2<xx>*test<outer>*outer group<g4>*outer group<g3>*outer group<g2>")); // 5x L2 | 
|---|
| 2883 |  | 
|---|
| 2884 | // CT_FOLDED | 
|---|
| 2885 | const char *EXPANDED_GROUPS = "last*outer*outer<inner>*outer*outer*zombsub"; | 
|---|
| 2886 | keyed.forgetQExpressions(); | 
|---|
| 2887 | keyed.addQueryExpression(CO_OR, CT_FOLDED, CM_MATCH, "0"); | 
|---|
| 2888 | keyed.perform_search(GSM_FIND); | 
|---|
| 2889 | TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME_AND_PARENT, EXPANDED_GROUPS)); | 
|---|
| 2890 |  | 
|---|
| 2891 | keyed.forgetQExpressions(); | 
|---|
| 2892 | keyed.addQueryExpression(CO_OR, CT_NAME /*does not matter*/, CM_MISMATCH, "|readdb(grouped)|equals(1)"); // directly access field of group-container | 
|---|
| 2893 | keyed.perform_search(GSM_FIND); | 
|---|
| 2894 | TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME_AND_PARENT, EXPANDED_GROUPS)); | 
|---|
| 2895 |  | 
|---|
| 2896 | // CT_SIZE | 
|---|
| 2897 | keyed.forgetQExpressions(); | 
|---|
| 2898 | keyed.addQueryExpression(CO_OR,  CT_SIZE, CM_MATCH, ">12");             // find bigger groups | 
|---|
| 2899 | keyed.perform_search(GSM_FIND); | 
|---|
| 2900 | TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME_SIZE, "another group(29)*outer(15)*outer(47)*zombsub(14)*inner outer group(19)*outer group(15)")); | 
|---|
| 2901 | keyed.addQueryExpression(CO_AND, CT_SIZE, CM_MATCH, "|rest(2)|equals(0)"); // with even groupsize only | 
|---|
| 2902 | keyed.perform_search(GSM_FIND); | 
|---|
| 2903 | TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME_SIZE, "zombsub(14)")); // the only bigger group with an even number of members | 
|---|
| 2904 |  | 
|---|
| 2905 | // CT_MARKED + CT_MARKED_PC | 
|---|
| 2906 | keyed.forgetQExpressions(); | 
|---|
| 2907 | keyed.addQueryExpression(CO_OR, CT_MARKED, CM_MATCH, ">7"); // at least 8 marked species inside group | 
|---|
| 2908 | keyed.perform_search(GSM_FIND); | 
|---|
| 2909 | TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME, "another group*outer*inner outer group*outer group")); | 
|---|
| 2910 |  | 
|---|
| 2911 | const char *COMPLETELY_MARKED_GROUPS = "test*xx*xx*g4*xx*eee"; | 
|---|
| 2912 | keyed.forgetQExpressions(); | 
|---|
| 2913 | keyed.addQueryExpression(CO_OR, CT_MARKED_PC, CM_MATCH, ">99");                      // completely marked groups (more than 99%) | 
|---|
| 2914 | keyed.perform_search(GSM_FIND); | 
|---|
| 2915 | TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME, COMPLETELY_MARKED_GROUPS)); | 
|---|
| 2916 | keyed.forgetQExpressions(); | 
|---|
| 2917 | keyed.addQueryExpression(CO_OR, CT_MARKED_PC, CM_MISMATCH, "<100");                  // completely marked groups (not less than 100%) | 
|---|
| 2918 | keyed.perform_search(GSM_FIND); | 
|---|
| 2919 | TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME, COMPLETELY_MARKED_GROUPS)); | 
|---|
| 2920 | keyed.forgetQExpressions(); | 
|---|
| 2921 | keyed.addQueryExpression(CO_OR, CT_MARKED_PC, CM_MATCH, "100");                      // completely marked groups (equal to 100%) | 
|---|
| 2922 | keyed.perform_search(GSM_FIND); | 
|---|
| 2923 | TEST_EXPECTATION__BROKEN(resultListingIs(keyed, GLT_NAME, COMPLETELY_MARKED_GROUPS), // @@@ matching % for equality does not work as expected | 
|---|
| 2924 | resultListingIs(keyed, GLT_NAME, "")); | 
|---|
| 2925 |  | 
|---|
| 2926 |  | 
|---|
| 2927 | keyed.forgetQExpressions(); | 
|---|
| 2928 | keyed.addQueryExpression(CO_OR,  CT_MARKED,    CM_MISMATCH, "0");   // groups with marked.. | 
|---|
| 2929 | keyed.addQueryExpression(CO_AND, CT_MARKED_PC, CM_MATCH,    "<50"); // ..but less than 50% | 
|---|
| 2930 | keyed.perform_search(GSM_FIND); | 
|---|
| 2931 | TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME, "outer*outer*test")); | 
|---|
| 2932 |  | 
|---|
| 2933 | // CT_ZOMBIES | 
|---|
| 2934 | keyed.forgetQExpressions(); | 
|---|
| 2935 | keyed.addQueryExpression(CO_OR, CT_ZOMBIES, CM_MISMATCH, "0"); // groups with zombies | 
|---|
| 2936 | keyed.perform_search(GSM_FIND); | 
|---|
| 2937 | TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME, "zombsub*zomb*ZOMB")); | 
|---|
| 2938 |  | 
|---|
| 2939 | // CT_AID | 
|---|
| 2940 | keyed.forgetQExpressions(); | 
|---|
| 2941 | keyed.addQueryExpression(CO_OR, CT_AID, CM_MATCH, ">1"); // groups with high AID | 
|---|
| 2942 | keyed.perform_search(GSM_FIND); | 
|---|
| 2943 | TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME_AID, "outer(1.0996)*outer(1.1605)")); | 
|---|
| 2944 |  | 
|---|
| 2945 | keyed.forgetQExpressions(); | 
|---|
| 2946 | keyed.addQueryExpression(CO_OR, CT_AID, CM_MATCH, "<.1"); // groups with low AID | 
|---|
| 2947 | keyed.perform_search(GSM_FIND); | 
|---|
| 2948 | keyed.addSortCriterion(GSC_AID); | 
|---|
| 2949 | keyed.addSortCriterion(GSC_REVERSE); | 
|---|
| 2950 | TEST_EXPECTATION(resultListingIs(keyed, GLT_NAME_AID, "xx(0.0786)*xx(0.0786)*g3(0.0665)*dup(0.0399)*inner group(0.0259)")); | 
|---|
| 2951 |  | 
|---|
| 2952 | // CT_KEELED is tested in TEST_keeled_group_search() | 
|---|
| 2953 | } | 
|---|
| 2954 |  | 
|---|
| 2955 | TEST_EXPECT_EQUAL(refreshes_traced, 0); // no refresh traced up to here | 
|---|
| 2956 |  | 
|---|
| 2957 | // test group-actions: | 
|---|
| 2958 |  | 
|---|
| 2959 | { | 
|---|
| 2960 | refreshes_traced = 0; | 
|---|
| 2961 |  | 
|---|
| 2962 | GroupSearch misc(gb_main, traceRefresh_cb); | 
|---|
| 2963 |  | 
|---|
| 2964 | misc.addQueryExpression(CO_OR,  CT_NAME, CM_MATCH,    "*e*"); | 
|---|
| 2965 | misc.addQueryExpression(CO_AND, CT_NAME, CM_MISMATCH, "* *"); | 
|---|
| 2966 | misc.perform_search(GSM_FIND); | 
|---|
| 2967 | { | 
|---|
| 2968 | const char *ACI_add_tag = "\"[TAG] \";dd"; | 
|---|
| 2969 |  | 
|---|
| 2970 | const char *BEFORE_RENAME    = "outer*inner*test*outer*test*outer*test*outer*inner*test*eee"; | 
|---|
| 2971 | const char *OUTER_PREFIXED = "[TAG] outer*inner*test*outer*test*outer*test*outer*inner*test*eee"; | 
|---|
| 2972 |  | 
|---|
| 2973 | TEST_EXPECTATION(resultListingIs(misc, GLT_NAME, BEFORE_RENAME)); | 
|---|
| 2974 |  | 
|---|
| 2975 | // test renaming groups: | 
|---|
| 2976 | TEST_EXPECT_NO_ERROR(misc.rename_group(0, ACI_add_tag));      TEST_EXPECTATION(resultListingIs(misc, GLT_NAME, OUTER_PREFIXED)); // prefix first 'outer' | 
|---|
| 2977 | TEST_EXPECT_NO_ERROR(misc.rename_group(0, "\"\""));           TEST_EXPECTATION(resultListingIs(misc, GLT_NAME, OUTER_PREFIXED)); // test empty ACI-result does not rename anything | 
|---|
| 2978 |  | 
|---|
| 2979 | TEST_EXPECT_NO_ERROR(misc.rename_found_groups("\"[X]\";dd;\"   \"")); // prefix '[X]' to all found groups + suffix space (which are trimmed away afterwards) | 
|---|
| 2980 | TEST_EXPECTATION(resultListingIs(misc, GLT_NAME, "[X][TAG] outer*[X]inner*[X]test*[X]outer*[X]test*[X]outer*[X]test*[X]outer*[X]inner*[X]test*[X]eee")); | 
|---|
| 2981 |  | 
|---|
| 2982 | // test errors get reported: | 
|---|
| 2983 | TEST_EXPECT_ERROR_CONTAINS(misc.rename_group(0,     ":x"), "no '=' found"); | 
|---|
| 2984 | TEST_EXPECT_ERROR_CONTAINS(misc.rename_found_groups(":x"), "no '=' found"); | 
|---|
| 2985 |  | 
|---|
| 2986 | TEST_EXPECT_NO_ERROR(misc.rename_found_groups("/\\[.*\\]//")); // remove any prefixes | 
|---|
| 2987 |  | 
|---|
| 2988 | TEST_EXPECT_NO_ERROR(misc.rename_found_groups("dd;\"_\";hitidx;\"/\";hitcount")); // append "_index/hitcount" to groupname | 
|---|
| 2989 | TEST_EXPECTATION(resultListingIs(misc, GLT_NAME, "outer_1/11*inner_2/11*test_3/11*outer_4/11*test_5/11*outer_6/11*test_7/11*outer_8/11*inner_9/11*test_10/11*eee_11/11")); | 
|---|
| 2990 |  | 
|---|
| 2991 | TEST_EXPECT_NO_ERROR(misc.rename_found_groups("command(\"/_.*$//\")|dd;\"_\";markedInGroup;\"/\";groupSize")); // replace suffix with "marked/size" | 
|---|
| 2992 | TEST_EXPECTATION(resultListingIs(misc, GLT_NAME, "outer_6/11*inner_4/5*test_7/7*outer_7/15*test_0/4*outer_20/47*test_6/12*outer_6/11*inner_4/5*test_2/6*eee_3/3")); | 
|---|
| 2993 |  | 
|---|
| 2994 | TEST_EXPECT_NO_ERROR(misc.rename_found_groups(":_*=_L*(|nesting)\\=*(|aid)")); // replace suffix with nesting level and aid | 
|---|
| 2995 | TEST_EXPECTATION(resultListingIs(misc, GLT_NAME, "outer_L0=0.695293*inner_L1=0.269289*test_L0=0.160956*outer_L0=1.099650*test_L1=0.591923*outer_L0=1.160535*test_L1=0.726679*outer_L2=0.704352*inner_L3=0.265516*test_L1=0.303089*eee_L4=0.229693")); | 
|---|
| 2996 |  | 
|---|
| 2997 | // undo renaming groups (to avoid need to change tests below) | 
|---|
| 2998 | TEST_EXPECT_NO_ERROR(misc.rename_found_groups("/_.*$//"));     // remove all behind '_' | 
|---|
| 2999 | TEST_EXPECTATION(resultListingIs(misc, GLT_NAME, BEFORE_RENAME)); | 
|---|
| 3000 |  | 
|---|
| 3001 | TEST_EXPECT_EQUAL(refreshes_traced, 7); // amount of result-list refreshes that would happen (1 * rename_group() + 6 * rename_found_groups(); one rename_group did nothing!) | 
|---|
| 3002 | refreshes_traced = 0; | 
|---|
| 3003 | } | 
|---|
| 3004 |  | 
|---|
| 3005 | { | 
|---|
| 3006 | GroupSearch all(gb_main, traceRefresh_cb);  // run a 2nd search | 
|---|
| 3007 | GroupSearch none(gb_main, traceRefresh_cb); // run a 3rd search | 
|---|
| 3008 | GroupSearch few(gb_main, traceRefresh_cb);  // run a 4th search | 
|---|
| 3009 |  | 
|---|
| 3010 | // test folding single groups | 
|---|
| 3011 | TEST_EXPECTATION(                                                      resultListingIs(misc, GLT_NAME_FOLD, "outer*inner*[test]*outer*[test]*outer*[test]*[outer]*[inner]*[test]*[eee]"));   // shows current folding state | 
|---|
| 3012 | TEST_EXPECT_NO_ERROR(misc.fold_group(0, GFM_TOGGLE)); TEST_EXPECTATION(resultListingIs(misc, GLT_NAME_FOLD, "[outer]*inner*[test]*outer*[test]*outer*[test]*[outer]*[inner]*[test]*[eee]")); // fold 1st 'outer' | 
|---|
| 3013 | TEST_EXPECT_NO_ERROR(misc.fold_group(0, GFM_TOGGLE)); TEST_EXPECTATION(resultListingIs(misc, GLT_NAME_FOLD, "outer*inner*[test]*outer*[test]*outer*[test]*[outer]*[inner]*[test]*[eee]"));   // unfold 1st 'outer' | 
|---|
| 3014 |  | 
|---|
| 3015 | TEST_EXPECT_EQUAL(refreshes_traced, 2); // 2 result-list refreshes would happen (one for each fold_group()) | 
|---|
| 3016 | refreshes_traced = 0; | 
|---|
| 3017 |  | 
|---|
| 3018 | none.addQueryExpression(CO_OR, CT_NAME, CM_MISMATCH, "*"); // no such group | 
|---|
| 3019 | all.addQueryExpression(CO_OR, CT_NAME, CM_MATCH, "*"); // matches all groups | 
|---|
| 3020 | few.addQueryExpression(CO_OR, CT_NAME, CM_MATCH, "inner"); | 
|---|
| 3021 |  | 
|---|
| 3022 | none.perform_search(GSM_FIND); | 
|---|
| 3023 | few.perform_search(GSM_FIND); | 
|---|
| 3024 | all.perform_search(GSM_FIND); | 
|---|
| 3025 |  | 
|---|
| 3026 | TEST_EXPECTATION(resultListingIs(none, GLT_NAME,      "")); // shows no results | 
|---|
| 3027 | TEST_EXPECTATION(resultListingIs(few,  GLT_NAME_FOLD, "inner*[inner]")); // shows some results | 
|---|
| 3028 | // shows current folding state (of all groups from all trees): | 
|---|
| 3029 | TEST_EXPECTATION(resultListingIs(all,  GLT_NAME_FOLD, "last*[another group]*outer*inner*[test]*outer*[test]*[xx]*outer*[g2]*[xx]*[test]*[outer]*[inner]*[test]*zombsub*[zomb]*[ZOMB]*[dup]*[inner outer group]*[inner group]*[outer group]*[g4]*[g3]*[g2]*[xx]*[yy]*[eee]")); | 
|---|
| 3030 |  | 
|---|
| 3031 | TEST_EXPECT_EQUAL(refreshes_traced, 0); | 
|---|
| 3032 |  | 
|---|
| 3033 | // test folding listed groups | 
|---|
| 3034 | // (Note: that results used for folding and for test differ!) | 
|---|
| 3035 | TEST_EXPECT_NO_ERROR( few.fold_found_groups(GFM_EXPANDREC));          TEST_EXPECTATION(resultListingIs(all, GLT_NAME_FOLD, "last*[another group]*outer*inner*[test]*outer*[test]*[xx]*"          "outer*[g2]*[xx]*test*outer*inner*[test]*"         "zombsub*[zomb]*[ZOMB]*[dup]*[inner outer group]*[inner group]*[outer group]*[g4]*[g3]*[g2]*[xx]*[yy]*[eee]"));  // [A] only unfolds 2nd inner and 2 of its 3 parent groups | 
|---|
| 3036 | TEST_EXPECT_NO_ERROR(misc.fold_found_groups(GFM_EXPANDREC));          TEST_EXPECTATION(resultListingIs(all, GLT_NAME_FOLD, "last*[another group]*outer*inner*test*outer*test*[xx]*"              "outer*[g2]*[xx]*test*outer*inner*test*"           "zombsub*[zomb]*[ZOMB]*[dup]*inner outer group*[inner group]*outer group*[g4]*[g3]*g2*[xx]*yy*eee"));            // 'xx' and 'g2' remain folded | 
|---|
| 3037 | TEST_EXPECT_NO_ERROR(misc.fold_found_groups(GFM_COLLAPSE));           TEST_EXPECTATION(resultListingIs(all, GLT_NAME_FOLD, "last*[another group]*[outer]*[inner]*[test]*[outer]*[test]*[xx]*"   "[outer]*[g2]*[xx]*[test]*[outer]*[inner]*[test]*"  "zombsub*[zomb]*[ZOMB]*[dup]*inner outer group*[inner group]*outer group*[g4]*[g3]*g2*[xx]*yy*[eee]"));          // 'last' remains unfolded | 
|---|
| 3038 | TEST_EXPECT_NO_ERROR( few.fold_found_groups(GFM_EXPANDREC_COLLREST)); TEST_EXPECTATION(resultListingIs(all, GLT_NAME_FOLD, "[last]*[another group]*outer*inner*[test]*[outer]*[test]*[xx]*"      "outer*[g2]*[xx]*test*outer*inner*[test]*"        "[zombsub]*[zomb]*[ZOMB]*[dup]*[inner outer group]*[inner group]*[outer group]*[g4]*[g3]*[g2]*[xx]*[yy]*[eee]")); // similar to line [A], but 'last' gets folded | 
|---|
| 3039 | TEST_EXPECT_NO_ERROR(none.fold_found_groups(GFM_EXPANDREC_COLLREST)); TEST_EXPECTATION(resultListingIs(all, GLT_NAME_FOLD, "[last]*[another group]*[outer]*[inner]*[test]*[outer]*[test]*[xx]*" "[outer]*[g2]*[xx]*[test]*[outer]*[inner]*[test]*" "[zombsub]*[zomb]*[ZOMB]*[dup]*[inner outer group]*[inner group]*[outer group]*[g4]*[g3]*[g2]*[xx]*[yy]*[eee]")); // unfold none+collapse rest = fold all | 
|---|
| 3040 | TEST_EXPECT_NO_ERROR(misc.fold_found_groups(GFM_EXPANDPARENTS));      TEST_EXPECTATION(resultListingIs(all, GLT_NAME_FOLD, "[last]*[another group]*outer*[inner]*[test]*outer*[test]*[xx]*"      "outer*[g2]*[xx]*test*outer*[inner]*[test]*"      "[zombsub]*[zomb]*[ZOMB]*[dup]*inner outer group*[inner group]*outer group*[g4]*[g3]*g2*[xx]*yy*[eee]"));         // unfold all groups containing listed groups | 
|---|
| 3041 |  | 
|---|
| 3042 | TEST_EXPECT_EQUAL(refreshes_traced, 16); // @@@ want less refreshes! | 
|---|
| 3043 | refreshes_traced = 0; | 
|---|
| 3044 |  | 
|---|
| 3045 | { | 
|---|
| 3046 | GroupSearch group2(gb_main, traceRefresh_cb);  // run a 5th search | 
|---|
| 3047 | group2.addQueryExpression(CO_OR, CT_NAME, CM_MATCH, "g2"); // group 'g2' exists in 2 tree; species overlap, but are not identical | 
|---|
| 3048 | group2.perform_search(GSM_FIND); | 
|---|
| 3049 |  | 
|---|
| 3050 | GB_transaction ta(gb_main); | 
|---|
| 3051 |  | 
|---|
| 3052 | // test retrieval of species contained in groups: | 
|---|
| 3053 | TEST_EXPECTATION(speciesInGroupsAre(none, INTERSECT, "")); | 
|---|
| 3054 |  | 
|---|
| 3055 | // groups 'inner' are identical in all trees: | 
|---|
| 3056 | const char *INNER_SPECIES = "McpCapri,McpMyco2,McpMycoi,McpSpeci,SpiMelli"; | 
|---|
| 3057 | TEST_EXPECTATION(speciesInGroupsAre(few, UNITE,     INNER_SPECIES)); | 
|---|
| 3058 | TEST_EXPECTATION(speciesInGroupsAre(few, INTERSECT, INNER_SPECIES)); | 
|---|
| 3059 |  | 
|---|
| 3060 | TEST_EXPECTATION(speciesInGroupsAre(group2, UNITE,     "AnaAbact,BacMegat,BacPaste,CloTyro2,CloTyro4,CloTyrob,StaAureu,StaEpide")); | 
|---|
| 3061 | TEST_EXPECTATION(speciesInGroupsAre(group2, INTERSECT, "AnaAbact,BacMegat,BacPaste,"       "CloTyro4,CloTyrob,StaAureu")); | 
|---|
| 3062 | } | 
|---|
| 3063 | } | 
|---|
| 3064 |  | 
|---|
| 3065 | TEST_EXPECTATION(resultListingIs(misc, GLT_NAME_AND_PARENT, "outer*outer<inner>*test*outer*outer<test>*outer*outer<test>*test<outer>*outer<inner>*outer<test>*yy<eee>")); // format is "parent<child>" | 
|---|
| 3066 |  | 
|---|
| 3067 | // test deleting groups: | 
|---|
| 3068 | TEST_EXPECT_NO_ERROR(misc.delete_group(6)); TEST_EXPECTATION(resultListingIs(misc, GLT_NAME, "outer*inner*test*outer*test*outer*outer*inner*test*eee")); // delete 1st 'test' from 'tree_test2' (DEL_TEST) | 
|---|
| 3069 | TEST_EXPECT_NO_ERROR(misc.delete_group(3)); TEST_EXPECTATION(resultListingIs(misc, GLT_NAME, "outer*inner*test*test*outer*outer*inner*test*eee"));       // delete 2nd 'outer' from 'tree_tree' (DEL_OUTER) | 
|---|
| 3070 |  | 
|---|
| 3071 | // deleting invalid index only returns an error: | 
|---|
| 3072 | TEST_EXPECT_ERROR_CONTAINS(misc.delete_group(100), "out-of-bounds"); | 
|---|
| 3073 | TEST_EXPECT_ERROR_CONTAINS(misc.delete_group(-1), "out-of-bounds"); | 
|---|
| 3074 |  | 
|---|
| 3075 | TEST_EXPECT_EQUAL(refreshes_traced, 2); // 2 result-list refreshes would happen (one for each delete_group()) | 
|---|
| 3076 | refreshes_traced = 0; | 
|---|
| 3077 |  | 
|---|
| 3078 | TEST_EXPECTATION(resultListingIs(misc, GLT_NAME_AND_PARENT, "outer*outer<inner>*test*test*outer*outer<outer>*outer<inner>*outer<test>*yy<eee>")); // 'test' between 'outer<outer>' got removed | 
|---|
| 3079 |  | 
|---|
| 3080 | // delete all (but one) groups named 'outer': | 
|---|
| 3081 | misc.forgetQExpressions(); | 
|---|
| 3082 | misc.addQueryExpression(CO_OR, CT_NAME, CM_MATCH, "outer"); | 
|---|
| 3083 | misc.perform_search(GSM_FIND); | 
|---|
| 3084 | TEST_EXPECTATION(resultListingIs(misc, GLT_NAME_TREE, "outer/tree_test*outer/tree_tree2*outer/tree_tree2")); // also tests that 'outer' was deleted from DB; see .@DEL_OUTER | 
|---|
| 3085 |  | 
|---|
| 3086 | misc.remove_hit(1); // will not get deleted | 
|---|
| 3087 | TEST_EXPECTATION(resultListingIs(misc, GLT_NAME_TREE, "outer/tree_test*outer/tree_tree2")); | 
|---|
| 3088 |  | 
|---|
| 3089 | TEST_EXPECT_NO_ERROR(misc.delete_found_groups());           // now delete all listed groups | 
|---|
| 3090 | TEST_EXPECTATION(resultListingIs(misc, GLT_NAME_TREE, "")); // result-list is empty now | 
|---|
| 3091 |  | 
|---|
| 3092 | misc.perform_search(GSM_FIND);                                              // search again | 
|---|
| 3093 | TEST_EXPECTATION(resultListingIs(misc, GLT_NAME_TREE, "outer/tree_tree2")); // hit removed before deleting listed still exists in DB | 
|---|
| 3094 |  | 
|---|
| 3095 | TEST_EXPECT_EQUAL(refreshes_traced, 1); // only one refresh triggered for deletion of all listed groups | 
|---|
| 3096 | } | 
|---|
| 3097 |  | 
|---|
| 3098 | { | 
|---|
| 3099 | refreshes_traced = 0; | 
|---|
| 3100 |  | 
|---|
| 3101 | GroupSearch outer(gb_main, traceRefresh_cb); | 
|---|
| 3102 | outer.addQueryExpression(CO_OR, CT_NAME, CM_MATCH, "test"); | 
|---|
| 3103 | outer.perform_search(GSM_FIND); | 
|---|
| 3104 | TEST_EXPECTATION(resultListingIs(outer, GLT_NAME_TREE, "test/tree_test*test/tree_test*test/tree_tree2")); // also tests that 'test' was deleted from DB; see .@DEL_TEST | 
|---|
| 3105 |  | 
|---|
| 3106 | // test result-update callbacks (triggered by DB-changes) | 
|---|
| 3107 | { // delete tree_tree2: | 
|---|
| 3108 | GB_transaction  ta(gb_main); | 
|---|
| 3109 | GBDATA         *gb_tree = GBT_find_tree(gb_main, "tree_tree2"); | 
|---|
| 3110 | TEST_REJECT_NULL(gb_tree); | 
|---|
| 3111 | TEST_EXPECT_NO_ERROR(GB_delete(gb_tree)); | 
|---|
| 3112 | } | 
|---|
| 3113 | TEST_EXPECT_EQUAL(refreshes_traced, 1); // one modifying TA => only one refresh callback triggered | 
|---|
| 3114 | TEST_EXPECTATION(resultListingIs(outer, GLT_NAME_TREE, "test/tree_test*test/tree_test")); // all results referring 'tree_tree2' were removed | 
|---|
| 3115 | } | 
|---|
| 3116 |  | 
|---|
| 3117 |  | 
|---|
| 3118 | GB_close(gb_main); | 
|---|
| 3119 | } | 
|---|
| 3120 |  | 
|---|
| 3121 | void TEST_keeled_group_search() { | 
|---|
| 3122 | GB_shell shell; | 
|---|
| 3123 | GBDATA   *gb_main = GB_open("TEST_trees.arb", "rw"); | 
|---|
| 3124 |  | 
|---|
| 3125 | GroupSearchCallback traceRefresh_cb = makeGroupSearchCallback(trace_refresh_cb); | 
|---|
| 3126 | refreshes_traced = 0; | 
|---|
| 3127 | { | 
|---|
| 3128 | GB_transaction ta(gb_main); | 
|---|
| 3129 |  | 
|---|
| 3130 | GroupSearch allGroups(gb_main, traceRefresh_cb); | 
|---|
| 3131 | { | 
|---|
| 3132 | GroupSearch keeledGroups(gb_main, traceRefresh_cb); | 
|---|
| 3133 | GroupSearch normalGroups(gb_main, traceRefresh_cb); | 
|---|
| 3134 |  | 
|---|
| 3135 | TEST_EXPECT(allGroups.get_results().empty()); | 
|---|
| 3136 | TEST_EXPECT(keeledGroups.get_results().empty()); | 
|---|
| 3137 | TEST_EXPECT(normalGroups.get_results().empty()); | 
|---|
| 3138 |  | 
|---|
| 3139 | // CT_KEELED: | 
|---|
| 3140 | keeledGroups.addQueryExpression(CO_OR, CT_KEELED, CM_MISMATCH, "0"); // find keeled groups | 
|---|
| 3141 | normalGroups.addQueryExpression(CO_OR, CT_KEELED, CM_MATCH,    "0"); // find normal groups | 
|---|
| 3142 |  | 
|---|
| 3143 | allGroups.perform_search(GSM_FIND); | 
|---|
| 3144 | keeledGroups.perform_search(GSM_FIND); | 
|---|
| 3145 | normalGroups.perform_search(GSM_FIND); | 
|---|
| 3146 |  | 
|---|
| 3147 | TEST_EXPECT(!allGroups.get_results().empty()); | 
|---|
| 3148 | TEST_EXPECT(!keeledGroups.get_results().empty()); | 
|---|
| 3149 | TEST_EXPECT(!normalGroups.get_results().empty()); | 
|---|
| 3150 |  | 
|---|
| 3151 | TEST_EXPECT_EQUAL(allGroups.get_results().size(), 21); | 
|---|
| 3152 | TEST_EXPECT_EQUAL(allGroups.get_results().size(), | 
|---|
| 3153 | keeledGroups.get_results().size()+normalGroups.get_results().size()); | 
|---|
| 3154 | TEST_EXPECT_EQUAL(keeledGroups.get_results().size(), 6); | 
|---|
| 3155 | TEST_EXPECT_EQUAL(normalGroups.get_results().size(), 15); | 
|---|
| 3156 |  | 
|---|
| 3157 | TEST_EXPECTATION(resultListingIs(allGroups, GLT_NAME_TREE, | 
|---|
| 3158 | "test/tree_test*" | 
|---|
| 3159 | "outer/tree_tree2*g2/tree_tree2*" | 
|---|
| 3160 | "outer/tree_removal*g2 [was: test]/tree_removal*" | 
|---|
| 3161 | "lower/tree_groups*low2/tree_groups*twoleafs/tree_groups*low1/tree_groups*upper/tree_groups*" | 
|---|
| 3162 | "twoleafs/tree_keeled*low2/tree_keeled*lower/tree_keeled*upper/tree_keeled*low1/tree_keeled*" | 
|---|
| 3163 | "low2/tree_keeled_2*twoleafs/tree_keeled_2*lower/tree_keeled_2*upper/tree_keeled_2*low1/tree_keeled_2*allButOne/tree_keeled_2" // finds "keeled group at leaf" 'allButOne'; see also ../../ARBDB/adtree.cxx@HIDDEN_KEELED_GROUP | 
|---|
| 3164 | )); | 
|---|
| 3165 |  | 
|---|
| 3166 | TEST_EXPECTATION(resultListingIs(keeledGroups, GLT_KNAME_NEST, | 
|---|
| 3167 | "!twoleafs(L0)*!low2(L1)*?lower(L2)*" // tree_keeled | 
|---|
| 3168 | "!low2(L0)*?lower(L1)*!allButOne(L2)" // tree_keeled_2 | 
|---|
| 3169 | )); | 
|---|
| 3170 | } | 
|---|
| 3171 |  | 
|---|
| 3172 | TreeNameSet keeledTrees; | 
|---|
| 3173 | keeledTrees.insert("tree_keeled"); | 
|---|
| 3174 | keeledTrees.insert("tree_keeled_2"); | 
|---|
| 3175 |  | 
|---|
| 3176 | allGroups.setSearchRange(keeledTrees); | 
|---|
| 3177 | allGroups.perform_search(GSM_FIND); | 
|---|
| 3178 |  | 
|---|
| 3179 | TEST_EXPECTATION(resultListingIs(allGroups, GLT_NAME_AND_PARENT, | 
|---|
| 3180 | // tree_keeled: | 
|---|
| 3181 | "twoleafs*twoleafs<low2>*low2<lower>*lower<upper>*" | 
|---|
| 3182 | "low2<low1>*" | 
|---|
| 3183 |  | 
|---|
| 3184 | // tree_keeled_2: | 
|---|
| 3185 | "low2*" | 
|---|
| 3186 | "twoleafs*" | 
|---|
| 3187 | "low2<lower>*" | 
|---|
| 3188 | "lower<upper>*"   // keeled group 'lower' encloses 'upper' | 
|---|
| 3189 | "low2<low1>*" | 
|---|
| 3190 | "low1<allButOne>" | 
|---|
| 3191 | )); | 
|---|
| 3192 |  | 
|---|
| 3193 | // test folding of keeled groups: | 
|---|
| 3194 | TEST_EXPECTATION(resultListingIs(allGroups, GLT_NAME_FOLD, | 
|---|
| 3195 | "twoleafs*low2*lower*upper*low1*"          // tree_keeled | 
|---|
| 3196 | "low2*twoleafs*lower*upper*low1*allButOne" // tree_keeled_2 | 
|---|
| 3197 | )); | 
|---|
| 3198 |  | 
|---|
| 3199 | TEST_EXPECT_NO_ERROR(allGroups.fold_group(0, GFM_TOGGLE)); // fold 'twoleafs' | 
|---|
| 3200 | TEST_EXPECT_NO_ERROR(allGroups.fold_group(2, GFM_TOGGLE)); // fold 'lower' -> does as well fold 'upper' (overlayed groups) | 
|---|
| 3201 |  | 
|---|
| 3202 | TEST_EXPECTATION(resultListingIs(allGroups, GLT_NAME_FOLD, | 
|---|
| 3203 | "[twoleafs]*low2*[lower]*[upper]*low1*"    // tree_keeled | 
|---|
| 3204 | "low2*twoleafs*lower*upper*low1*allButOne" // tree_keeled_2 | 
|---|
| 3205 | )); | 
|---|
| 3206 |  | 
|---|
| 3207 | TEST_EXPECT_NO_ERROR(allGroups.fold_group(3,  GFM_TOGGLE)); // unfold 'upper' -> does as well unfold 'lower' (overlayed groups) | 
|---|
| 3208 | TEST_EXPECT_NO_ERROR(allGroups.fold_group(10, GFM_TOGGLE)); | 
|---|
| 3209 |  | 
|---|
| 3210 | TEST_EXPECTATION(resultListingIs(allGroups, GLT_NAME_FOLD, | 
|---|
| 3211 | "[twoleafs]*low2*lower*upper*low1*"          // tree_keeled | 
|---|
| 3212 | "low2*twoleafs*lower*upper*low1*[allButOne]" // tree_keeled_2 | 
|---|
| 3213 | )); | 
|---|
| 3214 |  | 
|---|
| 3215 | TEST_EXPECT_NO_ERROR(allGroups.fold_group(0,  GFM_TOGGLE)); | 
|---|
| 3216 | TEST_EXPECT_NO_ERROR(allGroups.fold_group(10, GFM_TOGGLE)); | 
|---|
| 3217 |  | 
|---|
| 3218 | TEST_EXPECTATION(resultListingIs(allGroups, GLT_NAME_FOLD, | 
|---|
| 3219 | "twoleafs*low2*lower*upper*low1*"          // tree_keeled | 
|---|
| 3220 | "low2*twoleafs*lower*upper*low1*allButOne" // tree_keeled_2 | 
|---|
| 3221 | )); | 
|---|
| 3222 |  | 
|---|
| 3223 | TEST_EXPECTATION(resultListingIs(allGroups, GLT_NAME_AID, | 
|---|
| 3224 | // tree_keeled: | 
|---|
| 3225 | "twoleafs(1.4310)*low2(1.4436)*lower(1.0288)*upper(1.0288)*low1(1.1200)*" | 
|---|
| 3226 |  | 
|---|
| 3227 | // tree_keeled_2: | 
|---|
| 3228 | "low2(1.4436)*twoleafs(0.0087)*lower(1.0288)*upper(1.0288)*low1(1.1200)*" | 
|---|
| 3229 | "allButOne(0.0000)" // 1 member -> zero AID | 
|---|
| 3230 | )); | 
|---|
| 3231 |  | 
|---|
| 3232 | keeledTrees.insert("tree_groups"); | 
|---|
| 3233 | allGroups.setSearchRange(keeledTrees); | 
|---|
| 3234 | allGroups.perform_search(GSM_FIND); | 
|---|
| 3235 |  | 
|---|
| 3236 | TEST_EXPECTATION(resultListingIs(allGroups, GLT_KNAME_NEST, | 
|---|
| 3237 | // tree_groups: | 
|---|
| 3238 | "lower(L0)*low2(L1)*twoleafs(L2)*low1(L1)*upper(L0)*" | 
|---|
| 3239 |  | 
|---|
| 3240 | // tree_keeled: | 
|---|
| 3241 | "!twoleafs(L0)*!low2(L1)*?lower(L2)*upper(L3)*" | 
|---|
| 3242 | "low1(L2)*" | 
|---|
| 3243 |  | 
|---|
| 3244 | // tree_keeled_2: | 
|---|
| 3245 | "!low2(L0)*" | 
|---|
| 3246 | "twoleafs(L0)*" | 
|---|
| 3247 | "?lower(L1)*upper(L2)*low1(L1)*!allButOne(L2)" | 
|---|
| 3248 | )); | 
|---|
| 3249 |  | 
|---|
| 3250 | TEST_EXPECTATION(resultListingIs(allGroups, GLT_NAME_SIZE, | 
|---|
| 3251 | // tree_groups: | 
|---|
| 3252 | "lower(10)*low2(3)*twoleafs(2)*low1(7)*upper(5)*" | 
|---|
| 3253 |  | 
|---|
| 3254 | // tree_keeled: | 
|---|
| 3255 | "twoleafs(13)*" | 
|---|
| 3256 | "low2(12)*" | 
|---|
| 3257 | "lower(5)*upper(5)*" | 
|---|
| 3258 | "low1(7)*" | 
|---|
| 3259 |  | 
|---|
| 3260 | // tree_keeled_2: | 
|---|
| 3261 | "low2(12)*" | 
|---|
| 3262 | "twoleafs(2)*" | 
|---|
| 3263 | "lower(5)*" | 
|---|
| 3264 | "upper(5)*low1(7)*" | 
|---|
| 3265 | "allButOne(1)" // only 1 species! | 
|---|
| 3266 | )); | 
|---|
| 3267 |  | 
|---|
| 3268 | allGroups.addSortCriterion(GSC_KEELED); | 
|---|
| 3269 | TEST_EXPECTATION(resultListingIs(allGroups, GLT_KNAME_NEST, | 
|---|
| 3270 | "?lower(L2)*?lower(L1)*!twoleafs(L0)*!low2(L1)*!low2(L0)*!allButOne(L2)*lower(L0)*low2(L1)*twoleafs(L2)*low1(L1)*upper(L0)*upper(L3)*low1(L2)*twoleafs(L0)*upper(L2)*low1(L1)" | 
|---|
| 3271 | )); | 
|---|
| 3272 | } | 
|---|
| 3273 |  | 
|---|
| 3274 | GB_close(gb_main); | 
|---|
| 3275 | } | 
|---|
| 3276 |  | 
|---|
| 3277 |  | 
|---|
| 3278 |  | 
|---|
| 3279 | static arb_test::match_expectation does_map_index(const SymmetricMatrixMapper& mm, int x, int y, int lin) { | 
|---|
| 3280 | using namespace   arb_test; | 
|---|
| 3281 | expectation_group fulfilled; | 
|---|
| 3282 |  | 
|---|
| 3283 | fulfilled.add(that(mm.linear_index(x, y)).is_equal_to(lin)); | 
|---|
| 3284 | fulfilled.add(that(mm.linear_index(y, x)).is_equal_to(lin)); | 
|---|
| 3285 |  | 
|---|
| 3286 | int rx, ry; | 
|---|
| 3287 | mm.to_xy(lin, rx, ry); | 
|---|
| 3288 | if (x>y) swap(x, y); | 
|---|
| 3289 |  | 
|---|
| 3290 | fulfilled.add(that(rx).is_equal_to(x)); | 
|---|
| 3291 | fulfilled.add(that(ry).is_equal_to(y)); | 
|---|
| 3292 |  | 
|---|
| 3293 | return all().ofgroup(fulfilled); | 
|---|
| 3294 | } | 
|---|
| 3295 |  | 
|---|
| 3296 | void TEST_SymmetricMatrixMapper() { | 
|---|
| 3297 | { | 
|---|
| 3298 | SymmetricMatrixMapper m2(2); | 
|---|
| 3299 | TEST_EXPECT_EQUAL(m2.linear_size(), 1); | 
|---|
| 3300 | TEST_EXPECTATION(does_map_index(m2, 0, 1, 0)); | 
|---|
| 3301 | } | 
|---|
| 3302 | { | 
|---|
| 3303 | SymmetricMatrixMapper m3(3); | 
|---|
| 3304 | TEST_EXPECT_EQUAL(m3.linear_size(), 3); | 
|---|
| 3305 | TEST_EXPECTATION(does_map_index(m3, 0, 1, 0)); | 
|---|
| 3306 | TEST_EXPECTATION(does_map_index(m3, 2, 0, 1)); | 
|---|
| 3307 | TEST_EXPECTATION(does_map_index(m3, 2, 1, 2)); | 
|---|
| 3308 | } | 
|---|
| 3309 | { | 
|---|
| 3310 | SymmetricMatrixMapper m100(100); | 
|---|
| 3311 | TEST_EXPECT_EQUAL(m100.linear_size(), 4950); | 
|---|
| 3312 | TEST_EXPECTATION(does_map_index(m100, 0, 1, 0)); | 
|---|
| 3313 | TEST_EXPECTATION(does_map_index(m100, 49, 50, 1274)); | 
|---|
| 3314 | TEST_EXPECTATION(does_map_index(m100, 51, 50, 1274+51)); | 
|---|
| 3315 | TEST_EXPECTATION(does_map_index(m100, 99, 98, 4949)); | 
|---|
| 3316 | } | 
|---|
| 3317 | } | 
|---|
| 3318 |  | 
|---|
| 3319 | void TEST_group_duplicate_detection() { | 
|---|
| 3320 | GB_shell  shell; | 
|---|
| 3321 | GBDATA   *gb_main = GB_open("../../demo.arb", "r"); | 
|---|
| 3322 |  | 
|---|
| 3323 | GroupSearchCallback traceRefresh_cb = makeGroupSearchCallback(trace_refresh_cb); | 
|---|
| 3324 |  | 
|---|
| 3325 | { | 
|---|
| 3326 | refreshes_traced = 0; | 
|---|
| 3327 |  | 
|---|
| 3328 | GroupSearch search(gb_main, traceRefresh_cb); | 
|---|
| 3329 | search.addSortCriterion(GSC_NAME); | 
|---|
| 3330 | search.addSortCriterion(GSC_TREENAME); | 
|---|
| 3331 |  | 
|---|
| 3332 | search.setDupCriteria(true, DNC_WHOLENAME, GB_MIND_CASE, DLC_SAME_TREE, 2); | 
|---|
| 3333 | search.perform_search(GSM_FIND); | 
|---|
| 3334 | TEST_EXPECTATION(hasOrder(search, "TN")); // treename, groupname | 
|---|
| 3335 | TEST_EXPECTATION(resultListingIs(search, GLT_CLUST_NT, | 
|---|
| 3336 | "1/outer/tree_test*" | 
|---|
| 3337 | "1/outer/tree_test*" | 
|---|
| 3338 | "2/test/tree_test*" | 
|---|
| 3339 | "2/test/tree_test*" | 
|---|
| 3340 | "3/outer/tree_tree2*" | 
|---|
| 3341 | "3/outer/tree_tree2*" | 
|---|
| 3342 | "4/test/tree_tree2*" | 
|---|
| 3343 | "4/test/tree_tree2" | 
|---|
| 3344 | )); | 
|---|
| 3345 |  | 
|---|
| 3346 | search.addSortCriterion(GSC_REVERSE); | 
|---|
| 3347 | search.addSortCriterion(GSC_CLUSTER); | 
|---|
| 3348 | search.addSortCriterion(GSC_REVERSE); | 
|---|
| 3349 |  | 
|---|
| 3350 | search.setDupCriteria(true, DNC_WHOLENAME, GB_MIND_CASE, DLC_ANYWHERE, 2); | 
|---|
| 3351 | search.perform_search(GSM_FIND); | 
|---|
| 3352 | TEST_EXPECTATION(hasOrder(search, "!C!TN")); // cluster(rev), treename, groupname | 
|---|
| 3353 | TEST_EXPECTATION(resultListingIs(search, GLT_CLUST_NT, | 
|---|
| 3354 | "5/g2/tree_tree2*" | 
|---|
| 3355 | "5/g2/tree_zomb*" | 
|---|
| 3356 | "4/xx/tree_test*" | 
|---|
| 3357 | "4/xx/tree_tree2*" | 
|---|
| 3358 | "4/xx/tree_zomb*" | 
|---|
| 3359 | "3/test/tree_test*" | 
|---|
| 3360 | "3/test/tree_test*" | 
|---|
| 3361 | "3/test/tree_tree2*" | 
|---|
| 3362 | "3/test/tree_tree2*" | 
|---|
| 3363 | "2/inner/tree_test*" | 
|---|
| 3364 | "2/inner/tree_tree2*" | 
|---|
| 3365 | "1/outer/tree_test*" | 
|---|
| 3366 | "1/outer/tree_test*" | 
|---|
| 3367 | "1/outer/tree_tree2*" | 
|---|
| 3368 | "1/outer/tree_tree2" | 
|---|
| 3369 | )); | 
|---|
| 3370 |  | 
|---|
| 3371 | search.setDupCriteria(false, DNC_WHOLENAME, GB_MIND_CASE, DLC_ANYWHERE, 2); // search "unique" groups | 
|---|
| 3372 | search.perform_search(GSM_FIND); | 
|---|
| 3373 | TEST_EXPECTATION(resultListingIs(search, GLT_CLUST_NT, | 
|---|
| 3374 | "0/another group/tree_test*" | 
|---|
| 3375 | "0/last/tree_test*" | 
|---|
| 3376 | "0/ZOMB/tree_zomb*" | 
|---|
| 3377 | "0/dup/tree_zomb*" | 
|---|
| 3378 | "0/eee/tree_zomb*" | 
|---|
| 3379 | "0/g3/tree_zomb*" | 
|---|
| 3380 | "0/g4/tree_zomb*" | 
|---|
| 3381 | "0/inner group/tree_zomb*" | 
|---|
| 3382 | "0/inner outer group/tree_zomb*" | 
|---|
| 3383 | "0/outer group/tree_zomb*" | 
|---|
| 3384 | "0/yy/tree_zomb*" | 
|---|
| 3385 | "0/zomb/tree_zomb*" | 
|---|
| 3386 | "0/zombsub/tree_zomb" | 
|---|
| 3387 | )); | 
|---|
| 3388 |  | 
|---|
| 3389 | search.addSortCriterion(GSC_NAME); | 
|---|
| 3390 | search.addSortCriterion(GSC_TREENAME); | 
|---|
| 3391 | search.addSortCriterion(GSC_CLUSTER); | 
|---|
| 3392 |  | 
|---|
| 3393 | search.setDupCriteria(true, DNC_WHOLENAME, GB_MIND_CASE, DLC_DIFF_TREE, 2); | 
|---|
| 3394 | search.perform_search(GSM_FIND); | 
|---|
| 3395 | TEST_EXPECTATION(hasOrder(search, "CTN")); // cluster, treename, groupname | 
|---|
| 3396 | TEST_EXPECTATION(resultListingIs(search, GLT_CLUST_NT, | 
|---|
| 3397 | "1/outer/tree_test*" | 
|---|
| 3398 | "1/outer/tree_test*" | 
|---|
| 3399 | "1/outer/tree_tree2*" | 
|---|
| 3400 | "1/outer/tree_tree2*" | 
|---|
| 3401 | "2/inner/tree_test*" | 
|---|
| 3402 | "2/inner/tree_tree2*" | 
|---|
| 3403 | "3/test/tree_test*" | 
|---|
| 3404 | "3/test/tree_test*" | 
|---|
| 3405 | "3/test/tree_tree2*" | 
|---|
| 3406 | "3/test/tree_tree2*" | 
|---|
| 3407 | "4/xx/tree_test*" | 
|---|
| 3408 | "4/xx/tree_tree2*" | 
|---|
| 3409 | "4/xx/tree_zomb*" | 
|---|
| 3410 | "5/g2/tree_tree2*" | 
|---|
| 3411 | "5/g2/tree_zomb" | 
|---|
| 3412 | )); | 
|---|
| 3413 |  | 
|---|
| 3414 | search.setDupCriteria(true, DNC_WHOLENAME, GB_MIND_CASE, DLC_DIFF_TREE, 3); // expect hits in 3 diff. trees | 
|---|
| 3415 | search.perform_search(GSM_FIND); | 
|---|
| 3416 | TEST_EXPECTATION(resultListingIs(search, GLT_CLUST_NT, // Note: does not add 'outer' or 'test' (they occur 4 times, but only in 2 trees!) | 
|---|
| 3417 | "1/xx/tree_test*" | 
|---|
| 3418 | "1/xx/tree_tree2*" | 
|---|
| 3419 | "1/xx/tree_zomb" | 
|---|
| 3420 | )); | 
|---|
| 3421 |  | 
|---|
| 3422 | // -------------------------------------------- | 
|---|
| 3423 | //      test DNC_WORDWISE name comparison: | 
|---|
| 3424 |  | 
|---|
| 3425 | const char *word_sep = " "; | 
|---|
| 3426 | WordSet     no_words_ignored; | 
|---|
| 3427 | search.setDupCriteria(true, DNC_WORDWISE, GB_MIND_CASE, 1, no_words_ignored, word_sep, DLC_ANYWHERE, 2); | 
|---|
| 3428 | search.perform_search(GSM_FIND); | 
|---|
| 3429 | TEST_EXPECTATION(resultListingIs(search, GLT_CLUST_NT, | 
|---|
| 3430 | "1/another group/tree_test*" | 
|---|
| 3431 | "1/inner group/tree_zomb*" | 
|---|
| 3432 | "1/inner outer group/tree_zomb*" | 
|---|
| 3433 | "1/outer group/tree_zomb*" | 
|---|
| 3434 |  | 
|---|
| 3435 | "2/outer/tree_test*" | 
|---|
| 3436 | "2/outer/tree_test*" | 
|---|
| 3437 | "2/outer/tree_tree2*" | 
|---|
| 3438 | "2/outer/tree_tree2*" | 
|---|
| 3439 |  | 
|---|
| 3440 | "3/test/tree_test*" | 
|---|
| 3441 | "3/test/tree_test*" | 
|---|
| 3442 | "3/test/tree_tree2*" | 
|---|
| 3443 | "3/test/tree_tree2*" | 
|---|
| 3444 |  | 
|---|
| 3445 | "4/xx/tree_test*" | 
|---|
| 3446 | "4/xx/tree_tree2*" | 
|---|
| 3447 | "4/xx/tree_zomb*" | 
|---|
| 3448 |  | 
|---|
| 3449 | "5/inner/tree_test*" | 
|---|
| 3450 | "5/inner/tree_tree2*" | 
|---|
| 3451 |  | 
|---|
| 3452 | "6/g2/tree_tree2*" | 
|---|
| 3453 | "6/g2/tree_zomb" | 
|---|
| 3454 | )); | 
|---|
| 3455 |  | 
|---|
| 3456 | search.setDupCriteria(true, DNC_WORDWISE, GB_MIND_CASE, 2, no_words_ignored, word_sep, DLC_ANYWHERE, 2); | 
|---|
| 3457 | search.perform_search(GSM_FIND); | 
|---|
| 3458 | TEST_EXPECTATION(resultListingIs(search, GLT_CLUST_NT, | 
|---|
| 3459 | "1/inner group/tree_zomb*" | 
|---|
| 3460 | "1/inner outer group/tree_zomb" | 
|---|
| 3461 | )); | 
|---|
| 3462 |  | 
|---|
| 3463 | // rename one group (spaces->commas) to test special word separators | 
|---|
| 3464 | { | 
|---|
| 3465 | GB_transaction ta(gb_main); | 
|---|
| 3466 | TEST_EXPECT_NO_ERROR(search.rename_group(0, "/ /,/")); | 
|---|
| 3467 | TEST_EXPECT_EQUAL(search.get_results()[0].get_name(), "inner,group"); | 
|---|
| 3468 | } | 
|---|
| 3469 |  | 
|---|
| 3470 | search.setDupCriteria(true, DNC_WORDWISE, GB_MIND_CASE, 2, no_words_ignored, word_sep, DLC_ANYWHERE, 2); | 
|---|
| 3471 | search.perform_search(GSM_FIND); | 
|---|
| 3472 | TEST_EXPECTATION(resultListingIs(search, GLT_CLUST_NT, // rename of group causes a change of detected cluster | 
|---|
| 3473 | "1/inner outer group/tree_zomb*" | 
|---|
| 3474 | "1/outer group/tree_zomb" | 
|---|
| 3475 | )); | 
|---|
| 3476 |  | 
|---|
| 3477 |  | 
|---|
| 3478 | word_sep = ", "; // <<<------------------------------ commas separate words from now on! | 
|---|
| 3479 |  | 
|---|
| 3480 | search.setDupCriteria(true, DNC_WORDWISE, GB_MIND_CASE, 2, no_words_ignored, word_sep, DLC_ANYWHERE, 2); | 
|---|
| 3481 | search.perform_search(GSM_FIND); | 
|---|
| 3482 | TEST_EXPECTATION(resultListingIs(search, GLT_CLUST_NT, | 
|---|
| 3483 | "1/inner outer group/tree_zomb*" | 
|---|
| 3484 | "1/inner,group/tree_zomb" | 
|---|
| 3485 | )); | 
|---|
| 3486 |  | 
|---|
| 3487 | WordSet ignore_group; | 
|---|
| 3488 | ignore_group.insert("Group"); | 
|---|
| 3489 |  | 
|---|
| 3490 | search.setDupCriteria(true, DNC_WORDWISE, GB_IGNORE_CASE, 1, ignore_group, word_sep, DLC_ANYWHERE, 2); | 
|---|
| 3491 | search.perform_search(GSM_FIND); | 
|---|
| 3492 | TEST_EXPECTATION(resultListingIs(search, GLT_CLUST_NT, | 
|---|
| 3493 | "1/outer/tree_test*" | 
|---|
| 3494 | "1/outer/tree_test*" | 
|---|
| 3495 | "1/outer/tree_tree2*" | 
|---|
| 3496 | "1/outer/tree_tree2*" | 
|---|
| 3497 | "1/inner outer group/tree_zomb*" | 
|---|
| 3498 | "1/outer group/tree_zomb*" | 
|---|
| 3499 |  | 
|---|
| 3500 | "2/test/tree_test*" | 
|---|
| 3501 | "2/test/tree_test*" | 
|---|
| 3502 | "2/test/tree_tree2*" | 
|---|
| 3503 | "2/test/tree_tree2*" | 
|---|
| 3504 |  | 
|---|
| 3505 | "3/inner/tree_test*" | 
|---|
| 3506 | "3/inner/tree_tree2*" | 
|---|
| 3507 | "3/inner,group/tree_zomb*" | 
|---|
| 3508 |  | 
|---|
| 3509 | "4/xx/tree_test*" | 
|---|
| 3510 | "4/xx/tree_tree2*" | 
|---|
| 3511 | "4/xx/tree_zomb*" | 
|---|
| 3512 |  | 
|---|
| 3513 | "5/g2/tree_tree2*" | 
|---|
| 3514 | "5/g2/tree_zomb*" | 
|---|
| 3515 |  | 
|---|
| 3516 | "6/ZOMB/tree_zomb*" | 
|---|
| 3517 | "6/zomb/tree_zomb" | 
|---|
| 3518 | )); | 
|---|
| 3519 |  | 
|---|
| 3520 | search.setDupCriteria(true, DNC_WORDWISE, GB_IGNORE_CASE, 2, ignore_group, word_sep, DLC_ANYWHERE, 2); | 
|---|
| 3521 | search.perform_search(GSM_FIND); | 
|---|
| 3522 | TEST_EXPECTATION(resultListingIs(search, GLT_CLUST_NT, "")); // none | 
|---|
| 3523 |  | 
|---|
| 3524 | search.setDupCriteria(true, DNC_WORDWISE, GB_IGNORE_CASE, 1, ignore_group, "", DLC_ANYWHERE, 2); // empty word separator -> uses whole names | 
|---|
| 3525 | search.perform_search(GSM_FIND); | 
|---|
| 3526 | TEST_EXPECTATION(resultListingIs(search, GLT_CLUST_NT, | 
|---|
| 3527 | "1/outer/tree_test*" | 
|---|
| 3528 | "1/outer/tree_test*" | 
|---|
| 3529 | "1/outer/tree_tree2*" | 
|---|
| 3530 | "1/outer/tree_tree2*" | 
|---|
| 3531 |  | 
|---|
| 3532 | "2/test/tree_test*" | 
|---|
| 3533 | "2/test/tree_test*" | 
|---|
| 3534 | "2/test/tree_tree2*" | 
|---|
| 3535 | "2/test/tree_tree2*" | 
|---|
| 3536 |  | 
|---|
| 3537 | "3/xx/tree_test*" | 
|---|
| 3538 | "3/xx/tree_tree2*" | 
|---|
| 3539 | "3/xx/tree_zomb*" | 
|---|
| 3540 |  | 
|---|
| 3541 | "4/inner/tree_test*" | 
|---|
| 3542 | "4/inner/tree_tree2*" | 
|---|
| 3543 |  | 
|---|
| 3544 | "5/g2/tree_tree2*" | 
|---|
| 3545 | "5/g2/tree_zomb*" | 
|---|
| 3546 |  | 
|---|
| 3547 | "6/ZOMB/tree_zomb*" | 
|---|
| 3548 | "6/zomb/tree_zomb" | 
|---|
| 3549 | )); | 
|---|
| 3550 |  | 
|---|
| 3551 | // rename more groups to test cluster-search based on 3 words and extension based on 2 words | 
|---|
| 3552 | { | 
|---|
| 3553 | GB_transaction ta(gb_main); | 
|---|
| 3554 | TEST_EXPECT_NO_ERROR(search.rename_group(0, "/outer/group inner outer/")); | 
|---|
| 3555 | TEST_EXPECT_NO_ERROR(search.rename_group(1, "/outer/group outer/")); | 
|---|
| 3556 | TEST_EXPECT_NO_ERROR(search.rename_group(2, "/outer/outer group/")); | 
|---|
| 3557 | TEST_EXPECT_EQUAL(search.get_results()[0].get_name(), "group inner outer"); | 
|---|
| 3558 | TEST_EXPECT_EQUAL(search.get_results()[1].get_name(), "group outer"); | 
|---|
| 3559 | TEST_EXPECT_EQUAL(search.get_results()[2].get_name(), "outer group"); | 
|---|
| 3560 | } | 
|---|
| 3561 |  | 
|---|
| 3562 | search.setDupCriteria(true, DNC_WORDWISE, GB_IGNORE_CASE, 2, no_words_ignored, word_sep, DLC_ANYWHERE, 2); | 
|---|
| 3563 | search.perform_search(GSM_FIND); | 
|---|
| 3564 | TEST_EXPECTATION(resultListingIs(search, GLT_CLUST_NT, | 
|---|
| 3565 | "1/group inner outer/tree_test*" // cluster based on 3 words gets extended by groups matching 2 of these words ("group" and "outer") | 
|---|
| 3566 | "1/group outer/tree_test*"       // (note that group containing 'inner' and 'group' is discarded, because resulting cluster would be smaller) | 
|---|
| 3567 | "1/outer group/tree_tree2*" | 
|---|
| 3568 | "1/inner outer group/tree_zomb*" | 
|---|
| 3569 | "1/outer group/tree_zomb" | 
|---|
| 3570 | )); | 
|---|
| 3571 |  | 
|---|
| 3572 | TEST_EXPECT_EQUAL(refreshes_traced, 2); // 2 renames | 
|---|
| 3573 | } | 
|---|
| 3574 | GB_close(gb_main); | 
|---|
| 3575 | } | 
|---|
| 3576 |  | 
|---|
| 3577 | static double bruteForce_calc_average_ingroup_distance(GroupSearchTree *node) { | 
|---|
| 3578 | unsigned leafs = node->get_leaf_count(); | 
|---|
| 3579 |  | 
|---|
| 3580 | if (leafs == 1) return 0.0; // single leaf -> zero distance | 
|---|
| 3581 |  | 
|---|
| 3582 | ARB_edge last  = parentEdge(node->get_leftson()); | 
|---|
| 3583 | ARB_edge start = parentEdge(node->get_rightson()).inverse(); | 
|---|
| 3584 |  | 
|---|
| 3585 | if (start == last) { | 
|---|
| 3586 | gs_assert(start.get_type() == ROOT_EDGE); | 
|---|
| 3587 | start = start.next(); | 
|---|
| 3588 | } | 
|---|
| 3589 |  | 
|---|
| 3590 | unsigned pairs    = 0; | 
|---|
| 3591 | double   dist_sum = 0.0; | 
|---|
| 3592 |  | 
|---|
| 3593 | for (ARB_edge e1 = start; e1 != last; e1 = e1.next()) { | 
|---|
| 3594 | if (e1.is_edge_to_leaf()) { | 
|---|
| 3595 | for (ARB_edge e2 = e1.next(); e2 != last; e2 = e2.next()) { | 
|---|
| 3596 | if (e2.is_edge_to_leaf()) { | 
|---|
| 3597 | dist_sum += e1.dest()->intree_distance_to(e2.dest()); | 
|---|
| 3598 | ++pairs; | 
|---|
| 3599 | } | 
|---|
| 3600 | } | 
|---|
| 3601 | } | 
|---|
| 3602 | } | 
|---|
| 3603 |  | 
|---|
| 3604 | #if defined(ASSERTION_USED) | 
|---|
| 3605 | const unsigned calc_pairs = (leafs*(leafs-1))/2; | 
|---|
| 3606 | gs_assert(pairs == calc_pairs); | 
|---|
| 3607 | #endif | 
|---|
| 3608 |  | 
|---|
| 3609 | return dist_sum/pairs; | 
|---|
| 3610 | } | 
|---|
| 3611 |  | 
|---|
// Expects that the average in-group distance reported by 'node' is similar
// (difference below a small tolerance) to the brute-force calculated value.
// Note: the argument 'node' is evaluated twice.
#define TEST_EXPECT_PROPER_AID(node) do {                                   \
        const double AID_TOLERANCE = 0.000001;                              \
        TEST_EXPECT_SIMILAR(bruteForce_calc_average_ingroup_distance(node), \
                            (node)->get_average_ingroup_distance(),         \
                            AID_TOLERANCE);                                 \
    } while (0)
|---|
| 3618 |  | 
|---|
| 3619 | void TEST_ingroup_distance() { | 
|---|
| 3620 | GB_shell  shell; | 
|---|
| 3621 | GBDATA   *gb_main = GB_open("TEST_trees.arb", "r"); | 
|---|
| 3622 |  | 
|---|
| 3623 | { | 
|---|
| 3624 | GB_transaction ta(gb_main); | 
|---|
| 3625 | SearchedTree   stree("tree_test", gb_main); | 
|---|
| 3626 |  | 
|---|
| 3627 | GroupSearchRoot *troot = stree.get_tree_root(); | 
|---|
| 3628 | TEST_REJECT(stree.failed_to_load()); | 
|---|
| 3629 |  | 
|---|
| 3630 | // get some specific nodes: | 
|---|
| 3631 | GroupSearchTree *rootNode = troot->get_root_node(); | 
|---|
| 3632 | GroupSearchTree *leftSon  = rootNode->get_leftson(); | 
|---|
| 3633 | GroupSearchTree *grandSon = leftSon->get_rightson(); | 
|---|
| 3634 |  | 
|---|
| 3635 | GroupSearchTree *someLeaf = grandSon->get_leftson(); | 
|---|
| 3636 | while (!someLeaf->is_leaf()) { // descent into bigger subtree => reaches subtree containing 2 leafs | 
|---|
| 3637 | GroupSearchTree *L = someLeaf->get_leftson(); | 
|---|
| 3638 | GroupSearchTree *R = someLeaf->get_rightson(); | 
|---|
| 3639 |  | 
|---|
| 3640 | someLeaf = L->get_leaf_count() > R->get_leaf_count() ? L : R; | 
|---|
| 3641 | } | 
|---|
| 3642 |  | 
|---|
| 3643 | TEST_EXPECT_EQUAL(someLeaf->get_leaf_count(), 1); | 
|---|
| 3644 |  | 
|---|
| 3645 | GroupSearchTree *minSubtree = someLeaf->get_father(); | 
|---|
| 3646 | TEST_EXPECT_EQUAL(minSubtree->get_leaf_count(), 2); | 
|---|
| 3647 |  | 
|---|
| 3648 | // brute-force AID calculation: | 
|---|
| 3649 | { | 
|---|
| 3650 | const double EPSILON = 0.000001; | 
|---|
| 3651 | TEST_EXPECT_SIMILAR(bruteForce_calc_average_ingroup_distance(someLeaf),   0.0,      EPSILON); | 
|---|
| 3652 | TEST_EXPECT_SIMILAR(bruteForce_calc_average_ingroup_distance(minSubtree), minSubtree->leftlen + minSubtree->rightlen, EPSILON); | 
|---|
| 3653 | TEST_EXPECT_SIMILAR(bruteForce_calc_average_ingroup_distance(grandSon),   0.534927, EPSILON); | 
|---|
| 3654 | TEST_EXPECT_SIMILAR(bruteForce_calc_average_ingroup_distance(leftSon),    0.976091, EPSILON); | 
|---|
| 3655 | TEST_EXPECT_SIMILAR(bruteForce_calc_average_ingroup_distance(rootNode),   1.108438, EPSILON); | 
|---|
| 3656 | } | 
|---|
| 3657 |  | 
|---|
| 3658 | // calculate AID on-the-fly and compare with brute-force results | 
|---|
| 3659 | TEST_EXPECT_PROPER_AID(someLeaf); | 
|---|
| 3660 | TEST_EXPECT_PROPER_AID(minSubtree); | 
|---|
| 3661 | TEST_EXPECT_PROPER_AID(grandSon); | 
|---|
| 3662 | TEST_EXPECT_PROPER_AID(leftSon); | 
|---|
| 3663 | TEST_EXPECT_PROPER_AID(rootNode); | 
|---|
| 3664 |  | 
|---|
| 3665 | ARB_edge start = rootEdge(troot); | 
|---|
| 3666 | for (ARB_edge e = start.next(); e != start; e = e.next()) { | 
|---|
| 3667 | TEST_EXPECT_PROPER_AID(DOWNCAST(GroupSearchTree*, e.dest())); | 
|---|
| 3668 | } | 
|---|
| 3669 | } | 
|---|
| 3670 | GB_close(gb_main); | 
|---|
| 3671 | } | 
|---|
| 3672 |  | 
|---|
| 3673 | #endif // UNIT_TESTS | 
|---|
| 3674 |  | 
|---|
| 3675 | // -------------------------------------------------------------------------------- | 
|---|
| 3676 |  | 
|---|