| 1 | // ============================================================= // |
|---|
| 2 | // // |
|---|
| 3 | // File : group_search.h // |
|---|
| 4 | // Purpose : public interface of group search // |
|---|
| 5 | // // |
|---|
| 6 | // Coded by Ralf Westram (coder@reallysoft.de) in April 2017 // |
|---|
| 7 | // http://www.arb-home.de/ // |
|---|
| 8 | // // |
|---|
| 9 | // ============================================================= // |
|---|
| 10 | |
|---|
| 11 | #ifndef GROUP_SEARCH_H |
|---|
| 12 | #define GROUP_SEARCH_H |
|---|
| 13 | |
|---|
| 14 | #ifndef LAZY_H |
|---|
| 15 | #include <lazy.h> |
|---|
| 16 | #endif |
|---|
| 17 | #ifndef QUERY_EXPR_H |
|---|
| 18 | #include <query_expr.h> |
|---|
| 19 | #endif |
|---|
| 20 | #ifndef ARBDB_BASE_H |
|---|
| 21 | #include <arbdb_base.h> |
|---|
| 22 | #endif |
|---|
| 23 | #ifndef ARBTOOLS_H |
|---|
| 24 | #include <arbtools.h> |
|---|
| 25 | #endif |
|---|
| 26 | #ifndef ARB_ERROR_H |
|---|
| 27 | #include <arb_error.h> |
|---|
| 28 | #endif |
|---|
| 29 | #ifndef SMARTPTR_H |
|---|
| 30 | #include <smartptr.h> |
|---|
| 31 | #endif |
|---|
| 32 | #ifndef CB_H |
|---|
| 33 | #include <cb.h> |
|---|
| 34 | #endif |
|---|
| 35 | #ifndef _GLIBCXX_VECTOR |
|---|
| 36 | #include <vector> |
|---|
| 37 | #endif |
|---|
| 38 | #ifndef _GLIBCXX_SET |
|---|
| 39 | #include <set> |
|---|
| 40 | #endif |
|---|
| 41 | #ifndef _GLIBCXX_LIST |
|---|
| 42 | #include <list> |
|---|
| 43 | #endif |
|---|
| 44 | #ifndef _GLIBCXX_STRING |
|---|
| 45 | #include <string> |
|---|
| 46 | #endif |
|---|
| 47 | |
|---|
| 48 | #define gs_assert(cond) arb_assert(cond) |
|---|
| 49 | |
|---|
| 50 | class GroupSearchCommon; |
|---|
| 51 | |
|---|
| 52 | enum GroupFoldingMode { // bit mask |
|---|
| 53 | GFM_EXPAND = 1, // expand groups (otherwise collapse) |
|---|
| 54 | GFM_TOGGLE = 2, // toggle state of groups (if set -> bit0 is ignored) |
|---|
| 55 | GFM_RECURSE = 4, // apply to parents (use with GFM_EXPAND) |
|---|
| 56 | GFM_PARENTS_ONLY = 8, // only apply to parents, not to listed groups (use with GFM_EXPAND) |
|---|
| 57 | GFM_COLLAPSE_REST = 16, // collapse all other groups in each tree |
|---|
| 58 | |
|---|
| 59 | // convenience defs |
|---|
| 60 | GFM_COLLAPSE = 0, |
|---|
| 61 | GFM_EXPANDREC = GFM_EXPAND|GFM_RECURSE, |
|---|
| 62 | GFM_EXPANDPARENTS = GFM_EXPAND|GFM_RECURSE|GFM_PARENTS_ONLY, |
|---|
| 63 | GFM_EXPANDREC_COLLREST = GFM_EXPANDREC|GFM_COLLAPSE_REST, |
|---|
| 64 | }; |
|---|
| 65 | |
|---|
| 66 | enum GroupMarkMode { // same values as ../TREEDISP/TreeCallbacks.cxx@MARK_MODE_LOWER_BITS |
|---|
| 67 | GMM_UNMARK = 0, |
|---|
| 68 | GMM_MARK = 1, |
|---|
| 69 | GMM_INVERT = 2, |
|---|
| 70 | }; |
|---|
| 71 | |
|---|
| 72 | enum CollectMode { INTERSECT, UNITE }; |
|---|
| 73 | |
|---|
| 74 | // ---------------------------------------- |
|---|
| 75 | // query expression related types |
|---|
| 76 | |
|---|
| 77 | enum CriterionOperator { // @@@ rename |
|---|
| 78 | CO_AND, |
|---|
| 79 | CO_OR, |
|---|
| 80 | CO_IGNORE, |
|---|
| 81 | }; |
|---|
| 82 | enum CriterionMatch { // @@@ rename |
|---|
| 83 | CM_MATCH, |
|---|
| 84 | CM_MISMATCH, |
|---|
| 85 | }; |
|---|
| 86 | enum CriterionType { // @@@ rename |
|---|
| 87 | CT_NAME, // groupname (STRING) |
|---|
| 88 | CT_PARENT_DIRECT, // groupname of direct parent (STRING) |
|---|
| 89 | CT_PARENT_ANY, // groupname of any parent (STRING) |
|---|
| 90 | CT_PARENT_ALL, // groupname of all parents (STRING) |
|---|
| 91 | CT_NESTING_LEVEL, // group-nesting-level (INT) |
|---|
| 92 | CT_FOLDED, // folded (INT) |
|---|
| 93 | CT_SIZE, // size of group = number of species (INT) |
|---|
| 94 | CT_MARKED, // number of marked species (INT) |
|---|
| 95 | CT_MARKED_PC, // percentage of marked species (FLOAT) |
|---|
| 96 | CT_ZOMBIES, // number of zombies (INT) |
|---|
| 97 | CT_AID, // average ingroup distance |
|---|
| 98 | CT_KEELED, // keeled state (INT) |
|---|
| 99 | }; |
|---|
| 100 | |
|---|
| 101 | // ------------------------ |
|---|
| 102 | // search results |
|---|
| 103 | |
|---|
| 104 | struct ColumnWidths { |
|---|
| 105 | // Stores widths (or maxvalues for integers) of all |
|---|
| 106 | // FoundGroup-attributes that might get displayed in result list. |
|---|
| 107 | |
|---|
| 108 | int name; // max. width of name |
|---|
| 109 | int reason; // max. width of hit_reason |
|---|
| 110 | |
|---|
| 111 | int max_nesting; // max. value for nesting |
|---|
| 112 | int max_size; // max. value for size |
|---|
| 113 | int max_marked; // max. value for marked |
|---|
| 114 | int max_marked_pc; // max. value for marked% |
|---|
| 115 | int max_cluster_id; // max. value of cluster id |
|---|
| 116 | int max_aid; // max. value in front of dot |
|---|
| 117 | |
|---|
| 118 | bool seen_keeled; // any keeled group found? |
|---|
| 119 | |
|---|
| 120 | ColumnWidths() : |
|---|
| 121 | name(0), |
|---|
| 122 | reason(0), |
|---|
| 123 | max_nesting(0), |
|---|
| 124 | max_size(0), |
|---|
| 125 | max_marked(0), |
|---|
| 126 | max_marked_pc(0), |
|---|
| 127 | max_cluster_id(0), |
|---|
| 128 | seen_keeled(false) |
|---|
| 129 | {} |
|---|
| 130 | |
|---|
| 131 | static int max2width(const int& i) { // @@@ a bit expensive (better use specialized type for 'max_nesting' and other maximas) |
|---|
| 132 | // calculate column width from maximum |
|---|
| 133 | // (Note: fails if negative values are involved) |
|---|
| 134 | return strlen(GBS_global_string("%i", i)); |
|---|
| 135 | } |
|---|
| 136 | static int percent(int part, int all) { |
|---|
| 137 | return int(part*100.0/all+0.5); |
|---|
| 138 | } |
|---|
| 139 | |
|---|
| 140 | void track(int wName, int wReason, int nesting, int size, int marked, int clusID, double aid, bool keeled); |
|---|
| 141 | }; |
|---|
| 142 | |
|---|
| 143 | class QueriedGroups; |
|---|
| 144 | |
|---|
| 145 | class FoundGroup { |
|---|
| 146 | // stores/exports info about a group (as found by group search) |
|---|
| 147 | // - usable w/o tree loaded |
|---|
| 148 | // - (able to) know details (like nesting, number of marked species, ...) |
|---|
| 149 | |
|---|
| 150 | RefPtr<GBDATA> gb_group; |
|---|
| 151 | |
|---|
| 152 | ARB_ERROR set_folded(bool folded); |
|---|
| 153 | ARB_ERROR set_overlap_folded(bool folded); |
|---|
| 154 | int get_name_length() const; |
|---|
| 155 | |
|---|
| 156 | GBDATA *get_overlap_group() const { |
|---|
| 157 | gs_assert(keeled.has_value()); // means: pointer 'gb_overlap_group' is valid |
|---|
| 158 | return gb_overlap_group; |
|---|
| 159 | } |
|---|
| 160 | |
|---|
| 161 | protected: |
|---|
| 162 | // optional values (needed to be "informed") |
|---|
| 163 | // - set manually by derivate ('Candidate' in group_search.cxx@inform_group) |
|---|
| 164 | // - knows_details() returns true if everything is properly set |
|---|
| 165 | std::string hit_reason; |
|---|
| 166 | Lazy<int, -1> nesting; |
|---|
| 167 | Lazy<int, -1> size; |
|---|
| 168 | Lazy<int, -1> marked; |
|---|
| 169 | |
|---|
| 170 | LazyFloat<double> aid; // average ingroup distance |
|---|
| 171 | |
|---|
| 172 | Lazy<int, -1> keeled; |
|---|
| 173 | RefPtr<GBDATA> gb_overlap_group; // valid if 'keeled.has_value()'; NULp -> no overlapping group |
|---|
| 174 | |
|---|
| 175 | int clusterID; |
|---|
| 176 | |
|---|
| 177 | public: |
|---|
| 178 | explicit FoundGroup(GBDATA *gb_group_) : |
|---|
| 179 | gb_group(gb_group_), |
|---|
| 180 | gb_overlap_group(NULp), |
|---|
| 181 | clusterID(0) |
|---|
| 182 | {} |
|---|
| 183 | |
|---|
| 184 | GBDATA *get_pointer() const { return gb_group; } |
|---|
| 185 | GBDATA *get_tree_data() const; |
|---|
| 186 | |
|---|
| 187 | const char *get_name() const; |
|---|
| 188 | const char *get_tree_name() const; |
|---|
| 189 | int get_tree_order() const; |
|---|
| 190 | const std::string& get_hit_reason() const { return hit_reason; } |
|---|
| 191 | |
|---|
| 192 | void set_cluster_id(int id) { gs_assert(id>0); clusterID = id; } |
|---|
| 193 | void forget_cluster_id() { clusterID = 0; } |
|---|
| 194 | int get_cluster_id() const { return clusterID; } |
|---|
| 195 | |
|---|
| 196 | bool overlap_is_folded() const; |
|---|
| 197 | bool is_folded() const; |
|---|
| 198 | |
|---|
| 199 | bool knows_details() const { // returns true if group is "informed" |
|---|
| 200 | return |
|---|
| 201 | !hit_reason.empty() && |
|---|
| 202 | nesting.has_value() && |
|---|
| 203 | size.has_value() && |
|---|
| 204 | marked.has_value() && |
|---|
| 205 | keeled.has_value() && |
|---|
| 206 | aid.has_value(); |
|---|
| 207 | } |
|---|
| 208 | void track_max_widths(ColumnWidths& widths) const; |
|---|
| 209 | |
|---|
| 210 | // getters for "informed" groups: |
|---|
| 211 | int get_nesting() const { gs_assert(knows_details()); return nesting; } |
|---|
| 212 | int get_size() const { gs_assert(knows_details()); return size; } |
|---|
| 213 | int get_marked() const { gs_assert(knows_details()); return marked; } |
|---|
| 214 | int get_marked_pc() const { gs_assert(knows_details()); return ColumnWidths::percent(marked, size); } |
|---|
| 215 | int get_keeled() const { gs_assert(knows_details()); return keeled; } |
|---|
| 216 | double get_aid() const { gs_assert(knows_details()); return aid; } |
|---|
| 217 | |
|---|
| 218 | bool operator == (const FoundGroup& other) const { |
|---|
| 219 | return gb_group == other.gb_group; |
|---|
| 220 | } |
|---|
| 221 | |
|---|
| 222 | // modify DB: |
|---|
| 223 | GB_ERROR delete_from_DB(); |
|---|
| 224 | ARB_ERROR rename_by_ACI(const char *acisrt, const QueriedGroups& results, int hit_idx); |
|---|
| 225 | ARB_ERROR change_folding(GroupFoldingMode mode); |
|---|
| 226 | }; |
|---|
| 227 | |
|---|
| 228 | class Candidate; // a derivate of FoundGroup |
|---|
| 229 | class GroupSearch; |
|---|
| 230 | |
|---|
| 231 | typedef std::set<GBDATA*> GBDATAset; |
|---|
| 232 | typedef std::set<std::string> SpeciesNames; |
|---|
| 233 | typedef std::vector<FoundGroup> FoundGroupContainer; |
|---|
| 234 | typedef FoundGroupContainer::const_iterator FoundGroupCIter; |
|---|
| 235 | typedef FoundGroupContainer::iterator FoundGroupIter; |
|---|
| 236 | |
|---|
| 237 | enum GroupSortCriterion { |
|---|
| 238 | GSC_NONE, // special criterion (not allowed in SortCriteria) |
|---|
| 239 | GSC_REVERSE, // special criterion: modifies previous criterion; may occur multiple times |
|---|
| 240 | GSC_NAME, |
|---|
| 241 | GSC_TREENAME, |
|---|
| 242 | GSC_TREEORDER, // as defined in tree-admin |
|---|
| 243 | GSC_HIT_REASON, |
|---|
| 244 | GSC_NESTING, |
|---|
| 245 | GSC_SIZE, |
|---|
| 246 | GSC_MARKED, |
|---|
| 247 | GSC_MARKED_PC, |
|---|
| 248 | GSC_CLUSTER, |
|---|
| 249 | GSC_AID, // average ingroup distance |
|---|
| 250 | GSC_KEELED, |
|---|
| 251 | }; |
|---|
| 252 | |
|---|
| 253 | typedef std::list<GroupSortCriterion> SortCriteria; |
|---|
| 254 | |
|---|
| 255 | class QueriedGroups { |
|---|
| 256 | FoundGroupContainer found; |
|---|
| 257 | RefPtr<const SortCriteria> sorted_by; |
|---|
| 258 | |
|---|
| 259 | mutable SmartPtr<ColumnWidths> widths; |
|---|
| 260 | void invalidate_widths() const { widths.setNull(); } |
|---|
| 261 | |
|---|
| 262 | public: |
|---|
| 263 | QueriedGroups() : sorted_by(NULp) {} |
|---|
| 264 | |
|---|
| 265 | size_t size() const { return found.size(); } |
|---|
| 266 | bool empty() const { return size() == 0; } |
|---|
| 267 | |
|---|
| 268 | void add_informed_group(const FoundGroup& group) { |
|---|
| 269 | gs_assert(group.knows_details()); |
|---|
| 270 | found.push_back(group); |
|---|
| 271 | invalidate_widths(); |
|---|
| 272 | } |
|---|
| 273 | void add_candidate(const GroupSearch& group_search, Candidate& cand, const std::string& hit_reason); |
|---|
| 274 | |
|---|
| 275 | FoundGroupCIter begin() const { return found.begin(); } |
|---|
| 276 | FoundGroupIter begin() { return found.begin(); } |
|---|
| 277 | FoundGroupCIter end() const { return found.end(); } |
|---|
| 278 | FoundGroupIter end() { return found.end(); } |
|---|
| 279 | |
|---|
| 280 | const FoundGroup& operator[](size_t idx) const { gs_assert(idx<size()); return found.at(idx); } |
|---|
| 281 | FoundGroup& operator[](size_t idx) { gs_assert(idx<size()); return found.at(idx); } |
|---|
| 282 | |
|---|
| 283 | bool erase_deleted(GroupSearchCommon *common); |
|---|
| 284 | bool contains_changed(GroupSearchCommon *common) const; |
|---|
| 285 | |
|---|
| 286 | const ColumnWidths& get_column_widths() const; |
|---|
| 287 | const char *get_group_display(const FoundGroup& g, bool show_tree_name) const; |
|---|
| 288 | |
|---|
| 289 | void sort_by(const SortCriteria& by); |
|---|
| 290 | |
|---|
| 291 | void remove_hit(size_t idx); |
|---|
| 292 | }; |
|---|
| 293 | |
|---|
| 294 | // -------------------------- |
|---|
| 295 | // DupCriteria |
|---|
| 296 | |
|---|
| 297 | enum DupNameCriterionType { |
|---|
| 298 | DNC_WHOLENAME, // match of whole groupname |
|---|
| 299 | DNC_WORDWISE, // wordwise match |
|---|
| 300 | |
|---|
| 301 | // maybe add something more fuzzy? |
|---|
| 302 | }; |
|---|
| 303 | enum DupTreeCriterionType { |
|---|
| 304 | DLC_SAME_TREE, // duplicates need to be in same tree |
|---|
| 305 | DLC_DIFF_TREE, // duplicates need to be in different trees (but: cluster will be extended with groups from same tree) |
|---|
| 306 | DLC_ANYWHERE, // no restriction |
|---|
| 307 | }; |
|---|
| 308 | |
|---|
| 309 | typedef std::set<std::string> WordSet; |
|---|
| 310 | |
|---|
| 311 | class DupCriteria; |
|---|
| 312 | |
|---|
| 313 | // --------------------- |
|---|
| 314 | // GroupSearch |
|---|
| 315 | |
|---|
| 316 | DECLARE_CBTYPE_FVV_AND_BUILDERS(GroupSearchCallback, void, class GroupSearch*); // generates makeGroupSearchCallback |
|---|
| 317 | |
|---|
| 318 | typedef std::set<std::string> TreeNameSet; |
|---|
| 319 | |
|---|
| 320 | enum GroupSearchMode { // bit mask |
|---|
| 321 | GSM_FORGET_EXISTING = 1, // otherwise keep existing results |
|---|
| 322 | GSM_ADD = 2, // otherwise remove |
|---|
| 323 | GSM_MISMATCH = 4, // otherwise target matching |
|---|
| 324 | |
|---|
| 325 | // convenience defs |
|---|
| 326 | GSM_FIND = GSM_FORGET_EXISTING|GSM_ADD, |
|---|
| 327 | GSM_KEEP = 0, |
|---|
| 328 | GSM_REMOVE = GSM_KEEP|GSM_MISMATCH, // has to be XORed |
|---|
| 329 | GSM_MATCH = 0, |
|---|
| 330 | }; |
|---|
| 331 | |
|---|
| 332 | class GroupSearch : virtual Noncopyable { |
|---|
| 333 | GBDATA *gb_main; |
|---|
| 334 | SmartPtr<QueryExpr> query_expr; |
|---|
| 335 | TreeNameSet trees_to_search; |
|---|
| 336 | SmartPtr<QueriedGroups> found; |
|---|
| 337 | GroupSearchCallback redisplay_cb; |
|---|
| 338 | |
|---|
| 339 | SortCriteria order; |
|---|
| 340 | bool sortedByOrder; |
|---|
| 341 | |
|---|
| 342 | SmartPtr<DupCriteria> dups; |
|---|
| 343 | |
|---|
| 344 | static GroupSearchCommon *common; |
|---|
| 345 | |
|---|
| 346 | void sort_results(); |
|---|
| 347 | GB_ERROR clusterDuplicates(); |
|---|
| 348 | |
|---|
| 349 | #if defined(UNIT_TESTS) |
|---|
| 350 | public: |
|---|
| 351 | #endif |
|---|
| 352 | ARB_ERROR collectSpecies(const QueriedGroups& groups, CollectMode cmode, SpeciesNames& species); |
|---|
| 353 | public: |
|---|
| 354 | GroupSearch(GBDATA *gb_main_, const GroupSearchCallback& redisplay_results_cb); |
|---|
| 355 | ~GroupSearch(); |
|---|
| 356 | |
|---|
| 357 | GBDATA *get_gb_main() const { return gb_main; } |
|---|
| 358 | |
|---|
| 359 | // ------------------------- |
|---|
| 360 | // define search: |
|---|
| 361 | |
|---|
| 362 | void addQueryExpression(CriterionOperator op, CriterionType type, CriterionMatch mtype, const char *expression); |
|---|
| 363 | void forgetQExpressions(); |
|---|
| 364 | |
|---|
| 365 | void addSortCriterion(GroupSortCriterion gsc); |
|---|
| 366 | void forgetSortCriteria() { |
|---|
| 367 | order.clear(); |
|---|
| 368 | sortedByOrder = false; |
|---|
| 369 | } |
|---|
| 370 | |
|---|
| 371 | void setDupCriteria(bool listDups, DupNameCriterionType ntype, GB_CASE sens, DupTreeCriterionType ttype, int min_cluster_size); |
|---|
| 372 | void setDupCriteria(bool listDups, DupNameCriterionType ntype, GB_CASE sens, int min_words, const WordSet& ignored_words, const char *wordSeparators, DupTreeCriterionType ttype, int min_cluster_size); |
|---|
| 373 | void setDupCriteria(bool listDups, DupNameCriterionType ntype, GB_CASE sens, int min_words, const char *ignored_words, const char *wordSeparators, DupTreeCriterionType ttype, int min_cluster_size); |
|---|
| 374 | void forgetDupCriteria(); |
|---|
| 375 | |
|---|
| 376 | void setSearchRange(const TreeNameSet& trees_to_search_) { // empty -> search all trees |
|---|
| 377 | trees_to_search = trees_to_search_; |
|---|
| 378 | } |
|---|
| 379 | |
|---|
| 380 | // ---------------- |
|---|
| 381 | // search |
|---|
| 382 | |
|---|
| 383 | void perform_search(GroupSearchMode mode); |
|---|
| 384 | |
|---|
| 385 | GBDATA *get_parent_group(GBDATA *gb_group) const; |
|---|
| 386 | int calc_nesting_level(GBDATA *gb_group) const; |
|---|
| 387 | |
|---|
| 388 | // ------------------------ |
|---|
| 389 | // handle results: |
|---|
| 390 | |
|---|
| 391 | const QueriedGroups& get_results(); |
|---|
| 392 | bool has_results() const { return found.isSet() && !found->empty(); } |
|---|
| 393 | void remove_hit(size_t idx) { found->remove_hit(idx); } |
|---|
| 394 | void forget_results() { found = new QueriedGroups; } |
|---|
| 395 | |
|---|
| 396 | // ---------------------------- |
|---|
| 397 | // actions on results: |
|---|
| 398 | |
|---|
| 399 | GB_ERROR delete_group(size_t idx); |
|---|
| 400 | GB_ERROR delete_found_groups(); |
|---|
| 401 | |
|---|
| 402 | ARB_ERROR rename_group(size_t idx, const char *acisrt); |
|---|
| 403 | ARB_ERROR rename_found_groups(const char *acisrt); |
|---|
| 404 | |
|---|
| 405 | ARB_ERROR fold_group(size_t idx, GroupFoldingMode mode); |
|---|
| 406 | ARB_ERROR fold_found_groups(GroupFoldingMode mode); |
|---|
| 407 | |
|---|
| 408 | ARB_ERROR set_marks_in_group(size_t idx, GroupMarkMode mode); |
|---|
| 409 | ARB_ERROR set_marks_in_found_groups(GroupMarkMode mode, CollectMode cmode); |
|---|
| 410 | |
|---|
| 411 | #if defined(UNIT_TESTS) |
|---|
| 412 | // inspectors: |
|---|
| 413 | const SortCriteria& inspect_order() const { return order; } |
|---|
| 414 | static GroupSearchCommon *get_common() { return common; } |
|---|
| 415 | #endif |
|---|
| 416 | |
|---|
| 417 | // internal (dont use): |
|---|
| 418 | void refresh_results_after_DBchanges(); |
|---|
| 419 | }; |
|---|
| 420 | |
|---|
| 421 | char *GS_calc_resulting_groupname(GBDATA *gb_main, const QueriedGroups& queried, int hit_idx, const char *input_name, const char *acisrt, ARB_ERROR& error); |
|---|
| 422 | |
|---|
| 423 | |
|---|
| 424 | #else |
|---|
| 425 | #error group_search.h included twice |
|---|
| 426 | #endif // GROUP_SEARCH_H |
|---|