1 | // ============================================================= // |
---|
2 | // // |
---|
3 | // File : group_search.h // |
---|
4 | // Purpose : public interface of group search // |
---|
5 | // // |
---|
6 | // Coded by Ralf Westram (coder@reallysoft.de) in April 2017 // |
---|
7 | // http://www.arb-home.de/ // |
---|
8 | // // |
---|
9 | // ============================================================= // |
---|
10 | |
---|
11 | #ifndef GROUP_SEARCH_H |
---|
12 | #define GROUP_SEARCH_H |
---|
13 | |
---|
14 | #ifndef LAZY_H |
---|
15 | #include <lazy.h> |
---|
16 | #endif |
---|
17 | #ifndef QUERY_EXPR_H |
---|
18 | #include <query_expr.h> |
---|
19 | #endif |
---|
20 | #ifndef ARBDB_BASE_H |
---|
21 | #include <arbdb_base.h> |
---|
22 | #endif |
---|
23 | #ifndef ARBTOOLS_H |
---|
24 | #include <arbtools.h> |
---|
25 | #endif |
---|
26 | #ifndef ARB_ERROR_H |
---|
27 | #include <arb_error.h> |
---|
28 | #endif |
---|
29 | #ifndef SMARTPTR_H |
---|
30 | #include <smartptr.h> |
---|
31 | #endif |
---|
32 | #ifndef CB_H |
---|
33 | #include <cb.h> |
---|
34 | #endif |
---|
35 | #ifndef _GLIBCXX_VECTOR |
---|
36 | #include <vector> |
---|
37 | #endif |
---|
38 | #ifndef _GLIBCXX_SET |
---|
39 | #include <set> |
---|
40 | #endif |
---|
41 | #ifndef _GLIBCXX_LIST |
---|
42 | #include <list> |
---|
43 | #endif |
---|
44 | #ifndef _GLIBCXX_STRING |
---|
45 | #include <string> |
---|
46 | #endif |
---|
47 | |
---|
48 | #define gs_assert(cond) arb_assert(cond) |
---|
49 | |
---|
50 | class GroupSearchCommon; |
---|
51 | |
---|
52 | enum GroupFoldingMode { // bit mask |
---|
53 | GFM_EXPAND = 1, // expand groups (otherwise collapse) |
---|
54 | GFM_TOGGLE = 2, // toggle state of groups (if set -> bit0 is ignored) |
---|
55 | GFM_RECURSE = 4, // apply to parents (use with GFM_EXPAND) |
---|
56 | GFM_PARENTS_ONLY = 8, // only apply to parents, not to listed groups (use with GFM_EXPAND) |
---|
57 | GFM_COLLAPSE_REST = 16, // collapse all other groups in each tree |
---|
58 | |
---|
59 | // convenience defs |
---|
60 | GFM_COLLAPSE = 0, |
---|
61 | GFM_EXPANDREC = GFM_EXPAND|GFM_RECURSE, |
---|
62 | GFM_EXPANDPARENTS = GFM_EXPAND|GFM_RECURSE|GFM_PARENTS_ONLY, |
---|
63 | GFM_EXPANDREC_COLLREST = GFM_EXPANDREC|GFM_COLLAPSE_REST, |
---|
64 | }; |
---|
65 | |
---|
66 | enum GroupMarkMode { // same values as ../TREEDISP/TreeCallbacks.cxx@MARK_MODE_LOWER_BITS |
---|
67 | GMM_UNMARK = 0, |
---|
68 | GMM_MARK = 1, |
---|
69 | GMM_INVERT = 2, |
---|
70 | }; |
---|
71 | |
---|
72 | enum CollectMode { INTERSECT, UNITE }; |
---|
73 | |
---|
74 | // ---------------------------------------- |
---|
75 | // query expression related types |
---|
76 | |
---|
77 | enum CriterionOperator { // @@@ rename |
---|
78 | CO_AND, |
---|
79 | CO_OR, |
---|
80 | CO_IGNORE, |
---|
81 | }; |
---|
82 | enum CriterionMatch { // @@@ rename |
---|
83 | CM_MATCH, |
---|
84 | CM_MISMATCH, |
---|
85 | }; |
---|
86 | enum CriterionType { // @@@ rename |
---|
87 | CT_NAME, // groupname (STRING) |
---|
88 | CT_PARENT_DIRECT, // groupname of direct parent (STRING) |
---|
89 | CT_PARENT_ANY, // groupname of any parent (STRING) |
---|
90 | CT_PARENT_ALL, // groupname of all parents (STRING) |
---|
91 | CT_NESTING_LEVEL, // group-nesting-level (INT) |
---|
92 | CT_FOLDED, // folded (INT) |
---|
93 | CT_SIZE, // size of group = number of species (INT) |
---|
94 | CT_MARKED, // number of marked species (INT) |
---|
95 | CT_MARKED_PC, // percentage of marked species (FLOAT) |
---|
96 | CT_ZOMBIES, // number of zombies (INT) |
---|
97 | CT_AID, // average ingroup distance |
---|
98 | CT_KEELED, // keeled state (INT) |
---|
99 | }; |
---|
100 | |
---|
101 | // ------------------------ |
---|
102 | // search results |
---|
103 | |
---|
104 | struct ColumnWidths { |
---|
105 | // Stores widths (or maxvalues for integers) of all |
---|
106 | // FoundGroup-attributes that might get displayed in result list. |
---|
107 | |
---|
108 | int name; // max. width of name |
---|
109 | int reason; // max. width of hit_reason |
---|
110 | |
---|
111 | int max_nesting; // max. value for nesting |
---|
112 | int max_size; // max. value for size |
---|
113 | int max_marked; // max. value for marked |
---|
114 | int max_marked_pc; // max. value for marked% |
---|
115 | int max_cluster_id; // max. value of cluster id |
---|
116 | int max_aid; // max. value in front of dot |
---|
117 | |
---|
118 | bool seen_keeled; // any keeled group found? |
---|
119 | |
---|
120 | ColumnWidths() : |
---|
121 | name(0), |
---|
122 | reason(0), |
---|
123 | max_nesting(0), |
---|
124 | max_size(0), |
---|
125 | max_marked(0), |
---|
126 | max_marked_pc(0), |
---|
127 | max_cluster_id(0), |
---|
128 | seen_keeled(false) |
---|
129 | {} |
---|
130 | |
---|
131 | static int max2width(const int& i) { // @@@ a bit expensive (better use specialized type for 'max_nesting' and other maximas) |
---|
132 | // calculate column width from maximum |
---|
133 | // (Note: fails if negative values are involved) |
---|
134 | return strlen(GBS_global_string("%i", i)); |
---|
135 | } |
---|
136 | static int percent(int part, int all) { |
---|
137 | return int(part*100.0/all+0.5); |
---|
138 | } |
---|
139 | |
---|
140 | void track(int wName, int wReason, int nesting, int size, int marked, int clusID, double aid, bool keeled); |
---|
141 | }; |
---|
142 | |
---|
143 | class QueriedGroups; |
---|
144 | |
---|
145 | class FoundGroup { |
---|
146 | // stores/exports info about a group (as found by group search) |
---|
147 | // - usable w/o tree loaded |
---|
148 | // - (able to) know details (like nesting, number of marked species, ...) |
---|
149 | |
---|
150 | RefPtr<GBDATA> gb_group; |
---|
151 | |
---|
152 | ARB_ERROR set_folded(bool folded); |
---|
153 | ARB_ERROR set_overlap_folded(bool folded); |
---|
154 | int get_name_length() const; |
---|
155 | |
---|
156 | GBDATA *get_overlap_group() const { |
---|
157 | gs_assert(keeled.has_value()); // means: pointer 'gb_overlap_group' is valid |
---|
158 | return gb_overlap_group; |
---|
159 | } |
---|
160 | |
---|
161 | protected: |
---|
162 | // optional values (needed to be "informed") |
---|
163 | // - set manually by derivate ('Candidate' in group_search.cxx@inform_group) |
---|
164 | // - knows_details() returns true if everything is properly set |
---|
165 | std::string hit_reason; |
---|
166 | Lazy<int, -1> nesting; |
---|
167 | Lazy<int, -1> size; |
---|
168 | Lazy<int, -1> marked; |
---|
169 | |
---|
170 | LazyFloat<double> aid; // average ingroup distance |
---|
171 | |
---|
172 | Lazy<int, -1> keeled; |
---|
173 | RefPtr<GBDATA> gb_overlap_group; // valid if 'keeled.has_value()'; NULp -> no overlapping group |
---|
174 | |
---|
175 | int clusterID; |
---|
176 | |
---|
177 | public: |
---|
178 | explicit FoundGroup(GBDATA *gb_group_) : |
---|
179 | gb_group(gb_group_), |
---|
180 | gb_overlap_group(NULp), |
---|
181 | clusterID(0) |
---|
182 | {} |
---|
183 | |
---|
184 | GBDATA *get_pointer() const { return gb_group; } |
---|
185 | GBDATA *get_tree_data() const; |
---|
186 | |
---|
187 | const char *get_name() const; |
---|
188 | const char *get_tree_name() const; |
---|
189 | int get_tree_order() const; |
---|
190 | const std::string& get_hit_reason() const { return hit_reason; } |
---|
191 | |
---|
192 | void set_cluster_id(int id) { gs_assert(id>0); clusterID = id; } |
---|
193 | void forget_cluster_id() { clusterID = 0; } |
---|
194 | int get_cluster_id() const { return clusterID; } |
---|
195 | |
---|
196 | bool overlap_is_folded() const; |
---|
197 | bool is_folded() const; |
---|
198 | |
---|
199 | bool knows_details() const { // returns true if group is "informed" |
---|
200 | return |
---|
201 | !hit_reason.empty() && |
---|
202 | nesting.has_value() && |
---|
203 | size.has_value() && |
---|
204 | marked.has_value() && |
---|
205 | keeled.has_value() && |
---|
206 | aid.has_value(); |
---|
207 | } |
---|
208 | void track_max_widths(ColumnWidths& widths) const; |
---|
209 | |
---|
210 | // getters for "informed" groups: |
---|
211 | int get_nesting() const { gs_assert(knows_details()); return nesting; } |
---|
212 | int get_size() const { gs_assert(knows_details()); return size; } |
---|
213 | int get_marked() const { gs_assert(knows_details()); return marked; } |
---|
214 | int get_marked_pc() const { gs_assert(knows_details()); return ColumnWidths::percent(marked, size); } |
---|
215 | int get_keeled() const { gs_assert(knows_details()); return keeled; } |
---|
216 | double get_aid() const { gs_assert(knows_details()); return aid; } |
---|
217 | |
---|
218 | bool operator == (const FoundGroup& other) const { |
---|
219 | return gb_group == other.gb_group; |
---|
220 | } |
---|
221 | |
---|
222 | // modify DB: |
---|
223 | GB_ERROR delete_from_DB(); |
---|
224 | ARB_ERROR rename_by_ACI(const char *acisrt, const QueriedGroups& results, int hit_idx); |
---|
225 | ARB_ERROR change_folding(GroupFoldingMode mode); |
---|
226 | }; |
---|
227 | |
---|
228 | class Candidate; // a derivate of FoundGroup |
---|
229 | class GroupSearch; |
---|
230 | |
---|
231 | typedef std::set<GBDATA*> GBDATAset; |
---|
232 | typedef std::set<std::string> SpeciesNames; |
---|
233 | typedef std::vector<FoundGroup> FoundGroupContainer; |
---|
234 | typedef FoundGroupContainer::const_iterator FoundGroupCIter; |
---|
235 | typedef FoundGroupContainer::iterator FoundGroupIter; |
---|
236 | |
---|
237 | enum GroupSortCriterion { |
---|
238 | GSC_NONE, // special criterion (not allowed in SortCriteria) |
---|
239 | GSC_REVERSE, // special criterion: modifies previous criterion; may occur multiple times |
---|
240 | GSC_NAME, |
---|
241 | GSC_TREENAME, |
---|
242 | GSC_TREEORDER, // as defined in tree-admin |
---|
243 | GSC_HIT_REASON, |
---|
244 | GSC_NESTING, |
---|
245 | GSC_SIZE, |
---|
246 | GSC_MARKED, |
---|
247 | GSC_MARKED_PC, |
---|
248 | GSC_CLUSTER, |
---|
249 | GSC_AID, // average ingroup distance |
---|
250 | GSC_KEELED, |
---|
251 | }; |
---|
252 | |
---|
253 | typedef std::list<GroupSortCriterion> SortCriteria; |
---|
254 | |
---|
255 | class QueriedGroups { |
---|
256 | FoundGroupContainer found; |
---|
257 | RefPtr<const SortCriteria> sorted_by; |
---|
258 | |
---|
259 | mutable SmartPtr<ColumnWidths> widths; |
---|
260 | void invalidate_widths() const { widths.setNull(); } |
---|
261 | |
---|
262 | public: |
---|
263 | QueriedGroups() : sorted_by(NULp) {} |
---|
264 | |
---|
265 | size_t size() const { return found.size(); } |
---|
266 | bool empty() const { return size() == 0; } |
---|
267 | |
---|
268 | void add_informed_group(const FoundGroup& group) { |
---|
269 | gs_assert(group.knows_details()); |
---|
270 | found.push_back(group); |
---|
271 | invalidate_widths(); |
---|
272 | } |
---|
273 | void add_candidate(const GroupSearch& group_search, Candidate& cand, const std::string& hit_reason); |
---|
274 | |
---|
275 | FoundGroupCIter begin() const { return found.begin(); } |
---|
276 | FoundGroupIter begin() { return found.begin(); } |
---|
277 | FoundGroupCIter end() const { return found.end(); } |
---|
278 | FoundGroupIter end() { return found.end(); } |
---|
279 | |
---|
280 | const FoundGroup& operator[](size_t idx) const { gs_assert(idx<size()); return found.at(idx); } |
---|
281 | FoundGroup& operator[](size_t idx) { gs_assert(idx<size()); return found.at(idx); } |
---|
282 | |
---|
283 | bool erase_deleted(GroupSearchCommon *common); |
---|
284 | bool contains_changed(GroupSearchCommon *common) const; |
---|
285 | |
---|
286 | const ColumnWidths& get_column_widths() const; |
---|
287 | const char *get_group_display(const FoundGroup& g, bool show_tree_name) const; |
---|
288 | |
---|
289 | void sort_by(const SortCriteria& by); |
---|
290 | |
---|
291 | void remove_hit(size_t idx); |
---|
292 | }; |
---|
293 | |
---|
294 | // -------------------------- |
---|
295 | // DupCriteria |
---|
296 | |
---|
297 | enum DupNameCriterionType { |
---|
298 | DNC_WHOLENAME, // match of whole groupname |
---|
299 | DNC_WORDWISE, // wordwise match |
---|
300 | |
---|
301 | // maybe add something more fuzzy? |
---|
302 | }; |
---|
303 | enum DupTreeCriterionType { |
---|
304 | DLC_SAME_TREE, // duplicates need to be in same tree |
---|
305 | DLC_DIFF_TREE, // duplicates need to be in different trees (but: cluster will be extended with groups from same tree) |
---|
306 | DLC_ANYWHERE, // no restriction |
---|
307 | }; |
---|
308 | |
---|
309 | typedef std::set<std::string> WordSet; |
---|
310 | |
---|
311 | class DupCriteria; |
---|
312 | |
---|
313 | // --------------------- |
---|
314 | // GroupSearch |
---|
315 | |
---|
316 | DECLARE_CBTYPE_FVV_AND_BUILDERS(GroupSearchCallback, void, class GroupSearch*); // generates makeGroupSearchCallback |
---|
317 | |
---|
318 | typedef std::set<std::string> TreeNameSet; |
---|
319 | |
---|
320 | enum GroupSearchMode { // bit mask |
---|
321 | GSM_FORGET_EXISTING = 1, // otherwise keep existing results |
---|
322 | GSM_ADD = 2, // otherwise remove |
---|
323 | GSM_MISMATCH = 4, // otherwise target matching |
---|
324 | |
---|
325 | // convenience defs |
---|
326 | GSM_FIND = GSM_FORGET_EXISTING|GSM_ADD, |
---|
327 | GSM_KEEP = 0, |
---|
328 | GSM_REMOVE = GSM_KEEP|GSM_MISMATCH, // has to be XORed |
---|
329 | GSM_MATCH = 0, |
---|
330 | }; |
---|
331 | |
---|
332 | class GroupSearch : virtual Noncopyable { |
---|
333 | GBDATA *gb_main; |
---|
334 | SmartPtr<QueryExpr> query_expr; |
---|
335 | TreeNameSet trees_to_search; |
---|
336 | SmartPtr<QueriedGroups> found; |
---|
337 | GroupSearchCallback redisplay_cb; |
---|
338 | |
---|
339 | SortCriteria order; |
---|
340 | bool sortedByOrder; |
---|
341 | |
---|
342 | SmartPtr<DupCriteria> dups; |
---|
343 | |
---|
344 | static GroupSearchCommon *common; |
---|
345 | |
---|
346 | void sort_results(); |
---|
347 | GB_ERROR clusterDuplicates(); |
---|
348 | |
---|
349 | #if defined(UNIT_TESTS) |
---|
350 | public: |
---|
351 | #endif |
---|
352 | ARB_ERROR collectSpecies(const QueriedGroups& groups, CollectMode cmode, SpeciesNames& species); |
---|
353 | public: |
---|
354 | GroupSearch(GBDATA *gb_main_, const GroupSearchCallback& redisplay_results_cb); |
---|
355 | ~GroupSearch(); |
---|
356 | |
---|
357 | GBDATA *get_gb_main() const { return gb_main; } |
---|
358 | |
---|
359 | // ------------------------- |
---|
360 | // define search: |
---|
361 | |
---|
362 | void addQueryExpression(CriterionOperator op, CriterionType type, CriterionMatch mtype, const char *expression); |
---|
363 | void forgetQExpressions(); |
---|
364 | |
---|
365 | void addSortCriterion(GroupSortCriterion gsc); |
---|
366 | void forgetSortCriteria() { |
---|
367 | order.clear(); |
---|
368 | sortedByOrder = false; |
---|
369 | } |
---|
370 | |
---|
371 | void setDupCriteria(bool listDups, DupNameCriterionType ntype, GB_CASE sens, DupTreeCriterionType ttype, int min_cluster_size); |
---|
372 | void setDupCriteria(bool listDups, DupNameCriterionType ntype, GB_CASE sens, int min_words, const WordSet& ignored_words, const char *wordSeparators, DupTreeCriterionType ttype, int min_cluster_size); |
---|
373 | void setDupCriteria(bool listDups, DupNameCriterionType ntype, GB_CASE sens, int min_words, const char *ignored_words, const char *wordSeparators, DupTreeCriterionType ttype, int min_cluster_size); |
---|
374 | void forgetDupCriteria(); |
---|
375 | |
---|
376 | void setSearchRange(const TreeNameSet& trees_to_search_) { // empty -> search all trees |
---|
377 | trees_to_search = trees_to_search_; |
---|
378 | } |
---|
379 | |
---|
380 | // ---------------- |
---|
381 | // search |
---|
382 | |
---|
383 | void perform_search(GroupSearchMode mode); |
---|
384 | |
---|
385 | GBDATA *get_parent_group(GBDATA *gb_group) const; |
---|
386 | int calc_nesting_level(GBDATA *gb_group) const; |
---|
387 | |
---|
388 | // ------------------------ |
---|
389 | // handle results: |
---|
390 | |
---|
391 | const QueriedGroups& get_results(); |
---|
392 | bool has_results() const { return found.isSet() && !found->empty(); } |
---|
393 | void remove_hit(size_t idx) { found->remove_hit(idx); } |
---|
394 | void forget_results() { found = new QueriedGroups; } |
---|
395 | |
---|
396 | // ---------------------------- |
---|
397 | // actions on results: |
---|
398 | |
---|
399 | GB_ERROR delete_group(size_t idx); |
---|
400 | GB_ERROR delete_found_groups(); |
---|
401 | |
---|
402 | ARB_ERROR rename_group(size_t idx, const char *acisrt); |
---|
403 | ARB_ERROR rename_found_groups(const char *acisrt); |
---|
404 | |
---|
405 | ARB_ERROR fold_group(size_t idx, GroupFoldingMode mode); |
---|
406 | ARB_ERROR fold_found_groups(GroupFoldingMode mode); |
---|
407 | |
---|
408 | ARB_ERROR set_marks_in_group(size_t idx, GroupMarkMode mode); |
---|
409 | ARB_ERROR set_marks_in_found_groups(GroupMarkMode mode, CollectMode cmode); |
---|
410 | |
---|
411 | #if defined(UNIT_TESTS) |
---|
412 | // inspectors: |
---|
413 | const SortCriteria& inspect_order() const { return order; } |
---|
414 | static GroupSearchCommon *get_common() { return common; } |
---|
415 | #endif |
---|
416 | |
---|
417 | // internal (dont use): |
---|
418 | void refresh_results_after_DBchanges(); |
---|
419 | }; |
---|
420 | |
---|
421 | char *GS_calc_resulting_groupname(GBDATA *gb_main, const QueriedGroups& queried, int hit_idx, const char *input_name, const char *acisrt, ARB_ERROR& error); |
---|
422 | |
---|
423 | |
---|
424 | #else |
---|
425 | #error group_search.h included twice |
---|
426 | #endif // GROUP_SEARCH_H |
---|