source: branches/ali/SL/GROUP_SEARCH/group_search.h

Last change on this file was 17110, checked in by westram, 6 years ago
File size: 13.6 KB
Line 
1// ============================================================= //
2//                                                               //
3//   File      : group_search.h                                  //
4//   Purpose   : public interface of group search                //
5//                                                               //
6//   Coded by Ralf Westram (coder@reallysoft.de) in April 2017   //
7//   http://www.arb-home.de/                                     //
8//                                                               //
9// ============================================================= //
10
11#ifndef GROUP_SEARCH_H
12#define GROUP_SEARCH_H
13
14#ifndef LAZY_H
15#include <lazy.h>
16#endif
17#ifndef QUERY_EXPR_H
18#include <query_expr.h>
19#endif
20#ifndef ARBDB_BASE_H
21#include <arbdb_base.h>
22#endif
23#ifndef ARBTOOLS_H
24#include <arbtools.h>
25#endif
26#ifndef ARB_ERROR_H
27#include <arb_error.h>
28#endif
29#ifndef SMARTPTR_H
30#include <smartptr.h>
31#endif
32#ifndef CB_H
33#include <cb.h>
34#endif
35#ifndef _GLIBCXX_VECTOR
36#include <vector>
37#endif
38#ifndef _GLIBCXX_SET
39#include <set>
40#endif
41#ifndef _GLIBCXX_LIST
42#include <list>
43#endif
44#ifndef _GLIBCXX_STRING
45#include <string>
46#endif
47
48#define gs_assert(cond) arb_assert(cond)
49
50class GroupSearchCommon;
51
52enum GroupFoldingMode { // bit mask
53    GFM_EXPAND        = 1,  // expand groups (otherwise collapse)
54    GFM_TOGGLE        = 2,  // toggle state of groups (if set -> bit0 is ignored)
55    GFM_RECURSE       = 4,  // apply to parents                            (use with GFM_EXPAND)
56    GFM_PARENTS_ONLY  = 8,  // only apply to parents, not to listed groups (use with GFM_EXPAND)
57    GFM_COLLAPSE_REST = 16, // collapse all other groups in each tree
58
59    // convenience defs
60    GFM_COLLAPSE           = 0,
61    GFM_EXPANDREC          = GFM_EXPAND|GFM_RECURSE,
62    GFM_EXPANDPARENTS      = GFM_EXPAND|GFM_RECURSE|GFM_PARENTS_ONLY,
63    GFM_EXPANDREC_COLLREST = GFM_EXPANDREC|GFM_COLLAPSE_REST,
64};
65
66enum GroupMarkMode { // same values as ../TREEDISP/TreeCallbacks.cxx@MARK_MODE_LOWER_BITS
67    GMM_UNMARK = 0,
68    GMM_MARK   = 1,
69    GMM_INVERT = 2,
70};
71
72enum CollectMode { INTERSECT, UNITE };
73
74// ----------------------------------------
75//      query expression related types
76
77enum CriterionOperator { // @@@ rename
78    CO_AND,
79    CO_OR,
80    CO_IGNORE,
81};
82enum CriterionMatch { // @@@ rename
83    CM_MATCH,
84    CM_MISMATCH,
85};
86enum CriterionType { // @@@ rename
87    CT_NAME,          // groupname (STRING)
88    CT_PARENT_DIRECT, // groupname of direct parent (STRING)
89    CT_PARENT_ANY,    // groupname of any parent (STRING)
90    CT_PARENT_ALL,    // groupname of all parents (STRING)
91    CT_NESTING_LEVEL, // group-nesting-level (INT)
92    CT_FOLDED,        // folded (INT)
93    CT_SIZE,          // size of group = number of species (INT)
94    CT_MARKED,        // number of marked species (INT)
95    CT_MARKED_PC,     // percentage of marked species (FLOAT)
96    CT_ZOMBIES,       // number of zombies (INT)
97    CT_AID,           // average ingroup distance
98    CT_KEELED,        // keeled state (INT)
99};
100
101// ------------------------
102//      search results
103
104struct ColumnWidths {
105    // Stores widths (or maxvalues for integers) of all
106    // FoundGroup-attributes that might get displayed in result list.
107
108    int name;    // max. width of name
109    int reason;  // max. width of hit_reason
110
111    int max_nesting;    // max. value for nesting
112    int max_size;       // max. value for size
113    int max_marked;     // max. value for marked
114    int max_marked_pc;  // max. value for marked%
115    int max_cluster_id; // max. value of cluster id
116    int max_aid;        // max. value in front of dot
117
118    bool seen_keeled; // any keeled group found?
119
120    ColumnWidths() :
121        name(0),
122        reason(0),
123        max_nesting(0),
124        max_size(0),
125        max_marked(0),
126        max_marked_pc(0),
127        max_cluster_id(0),
128        seen_keeled(false)
129    {}
130
131    static int max2width(const int& i) { // @@@ a bit expensive (better use specialized type for 'max_nesting' and other maximas)
132        // calculate column width from maximum
133        // (Note: fails if negative values are involved)
134        return strlen(GBS_global_string("%i", i));
135    }
136    static int percent(int part, int all) {
137        return int(part*100.0/all+0.5);
138    }
139
140    void track(int wName, int wReason, int nesting, int size, int marked, int clusID, double aid, bool keeled);
141};
142
143class QueriedGroups;
144
145class FoundGroup {
146    // stores/exports info about a group (as found by group search)
147    // - usable w/o tree loaded
148    // - (able to) know details (like nesting, number of marked species, ...)
149
150    RefPtr<GBDATA> gb_group;
151
152    ARB_ERROR set_folded(bool folded);
153    ARB_ERROR set_overlap_folded(bool folded);
154    int get_name_length() const;
155
156    GBDATA *get_overlap_group() const {
157        gs_assert(keeled.has_value()); // means: pointer 'gb_overlap_group' is valid
158        return gb_overlap_group;
159    }
160
161protected:
162    // optional values (needed to be "informed")
163    // - set manually by derivate ('Candidate' in group_search.cxx@inform_group)
164    // - knows_details() returns true if everything is properly set
165    std::string   hit_reason;
166    Lazy<int, -1> nesting;
167    Lazy<int, -1> size;
168    Lazy<int, -1> marked;
169
170    LazyFloat<double> aid;              // average ingroup distance
171
172    Lazy<int, -1>  keeled;
173    RefPtr<GBDATA> gb_overlap_group;    // valid if 'keeled.has_value()'; NULp -> no overlapping group
174
175    int clusterID;
176
177public:
178    explicit FoundGroup(GBDATA *gb_group_) :
179        gb_group(gb_group_),
180        gb_overlap_group(NULp),
181        clusterID(0)
182    {}
183
184    GBDATA *get_pointer() const { return gb_group; }
185    GBDATA *get_tree_data() const;
186
187    const char *get_name() const;
188    const char *get_tree_name() const;
189    int get_tree_order() const;
190    const std::string& get_hit_reason() const { return hit_reason; }
191
192    void set_cluster_id(int id) { gs_assert(id>0); clusterID = id; }
193    void forget_cluster_id() { clusterID = 0; }
194    int get_cluster_id() const { return clusterID; }
195
196    bool overlap_is_folded() const;
197    bool is_folded() const;
198
199    bool knows_details() const { // returns true if group is "informed"
200        return
201            !hit_reason.empty() &&
202            nesting.has_value() &&
203            size.has_value()    &&
204            marked.has_value()  &&
205            keeled.has_value()  &&
206            aid.has_value();
207    }
208    void track_max_widths(ColumnWidths& widths) const;
209
210    // getters for "informed" groups:
211    int get_nesting()   const { gs_assert(knows_details()); return nesting; }
212    int get_size()      const { gs_assert(knows_details()); return size; }
213    int get_marked()    const { gs_assert(knows_details()); return marked; }
214    int get_marked_pc() const { gs_assert(knows_details()); return ColumnWidths::percent(marked, size); }
215    int get_keeled()    const { gs_assert(knows_details()); return keeled; }
216    double get_aid()    const { gs_assert(knows_details()); return aid; }
217
218    bool operator == (const FoundGroup& other) const {
219        return gb_group == other.gb_group;
220    }
221
222    // modify DB:
223    GB_ERROR  delete_from_DB();
224    ARB_ERROR rename_by_ACI(const char *acisrt, const QueriedGroups& results, int hit_idx);
225    ARB_ERROR change_folding(GroupFoldingMode mode);
226};
227
228class Candidate; // a derivate of FoundGroup
229class GroupSearch;
230
231typedef std::set<GBDATA*>                   GBDATAset;
232typedef std::set<std::string>               SpeciesNames;
233typedef std::vector<FoundGroup>             FoundGroupContainer;
234typedef FoundGroupContainer::const_iterator FoundGroupCIter;
235typedef FoundGroupContainer::iterator       FoundGroupIter;
236
237enum GroupSortCriterion {
238    GSC_NONE,      // special criterion (not allowed in SortCriteria)
239    GSC_REVERSE,   // special criterion: modifies previous criterion; may occur multiple times
240    GSC_NAME,
241    GSC_TREENAME,
242    GSC_TREEORDER, // as defined in tree-admin
243    GSC_HIT_REASON,
244    GSC_NESTING,
245    GSC_SIZE,
246    GSC_MARKED,
247    GSC_MARKED_PC,
248    GSC_CLUSTER,
249    GSC_AID, // average ingroup distance
250    GSC_KEELED,
251};
252
253typedef std::list<GroupSortCriterion> SortCriteria;
254
255class QueriedGroups {
256    FoundGroupContainer        found;
257    RefPtr<const SortCriteria> sorted_by;
258
259    mutable SmartPtr<ColumnWidths> widths;
260    void invalidate_widths() const { widths.setNull(); }
261
262public:
263    QueriedGroups() : sorted_by(NULp) {}
264
265    size_t size() const { return found.size(); }
266    bool empty() const { return size() == 0; }
267
268    void add_informed_group(const FoundGroup& group) {
269        gs_assert(group.knows_details());
270        found.push_back(group);
271        invalidate_widths();
272    }
273    void add_candidate(const GroupSearch& group_search, Candidate& cand, const std::string& hit_reason);
274
275    FoundGroupCIter begin() const { return found.begin(); }
276    FoundGroupIter  begin()       { return found.begin(); }
277    FoundGroupCIter end()   const { return found.end(); }
278    FoundGroupIter  end()         { return found.end(); }
279
280    const FoundGroup& operator[](size_t idx) const { gs_assert(idx<size()); return found.at(idx); }
281    FoundGroup&       operator[](size_t idx)       { gs_assert(idx<size()); return found.at(idx); }
282
283    bool erase_deleted(GroupSearchCommon *common);
284    bool contains_changed(GroupSearchCommon *common) const;
285
286    const ColumnWidths& get_column_widths() const;
287    const char *get_group_display(const FoundGroup& g, bool show_tree_name) const;
288
289    void sort_by(const SortCriteria& by);
290
291    void remove_hit(size_t idx);
292};
293
294// --------------------------
295//      DupCriteria
296
297enum DupNameCriterionType {
298    DNC_WHOLENAME, // match of whole groupname
299    DNC_WORDWISE,  // wordwise match
300
301    // maybe add something more fuzzy?
302};
303enum DupTreeCriterionType {
304    DLC_SAME_TREE, // duplicates need to be in same tree
305    DLC_DIFF_TREE, // duplicates need to be in different trees (but: cluster will be extended with groups from same tree)
306    DLC_ANYWHERE,  // no restriction
307};
308
309typedef std::set<std::string> WordSet;
310
311class DupCriteria;
312
313// ---------------------
314//      GroupSearch
315
316DECLARE_CBTYPE_FVV_AND_BUILDERS(GroupSearchCallback, void, class GroupSearch*);   // generates makeGroupSearchCallback
317
318typedef std::set<std::string> TreeNameSet;
319
320enum GroupSearchMode {       // bit mask
321    GSM_FORGET_EXISTING = 1, // otherwise keep existing results
322    GSM_ADD             = 2, // otherwise remove
323    GSM_MISMATCH        = 4, // otherwise target matching
324
325    // convenience defs
326    GSM_FIND   = GSM_FORGET_EXISTING|GSM_ADD,
327    GSM_KEEP   = 0,
328    GSM_REMOVE = GSM_KEEP|GSM_MISMATCH, // has to be XORed
329    GSM_MATCH  = 0,
330};
331
332class GroupSearch : virtual Noncopyable {
333    GBDATA                  *gb_main;
334    SmartPtr<QueryExpr>      query_expr;
335    TreeNameSet              trees_to_search;
336    SmartPtr<QueriedGroups>  found;
337    GroupSearchCallback      redisplay_cb;
338
339    SortCriteria order;
340    bool         sortedByOrder;
341
342    SmartPtr<DupCriteria> dups;
343
344    static GroupSearchCommon *common;
345
346    void sort_results();
347    GB_ERROR clusterDuplicates();
348
349#if defined(UNIT_TESTS)
350public:
351#endif
352    ARB_ERROR collectSpecies(const QueriedGroups& groups, CollectMode cmode, SpeciesNames& species);
353public:
354    GroupSearch(GBDATA *gb_main_, const GroupSearchCallback& redisplay_results_cb);
355    ~GroupSearch();
356
357    GBDATA *get_gb_main() const { return gb_main; }
358
359    // -------------------------
360    //      define search:
361
362    void addQueryExpression(CriterionOperator op, CriterionType type, CriterionMatch mtype, const char *expression);
363    void forgetQExpressions();
364
365    void addSortCriterion(GroupSortCriterion gsc);
366    void forgetSortCriteria() {
367        order.clear();
368        sortedByOrder = false;
369    }
370
371    void setDupCriteria(bool listDups, DupNameCriterionType ntype, GB_CASE sens, DupTreeCriterionType ttype, int min_cluster_size);
372    void setDupCriteria(bool listDups, DupNameCriterionType ntype, GB_CASE sens, int min_words, const WordSet& ignored_words, const char *wordSeparators, DupTreeCriterionType ttype, int min_cluster_size);
373    void setDupCriteria(bool listDups, DupNameCriterionType ntype, GB_CASE sens, int min_words, const char    *ignored_words, const char *wordSeparators, DupTreeCriterionType ttype, int min_cluster_size);
374    void forgetDupCriteria();
375
376    void setSearchRange(const TreeNameSet& trees_to_search_) { // empty -> search all trees
377        trees_to_search = trees_to_search_;
378    }
379
380    // ----------------
381    //      search
382
383    void perform_search(GroupSearchMode mode);
384
385    GBDATA *get_parent_group(GBDATA *gb_group) const;
386    int calc_nesting_level(GBDATA *gb_group) const;
387
388    // ------------------------
389    //      handle results:
390
391    const QueriedGroups& get_results();
392    bool has_results() const { return found.isSet() && !found->empty(); }
393    void remove_hit(size_t idx) { found->remove_hit(idx); }
394    void forget_results() { found = new QueriedGroups; }
395
396    // ----------------------------
397    //      actions on results:
398
399    GB_ERROR delete_group(size_t idx);
400    GB_ERROR delete_found_groups();
401
402    ARB_ERROR rename_group(size_t idx, const char *acisrt);
403    ARB_ERROR rename_found_groups(const char *acisrt);
404
405    ARB_ERROR fold_group(size_t idx, GroupFoldingMode mode);
406    ARB_ERROR fold_found_groups(GroupFoldingMode mode);
407
408    ARB_ERROR set_marks_in_group(size_t idx, GroupMarkMode mode);
409    ARB_ERROR set_marks_in_found_groups(GroupMarkMode mode, CollectMode cmode);
410
411#if defined(UNIT_TESTS)
412    // inspectors:
413    const SortCriteria& inspect_order() const { return order; }
414    static GroupSearchCommon *get_common() { return common; }
415#endif
416
417    // internal (dont use):
418    void refresh_results_after_DBchanges();
419};
420
421char *GS_calc_resulting_groupname(GBDATA *gb_main, const QueriedGroups& queried, int hit_idx, const char *input_name, const char *acisrt, ARB_ERROR& error);
422
423
424#else
425#error group_search.h included twice
426#endif // GROUP_SEARCH_H
Note: See TracBrowser for help on using the repository browser.