| 1 | // =============================================================== // |
|---|
| 2 | // // |
|---|
| 3 | // File : NT_tree_cmp.h // |
|---|
| 4 | // Purpose : // |
|---|
| 5 | // // |
|---|
| 6 | // Institute of Microbiology (Technical University Munich) // |
|---|
| 7 | // http://www.arb-home.de/ // |
|---|
| 8 | // // |
|---|
| 9 | // =============================================================== // |
|---|
| 10 | |
|---|
| 11 | #ifndef NT_TREE_CMP_H |
|---|
| 12 | #define NT_TREE_CMP_H |
|---|
| 13 | |
|---|
| 14 | #ifndef ARBDB_BASE_H |
|---|
| 15 | #include <arbdb_base.h> |
|---|
| 16 | #endif |
|---|
| 17 | #ifndef NT_LOCAL_H |
|---|
| 18 | #include "NT_local.h" |
|---|
| 19 | #endif |
|---|
| 20 | |
|---|
| 21 | enum GroupTransferMode { |
|---|
| 22 | COMPARE_TOPOLOGY, // compare inner nodes + write differences to tree remarks |
|---|
| 23 | REMOVE_EXISTING_GROUPS, // transfer groups (removes existing groups) |
|---|
| 24 | KEEP_OLD_NAMES, // transfer groups (keep old groups; combines mismatching names) |
|---|
| 25 | }; |
|---|
| 26 | |
|---|
| 27 | enum GroupsToTransfer { |
|---|
| 28 | XFER_ALL_GROUPS, |
|---|
| 29 | XFER_GROUPS_WITH_MARKED, |
|---|
| 30 | }; |
|---|
| 31 | |
|---|
| 32 | class RSpecSet; |
|---|
| 33 | class TSpecSet; |
|---|
| 34 | class GroupMatchScorer; |
|---|
| 35 | |
|---|
| 36 | class RatioLimits { |
|---|
| 37 | double minRatio; |
|---|
| 38 | double maxRatio; |
|---|
| 39 | |
|---|
| 40 | public: |
|---|
| 41 | RatioLimits() : minRatio(0.0), maxRatio(1.0) {} |
|---|
| 42 | RatioLimits(double lower, double upper) : minRatio(lower), maxRatio(upper) {} |
|---|
| 43 | |
|---|
| 44 | bool insideLimits(double ratio) const { return minRatio <= ratio && ratio <= maxRatio; } |
|---|
| 45 | bool isValid() const { return minRatio>=0.0 && minRatio<=maxRatio && maxRatio<=1.0; } |
|---|
| 46 | }; |
|---|
| 47 | |
|---|
| 48 | class GroupPenalty { |
|---|
| 49 | double penalty; // 0 = perfect match; >0 otherwise |
|---|
| 50 | |
|---|
| 51 | double ingroup_ratio; // how many percent of original groups members are members of new group [0..1] |
|---|
| 52 | double outgroup_ratio; // how many percent of new group are no original group members [0..1] |
|---|
| 53 | |
|---|
| 54 | bool keeled; // @@@ shall be used for #451 |
|---|
| 55 | |
|---|
| 56 | // data copied from unregistered set (TSpecSet; i.e. from source tree when moving groups): |
|---|
| 57 | int groupSize; // size of source group (known members) |
|---|
| 58 | int unknown; // unknown members in source group |
|---|
| 59 | |
|---|
| 60 | static double no_match_penalty() { return LONG_MAX; } |
|---|
| 61 | |
|---|
| 62 | #if defined(ASSERTION_USED) |
|---|
| 63 | bool isRatio(double r) const { return r>=0.0 && r<=1.0; } |
|---|
| 64 | #endif |
|---|
| 65 | |
|---|
| 66 | public: |
|---|
| 67 | GroupPenalty() : |
|---|
| 68 | penalty(no_match_penalty()), |
|---|
| 69 | ingroup_ratio(-1), |
|---|
| 70 | outgroup_ratio(-1), |
|---|
| 71 | keeled(false), |
|---|
| 72 | groupSize(-1), |
|---|
| 73 | unknown(-1) |
|---|
| 74 | {} |
|---|
| 75 | GroupPenalty(double penalty_, double ingroup_ratio_, double outgroup_ratio_, int unreg_groupsize) : |
|---|
| 76 | penalty(penalty_), |
|---|
| 77 | ingroup_ratio(ingroup_ratio_), |
|---|
| 78 | outgroup_ratio(outgroup_ratio_), |
|---|
| 79 | keeled(false), |
|---|
| 80 | groupSize(unreg_groupsize), |
|---|
| 81 | unknown(-1) |
|---|
| 82 | { |
|---|
| 83 | nt_assert(isRatio(ingroup_ratio)); |
|---|
| 84 | nt_assert(isRatio(outgroup_ratio)); |
|---|
| 85 | nt_assert(groupSize>0); |
|---|
| 86 | } |
|---|
| 87 | |
|---|
| 88 | bool doesMatch() const { return penalty != no_match_penalty(); } |
|---|
| 89 | bool betterThan(const GroupPenalty& other) const { return penalty < other.penalty; } |
|---|
| 90 | |
|---|
| 91 | double get_penalty() const { return penalty; } |
|---|
| 92 | bool isPerfectMatch() const { return penalty == 0.0; } |
|---|
| 93 | double get_ingroup_ratio() const { return ingroup_ratio; } |
|---|
| 94 | double get_outgroup_ratio() const { return outgroup_ratio; } |
|---|
| 95 | |
|---|
| 96 | int get_groupsize() const { nt_assert(groupSize>0); return groupSize; } |
|---|
| 97 | int get_unknown() const { nt_assert(unknown>=0); return unknown; } |
|---|
| 98 | |
|---|
| 99 | bool shouldHaveBeenKeeled() const { return keeled; } |
|---|
| 100 | |
|---|
| 101 | static GroupPenalty NoMatch() { return GroupPenalty(); } // syntactic sugar |
|---|
| 102 | |
|---|
| 103 | void mark_as_keeled() { keeled = true; } |
|---|
| 104 | void addPenalty(double p) { |
|---|
| 105 | nt_assert(p>=0.0); |
|---|
| 106 | nt_assert(doesMatch()); // avoid "!doesMatch()"-condition gets destroyed |
|---|
| 107 | penalty += p; |
|---|
| 108 | } |
|---|
| 109 | void registerUnfound(const GroupMatchScorer& scorer, const TSpecSet& tset); |
|---|
| 110 | }; |
|---|
| 111 | |
|---|
| 112 | class GroupMatchScorer { |
|---|
| 113 | RatioLimits ingroup; // only need lower limit (upper should always be 100%) |
|---|
| 114 | RatioLimits outgroup; // only need upper limit (lower should always be 0%) |
|---|
| 115 | |
|---|
| 116 | // Pep: per error penalty (=absolute penalty=traditional penalty scoring) |
|---|
| 117 | double ingroupPep; // penalty for each former member of group (when moved out of group, but still member of tree) |
|---|
| 118 | double outgroupPep; // penalty for each former non-member (when moved into group) |
|---|
| 119 | double unfoundPep; // penalty for each unknown species (when moved into group and not member of source tree) |
|---|
| 120 | |
|---|
| 121 | double keelPenalty; // penalty added for using keeled group |
|---|
| 122 | |
|---|
| 123 | // RelPen: ratio penalty (=relative penalty) |
|---|
| 124 | double ingroupInvRelPen; // factor for (1-GroupPenalty::ingroup_ratio), i.e. penalty rel.to ratio of non-included former group members |
|---|
| 125 | double outgroupRelPen; // factor for ( GroupPenalty::outgroup_ratio), i.e. penalty rel.to ratio of former non-member in new group |
|---|
| 126 | |
|---|
| 127 | bool insideLimits(double ingroupRatio, double outgroupRatio) const { |
|---|
| 128 | return |
|---|
| 129 | ingroup.insideLimits(ingroupRatio) && |
|---|
| 130 | outgroup.insideLimits(outgroupRatio); |
|---|
| 131 | } |
|---|
| 132 | GroupPenalty calcPenalty(long removed, long added, long commonSpecies); |
|---|
| 133 | |
|---|
| 134 | public: |
|---|
| 135 | GroupMatchScorer() : |
|---|
| 136 | ingroupPep(1.0), |
|---|
| 137 | outgroupPep(1.0), |
|---|
| 138 | unfoundPep(0.0001), |
|---|
| 139 | keelPenalty(0.01), |
|---|
| 140 | ingroupInvRelPen(0.0), |
|---|
| 141 | outgroupRelPen(0.0) |
|---|
| 142 | {} |
|---|
| 143 | |
|---|
| 144 | GB_ERROR check_validity() const; |
|---|
| 145 | |
|---|
| 146 | // setup: |
|---|
| 147 | void setLimits(const RatioLimits& ingroupLimits, const RatioLimits& outgroupLimits) { |
|---|
| 148 | nt_assert(ingroupLimits.insideLimits(1.0)); // upper should always be 100% |
|---|
| 149 | nt_assert(outgroupLimits.insideLimits(0.0)); // lower should always be 0% |
|---|
| 150 | |
|---|
| 151 | ingroup = ingroupLimits; |
|---|
| 152 | outgroup = outgroupLimits; |
|---|
| 153 | } |
|---|
| 154 | void setPerErrorPenalties(double ingroup_pep, double outgroup_pep, double unfound_pep) { |
|---|
| 155 | ingroupPep = ingroup_pep; |
|---|
| 156 | outgroupPep = outgroup_pep; |
|---|
| 157 | unfoundPep = unfound_pep; |
|---|
| 158 | } |
|---|
| 159 | void setRelativePenalties(double ingroup_inv_relpen, double outgroup_relpen) { |
|---|
| 160 | ingroupInvRelPen = ingroup_inv_relpen; |
|---|
| 161 | outgroupRelPen = outgroup_relpen; |
|---|
| 162 | } |
|---|
| 163 | |
|---|
| 164 | // scoring: |
|---|
| 165 | GroupPenalty matchGroups(const TSpecSet& sourceSet, const RSpecSet& targetSet, long commonSpecies, long overallSpecies); |
|---|
| 166 | double calcUnknownMembersPenalty(const TSpecSet& sourceSet) const; |
|---|
| 167 | }; |
|---|
| 168 | |
|---|
| 169 | GB_ERROR NTREE_move_tree_info(GBDATA *gb_main, const char *tree_source, const char *tree_dest, const char *log_file, GroupTransferMode mode, GroupsToTransfer what, const GroupMatchScorer& scorer, const char *aci); |
|---|
| 170 | |
|---|
| 171 | #else |
|---|
| 172 | #error NT_tree_cmp.h included twice |
|---|
| 173 | #endif // NT_TREE_CMP_H |
|---|