| 1 | // =============================================================== // |
|---|
| 2 | // // |
|---|
| 3 | // File : st_ml.hxx // |
|---|
| 4 | // Purpose : // |
|---|
| 5 | // // |
|---|
| 6 | // Institute of Microbiology (Technical University Munich) // |
|---|
| 7 | // http://www.arb-home.de/ // |
|---|
| 8 | // // |
|---|
| 9 | // =============================================================== // |
|---|
| 10 | |
|---|
| 11 | #ifndef ST_ML_HXX |
|---|
| 12 | #define ST_ML_HXX |
|---|
| 13 | |
|---|
| 14 | #ifndef ARBDB_BASE_H |
|---|
| 15 | #include <arbdb_base.h> |
|---|
| 16 | #endif |
|---|
| 17 | #ifndef _GLIBCXX_ALGORITHM |
|---|
| 18 | #include <algorithm> |
|---|
| 19 | #endif |
|---|
| 20 | #ifndef ATTRIBUTES_H |
|---|
| 21 | #include <attributes.h> |
|---|
| 22 | #endif |
|---|
| 23 | #ifndef ARBTOOLS_H |
|---|
| 24 | #include <arbtools.h> |
|---|
| 25 | #endif |
|---|
| 26 | #ifndef CB_H |
|---|
| 27 | #include <cb.h> |
|---|
| 28 | #endif |
|---|
| 29 | |
|---|
| 30 | #define st_assert(cond) arb_assert(cond) |
|---|
| 31 | |
|---|
| 32 | class ColumnStat; |
|---|
| 33 | class MostLikelySeq; |
|---|
| 34 | |
|---|
| 35 | class AW_root; |
|---|
| 36 | class AW_awar; |
|---|
| 37 | class AW_window; |
|---|
| 38 | |
|---|
| 39 | class AP_tree; |
|---|
| 40 | class AP_tree_root; |
|---|
| 41 | class AP_filter; |
|---|
| 42 | class WeightedFilter; |
|---|
| 43 | |
|---|
| 44 | typedef unsigned char ST_ML_Color; |
|---|
| 45 | |
|---|
| 46 | typedef float ST_FLOAT; |
|---|
| 47 | // typedef double ST_FLOAT; // careful, using double has quite an impact on the memory footprint |
|---|
| 48 | |
|---|
| 49 | enum DNA_Base { |
|---|
| 50 | ST_A, |
|---|
| 51 | ST_C, |
|---|
| 52 | ST_G, |
|---|
| 53 | ST_T, |
|---|
| 54 | ST_GAP, |
|---|
| 55 | ST_MAX_BASE, // [can't be changed w/o recoding several parts] |
|---|
| 56 | ST_UNKNOWN = -1 |
|---|
| 57 | }; |
|---|
| 58 | |
|---|
| 59 | struct ST_base_vector { |
|---|
| 60 | ST_FLOAT b[ST_MAX_BASE]; // acgt- |
|---|
| 61 | int ld_lik; |
|---|
| 62 | ST_FLOAT lik; // likelihood = 2^ld_lik * lik * (b[0] + b[1] + b[2] ..) |
|---|
| 63 | |
|---|
| 64 | void setTo(ST_FLOAT freq) { |
|---|
| 65 | for (int base = ST_A; base<ST_MAX_BASE; ++base) { |
|---|
| 66 | b[base] = freq; |
|---|
| 67 | } |
|---|
| 68 | } |
|---|
| 69 | void multiplyWith(ST_FLOAT factor) { |
|---|
| 70 | for (int base = ST_A; base<ST_MAX_BASE; ++base) { |
|---|
| 71 | b[base] *= factor; |
|---|
| 72 | } |
|---|
| 73 | } |
|---|
| 74 | void increaseBy(ST_FLOAT inc) { |
|---|
| 75 | for (int base = ST_A; base<ST_MAX_BASE; ++base) { |
|---|
| 76 | b[base] += inc; |
|---|
| 77 | } |
|---|
| 78 | } |
|---|
| 79 | void makeInverseOf(const ST_base_vector& other, ST_FLOAT other_min_freq) { |
|---|
| 80 | for (int base = ST_A; base<ST_MAX_BASE; ++base) { |
|---|
| 81 | b[base] = other_min_freq / other.b[base]; |
|---|
| 82 | } |
|---|
| 83 | } |
|---|
| 84 | |
|---|
| 85 | void setBase(const ST_base_vector& frequencies, char base); |
|---|
| 86 | |
|---|
| 87 | void check_overflow(); |
|---|
| 88 | void print(); |
|---|
| 89 | |
|---|
| 90 | ST_FLOAT summarize() const { return b[ST_A] + b[ST_C] + b[ST_G] + b[ST_T] + b[ST_GAP]; } |
|---|
| 91 | ST_FLOAT min_frequency() { return std::min(std::min(std::min(b[ST_A], b[ST_C]), std::min(b[ST_G], b[ST_T])), b[ST_GAP]); } |
|---|
| 92 | ST_FLOAT max_frequency() { return std::max(std::max(std::max(b[ST_A], b[ST_C]), std::max(b[ST_G], b[ST_T])), b[ST_GAP]); } |
|---|
| 93 | |
|---|
| 94 | inline ST_base_vector& operator *= (const ST_base_vector& other); |
|---|
| 95 | }; |
|---|
| 96 | |
|---|
| 97 | class ST_rate_matrix { |
|---|
| 98 | // symmetric matrix containing only two different values |
|---|
| 99 | |
|---|
| 100 | ST_FLOAT diag; // value for indices [0][0], [1][1]... |
|---|
| 101 | ST_FLOAT rest; // other indices |
|---|
| 102 | |
|---|
| 103 | public: |
|---|
| 104 | |
|---|
| 105 | ST_rate_matrix() : diag(0.0), rest(0.0) {} |
|---|
| 106 | void set(double dist, double TT_ratio); |
|---|
| 107 | inline void transform(const ST_base_vector& in, ST_base_vector& out) const; |
|---|
| 108 | void print(); |
|---|
| 109 | }; |
|---|
| 110 | |
|---|
| 111 | class ST_ML : virtual Noncopyable { |
|---|
| 112 | |
|---|
| 113 | /* Note: Because we have only limited memory we split the |
|---|
| 114 | * sequence into several ST_MAX_SEQ_PART-sized parts during stat-calculation |
|---|
| 115 | */ |
|---|
| 116 | |
|---|
| 117 | char *alignment_name; |
|---|
| 118 | GB_HASH *hash_2_ap_tree; // hash table to get from name to tree_node |
|---|
| 119 | GB_HASH *keep_species_hash; // temporary hash to find ??? |
|---|
| 120 | int refresh_n; |
|---|
| 121 | int *not_valid; // which columns are valid |
|---|
| 122 | |
|---|
| 123 | AP_tree_root *tree_root; |
|---|
| 124 | int latest_modification; // last mod |
|---|
| 125 | size_t first_pos; // first.. |
|---|
| 126 | size_t last_pos; // .. and last alignment position of current slice |
|---|
| 127 | |
|---|
| 128 | WindowCallbackSimple postcalc_cb; |
|---|
| 129 | AW_window *cb_window; |
|---|
| 130 | |
|---|
| 131 | GBDATA *gb_main; |
|---|
| 132 | |
|---|
| 133 | ColumnStat *column_stat; |
|---|
| 134 | // if column_stat == 0 -> rates and ttratio are owned by ST_ML, |
|---|
| 135 | // otherwise they belong to column_stat |
|---|
| 136 | const float *rates; // column independent |
|---|
| 137 | const float *ttratio; // column independent |
|---|
| 138 | |
|---|
| 139 | ST_base_vector *base_frequencies; // column independent |
|---|
| 140 | ST_base_vector *inv_base_frequencies; // column independent |
|---|
| 141 | |
|---|
| 142 | double max_dist; // max_dist for rate_matrices |
|---|
| 143 | double step_size; // max_dist/step_size matrices |
|---|
| 144 | |
|---|
| 145 | int max_rate_matrices; // number of rate_matrices |
|---|
| 146 | ST_rate_matrix *rate_matrices; // one matrix for each distance |
|---|
| 147 | |
|---|
| 148 | bool is_initialized; |
|---|
| 149 | |
|---|
| 150 | size_t distance_2_matrices_index(int distance) { |
|---|
| 151 | if (distance<0) distance = -distance; // same matrix for neg/pos distance |
|---|
| 152 | |
|---|
| 153 | return distance >= max_rate_matrices |
|---|
| 154 | ? (max_rate_matrices-1) |
|---|
| 155 | : size_t(distance); |
|---|
| 156 | } |
|---|
| 157 | |
|---|
| 158 | MostLikelySeq *getOrCreate_seq(AP_tree *node); |
|---|
| 159 | |
|---|
| 160 | const MostLikelySeq *get_mostlikely_sequence(AP_tree *node); |
|---|
| 161 | void undo_tree(AP_tree *node); // opposite of get_mostlikely_sequence() |
|---|
| 162 | |
|---|
| 163 | void insert_tree_into_hash_rek(AP_tree *node); |
|---|
| 164 | void create_matrices(double max_disti, int nmatrices); |
|---|
| 165 | void create_frequencies(); |
|---|
| 166 | |
|---|
| 167 | static long delete_species(const char *key, long val, void *cd_st_ml); |
|---|
| 168 | |
|---|
| 169 | public: |
|---|
| 170 | |
|---|
| 171 | ST_ML(GBDATA *gb_main); |
|---|
| 172 | ~ST_ML(); |
|---|
| 173 | |
|---|
| 174 | void create_column_statistic(AW_root *awr, const char *awarname, AW_awar *awar_default_alignment); |
|---|
| 175 | ColumnStat *get_column_statistic() { return column_stat; } |
|---|
| 176 | |
|---|
| 177 | GB_ERROR calc_st_ml(const char *tree_name, |
|---|
| 178 | const char *alignment_name, |
|---|
| 179 | const char *species_names, // 0 -> all [marked] species (else species_names is a (char)1 separated list of species) |
|---|
| 180 | int marked_only, |
|---|
| 181 | ColumnStat *colstat, |
|---|
| 182 | const WeightedFilter *weighted_filter) __ATTR__USERESULT; |
|---|
| 183 | |
|---|
| 184 | void cleanup(); |
|---|
| 185 | |
|---|
| 186 | const AP_filter *get_filter() const; |
|---|
| 187 | size_t get_filtered_length() const; |
|---|
| 188 | size_t get_alignment_length() const; |
|---|
| 189 | |
|---|
| 190 | ST_rate_matrix& get_matrix_for(int distance) { |
|---|
| 191 | return rate_matrices[distance_2_matrices_index(distance)]; |
|---|
| 192 | } |
|---|
| 193 | |
|---|
| 194 | GBDATA *get_gb_main() const { return gb_main; } |
|---|
| 195 | const TreeNode *get_gbt_tree() const; |
|---|
| 196 | size_t count_species_in_tree() const; |
|---|
| 197 | |
|---|
| 198 | AP_tree *find_node_by_name(const char *species_name); |
|---|
| 199 | |
|---|
| 200 | void set_postcalc_callback(WindowCallbackSimple postcalc_cb_, AW_window *cb_window_) { |
|---|
| 201 | postcalc_cb = postcalc_cb_; |
|---|
| 202 | cb_window = cb_window_; |
|---|
| 203 | } |
|---|
| 204 | void do_postcalc_callback() { if (postcalc_cb) postcalc_cb(cb_window); } |
|---|
| 205 | |
|---|
| 206 | size_t get_first_pos() const { return first_pos; } |
|---|
| 207 | size_t get_last_pos() const { return last_pos; } |
|---|
| 208 | |
|---|
| 209 | double get_step_size() const { return step_size; } |
|---|
| 210 | |
|---|
| 211 | const ST_base_vector *get_inv_base_frequencies() const { return inv_base_frequencies; } |
|---|
| 212 | const ST_base_vector& get_base_frequency_at(size_t pos) const { return base_frequencies[pos]; } |
|---|
| 213 | float get_rate_at(size_t pos) const { return rates[pos]; } |
|---|
| 214 | |
|---|
| 215 | void set_modified(int *what = NULp); |
|---|
| 216 | void set_refresh(); // set flag for refresh |
|---|
| 217 | |
|---|
| 218 | void clear_all(); // delete all caches |
|---|
| 219 | |
|---|
| 220 | MostLikelySeq *get_ml_vectors(const char *species_name, AP_tree *node, size_t start_ali_pos, size_t end_ali_pos); |
|---|
| 221 | ST_ML_Color *get_color_string(const char *species_name, AP_tree *node, size_t start_ali_pos, size_t end_ali_pos); |
|---|
| 222 | |
|---|
| 223 | bool update_ml_likelihood(char *result[4], int& latest_update, const char *species_name, AP_tree *node); |
|---|
| 224 | |
|---|
| 225 | int refresh_needed(); |
|---|
| 226 | }; |
|---|
| 227 | |
|---|
| 228 | #else |
|---|
| 229 | #error st_ml.hxx included twice |
|---|
| 230 | #endif // ST_ML_HXX |
|---|