1 | // =============================================================== // |
---|
2 | // // |
---|
3 | // File : st_ml.hxx // |
---|
4 | // Purpose : // |
---|
5 | // // |
---|
6 | // Institute of Microbiology (Technical University Munich) // |
---|
7 | // http://www.arb-home.de/ // |
---|
8 | // // |
---|
9 | // =============================================================== // |
---|
10 | |
---|
11 | #ifndef ST_ML_HXX |
---|
12 | #define ST_ML_HXX |
---|
13 | |
---|
14 | #ifndef ARBDB_BASE_H |
---|
15 | #include <arbdb_base.h> |
---|
16 | #endif |
---|
17 | #ifndef _GLIBCXX_ALGORITHM |
---|
18 | #include <algorithm> |
---|
19 | #endif |
---|
20 | #ifndef ATTRIBUTES_H |
---|
21 | #include <attributes.h> |
---|
22 | #endif |
---|
23 | #ifndef ARBTOOLS_H |
---|
24 | #include <arbtools.h> |
---|
25 | #endif |
---|
26 | #ifndef CB_H |
---|
27 | #include <cb.h> |
---|
28 | #endif |
---|
29 | |
---|
30 | #define st_assert(cond) arb_assert(cond) |
---|
31 | |
---|
32 | class ColumnStat; |
---|
33 | class MostLikelySeq; |
---|
34 | |
---|
35 | class AW_root; |
---|
36 | class AW_awar; |
---|
37 | class AW_window; |
---|
38 | |
---|
39 | class AP_tree; |
---|
40 | class AP_tree_root; |
---|
41 | class AP_filter; |
---|
42 | class WeightedFilter; |
---|
43 | |
---|
44 | typedef unsigned char ST_ML_Color; |
---|
45 | |
---|
46 | typedef float ST_FLOAT; |
---|
47 | // typedef double ST_FLOAT; // careful, using double has quite an impact on the memory footprint |
---|
48 | |
---|
49 | enum DNA_Base { |
---|
50 | ST_A, |
---|
51 | ST_C, |
---|
52 | ST_G, |
---|
53 | ST_T, |
---|
54 | ST_GAP, |
---|
55 | ST_MAX_BASE, // [can't be changed w/o recoding several parts] |
---|
56 | ST_UNKNOWN = -1 |
---|
57 | }; |
---|
58 | |
---|
59 | struct ST_base_vector { |
---|
60 | ST_FLOAT b[ST_MAX_BASE]; // acgt- |
---|
61 | int ld_lik; |
---|
62 | ST_FLOAT lik; // likelihood = 2^ld_lik * lik * (b[0] + b[1] + b[2] ..) |
---|
63 | |
---|
64 | void setTo(ST_FLOAT freq) { |
---|
65 | for (int base = ST_A; base<ST_MAX_BASE; ++base) { |
---|
66 | b[base] = freq; |
---|
67 | } |
---|
68 | } |
---|
69 | void multiplyWith(ST_FLOAT factor) { |
---|
70 | for (int base = ST_A; base<ST_MAX_BASE; ++base) { |
---|
71 | b[base] *= factor; |
---|
72 | } |
---|
73 | } |
---|
74 | void increaseBy(ST_FLOAT inc) { |
---|
75 | for (int base = ST_A; base<ST_MAX_BASE; ++base) { |
---|
76 | b[base] += inc; |
---|
77 | } |
---|
78 | } |
---|
79 | void makeInverseOf(const ST_base_vector& other, ST_FLOAT other_min_freq) { |
---|
80 | for (int base = ST_A; base<ST_MAX_BASE; ++base) { |
---|
81 | b[base] = other_min_freq / other.b[base]; |
---|
82 | } |
---|
83 | } |
---|
84 | |
---|
85 | void setBase(const ST_base_vector& frequencies, char base); |
---|
86 | |
---|
87 | void check_overflow(); |
---|
88 | void print(); |
---|
89 | |
---|
90 | ST_FLOAT summarize() const { return b[ST_A] + b[ST_C] + b[ST_G] + b[ST_T] + b[ST_GAP]; } |
---|
91 | ST_FLOAT min_frequency() { return std::min(std::min(std::min(b[ST_A], b[ST_C]), std::min(b[ST_G], b[ST_T])), b[ST_GAP]); } |
---|
92 | ST_FLOAT max_frequency() { return std::max(std::max(std::max(b[ST_A], b[ST_C]), std::max(b[ST_G], b[ST_T])), b[ST_GAP]); } |
---|
93 | |
---|
94 | inline ST_base_vector& operator *= (const ST_base_vector& other); |
---|
95 | }; |
---|
96 | |
---|
97 | class ST_rate_matrix { |
---|
98 | // symmetric matrix containing only two different values |
---|
99 | |
---|
100 | ST_FLOAT diag; // value for indices [0][0], [1][1]... |
---|
101 | ST_FLOAT rest; // other indices |
---|
102 | |
---|
103 | public: |
---|
104 | |
---|
105 | ST_rate_matrix() : diag(0.0), rest(0.0) {} |
---|
106 | void set(double dist, double TT_ratio); |
---|
107 | inline void transform(const ST_base_vector& in, ST_base_vector& out) const; |
---|
108 | void print(); |
---|
109 | }; |
---|
110 | |
---|
111 | class ST_ML : virtual Noncopyable { |
---|
112 | |
---|
113 | /* Note: Because we have only limited memory we split the |
---|
114 | * sequence into several ST_MAX_SEQ_PART-sized parts during stat-calculation |
---|
115 | */ |
---|
116 | |
---|
117 | char *alignment_name; |
---|
118 | GB_HASH *hash_2_ap_tree; // hash table to get from name to tree_node |
---|
119 | GB_HASH *keep_species_hash; // temporary hash to find ??? |
---|
120 | int refresh_n; |
---|
121 | int *not_valid; // which columns are valid |
---|
122 | |
---|
123 | AP_tree_root *tree_root; |
---|
124 | int latest_modification; // last mod |
---|
125 | size_t first_pos; // first.. |
---|
126 | size_t last_pos; // .. and last alignment position of current slice |
---|
127 | |
---|
128 | WindowCallbackSimple postcalc_cb; |
---|
129 | AW_window *cb_window; |
---|
130 | |
---|
131 | GBDATA *gb_main; |
---|
132 | |
---|
133 | ColumnStat *column_stat; |
---|
134 | // if column_stat == 0 -> rates and ttratio are owned by ST_ML, |
---|
135 | // otherwise they belong to column_stat |
---|
136 | const float *rates; // column independent |
---|
137 | const float *ttratio; // column independent |
---|
138 | |
---|
139 | ST_base_vector *base_frequencies; // column independent |
---|
140 | ST_base_vector *inv_base_frequencies; // column independent |
---|
141 | |
---|
142 | double max_dist; // max_dist for rate_matrices |
---|
143 | double step_size; // max_dist/step_size matrices |
---|
144 | |
---|
145 | int max_rate_matrices; // number of rate_matrices |
---|
146 | ST_rate_matrix *rate_matrices; // one matrix for each distance |
---|
147 | |
---|
148 | bool is_initialized; |
---|
149 | |
---|
150 | size_t distance_2_matrices_index(int distance) { |
---|
151 | if (distance<0) distance = -distance; // same matrix for neg/pos distance |
---|
152 | |
---|
153 | return distance >= max_rate_matrices |
---|
154 | ? (max_rate_matrices-1) |
---|
155 | : size_t(distance); |
---|
156 | } |
---|
157 | |
---|
158 | MostLikelySeq *getOrCreate_seq(AP_tree *node); |
---|
159 | |
---|
160 | const MostLikelySeq *get_mostlikely_sequence(AP_tree *node); |
---|
161 | void undo_tree(AP_tree *node); // opposite of get_mostlikely_sequence() |
---|
162 | |
---|
163 | void insert_tree_into_hash_rek(AP_tree *node); |
---|
164 | void create_matrices(double max_disti, int nmatrices); |
---|
165 | void create_frequencies(); |
---|
166 | |
---|
167 | static long delete_species(const char *key, long val, void *cd_st_ml); |
---|
168 | |
---|
169 | public: |
---|
170 | |
---|
171 | ST_ML(GBDATA *gb_main); |
---|
172 | ~ST_ML(); |
---|
173 | |
---|
174 | void create_column_statistic(AW_root *awr, const char *awarname, AW_awar *awar_default_alignment); |
---|
175 | ColumnStat *get_column_statistic() { return column_stat; } |
---|
176 | |
---|
177 | GB_ERROR calc_st_ml(const char *tree_name, |
---|
178 | const char *alignment_name, |
---|
179 | const char *species_names, // 0 -> all [marked] species (else species_names is a (char)1 separated list of species) |
---|
180 | int marked_only, |
---|
181 | ColumnStat *colstat, |
---|
182 | const WeightedFilter *weighted_filter) __ATTR__USERESULT; |
---|
183 | |
---|
184 | void cleanup(); |
---|
185 | |
---|
186 | const AP_filter *get_filter() const; |
---|
187 | size_t get_filtered_length() const; |
---|
188 | size_t get_alignment_length() const; |
---|
189 | |
---|
190 | ST_rate_matrix& get_matrix_for(int distance) { |
---|
191 | return rate_matrices[distance_2_matrices_index(distance)]; |
---|
192 | } |
---|
193 | |
---|
194 | GBDATA *get_gb_main() const { return gb_main; } |
---|
195 | const TreeNode *get_gbt_tree() const; |
---|
196 | size_t count_species_in_tree() const; |
---|
197 | |
---|
198 | AP_tree *find_node_by_name(const char *species_name); |
---|
199 | |
---|
200 | void set_postcalc_callback(WindowCallbackSimple postcalc_cb_, AW_window *cb_window_) { |
---|
201 | postcalc_cb = postcalc_cb_; |
---|
202 | cb_window = cb_window_; |
---|
203 | } |
---|
204 | void do_postcalc_callback() { if (postcalc_cb) postcalc_cb(cb_window); } |
---|
205 | |
---|
206 | size_t get_first_pos() const { return first_pos; } |
---|
207 | size_t get_last_pos() const { return last_pos; } |
---|
208 | |
---|
209 | double get_step_size() const { return step_size; } |
---|
210 | |
---|
211 | const ST_base_vector *get_inv_base_frequencies() const { return inv_base_frequencies; } |
---|
212 | const ST_base_vector& get_base_frequency_at(size_t pos) const { return base_frequencies[pos]; } |
---|
213 | float get_rate_at(size_t pos) const { return rates[pos]; } |
---|
214 | |
---|
215 | void set_modified(int *what = NULp); |
---|
216 | void set_refresh(); // set flag for refresh |
---|
217 | |
---|
218 | void clear_all(); // delete all caches |
---|
219 | |
---|
220 | MostLikelySeq *get_ml_vectors(const char *species_name, AP_tree *node, size_t start_ali_pos, size_t end_ali_pos); |
---|
221 | ST_ML_Color *get_color_string(const char *species_name, AP_tree *node, size_t start_ali_pos, size_t end_ali_pos); |
---|
222 | |
---|
223 | bool update_ml_likelihood(char *result[4], int& latest_update, const char *species_name, AP_tree *node); |
---|
224 | |
---|
225 | int refresh_needed(); |
---|
226 | }; |
---|
227 | |
---|
228 | #else |
---|
229 | #error st_ml.hxx included twice |
---|
230 | #endif // ST_ML_HXX |
---|