// =============================================================== //
//                                                                 //
//   File      : ST_quality.cxx                                    //
//   Purpose   :                                                   //
//                                                                 //
//   Institute of Microbiology (Technical University Munich)       //
//   http://www.arb-home.de/                                       //
//                                                                 //
// =============================================================== //
10 | |
---|
11 | #include "st_quality.hxx" |
---|
12 | #include "st_ml.hxx" |
---|
13 | #include "MostLikelySeq.hxx" |
---|
14 | |
---|
15 | #include <ColumnStat.hxx> |
---|
16 | #include <BI_helix.hxx> |
---|
17 | #include <AP_filter.hxx> |
---|
18 | #include <arb_progress.h> |
---|
19 | #include <arbdbt.h> |
---|
20 | #include <arb_strbuf.h> |
---|
21 | |
---|
22 | #include <cctype> |
---|
23 | #include <cmath> |
---|
24 | #include <items.h> |
---|
25 | |
---|
26 | // -------------------------- |
---|
27 | // LikelihoodRanges |
---|
28 | |
---|
29 | LikelihoodRanges::LikelihoodRanges(size_t no_of_ranges) { |
---|
30 | ranges = no_of_ranges; |
---|
31 | likelihood = new VarianceSampler[ranges]; |
---|
32 | } |
---|
33 | |
---|
34 | VarianceSampler LikelihoodRanges::summarize_all_ranges() { |
---|
35 | VarianceSampler allRange; |
---|
36 | for (size_t range = 0; range<ranges; ++range) { |
---|
37 | allRange.add(likelihood[range]); |
---|
38 | } |
---|
39 | return allRange; |
---|
40 | } |
---|
41 | |
---|
42 | char *LikelihoodRanges::generate_string(Sampler& t_values) { |
---|
43 | VarianceSampler allRange = summarize_all_ranges(); |
---|
44 | char *report = new char[ranges+1]; |
---|
45 | |
---|
46 | if (allRange.get_count()>1) { |
---|
47 | double all_median = allRange.get_median(); |
---|
48 | double all_variance = allRange.get_variance(all_median); |
---|
49 | double all_std_deviation = sqrt(all_variance); |
---|
50 | |
---|
51 | for (size_t range = 0; range<ranges; ++range) { |
---|
52 | size_t count = likelihood[range].get_count(); |
---|
53 | if (count>1) { |
---|
54 | // perform students-t-test |
---|
55 | |
---|
56 | double standard_error = all_std_deviation / sqrt(count); // standard error of mean |
---|
57 | double median_diff = likelihood[range].get_median() - all_median; |
---|
58 | double t_value = median_diff / standard_error; // predictand of t_value is 0.0 |
---|
59 | |
---|
60 | t_values.add(fabs(t_value)); // sample t_values |
---|
61 | |
---|
62 | double val = 0.7 * t_value; |
---|
63 | |
---|
64 | // val outside [-0.5 .. +0.5] -> "t-test fails" |
---|
65 | // |
---|
66 | // 0.7 * t_value = approx. t_value / 1.428; |
---|
67 | // |
---|
68 | // 1.428 scheint ein fest-kodiertes quantil zu sein |
---|
69 | // (bzw. dessen Haelfte, da ja nur der Range [-0.5 .. +0.5] getestet wird) |
---|
70 | // Das eigentliche Quantil ist also ca. 2.856 |
---|
71 | // |
---|
72 | // Eigentlich haengt das Quantil von der Stichprobengroesse (hier "count", |
---|
73 | // d.h. "Anzahl der Basen im aktuellen Range") ab. |
---|
74 | // (vgl. http://de.wikipedia.org/wiki/T-Verteilung#Ausgew.C3.A4hlte_Quantile_der_t-Verteilung ) |
---|
75 | |
---|
76 | int ival = int (val + .5) + 5; |
---|
77 | |
---|
78 | if (ival > 9) ival = 9; |
---|
79 | if (ival < 0) ival = 0; |
---|
80 | |
---|
81 | report[range] = (ival == 5) ? '-' : '0'+ival; |
---|
82 | } |
---|
83 | else { |
---|
84 | report[range] = '.'; |
---|
85 | } |
---|
86 | } |
---|
87 | } |
---|
88 | else { |
---|
89 | for (size_t range = 0; range<ranges; ++range) { |
---|
90 | report[range] = '.'; |
---|
91 | } |
---|
92 | } |
---|
93 | |
---|
94 | report[ranges] = 0; |
---|
95 | |
---|
96 | return report; |
---|
97 | } |
---|
98 | |
---|
99 | // -------------------------- |
---|
100 | // ColumnQualityInfo |
---|
101 | |
---|
102 | ColumnQualityInfo::ColumnQualityInfo(int seq_len, int bucket_size) : |
---|
103 | stat_half(2), |
---|
104 | stat_five(5), |
---|
105 | stat_user(seq_len / bucket_size + 1), |
---|
106 | stat_cons(2) // @@@ stat_cons is unused - implement |
---|
107 | {} |
---|
108 | |
---|
109 | |
---|
110 | static void st_ml_add_sequence_part_to_stat(ST_ML *st_ml, ColumnStat */* awt_csp */, |
---|
111 | const char *species_name, int seq_len, int bucket_size, |
---|
112 | GB_HASH *species_to_info_hash, int start, int end) |
---|
113 | { |
---|
114 | AP_tree *node = STAT_find_node_by_name(st_ml, species_name); |
---|
115 | if (node) { |
---|
116 | MostLikelySeq *sml = st_ml->get_ml_vectors(NULp, node, start, end); |
---|
117 | if (sml) { |
---|
118 | ColumnQualityInfo *info; |
---|
119 | if (start > 0) { |
---|
120 | info = (ColumnQualityInfo *) GBS_read_hash(species_to_info_hash, species_name); |
---|
121 | } |
---|
122 | else { |
---|
123 | info = new ColumnQualityInfo(seq_len, bucket_size); |
---|
124 | GBS_write_hash(species_to_info_hash, species_name, long (info)); |
---|
125 | } |
---|
126 | |
---|
127 | int pos; |
---|
128 | const char *source_sequence = NULp; |
---|
129 | int source_sequence_len = 0; |
---|
130 | |
---|
131 | GBDATA *gb_data = sml->get_bound_species_data(); |
---|
132 | if (gb_data) { |
---|
133 | source_sequence_len = GB_read_string_count(gb_data); |
---|
134 | source_sequence = GB_read_char_pntr(gb_data); |
---|
135 | } |
---|
136 | if (end > source_sequence_len) end = source_sequence_len; |
---|
137 | |
---|
138 | ST_base_vector *vec = sml->tmp_out + start; |
---|
139 | for (pos = start; pos < end; vec++, pos++) { |
---|
140 | DNA_Base base = dna_table.char_to_enum(source_sequence[pos]); |
---|
141 | if (base != ST_UNKNOWN && base != ST_GAP) { // don't count gaps |
---|
142 | ST_FLOAT max = vec->max_frequency(); |
---|
143 | |
---|
144 | double val = max / (0.0001 + vec->b[base]); |
---|
145 | double log_val = log(val); |
---|
146 | double seq_rel_pos = double(pos)/seq_len; |
---|
147 | |
---|
148 | info->stat_half.add_relative(seq_rel_pos, log_val); |
---|
149 | info->stat_five.add_relative(seq_rel_pos, log_val); |
---|
150 | info->stat_user.add_relative(seq_rel_pos, log_val); |
---|
151 | } |
---|
152 | } |
---|
153 | } |
---|
154 | } |
---|
155 | } |
---|
156 | |
---|
/*! Write the quality statistic of one species into the database.
 *
 *  Reads the ColumnQualityInfo collected by st_ml_add_sequence_part_to_stat
 *  from 'species_to_info_hash', renders its three range statistics plus a
 *  t-value summary into 'dest_field' of the species. If 'report' is set,
 *  additionally writes a column-aligned report string into the "quality"
 *  entry of the species' alignment data (temporary if ST_QUALITY_REPORT_TEMP).
 *
 *  @return NULp on success, error message otherwise
 */
static GB_ERROR st_ml_add_quality_string_to_species(GBDATA *gb_main, const AP_filter *filter,
                                                    const char *alignment_name, const char *species_name,
                                                    size_t bucket_size, GB_HASH *species_to_info_hash, st_report_enum report,
                                                    const char *dest_field)
{
    GB_ERROR error = NULp;
    GBDATA *gb_species = GBT_find_species(gb_main, species_name);
    if (!gb_species) error = GBS_global_string("Unknown species '%s'", species_name);
    else {
        ColumnQualityInfo *info = (ColumnQualityInfo *) GBS_read_hash(species_to_info_hash, species_name);
        if (!info) error = GBS_global_string("Statistic missing for species '%s'", species_name);
        else {
            GBDATA *gb_dest = GBT_searchOrCreate_itemfield_according_to_changekey(gb_species, dest_field, SPECIES_get_selector().change_key_path);
            if (!gb_dest) error = GB_await_error();
            else {
                Sampler t_values; // sample t-values here
                char *user_str = info->stat_user.generate_string(t_values);
                {
                    char *half_str = info->stat_half.generate_string(t_values);
                    char *five_str = info->stat_five.generate_string(t_values);

                    // initial size estimate only — presumably the buffer grows on
                    // demand (confirm GBS_stropen semantics)
                    GBS_strstruct *buffer = GBS_stropen(2*9 + 3*2 + (2+5+info->overall_range_count()));

#if 1
                    // t-value summary (median and sum of all sampled |t| values)
                    GBS_strcat(buffer, GBS_global_string("%8.3f ", t_values.get_median()));
                    GBS_strcat(buffer, GBS_global_string("%8.3f ", t_values.get_sum()));
#else
#warning t-value summary disabled for test-purposes
#endif

                    // three labelled report sections: a=halves, b=fifths, c=user-defined ranges
                    GBS_chrcat(buffer, 'a'); GBS_strcat(buffer, half_str); GBS_chrcat(buffer, ' ');
                    GBS_chrcat(buffer, 'b'); GBS_strcat(buffer, five_str); GBS_chrcat(buffer, ' ');
                    GBS_chrcat(buffer, 'c'); GBS_strcat(buffer, user_str);

                    error = GB_write_string(gb_dest, GBS_mempntr(buffer));
                    GBS_strforget(buffer);

                    delete [] half_str;
                    delete [] five_str;
                }

                if (!error && report) {
                    // name "quality" handled specially by ../EDIT4/EDB_root_bact.cxx@chimera_check_quality_string
                    GBDATA *gb_report = GBT_add_data(gb_species, alignment_name, "quality", GB_STRING);

                    if (!gb_report) error = GB_await_error();
                    else {
                        char *report_str;
                        {
                            // expand the per-range user string to one character per
                            // filtered alignment column
                            size_t filtered_len = filter->get_filtered_length();
                            report_str = new char[filtered_len + 1];

                            for (size_t i = 0; i < filtered_len; i++) {
                                report_str[i] = user_str[i / bucket_size];
                            }
                            report_str[filtered_len] = 0;
                        }

                        {
                            // map filtered columns back onto full alignment positions
                            // (filtered-out columns are filled with ' ')
                            char *blownUp_report = filter->blowup_string(report_str, ' ');
                            error = GB_write_string(gb_report, blownUp_report);
                            free(blownUp_report);
                        }
                        if (report == ST_QUALITY_REPORT_TEMP) error = GB_set_temporary(gb_report);

                        delete [] report_str;
                    }
                }
                delete [] user_str;
            }
        }
    }

    return error;
}
---|
232 | |
---|
233 | static void destroy_ColumnQualityInfo(long cl_info) { |
---|
234 | ColumnQualityInfo *info = (ColumnQualityInfo*)cl_info; |
---|
235 | delete info; |
---|
236 | } |
---|
237 | |
---|
/*! Calculate sequence quality reports for all species in a tree.
 *
 *  Runs the ST_ML likelihood calculation, collects per-column statistics for
 *  every species (part by part, ST_MAX_SEQ_PART columns at a time) and then
 *  writes a quality string into 'dest_field' of each species (plus an optional
 *  column-aligned report, see st_ml_add_quality_string_to_species).
 *
 *  @param tree_name        tree defining the species set
 *  @param alignment_name   alignment to process
 *  @param colstat          column statistic used by the likelihood calculation
 *  @param weighted_filter  column filter/weights
 *  @param bucket_size      columns per user-defined report range
 *  @param marked_only      restrict calculation to marked species
 *  @param report           whether/how to write the per-column report
 *  @param dest_field       species field receiving the quality string
 *  @return NULp on success, error message otherwise (incl. user abort)
 */
GB_ERROR st_ml_check_sequence_quality(GBDATA *gb_main, const char *tree_name,
                                      const char *alignment_name, ColumnStat *colstat, const WeightedFilter *weighted_filter, int bucket_size,
                                      int marked_only, st_report_enum report, const char *dest_field)
{
    // @@@ parameter 'alignment_name' can be retrieved from WeightedFilter (as soon as automatic mapping works for filters)

    ST_ML st_ml(gb_main);
    GB_ERROR error = st_ml.calc_st_ml(tree_name, alignment_name, NULp, marked_only, colstat, weighted_filter);

    if (!error) {
        // dynaval hash owns the ColumnQualityInfo values (freed via destroy_ColumnQualityInfo)
        GB_HASH *species2info = GBS_create_dynaval_hash(GBT_get_species_count(gb_main), GB_IGNORE_CASE, destroy_ColumnQualityInfo);
        size_t species_count;

        GB_CSTR *snames = GBT_get_names_of_species_in_tree(st_ml.get_gbt_tree(), &species_count);
        int seq_len = st_ml.get_filtered_length();
        size_t parts = (seq_len-1)/ST_MAX_SEQ_PART+1; // number of sequence parts (ceiling division)

        {
            arb_progress add_progress("Calculating stat", parts*species_count);

            for (int pos = 0; pos < seq_len && !error; pos += ST_MAX_SEQ_PART) {
                // NOTE(review): 'end' is used as an EXCLUSIVE bound by
                // st_ml_add_sequence_part_to_stat (loop runs 'pos < end'), so the
                // '-1' here looks like it skips one column per part. Confirm the
                // start/end convention of ST_ML::get_ml_vectors before changing.
                int end = pos + ST_MAX_SEQ_PART - 1;
                if (end > seq_len) end = seq_len;

                for (GB_CSTR *pspecies_name = snames; *pspecies_name && !error; pspecies_name++) {
                    st_ml_add_sequence_part_to_stat(&st_ml, colstat, *pspecies_name, seq_len, bucket_size, species2info, pos, end);
                    add_progress.inc_and_check_user_abort(error);
                }
            }
        }

        if (!error) {
            arb_progress res_progress("Calculating reports", species_count);
            const AP_filter *filter = st_ml.get_filter();

            for (GB_CSTR *pspecies_name = snames; *pspecies_name && !error; pspecies_name++) {
                error = st_ml_add_quality_string_to_species(gb_main, filter, alignment_name, *pspecies_name, bucket_size, species2info, report, dest_field);
                res_progress.inc_and_check_user_abort(error);
            }
        }

        // frees the name array only -- presumably the names themselves are owned
        // by the tree (confirm against GBT_get_names_of_species_in_tree)
        free(snames);
        GBS_free_hash(species2info);
    }

    return error;
}
---|