| 1 | // =============================================================== // |
|---|
| 2 | // // |
|---|
| 3 | // File : ST_quality.cxx // |
|---|
| 4 | // Purpose : // |
|---|
| 5 | // // |
|---|
| 6 | // Institute of Microbiology (Technical University Munich) // |
|---|
| 7 | // http://www.arb-home.de/ // |
|---|
| 8 | // // |
|---|
| 9 | // =============================================================== // |
|---|
| 10 | |
|---|
| 11 | #include "st_quality.hxx" |
|---|
| 12 | #include "st_ml.hxx" |
|---|
| 13 | #include "MostLikelySeq.hxx" |
|---|
| 14 | |
|---|
| 15 | #include <ColumnStat.hxx> |
|---|
| 16 | #include <BI_helix.hxx> |
|---|
| 17 | #include <AP_filter.hxx> |
|---|
| 18 | #include <arb_progress.h> |
|---|
| 19 | #include <arbdbt.h> |
|---|
| 20 | #include <arb_strbuf.h> |
|---|
| 21 | |
|---|
| 22 | #include <cctype> |
|---|
| 23 | #include <cmath> |
|---|
| 24 | |
|---|
| 25 | // -------------------------- |
|---|
| 26 | // LikelihoodRanges |
|---|
| 27 | |
|---|
| 28 | LikelihoodRanges::LikelihoodRanges(size_t no_of_ranges) { |
|---|
| 29 | ranges = no_of_ranges; |
|---|
| 30 | likelihood = new VarianceSampler[ranges]; |
|---|
| 31 | } |
|---|
| 32 | |
|---|
| 33 | VarianceSampler LikelihoodRanges::summarize_all_ranges() { |
|---|
| 34 | VarianceSampler allRange; |
|---|
| 35 | for (size_t range = 0; range<ranges; ++range) { |
|---|
| 36 | allRange.add(likelihood[range]); |
|---|
| 37 | } |
|---|
| 38 | return allRange; |
|---|
| 39 | } |
|---|
| 40 | |
|---|
| 41 | char *LikelihoodRanges::generate_string(Sampler& t_values) { |
|---|
| 42 | VarianceSampler allRange = summarize_all_ranges(); |
|---|
| 43 | char *report = new char[ranges+1]; |
|---|
| 44 | |
|---|
| 45 | if (allRange.get_count()>1) { |
|---|
| 46 | double all_median = allRange.get_median(); |
|---|
| 47 | double all_variance = allRange.get_variance(all_median); |
|---|
| 48 | double all_std_deviation = sqrt(all_variance); |
|---|
| 49 | |
|---|
| 50 | for (size_t range = 0; range<ranges; ++range) { |
|---|
| 51 | size_t count = likelihood[range].get_count(); |
|---|
| 52 | if (count>1) { |
|---|
| 53 | // perform students-t-test |
|---|
| 54 | |
|---|
| 55 | double standard_error = all_std_deviation / sqrt(count); // standard error of mean |
|---|
| 56 | double median_diff = likelihood[range].get_median() - all_median; |
|---|
| 57 | double t_value = median_diff / standard_error; // predictand of t_value is 0.0 |
|---|
| 58 | |
|---|
| 59 | t_values.add(fabs(t_value)); // sample t_values |
|---|
| 60 | |
|---|
| 61 | double val = 0.7 * t_value; |
|---|
| 62 | |
|---|
| 63 | // val outside [-0.5 .. +0.5] -> "t-test fails" |
|---|
| 64 | // |
|---|
| 65 | // 0.7 * t_value = approx. t_value / 1.428; |
|---|
| 66 | // |
|---|
| 67 | // 1.428 scheint ein fest-kodiertes quantil zu sein |
|---|
| 68 | // (bzw. dessen Haelfte, da ja nur der Range [-0.5 .. +0.5] getestet wird) |
|---|
| 69 | // Das eigentliche Quantil ist also ca. 2.856 |
|---|
| 70 | // |
|---|
| 71 | // Eigentlich haengt das Quantil von der Stichprobengroesse (hier "count", |
|---|
| 72 | // d.h. "Anzahl der Basen im aktuellen Range") ab. |
|---|
| 73 | // (vgl. http://de.wikipedia.org/wiki/T-Verteilung#Ausgew.C3.A4hlte_Quantile_der_t-Verteilung ) |
|---|
| 74 | |
|---|
| 75 | int ival = int (val + .5) + 5; |
|---|
| 76 | |
|---|
| 77 | if (ival > 9) ival = 9; |
|---|
| 78 | if (ival < 0) ival = 0; |
|---|
| 79 | |
|---|
| 80 | report[range] = (ival == 5) ? '-' : '0'+ival; |
|---|
| 81 | } |
|---|
| 82 | else { |
|---|
| 83 | report[range] = '.'; |
|---|
| 84 | } |
|---|
| 85 | } |
|---|
| 86 | } |
|---|
| 87 | else { |
|---|
| 88 | for (size_t range = 0; range<ranges; ++range) { |
|---|
| 89 | report[range] = '.'; |
|---|
| 90 | } |
|---|
| 91 | } |
|---|
| 92 | |
|---|
| 93 | report[ranges] = 0; |
|---|
| 94 | |
|---|
| 95 | return report; |
|---|
| 96 | } |
|---|
| 97 | |
|---|
| 98 | // -------------------------- |
|---|
| 99 | // ColumnQualityInfo |
|---|
| 100 | |
|---|
| 101 | ColumnQualityInfo::ColumnQualityInfo(int seq_len, int bucket_size) |
|---|
| 102 | : stat_half(2) |
|---|
| 103 | , stat_five(5) |
|---|
| 104 | , stat_user(seq_len / bucket_size + 1) |
|---|
| 105 | , stat_cons(2) |
|---|
| 106 | {} |
|---|
| 107 | |
|---|
| 108 | #if defined(WARN_TODO) |
|---|
| 109 | #warning stat_cons unused - implement |
|---|
| 110 | #endif |
|---|
| 111 | |
|---|
| 112 | static void st_ml_add_sequence_part_to_stat(ST_ML *st_ml, ColumnStat */* awt_csp */, |
|---|
| 113 | const char *species_name, int seq_len, int bucket_size, |
|---|
| 114 | GB_HASH *species_to_info_hash, int start, int end) |
|---|
| 115 | { |
|---|
| 116 | AP_tree *node = STAT_find_node_by_name(st_ml, species_name); |
|---|
| 117 | if (node) { |
|---|
| 118 | MostLikelySeq *sml = st_ml->get_ml_vectors(0, node, start, end); |
|---|
| 119 | if (sml) { |
|---|
| 120 | ColumnQualityInfo *info; |
|---|
| 121 | if (start > 0) { |
|---|
| 122 | info = (ColumnQualityInfo *) GBS_read_hash(species_to_info_hash, species_name); |
|---|
| 123 | } |
|---|
| 124 | else { |
|---|
| 125 | info = new ColumnQualityInfo(seq_len, bucket_size); |
|---|
| 126 | GBS_write_hash(species_to_info_hash, species_name, long (info)); |
|---|
| 127 | } |
|---|
| 128 | |
|---|
| 129 | int pos; |
|---|
| 130 | const char *source_sequence = 0; |
|---|
| 131 | int source_sequence_len = 0; |
|---|
| 132 | |
|---|
| 133 | GBDATA *gb_data = sml->get_bound_species_data(); |
|---|
| 134 | if (gb_data) { |
|---|
| 135 | source_sequence_len = GB_read_string_count(gb_data); |
|---|
| 136 | source_sequence = GB_read_char_pntr(gb_data); |
|---|
| 137 | } |
|---|
| 138 | if (end > source_sequence_len) end = source_sequence_len; |
|---|
| 139 | |
|---|
| 140 | ST_base_vector *vec = sml->tmp_out + start; |
|---|
| 141 | for (pos = start; pos < end; vec++, pos++) { |
|---|
| 142 | DNA_Base base = dna_table.char_to_enum(source_sequence[pos]); |
|---|
| 143 | if (base != ST_UNKNOWN && base != ST_GAP) { // don't count gaps |
|---|
| 144 | ST_FLOAT max = vec->max_frequency(); |
|---|
| 145 | |
|---|
| 146 | double val = max / (0.0001 + vec->b[base]); |
|---|
| 147 | double log_val = log(val); |
|---|
| 148 | double seq_rel_pos = double(pos)/seq_len; |
|---|
| 149 | |
|---|
| 150 | info->stat_half.add_relative(seq_rel_pos, log_val); |
|---|
| 151 | info->stat_five.add_relative(seq_rel_pos, log_val); |
|---|
| 152 | info->stat_user.add_relative(seq_rel_pos, log_val); |
|---|
| 153 | } |
|---|
| 154 | } |
|---|
| 155 | } |
|---|
| 156 | } |
|---|
| 157 | } |
|---|
| 158 | |
|---|
| 159 | static GB_ERROR st_ml_add_quality_string_to_species(GBDATA *gb_main, const AP_filter *filter, |
|---|
| 160 | const char *alignment_name, const char *species_name, |
|---|
| 161 | size_t bucket_size, GB_HASH *species_to_info_hash, st_report_enum report, |
|---|
| 162 | const char *dest_field) |
|---|
| 163 | { |
|---|
| 164 | GB_ERROR error = 0; |
|---|
| 165 | GBDATA *gb_species = GBT_find_species(gb_main, species_name); |
|---|
| 166 | if (!gb_species) error = GBS_global_string("Unknown species '%s'", species_name); |
|---|
| 167 | else { |
|---|
| 168 | ColumnQualityInfo *info = (ColumnQualityInfo *) GBS_read_hash(species_to_info_hash, species_name); |
|---|
| 169 | if (!info) error = GBS_global_string("Statistic missing for species '%s'", species_name); |
|---|
| 170 | else { |
|---|
| 171 | GBDATA *gb_dest = GB_search(gb_species, dest_field, GB_STRING); |
|---|
| 172 | if (!gb_dest) error = GB_await_error(); |
|---|
| 173 | else { |
|---|
| 174 | Sampler t_values; // sample t-values here |
|---|
| 175 | char *user_str = info->stat_user.generate_string(t_values); |
|---|
| 176 | { |
|---|
| 177 | char *half_str = info->stat_half.generate_string(t_values); |
|---|
| 178 | char *five_str = info->stat_five.generate_string(t_values); |
|---|
| 179 | |
|---|
| 180 | GBS_strstruct *buffer = GBS_stropen(2*9 + 3*2 + (2+5+info->overall_range_count())); |
|---|
| 181 | |
|---|
| 182 | #if 1 |
|---|
| 183 | GBS_strcat(buffer, GBS_global_string("%8.3f ", t_values.get_median())); |
|---|
| 184 | GBS_strcat(buffer, GBS_global_string("%8.3f ", t_values.get_sum())); |
|---|
| 185 | #else |
|---|
| 186 | #warning t-value summary disabled for test-purposes |
|---|
| 187 | #endif |
|---|
| 188 | |
|---|
| 189 | GBS_chrcat(buffer, 'a'); GBS_strcat(buffer, half_str); GBS_chrcat(buffer, ' '); |
|---|
| 190 | GBS_chrcat(buffer, 'b'); GBS_strcat(buffer, five_str); GBS_chrcat(buffer, ' '); |
|---|
| 191 | GBS_chrcat(buffer, 'c'); GBS_strcat(buffer, user_str); |
|---|
| 192 | |
|---|
| 193 | error = GB_write_string(gb_dest, GBS_mempntr(buffer)); |
|---|
| 194 | GBS_strforget(buffer); |
|---|
| 195 | |
|---|
| 196 | delete [] half_str; |
|---|
| 197 | delete [] five_str; |
|---|
| 198 | } |
|---|
| 199 | |
|---|
| 200 | if (!error && report) { |
|---|
| 201 | // name "quality" handled specially by ../EDIT4/EDB_root_bact.cxx@chimera_check_quality_string |
|---|
| 202 | GBDATA *gb_report = GBT_add_data(gb_species, alignment_name, "quality", GB_STRING); |
|---|
| 203 | |
|---|
| 204 | if (!gb_report) error = GB_await_error(); |
|---|
| 205 | else { |
|---|
| 206 | char *report_str; |
|---|
| 207 | { |
|---|
| 208 | size_t filtered_len = filter->get_filtered_length(); |
|---|
| 209 | report_str = new char[filtered_len + 1]; |
|---|
| 210 | |
|---|
| 211 | for (size_t i = 0; i < filtered_len; i++) { |
|---|
| 212 | report_str[i] = user_str[i / bucket_size]; |
|---|
| 213 | } |
|---|
| 214 | report_str[filtered_len] = 0; |
|---|
| 215 | } |
|---|
| 216 | |
|---|
| 217 | { |
|---|
| 218 | char *blownUp_report = filter->blowup_string(report_str, ' '); |
|---|
| 219 | error = GB_write_string(gb_report, blownUp_report); |
|---|
| 220 | free(blownUp_report); |
|---|
| 221 | } |
|---|
| 222 | if (report == ST_QUALITY_REPORT_TEMP) error = GB_set_temporary(gb_report); |
|---|
| 223 | |
|---|
| 224 | delete [] report_str; |
|---|
| 225 | } |
|---|
| 226 | } |
|---|
| 227 | delete [] user_str; |
|---|
| 228 | } |
|---|
| 229 | } |
|---|
| 230 | } |
|---|
| 231 | |
|---|
| 232 | return error; |
|---|
| 233 | } |
|---|
| 234 | |
|---|
| 235 | static void destroy_ColumnQualityInfo(long cl_info) { |
|---|
| 236 | ColumnQualityInfo *info = (ColumnQualityInfo*)cl_info; |
|---|
| 237 | delete info; |
|---|
| 238 | } |
|---|
| 239 | |
|---|
| 240 | GB_ERROR st_ml_check_sequence_quality(GBDATA *gb_main, const char *tree_name, |
|---|
| 241 | const char *alignment_name, ColumnStat *colstat, const WeightedFilter *weighted_filter, int bucket_size, |
|---|
| 242 | int marked_only, st_report_enum report, const char *dest_field) |
|---|
| 243 | { |
|---|
| 244 | #if defined(WARN_TODO) |
|---|
| 245 | #warning parameter 'alignment_name' can be retrieved from WeightedFilter (as soon as automatic mapping works for filters) |
|---|
| 246 | #endif |
|---|
| 247 | ST_ML st_ml(gb_main); |
|---|
| 248 | GB_ERROR error = st_ml.calc_st_ml(tree_name, alignment_name, 0, marked_only, colstat, weighted_filter); |
|---|
| 249 | arb_progress glob_progress("Sequence Quality Check"); |
|---|
| 250 | |
|---|
| 251 | if (!error) { |
|---|
| 252 | GB_HASH *species2info = GBS_create_dynaval_hash(GBT_get_species_count(gb_main), GB_IGNORE_CASE, destroy_ColumnQualityInfo); |
|---|
| 253 | size_t species_count; |
|---|
| 254 | GB_CSTR *snames = GBT_get_names_of_species_in_tree(st_ml.get_gbt_tree(), &species_count); |
|---|
| 255 | int seq_len = st_ml.get_filtered_length(); |
|---|
| 256 | size_t parts = (seq_len-1)/ST_MAX_SEQ_PART+1; |
|---|
| 257 | |
|---|
| 258 | { |
|---|
| 259 | arb_progress add_progress("Calculating stat", parts*species_count); |
|---|
| 260 | |
|---|
| 261 | for (int pos = 0; pos < seq_len && !error; pos += ST_MAX_SEQ_PART) { |
|---|
| 262 | int end = pos + ST_MAX_SEQ_PART - 1; |
|---|
| 263 | if (end > seq_len) end = seq_len; |
|---|
| 264 | |
|---|
| 265 | for (GB_CSTR *pspecies_name = snames; *pspecies_name && !error; pspecies_name++) { |
|---|
| 266 | st_ml_add_sequence_part_to_stat(&st_ml, colstat, *pspecies_name, seq_len, bucket_size, species2info, pos, end); |
|---|
| 267 | add_progress.inc_and_check_user_abort(error); |
|---|
| 268 | } |
|---|
| 269 | } |
|---|
| 270 | } |
|---|
| 271 | |
|---|
| 272 | if (!error) { |
|---|
| 273 | arb_progress res_progress("Calculating result", species_count); |
|---|
| 274 | const AP_filter *filter = st_ml.get_filter(); |
|---|
| 275 | |
|---|
| 276 | for (GB_CSTR *pspecies_name = snames; *pspecies_name && !error; pspecies_name++) { |
|---|
| 277 | error = st_ml_add_quality_string_to_species(gb_main, filter, alignment_name, *pspecies_name, bucket_size, species2info, report, dest_field); |
|---|
| 278 | res_progress.inc_and_check_user_abort(error); |
|---|
| 279 | } |
|---|
| 280 | } |
|---|
| 281 | |
|---|
| 282 | free(snames); |
|---|
| 283 | GBS_free_hash(species2info); |
|---|
| 284 | } |
|---|
| 285 | |
|---|
| 286 | return error; |
|---|
| 287 | } |
|---|