| 1 | // =============================================================== // |
|---|
| 2 | // // |
|---|
| 3 | // File : ColumnStat.cxx // |
|---|
| 4 | // Purpose : // |
|---|
| 5 | // // |
|---|
| 6 | // Institute of Microbiology (Technical University Munich) // |
|---|
| 7 | // http://www.arb-home.de/ // |
|---|
| 8 | // // |
|---|
| 9 | // =============================================================== // |
|---|
| 10 | |
|---|
| 11 | #include "ColumnStat.hxx" |
|---|
| 12 | #include "awt_sel_boxes.hxx" |
|---|
| 13 | #include "ga_local.h" |
|---|
| 14 | #include <AP_filter.hxx> |
|---|
| 15 | #include <BI_helix.hxx> |
|---|
| 16 | |
|---|
| 17 | #include <aw_root.hxx> |
|---|
| 18 | #include <aw_awar.hxx> |
|---|
| 19 | #include <aw_msg.hxx> |
|---|
| 20 | #include <aw_select.hxx> |
|---|
| 21 | |
|---|
| 22 | #include <arbdbt.h> |
|---|
| 23 | #include <arb_strbuf.h> |
|---|
| 24 | #include <arb_defs.h> |
|---|
| 25 | |
|---|
| 26 | #define SRT_AWARCOLSTAT_NAME "/name=/name" |
|---|
| 27 | #define SRT_AWARCOLSTAT_ALIGNMENT "/name=/alignment" |
|---|
| 28 | #define SRT_AWARCOLSTAT_SMOOTH "/name=/smooth" |
|---|
| 29 | #define SRT_AWARCOLSTAT_ENABLE_HELIX "/name=/enable_helix" |
|---|
| 30 | |
|---|
| 31 | void ColumnStat::refresh_sai_selection_list() { |
|---|
| 32 | GB_transaction ta(gb_main); |
|---|
| 33 | |
|---|
| 34 | freeset(alignment_name, awr->awar(awar_alignment)->read_string()); |
|---|
| 35 | freeset(type_path, GBS_string_eval(alignment_name, "*=*1/_TYPE")); |
|---|
| 36 | |
|---|
| 37 | if (saisel) saisel->refresh(); |
|---|
| 38 | } |
|---|
| 39 | |
|---|
| 40 | static void refresh_columnstat_selection(AW_root*, ColumnStat *column_stat) { |
|---|
| 41 | column_stat->refresh_sai_selection_list(); |
|---|
| 42 | } |
|---|
| 43 | |
|---|
| 44 | ColumnStat::ColumnStat(GBDATA *gb_main_, AW_root *awr_, const char *awar_template, AW_awar *awar_used_alignment) : |
|---|
| 45 | gb_main(gb_main_), |
|---|
| 46 | awr(awr_), |
|---|
| 47 | alignment_name(NULp), |
|---|
| 48 | type_path(NULp), |
|---|
| 49 | saisel(NULp), |
|---|
| 50 | seq_len(0), |
|---|
| 51 | weights(NULp), |
|---|
| 52 | rates(NULp), |
|---|
| 53 | ttratio(NULp), |
|---|
| 54 | is_helix(NULp), |
|---|
| 55 | mut_sum(NULp), |
|---|
| 56 | freq_sum(NULp), |
|---|
| 57 | desc(NULp) |
|---|
| 58 | { |
|---|
| 59 | /* awar_template == ".../name" |
|---|
| 60 | * -> generated ".../alignment" |
|---|
| 61 | * ".../smooth" |
|---|
| 62 | * ".../enable_helix" |
|---|
| 63 | */ |
|---|
| 64 | |
|---|
| 65 | memset(frequency, 0, ARRAY_ELEMS(frequency)*sizeof(*frequency)); |
|---|
| 66 | |
|---|
| 67 | awar_name = GBS_string_eval(awar_template, SRT_AWARCOLSTAT_NAME); |
|---|
| 68 | awar_alignment = GBS_string_eval(awar_template, SRT_AWARCOLSTAT_ALIGNMENT); |
|---|
| 69 | awar_smooth = GBS_string_eval(awar_template, SRT_AWARCOLSTAT_SMOOTH); |
|---|
| 70 | awar_enable_helix = GBS_string_eval(awar_template, SRT_AWARCOLSTAT_ENABLE_HELIX); |
|---|
| 71 | |
|---|
| 72 | ga_assert(strcmp(awar_name, awar_alignment) != 0); // awar_template must end with (or contain) "/name" |
|---|
| 73 | |
|---|
| 74 | awr->awar_string(awar_name, ""); |
|---|
| 75 | awr->awar_string(awar_alignment)->map(awar_used_alignment); |
|---|
| 76 | awr->awar_int(awar_smooth); |
|---|
| 77 | awr->awar_int(awar_enable_helix, 1); |
|---|
| 78 | |
|---|
| 79 | awr->awar(awar_alignment)->add_callback(makeRootCallback(refresh_columnstat_selection, this)); |
|---|
| 80 | refresh_sai_selection_list(); // same as refresh_columnstat_selection(this) |
|---|
| 81 | } |
|---|
| 82 | |
|---|
| 83 | void ColumnStat::forget() { |
|---|
| 84 | delete [] weights; weights = NULp; |
|---|
| 85 | delete [] rates; rates = NULp; |
|---|
| 86 | delete [] ttratio; ttratio = NULp; |
|---|
| 87 | delete [] is_helix; is_helix = NULp; |
|---|
| 88 | delete [] mut_sum; mut_sum = NULp; |
|---|
| 89 | delete [] freq_sum; freq_sum = NULp; |
|---|
| 90 | delete desc; desc = NULp; |
|---|
| 91 | |
|---|
| 92 | for (int i=0; i<256; i++) { |
|---|
| 93 | delete [] frequency[i]; |
|---|
| 94 | frequency[i] = NULp; |
|---|
| 95 | } |
|---|
| 96 | seq_len = 0; // mark invalid |
|---|
| 97 | } |
|---|
| 98 | |
|---|
| 99 | ColumnStat::~ColumnStat() { |
|---|
| 100 | forget(); |
|---|
| 101 | delete awar_name; |
|---|
| 102 | delete awar_alignment; |
|---|
| 103 | delete awar_smooth; |
|---|
| 104 | delete awar_enable_helix; |
|---|
| 105 | } |
|---|
| 106 | |
|---|
| 107 | GB_ERROR ColumnStat::calculate(AP_filter *filter) { |
|---|
| 108 | forget(); // delete previously calculated stats |
|---|
| 109 | |
|---|
| 110 | GB_transaction ta(gb_main); |
|---|
| 111 | GB_ERROR error = filter ? filter->is_invalid() : NULp; |
|---|
| 112 | size_t alignment_length = 0; |
|---|
| 113 | |
|---|
| 114 | if (!error) { |
|---|
| 115 | long alignment_length_l = GBT_get_alignment_len(gb_main, alignment_name); |
|---|
| 116 | if (alignment_length_l<=0) { |
|---|
| 117 | error = GB_await_error(); |
|---|
| 118 | } |
|---|
| 119 | else if (alignment_length_l == 1) { |
|---|
| 120 | error = GB_append_exportedError(GBS_global_string("bad size for alignment '%s'", alignment_name)); |
|---|
| 121 | } |
|---|
| 122 | else { |
|---|
| 123 | alignment_length = alignment_length_l; |
|---|
| 124 | if (filter && filter->get_length() != alignment_length) { |
|---|
| 125 | error = GBS_global_string("Incompatible filter (filter=%zu bp, alignment=%zu bp)", |
|---|
| 126 | filter->get_length(), alignment_length); |
|---|
| 127 | } |
|---|
| 128 | } |
|---|
| 129 | } |
|---|
| 130 | seq_len = 0; |
|---|
| 131 | |
|---|
| 132 | if (!error) { |
|---|
| 133 | char *sai_name = awr->awar(awar_name)->read_string(); |
|---|
| 134 | GBDATA *gb_sai = GBT_find_SAI(gb_main, sai_name); |
|---|
| 135 | |
|---|
| 136 | if (!gb_sai) { |
|---|
| 137 | if (strcmp(sai_name, "") != 0) { // no error if SAI name is empty |
|---|
| 138 | error = GBS_global_string("Column statistic: no such SAI: '%s'", sai_name); |
|---|
| 139 | } |
|---|
| 140 | } |
|---|
| 141 | |
|---|
| 142 | if (gb_sai) { |
|---|
| 143 | GBDATA *gb_ali = NULp; |
|---|
| 144 | GBDATA *gb_freqs = NULp; |
|---|
| 145 | if (!error) { |
|---|
| 146 | gb_ali = GB_entry(gb_sai, alignment_name); |
|---|
| 147 | if (!gb_ali) error = GBS_global_string("SAI '%s' does not exist in alignment '%s'", sai_name, alignment_name); |
|---|
| 148 | } |
|---|
| 149 | if (!error) { |
|---|
| 150 | gb_freqs = GB_entry(gb_ali, "FREQUENCIES"); |
|---|
| 151 | if (!gb_freqs) error = GBS_global_string("Column statistic '%s' is invalid (has no FREQUENCIES)", sai_name); |
|---|
| 152 | } |
|---|
| 153 | |
|---|
| 154 | if (!error) { |
|---|
| 155 | seq_len = filter ? filter->get_filtered_length() : alignment_length; |
|---|
| 156 | |
|---|
| 157 | delete [] weights; weights = new GB_UINT4[seq_len]; |
|---|
| 158 | delete [] rates; rates = new float[seq_len]; |
|---|
| 159 | delete [] ttratio; ttratio = new float[seq_len]; |
|---|
| 160 | delete [] is_helix; is_helix = new bool[seq_len]; |
|---|
| 161 | delete [] mut_sum; mut_sum = new GB_UINT4[seq_len]; |
|---|
| 162 | delete [] freq_sum; freq_sum = new GB_UINT4[seq_len]; |
|---|
| 163 | |
|---|
| 164 | delete desc; desc = NULp; |
|---|
| 165 | |
|---|
| 166 | size_t i; |
|---|
| 167 | size_t j; |
|---|
| 168 | for (i=0; i<256; i++) { delete frequency[i]; frequency[i]=NULp; } |
|---|
| 169 | |
|---|
| 170 | long use_helix = awr->awar(awar_enable_helix)->read_int(); |
|---|
| 171 | |
|---|
| 172 | for (i=0; i<seq_len; i++) { // LOOP_VECTORIZED |
|---|
| 173 | is_helix[i] = false; |
|---|
| 174 | weights[i] = 1; |
|---|
| 175 | } |
|---|
| 176 | |
|---|
| 177 | if (!error && use_helix) { // calculate weights and helix filter |
|---|
| 178 | BI_helix helix; |
|---|
| 179 | error = helix.init(this->gb_main, alignment_name); |
|---|
| 180 | if (error) { |
|---|
| 181 | aw_message(error); |
|---|
| 182 | error = NULp; |
|---|
| 183 | goto no_helix; |
|---|
| 184 | } |
|---|
| 185 | error = NULp; |
|---|
| 186 | for (j=i=0; i<alignment_length; i++) { |
|---|
| 187 | if (!filter || filter->use_position(i)) { |
|---|
| 188 | if (helix.is_pairpos(i)) { |
|---|
| 189 | is_helix[j] = true; |
|---|
| 190 | weights[j] = 1; |
|---|
| 191 | } |
|---|
| 192 | else { |
|---|
| 193 | is_helix[j] = false; |
|---|
| 194 | weights[j] = 2; |
|---|
| 195 | } |
|---|
| 196 | j++; |
|---|
| 197 | } |
|---|
| 198 | } |
|---|
| 199 | } |
|---|
| 200 | no_helix : |
|---|
| 201 | |
|---|
| 202 | for (i=0; i<seq_len; i++) freq_sum[i] = 0; |
|---|
| 203 | |
|---|
| 204 | GBDATA *gb_freq; |
|---|
| 205 | GB_UINT4 *freqi[256]; |
|---|
| 206 | for (i=0; i<256; i++) freqi[i] = NULp; |
|---|
| 207 | |
|---|
| 208 | int wf; // ********* read the frequency statistic |
|---|
| 209 | for (gb_freq = GB_child(gb_freqs); gb_freq; gb_freq = GB_nextChild(gb_freq)) { |
|---|
| 210 | char *key = GB_read_key(gb_freq); |
|---|
| 211 | if (key[0] == 'N' && key[1] && !key[2]) { |
|---|
| 212 | wf = key[1]; |
|---|
| 213 | freqi[wf] = GB_read_ints(gb_freq); |
|---|
| 214 | } |
|---|
| 215 | free(key); |
|---|
| 216 | } |
|---|
| 217 | |
|---|
| 218 | GB_UINT4 *minmut = NULp; |
|---|
| 219 | GB_UINT4 *transver = NULp; |
|---|
| 220 | GBDATA *gb_minmut = GB_entry(gb_freqs, "TRANSITIONS"); |
|---|
| 221 | GBDATA *gb_transver = GB_entry(gb_freqs, "TRANSVERSIONS"); |
|---|
| 222 | |
|---|
| 223 | if (!gb_minmut) error = GBS_global_string("Column statistic '%s' is invalid (has no FREQUENCIES/TRANSITIONS)", sai_name); |
|---|
| 224 | if (!gb_transver) error = GBS_global_string("Column statistic '%s' is invalid (has no FREQUENCIES/TRANSVERSIONS)", sai_name); |
|---|
| 225 | |
|---|
| 226 | if (gb_minmut) { |
|---|
| 227 | minmut = GB_read_ints(gb_minmut); |
|---|
| 228 | if (!minmut) error = GBS_global_string("Column statistic '%s' is invalid (error reading FREQUENCIES/TRANSITIONS: %s)", sai_name, GB_await_error()); |
|---|
| 229 | } |
|---|
| 230 | if (gb_transver) { |
|---|
| 231 | transver = GB_read_ints(gb_transver); |
|---|
| 232 | if (!transver) error = GBS_global_string("Column statistic '%s' is invalid (error reading FREQUENCIES/TRANSVERSIONS: %s)", sai_name, GB_await_error()); |
|---|
| 233 | } |
|---|
| 234 | |
|---|
| 235 | if (!error) { |
|---|
| 236 | unsigned long max_freq_sum = 0; |
|---|
| 237 | for (wf = 0; wf<256; wf++) { // ********* calculate sum of mutations |
|---|
| 238 | if (!freqi[wf]) continue; // ********* calculate nbases for each col. |
|---|
| 239 | for (j=i=0; i<alignment_length; i++) { |
|---|
| 240 | if (filter && !filter->use_position(i)) continue; |
|---|
| 241 | freq_sum[j] += freqi[wf][i]; |
|---|
| 242 | mut_sum[j] = minmut[i]; |
|---|
| 243 | j++; |
|---|
| 244 | } |
|---|
| 245 | } |
|---|
| 246 | for (i=0; i<seq_len; i++) |
|---|
| 247 | if (freq_sum[i] > max_freq_sum) |
|---|
| 248 | max_freq_sum = freq_sum[i]; |
|---|
| 249 | unsigned long min_freq_sum = DIST_MIN_SEQ(max_freq_sum); |
|---|
| 250 | |
|---|
| 251 | for (wf = 0; wf<256; wf++) { |
|---|
| 252 | if (!freqi[wf]) continue; |
|---|
| 253 | frequency[wf] = new float[seq_len]; |
|---|
| 254 | for (j=i=0; i<alignment_length; i++) { |
|---|
| 255 | if (filter && !filter->use_position(i)) continue; |
|---|
| 256 | if (freq_sum[j] > min_freq_sum) { |
|---|
| 257 | frequency[wf][j] = freqi[wf][i]/(float)freq_sum[j]; |
|---|
| 258 | } |
|---|
| 259 | else { |
|---|
| 260 | frequency[wf][j] = 0; |
|---|
| 261 | weights[j] = 0; |
|---|
| 262 | } |
|---|
| 263 | j++; |
|---|
| 264 | } |
|---|
| 265 | } |
|---|
| 266 | |
|---|
| 267 | for (j=i=0; i<alignment_length; i++) { // ******* calculate rates |
|---|
| 268 | if (filter && !filter->use_position(i)) continue; |
|---|
| 269 | if (!weights[j]) { |
|---|
| 270 | rates[j] = 1.0; |
|---|
| 271 | ttratio[j] = 0.5; |
|---|
| 272 | j++; |
|---|
| 273 | continue; |
|---|
| 274 | } |
|---|
| 275 | rates[j] = (mut_sum[j] / (float)freq_sum[j]); |
|---|
| 276 | if (transver[i] > 0) { |
|---|
| 277 | ttratio[j] = (minmut[i] - transver[i]) / (float)transver[i]; |
|---|
| 278 | } |
|---|
| 279 | else { |
|---|
| 280 | ttratio[j] = 2.0; |
|---|
| 281 | } |
|---|
| 282 | if (ttratio[j] < 0.05) ttratio[j] = 0.05; |
|---|
| 283 | if (ttratio[j] > 5.0) ttratio[j] = 5.0; |
|---|
| 284 | j++; |
|---|
| 285 | } |
|---|
| 286 | |
|---|
| 287 | // ****** normalize rates |
|---|
| 288 | |
|---|
| 289 | double sum_rates = 0; |
|---|
| 290 | for (i=0; i<seq_len; i++) sum_rates += rates[i]; |
|---|
| 291 | sum_rates /= seq_len; |
|---|
| 292 | for (i=0; i<seq_len; i++) rates[i] /= sum_rates; // LOOP_VECTORIZED |
|---|
| 293 | } |
|---|
| 294 | |
|---|
| 295 | free(transver); |
|---|
| 296 | free(minmut); |
|---|
| 297 | for (i=0; i<256; i++) free(freqi[i]); |
|---|
| 298 | } |
|---|
| 299 | } |
|---|
| 300 | free(sai_name); |
|---|
| 301 | } |
|---|
| 302 | |
|---|
| 303 | return error; |
|---|
| 304 | } |
|---|
| 305 | |
|---|
| 306 | void ColumnStat::weight_by_inverseRates() const { |
|---|
| 307 | for (size_t i = 0; i<seq_len; ++i) { |
|---|
| 308 | if (rates[i]>0.0000001) { |
|---|
| 309 | weights[i] *= (int)(2.0 / rates[i]); // before weights is in [0 .. 2] => resulting weights <= 10 million |
|---|
| 310 | } |
|---|
| 311 | } |
|---|
| 312 | } |
|---|
| 313 | |
|---|
| 314 | |
|---|
| 315 | |
|---|
| 316 | void ColumnStat::print() { |
|---|
| 317 | if (seq_len) { |
|---|
| 318 | double sum_rates[2] = { 0.0, 0.0 }; |
|---|
| 319 | double sum_tt[2] = { 0.0, 0.0 }; |
|---|
| 320 | long count[2] = { 0, 0 }; |
|---|
| 321 | |
|---|
| 322 | for (unsigned j=0; j<seq_len; j++) { |
|---|
| 323 | if (weights[j]) { |
|---|
| 324 | fputc(".#"[is_helix[j]], stdout); |
|---|
| 325 | printf("%u: minmut %5i freqs %5i rates %5f tt %5f ", |
|---|
| 326 | j, mut_sum[j], freq_sum[j], rates[j], ttratio[j]); |
|---|
| 327 | for (int wf = 0; wf<256; wf++) { |
|---|
| 328 | if (frequency[wf]) printf("%c:%5f ", wf, frequency[wf][j]); |
|---|
| 329 | } |
|---|
| 330 | sum_rates[is_helix[j]] += rates[j]; |
|---|
| 331 | sum_tt[is_helix[j]] += ttratio[j]; |
|---|
| 332 | count[is_helix[j]]++; |
|---|
| 333 | printf("w: %i\n", weights[j]); |
|---|
| 334 | } |
|---|
| 335 | } |
|---|
| 336 | printf("Helical Rates %5f Non Hel. Rates %5f\n", |
|---|
| 337 | sum_rates[1]/count[1], sum_rates[0]/count[0]); |
|---|
| 338 | printf("Helical TT %5f Non Hel. TT %5f\n", |
|---|
| 339 | sum_tt[1]/count[1], sum_tt[0]/count[0]); |
|---|
| 340 | } |
|---|
| 341 | } |
|---|
| 342 | |
|---|
| 343 | static char *filter_columnstat_SAIs(GBDATA *gb_extended, ColumnStat *column_stat) { |
|---|
| 344 | // returns NULp for non-columnstat SAIs |
|---|
| 345 | char *result = NULp; |
|---|
| 346 | if (column_stat->has_valid_alignment()) { |
|---|
| 347 | GBDATA *gb_type = GB_search(gb_extended, column_stat->get_type_path(), GB_FIND); |
|---|
| 348 | |
|---|
| 349 | if (gb_type) { |
|---|
| 350 | const char *type = GB_read_char_pntr(gb_type); |
|---|
| 351 | |
|---|
| 352 | if (GBS_string_matches(type, "PV?:*", GB_MIND_CASE)) { |
|---|
| 353 | GBS_strstruct buf(100); |
|---|
| 354 | |
|---|
| 355 | buf.cat(GBT_get_name_or_description(gb_extended)); |
|---|
| 356 | buf.cat(": <"); |
|---|
| 357 | buf.cat(type); |
|---|
| 358 | buf.put('>'); |
|---|
| 359 | |
|---|
| 360 | result = buf.release(); |
|---|
| 361 | } |
|---|
| 362 | } |
|---|
| 363 | } |
|---|
| 364 | return result; |
|---|
| 365 | } |
|---|
| 366 | |
|---|
| 367 | void ColumnStat::create_sai_selection_list(AW_window *aww) { |
|---|
| 368 | GB_transaction ta(gb_main); |
|---|
| 369 | saisel = awt_create_SAI_selection_list(gb_main, aww, awar_name, makeSaiSelectionlistFilterCallback(filter_columnstat_SAIs, this)); |
|---|
| 370 | } |
|---|
| 371 | |
|---|
| 372 | void COLSTAT_create_selection_list(AW_window *aws, ColumnStat *column_stat) { |
|---|
| 373 | column_stat->create_sai_selection_list(aws); |
|---|
| 374 | } |
|---|
| 375 | |
|---|
| 376 | AW_window *COLSTAT_create_selection_window(AW_root *aw_root, ColumnStat *column_stat) { |
|---|
| 377 | GB_transaction ta(column_stat->get_gb_main()); |
|---|
| 378 | AW_window_simple *aws = new AW_window_simple; |
|---|
| 379 | aws->init(aw_root, "SELECT_CSP", "Select Column Statistic"); |
|---|
| 380 | aws->load_xfig("awt/col_statistic.fig"); |
|---|
| 381 | |
|---|
| 382 | aws->at("close"); |
|---|
| 383 | aws->callback(AW_POPDOWN); |
|---|
| 384 | aws->create_button("CLOSE", "CLOSE", "C"); |
|---|
| 385 | |
|---|
| 386 | aws->at("help"); aws->callback(makeHelpCallback("awt_csp.hlp")); |
|---|
| 387 | aws->create_button("HELP", "HELP", "H"); |
|---|
| 388 | |
|---|
| 389 | aws->at("box"); |
|---|
| 390 | COLSTAT_create_selection_list(aws, column_stat); |
|---|
| 391 | |
|---|
| 392 | aws->at("smooth"); |
|---|
| 393 | aws->create_toggle_field(column_stat->get_awar_smooth()); |
|---|
| 394 | aws->insert_toggle("Calculate each column (nearly) independently", "D", 0); |
|---|
| 395 | aws->insert_toggle("Smooth parameter estimates a little", "M", 1); |
|---|
| 396 | aws->insert_toggle("Smooth parameter estimates across columns", "S", 2); |
|---|
| 397 | aws->update_toggle_field(); |
|---|
| 398 | |
|---|
| 399 | aws->at("helix"); |
|---|
| 400 | aws->label("Use Helix Information (SAI 'HELIX')"); |
|---|
| 401 | aws->create_toggle(column_stat->get_awar_enable_helix()); |
|---|
| 402 | return aws; |
|---|
| 403 | } |
|---|