| 1 | // =============================================================== // |
|---|
| 2 | // // |
|---|
| 3 | // File : ColumnStat.cxx // |
|---|
| 4 | // Purpose : // |
|---|
| 5 | // // |
|---|
| 6 | // Institute of Microbiology (Technical University Munich) // |
|---|
| 7 | // http://www.arb-home.de/ // |
|---|
| 8 | // // |
|---|
| 9 | // =============================================================== // |
|---|
| 10 | |
|---|
| 11 | #include "ColumnStat.hxx" |
|---|
| 12 | #include "awt_sel_boxes.hxx" |
|---|
| 13 | #include "ga_local.h" |
|---|
| 14 | #include <AP_filter.hxx> |
|---|
| 15 | #include <BI_helix.hxx> |
|---|
| 16 | |
|---|
| 17 | #include <aw_root.hxx> |
|---|
| 18 | #include <aw_awar.hxx> |
|---|
| 19 | #include <aw_msg.hxx> |
|---|
| 20 | #include <aw_select.hxx> |
|---|
| 21 | |
|---|
| 22 | #include <arbdbt.h> |
|---|
| 23 | #include <arb_strbuf.h> |
|---|
| 24 | #include <arb_defs.h> |
|---|
| 25 | |
|---|
| 26 | #define SRT_AWARCOLSTAT_NAME "/name=/name" |
|---|
| 27 | #define SRT_AWARCOLSTAT_ALIGNMENT "/name=/alignment" |
|---|
| 28 | #define SRT_AWARCOLSTAT_SMOOTH "/name=/smooth" |
|---|
| 29 | #define SRT_AWARCOLSTAT_ENABLE_HELIX "/name=/enable_helix" |
|---|
| 30 | |
|---|
| 31 | void ColumnStat::refresh_sai_selection_list() { |
|---|
| 32 | GB_transaction ta(gb_main); |
|---|
| 33 | |
|---|
| 34 | freeset(alignment_name, awr->awar(awar_alignment)->read_string()); |
|---|
| 35 | freeset(type_path, GBS_string_eval(alignment_name, "*=*1/_TYPE")); |
|---|
| 36 | |
|---|
| 37 | if (saisel) saisel->refresh(); |
|---|
| 38 | } |
|---|
| 39 | |
|---|
| 40 | static void refresh_columnstat_selection(AW_root*, ColumnStat *column_stat) { |
|---|
| 41 | column_stat->refresh_sai_selection_list(); |
|---|
| 42 | } |
|---|
| 43 | |
|---|
| 44 | ColumnStat::ColumnStat(GBDATA *gb_main_, AW_root *awr_, const char *awar_template, AW_awar *awar_used_alignment) : |
|---|
| 45 | gb_main(gb_main_), |
|---|
| 46 | awr(awr_), |
|---|
| 47 | alignment_name(NULp), |
|---|
| 48 | type_path(NULp), |
|---|
| 49 | saisel(NULp), |
|---|
| 50 | seq_len(0), |
|---|
| 51 | weights(NULp), |
|---|
| 52 | rates(NULp), |
|---|
| 53 | ttratio(NULp), |
|---|
| 54 | is_helix(NULp), |
|---|
| 55 | mut_sum(NULp), |
|---|
| 56 | freq_sum(NULp), |
|---|
| 57 | desc(NULp) |
|---|
| 58 | { |
|---|
| 59 | /* awar_template == ".../name" |
|---|
| 60 | * -> generated ".../alignment" |
|---|
| 61 | * ".../smooth" |
|---|
| 62 | * ".../enable_helix" |
|---|
| 63 | */ |
|---|
| 64 | |
|---|
| 65 | memset(frequency, 0, ARRAY_ELEMS(frequency)*sizeof(*frequency)); |
|---|
| 66 | |
|---|
| 67 | awar_name = GBS_string_eval(awar_template, SRT_AWARCOLSTAT_NAME); |
|---|
| 68 | awar_alignment = GBS_string_eval(awar_template, SRT_AWARCOLSTAT_ALIGNMENT); |
|---|
| 69 | awar_smooth = GBS_string_eval(awar_template, SRT_AWARCOLSTAT_SMOOTH); |
|---|
| 70 | awar_enable_helix = GBS_string_eval(awar_template, SRT_AWARCOLSTAT_ENABLE_HELIX); |
|---|
| 71 | |
|---|
| 72 | ga_assert(strcmp(awar_name, awar_alignment) != 0); // awar_template must end with (or contain) "/name" |
|---|
| 73 | |
|---|
| 74 | awr->awar_string(awar_name, ""); |
|---|
| 75 | awr->awar_string(awar_alignment)->map(awar_used_alignment); |
|---|
| 76 | awr->awar_int(awar_smooth); |
|---|
| 77 | awr->awar_int(awar_enable_helix, 1); |
|---|
| 78 | |
|---|
| 79 | awr->awar(awar_alignment)->add_callback(makeRootCallback(refresh_columnstat_selection, this)); |
|---|
| 80 | refresh_sai_selection_list(); // same as refresh_columnstat_selection(this) |
|---|
| 81 | } |
|---|
| 82 | |
|---|
| 83 | void ColumnStat::forget() { |
|---|
| 84 | delete [] weights; weights = NULp; |
|---|
| 85 | delete [] rates; rates = NULp; |
|---|
| 86 | delete [] ttratio; ttratio = NULp; |
|---|
| 87 | delete [] is_helix; is_helix = NULp; |
|---|
| 88 | delete [] mut_sum; mut_sum = NULp; |
|---|
| 89 | delete [] freq_sum; freq_sum = NULp; |
|---|
| 90 | delete desc; desc = NULp; |
|---|
| 91 | |
|---|
| 92 | for (int i=0; i<256; i++) { |
|---|
| 93 | delete [] frequency[i]; |
|---|
| 94 | frequency[i] = NULp; |
|---|
| 95 | } |
|---|
| 96 | seq_len = 0; // mark invalid |
|---|
| 97 | } |
|---|
| 98 | |
|---|
| 99 | ColumnStat::~ColumnStat() { |
|---|
| 100 | forget(); |
|---|
| 101 | delete awar_name; |
|---|
| 102 | delete awar_alignment; |
|---|
| 103 | delete awar_smooth; |
|---|
| 104 | delete awar_enable_helix; |
|---|
| 105 | } |
|---|
| 106 | |
|---|
| 107 | GB_ERROR ColumnStat::calculate(AP_filter *filter) { |
|---|
| 108 | forget(); // delete previously calculated stats |
|---|
| 109 | |
|---|
| 110 | GB_transaction ta(gb_main); |
|---|
| 111 | GB_ERROR error = filter ? filter->is_invalid() : NULp; |
|---|
| 112 | size_t alignment_length = 0; |
|---|
| 113 | |
|---|
| 114 | if (!error) { |
|---|
| 115 | long alignment_length_l = GBT_get_alignment_len(gb_main, alignment_name); |
|---|
| 116 | if (alignment_length_l <= 1) { |
|---|
| 117 | error = GBS_global_string("Unknown size for alignment '%s'", alignment_name); |
|---|
| 118 | } |
|---|
| 119 | else { |
|---|
| 120 | alignment_length = alignment_length_l; |
|---|
| 121 | if (filter && filter->get_length() != alignment_length) { |
|---|
| 122 | error = GBS_global_string("Incompatible filter (filter=%zu bp, alignment=%zu bp)", |
|---|
| 123 | filter->get_length(), alignment_length); |
|---|
| 124 | } |
|---|
| 125 | } |
|---|
| 126 | } |
|---|
| 127 | seq_len = 0; |
|---|
| 128 | |
|---|
| 129 | if (!error) { |
|---|
| 130 | char *sai_name = awr->awar(awar_name)->read_string(); |
|---|
| 131 | GBDATA *gb_sai = GBT_find_SAI(gb_main, sai_name); |
|---|
| 132 | |
|---|
| 133 | if (!gb_sai) { |
|---|
| 134 | if (strcmp(sai_name, "") != 0) { // no error if SAI name is empty |
|---|
| 135 | error = GBS_global_string("Column statistic: no such SAI: '%s'", sai_name); |
|---|
| 136 | } |
|---|
| 137 | } |
|---|
| 138 | |
|---|
| 139 | if (gb_sai) { |
|---|
| 140 | GBDATA *gb_ali = NULp; |
|---|
| 141 | GBDATA *gb_freqs = NULp; |
|---|
| 142 | if (!error) { |
|---|
| 143 | gb_ali = GB_entry(gb_sai, alignment_name); |
|---|
| 144 | if (!gb_ali) error = GBS_global_string("SAI '%s' does not exist in alignment '%s'", sai_name, alignment_name); |
|---|
| 145 | } |
|---|
| 146 | if (!error) { |
|---|
| 147 | gb_freqs = GB_entry(gb_ali, "FREQUENCIES"); |
|---|
| 148 | if (!gb_freqs) error = GBS_global_string("Column statistic '%s' is invalid (has no FREQUENCIES)", sai_name); |
|---|
| 149 | } |
|---|
| 150 | |
|---|
| 151 | if (!error) { |
|---|
| 152 | seq_len = filter ? filter->get_filtered_length() : alignment_length; |
|---|
| 153 | |
|---|
| 154 | delete [] weights; weights = new GB_UINT4[seq_len]; |
|---|
| 155 | delete [] rates; rates = new float[seq_len]; |
|---|
| 156 | delete [] ttratio; ttratio = new float[seq_len]; |
|---|
| 157 | delete [] is_helix; is_helix = new bool[seq_len]; |
|---|
| 158 | delete [] mut_sum; mut_sum = new GB_UINT4[seq_len]; |
|---|
| 159 | delete [] freq_sum; freq_sum = new GB_UINT4[seq_len]; |
|---|
| 160 | |
|---|
| 161 | delete desc; desc = NULp; |
|---|
| 162 | |
|---|
| 163 | size_t i; |
|---|
| 164 | size_t j; |
|---|
| 165 | for (i=0; i<256; i++) { delete frequency[i]; frequency[i]=NULp; } |
|---|
| 166 | |
|---|
| 167 | long use_helix = awr->awar(awar_enable_helix)->read_int(); |
|---|
| 168 | |
|---|
| 169 | for (i=0; i<seq_len; i++) { // LOOP_VECTORIZED |
|---|
| 170 | is_helix[i] = false; |
|---|
| 171 | weights[i] = 1; |
|---|
| 172 | } |
|---|
| 173 | |
|---|
| 174 | if (!error && use_helix) { // calculate weights and helix filter |
|---|
| 175 | BI_helix helix; |
|---|
| 176 | error = helix.init(this->gb_main, alignment_name); |
|---|
| 177 | if (error) { |
|---|
| 178 | aw_message(error); |
|---|
| 179 | error = NULp; |
|---|
| 180 | goto no_helix; |
|---|
| 181 | } |
|---|
| 182 | error = NULp; |
|---|
| 183 | for (j=i=0; i<alignment_length; i++) { |
|---|
| 184 | if (!filter || filter->use_position(i)) { |
|---|
| 185 | if (helix.pairtype(i) == HELIX_PAIR) { |
|---|
| 186 | is_helix[j] = true; |
|---|
| 187 | weights[j] = 1; |
|---|
| 188 | } |
|---|
| 189 | else { |
|---|
| 190 | is_helix[j] = false; |
|---|
| 191 | weights[j] = 2; |
|---|
| 192 | } |
|---|
| 193 | j++; |
|---|
| 194 | } |
|---|
| 195 | } |
|---|
| 196 | } |
|---|
| 197 | no_helix : |
|---|
| 198 | |
|---|
| 199 | for (i=0; i<seq_len; i++) freq_sum[i] = 0; |
|---|
| 200 | |
|---|
| 201 | GBDATA *gb_freq; |
|---|
| 202 | GB_UINT4 *freqi[256]; |
|---|
| 203 | for (i=0; i<256; i++) freqi[i] = NULp; |
|---|
| 204 | |
|---|
| 205 | int wf; // ********* read the frequency statistic |
|---|
| 206 | for (gb_freq = GB_child(gb_freqs); gb_freq; gb_freq = GB_nextChild(gb_freq)) { |
|---|
| 207 | char *key = GB_read_key(gb_freq); |
|---|
| 208 | if (key[0] == 'N' && key[1] && !key[2]) { |
|---|
| 209 | wf = key[1]; |
|---|
| 210 | freqi[wf] = GB_read_ints(gb_freq); |
|---|
| 211 | } |
|---|
| 212 | free(key); |
|---|
| 213 | } |
|---|
| 214 | |
|---|
| 215 | GB_UINT4 *minmut = NULp; |
|---|
| 216 | GB_UINT4 *transver = NULp; |
|---|
| 217 | GBDATA *gb_minmut = GB_entry(gb_freqs, "TRANSITIONS"); |
|---|
| 218 | GBDATA *gb_transver = GB_entry(gb_freqs, "TRANSVERSIONS"); |
|---|
| 219 | |
|---|
| 220 | if (!gb_minmut) error = GBS_global_string("Column statistic '%s' is invalid (has no FREQUENCIES/TRANSITIONS)", sai_name); |
|---|
| 221 | if (!gb_transver) error = GBS_global_string("Column statistic '%s' is invalid (has no FREQUENCIES/TRANSVERSIONS)", sai_name); |
|---|
| 222 | |
|---|
| 223 | if (gb_minmut) { |
|---|
| 224 | minmut = GB_read_ints(gb_minmut); |
|---|
| 225 | if (!minmut) error = GBS_global_string("Column statistic '%s' is invalid (error reading FREQUENCIES/TRANSITIONS: %s)", sai_name, GB_await_error()); |
|---|
| 226 | } |
|---|
| 227 | if (gb_transver) { |
|---|
| 228 | transver = GB_read_ints(gb_transver); |
|---|
| 229 | if (!transver) error = GBS_global_string("Column statistic '%s' is invalid (error reading FREQUENCIES/TRANSVERSIONS: %s)", sai_name, GB_await_error()); |
|---|
| 230 | } |
|---|
| 231 | |
|---|
| 232 | if (!error) { |
|---|
| 233 | unsigned long max_freq_sum = 0; |
|---|
| 234 | for (wf = 0; wf<256; wf++) { // ********* calculate sum of mutations |
|---|
| 235 | if (!freqi[wf]) continue; // ********* calculate nbases for each col. |
|---|
| 236 | for (j=i=0; i<alignment_length; i++) { |
|---|
| 237 | if (filter && !filter->use_position(i)) continue; |
|---|
| 238 | freq_sum[j] += freqi[wf][i]; |
|---|
| 239 | mut_sum[j] = minmut[i]; |
|---|
| 240 | j++; |
|---|
| 241 | } |
|---|
| 242 | } |
|---|
| 243 | for (i=0; i<seq_len; i++) |
|---|
| 244 | if (freq_sum[i] > max_freq_sum) |
|---|
| 245 | max_freq_sum = freq_sum[i]; |
|---|
| 246 | unsigned long min_freq_sum = DIST_MIN_SEQ(max_freq_sum); |
|---|
| 247 | |
|---|
| 248 | for (wf = 0; wf<256; wf++) { |
|---|
| 249 | if (!freqi[wf]) continue; |
|---|
| 250 | frequency[wf] = new float[seq_len]; |
|---|
| 251 | for (j=i=0; i<alignment_length; i++) { |
|---|
| 252 | if (filter && !filter->use_position(i)) continue; |
|---|
| 253 | if (freq_sum[j] > min_freq_sum) { |
|---|
| 254 | frequency[wf][j] = freqi[wf][i]/(float)freq_sum[j]; |
|---|
| 255 | } |
|---|
| 256 | else { |
|---|
| 257 | frequency[wf][j] = 0; |
|---|
| 258 | weights[j] = 0; |
|---|
| 259 | } |
|---|
| 260 | j++; |
|---|
| 261 | } |
|---|
| 262 | } |
|---|
| 263 | |
|---|
| 264 | for (j=i=0; i<alignment_length; i++) { // ******* calculate rates |
|---|
| 265 | if (filter && !filter->use_position(i)) continue; |
|---|
| 266 | if (!weights[j]) { |
|---|
| 267 | rates[j] = 1.0; |
|---|
| 268 | ttratio[j] = 0.5; |
|---|
| 269 | j++; |
|---|
| 270 | continue; |
|---|
| 271 | } |
|---|
| 272 | rates[j] = (mut_sum[j] / (float)freq_sum[j]); |
|---|
| 273 | if (transver[i] > 0) { |
|---|
| 274 | ttratio[j] = (minmut[i] - transver[i]) / (float)transver[i]; |
|---|
| 275 | } |
|---|
| 276 | else { |
|---|
| 277 | ttratio[j] = 2.0; |
|---|
| 278 | } |
|---|
| 279 | if (ttratio[j] < 0.05) ttratio[j] = 0.05; |
|---|
| 280 | if (ttratio[j] > 5.0) ttratio[j] = 5.0; |
|---|
| 281 | j++; |
|---|
| 282 | } |
|---|
| 283 | |
|---|
| 284 | // ****** normalize rates |
|---|
| 285 | |
|---|
| 286 | double sum_rates = 0; |
|---|
| 287 | for (i=0; i<seq_len; i++) sum_rates += rates[i]; |
|---|
| 288 | sum_rates /= seq_len; |
|---|
| 289 | for (i=0; i<seq_len; i++) rates[i] /= sum_rates; // LOOP_VECTORIZED |
|---|
| 290 | } |
|---|
| 291 | |
|---|
| 292 | free(transver); |
|---|
| 293 | free(minmut); |
|---|
| 294 | for (i=0; i<256; i++) free(freqi[i]); |
|---|
| 295 | } |
|---|
| 296 | } |
|---|
| 297 | free(sai_name); |
|---|
| 298 | } |
|---|
| 299 | |
|---|
| 300 | return error; |
|---|
| 301 | } |
|---|
| 302 | |
|---|
| 303 | void ColumnStat::weight_by_inverseRates() const { |
|---|
| 304 | for (size_t i = 0; i<seq_len; ++i) { |
|---|
| 305 | if (rates[i]>0.0000001) { |
|---|
| 306 | weights[i] *= (int)(2.0 / rates[i]); // before weights is in [0 .. 2] => resulting weights <= 10 million |
|---|
| 307 | } |
|---|
| 308 | } |
|---|
| 309 | } |
|---|
| 310 | |
|---|
| 311 | |
|---|
| 312 | |
|---|
| 313 | void ColumnStat::print() { |
|---|
| 314 | if (seq_len) { |
|---|
| 315 | double sum_rates[2] = { 0.0, 0.0 }; |
|---|
| 316 | double sum_tt[2] = { 0.0, 0.0 }; |
|---|
| 317 | long count[2] = { 0, 0 }; |
|---|
| 318 | |
|---|
| 319 | for (unsigned j=0; j<seq_len; j++) { |
|---|
| 320 | if (weights[j]) { |
|---|
| 321 | fputc(".#"[is_helix[j]], stdout); |
|---|
| 322 | printf("%u: minmut %5i freqs %5i rates %5f tt %5f ", |
|---|
| 323 | j, mut_sum[j], freq_sum[j], rates[j], ttratio[j]); |
|---|
| 324 | for (int wf = 0; wf<256; wf++) { |
|---|
| 325 | if (frequency[wf]) printf("%c:%5f ", wf, frequency[wf][j]); |
|---|
| 326 | } |
|---|
| 327 | sum_rates[is_helix[j]] += rates[j]; |
|---|
| 328 | sum_tt[is_helix[j]] += ttratio[j]; |
|---|
| 329 | count[is_helix[j]]++; |
|---|
| 330 | printf("w: %i\n", weights[j]); |
|---|
| 331 | } |
|---|
| 332 | } |
|---|
| 333 | printf("Helical Rates %5f Non Hel. Rates %5f\n", |
|---|
| 334 | sum_rates[1]/count[1], sum_rates[0]/count[0]); |
|---|
| 335 | printf("Helical TT %5f Non Hel. TT %5f\n", |
|---|
| 336 | sum_tt[1]/count[1], sum_tt[0]/count[0]); |
|---|
| 337 | } |
|---|
| 338 | } |
|---|
| 339 | |
|---|
| 340 | static char *filter_columnstat_SAIs(GBDATA *gb_extended, ColumnStat *column_stat) { |
|---|
| 341 | // returns NULp for non-columnstat SAIs |
|---|
| 342 | char *result = NULp; |
|---|
| 343 | if (column_stat->has_valid_alignment()) { |
|---|
| 344 | GBDATA *gb_type = GB_search(gb_extended, column_stat->get_type_path(), GB_FIND); |
|---|
| 345 | |
|---|
| 346 | if (gb_type) { |
|---|
| 347 | const char *type = GB_read_char_pntr(gb_type); |
|---|
| 348 | |
|---|
| 349 | if (GBS_string_matches(type, "PV?:*", GB_MIND_CASE)) { |
|---|
| 350 | GBS_strstruct *strstruct = GBS_stropen(100); |
|---|
| 351 | |
|---|
| 352 | GBS_strcat(strstruct, GBT_get_name_or_description(gb_extended)); |
|---|
| 353 | GBS_strcat(strstruct, ": <"); |
|---|
| 354 | GBS_strcat(strstruct, type); |
|---|
| 355 | GBS_strcat(strstruct, ">"); |
|---|
| 356 | |
|---|
| 357 | result = GBS_strclose(strstruct); |
|---|
| 358 | } |
|---|
| 359 | } |
|---|
| 360 | } |
|---|
| 361 | return result; |
|---|
| 362 | } |
|---|
| 363 | |
|---|
| 364 | void ColumnStat::create_sai_selection_list(AW_window *aww) { |
|---|
| 365 | GB_transaction ta(gb_main); |
|---|
| 366 | saisel = awt_create_SAI_selection_list(gb_main, aww, awar_name, true, makeSaiSelectionlistFilterCallback(filter_columnstat_SAIs, this)); |
|---|
| 367 | } |
|---|
| 368 | |
|---|
| 369 | void COLSTAT_create_selection_list(AW_window *aws, ColumnStat *column_stat) { |
|---|
| 370 | column_stat->create_sai_selection_list(aws); |
|---|
| 371 | } |
|---|
| 372 | |
|---|
| 373 | AW_window *COLSTAT_create_selection_window(AW_root *aw_root, ColumnStat *column_stat) { |
|---|
| 374 | GB_transaction ta(column_stat->get_gb_main()); |
|---|
| 375 | AW_window_simple *aws = new AW_window_simple; |
|---|
| 376 | aws->init(aw_root, "SELECT_CSP", "Select Column Statistic"); |
|---|
| 377 | aws->load_xfig("awt/col_statistic.fig"); |
|---|
| 378 | |
|---|
| 379 | aws->at("close"); |
|---|
| 380 | aws->callback(AW_POPDOWN); |
|---|
| 381 | aws->create_button("CLOSE", "CLOSE", "C"); |
|---|
| 382 | |
|---|
| 383 | aws->at("help"); aws->callback(makeHelpCallback("awt_csp.hlp")); |
|---|
| 384 | aws->create_button("HELP", "HELP", "H"); |
|---|
| 385 | |
|---|
| 386 | aws->at("box"); |
|---|
| 387 | COLSTAT_create_selection_list(aws, column_stat); |
|---|
| 388 | |
|---|
| 389 | aws->at("smooth"); |
|---|
| 390 | aws->create_toggle_field(column_stat->get_awar_smooth()); |
|---|
| 391 | aws->insert_toggle("Calculate each column (nearly) independently", "D", 0); |
|---|
| 392 | aws->insert_toggle("Smooth parameter estimates a little", "M", 1); |
|---|
| 393 | aws->insert_toggle("Smooth parameter estimates across columns", "S", 2); |
|---|
| 394 | aws->update_toggle_field(); |
|---|
| 395 | |
|---|
| 396 | aws->at("helix"); |
|---|
| 397 | aws->label("Use Helix Information (SAI 'HELIX')"); |
|---|
| 398 | aws->create_toggle(column_stat->get_awar_enable_helix()); |
|---|
| 399 | return aws; |
|---|
| 400 | } |
|---|