| 1 | // =============================================================== // |
|---|
| 2 | // // |
|---|
| 3 | // File : AP_pos_var.cxx // |
|---|
| 4 | // Purpose : provides PVP calculation // |
|---|
| 5 | // // |
|---|
| 6 | // Institute of Microbiology (Technical University Munich) // |
|---|
| 7 | // http://www.arb-home.de/ // |
|---|
| 8 | // // |
|---|
| 9 | // =============================================================== // |
|---|
| 10 | |
|---|
| 11 | #include "AP_pos_var.h" |
|---|
| 12 | |
|---|
| 13 | #include <AP_pro_a_nucs.hxx> |
|---|
| 14 | #include <TreeNode.h> |
|---|
| 15 | |
|---|
| 16 | #include <arb_progress.h> |
|---|
| 17 | #include <arb_strbuf.h> |
|---|
| 18 | |
|---|
| 19 | #include <cctype> |
|---|
| 20 | |
|---|
| 21 | #define ap_assert(cond) arb_assert(cond) |
|---|
| 22 | |
|---|
| 23 | AP_pos_var::AP_pos_var(GBDATA *gb_main_, const char *ali_name_, long ali_len_, bool is_nuc_, const char *tree_name_) : |
|---|
| 24 | gb_main(gb_main_), |
|---|
| 25 | treeLeafs(0), |
|---|
| 26 | treeNodes(0), |
|---|
| 27 | progress(NULp), |
|---|
| 28 | transitions(NULp), |
|---|
| 29 | transversions(NULp), |
|---|
| 30 | ali_len(ali_len_), |
|---|
| 31 | ali_name(ARB_strdup(ali_name_)), |
|---|
| 32 | tree_name(ARB_strdup(tree_name_)), |
|---|
| 33 | is_nuc(is_nuc_) |
|---|
| 34 | { |
|---|
| 35 | for (int i=0; i<256; i++) { |
|---|
| 36 | frequencies[i] = NULp; |
|---|
| 37 | char_2_freq[i] = 0; |
|---|
| 38 | char_2_transition[i] = 0; |
|---|
| 39 | char_2_transition[i] = 0; |
|---|
| 40 | } |
|---|
| 41 | } |
|---|
| 42 | |
|---|
| 43 | AP_pos_var::~AP_pos_var() { |
|---|
| 44 | delete progress; |
|---|
| 45 | free(ali_name); |
|---|
| 46 | free(tree_name); |
|---|
| 47 | free(transitions); |
|---|
| 48 | free(transversions); |
|---|
| 49 | for (int i=0; i<256; i++) free(frequencies[i]); |
|---|
| 50 | } |
|---|
| 51 | |
|---|
| 52 | const char *AP_pos_var::parsimony(TreeNode *tree, GB_UINT4 *bases, GB_UINT4 *tbases) { |
|---|
| 53 | GB_ERROR error = NULp; |
|---|
| 54 | |
|---|
| 55 | if (tree->is_leaf()) { |
|---|
| 56 | if (tree->gb_node) { |
|---|
| 57 | GBDATA *gb_data = GBT_find_sequence(tree->gb_node, ali_name); |
|---|
| 58 | if (gb_data) { |
|---|
| 59 | size_t seq_len = ali_len; |
|---|
| 60 | if (GB_read_string_count(gb_data) < seq_len) { |
|---|
| 61 | seq_len = GB_read_string_count(gb_data); |
|---|
| 62 | } |
|---|
| 63 | |
|---|
| 64 | unsigned char *sequence = (unsigned char*)GB_read_char_pntr(gb_data); |
|---|
| 65 | for (size_t i = 0; i< seq_len; i++) { |
|---|
| 66 | long L = char_2_freq[sequence[i]]; |
|---|
| 67 | if (L) { |
|---|
| 68 | ap_assert(frequencies[L]); |
|---|
| 69 | frequencies[L][i]++; |
|---|
| 70 | } |
|---|
| 71 | } |
|---|
| 72 | |
|---|
| 73 | if (bases) { |
|---|
| 74 | for (size_t i = 0; i< seq_len; i++) bases[i] = char_2_transition[sequence[i]]; |
|---|
| 75 | } |
|---|
| 76 | if (tbases) { |
|---|
| 77 | for (size_t i = 0; i< seq_len; i++) tbases[i] = char_2_transversion[sequence[i]]; |
|---|
| 78 | } |
|---|
| 79 | } |
|---|
| 80 | } |
|---|
| 81 | } |
|---|
| 82 | else { |
|---|
| 83 | GB_UINT4 *ls = ARB_calloc<GB_UINT4>(ali_len); |
|---|
| 84 | GB_UINT4 *rs = ARB_calloc<GB_UINT4>(ali_len); |
|---|
| 85 | GB_UINT4 *lts = ARB_calloc<GB_UINT4>(ali_len); |
|---|
| 86 | GB_UINT4 *rts = ARB_calloc<GB_UINT4>(ali_len); |
|---|
| 87 | |
|---|
| 88 | if (!error) error = this->parsimony(tree->get_leftson(), ls, lts); |
|---|
| 89 | if (!error) error = this->parsimony(tree->get_rightson(), rs, rts); |
|---|
| 90 | if (!error) { |
|---|
| 91 | for (long i=0; i< ali_len; i++) { |
|---|
| 92 | long l = ls[i]; |
|---|
| 93 | long r = rs[i]; |
|---|
| 94 | if (l & r) { |
|---|
| 95 | if (bases) bases[i] = l&r; |
|---|
| 96 | } |
|---|
| 97 | else { |
|---|
| 98 | transitions[i] ++; |
|---|
| 99 | if (bases) bases[i] = l|r; |
|---|
| 100 | } |
|---|
| 101 | l = lts[i]; |
|---|
| 102 | r = rts[i]; |
|---|
| 103 | if (l & r) { |
|---|
| 104 | if (tbases) tbases[i] = l&r; |
|---|
| 105 | } |
|---|
| 106 | else { |
|---|
| 107 | transversions[i] ++; |
|---|
| 108 | if (tbases) tbases[i] = l|r; |
|---|
| 109 | } |
|---|
| 110 | } |
|---|
| 111 | } |
|---|
| 112 | |
|---|
| 113 | free(lts); |
|---|
| 114 | free(rts); |
|---|
| 115 | |
|---|
| 116 | free(ls); |
|---|
| 117 | free(rs); |
|---|
| 118 | } |
|---|
| 119 | progress->inc_and_check_user_abort(error); |
|---|
| 120 | return error; |
|---|
| 121 | } |
|---|
| 122 | |
|---|
| 123 | |
|---|
| 124 | // Calculate the positional variability: control procedure |
|---|
| 125 | GB_ERROR AP_pos_var::retrieve(TreeNode *tree) { |
|---|
| 126 | ap_assert(!treeNodes); // calling retrieve() multiple times is untested |
|---|
| 127 | |
|---|
| 128 | GB_ERROR error = NULp; |
|---|
| 129 | if (!tree) { |
|---|
| 130 | error = "No tree specified for PVP calculation"; |
|---|
| 131 | } |
|---|
| 132 | else { |
|---|
| 133 | treeLeafs = GBT_count_leafs(tree); |
|---|
| 134 | treeNodes = leafs_2_nodes(treeLeafs, ROOTED); // used for progress |
|---|
| 135 | |
|---|
| 136 | ap_assert(treeNodes>0); |
|---|
| 137 | |
|---|
| 138 | if (is_nuc) { |
|---|
| 139 | char_2_freq[(unsigned char)'a'] = 'A'; |
|---|
| 140 | char_2_freq[(unsigned char)'A'] = 'A'; |
|---|
| 141 | char_2_freq[(unsigned char)'c'] = 'C'; |
|---|
| 142 | char_2_freq[(unsigned char)'C'] = 'C'; |
|---|
| 143 | char_2_freq[(unsigned char)'g'] = 'G'; |
|---|
| 144 | char_2_freq[(unsigned char)'G'] = 'G'; |
|---|
| 145 | char_2_freq[(unsigned char)'t'] = 'U'; |
|---|
| 146 | char_2_freq[(unsigned char)'T'] = 'U'; |
|---|
| 147 | char_2_freq[(unsigned char)'u'] = 'U'; |
|---|
| 148 | char_2_freq[(unsigned char)'U'] = 'U'; |
|---|
| 149 | |
|---|
| 150 | unsigned char *char_2_bitstring = (unsigned char *)AP_create_dna_to_ap_bases(); |
|---|
| 151 | |
|---|
| 152 | for (int i=0; i<256; i++) { |
|---|
| 153 | int j; |
|---|
| 154 | if (i=='-') j = '.'; else j = i; |
|---|
| 155 | long base = char_2_transition[i] = char_2_bitstring[j]; |
|---|
| 156 | char_2_transversion[i] = 0; |
|---|
| 157 | if (base & (AP_A | AP_G)) char_2_transversion[i] = 1; |
|---|
| 158 | if (base & (AP_C | AP_T)) char_2_transversion[i] |= 2; |
|---|
| 159 | } |
|---|
| 160 | delete [] char_2_bitstring; |
|---|
| 161 | } |
|---|
| 162 | else { |
|---|
| 163 | AWT_translator *translator = AWT_get_user_translator(gb_main); |
|---|
| 164 | |
|---|
| 165 | long char_2_bitstring[256]; |
|---|
| 166 | { |
|---|
| 167 | for (int i=0; i<256; ++i) { |
|---|
| 168 | char_2_bitstring[i] = 0; |
|---|
| 169 | } |
|---|
| 170 | int aa_max = translator->MaxAA(); |
|---|
| 171 | for (int i = 0; i<=aa_max; ++i) { |
|---|
| 172 | long bs = translator->index2bitset(i); |
|---|
| 173 | unsigned char spro = translator->index2spro(i); |
|---|
| 174 | |
|---|
| 175 | char_2_bitstring[spro] = bs; |
|---|
| 176 | } |
|---|
| 177 | } |
|---|
| 178 | |
|---|
| 179 | for (int i=0; i<256; ++i) { |
|---|
| 180 | char_2_transversion[i] = 0; |
|---|
| 181 | long base = char_2_transition[i] = char_2_bitstring[i]; |
|---|
| 182 | if (base) char_2_freq[i] = toupper(i); |
|---|
| 183 | } |
|---|
| 184 | } |
|---|
| 185 | |
|---|
| 186 | progress = new arb_progress(treeNodes); |
|---|
| 187 | |
|---|
| 188 | for (int i=0; i<256; i++) { |
|---|
| 189 | int j = char_2_freq[i]; |
|---|
| 190 | if (j && !frequencies[j]) ARB_calloc(frequencies[j], ali_len); |
|---|
| 191 | } |
|---|
| 192 | |
|---|
| 193 | ARB_calloc(transitions, ali_len); |
|---|
| 194 | ARB_calloc(transversions, ali_len); |
|---|
| 195 | |
|---|
| 196 | error = this->parsimony(tree); |
|---|
| 197 | } |
|---|
| 198 | |
|---|
| 199 | return error; |
|---|
| 200 | } |
|---|
| 201 | |
|---|
| 202 | GB_ERROR AP_pos_var::delete_aliEntry_from_SAI(const char *sai_name) { |
|---|
| 203 | // deletes existing alignment sub-container from SAI 'sai_name' |
|---|
| 204 | |
|---|
| 205 | GBDATA *gb_extended = GBT_find_SAI(gb_main, sai_name); |
|---|
| 206 | if (gb_extended) { |
|---|
| 207 | GBDATA *gb_ali = GB_search(gb_extended, ali_name, GB_FIND); |
|---|
| 208 | if (gb_ali) { |
|---|
| 209 | return GB_delete(gb_ali); |
|---|
| 210 | } |
|---|
| 211 | } |
|---|
| 212 | return NULp; // sai/ali did not exist |
|---|
| 213 | } |
|---|
| 214 | |
|---|
| 215 | GB_ERROR AP_pos_var::save_aliEntry_to_SAI(const char *sai_name) { |
|---|
| 216 | // save whole SAI |
|---|
| 217 | // or |
|---|
| 218 | // add alignment sub-container to existing SAI |
|---|
| 219 | |
|---|
| 220 | GB_ERROR error = NULp; |
|---|
| 221 | GBDATA *gb_extended = GBT_find_or_create_SAI(gb_main, sai_name); |
|---|
| 222 | |
|---|
| 223 | if (!gb_extended) error = GB_await_error(); |
|---|
| 224 | else { |
|---|
| 225 | GBDATA *gb_ali = GB_search(gb_extended, ali_name, GB_DB); |
|---|
| 226 | if (!gb_ali) error = GB_await_error(); |
|---|
| 227 | else { |
|---|
| 228 | const char *description = |
|---|
| 229 | GBS_global_string("PVP: Positional Variability by Parsimony: tree '%s' ntaxa %li", |
|---|
| 230 | tree_name, treeLeafs); |
|---|
| 231 | |
|---|
| 232 | error = GBT_write_string(gb_ali, "_TYPE", description); |
|---|
| 233 | } |
|---|
| 234 | |
|---|
| 235 | if (!error) { |
|---|
| 236 | char *data = ARB_calloc<char>(ali_len+1); |
|---|
| 237 | int *sum = ARB_calloc<int>(ali_len); |
|---|
| 238 | |
|---|
| 239 | for (int j=0; j<256 && !error; j++) { // get sum of frequencies |
|---|
| 240 | if (frequencies[j]) { |
|---|
| 241 | for (int i=0; i<ali_len; i++) { // LOOP_VECTORIZED |
|---|
| 242 | sum[i] += frequencies[j][i]; |
|---|
| 243 | } |
|---|
| 244 | |
|---|
| 245 | if (j >= 'A' && j <= 'Z') { |
|---|
| 246 | GBDATA *gb_freq = GB_search(gb_ali, GBS_global_string("FREQUENCIES/N%c", j), GB_INTS); |
|---|
| 247 | if (!gb_freq) error = GB_await_error(); |
|---|
| 248 | else error = GB_write_ints(gb_freq, frequencies[j], ali_len); |
|---|
| 249 | } |
|---|
| 250 | } |
|---|
| 251 | } |
|---|
| 252 | |
|---|
| 253 | if (!error) { |
|---|
| 254 | GBDATA *gb_transi = GB_search(gb_ali, "FREQUENCIES/TRANSITIONS", GB_INTS); |
|---|
| 255 | if (!gb_transi) error = GB_await_error(); |
|---|
| 256 | else error = GB_write_ints(gb_transi, transitions, ali_len); |
|---|
| 257 | } |
|---|
| 258 | if (!error) { |
|---|
| 259 | GBDATA *gb_transv = GB_search(gb_ali, "FREQUENCIES/TRANSVERSIONS", GB_INTS); |
|---|
| 260 | if (!gb_transv) error = GB_await_error(); |
|---|
| 261 | else error = GB_write_ints(gb_transv, transversions, ali_len); |
|---|
| 262 | } |
|---|
| 263 | |
|---|
| 264 | if (!error) { |
|---|
| 265 | int max_categ = 0; |
|---|
| 266 | double logbase = sqrt(2.0); |
|---|
| 267 | double lnlogbase = log(logbase); |
|---|
| 268 | double b = .75; |
|---|
| 269 | double max_rate = 1.0; |
|---|
| 270 | |
|---|
| 271 | int tenPercentOfLeafs = treeLeafs*0.1; |
|---|
| 272 | if ((10*tenPercentOfLeafs) < treeLeafs) ++tenPercentOfLeafs; |
|---|
| 273 | |
|---|
| 274 | for (int i=0; i<ali_len; i++) { |
|---|
| 275 | if (sum[i] < tenPercentOfLeafs) { // less than 10% valid characters; as documented in ../../HELP_SOURCE/source/pos_var_pars.hlp@valid |
|---|
| 276 | data[i] = '.'; |
|---|
| 277 | continue; |
|---|
| 278 | } |
|---|
| 279 | if (transitions[i] == 0) { |
|---|
| 280 | data[i] = '-'; |
|---|
| 281 | continue; |
|---|
| 282 | } |
|---|
| 283 | double rate = transitions[i] / (double)sum[i]; |
|---|
| 284 | if (rate >= b * .95) { |
|---|
| 285 | rate = b * .95; |
|---|
| 286 | } |
|---|
| 287 | rate = -b * log(1-rate/b); |
|---|
| 288 | if (rate > max_rate) rate = max_rate; |
|---|
| 289 | rate /= max_rate; // scaled 1.0 == fast rate |
|---|
| 290 | // ~0.0 slow rate |
|---|
| 291 | double dcat = -log(rate)/lnlogbase; |
|---|
| 292 | int icat = (int)dcat; |
|---|
| 293 | if (icat > 35) icat = 35; |
|---|
| 294 | if (icat >= max_categ) max_categ = icat + 1; |
|---|
| 295 | data[i] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"[icat]; |
|---|
| 296 | } |
|---|
| 297 | |
|---|
| 298 | error = GBT_write_string(gb_ali, "data", data); |
|---|
| 299 | |
|---|
| 300 | if (!error) { |
|---|
| 301 | // Generate Categories |
|---|
| 302 | GBS_strstruct out(1000); |
|---|
| 303 | for (int i = 0; i<max_categ; i++) { |
|---|
| 304 | out.putfloat(pow(1.0/logbase, i)); |
|---|
| 305 | out.put(' '); |
|---|
| 306 | } |
|---|
| 307 | |
|---|
| 308 | error = GBT_write_string(gb_ali, "_CATEGORIES", out.get_data()); |
|---|
| 309 | } |
|---|
| 310 | } |
|---|
| 311 | |
|---|
| 312 | free(sum); |
|---|
| 313 | free(data); |
|---|
| 314 | } |
|---|
| 315 | } |
|---|
| 316 | |
|---|
| 317 | return error; |
|---|
| 318 | } |
|---|
| 319 | |
|---|