| 1 | #include <stdio.h> |
|---|
| 2 | #include <stdlib.h> |
|---|
| 3 | #include <memory.h> |
|---|
| 4 | #include <malloc.h> |
|---|
| 5 | #include <string.h> |
|---|
| 6 | |
|---|
| 7 | #include <math.h> |
|---|
| 8 | |
|---|
| 9 | #include <arbdb.h> |
|---|
| 10 | #include <arbdbt.h> |
|---|
| 11 | |
|---|
| 12 | #include <aw_root.hxx> |
|---|
| 13 | #include <aw_device.hxx> |
|---|
| 14 | #include <aw_window.hxx> |
|---|
| 15 | #include <aw_preset.hxx> |
|---|
| 16 | #include <awt.hxx> |
|---|
| 17 | |
|---|
| 18 | #include <awt_tree.hxx> |
|---|
| 19 | #include "dist.hxx" |
|---|
| 20 | #include <BI_helix.hxx> |
|---|
| 21 | #include <awt_csp.hxx> |
|---|
| 22 | |
|---|
| 23 | #include "ph_matr.hxx" |
|---|
| 24 | #include "ph_protdist.hxx" |
|---|
| 25 | |
|---|
| 26 | #define epsilon 0.000001/* a small number */ |
|---|
| 27 | |
|---|
| 28 | double ph_protdist::pameigs[20] = { |
|---|
| 29 | -0.022091252, -0.019297602, 0.000004760, -0.017477817, |
|---|
| 30 | -0.016575549, -0.015504543, -0.002112213, -0.002685727, |
|---|
| 31 | -0.002976402, -0.013440755, -0.012926992, -0.004293227, |
|---|
| 32 | -0.005356688, -0.011064786, -0.010480731, -0.008760449, |
|---|
| 33 | -0.007142318, -0.007381851, -0.007806557, -0.008127024 |
|---|
| 34 | }; |
|---|
| 35 | |
|---|
| 36 | double ph_protdist::pamprobs[20][20] = |
|---|
| 37 | { |
|---|
| 38 | { -0.01522976, -0.00746819, -0.13934468, 0.11755315, -0.00212101, |
|---|
| 39 | 0.01558456, -0.07408235, -0.00322387, 0.01375826, 0.00448826, |
|---|
| 40 | 0.00154174, 0.02013313, -0.00159183, -0.00069275, -0.00399898, |
|---|
| 41 | 0.08414055, -0.01188178, -0.00029870, 0.00220371, 0.00042546}, |
|---|
| 42 | { -0.07765582, -0.00712634, -0.03683209, -0.08065755, -0.00462872, |
|---|
| 43 | -0.03791039, 0.10642147, -0.00912185, 0.01436308, -0.00133243, |
|---|
| 44 | 0.00166346, 0.00624657, -0.00003363, -0.00128729, -0.00690319, |
|---|
| 45 | 0.17442028, -0.05373336, -0.00078751, -0.00038151, 0.01382718}, |
|---|
| 46 | { -0.08810973, -0.04081786, -0.04066004, -0.04736004, -0.03275406, |
|---|
| 47 | -0.03761164, -0.05047487, -0.09086213, -0.03269598, -0.03558015, |
|---|
| 48 | -0.08407966, -0.07970977, -0.01504743, -0.04011920, -0.05182232, |
|---|
| 49 | -0.07026991, -0.05846931, -0.01016998, -0.03047472, -0.06280511}, |
|---|
| 50 | { 0.02513756, -0.00578333, 0.09865453, 0.01322314, -0.00310665, |
|---|
| 51 | 0.05880899, -0.09252443, -0.02986539, -0.03127460, 0.01007539, |
|---|
| 52 | -0.00360119, -0.01995024, 0.00094940, -0.00145868, -0.01388816, |
|---|
| 53 | 0.11358341, -0.12127513, -0.00054696, -0.00055627, 0.00417284}, |
|---|
| 54 | { 0.16517316, -0.00254742, -0.03318745, -0.01984173, 0.00031890, |
|---|
| 55 | -0.02817810, 0.02661678, -0.01761215, 0.01665112, 0.10513343, |
|---|
| 56 | -0.00545026, 0.01827470, -0.00207616, -0.00763758, -0.01322808, |
|---|
| 57 | -0.02202576, -0.07434204, 0.00020593, 0.00119979, -0.10827873}, |
|---|
| 58 | { 0.16088826, 0.00056313, -0.02579303, -0.00319655, 0.00037228, |
|---|
| 59 | -0.03193150, 0.01655305, -0.03028640, 0.01367746, -0.11248153, |
|---|
| 60 | 0.00778371, 0.02675579, 0.00243718, 0.00895470, -0.01729803, |
|---|
| 61 | -0.02686964, -0.08262584, 0.00011794, -0.00225134, 0.09415650}, |
|---|
| 62 | { -0.01739295, 0.00572017, -0.00712592, -0.01100922, -0.00870113, |
|---|
| 63 | -0.00663461, -0.01153857, -0.02248432, -0.00382264, -0.00358612, |
|---|
| 64 | -0.00139345, -0.00971460, -0.00133312, 0.01927783, -0.01053838, |
|---|
| 65 | -0.00911362, -0.01010908, 0.09417598, 0.01763850, -0.00955454}, |
|---|
| 66 | { 0.01728888, 0.01344211, 0.01200836, 0.01857259, -0.17088517, |
|---|
| 67 | 0.01457592, 0.01997839, 0.02844884, 0.00839403, 0.00196862, |
|---|
| 68 | 0.01391984, 0.03270465, 0.00347173, -0.01940984, 0.01233979, |
|---|
| 69 | 0.00542887, 0.01008836, 0.00126491, -0.02863042, 0.00449764}, |
|---|
| 70 | { -0.02881366, -0.02184155, -0.01566086, -0.02593764, -0.04050907, |
|---|
| 71 | -0.01539603, -0.02576729, -0.05089606, -0.00597430, 0.02181643, |
|---|
| 72 | 0.09835597, -0.04040940, 0.00873512, 0.12139434, -0.02427882, |
|---|
| 73 | -0.02945238, -0.01566867, -0.01606503, 0.09475319, 0.02238670}, |
|---|
| 74 | { 0.04080274, -0.02869626, -0.05191093, -0.08435843, 0.00021141, |
|---|
| 75 | 0.13043842, 0.00871530, 0.00496058, -0.02797641, -0.00636933, |
|---|
| 76 | 0.02243277, 0.03640362, -0.05735517, 0.00196918, -0.02218934, |
|---|
| 77 | -0.00608972, 0.02872922, 0.00047619, 0.00151285, 0.00883489}, |
|---|
| 78 | { -0.02623824, 0.00331152, 0.03640692, 0.04260231, -0.00038223, |
|---|
| 79 | -0.07480340, -0.01022492, -0.00426473, 0.01448116, 0.01456847, |
|---|
| 80 | 0.05786680, 0.03368691, -0.10126924, -0.00147454, 0.01275395, |
|---|
| 81 | 0.00017574, -0.01585206, -0.00015767, -0.00231848, 0.02310137}, |
|---|
| 82 | { -0.00846258, -0.01508106, -0.01967505, -0.02772004, 0.01248253, |
|---|
| 83 | -0.01331243, -0.02569382, -0.04461524, -0.02207075, 0.04663443, |
|---|
| 84 | 0.19347923, -0.02745691, 0.02288515, -0.04883849, -0.01084597, |
|---|
| 85 | -0.01947187, -0.00081675, 0.00516540, -0.07815919, 0.08035585}, |
|---|
| 86 | { -0.06553111, 0.09756831, 0.00524326, -0.00885098, 0.00756653, |
|---|
| 87 | 0.02783099, -0.00427042, -0.16680359, 0.03951331, -0.00490540, |
|---|
| 88 | 0.01719610, 0.15018204, 0.00882722, -0.00423197, -0.01919217, |
|---|
| 89 | -0.02963619, -0.01831342, -0.00524338, 0.00011379, -0.02566864}, |
|---|
| 90 | { -0.07494341, -0.11348850, 0.00241343, -0.00803016, 0.00492438, |
|---|
| 91 | 0.00711909, -0.00829147, 0.05793337, 0.02734209, 0.02059759, |
|---|
| 92 | -0.02770280, 0.14128338, 0.01532479, 0.00364307, 0.05968116, |
|---|
| 93 | -0.06497960, -0.08113941, 0.00319445, -0.00104222, 0.03553497}, |
|---|
| 94 | { 0.05948223, -0.08959930, 0.03269977, -0.03272374, -0.00365667, |
|---|
| 95 | -0.03423294, -0.06418925, -0.05902138, 0.05746317, -0.02580596, |
|---|
| 96 | 0.01259572, 0.05848832, 0.00672666, 0.00233355, -0.05145149, |
|---|
| 97 | 0.07348503, 0.11427955, 0.00142592, -0.01030651, -0.04862799}, |
|---|
| 98 | { -0.01606880, 0.05200845, -0.01212967, -0.06824429, -0.00234304, |
|---|
| 99 | 0.01094203, -0.07375538, 0.08808629, 0.12394822, 0.02231351, |
|---|
| 100 | -0.03608265, -0.06978045, -0.00618360, 0.00274747, -0.01921876, |
|---|
| 101 | -0.01541969, -0.02223856, -0.00107603, -0.01251777, 0.05412534}, |
|---|
| 102 | { 0.01688843, 0.05784728, -0.02256966, -0.07072251, -0.00422551, |
|---|
| 103 | -0.06261233, -0.08502830, 0.08925346, -0.08529597, 0.01519343, |
|---|
| 104 | -0.05008258, 0.10931873, 0.00521033, 0.02593305, -0.00717855, |
|---|
| 105 | 0.02291527, 0.02527388, -0.00266188, -0.00871160, 0.02708135}, |
|---|
| 106 | { -0.04233344, 0.00076379, 0.01571257, 0.04003092, 0.00901468, |
|---|
| 107 | 0.00670577, 0.03459487, 0.12420216, -0.00067366, -0.01515094, |
|---|
| 108 | 0.05306642, 0.04338407, 0.00511287, 0.01036639, -0.17867462, |
|---|
| 109 | -0.02289440, -0.03213205, 0.00017924, -0.01187362, -0.03933874}, |
|---|
| 110 | { 0.01284817, -0.01685622, 0.00724363, 0.01687952, -0.00882070, |
|---|
| 111 | -0.00555957, 0.01676246, -0.05560456, -0.00966893, 0.06197684, |
|---|
| 112 | -0.09058758, 0.00880607, 0.00108629, -0.08308956, -0.08056832, |
|---|
| 113 | -0.00413297, 0.02973107, 0.00092948, 0.07010111, 0.13007418}, |
|---|
| 114 | { 0.00700223, -0.01347574, 0.00691332, 0.03122905, 0.00310308, |
|---|
| 115 | 0.00946862, 0.03455040, -0.06712536, -0.00304506, 0.04267941, |
|---|
| 116 | -0.10422292, -0.01127831, -0.00549798, 0.11680505, -0.03352701, |
|---|
| 117 | -0.00084536, 0.01631369, 0.00095063, -0.09570217, 0.06480321} |
|---|
| 118 | }; |
|---|
| 119 | |
|---|
| 120 | void ph_protdist::cats(ph_cattype wcat) |
|---|
| 121 | { |
|---|
| 122 | /* define categories of amino acids */ |
|---|
| 123 | aas b; |
|---|
| 124 | |
|---|
| 125 | /* fundamental subgroups */ |
|---|
| 126 | cat[(long) cys - (long) ala] = 1; |
|---|
| 127 | cat[(long) met - (long) ala] = 2; |
|---|
| 128 | cat[(long) val - (long) ala] = 3; |
|---|
| 129 | cat[(long) leu - (long) ala] = 3; |
|---|
| 130 | cat[(long) ileu - (long) ala] = 3; |
|---|
| 131 | cat[(long) gly - (long) ala] = 4; |
|---|
| 132 | cat[0] = 4; |
|---|
| 133 | cat[(long) ser - (long) ala] = 4; |
|---|
| 134 | cat[(long) thr - (long) ala] = 4; |
|---|
| 135 | cat[(long) pro - (long) ala] = 5; |
|---|
| 136 | cat[(long) phe - (long) ala] = 6; |
|---|
| 137 | cat[(long) tyr - (long) ala] = 6; |
|---|
| 138 | cat[(long) trp - (long) ala] = 6; |
|---|
| 139 | cat[(long) glu - (long) ala] = 7; |
|---|
| 140 | cat[(long) gln - (long) ala] = 7; |
|---|
| 141 | cat[(long) asp - (long) ala] = 7; |
|---|
| 142 | cat[(long) asn - (long) ala] = 7; |
|---|
| 143 | cat[(long) lys - (long) ala] = 8; |
|---|
| 144 | cat[(long) arg - (long) ala] = 8; |
|---|
| 145 | cat[(long) his - (long) ala] = 8; |
|---|
| 146 | if (wcat == george) { |
|---|
| 147 | /* |
|---|
| 148 | * George, Hunt and Barker: sulfhydryl, small hydrophobic, |
|---|
| 149 | * small hydrophilic, aromatic, acid/acid-amide/hydrophilic, |
|---|
| 150 | * basic |
|---|
| 151 | */ |
|---|
| 152 | for (b = ala; (long) b <= (long) val; b = (aas) ((long) b + 1)) { |
|---|
| 153 | if (cat[(long) b - (long) ala] == 3) |
|---|
| 154 | cat[(long) b - (long) ala] = 2; |
|---|
| 155 | if (cat[(long) b - (long) ala] == 5) |
|---|
| 156 | cat[(long) b - (long) ala] = 4; |
|---|
| 157 | } |
|---|
| 158 | } |
|---|
| 159 | if (wcat == chemical) { |
|---|
| 160 | /* |
|---|
| 161 | * Conn and Stumpf: monoamino, aliphatic, heterocyclic, |
|---|
| 162 | * aromatic, dicarboxylic, basic |
|---|
| 163 | */ |
|---|
| 164 | for (b = ala; (long) b <= (long) val; b = (aas) ((long) b + 1)) { |
|---|
| 165 | if (cat[(long) b - (long) ala] == 2) |
|---|
| 166 | cat[(long) b - (long) ala] = 1; |
|---|
| 167 | if (cat[(long) b - (long) ala] == 4) |
|---|
| 168 | cat[(long) b - (long) ala] = 3; |
|---|
| 169 | } |
|---|
| 170 | } |
|---|
| 171 | /* Ben Hall's personal opinion */ |
|---|
| 172 | if (wcat != hall) |
|---|
| 173 | return; |
|---|
| 174 | for (b = ala; (long) b <= (long) val; b = (aas) ((long) b + 1)) { |
|---|
| 175 | if (cat[(long) b - (long) ala] == 3) |
|---|
| 176 | cat[(long) b - (long) ala] = 2; |
|---|
| 177 | } |
|---|
| 178 | } /* cats */ |
|---|
| 179 | |
|---|
| 180 | |
|---|
| 181 | void ph_protdist::maketrans() |
|---|
| 182 | { |
|---|
| 183 | /* |
|---|
| 184 | * Make up transition probability matrix from code and category |
|---|
| 185 | * tables |
|---|
| 186 | */ |
|---|
| 187 | long i, j, k, m, n, s; |
|---|
| 188 | double x, sum = 0; |
|---|
| 189 | long sub[3], newsub[3]; |
|---|
| 190 | double f[4], g[4]; |
|---|
| 191 | aas b1, b2; |
|---|
| 192 | double TEMP, TEMP1, TEMP2, TEMP3; |
|---|
| 193 | |
|---|
| 194 | for (i = 0; i <= 19; i++) { |
|---|
| 195 | pi[i] = 0.0; |
|---|
| 196 | for (j = 0; j <= 19; j++) |
|---|
| 197 | prob[i][j] = 0.0; |
|---|
| 198 | } |
|---|
| 199 | f[0] = freqt; |
|---|
| 200 | f[1] = freqc; |
|---|
| 201 | f[2] = freqa; |
|---|
| 202 | f[3] = freqg; |
|---|
| 203 | g[0] = freqc + freqt; |
|---|
| 204 | g[1] = freqc + freqt; |
|---|
| 205 | g[2] = freqa + freqg; |
|---|
| 206 | g[3] = freqa + freqg; |
|---|
| 207 | TEMP = f[0]; |
|---|
| 208 | TEMP1 = f[1]; |
|---|
| 209 | TEMP2 = f[2]; |
|---|
| 210 | TEMP3 = f[3]; |
|---|
| 211 | fracchange = xi * (2 * f[0] * f[1] / g[0] + 2 * f[2] * f[3] / g[2]) + |
|---|
| 212 | xv * (1 - TEMP * TEMP - TEMP1 * TEMP1 - TEMP2 * TEMP2 - TEMP3 * TEMP3); |
|---|
| 213 | for (i = 0; i <= 3; i++) { |
|---|
| 214 | for (j = 0; j <= 3; j++) { |
|---|
| 215 | for (k = 0; k <= 3; k++) { |
|---|
| 216 | if (trans[i][j][k] != stop) |
|---|
| 217 | sum += f[i] * f[j] * f[k]; |
|---|
| 218 | } |
|---|
| 219 | } |
|---|
| 220 | } |
|---|
| 221 | for (i = 0; i <= 3; i++) { |
|---|
| 222 | sub[0] = i + 1; |
|---|
| 223 | for (j = 0; j <= 3; j++) { |
|---|
| 224 | sub[1] = j + 1; |
|---|
| 225 | for (k = 0; k <= 3; k++) { |
|---|
| 226 | sub[2] = k + 1; |
|---|
| 227 | b1 = trans[i][j][k]; |
|---|
| 228 | for (m = 0; m <= 2; m++) { |
|---|
| 229 | s = sub[m]; |
|---|
| 230 | for (n = 1; n <= 4; n++) { |
|---|
| 231 | memcpy(newsub, sub, sizeof(long) * 3L); |
|---|
| 232 | newsub[m] = n; |
|---|
| 233 | x = f[i] * f[j] * f[k] / (3.0 * sum); |
|---|
| 234 | if (((s == 1 || s == 2) && (n == 3 || n == 4)) || |
|---|
| 235 | ((n == 1 || n == 2) && (s == 3 || s == 4))) |
|---|
| 236 | x *= xv * f[n - 1]; |
|---|
| 237 | else |
|---|
| 238 | x *= xi * f[n - 1] / g[n - 1] + xv * f[n - 1]; |
|---|
| 239 | b2 = trans[newsub[0] - 1][newsub[1] - 1][newsub[2] - 1]; |
|---|
| 240 | if (b1 != stop) { |
|---|
| 241 | pi[b1] += x; |
|---|
| 242 | if (b2 != stop) { |
|---|
| 243 | if (cat[b1] != cat[b2]) { |
|---|
| 244 | prob[b1][b2] += x * ease; |
|---|
| 245 | prob[b1][b1] += x * (1.0 - ease); |
|---|
| 246 | } else |
|---|
| 247 | prob[b1][b2] += x; |
|---|
| 248 | } else |
|---|
| 249 | prob[b1][b1] += x; |
|---|
| 250 | } |
|---|
| 251 | } |
|---|
| 252 | } |
|---|
| 253 | } |
|---|
| 254 | } |
|---|
| 255 | } |
|---|
| 256 | for (i = 0; i <= 19; i++) |
|---|
| 257 | prob[i][i] -= pi[i]; |
|---|
| 258 | for (i = 0; i <= 19; i++) { |
|---|
| 259 | for (j = 0; j <= 19; j++) |
|---|
| 260 | prob[i][j] /= sqrt(pi[i] * pi[j]); |
|---|
| 261 | } |
|---|
| 262 | /* computes pi^(1/2)*B*pi^(-1/2) */ |
|---|
| 263 | } /* maketrans */ |
|---|
| 264 | |
|---|
| 265 | void ph_protdist::code() |
|---|
| 266 | { |
|---|
| 267 | /* make up table of the code 0 = u, 1 = c, 2 = a, 3 = g */ |
|---|
| 268 | |
|---|
| 269 | trans[0][0][0] = phe; |
|---|
| 270 | trans[0][0][1] = phe; |
|---|
| 271 | trans[0][0][2] = leu; |
|---|
| 272 | trans[0][0][3] = leu; |
|---|
| 273 | trans[0][1][0] = ser; |
|---|
| 274 | trans[0][1][1] = ser; |
|---|
| 275 | trans[0][1][2] = ser; |
|---|
| 276 | trans[0][1][3] = ser; |
|---|
| 277 | trans[0][2][0] = tyr; |
|---|
| 278 | trans[0][2][1] = tyr; |
|---|
| 279 | trans[0][2][2] = stop; |
|---|
| 280 | trans[0][2][3] = stop; |
|---|
| 281 | trans[0][3][0] = cys; |
|---|
| 282 | trans[0][3][1] = cys; |
|---|
| 283 | trans[0][3][2] = stop; |
|---|
| 284 | trans[0][3][3] = trp; |
|---|
| 285 | trans[1][0][0] = leu; |
|---|
| 286 | trans[1][0][1] = leu; |
|---|
| 287 | trans[1][0][2] = leu; |
|---|
| 288 | trans[1][0][3] = leu; |
|---|
| 289 | trans[1][1][0] = pro; |
|---|
| 290 | trans[1][1][1] = pro; |
|---|
| 291 | trans[1][1][2] = pro; |
|---|
| 292 | trans[1][1][3] = pro; |
|---|
| 293 | trans[1][2][0] = his; |
|---|
| 294 | trans[1][2][1] = his; |
|---|
| 295 | trans[1][2][2] = gln; |
|---|
| 296 | trans[1][2][3] = gln; |
|---|
| 297 | trans[1][3][0] = arg; |
|---|
| 298 | trans[1][3][1] = arg; |
|---|
| 299 | trans[1][3][2] = arg; |
|---|
| 300 | trans[1][3][3] = arg; |
|---|
| 301 | trans[2][0][0] = ileu; |
|---|
| 302 | trans[2][0][1] = ileu; |
|---|
| 303 | trans[2][0][2] = ileu; |
|---|
| 304 | trans[2][0][3] = met; |
|---|
| 305 | trans[2][1][0] = thr; |
|---|
| 306 | trans[2][1][1] = thr; |
|---|
| 307 | trans[2][1][2] = thr; |
|---|
| 308 | trans[2][1][3] = thr; |
|---|
| 309 | trans[2][2][0] = asn; |
|---|
| 310 | trans[2][2][1] = asn; |
|---|
| 311 | trans[2][2][2] = lys; |
|---|
| 312 | trans[2][2][3] = lys; |
|---|
| 313 | trans[2][3][0] = ser; |
|---|
| 314 | trans[2][3][1] = ser; |
|---|
| 315 | trans[2][3][2] = arg; |
|---|
| 316 | trans[2][3][3] = arg; |
|---|
| 317 | trans[3][0][0] = val; |
|---|
| 318 | trans[3][0][1] = val; |
|---|
| 319 | trans[3][0][2] = val; |
|---|
| 320 | trans[3][0][3] = val; |
|---|
| 321 | trans[3][1][0] = ala; |
|---|
| 322 | trans[3][1][1] = ala; |
|---|
| 323 | trans[3][1][2] = ala; |
|---|
| 324 | trans[3][1][3] = ala; |
|---|
| 325 | trans[3][2][0] = asp; |
|---|
| 326 | trans[3][2][1] = asp; |
|---|
| 327 | trans[3][2][2] = glu; |
|---|
| 328 | trans[3][2][3] = glu; |
|---|
| 329 | trans[3][3][0] = gly; |
|---|
| 330 | trans[3][3][1] = gly; |
|---|
| 331 | trans[3][3][2] = gly; |
|---|
| 332 | trans[3][3][3] = gly; |
|---|
| 333 | if (whichcode == mito) |
|---|
| 334 | trans[0][3][2] = trp; |
|---|
| 335 | if (whichcode == vertmito) { |
|---|
| 336 | trans[0][3][2] = trp; |
|---|
| 337 | trans[2][3][2] = stop; |
|---|
| 338 | trans[2][3][3] = stop; |
|---|
| 339 | trans[2][0][2] = met; |
|---|
| 340 | } |
|---|
| 341 | if (whichcode == flymito) { |
|---|
| 342 | trans[0][3][2] = trp; |
|---|
| 343 | trans[2][0][2] = met; |
|---|
| 344 | trans[2][3][2] = ser; |
|---|
| 345 | } |
|---|
| 346 | if (whichcode == yeastmito) { |
|---|
| 347 | trans[0][3][2] = trp; |
|---|
| 348 | trans[1][0][2] = thr; |
|---|
| 349 | trans[2][0][2] = met; |
|---|
| 350 | } |
|---|
| 351 | } /* code */ |
|---|
| 352 | |
|---|
| 353 | void ph_protdist::transition() |
|---|
| 354 | { |
|---|
| 355 | /* calculations related to transition-transversion ratio */ |
|---|
| 356 | double aa, bb, freqr, freqy, freqgr, freqty; |
|---|
| 357 | |
|---|
| 358 | freqr = freqa + freqg; |
|---|
| 359 | freqy = freqc + freqt; |
|---|
| 360 | freqgr = freqg / freqr; |
|---|
| 361 | freqty = freqt / freqy; |
|---|
| 362 | aa = ttratio * freqr * freqy - freqa * freqg - freqc * freqt; |
|---|
| 363 | bb = freqa * freqgr + freqc * freqty; |
|---|
| 364 | xi = aa / (aa + bb); |
|---|
| 365 | xv = 1.0 - xi; |
|---|
| 366 | if (xi <= 0.0 && xi >= -epsilon) |
|---|
| 367 | xi = 0.0; |
|---|
| 368 | if (xi < 0.0) { |
|---|
| 369 | printf("THIS TRANSITION-TRANSVERSION RATIO IS IMPOSSIBLE WITH"); |
|---|
| 370 | printf(" THESE BASE FREQUENCIES\n"); |
|---|
| 371 | exit(-1); |
|---|
| 372 | } |
|---|
| 373 | } /* transition */ |
|---|
| 374 | |
|---|
| 375 | void ph_protdist::givens(ph_aa_matrix a,long i,long j,long n,double ctheta,double stheta,GB_BOOL left) |
|---|
| 376 | { |
|---|
| 377 | /* Givens transform at i,j for 1..n with angle theta */ |
|---|
| 378 | long k; |
|---|
| 379 | double d; |
|---|
| 380 | |
|---|
| 381 | for (k = 0; k < n; k++) { |
|---|
| 382 | if (left) { |
|---|
| 383 | d = ctheta * a[i - 1][k] + stheta * a[j - 1][k]; |
|---|
| 384 | a[j - 1][k] = ctheta * a[j - 1][k] - stheta * a[i - 1][k]; |
|---|
| 385 | a[i - 1][k] = d; |
|---|
| 386 | } else { |
|---|
| 387 | d = ctheta * a[k][i - 1] + stheta * a[k][j - 1]; |
|---|
| 388 | a[k][j - 1] = ctheta * a[k][j - 1] - stheta * a[k][i - 1]; |
|---|
| 389 | a[k][i - 1] = d; |
|---|
| 390 | } |
|---|
| 391 | } |
|---|
| 392 | } /* givens */ |
|---|
| 393 | |
|---|
| 394 | void ph_protdist::coeffs(double x,double y,double *c,double *s,double accuracy) |
|---|
| 395 | { |
|---|
| 396 | /* compute cosine and sine of theta */ |
|---|
| 397 | double root; |
|---|
| 398 | |
|---|
| 399 | root = sqrt(x * x + y * y); |
|---|
| 400 | if (root < accuracy) { |
|---|
| 401 | *c = 1.0; |
|---|
| 402 | *s = 0.0; |
|---|
| 403 | } else { |
|---|
| 404 | *c = x / root; |
|---|
| 405 | *s = y / root; |
|---|
| 406 | } |
|---|
| 407 | } /* coeffs */ |
|---|
| 408 | |
|---|
| 409 | void ph_protdist::tridiag(ph_aa_matrix a,long n,double accuracy) |
|---|
| 410 | { |
|---|
| 411 | /* Givens tridiagonalization */ |
|---|
| 412 | long i, j; |
|---|
| 413 | double s, c; |
|---|
| 414 | |
|---|
| 415 | for (i = 2; i < n; i++) { |
|---|
| 416 | for (j = i + 1; j <= n; j++) { |
|---|
| 417 | coeffs(a[i - 2][i - 1], a[i - 2][j - 1], &c, &s, accuracy); |
|---|
| 418 | givens(a, i, j, n, c, s, GB_TRUE); |
|---|
| 419 | givens(a, i, j, n, c, s, GB_FALSE); |
|---|
| 420 | givens(eigvecs, i, j, n, c, s, GB_TRUE); |
|---|
| 421 | } |
|---|
| 422 | } |
|---|
| 423 | } /* tridiag */ |
|---|
| 424 | |
|---|
| 425 | void ph_protdist::shiftqr(ph_aa_matrix a, long n, double accuracy) |
|---|
| 426 | { |
|---|
| 427 | /* QR eigenvalue-finder */ |
|---|
| 428 | long i, j; |
|---|
| 429 | double approx, s, c, d, TEMP, TEMP1; |
|---|
| 430 | |
|---|
| 431 | for (i = n; i >= 2; i--) { |
|---|
| 432 | do { |
|---|
| 433 | TEMP = a[i - 2][i - 2] - a[i - 1][i - 1]; |
|---|
| 434 | TEMP1 = a[i - 1][i - 2]; |
|---|
| 435 | d = sqrt(TEMP * TEMP + TEMP1 * TEMP1); |
|---|
| 436 | approx = a[i - 2][i - 2] + a[i - 1][i - 1]; |
|---|
| 437 | if (a[i - 1][i - 1] < a[i - 2][i - 2]) |
|---|
| 438 | approx = (approx - d) / 2.0; |
|---|
| 439 | else |
|---|
| 440 | approx = (approx + d) / 2.0; |
|---|
| 441 | for (j = 0; j < i; j++) |
|---|
| 442 | a[j][j] -= approx; |
|---|
| 443 | for (j = 1; j < i; j++) { |
|---|
| 444 | coeffs(a[j - 1][j - 1], a[j][j - 1], &c, &s, accuracy); |
|---|
| 445 | givens(a, j, j + 1, i, c, s, GB_TRUE); |
|---|
| 446 | givens(a, j, j + 1, i, c, s, GB_FALSE); |
|---|
| 447 | givens(eigvecs, j, j + 1, n, c, s, GB_TRUE); |
|---|
| 448 | } |
|---|
| 449 | for (j = 0; j < i; j++) |
|---|
| 450 | a[j][j] += approx; |
|---|
| 451 | } while (fabs(a[i - 1][i - 2]) > accuracy); |
|---|
| 452 | } |
|---|
| 453 | } /* shiftqr */ |
|---|
| 454 | |
|---|
| 455 | |
|---|
| 456 | void ph_protdist::qreigen(ph_aa_matrix proba,long n) |
|---|
| 457 | { |
|---|
| 458 | /* QR eigenvector/eigenvalue method for symmetric matrix */ |
|---|
| 459 | double accuracy; |
|---|
| 460 | long i, j; |
|---|
| 461 | |
|---|
| 462 | accuracy = 1.0e-6; |
|---|
| 463 | for (i = 0; i < n; i++) { |
|---|
| 464 | for (j = 0; j < n; j++) |
|---|
| 465 | eigvecs[i][j] = 0.0; |
|---|
| 466 | eigvecs[i][i] = 1.0; |
|---|
| 467 | } |
|---|
| 468 | tridiag(proba, n, accuracy); |
|---|
| 469 | shiftqr(proba, n, accuracy); |
|---|
| 470 | for (i = 0; i < n; i++) |
|---|
| 471 | eig[i] = proba[i][i]; |
|---|
| 472 | for (i = 0; i <= 19; i++) { |
|---|
| 473 | for (j = 0; j <= 19; j++) |
|---|
| 474 | proba[i][j] = sqrt(pi[j]) * eigvecs[i][j]; |
|---|
| 475 | } |
|---|
| 476 | /* proba[i][j] is the value of U' times pi^(1/2) */ |
|---|
| 477 | } /* qreigen */ |
|---|
| 478 | |
|---|
| 479 | |
|---|
| 480 | void ph_protdist::pameigen() |
|---|
| 481 | { |
|---|
| 482 | /* eigenanalysis for PAM matrix, precomputed */ |
|---|
| 483 | memcpy(prob, pamprobs, sizeof(pamprobs)); |
|---|
| 484 | memcpy(eig, pameigs, sizeof(pameigs)); |
|---|
| 485 | fracchange = 0.01; |
|---|
| 486 | } /* pameigen */ |
|---|
| 487 | |
|---|
| 488 | void ph_protdist::build_exptteig(double tt){ |
|---|
| 489 | int m; |
|---|
| 490 | for (m = 0; m <= 19; m++) { |
|---|
| 491 | exptteig[m] = exp(tt * eig[m]); |
|---|
| 492 | } |
|---|
| 493 | } |
|---|
| 494 | |
|---|
| 495 | void ph_protdist::predict(double /*tt*/, long nb1,long nb2) |
|---|
| 496 | { |
|---|
| 497 | /* make contribution to prediction of this aa pair */ |
|---|
| 498 | long m; |
|---|
| 499 | double q; |
|---|
| 500 | double TEMP; |
|---|
| 501 | for (m = 0; m <= 19; m++) { |
|---|
| 502 | q = prob[m][nb1] * prob[m][nb2] * exptteig[m]; |
|---|
| 503 | p += q; |
|---|
| 504 | TEMP = eig[m]; |
|---|
| 505 | dp += TEMP * q; |
|---|
| 506 | d2p += TEMP * TEMP * q; |
|---|
| 507 | } |
|---|
| 508 | } /* predict */ |
|---|
| 509 | |
|---|
| 510 | void ph_protdist::build_predikt_table(int pos){ |
|---|
| 511 | int b1, b2; |
|---|
| 512 | double tt = pos_2_tt(pos); |
|---|
| 513 | build_exptteig(tt); |
|---|
| 514 | akt_slopes = slopes[pos] = (ph_paa_matrix *) calloc(sizeof(ph_paa_matrix), 1); |
|---|
| 515 | akt_curves = curves[pos] = (ph_paa_matrix *) calloc(sizeof(ph_paa_matrix), 1); |
|---|
| 516 | akt_infs = infs[pos] = (ph_bool_matrix *) calloc(sizeof(ph_bool_matrix), 1); |
|---|
| 517 | |
|---|
| 518 | for (b1 = ala; b1 < ph_max_paa; b1++) { |
|---|
| 519 | for (b2 = ala; b2 <= b1; b2++) { |
|---|
| 520 | if (b1 != stop && b1 != del && b1 != quest && b1 != unk && |
|---|
| 521 | b2 != stop && b2 != del && b2 != quest && b2 != unk) { |
|---|
| 522 | p = 0.0; |
|---|
| 523 | dp = 0.0; |
|---|
| 524 | d2p = 0.0; |
|---|
| 525 | if (b1 != asx && b1 != glx && b2 != asx && b2 != glx){ |
|---|
| 526 | predict(tt, b1, b2); |
|---|
| 527 | } else { |
|---|
| 528 | if (b1 == asx) { |
|---|
| 529 | if (b2 == asx) { |
|---|
| 530 | predict(tt, 2L, 2L); |
|---|
| 531 | predict(tt, 2L, 3L); |
|---|
| 532 | predict(tt, 3L, 2L); |
|---|
| 533 | predict(tt, 3L, 3L); |
|---|
| 534 | } else { |
|---|
| 535 | if (b2 == glx) { |
|---|
| 536 | predict(tt, 2L, 5L); |
|---|
| 537 | predict(tt, 2L, 6L); |
|---|
| 538 | predict(tt, 3L, 5L); |
|---|
| 539 | predict(tt, 3L, 6L); |
|---|
| 540 | } else { |
|---|
| 541 | predict(tt, 2L, b2); |
|---|
| 542 | predict(tt, 3L, b2); |
|---|
| 543 | } |
|---|
| 544 | } |
|---|
| 545 | } else { |
|---|
| 546 | if (b1 == glx) { |
|---|
| 547 | if (b2 == asx) { |
|---|
| 548 | predict(tt, 5L, 2L); |
|---|
| 549 | predict(tt, 5L, 3L); |
|---|
| 550 | predict(tt, 6L, 2L); |
|---|
| 551 | predict(tt, 6L, 3L); |
|---|
| 552 | } else { |
|---|
| 553 | if (b2 == glx) { |
|---|
| 554 | predict(tt, 5L, 5L); |
|---|
| 555 | predict(tt, 5L, 6L); |
|---|
| 556 | predict(tt, 6L, 5L); |
|---|
| 557 | predict(tt, 6L, 6L); |
|---|
| 558 | } else { |
|---|
| 559 | predict(tt, 5L, b2); |
|---|
| 560 | predict(tt, 6L, b2); |
|---|
| 561 | } |
|---|
| 562 | } |
|---|
| 563 | } else { |
|---|
| 564 | if (b2 == asx) { |
|---|
| 565 | predict(tt, b1, 2L); |
|---|
| 566 | predict(tt, b1, 3L); |
|---|
| 567 | predict(tt, b1, 2L); |
|---|
| 568 | predict(tt, b1, 3L); |
|---|
| 569 | } else if (b2 == glx) { |
|---|
| 570 | predict(tt, b1, 5L); |
|---|
| 571 | predict(tt, b1, 6L); |
|---|
| 572 | predict(tt, b1, 5L); |
|---|
| 573 | predict(tt, b1, 6L); |
|---|
| 574 | } |
|---|
| 575 | } |
|---|
| 576 | } |
|---|
| 577 | } |
|---|
| 578 | if (p > 0.0){ |
|---|
| 579 | akt_slopes[0][b1][b2] = dp / p; |
|---|
| 580 | akt_curves[0][b1][b2] = d2p / p - dp * dp / (p * p); |
|---|
| 581 | akt_infs[0][b1][b2] = 0; |
|---|
| 582 | akt_slopes[0][b2][b1] = akt_slopes[0][b1][b2]; |
|---|
| 583 | akt_curves[0][b2][b1] = akt_curves[0][b1][b2]; |
|---|
| 584 | akt_infs[0][b2][b1] = 0; |
|---|
| 585 | }else{ |
|---|
| 586 | akt_infs[0][b1][b2] = 1; |
|---|
| 587 | akt_infs[0][b2][b1] = 1; |
|---|
| 588 | } |
|---|
| 589 | }//if |
|---|
| 590 | }// b2 |
|---|
| 591 | |
|---|
| 592 | } //for b1 |
|---|
| 593 | } |
|---|
| 594 | |
|---|
| 595 | int ph_protdist::tt_2_pos(double tt) { |
|---|
| 596 | int pos = (int)(tt * fracchange * ph_resolution); |
|---|
| 597 | if (pos >= ph_resolution * ph_max_dist ) |
|---|
| 598 | pos = ph_resolution * ph_max_dist - 1; |
|---|
| 599 | if (pos < 0) |
|---|
| 600 | pos = 0; |
|---|
| 601 | return pos; |
|---|
| 602 | } |
|---|
| 603 | |
|---|
| 604 | double ph_protdist::pos_2_tt(int pos) { |
|---|
| 605 | double tt = pos / (fracchange * ph_resolution); |
|---|
| 606 | return tt+epsilon; |
|---|
| 607 | } |
|---|
| 608 | |
|---|
| 609 | void ph_protdist::build_akt_predikt(double tt) |
|---|
| 610 | { |
|---|
| 611 | /* take an aktual slope from the hash table, else calculate a new one */ |
|---|
| 612 | int pos = tt_2_pos(tt); |
|---|
| 613 | if (!slopes[pos]){ |
|---|
| 614 | build_predikt_table(pos); |
|---|
| 615 | } |
|---|
| 616 | akt_slopes = slopes[pos]; |
|---|
| 617 | akt_curves = curves[pos]; |
|---|
| 618 | akt_infs = infs[pos]; |
|---|
| 619 | return; |
|---|
| 620 | |
|---|
| 621 | } |
|---|
| 622 | |
|---|
| 623 | const char *ph_protdist::makedists() |
|---|
| 624 | { |
|---|
| 625 | /* compute the distances */ |
|---|
| 626 | long i, j, k, m, n, iterations; |
|---|
| 627 | double delta, slope, curv; |
|---|
| 628 | int b1=0, b2=0; |
|---|
| 629 | double tt=0; |
|---|
| 630 | int pos; |
|---|
| 631 | |
|---|
| 632 | for (i = 0; i < spp; i++) { |
|---|
| 633 | matrix->set(i,i,0.0); |
|---|
| 634 | { |
|---|
| 635 | double gauge = (double)i/(double)spp; |
|---|
| 636 | if (aw_status(gauge*gauge)) return "Aborted"; |
|---|
| 637 | } |
|---|
| 638 | { |
|---|
| 639 | /* move all unknown characters to del */ |
|---|
| 640 | ap_pro *seq1 = entries[i]->sequence_protein->sequence; |
|---|
| 641 | for (k = 0; k <chars ; k++) { |
|---|
| 642 | b1 = seq1[k]; |
|---|
| 643 | if (b1 <=val) continue; |
|---|
| 644 | if (b1 == asx || b1 == glx) continue; |
|---|
| 645 | seq1[k] = del; |
|---|
| 646 | } |
|---|
| 647 | } |
|---|
| 648 | |
|---|
| 649 | for (j = 0; j < i ; j++) { |
|---|
| 650 | if (whichcat > kimura ) { |
|---|
| 651 | if (whichcat == pam) |
|---|
| 652 | tt = 10.0; |
|---|
| 653 | else |
|---|
| 654 | tt = 1.0; |
|---|
| 655 | delta = tt / 2.0; |
|---|
| 656 | iterations = 0; |
|---|
| 657 | do { |
|---|
| 658 | slope = 0.0; |
|---|
| 659 | curv = 0.0; |
|---|
| 660 | pos = tt_2_pos(tt); |
|---|
| 661 | tt = pos_2_tt(pos); |
|---|
| 662 | build_akt_predikt(tt); |
|---|
| 663 | ap_pro *seq1 = entries[i]->sequence_protein->sequence; |
|---|
| 664 | ap_pro *seq2 = entries[j]->sequence_protein->sequence; |
|---|
| 665 | for (k = chars; k >0; k--) { |
|---|
| 666 | b1 = *(seq1++); |
|---|
| 667 | b2 = *(seq2++); |
|---|
| 668 | if (predict_infinity(b1,b2)){ |
|---|
| 669 | break; |
|---|
| 670 | } |
|---|
| 671 | slope += predict_slope(b1,b2); |
|---|
| 672 | curv += predict_curve(b1,b2); |
|---|
| 673 | } |
|---|
| 674 | iterations++; |
|---|
| 675 | if (!predict_infinity(b1,b2)) { |
|---|
| 676 | if (curv < 0.0) { |
|---|
| 677 | tt -= slope / curv; |
|---|
| 678 | if (tt > 10000.0) { |
|---|
| 679 | aw_message(GB_export_error("INFINITE DISTANCE BETWEEN SPECIES %ld AND %ld; -1.0 WAS WRITTEN\n", i, j)); |
|---|
| 680 | tt = -1.0 / fracchange; |
|---|
| 681 | break; |
|---|
| 682 | } |
|---|
| 683 | int npos = tt_2_pos(tt); |
|---|
| 684 | int d = npos - pos; if (d<0) d=-d; |
|---|
| 685 | if (d<=1){ // cannot optimize |
|---|
| 686 | break; |
|---|
| 687 | } |
|---|
| 688 | |
|---|
| 689 | } else { |
|---|
| 690 | if ((slope > 0.0 && delta < 0.0) || (slope < 0.0 && delta > 0.0)) |
|---|
| 691 | delta /= -2; |
|---|
| 692 | if (tt + delta < 0 && tt<= epsilon) { |
|---|
| 693 | break; |
|---|
| 694 | } |
|---|
| 695 | tt += delta; |
|---|
| 696 | } |
|---|
| 697 | } else { |
|---|
| 698 | delta /= -2; |
|---|
| 699 | tt += delta; |
|---|
| 700 | if (tt < 0) tt = 0; |
|---|
| 701 | } |
|---|
| 702 | } while (iterations < 20); |
|---|
| 703 | } else { // cat < kimura |
|---|
| 704 | m = 0; |
|---|
| 705 | n = 0; |
|---|
| 706 | ap_pro *seq1 = entries[i]->sequence_protein->sequence; |
|---|
| 707 | ap_pro *seq2 = entries[j]->sequence_protein->sequence; |
|---|
| 708 | for (k = chars; k >0; k--) { |
|---|
| 709 | b1 = *(seq1++); |
|---|
| 710 | b2 = *(seq2++); |
|---|
| 711 | if (b1 <= val && b2 <= val) { |
|---|
| 712 | if (b1 == b2) m++; |
|---|
| 713 | n++; |
|---|
| 714 | } |
|---|
| 715 | } |
|---|
| 716 | if (n < 5) { // no info |
|---|
| 717 | tt = -1.0; |
|---|
| 718 | }else{ |
|---|
| 719 | switch (whichcat) { |
|---|
| 720 | case kimura: |
|---|
| 721 | { |
|---|
| 722 | double rel = 1 - (double) m / n; |
|---|
| 723 | double drel = 1.0 - rel - 0.2 * rel * rel; |
|---|
| 724 | if (drel < 0.0) { |
|---|
| 725 | aw_message(GB_export_error("DISTANCE BETWEEN SEQUENCES %3ld AND %3ld IS TOO LARGE FOR KIMURA FORMULA", i, j)); |
|---|
| 726 | tt = -1.0; |
|---|
| 727 | }else{ |
|---|
| 728 | tt = -log(drel); |
|---|
| 729 | } |
|---|
| 730 | } |
|---|
| 731 | break; |
|---|
| 732 | case none: |
|---|
| 733 | tt = (n-m)/(double)n; |
|---|
| 734 | break; |
|---|
| 735 | case similarity: |
|---|
| 736 | tt = m/(double)n; |
|---|
| 737 | break; |
|---|
| 738 | default: |
|---|
| 739 | GB_CORE; |
|---|
| 740 | break; |
|---|
| 741 | } |
|---|
| 742 | } |
|---|
| 743 | } |
|---|
| 744 | matrix->set(i,j,fracchange * tt); |
|---|
| 745 | } |
|---|
| 746 | } |
|---|
| 747 | return 0; |
|---|
| 748 | } /* makedists */ |
|---|
| 749 | |
|---|
| 750 | |
|---|
| 751 | void ph_protdist::clean_slopes(){ |
|---|
| 752 | int i; |
|---|
| 753 | if (slopes) { |
|---|
| 754 | for (i=0;i<ph_resolution*ph_max_dist;i++) { |
|---|
| 755 | delete slopes[i]; slopes[i] = 0; |
|---|
| 756 | delete curves[i]; curves[i] = 0; |
|---|
| 757 | delete infs[i]; infs[i] = 0; |
|---|
| 758 | } |
|---|
| 759 | } |
|---|
| 760 | akt_slopes = 0; |
|---|
| 761 | akt_curves = 0; |
|---|
| 762 | akt_infs = 0; |
|---|
| 763 | } |
|---|
| 764 | |
|---|
| 765 | ph_protdist::~ph_protdist(){ |
|---|
| 766 | clean_slopes(); |
|---|
| 767 | } |
|---|
| 768 | |
|---|
| 769 | ph_protdist::ph_protdist(ph_codetype codei, ph_cattype cati, long nentries, PHENTRY **entriesi, long seq_len, AP_smatrix *matrixi){ |
|---|
| 770 | memset((char *)this,0,sizeof(ph_protdist)); |
|---|
| 771 | entries = entriesi; |
|---|
| 772 | matrix = matrixi; |
|---|
| 773 | freqa = .25; |
|---|
| 774 | freqc = .25; |
|---|
| 775 | freqg = .25; |
|---|
| 776 | freqt = .25; |
|---|
| 777 | ttratio = 2.0; |
|---|
| 778 | ease = 0.457; |
|---|
| 779 | spp = nentries; |
|---|
| 780 | chars = seq_len; |
|---|
| 781 | transition(); |
|---|
| 782 | whichcode = codei; |
|---|
| 783 | whichcat = cati; |
|---|
| 784 | switch(cati){ |
|---|
| 785 | case none: |
|---|
| 786 | case similarity: |
|---|
| 787 | case kimura: |
|---|
| 788 | fracchange = 1.0; |
|---|
| 789 | break; |
|---|
| 790 | case pam: |
|---|
| 791 | code(); |
|---|
| 792 | pameigen(); |
|---|
| 793 | break; |
|---|
| 794 | default: |
|---|
| 795 | code(); |
|---|
| 796 | maketrans(); |
|---|
| 797 | qreigen(prob, 20L); |
|---|
| 798 | break; |
|---|
| 799 | } |
|---|
| 800 | } |
|---|
| 801 | |
|---|