1 | // =============================================================== // |
---|
2 | // // |
---|
3 | // File : DI_protdist.cxx // |
---|
4 | // Purpose : // |
---|
5 | // // |
---|
6 | // Institute of Microbiology (Technical University Munich) // |
---|
7 | // http://www.arb-home.de/ // |
---|
8 | // // |
---|
9 | // =============================================================== // |
---|
10 | |
---|
11 | #include "di_protdist.hxx" |
---|
12 | #include "di_matr.hxx" |
---|
13 | #include <aw_msg.hxx> |
---|
14 | #include <arb_progress.h> |
---|
15 | #include <cmath> |
---|
16 | |
---|
17 | #define di_assert(cond) arb_assert(cond) |
---|
18 | |
---|
19 | #define epsilon 0.000001 // a small number |
---|
20 | |
---|
21 | double di_protdist::pameigs[20] = { |
---|
22 | -0.022091252, -0.019297602, 0.000004760, -0.017477817, |
---|
23 | -0.016575549, -0.015504543, -0.002112213, -0.002685727, |
---|
24 | -0.002976402, -0.013440755, -0.012926992, -0.004293227, |
---|
25 | -0.005356688, -0.011064786, -0.010480731, -0.008760449, |
---|
26 | -0.007142318, -0.007381851, -0.007806557, -0.008127024 |
---|
27 | }; |
---|
28 | |
---|
29 | double di_protdist::pamprobs[20][20] = { |
---|
30 | { |
---|
31 | -0.01522976, -0.00746819, -0.13934468, 0.11755315, -0.00212101, |
---|
32 | 0.01558456, -0.07408235, -0.00322387, 0.01375826, 0.00448826, |
---|
33 | 0.00154174, 0.02013313, -0.00159183, -0.00069275, -0.00399898, |
---|
34 | 0.08414055, -0.01188178, -0.00029870, 0.00220371, 0.00042546 |
---|
35 | }, |
---|
36 | { |
---|
37 | -0.07765582, -0.00712634, -0.03683209, -0.08065755, -0.00462872, |
---|
38 | -0.03791039, 0.10642147, -0.00912185, 0.01436308, -0.00133243, |
---|
39 | 0.00166346, 0.00624657, -0.00003363, -0.00128729, -0.00690319, |
---|
40 | 0.17442028, -0.05373336, -0.00078751, -0.00038151, 0.01382718 |
---|
41 | }, |
---|
42 | { |
---|
43 | -0.08810973, -0.04081786, -0.04066004, -0.04736004, -0.03275406, |
---|
44 | -0.03761164, -0.05047487, -0.09086213, -0.03269598, -0.03558015, |
---|
45 | -0.08407966, -0.07970977, -0.01504743, -0.04011920, -0.05182232, |
---|
46 | -0.07026991, -0.05846931, -0.01016998, -0.03047472, -0.06280511 |
---|
47 | }, |
---|
48 | { |
---|
49 | 0.02513756, -0.00578333, 0.09865453, 0.01322314, -0.00310665, |
---|
50 | 0.05880899, -0.09252443, -0.02986539, -0.03127460, 0.01007539, |
---|
51 | -0.00360119, -0.01995024, 0.00094940, -0.00145868, -0.01388816, |
---|
52 | 0.11358341, -0.12127513, -0.00054696, -0.00055627, 0.00417284 |
---|
53 | }, |
---|
54 | { |
---|
55 | 0.16517316, -0.00254742, -0.03318745, -0.01984173, 0.00031890, |
---|
56 | -0.02817810, 0.02661678, -0.01761215, 0.01665112, 0.10513343, |
---|
57 | -0.00545026, 0.01827470, -0.00207616, -0.00763758, -0.01322808, |
---|
58 | -0.02202576, -0.07434204, 0.00020593, 0.00119979, -0.10827873 |
---|
59 | }, |
---|
60 | { |
---|
61 | 0.16088826, 0.00056313, -0.02579303, -0.00319655, 0.00037228, |
---|
62 | -0.03193150, 0.01655305, -0.03028640, 0.01367746, -0.11248153, |
---|
63 | 0.00778371, 0.02675579, 0.00243718, 0.00895470, -0.01729803, |
---|
64 | -0.02686964, -0.08262584, 0.00011794, -0.00225134, 0.09415650 |
---|
65 | }, |
---|
66 | { |
---|
67 | -0.01739295, 0.00572017, -0.00712592, -0.01100922, -0.00870113, |
---|
68 | -0.00663461, -0.01153857, -0.02248432, -0.00382264, -0.00358612, |
---|
69 | -0.00139345, -0.00971460, -0.00133312, 0.01927783, -0.01053838, |
---|
70 | -0.00911362, -0.01010908, 0.09417598, 0.01763850, -0.00955454 |
---|
71 | }, |
---|
72 | { |
---|
73 | 0.01728888, 0.01344211, 0.01200836, 0.01857259, -0.17088517, |
---|
74 | 0.01457592, 0.01997839, 0.02844884, 0.00839403, 0.00196862, |
---|
75 | 0.01391984, 0.03270465, 0.00347173, -0.01940984, 0.01233979, |
---|
76 | 0.00542887, 0.01008836, 0.00126491, -0.02863042, 0.00449764 |
---|
77 | }, |
---|
78 | { |
---|
79 | -0.02881366, -0.02184155, -0.01566086, -0.02593764, -0.04050907, |
---|
80 | -0.01539603, -0.02576729, -0.05089606, -0.00597430, 0.02181643, |
---|
81 | 0.09835597, -0.04040940, 0.00873512, 0.12139434, -0.02427882, |
---|
82 | -0.02945238, -0.01566867, -0.01606503, 0.09475319, 0.02238670 |
---|
83 | }, |
---|
84 | { |
---|
85 | 0.04080274, -0.02869626, -0.05191093, -0.08435843, 0.00021141, |
---|
86 | 0.13043842, 0.00871530, 0.00496058, -0.02797641, -0.00636933, |
---|
87 | 0.02243277, 0.03640362, -0.05735517, 0.00196918, -0.02218934, |
---|
88 | -0.00608972, 0.02872922, 0.00047619, 0.00151285, 0.00883489 |
---|
89 | }, |
---|
90 | { |
---|
91 | -0.02623824, 0.00331152, 0.03640692, 0.04260231, -0.00038223, |
---|
92 | -0.07480340, -0.01022492, -0.00426473, 0.01448116, 0.01456847, |
---|
93 | 0.05786680, 0.03368691, -0.10126924, -0.00147454, 0.01275395, |
---|
94 | 0.00017574, -0.01585206, -0.00015767, -0.00231848, 0.02310137 |
---|
95 | }, |
---|
96 | { |
---|
97 | -0.00846258, -0.01508106, -0.01967505, -0.02772004, 0.01248253, |
---|
98 | -0.01331243, -0.02569382, -0.04461524, -0.02207075, 0.04663443, |
---|
99 | 0.19347923, -0.02745691, 0.02288515, -0.04883849, -0.01084597, |
---|
100 | -0.01947187, -0.00081675, 0.00516540, -0.07815919, 0.08035585 |
---|
101 | }, |
---|
102 | { |
---|
103 | -0.06553111, 0.09756831, 0.00524326, -0.00885098, 0.00756653, |
---|
104 | 0.02783099, -0.00427042, -0.16680359, 0.03951331, -0.00490540, |
---|
105 | 0.01719610, 0.15018204, 0.00882722, -0.00423197, -0.01919217, |
---|
106 | -0.02963619, -0.01831342, -0.00524338, 0.00011379, -0.02566864 |
---|
107 | }, |
---|
108 | { |
---|
109 | -0.07494341, -0.11348850, 0.00241343, -0.00803016, 0.00492438, |
---|
110 | 0.00711909, -0.00829147, 0.05793337, 0.02734209, 0.02059759, |
---|
111 | -0.02770280, 0.14128338, 0.01532479, 0.00364307, 0.05968116, |
---|
112 | -0.06497960, -0.08113941, 0.00319445, -0.00104222, 0.03553497 |
---|
113 | }, |
---|
114 | { |
---|
115 | 0.05948223, -0.08959930, 0.03269977, -0.03272374, -0.00365667, |
---|
116 | -0.03423294, -0.06418925, -0.05902138, 0.05746317, -0.02580596, |
---|
117 | 0.01259572, 0.05848832, 0.00672666, 0.00233355, -0.05145149, |
---|
118 | 0.07348503, 0.11427955, 0.00142592, -0.01030651, -0.04862799 |
---|
119 | }, |
---|
120 | { |
---|
121 | -0.01606880, 0.05200845, -0.01212967, -0.06824429, -0.00234304, |
---|
122 | 0.01094203, -0.07375538, 0.08808629, 0.12394822, 0.02231351, |
---|
123 | -0.03608265, -0.06978045, -0.00618360, 0.00274747, -0.01921876, |
---|
124 | -0.01541969, -0.02223856, -0.00107603, -0.01251777, 0.05412534 |
---|
125 | }, |
---|
126 | { |
---|
127 | 0.01688843, 0.05784728, -0.02256966, -0.07072251, -0.00422551, |
---|
128 | -0.06261233, -0.08502830, 0.08925346, -0.08529597, 0.01519343, |
---|
129 | -0.05008258, 0.10931873, 0.00521033, 0.02593305, -0.00717855, |
---|
130 | 0.02291527, 0.02527388, -0.00266188, -0.00871160, 0.02708135 |
---|
131 | }, |
---|
132 | { |
---|
133 | -0.04233344, 0.00076379, 0.01571257, 0.04003092, 0.00901468, |
---|
134 | 0.00670577, 0.03459487, 0.12420216, -0.00067366, -0.01515094, |
---|
135 | 0.05306642, 0.04338407, 0.00511287, 0.01036639, -0.17867462, |
---|
136 | -0.02289440, -0.03213205, 0.00017924, -0.01187362, -0.03933874 |
---|
137 | }, |
---|
138 | { |
---|
139 | 0.01284817, -0.01685622, 0.00724363, 0.01687952, -0.00882070, |
---|
140 | -0.00555957, 0.01676246, -0.05560456, -0.00966893, 0.06197684, |
---|
141 | -0.09058758, 0.00880607, 0.00108629, -0.08308956, -0.08056832, |
---|
142 | -0.00413297, 0.02973107, 0.00092948, 0.07010111, 0.13007418 |
---|
143 | }, |
---|
144 | { |
---|
145 | 0.00700223, -0.01347574, 0.00691332, 0.03122905, 0.00310308, |
---|
146 | 0.00946862, 0.03455040, -0.06712536, -0.00304506, 0.04267941, |
---|
147 | -0.10422292, -0.01127831, -0.00549798, 0.11680505, -0.03352701, |
---|
148 | -0.00084536, 0.01631369, 0.00095063, -0.09570217, 0.06480321 |
---|
149 | } |
---|
150 | }; |
---|
151 | |
---|
152 | void di_protdist::cats(di_cattype wcat) |
---|
153 | { |
---|
154 | // define categories of amino acids |
---|
155 | aas b; |
---|
156 | |
---|
157 | // fundamental subgroups |
---|
158 | cat[(long) CYS - (long) ALA] = 1; |
---|
159 | cat[(long) MET - (long) ALA] = 2; |
---|
160 | cat[(long) VAL - (long) ALA] = 3; |
---|
161 | cat[(long) LEU - (long) ALA] = 3; |
---|
162 | cat[(long) ILEU - (long) ALA] = 3; |
---|
163 | cat[(long) GLY - (long) ALA] = 4; |
---|
164 | cat[0] = 4; |
---|
165 | cat[(long) SER - (long) ALA] = 4; |
---|
166 | cat[(long) THR - (long) ALA] = 4; |
---|
167 | cat[(long) PRO - (long) ALA] = 5; |
---|
168 | cat[(long) PHE - (long) ALA] = 6; |
---|
169 | cat[(long) TYR - (long) ALA] = 6; |
---|
170 | cat[(long) TRP - (long) ALA] = 6; |
---|
171 | cat[(long) GLU - (long) ALA] = 7; |
---|
172 | cat[(long) GLN - (long) ALA] = 7; |
---|
173 | cat[(long) ASP - (long) ALA] = 7; |
---|
174 | cat[(long) ASN - (long) ALA] = 7; |
---|
175 | cat[(long) LYS - (long) ALA] = 8; |
---|
176 | cat[(long) ARG - (long) ALA] = 8; |
---|
177 | cat[(long) HIS - (long) ALA] = 8; |
---|
178 | if (wcat == GEORGE) { |
---|
179 | /* George, Hunt and Barker: sulfhydryl, small hydrophobic, |
---|
180 | * small hydrophilic, aromatic, acid/acid-amide/hydrophilic, |
---|
181 | * basic |
---|
182 | */ |
---|
183 | for (b = ALA; (long) b <= (long) VAL; b = (aas) ((long) b + 1)) { |
---|
184 | if (cat[(long) b - (long) ALA] == 3) |
---|
185 | cat[(long) b - (long) ALA] = 2; |
---|
186 | if (cat[(long) b - (long) ALA] == 5) |
---|
187 | cat[(long) b - (long) ALA] = 4; |
---|
188 | } |
---|
189 | } |
---|
190 | if (wcat == CHEMICAL) { |
---|
191 | /* Conn and Stumpf: monoamino, aliphatic, heterocyclic, |
---|
192 | * aromatic, dicarboxylic, basic |
---|
193 | */ |
---|
194 | for (b = ALA; (long) b <= (long) VAL; b = (aas) ((long) b + 1)) { |
---|
195 | if (cat[(long) b - (long) ALA] == 2) |
---|
196 | cat[(long) b - (long) ALA] = 1; |
---|
197 | if (cat[(long) b - (long) ALA] == 4) |
---|
198 | cat[(long) b - (long) ALA] = 3; |
---|
199 | } |
---|
200 | } |
---|
201 | // Ben Hall's personal opinion |
---|
202 | if (wcat != HALL) |
---|
203 | return; |
---|
204 | for (b = ALA; (long) b <= (long) VAL; b = (aas) ((long) b + 1)) { |
---|
205 | if (cat[(long) b - (long) ALA] == 3) |
---|
206 | cat[(long) b - (long) ALA] = 2; |
---|
207 | } |
---|
208 | } |
---|
209 | |
---|
210 | |
---|
211 | void di_protdist::maketrans() { |
---|
212 | // Make up transition probability matrix from code and category tables |
---|
213 | long i, j, k, m, n, s; |
---|
214 | double x, sum = 0; |
---|
215 | long sub[3], newsub[3]; |
---|
216 | double f[4], g[4]; |
---|
217 | aas b1, b2; |
---|
218 | double TEMP, TEMP1, TEMP2, TEMP3; |
---|
219 | |
---|
220 | for (i = 0; i <= 19; i++) { |
---|
221 | pi[i] = 0.0; |
---|
222 | for (j = 0; j <= 19; j++) |
---|
223 | prob[i][j] = 0.0; |
---|
224 | } |
---|
225 | f[0] = freqt; |
---|
226 | f[1] = freqc; |
---|
227 | f[2] = freqa; |
---|
228 | f[3] = freqg; |
---|
229 | g[0] = freqc + freqt; |
---|
230 | g[1] = freqc + freqt; |
---|
231 | g[2] = freqa + freqg; |
---|
232 | g[3] = freqa + freqg; |
---|
233 | TEMP = f[0]; |
---|
234 | TEMP1 = f[1]; |
---|
235 | TEMP2 = f[2]; |
---|
236 | TEMP3 = f[3]; |
---|
237 | fracchange = xi * (2 * f[0] * f[1] / g[0] + 2 * f[2] * f[3] / g[2]) + |
---|
238 | xv * (1 - TEMP * TEMP - TEMP1 * TEMP1 - TEMP2 * TEMP2 - TEMP3 * TEMP3); |
---|
239 | for (i = 0; i <= 3; i++) { |
---|
240 | for (j = 0; j <= 3; j++) { |
---|
241 | for (k = 0; k <= 3; k++) { |
---|
242 | if (trans[i][j][k] != STOP) |
---|
243 | sum += f[i] * f[j] * f[k]; |
---|
244 | } |
---|
245 | } |
---|
246 | } |
---|
247 | for (i = 0; i <= 3; i++) { |
---|
248 | sub[0] = i + 1; |
---|
249 | for (j = 0; j <= 3; j++) { |
---|
250 | sub[1] = j + 1; |
---|
251 | for (k = 0; k <= 3; k++) { |
---|
252 | sub[2] = k + 1; |
---|
253 | b1 = trans[i][j][k]; |
---|
254 | for (m = 0; m <= 2; m++) { |
---|
255 | s = sub[m]; |
---|
256 | for (n = 1; n <= 4; n++) { |
---|
257 | memcpy(newsub, sub, sizeof(long) * 3L); |
---|
258 | newsub[m] = n; |
---|
259 | x = f[i] * f[j] * f[k] / (3.0 * sum); |
---|
260 | if (((s == 1 || s == 2) && (n == 3 || n == 4)) || |
---|
261 | ((n == 1 || n == 2) && (s == 3 || s == 4))) |
---|
262 | x *= xv * f[n - 1]; |
---|
263 | else |
---|
264 | x *= xi * f[n - 1] / g[n - 1] + xv * f[n - 1]; |
---|
265 | b2 = trans[newsub[0] - 1][newsub[1] - 1][newsub[2] - 1]; |
---|
266 | if (b1 != STOP) { |
---|
267 | pi[b1] += x; |
---|
268 | if (b2 != STOP) { |
---|
269 | if (cat[b1] != cat[b2]) { |
---|
270 | prob[b1][b2] += x * ease; |
---|
271 | prob[b1][b1] += x * (1.0 - ease); |
---|
272 | } else |
---|
273 | prob[b1][b2] += x; |
---|
274 | } else |
---|
275 | prob[b1][b1] += x; |
---|
276 | } |
---|
277 | } |
---|
278 | } |
---|
279 | } |
---|
280 | } |
---|
281 | } |
---|
282 | for (i = 0; i <= 19; i++) |
---|
283 | prob[i][i] -= pi[i]; |
---|
284 | for (i = 0; i <= 19; i++) { |
---|
285 | for (j = 0; j <= 19; j++) |
---|
286 | prob[i][j] /= sqrt(pi[i] * pi[j]); |
---|
287 | } |
---|
288 | // computes pi^(1/2)*B*pi^(-1/2) |
---|
289 | } |
---|
290 | |
---|
291 | void di_protdist::code() |
---|
292 | { |
---|
293 | // make up table of the code 0 = u, 1 = c, 2 = a, 3 = g |
---|
294 | |
---|
295 | trans[0][0][0] = PHE; |
---|
296 | trans[0][0][1] = PHE; |
---|
297 | trans[0][0][2] = LEU; |
---|
298 | trans[0][0][3] = LEU; |
---|
299 | trans[0][1][0] = SER; |
---|
300 | trans[0][1][1] = SER; |
---|
301 | trans[0][1][2] = SER; |
---|
302 | trans[0][1][3] = SER; |
---|
303 | trans[0][2][0] = TYR; |
---|
304 | trans[0][2][1] = TYR; |
---|
305 | trans[0][2][2] = STOP; |
---|
306 | trans[0][2][3] = STOP; |
---|
307 | trans[0][3][0] = CYS; |
---|
308 | trans[0][3][1] = CYS; |
---|
309 | trans[0][3][2] = STOP; |
---|
310 | trans[0][3][3] = TRP; |
---|
311 | trans[1][0][0] = LEU; |
---|
312 | trans[1][0][1] = LEU; |
---|
313 | trans[1][0][2] = LEU; |
---|
314 | trans[1][0][3] = LEU; |
---|
315 | trans[1][1][0] = PRO; |
---|
316 | trans[1][1][1] = PRO; |
---|
317 | trans[1][1][2] = PRO; |
---|
318 | trans[1][1][3] = PRO; |
---|
319 | trans[1][2][0] = HIS; |
---|
320 | trans[1][2][1] = HIS; |
---|
321 | trans[1][2][2] = GLN; |
---|
322 | trans[1][2][3] = GLN; |
---|
323 | trans[1][3][0] = ARG; |
---|
324 | trans[1][3][1] = ARG; |
---|
325 | trans[1][3][2] = ARG; |
---|
326 | trans[1][3][3] = ARG; |
---|
327 | trans[2][0][0] = ILEU; |
---|
328 | trans[2][0][1] = ILEU; |
---|
329 | trans[2][0][2] = ILEU; |
---|
330 | trans[2][0][3] = MET; |
---|
331 | trans[2][1][0] = THR; |
---|
332 | trans[2][1][1] = THR; |
---|
333 | trans[2][1][2] = THR; |
---|
334 | trans[2][1][3] = THR; |
---|
335 | trans[2][2][0] = ASN; |
---|
336 | trans[2][2][1] = ASN; |
---|
337 | trans[2][2][2] = LYS; |
---|
338 | trans[2][2][3] = LYS; |
---|
339 | trans[2][3][0] = SER; |
---|
340 | trans[2][3][1] = SER; |
---|
341 | trans[2][3][2] = ARG; |
---|
342 | trans[2][3][3] = ARG; |
---|
343 | trans[3][0][0] = VAL; |
---|
344 | trans[3][0][1] = VAL; |
---|
345 | trans[3][0][2] = VAL; |
---|
346 | trans[3][0][3] = VAL; |
---|
347 | trans[3][1][0] = ALA; |
---|
348 | trans[3][1][1] = ALA; |
---|
349 | trans[3][1][2] = ALA; |
---|
350 | trans[3][1][3] = ALA; |
---|
351 | trans[3][2][0] = ASP; |
---|
352 | trans[3][2][1] = ASP; |
---|
353 | trans[3][2][2] = GLU; |
---|
354 | trans[3][2][3] = GLU; |
---|
355 | trans[3][3][0] = GLY; |
---|
356 | trans[3][3][1] = GLY; |
---|
357 | trans[3][3][2] = GLY; |
---|
358 | trans[3][3][3] = GLY; |
---|
359 | |
---|
360 | switch (whichcode) { |
---|
361 | case UNIVERSAL: |
---|
362 | case CILIATE: |
---|
363 | break; // use default code above |
---|
364 | |
---|
365 | case MITO: |
---|
366 | trans[0][3][2] = TRP; |
---|
367 | break; |
---|
368 | |
---|
369 | case VERTMITO: |
---|
370 | trans[0][3][2] = TRP; |
---|
371 | trans[2][3][2] = STOP; |
---|
372 | trans[2][3][3] = STOP; |
---|
373 | trans[2][0][2] = MET; |
---|
374 | break; |
---|
375 | |
---|
376 | case FLYMITO: |
---|
377 | trans[0][3][2] = TRP; |
---|
378 | trans[2][0][2] = MET; |
---|
379 | trans[2][3][2] = SER; |
---|
380 | break; |
---|
381 | |
---|
382 | case YEASTMITO: |
---|
383 | trans[0][3][2] = TRP; |
---|
384 | trans[1][0][2] = THR; |
---|
385 | trans[2][0][2] = MET; |
---|
386 | break; |
---|
387 | } |
---|
388 | } |
---|
389 | |
---|
390 | void di_protdist::transition() |
---|
391 | { |
---|
392 | // calculations related to transition-transversion ratio |
---|
393 | double aa, bb, freqr, freqy, freqgr, freqty; |
---|
394 | |
---|
395 | freqr = freqa + freqg; |
---|
396 | freqy = freqc + freqt; |
---|
397 | freqgr = freqg / freqr; |
---|
398 | freqty = freqt / freqy; |
---|
399 | aa = ttratio * freqr * freqy - freqa * freqg - freqc * freqt; |
---|
400 | bb = freqa * freqgr + freqc * freqty; |
---|
401 | xi = aa / (aa + bb); |
---|
402 | xv = 1.0 - xi; |
---|
403 | if (xi <= 0.0 && xi >= -epsilon) |
---|
404 | xi = 0.0; |
---|
405 | if (xi < 0.0) { |
---|
406 | GBK_terminate("This transition-transversion ratio is impossible with these base frequencies"); // @@@ should be handled better |
---|
407 | } |
---|
408 | } |
---|
409 | |
---|
410 | void di_protdist::givens(di_aa_matrix a, long i, long j, long n, double ctheta, double stheta, bool left) |
---|
411 | { |
---|
412 | // Givens transform at i,j for 1..n with angle theta |
---|
413 | long k; |
---|
414 | double d; |
---|
415 | |
---|
416 | for (k = 0; k < n; k++) { |
---|
417 | if (left) { |
---|
418 | d = ctheta * a[i - 1][k] + stheta * a[j - 1][k]; |
---|
419 | a[j - 1][k] = ctheta * a[j - 1][k] - stheta * a[i - 1][k]; |
---|
420 | a[i - 1][k] = d; |
---|
421 | } |
---|
422 | else { |
---|
423 | d = ctheta * a[k][i - 1] + stheta * a[k][j - 1]; |
---|
424 | a[k][j - 1] = ctheta * a[k][j - 1] - stheta * a[k][i - 1]; |
---|
425 | a[k][i - 1] = d; |
---|
426 | } |
---|
427 | } |
---|
428 | } |
---|
429 | |
---|
430 | void di_protdist::coeffs(double x, double y, double *c, double *s, double accuracy) |
---|
431 | { |
---|
432 | // compute cosine and sine of theta |
---|
433 | double root; |
---|
434 | |
---|
435 | root = sqrt(x * x + y * y); |
---|
436 | if (root < accuracy) { |
---|
437 | *c = 1.0; |
---|
438 | *s = 0.0; |
---|
439 | } |
---|
440 | else { |
---|
441 | *c = x / root; |
---|
442 | *s = y / root; |
---|
443 | } |
---|
444 | } |
---|
445 | |
---|
446 | void di_protdist::tridiag(di_aa_matrix a, long n, double accuracy) |
---|
447 | { |
---|
448 | // Givens tridiagonalization |
---|
449 | long i, j; |
---|
450 | double s, c; |
---|
451 | |
---|
452 | for (i = 2; i < n; i++) { |
---|
453 | for (j = i + 1; j <= n; j++) { |
---|
454 | coeffs(a[i - 2][i - 1], a[i - 2][j - 1], &c, &s, accuracy); |
---|
455 | givens(a, i, j, n, c, s, true); |
---|
456 | givens(a, i, j, n, c, s, false); |
---|
457 | givens(eigvecs, i, j, n, c, s, true); |
---|
458 | } |
---|
459 | } |
---|
460 | } |
---|
461 | |
---|
462 | void di_protdist::shiftqr(di_aa_matrix a, long n, double accuracy) |
---|
463 | { |
---|
464 | // QR eigenvalue-finder |
---|
465 | long i, j; |
---|
466 | double approx, s, c, d, TEMP, TEMP1; |
---|
467 | |
---|
468 | for (i = n; i >= 2; i--) { |
---|
469 | do { |
---|
470 | TEMP = a[i - 2][i - 2] - a[i - 1][i - 1]; |
---|
471 | TEMP1 = a[i - 1][i - 2]; |
---|
472 | d = sqrt(TEMP * TEMP + TEMP1 * TEMP1); |
---|
473 | approx = a[i - 2][i - 2] + a[i - 1][i - 1]; |
---|
474 | if (a[i - 1][i - 1] < a[i - 2][i - 2]) |
---|
475 | approx = (approx - d) / 2.0; |
---|
476 | else |
---|
477 | approx = (approx + d) / 2.0; |
---|
478 | for (j = 0; j < i; j++) |
---|
479 | a[j][j] -= approx; |
---|
480 | for (j = 1; j < i; j++) { |
---|
481 | coeffs(a[j - 1][j - 1], a[j][j - 1], &c, &s, accuracy); |
---|
482 | givens(a, j, j + 1, i, c, s, true); |
---|
483 | givens(a, j, j + 1, i, c, s, false); |
---|
484 | givens(eigvecs, j, j + 1, n, c, s, true); |
---|
485 | } |
---|
486 | for (j = 0; j < i; j++) |
---|
487 | a[j][j] += approx; |
---|
488 | } while (fabs(a[i - 1][i - 2]) > accuracy); |
---|
489 | } |
---|
490 | } |
---|
491 | |
---|
492 | |
---|
493 | void di_protdist::qreigen(di_aa_matrix proba, long n) |
---|
494 | { |
---|
495 | // QR eigenvector/eigenvalue method for symmetric matrix |
---|
496 | double accuracy; |
---|
497 | long i, j; |
---|
498 | |
---|
499 | accuracy = 1.0e-6; |
---|
500 | for (i = 0; i < n; i++) { |
---|
501 | for (j = 0; j < n; j++) |
---|
502 | eigvecs[i][j] = 0.0; |
---|
503 | eigvecs[i][i] = 1.0; |
---|
504 | } |
---|
505 | tridiag(proba, n, accuracy); |
---|
506 | shiftqr(proba, n, accuracy); |
---|
507 | for (i = 0; i < n; i++) |
---|
508 | eig[i] = proba[i][i]; |
---|
509 | for (i = 0; i <= 19; i++) { |
---|
510 | for (j = 0; j <= 19; j++) |
---|
511 | proba[i][j] = sqrt(pi[j]) * eigvecs[i][j]; |
---|
512 | } |
---|
513 | // proba[i][j] is the value of U' times pi^(1/2) |
---|
514 | } |
---|
515 | |
---|
516 | |
---|
517 | void di_protdist::pameigen() |
---|
518 | { |
---|
519 | // eigenanalysis for PAM matrix, precomputed |
---|
520 | memcpy(prob, pamprobs, sizeof(pamprobs)); |
---|
521 | memcpy(eig, pameigs, sizeof(pameigs)); |
---|
522 | fracchange = 0.01; |
---|
523 | } |
---|
524 | |
---|
525 | void di_protdist::build_exptteig(double tt) { |
---|
526 | int m; |
---|
527 | for (m = 0; m <= 19; m++) { |
---|
528 | exptteig[m] = exp(tt * eig[m]); |
---|
529 | } |
---|
530 | } |
---|
531 | |
---|
532 | void di_protdist::predict(double /* tt */, long nb1, long nb2) { |
---|
533 | // make contribution to prediction of this aa pair |
---|
534 | for (long m = 0; m <= 19; m++) { |
---|
535 | double q = prob[m][nb1] * prob[m][nb2] * exptteig[m]; |
---|
536 | p += q; |
---|
537 | double TEMP = eig[m]; |
---|
538 | dp += TEMP * q; |
---|
539 | d2p += TEMP * TEMP * q; |
---|
540 | } |
---|
541 | } |
---|
542 | |
---|
543 | void di_protdist::build_predikt_table(int pos) { |
---|
544 | int b1, b2; |
---|
545 | double tt = pos_2_tt(pos); |
---|
546 | build_exptteig(tt); |
---|
547 | akt_slopes = slopes[pos] = (di_paa_matrix *) calloc(sizeof(di_paa_matrix), 1); |
---|
548 | akt_curves = curves[pos] = (di_paa_matrix *) calloc(sizeof(di_paa_matrix), 1); |
---|
549 | akt_infs = infs[pos] = (di_bool_matrix *) calloc(sizeof(di_bool_matrix), 1); |
---|
550 | |
---|
551 | for (b1 = ALA; b1 < DI_MAX_PAA; b1++) { |
---|
552 | for (b2 = ALA; b2 <= b1; b2++) { |
---|
553 | if (b1 != STOP && b1 != DEL && b1 != QUEST && b1 != UNK && |
---|
554 | b2 != STOP && b2 != DEL && b2 != QUEST && b2 != UNK) { |
---|
555 | p = 0.0; |
---|
556 | dp = 0.0; |
---|
557 | d2p = 0.0; |
---|
558 | if (b1 != ASX && b1 != GLX && b2 != ASX && b2 != GLX) { |
---|
559 | predict(tt, b1, b2); |
---|
560 | } |
---|
561 | else { |
---|
562 | if (b1 == ASX) { |
---|
563 | if (b2 == ASX) { |
---|
564 | predict(tt, 2L, 2L); |
---|
565 | predict(tt, 2L, 3L); |
---|
566 | predict(tt, 3L, 2L); |
---|
567 | predict(tt, 3L, 3L); |
---|
568 | } |
---|
569 | else { |
---|
570 | if (b2 == GLX) { |
---|
571 | predict(tt, 2L, 5L); |
---|
572 | predict(tt, 2L, 6L); |
---|
573 | predict(tt, 3L, 5L); |
---|
574 | predict(tt, 3L, 6L); |
---|
575 | } |
---|
576 | else { |
---|
577 | predict(tt, 2L, b2); |
---|
578 | predict(tt, 3L, b2); |
---|
579 | } |
---|
580 | } |
---|
581 | } |
---|
582 | else { |
---|
583 | if (b1 == GLX) { |
---|
584 | if (b2 == ASX) { |
---|
585 | predict(tt, 5L, 2L); |
---|
586 | predict(tt, 5L, 3L); |
---|
587 | predict(tt, 6L, 2L); |
---|
588 | predict(tt, 6L, 3L); |
---|
589 | } |
---|
590 | else { |
---|
591 | if (b2 == GLX) { |
---|
592 | predict(tt, 5L, 5L); |
---|
593 | predict(tt, 5L, 6L); |
---|
594 | predict(tt, 6L, 5L); |
---|
595 | predict(tt, 6L, 6L); |
---|
596 | } |
---|
597 | else { |
---|
598 | predict(tt, 5L, b2); |
---|
599 | predict(tt, 6L, b2); |
---|
600 | } |
---|
601 | } |
---|
602 | } |
---|
603 | else { |
---|
604 | if (b2 == ASX) { |
---|
605 | predict(tt, b1, 2L); |
---|
606 | predict(tt, b1, 3L); |
---|
607 | predict(tt, b1, 2L); |
---|
608 | predict(tt, b1, 3L); |
---|
609 | } |
---|
610 | else if (b2 == GLX) { |
---|
611 | predict(tt, b1, 5L); |
---|
612 | predict(tt, b1, 6L); |
---|
613 | predict(tt, b1, 5L); |
---|
614 | predict(tt, b1, 6L); |
---|
615 | } |
---|
616 | } |
---|
617 | } |
---|
618 | } |
---|
619 | if (p > 0.0) { |
---|
620 | akt_slopes[0][b1][b2] = dp / p; |
---|
621 | akt_curves[0][b1][b2] = d2p / p - dp * dp / (p * p); |
---|
622 | akt_infs[0][b1][b2] = 0; |
---|
623 | akt_slopes[0][b2][b1] = akt_slopes[0][b1][b2]; |
---|
624 | akt_curves[0][b2][b1] = akt_curves[0][b1][b2]; |
---|
625 | akt_infs[0][b2][b1] = 0; |
---|
626 | } |
---|
627 | else { |
---|
628 | akt_infs[0][b1][b2] = 1; |
---|
629 | akt_infs[0][b2][b1] = 1; |
---|
630 | } |
---|
631 | } |
---|
632 | } |
---|
633 | } |
---|
634 | } |
---|
635 | |
---|
636 | int di_protdist::tt_2_pos(double tt) { |
---|
637 | int pos = (int)(tt * fracchange * DI_RESOLUTION); |
---|
638 | if (pos >= DI_RESOLUTION * DI_MAX_DIST) |
---|
639 | pos = DI_RESOLUTION * DI_MAX_DIST - 1; |
---|
640 | if (pos < 0) |
---|
641 | pos = 0; |
---|
642 | return pos; |
---|
643 | } |
---|
644 | |
---|
645 | double di_protdist::pos_2_tt(int pos) { |
---|
646 | double tt = pos / (fracchange * DI_RESOLUTION); |
---|
647 | return tt+epsilon; |
---|
648 | } |
---|
649 | |
---|
650 | void di_protdist::build_akt_predikt(double tt) |
---|
651 | { |
---|
652 | // take an actual slope from the hash table, else calculate a new one |
---|
653 | int pos = tt_2_pos(tt); |
---|
654 | if (!slopes[pos]) { |
---|
655 | build_predikt_table(pos); |
---|
656 | } |
---|
657 | akt_slopes = slopes[pos]; |
---|
658 | akt_curves = curves[pos]; |
---|
659 | akt_infs = infs[pos]; |
---|
660 | return; |
---|
661 | |
---|
662 | } |
---|
663 | |
---|
664 | GB_ERROR di_protdist::makedists(bool *aborted_flag) { |
---|
665 | /* compute the distances. |
---|
666 | * sets 'aborted_flag' to true, if it is non-NULL and user aborts the calculation |
---|
667 | */ |
---|
668 | long i, j, k, m, n, iterations; |
---|
669 | double delta, slope, curv; |
---|
670 | int b1 = 0, b2=0; |
---|
671 | double tt = 0; |
---|
672 | int pos; |
---|
673 | |
---|
674 | arb_progress progress("Calculating distances", matrix_halfsize(spp, false)); |
---|
675 | GB_ERROR error = NULL; |
---|
676 | |
---|
677 | for (i = 0; i < spp && !error; i++) { |
---|
678 | matrix->set(i, i, 0.0); |
---|
679 | { |
---|
680 | // move all unknown characters to del |
---|
681 | ap_pro *seq1 = entries[i]->sequence_protein->get_sequence(); |
---|
682 | for (k = 0; k <chars; k++) { |
---|
683 | b1 = seq1[k]; |
---|
684 | if (b1 <= VAL) continue; |
---|
685 | if (b1 == ASX || b1 == GLX) continue; |
---|
686 | seq1[k] = DEL; |
---|
687 | } |
---|
688 | } |
---|
689 | |
---|
690 | for (j = 0; j < i && !error; j++) { |
---|
691 | if (whichcat > KIMURA) { |
---|
692 | if (whichcat == PAM) |
---|
693 | tt = 10.0; |
---|
694 | else |
---|
695 | tt = 1.0; |
---|
696 | delta = tt / 2.0; |
---|
697 | iterations = 0; |
---|
698 | do { |
---|
699 | slope = 0.0; |
---|
700 | curv = 0.0; |
---|
701 | pos = tt_2_pos(tt); |
---|
702 | tt = pos_2_tt(pos); |
---|
703 | build_akt_predikt(tt); |
---|
704 | const ap_pro *seq1 = entries[i]->sequence_protein->get_sequence(); |
---|
705 | const ap_pro *seq2 = entries[j]->sequence_protein->get_sequence(); |
---|
706 | for (k = chars; k >0; k--) { |
---|
707 | b1 = *(seq1++); |
---|
708 | b2 = *(seq2++); |
---|
709 | if (predict_infinity(b1, b2)) { |
---|
710 | break; |
---|
711 | } |
---|
712 | slope += predict_slope(b1, b2); |
---|
713 | curv += predict_curve(b1, b2); |
---|
714 | } |
---|
715 | iterations++; |
---|
716 | if (!predict_infinity(b1, b2)) { |
---|
717 | if (curv < 0.0) { |
---|
718 | tt -= slope / curv; |
---|
719 | if (tt > 10000.0) { |
---|
720 | aw_message(GBS_global_string("Warning: infinite distance between species '%s' and '%s'\n", entries[i]->name, entries[j]->name)); |
---|
721 | tt = -1.0 / fracchange; |
---|
722 | break; |
---|
723 | } |
---|
724 | int npos = tt_2_pos(tt); |
---|
725 | int d = npos - pos; if (d<0) d=-d; |
---|
726 | if (d<=1) { // cannot optimize |
---|
727 | break; |
---|
728 | } |
---|
729 | |
---|
730 | } |
---|
731 | else { |
---|
732 | if ((slope > 0.0 && delta < 0.0) || (slope < 0.0 && delta > 0.0)) |
---|
733 | delta /= -2; |
---|
734 | if (tt + delta < 0 && tt <= epsilon) { |
---|
735 | break; |
---|
736 | } |
---|
737 | tt += delta; |
---|
738 | } |
---|
739 | } |
---|
740 | else { |
---|
741 | delta /= -2; |
---|
742 | tt += delta; |
---|
743 | if (tt < 0) tt = 0; |
---|
744 | } |
---|
745 | } while (iterations < 20); |
---|
746 | } |
---|
747 | else { // cat < kimura |
---|
748 | m = 0; |
---|
749 | n = 0; |
---|
750 | const ap_pro *seq1 = entries[i]->sequence_protein->get_sequence(); |
---|
751 | const ap_pro *seq2 = entries[j]->sequence_protein->get_sequence(); |
---|
752 | for (k = chars; k >0; k--) { |
---|
753 | b1 = *(seq1++); |
---|
754 | b2 = *(seq2++); |
---|
755 | if (b1 <= VAL && b2 <= VAL) { |
---|
756 | if (b1 == b2) m++; |
---|
757 | n++; |
---|
758 | } |
---|
759 | } |
---|
760 | if (n < 5) { // no info |
---|
761 | tt = -1.0; |
---|
762 | } |
---|
763 | else { |
---|
764 | switch (whichcat) { |
---|
765 | case KIMURA: |
---|
766 | { |
---|
767 | double rel = 1 - (double) m / n; |
---|
768 | double drel = 1.0 - rel - 0.2 * rel * rel; |
---|
769 | if (drel < 0.0) { |
---|
770 | aw_message(GBS_global_string("Warning: distance between sequences '%s' and '%s' is too large for kimura formula", entries[i]->name, entries[j]->name)); |
---|
771 | tt = -1.0; |
---|
772 | } |
---|
773 | else { |
---|
774 | tt = -log(drel); |
---|
775 | } |
---|
776 | } |
---|
777 | break; |
---|
778 | case NONE: |
---|
779 | tt = (n-m)/(double)n; |
---|
780 | break; |
---|
781 | case SIMILARITY: |
---|
782 | tt = m/(double)n; |
---|
783 | break; |
---|
784 | default: |
---|
785 | di_assert(0); |
---|
786 | break; |
---|
787 | } |
---|
788 | } |
---|
789 | } |
---|
790 | matrix->set(i, j, fracchange * tt); |
---|
791 | progress.inc_and_check_user_abort(error); |
---|
792 | } |
---|
793 | } |
---|
794 | if (aborted_flag && error) *aborted_flag = true; |
---|
795 | return error; |
---|
796 | } |
---|
797 | |
---|
798 | |
---|
799 | void di_protdist::clean_slopes() { |
---|
800 | for (int i=0; i<DI_RESOLUTION*DI_MAX_DIST; i++) { |
---|
801 | freenull(slopes[i]); |
---|
802 | freenull(curves[i]); |
---|
803 | freenull(infs[i]); |
---|
804 | } |
---|
805 | akt_slopes = 0; |
---|
806 | akt_curves = 0; |
---|
807 | akt_infs = 0; |
---|
808 | } |
---|
809 | |
---|
810 | di_protdist::~di_protdist() { |
---|
811 | clean_slopes(); |
---|
812 | } |
---|
813 | |
---|
814 | di_protdist::di_protdist(di_codetype codei, di_cattype cati, long nentries, DI_ENTRY **entriesi, long seq_len, AP_smatrix *matrixi) { |
---|
815 | memset((char *)this, 0, sizeof(di_protdist)); |
---|
816 | entries = entriesi; |
---|
817 | matrix = matrixi; |
---|
818 | freqa = .25; |
---|
819 | freqc = .25; |
---|
820 | freqg = .25; |
---|
821 | freqt = .25; |
---|
822 | ttratio = 2.0; |
---|
823 | ease = 0.457; |
---|
824 | spp = nentries; |
---|
825 | chars = seq_len; |
---|
826 | transition(); |
---|
827 | whichcode = codei; |
---|
828 | whichcat = cati; |
---|
829 | switch (cati) { |
---|
830 | case NONE: |
---|
831 | case SIMILARITY: |
---|
832 | case KIMURA: |
---|
833 | fracchange = 1.0; |
---|
834 | break; |
---|
835 | case PAM: |
---|
836 | code(); |
---|
837 | pameigen(); |
---|
838 | break; |
---|
839 | default: |
---|
840 | code(); |
---|
841 | maketrans(); |
---|
842 | qreigen(prob, 20L); |
---|
843 | break; |
---|
844 | } |
---|
845 | } |
---|