| 1 | #include "muscle.h" |
|---|
| 2 | #include "msa.h" |
|---|
| 3 | #include <math.h> |
|---|
| 4 | |
|---|
| 5 | // "Standard" NJ distance: the Kimura measure. |
|---|
| 6 | // This is defined to be: |
|---|
| 7 | // |
|---|
| 8 | // log_e(1 - p - p*p/5) |
|---|
| 9 | // |
|---|
| 10 | // where p is the fraction of residues that differ, i.e.: |
|---|
| 11 | // |
|---|
| 12 | // p = (1 - fractional_conservation) |
|---|
| 13 | // |
|---|
| 14 | // This measure is infinite for p = 0.8541 and is considered |
|---|
| 15 | // unreliable for p >= 0.75 (according to the ClustalW docs). |
|---|
| 16 | // ClustalW uses a table lookup for values > 0.75. |
|---|
| 17 | // The following table was copied from the ClustalW file dayhoff.h. |
|---|
| 18 | |
|---|
| 19 | static int dayhoff_pams[]={ |
|---|
| 20 | 195, /* 75.0% observed d; 195 PAMs estimated = 195% estimated d */ |
|---|
| 21 | 196, /* 75.1% observed d; 196 PAMs estimated */ |
|---|
| 22 | 197, 198, 199, 200, 200, 201, 202, 203, |
|---|
| 23 | 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, |
|---|
| 24 | 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, |
|---|
| 25 | 223, 224, 226, 227, 228, 229, 230, 231, 232, 233, |
|---|
| 26 | 234, 236, 237, 238, 239, 240, 241, 243, 244, 245, |
|---|
| 27 | 246, 248, 249, 250, /* 250 PAMs = 80.3% observed d */ |
|---|
| 28 | 252, 253, 254, 255, 257, 258, |
|---|
| 29 | 260, 261, 262, 264, 265, 267, 268, 270, 271, 273, |
|---|
| 30 | 274, 276, 277, 279, 281, 282, 284, 285, 287, 289, |
|---|
| 31 | 291, 292, 294, 296, 298, 299, 301, 303, 305, 307, |
|---|
| 32 | 309, 311, 313, 315, 317, 319, 321, 323, 325, 328, |
|---|
| 33 | 330, 332, 335, 337, 339, 342, 344, 347, 349, 352, |
|---|
| 34 | 354, 357, 360, 362, 365, 368, 371, 374, 377, 380, |
|---|
| 35 | 383, 386, 389, 393, 396, 399, 403, 407, 410, 414, |
|---|
| 36 | 418, 422, 426, 430, 434, 438, 442, 447, 451, 456, |
|---|
| 37 | 461, 466, 471, 476, 482, 487, 493, 498, 504, 511, |
|---|
| 38 | 517, 524, 531, 538, 545, 553, 560, 569, 577, 586, |
|---|
| 39 | 595, 605, 615, 626, 637, 649, 661, 675, 688, 703, |
|---|
| 40 | 719, 736, 754, 775, 796, 819, 845, 874, 907, 945, |
|---|
| 41 | /* 92.9% observed; 945 PAMs */ |
|---|
| 42 | 988 /* 93.0% observed; 988 PAMs */ |
|---|
| 43 | }; |
|---|
| 44 | static int iTableEntries = sizeof(dayhoff_pams)/sizeof(dayhoff_pams[0]); |
|---|
| 45 | |
|---|
| 46 | double KimuraDist(double dPctId) |
|---|
| 47 | { |
|---|
| 48 | double p = 1 - dPctId; |
|---|
| 49 | // Typical case: use Kimura's empirical formula |
|---|
| 50 | if (p < 0.75) |
|---|
| 51 | return -log(1 - p - (p*p)/5); |
|---|
| 52 | |
|---|
| 53 | // Per ClustalW, return 10.0 for anything over 93% |
|---|
| 54 | if (p > 0.93) |
|---|
| 55 | return 10.0; |
|---|
| 56 | |
|---|
| 57 | // If p >= 0.75, use table lookup |
|---|
| 58 | assert(p <= 1 && p >= 0.75); |
|---|
| 59 | // Thanks for Michael Hoel for pointing out a bug |
|---|
| 60 | // in the table index calculation in versions <= 3.52. |
|---|
| 61 | int iTableIndex = (int) ((p - 0.75)*1000 + 0.5); |
|---|
| 62 | if (iTableIndex < 0 || iTableIndex >= iTableEntries) |
|---|
| 63 | Quit("Internal error in MSADistKimura::ComputeDist"); |
|---|
| 64 | |
|---|
| 65 | return dayhoff_pams[iTableIndex] / 100.0; |
|---|
| 66 | } |
|---|
| 67 | |
|---|
| 68 | //double MSADistKimura::ComputeDist(const MSA &msa, unsigned uSeqIndex1, |
|---|
| 69 | // unsigned uSeqIndex2) |
|---|
| 70 | // { |
|---|
| 71 | // double dPctId = msa.GetPctIdentityPair(uSeqIndex1, uSeqIndex2); |
|---|
| 72 | // return KimuraDist(dPctId); |
|---|
| 73 | // } |
|---|
| 74 | |
|---|
| 75 | double KimuraDistToPctId(double dKimuraDist) |
|---|
| 76 | { |
|---|
| 77 | // Solve quadratic equation |
|---|
| 78 | const double a = 0.2; |
|---|
| 79 | const double b = 1; |
|---|
| 80 | const double c = 1.0 - exp(-dKimuraDist); |
|---|
| 81 | const double p = (-b + sqrt(b*b + 4*a*c))/(2*a); |
|---|
| 82 | return 1 - p; |
|---|
| 83 | } |
|---|
| 84 | |
|---|
| 85 | double PctIdToHeightKimura(double dPctId) |
|---|
| 86 | { |
|---|
| 87 | return KimuraDist(dPctId); |
|---|
| 88 | } |
|---|