| 1 | #include "muscle.h" |
|---|
| 2 | #include "distfunc.h" |
|---|
| 3 | #include "seqvect.h" |
|---|
| 4 | #include <math.h> |
|---|
| 5 | |
|---|
| 6 | #define TRACE 0 |
|---|
| 7 | |
|---|
| 8 | #define MIN(x, y) (((x) < (y)) ? (x) : (y)) |
|---|
| 9 | #define MAX(x, y) (((x) > (y)) ? (x) : (y)) |
|---|
| 10 | |
|---|
| 11 | const unsigned TUPLE_COUNT = 6*6*6*6*6*6; |
|---|
| 12 | static unsigned char Count1[TUPLE_COUNT]; |
|---|
| 13 | static unsigned char Count2[TUPLE_COUNT]; |
|---|
| 14 | |
|---|
| 15 | // Nucleotide groups according to MAFFT (sextet5) |
|---|
| 16 | // 0 = A |
|---|
| 17 | // 1 = C |
|---|
| 18 | // 2 = G |
|---|
| 19 | // 3 = T |
|---|
| 20 | // 4 = other |
|---|
| 21 | |
|---|
| 22 | static unsigned ResidueGroup[] = |
|---|
| 23 | { |
|---|
| 24 | 0, // NX_A, |
|---|
| 25 | 1, // NX_C, |
|---|
| 26 | 2, // NX_G, |
|---|
| 27 | 3, // NX_T/U |
|---|
| 28 | 4, // NX_N, |
|---|
| 29 | 4, // NX_R, |
|---|
| 30 | 4, // NX_Y, |
|---|
| 31 | 4, // NX_GAP |
|---|
| 32 | }; |
|---|
| 33 | static unsigned uResidueGroupCount = sizeof(ResidueGroup)/sizeof(ResidueGroup[0]); |
|---|
| 34 | |
|---|
| 35 | static char *TupleToStr(int t) |
|---|
| 36 | { |
|---|
| 37 | static char s[7]; |
|---|
| 38 | int t1, t2, t3, t4, t5, t6; |
|---|
| 39 | |
|---|
| 40 | t1 = t%6; |
|---|
| 41 | t2 = (t/6)%6; |
|---|
| 42 | t3 = (t/(6*6))%6; |
|---|
| 43 | t4 = (t/(6*6*6))%6; |
|---|
| 44 | t5 = (t/(6*6*6*6))%6; |
|---|
| 45 | t6 = (t/(6*6*6*6*6))%6; |
|---|
| 46 | |
|---|
| 47 | s[5] = '0' + t1; |
|---|
| 48 | s[4] = '0' + t2; |
|---|
| 49 | s[3] = '0' + t3; |
|---|
| 50 | s[2] = '0' + t4; |
|---|
| 51 | s[1] = '0' + t5; |
|---|
| 52 | s[0] = '0' + t6; |
|---|
| 53 | return s; |
|---|
| 54 | } |
|---|
| 55 | |
|---|
| 56 | static unsigned GetTuple(const unsigned uLetters[], unsigned n) |
|---|
| 57 | { |
|---|
| 58 | assert(uLetters[n] < uResidueGroupCount); |
|---|
| 59 | assert(uLetters[n+1] < uResidueGroupCount); |
|---|
| 60 | assert(uLetters[n+2] < uResidueGroupCount); |
|---|
| 61 | assert(uLetters[n+3] < uResidueGroupCount); |
|---|
| 62 | assert(uLetters[n+4] < uResidueGroupCount); |
|---|
| 63 | assert(uLetters[n+5] < uResidueGroupCount); |
|---|
| 64 | |
|---|
| 65 | unsigned u1 = ResidueGroup[uLetters[n]]; |
|---|
| 66 | unsigned u2 = ResidueGroup[uLetters[n+1]]; |
|---|
| 67 | unsigned u3 = ResidueGroup[uLetters[n+2]]; |
|---|
| 68 | unsigned u4 = ResidueGroup[uLetters[n+3]]; |
|---|
| 69 | unsigned u5 = ResidueGroup[uLetters[n+4]]; |
|---|
| 70 | unsigned u6 = ResidueGroup[uLetters[n+5]]; |
|---|
| 71 | |
|---|
| 72 | return u6 + u5*6 + u4*6*6 + u3*6*6*6 + u2*6*6*6*6 + u1*6*6*6*6*6; |
|---|
| 73 | } |
|---|
| 74 | |
|---|
| 75 | static void CountTuples(const unsigned L[], unsigned uTupleCount, unsigned char Count[]) |
|---|
| 76 | { |
|---|
| 77 | memset(Count, 0, TUPLE_COUNT*sizeof(unsigned char)); |
|---|
| 78 | for (unsigned n = 0; n < uTupleCount; ++n) |
|---|
| 79 | { |
|---|
| 80 | const unsigned uTuple = GetTuple(L, n); |
|---|
| 81 | ++(Count[uTuple]); |
|---|
| 82 | } |
|---|
| 83 | } |
|---|
| 84 | |
|---|
| 85 | static void ListCount(const unsigned char Count[]) |
|---|
| 86 | { |
|---|
| 87 | for (unsigned n = 0; n < TUPLE_COUNT; ++n) |
|---|
| 88 | { |
|---|
| 89 | if (0 == Count[n]) |
|---|
| 90 | continue; |
|---|
| 91 | Log("%s %u\n", TupleToStr(n), Count[n]); |
|---|
| 92 | } |
|---|
| 93 | } |
|---|
| 94 | |
|---|
| 95 | void DistKmer4_6(const SeqVect &v, DistFunc &DF) |
|---|
| 96 | { |
|---|
| 97 | if (ALPHA_DNA != g_Alpha && ALPHA_RNA != g_Alpha) |
|---|
| 98 | Quit("DistKmer4_6 requires nucleo alphabet"); |
|---|
| 99 | |
|---|
| 100 | const unsigned uSeqCount = v.Length(); |
|---|
| 101 | |
|---|
| 102 | DF.SetCount(uSeqCount); |
|---|
| 103 | if (0 == uSeqCount) |
|---|
| 104 | return; |
|---|
| 105 | |
|---|
| 106 | // Initialize distance matrix to zero |
|---|
| 107 | for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) |
|---|
| 108 | { |
|---|
| 109 | DF.SetDist(uSeq1, uSeq1, 0); |
|---|
| 110 | for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) |
|---|
| 111 | DF.SetDist(uSeq1, uSeq2, 0); |
|---|
| 112 | } |
|---|
| 113 | |
|---|
| 114 | // Convert to letters |
|---|
| 115 | unsigned **Letters = new unsigned *[uSeqCount]; |
|---|
| 116 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
|---|
| 117 | { |
|---|
| 118 | Seq &s = *(v[uSeqIndex]); |
|---|
| 119 | const unsigned uSeqLength = s.Length(); |
|---|
| 120 | unsigned *L = new unsigned[uSeqLength]; |
|---|
| 121 | Letters[uSeqIndex] = L; |
|---|
| 122 | for (unsigned n = 0; n < uSeqLength; ++n) |
|---|
| 123 | { |
|---|
| 124 | char c = s[n]; |
|---|
| 125 | L[n] = CharToLetterEx(c); |
|---|
| 126 | if (L[n] >= 4) |
|---|
| 127 | L[n] = 4; |
|---|
| 128 | } |
|---|
| 129 | } |
|---|
| 130 | |
|---|
| 131 | unsigned **uCommonTupleCount = new unsigned *[uSeqCount]; |
|---|
| 132 | for (unsigned n = 0; n < uSeqCount; ++n) |
|---|
| 133 | { |
|---|
| 134 | uCommonTupleCount[n] = new unsigned[uSeqCount]; |
|---|
| 135 | memset(uCommonTupleCount[n], 0, uSeqCount*sizeof(unsigned)); |
|---|
| 136 | } |
|---|
| 137 | |
|---|
| 138 | const unsigned uPairCount = (uSeqCount*(uSeqCount + 1))/2; |
|---|
| 139 | unsigned uCount = 0; |
|---|
| 140 | for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) |
|---|
| 141 | { |
|---|
| 142 | Seq &seq1 = *(v[uSeq1]); |
|---|
| 143 | const unsigned uSeqLength1 = seq1.Length(); |
|---|
| 144 | if (uSeqLength1 < 5) |
|---|
| 145 | continue; |
|---|
| 146 | |
|---|
| 147 | const unsigned uTupleCount = uSeqLength1 - 5; |
|---|
| 148 | const unsigned *L = Letters[uSeq1]; |
|---|
| 149 | CountTuples(L, uTupleCount, Count1); |
|---|
| 150 | #if TRACE |
|---|
| 151 | { |
|---|
| 152 | Log("Seq1=%d\n", uSeq1); |
|---|
| 153 | Log("Groups:\n"); |
|---|
| 154 | for (unsigned n = 0; n < uSeqLength1; ++n) |
|---|
| 155 | Log("%u", ResidueGroup[L[n]]); |
|---|
| 156 | Log("\n"); |
|---|
| 157 | |
|---|
| 158 | Log("Tuples:\n"); |
|---|
| 159 | ListCount(Count1); |
|---|
| 160 | } |
|---|
| 161 | #endif |
|---|
| 162 | |
|---|
| 163 | SetProgressDesc("K-mer dist pass 1"); |
|---|
| 164 | for (unsigned uSeq2 = 0; uSeq2 <= uSeq1; ++uSeq2) |
|---|
| 165 | { |
|---|
| 166 | if (0 == uCount%500) |
|---|
| 167 | Progress(uCount, uPairCount); |
|---|
| 168 | ++uCount; |
|---|
| 169 | Seq &seq2 = *(v[uSeq2]); |
|---|
| 170 | const unsigned uSeqLength2 = seq2.Length(); |
|---|
| 171 | if (uSeqLength2 < 5) |
|---|
| 172 | { |
|---|
| 173 | if (uSeq1 == uSeq2) |
|---|
| 174 | DF.SetDist(uSeq1, uSeq2, 0); |
|---|
| 175 | else |
|---|
| 176 | DF.SetDist(uSeq1, uSeq2, 1); |
|---|
| 177 | continue; |
|---|
| 178 | } |
|---|
| 179 | |
|---|
| 180 | // First pass through seq 2 to count tuples |
|---|
| 181 | const unsigned uTupleCount = uSeqLength2 - 5; |
|---|
| 182 | const unsigned *L = Letters[uSeq2]; |
|---|
| 183 | CountTuples(L, uTupleCount, Count2); |
|---|
| 184 | #if TRACE |
|---|
| 185 | Log("Seq2=%d Counts=\n", uSeq2); |
|---|
| 186 | ListCount(Count2); |
|---|
| 187 | #endif |
|---|
| 188 | |
|---|
| 189 | // Second pass to accumulate sum of shared tuples |
|---|
| 190 | // MAFFT defines this as the sum over unique tuples |
|---|
| 191 | // in seq2 of the minimum of the number of tuples found |
|---|
| 192 | // in the two sequences. |
|---|
| 193 | unsigned uSum = 0; |
|---|
| 194 | for (unsigned n = 0; n < uTupleCount; ++n) |
|---|
| 195 | { |
|---|
| 196 | const unsigned uTuple = GetTuple(L, n); |
|---|
| 197 | uSum += MIN(Count1[uTuple], Count2[uTuple]); |
|---|
| 198 | |
|---|
| 199 | // This is a hack to make sure each unique tuple counted only once. |
|---|
| 200 | Count2[uTuple] = 0; |
|---|
| 201 | } |
|---|
| 202 | #if TRACE |
|---|
| 203 | { |
|---|
| 204 | Seq &s1 = *(v[uSeq1]); |
|---|
| 205 | Seq &s2 = *(v[uSeq2]); |
|---|
| 206 | const char *pName1 = s1.GetName(); |
|---|
| 207 | const char *pName2 = s2.GetName(); |
|---|
| 208 | Log("Common count %s(%d) - %s(%d) =%u\n", |
|---|
| 209 | pName1, uSeq1, pName2, uSeq2, uSum); |
|---|
| 210 | } |
|---|
| 211 | #endif |
|---|
| 212 | uCommonTupleCount[uSeq1][uSeq2] = uSum; |
|---|
| 213 | uCommonTupleCount[uSeq2][uSeq1] = uSum; |
|---|
| 214 | } |
|---|
| 215 | } |
|---|
| 216 | ProgressStepsDone(); |
|---|
| 217 | |
|---|
| 218 | uCount = 0; |
|---|
| 219 | SetProgressDesc("K-mer dist pass 2"); |
|---|
| 220 | for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) |
|---|
| 221 | { |
|---|
| 222 | Seq &s1 = *(v[uSeq1]); |
|---|
| 223 | const char *pName1 = s1.GetName(); |
|---|
| 224 | |
|---|
| 225 | double dCommonTupleCount11 = uCommonTupleCount[uSeq1][uSeq1]; |
|---|
| 226 | if (0 == dCommonTupleCount11) |
|---|
| 227 | dCommonTupleCount11 = 1; |
|---|
| 228 | |
|---|
| 229 | DF.SetDist(uSeq1, uSeq1, 0); |
|---|
| 230 | for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) |
|---|
| 231 | { |
|---|
| 232 | if (0 == uCount%500) |
|---|
| 233 | Progress(uCount, uPairCount); |
|---|
| 234 | ++uCount; |
|---|
| 235 | |
|---|
| 236 | double dCommonTupleCount22 = uCommonTupleCount[uSeq2][uSeq2]; |
|---|
| 237 | if (0 == dCommonTupleCount22) |
|---|
| 238 | dCommonTupleCount22 = 1; |
|---|
| 239 | |
|---|
| 240 | const double dDist1 = 3.0*(dCommonTupleCount11 - uCommonTupleCount[uSeq1][uSeq2]) |
|---|
| 241 | /dCommonTupleCount11; |
|---|
| 242 | const double dDist2 = 3.0*(dCommonTupleCount22 - uCommonTupleCount[uSeq1][uSeq2]) |
|---|
| 243 | /dCommonTupleCount22; |
|---|
| 244 | |
|---|
| 245 | // dMinDist is the value used for tree-building in MAFFT |
|---|
| 246 | const double dMinDist = MIN(dDist1, dDist2); |
|---|
| 247 | DF.SetDist(uSeq1, uSeq2, (float) dMinDist); |
|---|
| 248 | |
|---|
| 249 | //const double dEstimatedPctId = TupleDistToEstimatedPctId(dMinDist); |
|---|
| 250 | //g_dfPwId.SetDist(uSeq1, uSeq2, dEstimatedPctId); |
|---|
| 251 | // **** TODO **** why does this make score slightly worse?? |
|---|
| 252 | //const double dKimuraDist = KimuraDist(dEstimatedPctId); |
|---|
| 253 | //DF.SetDist(uSeq1, uSeq2, dKimuraDist); |
|---|
| 254 | } |
|---|
| 255 | } |
|---|
| 256 | ProgressStepsDone(); |
|---|
| 257 | |
|---|
| 258 | for (unsigned n = 0; n < uSeqCount; ++n) |
|---|
| 259 | { |
|---|
| 260 | delete[] uCommonTupleCount[n]; |
|---|
| 261 | delete[] Letters[n]; |
|---|
| 262 | } |
|---|
| 263 | delete[] uCommonTupleCount; |
|---|
| 264 | delete[] Letters; |
|---|
| 265 | } |
|---|