1 | #include "muscle.h" |
---|
2 | #include "distfunc.h" |
---|
3 | #include "seqvect.h" |
---|
4 | #include <math.h> |
---|
5 | |
---|
6 | #define TRACE 0 |
---|
7 | |
---|
8 | #define MIN(x, y) (((x) < (y)) ? (x) : (y)) |
---|
9 | #define MAX(x, y) (((x) > (y)) ? (x) : (y)) |
---|
10 | |
---|
11 | const unsigned TUPLE_COUNT = 6*6*6*6*6*6; |
---|
12 | static unsigned char Count1[TUPLE_COUNT]; |
---|
13 | static unsigned char Count2[TUPLE_COUNT]; |
---|
14 | |
---|
15 | // Amino acid groups according to MAFFT (sextet5) |
---|
16 | // 0 = A G P S T |
---|
17 | // 1 = I L M V |
---|
18 | // 2 = N D Q E B Z |
---|
19 | // 3 = R H K |
---|
20 | // 4 = F W Y |
---|
21 | // 5 = C |
---|
22 | // 6 = X . - U |
---|
23 | unsigned ResidueGroup[] = |
---|
24 | { |
---|
25 | 0, // AX_A, |
---|
26 | 5, // AX_C, |
---|
27 | 2, // AX_D, |
---|
28 | 2, // AX_E, |
---|
29 | 4, // AX_F, |
---|
30 | 0, // AX_G, |
---|
31 | 3, // AX_H, |
---|
32 | 1, // AX_I, |
---|
33 | 3, // AX_K, |
---|
34 | 1, // AX_L, |
---|
35 | 1, // AX_M, |
---|
36 | 2, // AX_N, |
---|
37 | 0, // AX_P, |
---|
38 | 2, // AX_Q, |
---|
39 | 3, // AX_R, |
---|
40 | 0, // AX_S, |
---|
41 | 0, // AX_T, |
---|
42 | 1, // AX_V, |
---|
43 | 4, // AX_W, |
---|
44 | 4, // AX_Y, |
---|
45 | |
---|
46 | 2, // AX_B, // D or N |
---|
47 | 2, // AX_Z, // E or Q |
---|
48 | 0, // AX_X, // Unknown // ******** TODO ************* |
---|
49 | // This isn't the correct way of avoiding group 6 |
---|
50 | 0 // AX_GAP, // ******** TODO ****************** |
---|
51 | }; |
---|
52 | unsigned uResidueGroupCount = sizeof(ResidueGroup)/sizeof(ResidueGroup[0]); |
---|
53 | |
---|
54 | static char *TupleToStr(int t) |
---|
55 | { |
---|
56 | static char s[7]; |
---|
57 | int t1, t2, t3, t4, t5, t6; |
---|
58 | |
---|
59 | t1 = t%6; |
---|
60 | t2 = (t/6)%6; |
---|
61 | t3 = (t/(6*6))%6; |
---|
62 | t4 = (t/(6*6*6))%6; |
---|
63 | t5 = (t/(6*6*6*6))%6; |
---|
64 | t6 = (t/(6*6*6*6*6))%6; |
---|
65 | |
---|
66 | s[5] = '0' + t1; |
---|
67 | s[4] = '0' + t2; |
---|
68 | s[3] = '0' + t3; |
---|
69 | s[2] = '0' + t4; |
---|
70 | s[1] = '0' + t5; |
---|
71 | s[0] = '0' + t6; |
---|
72 | return s; |
---|
73 | } |
---|
74 | |
---|
75 | static unsigned GetTuple(const unsigned uLetters[], unsigned n) |
---|
76 | { |
---|
77 | assert(uLetters[n] < uResidueGroupCount); |
---|
78 | assert(uLetters[n+1] < uResidueGroupCount); |
---|
79 | assert(uLetters[n+2] < uResidueGroupCount); |
---|
80 | assert(uLetters[n+3] < uResidueGroupCount); |
---|
81 | assert(uLetters[n+4] < uResidueGroupCount); |
---|
82 | assert(uLetters[n+5] < uResidueGroupCount); |
---|
83 | |
---|
84 | unsigned u1 = ResidueGroup[uLetters[n]]; |
---|
85 | unsigned u2 = ResidueGroup[uLetters[n+1]]; |
---|
86 | unsigned u3 = ResidueGroup[uLetters[n+2]]; |
---|
87 | unsigned u4 = ResidueGroup[uLetters[n+3]]; |
---|
88 | unsigned u5 = ResidueGroup[uLetters[n+4]]; |
---|
89 | unsigned u6 = ResidueGroup[uLetters[n+5]]; |
---|
90 | |
---|
91 | return u6 + u5*6 + u4*6*6 + u3*6*6*6 + u2*6*6*6*6 + u1*6*6*6*6*6; |
---|
92 | } |
---|
93 | |
---|
94 | static void CountTuples(const unsigned L[], unsigned uTupleCount, unsigned char Count[]) |
---|
95 | { |
---|
96 | memset(Count, 0, TUPLE_COUNT*sizeof(unsigned char)); |
---|
97 | for (unsigned n = 0; n < uTupleCount; ++n) |
---|
98 | { |
---|
99 | const unsigned uTuple = GetTuple(L, n); |
---|
100 | ++(Count[uTuple]); |
---|
101 | } |
---|
102 | } |
---|
103 | |
---|
104 | static void ListCount(const unsigned char Count[]) |
---|
105 | { |
---|
106 | for (unsigned n = 0; n < TUPLE_COUNT; ++n) |
---|
107 | { |
---|
108 | if (0 == Count[n]) |
---|
109 | continue; |
---|
110 | Log("%s %u\n", TupleToStr(n), Count[n]); |
---|
111 | } |
---|
112 | } |
---|
113 | |
---|
114 | void DistKmer6_6(const SeqVect &v, DistFunc &DF) |
---|
115 | { |
---|
116 | const unsigned uSeqCount = v.Length(); |
---|
117 | |
---|
118 | DF.SetCount(uSeqCount); |
---|
119 | if (0 == uSeqCount) |
---|
120 | return; |
---|
121 | |
---|
122 | // Initialize distance matrix to zero |
---|
123 | for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) |
---|
124 | { |
---|
125 | DF.SetDist(uSeq1, uSeq1, 0); |
---|
126 | for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) |
---|
127 | DF.SetDist(uSeq1, uSeq2, 0); |
---|
128 | } |
---|
129 | |
---|
130 | // Convert to letters |
---|
131 | unsigned **Letters = new unsigned *[uSeqCount]; |
---|
132 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
---|
133 | { |
---|
134 | Seq &s = *(v[uSeqIndex]); |
---|
135 | const unsigned uSeqLength = s.Length(); |
---|
136 | unsigned *L = new unsigned[uSeqLength]; |
---|
137 | Letters[uSeqIndex] = L; |
---|
138 | for (unsigned n = 0; n < uSeqLength; ++n) |
---|
139 | { |
---|
140 | char c = s[n]; |
---|
141 | L[n] = CharToLetterEx(c); |
---|
142 | assert(L[n] < uResidueGroupCount); |
---|
143 | } |
---|
144 | } |
---|
145 | |
---|
146 | unsigned **uCommonTupleCount = new unsigned *[uSeqCount]; |
---|
147 | for (unsigned n = 0; n < uSeqCount; ++n) |
---|
148 | { |
---|
149 | uCommonTupleCount[n] = new unsigned[uSeqCount]; |
---|
150 | memset(uCommonTupleCount[n], 0, uSeqCount*sizeof(unsigned)); |
---|
151 | } |
---|
152 | |
---|
153 | const unsigned uPairCount = (uSeqCount*(uSeqCount + 1))/2; |
---|
154 | unsigned uCount = 0; |
---|
155 | for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) |
---|
156 | { |
---|
157 | Seq &seq1 = *(v[uSeq1]); |
---|
158 | const unsigned uSeqLength1 = seq1.Length(); |
---|
159 | if (uSeqLength1 < 5) |
---|
160 | continue; |
---|
161 | |
---|
162 | const unsigned uTupleCount = uSeqLength1 - 5; |
---|
163 | const unsigned *L = Letters[uSeq1]; |
---|
164 | CountTuples(L, uTupleCount, Count1); |
---|
165 | #if TRACE |
---|
166 | { |
---|
167 | Log("Seq1=%d\n", uSeq1); |
---|
168 | Log("Groups:\n"); |
---|
169 | for (unsigned n = 0; n < uSeqLength1; ++n) |
---|
170 | Log("%u", ResidueGroup[L[n]]); |
---|
171 | Log("\n"); |
---|
172 | |
---|
173 | Log("Tuples:\n"); |
---|
174 | ListCount(Count1); |
---|
175 | } |
---|
176 | #endif |
---|
177 | |
---|
178 | SetProgressDesc("K-mer dist pass 1"); |
---|
179 | for (unsigned uSeq2 = 0; uSeq2 <= uSeq1; ++uSeq2) |
---|
180 | { |
---|
181 | if (0 == uCount%500) |
---|
182 | Progress(uCount, uPairCount); |
---|
183 | ++uCount; |
---|
184 | Seq &seq2 = *(v[uSeq2]); |
---|
185 | const unsigned uSeqLength2 = seq2.Length(); |
---|
186 | if (uSeqLength2 < 5) |
---|
187 | { |
---|
188 | if (uSeq1 == uSeq2) |
---|
189 | DF.SetDist(uSeq1, uSeq2, 0); |
---|
190 | else |
---|
191 | DF.SetDist(uSeq1, uSeq2, 1); |
---|
192 | continue; |
---|
193 | } |
---|
194 | |
---|
195 | // First pass through seq 2 to count tuples |
---|
196 | const unsigned uTupleCount = uSeqLength2 - 5; |
---|
197 | const unsigned *L = Letters[uSeq2]; |
---|
198 | CountTuples(L, uTupleCount, Count2); |
---|
199 | #if TRACE |
---|
200 | Log("Seq2=%d Counts=\n", uSeq2); |
---|
201 | ListCount(Count2); |
---|
202 | #endif |
---|
203 | |
---|
204 | // Second pass to accumulate sum of shared tuples |
---|
205 | // MAFFT defines this as the sum over unique tuples |
---|
206 | // in seq2 of the minimum of the number of tuples found |
---|
207 | // in the two sequences. |
---|
208 | unsigned uSum = 0; |
---|
209 | for (unsigned n = 0; n < uTupleCount; ++n) |
---|
210 | { |
---|
211 | const unsigned uTuple = GetTuple(L, n); |
---|
212 | uSum += MIN(Count1[uTuple], Count2[uTuple]); |
---|
213 | |
---|
214 | // This is a hack to make sure each unique tuple counted only once. |
---|
215 | Count2[uTuple] = 0; |
---|
216 | } |
---|
217 | #if TRACE |
---|
218 | { |
---|
219 | Seq &s1 = *(v[uSeq1]); |
---|
220 | Seq &s2 = *(v[uSeq2]); |
---|
221 | const char *pName1 = s1.GetName(); |
---|
222 | const char *pName2 = s2.GetName(); |
---|
223 | Log("Common count %s(%d) - %s(%d) =%u\n", |
---|
224 | pName1, uSeq1, pName2, uSeq2, uSum); |
---|
225 | } |
---|
226 | #endif |
---|
227 | uCommonTupleCount[uSeq1][uSeq2] = uSum; |
---|
228 | uCommonTupleCount[uSeq2][uSeq1] = uSum; |
---|
229 | } |
---|
230 | } |
---|
231 | ProgressStepsDone(); |
---|
232 | |
---|
233 | uCount = 0; |
---|
234 | SetProgressDesc("K-mer dist pass 2"); |
---|
235 | for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) |
---|
236 | { |
---|
237 | Seq &s1 = *(v[uSeq1]); |
---|
238 | const char *pName1 = s1.GetName(); |
---|
239 | |
---|
240 | double dCommonTupleCount11 = uCommonTupleCount[uSeq1][uSeq1]; |
---|
241 | if (0 == dCommonTupleCount11) |
---|
242 | dCommonTupleCount11 = 1; |
---|
243 | |
---|
244 | DF.SetDist(uSeq1, uSeq1, 0); |
---|
245 | for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) |
---|
246 | { |
---|
247 | if (0 == uCount%500) |
---|
248 | Progress(uCount, uPairCount); |
---|
249 | ++uCount; |
---|
250 | |
---|
251 | double dCommonTupleCount22 = uCommonTupleCount[uSeq2][uSeq2]; |
---|
252 | if (0 == dCommonTupleCount22) |
---|
253 | dCommonTupleCount22 = 1; |
---|
254 | |
---|
255 | const double dDist1 = 3.0*(dCommonTupleCount11 - uCommonTupleCount[uSeq1][uSeq2]) |
---|
256 | /dCommonTupleCount11; |
---|
257 | const double dDist2 = 3.0*(dCommonTupleCount22 - uCommonTupleCount[uSeq1][uSeq2]) |
---|
258 | /dCommonTupleCount22; |
---|
259 | |
---|
260 | // dMinDist is the value used for tree-building in MAFFT |
---|
261 | const double dMinDist = MIN(dDist1, dDist2); |
---|
262 | DF.SetDist(uSeq1, uSeq2, (float) dMinDist); |
---|
263 | |
---|
264 | //const double dEstimatedPctId = TupleDistToEstimatedPctId(dMinDist); |
---|
265 | //g_dfPwId.SetDist(uSeq1, uSeq2, dEstimatedPctId); |
---|
266 | // **** TODO **** why does this make score slightly worse?? |
---|
267 | //const double dKimuraDist = KimuraDist(dEstimatedPctId); |
---|
268 | //DF.SetDist(uSeq1, uSeq2, dKimuraDist); |
---|
269 | } |
---|
270 | } |
---|
271 | ProgressStepsDone(); |
---|
272 | |
---|
273 | for (unsigned n = 0; n < uSeqCount; ++n) |
---|
274 | delete[] uCommonTupleCount[n]; |
---|
275 | delete[] uCommonTupleCount; |
---|
276 | delete[] Letters; |
---|
277 | } |
---|
278 | |
---|
279 | double PctIdToMAFFTDist(double dPctId) |
---|
280 | { |
---|
281 | if (dPctId < 0.05) |
---|
282 | dPctId = 0.05; |
---|
283 | double dDist = -log(dPctId); |
---|
284 | return dDist; |
---|
285 | } |
---|
286 | |
---|
287 | double PctIdToHeightMAFFT(double dPctId) |
---|
288 | { |
---|
289 | return PctIdToMAFFTDist(dPctId); |
---|
290 | } |
---|