1 | #include "muscle.h" |
---|
2 | #include "distfunc.h" |
---|
3 | #include "seqvect.h" |
---|
4 | #include <math.h> |
---|
5 | |
---|
6 | #define TRACE 0 |
---|
7 | |
---|
8 | #define MIN(x, y) (((x) < (y)) ? (x) : (y)) |
---|
9 | #define MAX(x, y) (((x) > (y)) ? (x) : (y)) |
---|
10 | |
---|
11 | const unsigned TUPLE_COUNT = 6*6*6*6*6*6; |
---|
12 | static unsigned char Count1[TUPLE_COUNT]; |
---|
13 | static unsigned char Count2[TUPLE_COUNT]; |
---|
14 | |
---|
15 | // Nucleotide groups according to MAFFT (sextet5) |
---|
16 | // 0 = A |
---|
17 | // 1 = C |
---|
18 | // 2 = G |
---|
19 | // 3 = T |
---|
20 | // 4 = other |
---|
21 | |
---|
22 | static unsigned ResidueGroup[] = |
---|
23 | { |
---|
24 | 0, // NX_A, |
---|
25 | 1, // NX_C, |
---|
26 | 2, // NX_G, |
---|
27 | 3, // NX_T/U |
---|
28 | 4, // NX_N, |
---|
29 | 4, // NX_R, |
---|
30 | 4, // NX_Y, |
---|
31 | 4, // NX_GAP |
---|
32 | }; |
---|
33 | static unsigned uResidueGroupCount = sizeof(ResidueGroup)/sizeof(ResidueGroup[0]); |
---|
34 | |
---|
35 | static char *TupleToStr(int t) |
---|
36 | { |
---|
37 | static char s[7]; |
---|
38 | int t1, t2, t3, t4, t5, t6; |
---|
39 | |
---|
40 | t1 = t%6; |
---|
41 | t2 = (t/6)%6; |
---|
42 | t3 = (t/(6*6))%6; |
---|
43 | t4 = (t/(6*6*6))%6; |
---|
44 | t5 = (t/(6*6*6*6))%6; |
---|
45 | t6 = (t/(6*6*6*6*6))%6; |
---|
46 | |
---|
47 | s[5] = '0' + t1; |
---|
48 | s[4] = '0' + t2; |
---|
49 | s[3] = '0' + t3; |
---|
50 | s[2] = '0' + t4; |
---|
51 | s[1] = '0' + t5; |
---|
52 | s[0] = '0' + t6; |
---|
53 | return s; |
---|
54 | } |
---|
55 | |
---|
56 | static unsigned GetTuple(const unsigned uLetters[], unsigned n) |
---|
57 | { |
---|
58 | assert(uLetters[n] < uResidueGroupCount); |
---|
59 | assert(uLetters[n+1] < uResidueGroupCount); |
---|
60 | assert(uLetters[n+2] < uResidueGroupCount); |
---|
61 | assert(uLetters[n+3] < uResidueGroupCount); |
---|
62 | assert(uLetters[n+4] < uResidueGroupCount); |
---|
63 | assert(uLetters[n+5] < uResidueGroupCount); |
---|
64 | |
---|
65 | unsigned u1 = ResidueGroup[uLetters[n]]; |
---|
66 | unsigned u2 = ResidueGroup[uLetters[n+1]]; |
---|
67 | unsigned u3 = ResidueGroup[uLetters[n+2]]; |
---|
68 | unsigned u4 = ResidueGroup[uLetters[n+3]]; |
---|
69 | unsigned u5 = ResidueGroup[uLetters[n+4]]; |
---|
70 | unsigned u6 = ResidueGroup[uLetters[n+5]]; |
---|
71 | |
---|
72 | return u6 + u5*6 + u4*6*6 + u3*6*6*6 + u2*6*6*6*6 + u1*6*6*6*6*6; |
---|
73 | } |
---|
74 | |
---|
75 | static void CountTuples(const unsigned L[], unsigned uTupleCount, unsigned char Count[]) |
---|
76 | { |
---|
77 | memset(Count, 0, TUPLE_COUNT*sizeof(unsigned char)); |
---|
78 | for (unsigned n = 0; n < uTupleCount; ++n) |
---|
79 | { |
---|
80 | const unsigned uTuple = GetTuple(L, n); |
---|
81 | ++(Count[uTuple]); |
---|
82 | } |
---|
83 | } |
---|
84 | |
---|
85 | static void ListCount(const unsigned char Count[]) |
---|
86 | { |
---|
87 | for (unsigned n = 0; n < TUPLE_COUNT; ++n) |
---|
88 | { |
---|
89 | if (0 == Count[n]) |
---|
90 | continue; |
---|
91 | Log("%s %u\n", TupleToStr(n), Count[n]); |
---|
92 | } |
---|
93 | } |
---|
94 | |
---|
95 | void DistKmer4_6(const SeqVect &v, DistFunc &DF) |
---|
96 | { |
---|
97 | if (ALPHA_DNA != g_Alpha && ALPHA_RNA != g_Alpha) |
---|
98 | Quit("DistKmer4_6 requires nucleo alphabet"); |
---|
99 | |
---|
100 | const unsigned uSeqCount = v.Length(); |
---|
101 | |
---|
102 | DF.SetCount(uSeqCount); |
---|
103 | if (0 == uSeqCount) |
---|
104 | return; |
---|
105 | |
---|
106 | // Initialize distance matrix to zero |
---|
107 | for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) |
---|
108 | { |
---|
109 | DF.SetDist(uSeq1, uSeq1, 0); |
---|
110 | for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) |
---|
111 | DF.SetDist(uSeq1, uSeq2, 0); |
---|
112 | } |
---|
113 | |
---|
114 | // Convert to letters |
---|
115 | unsigned **Letters = new unsigned *[uSeqCount]; |
---|
116 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
---|
117 | { |
---|
118 | Seq &s = *(v[uSeqIndex]); |
---|
119 | const unsigned uSeqLength = s.Length(); |
---|
120 | unsigned *L = new unsigned[uSeqLength]; |
---|
121 | Letters[uSeqIndex] = L; |
---|
122 | for (unsigned n = 0; n < uSeqLength; ++n) |
---|
123 | { |
---|
124 | char c = s[n]; |
---|
125 | L[n] = CharToLetterEx(c); |
---|
126 | if (L[n] >= 4) |
---|
127 | L[n] = 4; |
---|
128 | } |
---|
129 | } |
---|
130 | |
---|
131 | unsigned **uCommonTupleCount = new unsigned *[uSeqCount]; |
---|
132 | for (unsigned n = 0; n < uSeqCount; ++n) |
---|
133 | { |
---|
134 | uCommonTupleCount[n] = new unsigned[uSeqCount]; |
---|
135 | memset(uCommonTupleCount[n], 0, uSeqCount*sizeof(unsigned)); |
---|
136 | } |
---|
137 | |
---|
138 | const unsigned uPairCount = (uSeqCount*(uSeqCount + 1))/2; |
---|
139 | unsigned uCount = 0; |
---|
140 | for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) |
---|
141 | { |
---|
142 | Seq &seq1 = *(v[uSeq1]); |
---|
143 | const unsigned uSeqLength1 = seq1.Length(); |
---|
144 | if (uSeqLength1 < 5) |
---|
145 | continue; |
---|
146 | |
---|
147 | const unsigned uTupleCount = uSeqLength1 - 5; |
---|
148 | const unsigned *L = Letters[uSeq1]; |
---|
149 | CountTuples(L, uTupleCount, Count1); |
---|
150 | #if TRACE |
---|
151 | { |
---|
152 | Log("Seq1=%d\n", uSeq1); |
---|
153 | Log("Groups:\n"); |
---|
154 | for (unsigned n = 0; n < uSeqLength1; ++n) |
---|
155 | Log("%u", ResidueGroup[L[n]]); |
---|
156 | Log("\n"); |
---|
157 | |
---|
158 | Log("Tuples:\n"); |
---|
159 | ListCount(Count1); |
---|
160 | } |
---|
161 | #endif |
---|
162 | |
---|
163 | SetProgressDesc("K-mer dist pass 1"); |
---|
164 | for (unsigned uSeq2 = 0; uSeq2 <= uSeq1; ++uSeq2) |
---|
165 | { |
---|
166 | if (0 == uCount%500) |
---|
167 | Progress(uCount, uPairCount); |
---|
168 | ++uCount; |
---|
169 | Seq &seq2 = *(v[uSeq2]); |
---|
170 | const unsigned uSeqLength2 = seq2.Length(); |
---|
171 | if (uSeqLength2 < 5) |
---|
172 | { |
---|
173 | if (uSeq1 == uSeq2) |
---|
174 | DF.SetDist(uSeq1, uSeq2, 0); |
---|
175 | else |
---|
176 | DF.SetDist(uSeq1, uSeq2, 1); |
---|
177 | continue; |
---|
178 | } |
---|
179 | |
---|
180 | // First pass through seq 2 to count tuples |
---|
181 | const unsigned uTupleCount = uSeqLength2 - 5; |
---|
182 | const unsigned *L = Letters[uSeq2]; |
---|
183 | CountTuples(L, uTupleCount, Count2); |
---|
184 | #if TRACE |
---|
185 | Log("Seq2=%d Counts=\n", uSeq2); |
---|
186 | ListCount(Count2); |
---|
187 | #endif |
---|
188 | |
---|
189 | // Second pass to accumulate sum of shared tuples |
---|
190 | // MAFFT defines this as the sum over unique tuples |
---|
191 | // in seq2 of the minimum of the number of tuples found |
---|
192 | // in the two sequences. |
---|
193 | unsigned uSum = 0; |
---|
194 | for (unsigned n = 0; n < uTupleCount; ++n) |
---|
195 | { |
---|
196 | const unsigned uTuple = GetTuple(L, n); |
---|
197 | uSum += MIN(Count1[uTuple], Count2[uTuple]); |
---|
198 | |
---|
199 | // This is a hack to make sure each unique tuple counted only once. |
---|
200 | Count2[uTuple] = 0; |
---|
201 | } |
---|
202 | #if TRACE |
---|
203 | { |
---|
204 | Seq &s1 = *(v[uSeq1]); |
---|
205 | Seq &s2 = *(v[uSeq2]); |
---|
206 | const char *pName1 = s1.GetName(); |
---|
207 | const char *pName2 = s2.GetName(); |
---|
208 | Log("Common count %s(%d) - %s(%d) =%u\n", |
---|
209 | pName1, uSeq1, pName2, uSeq2, uSum); |
---|
210 | } |
---|
211 | #endif |
---|
212 | uCommonTupleCount[uSeq1][uSeq2] = uSum; |
---|
213 | uCommonTupleCount[uSeq2][uSeq1] = uSum; |
---|
214 | } |
---|
215 | } |
---|
216 | ProgressStepsDone(); |
---|
217 | |
---|
218 | uCount = 0; |
---|
219 | SetProgressDesc("K-mer dist pass 2"); |
---|
220 | for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) |
---|
221 | { |
---|
222 | Seq &s1 = *(v[uSeq1]); |
---|
223 | const char *pName1 = s1.GetName(); |
---|
224 | |
---|
225 | double dCommonTupleCount11 = uCommonTupleCount[uSeq1][uSeq1]; |
---|
226 | if (0 == dCommonTupleCount11) |
---|
227 | dCommonTupleCount11 = 1; |
---|
228 | |
---|
229 | DF.SetDist(uSeq1, uSeq1, 0); |
---|
230 | for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) |
---|
231 | { |
---|
232 | if (0 == uCount%500) |
---|
233 | Progress(uCount, uPairCount); |
---|
234 | ++uCount; |
---|
235 | |
---|
236 | double dCommonTupleCount22 = uCommonTupleCount[uSeq2][uSeq2]; |
---|
237 | if (0 == dCommonTupleCount22) |
---|
238 | dCommonTupleCount22 = 1; |
---|
239 | |
---|
240 | const double dDist1 = 3.0*(dCommonTupleCount11 - uCommonTupleCount[uSeq1][uSeq2]) |
---|
241 | /dCommonTupleCount11; |
---|
242 | const double dDist2 = 3.0*(dCommonTupleCount22 - uCommonTupleCount[uSeq1][uSeq2]) |
---|
243 | /dCommonTupleCount22; |
---|
244 | |
---|
245 | // dMinDist is the value used for tree-building in MAFFT |
---|
246 | const double dMinDist = MIN(dDist1, dDist2); |
---|
247 | DF.SetDist(uSeq1, uSeq2, (float) dMinDist); |
---|
248 | |
---|
249 | //const double dEstimatedPctId = TupleDistToEstimatedPctId(dMinDist); |
---|
250 | //g_dfPwId.SetDist(uSeq1, uSeq2, dEstimatedPctId); |
---|
251 | // **** TODO **** why does this make score slightly worse?? |
---|
252 | //const double dKimuraDist = KimuraDist(dEstimatedPctId); |
---|
253 | //DF.SetDist(uSeq1, uSeq2, dKimuraDist); |
---|
254 | } |
---|
255 | } |
---|
256 | ProgressStepsDone(); |
---|
257 | |
---|
258 | for (unsigned n = 0; n < uSeqCount; ++n) |
---|
259 | { |
---|
260 | delete[] uCommonTupleCount[n]; |
---|
261 | delete[] Letters[n]; |
---|
262 | } |
---|
263 | delete[] uCommonTupleCount; |
---|
264 | delete[] Letters; |
---|
265 | } |
---|