1 | #include "muscle.h" |
---|
2 | #include "msa.h" |
---|
3 | #include "seqvect.h" |
---|
4 | #include "seq.h" |
---|
5 | #include "distfunc.h" |
---|
6 | #include <math.h> |
---|
7 | |
---|
8 | #define TRACE 0 |
---|
9 | |
---|
10 | /*** |
---|
11 | Some candidate alphabets considered because they |
---|
12 | have high correlations and small table sizes. |
---|
13 | Correlation coefficent is between k-mer distance |
---|
14 | and %id D measured from a CLUSTALW alignment. |
---|
15 | Table size is N^k where N is size of alphabet. |
---|
16 | A is standard (uncompressed) amino alphabet. |
---|
17 | |
---|
18 | Correlation |
---|
19 | Alpha N k Table Size all 25-50% |
---|
20 | ----- -- - ---------- ---- ------ |
---|
21 | A 20 3 8,000 0.943 0.575 |
---|
22 | A 20 4 160,000 0.962 0.685 << |
---|
23 | LiA 14 4 38,416 0.966 0.645 |
---|
24 | SEB 14 4 38,416 0.964 0.634 |
---|
25 | LiA 13 4 28,561 0.965 0.640 |
---|
26 | LiA 12 4 20,736 0.963 0.620 |
---|
27 | LiA 10 5 100,000 0.964 0.652 |
---|
28 | |
---|
29 | We select A with k=4 because it has the best |
---|
30 | correlations. The only drawback is a large table |
---|
31 | size, but space is readily available and the only |
---|
32 | additional time cost is in resetting the table to |
---|
33 | zero, which can be done quickly with memset or by |
---|
34 | keeping a list of the k-mers that were found (should |
---|
35 | test to see which is faster, and may vary by compiler |
---|
36 | and processor type). It also has the minor advantage |
---|
37 | that we don't need to convert the alphabet. |
---|
38 | |
---|
39 | Fractional identity d is estimated as follows. |
---|
40 | |
---|
41 | F = fractional k-mer count |
---|
42 | if F is 0: F = 0.01 |
---|
43 | Y = log(0.02 + F) |
---|
44 | d = -4.1 + 4.12*Y |
---|
45 | |
---|
46 | The constant 0.02 was chosen to make the relationship |
---|
47 | between Y and D linear. The constants -4.1 and 4.12 |
---|
48 | were chosen to fit a straight line to the scatterplot |
---|
49 | of Y vs D. |
---|
50 | ***/ |
---|
51 | |
---|
52 | #define MIN(x, y) (((x) < (y)) ? (x) : (y)) |
---|
53 | |
---|
54 | const unsigned K = 4; |
---|
55 | const unsigned N = 20; |
---|
56 | const unsigned N_2 = 20*20; |
---|
57 | const unsigned N_3 = 20*20*20; |
---|
58 | const unsigned N_4 = 20*20*20*20; |
---|
59 | |
---|
60 | const unsigned TABLE_SIZE = N_4; |
---|
61 | |
---|
62 | // For debug output |
---|
63 | const char *KmerToStr(unsigned Kmer) |
---|
64 | { |
---|
65 | static char s[5]; |
---|
66 | |
---|
67 | unsigned c3 = (Kmer/N_3)%N; |
---|
68 | unsigned c2 = (Kmer/N_2)%N; |
---|
69 | unsigned c1 = (Kmer/N)%N; |
---|
70 | unsigned c0 = Kmer%N; |
---|
71 | |
---|
72 | s[0] = LetterToChar(c3); |
---|
73 | s[1] = LetterToChar(c2); |
---|
74 | s[2] = LetterToChar(c1); |
---|
75 | s[3] = LetterToChar(c0); |
---|
76 | return s; |
---|
77 | } |
---|
78 | |
---|
79 | void CountKmers(const byte s[], unsigned uSeqLength, byte KmerCounts[]) |
---|
80 | { |
---|
81 | #if TRACE |
---|
82 | Log("CountKmers\n"); |
---|
83 | #endif |
---|
84 | memset(KmerCounts, 0, TABLE_SIZE*sizeof(byte)); |
---|
85 | |
---|
86 | const byte *ptrKmerStart = s; |
---|
87 | const byte *ptrKmerEnd = s + 4; |
---|
88 | const byte *ptrSeqEnd = s + uSeqLength; |
---|
89 | |
---|
90 | unsigned c3 = s[0]*N_3; |
---|
91 | unsigned c2 = s[1]*N_2; |
---|
92 | unsigned c1 = s[2]*N; |
---|
93 | unsigned c0 = s[3]; |
---|
94 | |
---|
95 | unsigned Kmer = c3 + c2 + c1 + c0; |
---|
96 | |
---|
97 | for (;;) |
---|
98 | { |
---|
99 | assert(Kmer < TABLE_SIZE); |
---|
100 | |
---|
101 | #if TRACE |
---|
102 | Log("Kmer=%d=%s\n", Kmer, KmerToStr(Kmer)); |
---|
103 | #endif |
---|
104 | ++(KmerCounts[Kmer]); |
---|
105 | |
---|
106 | if (ptrKmerEnd == ptrSeqEnd) |
---|
107 | break; |
---|
108 | |
---|
109 | // Compute k-mer as function of previous k-mer: |
---|
110 | // 1. Subtract first letter from previous k-mer. |
---|
111 | // 2. Multiply by N. |
---|
112 | // 3. Add next letter. |
---|
113 | c3 = (*ptrKmerStart++) * N_3; |
---|
114 | Kmer = (Kmer - c3)*N; |
---|
115 | Kmer += *ptrKmerEnd++; |
---|
116 | } |
---|
117 | } |
---|
118 | |
---|
119 | unsigned CommonKmerCount(const byte Seq[], unsigned uSeqLength, |
---|
120 | const byte KmerCounts1[], const byte Seq2[], unsigned uSeqLength2) |
---|
121 | { |
---|
122 | byte KmerCounts2[TABLE_SIZE]; |
---|
123 | CountKmers(Seq2, uSeqLength2, KmerCounts2); |
---|
124 | |
---|
125 | const byte *ptrKmerStart = Seq; |
---|
126 | const byte *ptrKmerEnd = Seq + 4; |
---|
127 | const byte *ptrSeqEnd = Seq + uSeqLength; |
---|
128 | |
---|
129 | unsigned c3 = Seq[0]*N_3; |
---|
130 | unsigned c2 = Seq[1]*N_2; |
---|
131 | unsigned c1 = Seq[2]*N; |
---|
132 | unsigned c0 = Seq[3]; |
---|
133 | |
---|
134 | unsigned Kmer = c3 + c2 + c1 + c0; |
---|
135 | |
---|
136 | unsigned uCommonCount = 0; |
---|
137 | for (;;) |
---|
138 | { |
---|
139 | assert(Kmer < TABLE_SIZE); |
---|
140 | |
---|
141 | const byte Count1 = KmerCounts1[Kmer]; |
---|
142 | const byte Count2 = KmerCounts2[Kmer]; |
---|
143 | |
---|
144 | uCommonCount += MIN(Count1, Count2); |
---|
145 | |
---|
146 | // Hack so we don't double-count |
---|
147 | KmerCounts2[Kmer] = 0; |
---|
148 | |
---|
149 | if (ptrKmerEnd == ptrSeqEnd) |
---|
150 | break; |
---|
151 | |
---|
152 | // Compute k-mer as function of previous k-mer: |
---|
153 | // 1. Subtract first letter from previous k-mer. |
---|
154 | // 2. Multiply by N. |
---|
155 | // 3. Add next letter. |
---|
156 | c3 = (*ptrKmerStart++) * N_3; |
---|
157 | Kmer = (Kmer - c3)*N; |
---|
158 | Kmer += *ptrKmerEnd++; |
---|
159 | } |
---|
160 | return uCommonCount; |
---|
161 | } |
---|
162 | |
---|
163 | static void SeqToLetters(const Seq &s, byte Letters[]) |
---|
164 | { |
---|
165 | const unsigned uSeqLength = s.Length(); |
---|
166 | for (unsigned uCol = 0; uCol < uSeqLength; ++uCol) |
---|
167 | { |
---|
168 | char c = s.GetChar(uCol); |
---|
169 | // Ugly hack. My k-mer counting code isn't wild-card |
---|
170 | // aware. Arbitrarily replace wildcards by a specific |
---|
171 | // amino acid. |
---|
172 | if (IsWildcardChar(c)) |
---|
173 | c = 'A'; |
---|
174 | *Letters++ = CharToLetter(c); |
---|
175 | } |
---|
176 | } |
---|
177 | |
---|
178 | void FastDistKmer(const SeqVect &v, DistFunc &DF) |
---|
179 | { |
---|
180 | byte KmerCounts[TABLE_SIZE]; |
---|
181 | |
---|
182 | const unsigned uSeqCount = v.GetSeqCount(); |
---|
183 | |
---|
184 | DF.SetCount(uSeqCount); |
---|
185 | if (0 == uSeqCount) |
---|
186 | return; |
---|
187 | |
---|
188 | // Initialize distance matrix to zero |
---|
189 | for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) |
---|
190 | { |
---|
191 | DF.SetDist(uSeq1, uSeq1, 0); |
---|
192 | for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) |
---|
193 | DF.SetDist(uSeq1, uSeq2, 0); |
---|
194 | } |
---|
195 | |
---|
196 | unsigned uMaxLength = 0; |
---|
197 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
---|
198 | { |
---|
199 | const Seq &s = v.GetSeq(uSeqIndex); |
---|
200 | unsigned uSeqLength = s.Length(); |
---|
201 | if (uSeqLength > uMaxLength) |
---|
202 | uMaxLength = uSeqLength; |
---|
203 | } |
---|
204 | if (0 == uMaxLength) |
---|
205 | return; |
---|
206 | |
---|
207 | byte *Seq1Letters = new byte[uMaxLength]; |
---|
208 | byte *Seq2Letters = new byte[uMaxLength]; |
---|
209 | |
---|
210 | for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount - 1; ++uSeqIndex1) |
---|
211 | { |
---|
212 | const Seq &s1 = v.GetSeq(uSeqIndex1); |
---|
213 | const unsigned uSeqLength1 = s1.Length(); |
---|
214 | |
---|
215 | SeqToLetters(s1, Seq1Letters); |
---|
216 | CountKmers(Seq1Letters, uSeqLength1, KmerCounts); |
---|
217 | |
---|
218 | for (unsigned uSeqIndex2 = uSeqIndex1 + 1; uSeqIndex2 < uSeqCount; |
---|
219 | ++uSeqIndex2) |
---|
220 | { |
---|
221 | const Seq &s2 = v.GetSeq(uSeqIndex2); |
---|
222 | const unsigned uSeqLength2 = s2.Length(); |
---|
223 | |
---|
224 | SeqToLetters(s2, Seq2Letters); |
---|
225 | |
---|
226 | unsigned uCommonKmerCount = CommonKmerCount(Seq1Letters, uSeqLength1, |
---|
227 | KmerCounts, Seq2Letters, uSeqLength2); |
---|
228 | |
---|
229 | unsigned uMinLength = MIN(uSeqLength1, uSeqLength2); |
---|
230 | double F = (double) uCommonKmerCount / (uMinLength - K + 1); |
---|
231 | if (0.0 == F) |
---|
232 | F = 0.01; |
---|
233 | double Y = log(0.02 + F); |
---|
234 | double EstimatedPctId = Y/4.12 + 0.995; |
---|
235 | double KD = KimuraDist(EstimatedPctId); |
---|
236 | // DF.SetDist(uSeqIndex1, uSeqIndex2, (float) KD); |
---|
237 | DF.SetDist(uSeqIndex1, uSeqIndex2, (float) (1 - F)); |
---|
238 | #if TRACE |
---|
239 | Log("CommonCount=%u, MinLength=%u, F=%6.4f Y=%6.4f, %%id=%6.4f, KimuraDist=%8.4f\n", |
---|
240 | uCommonKmerCount, uMinLength, F, Y, EstimatedPctId, KD); |
---|
241 | #endif |
---|
242 | } |
---|
243 | } |
---|
244 | |
---|
245 | delete[] Seq1Letters; |
---|
246 | delete[] Seq2Letters; |
---|
247 | } |
---|