| 1 | #include "muscle.h" |
|---|
| 2 | #include "msa.h" |
|---|
| 3 | #include "seqvect.h" |
|---|
| 4 | #include "seq.h" |
|---|
| 5 | #include "distfunc.h" |
|---|
| 6 | #include <math.h> |
|---|
| 7 | |
|---|
| 8 | #define TRACE 0 |
|---|
| 9 | |
|---|
| 10 | /*** |
|---|
| 11 | Some candidate alphabets considered because they |
|---|
| 12 | have high correlations and small table sizes. |
|---|
| 13 | Correlation coefficent is between k-mer distance |
|---|
| 14 | and %id D measured from a CLUSTALW alignment. |
|---|
| 15 | Table size is N^k where N is size of alphabet. |
|---|
| 16 | A is standard (uncompressed) amino alphabet. |
|---|
| 17 | |
|---|
| 18 | Correlation |
|---|
| 19 | Alpha N k Table Size all 25-50% |
|---|
| 20 | ----- -- - ---------- ---- ------ |
|---|
| 21 | A 20 3 8,000 0.943 0.575 |
|---|
| 22 | A 20 4 160,000 0.962 0.685 << |
|---|
| 23 | LiA 14 4 38,416 0.966 0.645 |
|---|
| 24 | SEB 14 4 38,416 0.964 0.634 |
|---|
| 25 | LiA 13 4 28,561 0.965 0.640 |
|---|
| 26 | LiA 12 4 20,736 0.963 0.620 |
|---|
| 27 | LiA 10 5 100,000 0.964 0.652 |
|---|
| 28 | |
|---|
| 29 | We select A with k=4 because it has the best |
|---|
| 30 | correlations. The only drawback is a large table |
|---|
| 31 | size, but space is readily available and the only |
|---|
| 32 | additional time cost is in resetting the table to |
|---|
| 33 | zero, which can be done quickly with memset or by |
|---|
| 34 | keeping a list of the k-mers that were found (should |
|---|
| 35 | test to see which is faster, and may vary by compiler |
|---|
| 36 | and processor type). It also has the minor advantage |
|---|
| 37 | that we don't need to convert the alphabet. |
|---|
| 38 | |
|---|
| 39 | Fractional identity d is estimated as follows. |
|---|
| 40 | |
|---|
| 41 | F = fractional k-mer count |
|---|
| 42 | if F is 0: F = 0.01 |
|---|
| 43 | Y = log(0.02 + F) |
|---|
| 44 | d = -4.1 + 4.12*Y |
|---|
| 45 | |
|---|
| 46 | The constant 0.02 was chosen to make the relationship |
|---|
| 47 | between Y and D linear. The constants -4.1 and 4.12 |
|---|
| 48 | were chosen to fit a straight line to the scatterplot |
|---|
| 49 | of Y vs D. |
|---|
| 50 | ***/ |
|---|
| 51 | |
|---|
| 52 | #define MIN(x, y) (((x) < (y)) ? (x) : (y)) |
|---|
| 53 | |
|---|
| 54 | const unsigned K = 4; |
|---|
| 55 | const unsigned N = 20; |
|---|
| 56 | const unsigned N_2 = 20*20; |
|---|
| 57 | const unsigned N_3 = 20*20*20; |
|---|
| 58 | const unsigned N_4 = 20*20*20*20; |
|---|
| 59 | |
|---|
| 60 | const unsigned TABLE_SIZE = N_4; |
|---|
| 61 | |
|---|
| 62 | // For debug output |
|---|
| 63 | const char *KmerToStr(unsigned Kmer) |
|---|
| 64 | { |
|---|
| 65 | static char s[5]; |
|---|
| 66 | |
|---|
| 67 | unsigned c3 = (Kmer/N_3)%N; |
|---|
| 68 | unsigned c2 = (Kmer/N_2)%N; |
|---|
| 69 | unsigned c1 = (Kmer/N)%N; |
|---|
| 70 | unsigned c0 = Kmer%N; |
|---|
| 71 | |
|---|
| 72 | s[0] = LetterToChar(c3); |
|---|
| 73 | s[1] = LetterToChar(c2); |
|---|
| 74 | s[2] = LetterToChar(c1); |
|---|
| 75 | s[3] = LetterToChar(c0); |
|---|
| 76 | return s; |
|---|
| 77 | } |
|---|
| 78 | |
|---|
| 79 | void CountKmers(const byte s[], unsigned uSeqLength, byte KmerCounts[]) |
|---|
| 80 | { |
|---|
| 81 | #if TRACE |
|---|
| 82 | Log("CountKmers\n"); |
|---|
| 83 | #endif |
|---|
| 84 | memset(KmerCounts, 0, TABLE_SIZE*sizeof(byte)); |
|---|
| 85 | |
|---|
| 86 | const byte *ptrKmerStart = s; |
|---|
| 87 | const byte *ptrKmerEnd = s + 4; |
|---|
| 88 | const byte *ptrSeqEnd = s + uSeqLength; |
|---|
| 89 | |
|---|
| 90 | unsigned c3 = s[0]*N_3; |
|---|
| 91 | unsigned c2 = s[1]*N_2; |
|---|
| 92 | unsigned c1 = s[2]*N; |
|---|
| 93 | unsigned c0 = s[3]; |
|---|
| 94 | |
|---|
| 95 | unsigned Kmer = c3 + c2 + c1 + c0; |
|---|
| 96 | |
|---|
| 97 | for (;;) |
|---|
| 98 | { |
|---|
| 99 | assert(Kmer < TABLE_SIZE); |
|---|
| 100 | |
|---|
| 101 | #if TRACE |
|---|
| 102 | Log("Kmer=%d=%s\n", Kmer, KmerToStr(Kmer)); |
|---|
| 103 | #endif |
|---|
| 104 | ++(KmerCounts[Kmer]); |
|---|
| 105 | |
|---|
| 106 | if (ptrKmerEnd == ptrSeqEnd) |
|---|
| 107 | break; |
|---|
| 108 | |
|---|
| 109 | // Compute k-mer as function of previous k-mer: |
|---|
| 110 | // 1. Subtract first letter from previous k-mer. |
|---|
| 111 | // 2. Multiply by N. |
|---|
| 112 | // 3. Add next letter. |
|---|
| 113 | c3 = (*ptrKmerStart++) * N_3; |
|---|
| 114 | Kmer = (Kmer - c3)*N; |
|---|
| 115 | Kmer += *ptrKmerEnd++; |
|---|
| 116 | } |
|---|
| 117 | } |
|---|
| 118 | |
|---|
| 119 | unsigned CommonKmerCount(const byte Seq[], unsigned uSeqLength, |
|---|
| 120 | const byte KmerCounts1[], const byte Seq2[], unsigned uSeqLength2) |
|---|
| 121 | { |
|---|
| 122 | byte KmerCounts2[TABLE_SIZE]; |
|---|
| 123 | CountKmers(Seq2, uSeqLength2, KmerCounts2); |
|---|
| 124 | |
|---|
| 125 | const byte *ptrKmerStart = Seq; |
|---|
| 126 | const byte *ptrKmerEnd = Seq + 4; |
|---|
| 127 | const byte *ptrSeqEnd = Seq + uSeqLength; |
|---|
| 128 | |
|---|
| 129 | unsigned c3 = Seq[0]*N_3; |
|---|
| 130 | unsigned c2 = Seq[1]*N_2; |
|---|
| 131 | unsigned c1 = Seq[2]*N; |
|---|
| 132 | unsigned c0 = Seq[3]; |
|---|
| 133 | |
|---|
| 134 | unsigned Kmer = c3 + c2 + c1 + c0; |
|---|
| 135 | |
|---|
| 136 | unsigned uCommonCount = 0; |
|---|
| 137 | for (;;) |
|---|
| 138 | { |
|---|
| 139 | assert(Kmer < TABLE_SIZE); |
|---|
| 140 | |
|---|
| 141 | const byte Count1 = KmerCounts1[Kmer]; |
|---|
| 142 | const byte Count2 = KmerCounts2[Kmer]; |
|---|
| 143 | |
|---|
| 144 | uCommonCount += MIN(Count1, Count2); |
|---|
| 145 | |
|---|
| 146 | // Hack so we don't double-count |
|---|
| 147 | KmerCounts2[Kmer] = 0; |
|---|
| 148 | |
|---|
| 149 | if (ptrKmerEnd == ptrSeqEnd) |
|---|
| 150 | break; |
|---|
| 151 | |
|---|
| 152 | // Compute k-mer as function of previous k-mer: |
|---|
| 153 | // 1. Subtract first letter from previous k-mer. |
|---|
| 154 | // 2. Multiply by N. |
|---|
| 155 | // 3. Add next letter. |
|---|
| 156 | c3 = (*ptrKmerStart++) * N_3; |
|---|
| 157 | Kmer = (Kmer - c3)*N; |
|---|
| 158 | Kmer += *ptrKmerEnd++; |
|---|
| 159 | } |
|---|
| 160 | return uCommonCount; |
|---|
| 161 | } |
|---|
| 162 | |
|---|
| 163 | static void SeqToLetters(const Seq &s, byte Letters[]) |
|---|
| 164 | { |
|---|
| 165 | const unsigned uSeqLength = s.Length(); |
|---|
| 166 | for (unsigned uCol = 0; uCol < uSeqLength; ++uCol) |
|---|
| 167 | { |
|---|
| 168 | char c = s.GetChar(uCol); |
|---|
| 169 | // Ugly hack. My k-mer counting code isn't wild-card |
|---|
| 170 | // aware. Arbitrarily replace wildcards by a specific |
|---|
| 171 | // amino acid. |
|---|
| 172 | if (IsWildcardChar(c)) |
|---|
| 173 | c = 'A'; |
|---|
| 174 | *Letters++ = CharToLetter(c); |
|---|
| 175 | } |
|---|
| 176 | } |
|---|
| 177 | |
|---|
| 178 | void FastDistKmer(const SeqVect &v, DistFunc &DF) |
|---|
| 179 | { |
|---|
| 180 | byte KmerCounts[TABLE_SIZE]; |
|---|
| 181 | |
|---|
| 182 | const unsigned uSeqCount = v.GetSeqCount(); |
|---|
| 183 | |
|---|
| 184 | DF.SetCount(uSeqCount); |
|---|
| 185 | if (0 == uSeqCount) |
|---|
| 186 | return; |
|---|
| 187 | |
|---|
| 188 | // Initialize distance matrix to zero |
|---|
| 189 | for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) |
|---|
| 190 | { |
|---|
| 191 | DF.SetDist(uSeq1, uSeq1, 0); |
|---|
| 192 | for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) |
|---|
| 193 | DF.SetDist(uSeq1, uSeq2, 0); |
|---|
| 194 | } |
|---|
| 195 | |
|---|
| 196 | unsigned uMaxLength = 0; |
|---|
| 197 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
|---|
| 198 | { |
|---|
| 199 | const Seq &s = v.GetSeq(uSeqIndex); |
|---|
| 200 | unsigned uSeqLength = s.Length(); |
|---|
| 201 | if (uSeqLength > uMaxLength) |
|---|
| 202 | uMaxLength = uSeqLength; |
|---|
| 203 | } |
|---|
| 204 | if (0 == uMaxLength) |
|---|
| 205 | return; |
|---|
| 206 | |
|---|
| 207 | byte *Seq1Letters = new byte[uMaxLength]; |
|---|
| 208 | byte *Seq2Letters = new byte[uMaxLength]; |
|---|
| 209 | |
|---|
| 210 | for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount - 1; ++uSeqIndex1) |
|---|
| 211 | { |
|---|
| 212 | const Seq &s1 = v.GetSeq(uSeqIndex1); |
|---|
| 213 | const unsigned uSeqLength1 = s1.Length(); |
|---|
| 214 | |
|---|
| 215 | SeqToLetters(s1, Seq1Letters); |
|---|
| 216 | CountKmers(Seq1Letters, uSeqLength1, KmerCounts); |
|---|
| 217 | |
|---|
| 218 | for (unsigned uSeqIndex2 = uSeqIndex1 + 1; uSeqIndex2 < uSeqCount; |
|---|
| 219 | ++uSeqIndex2) |
|---|
| 220 | { |
|---|
| 221 | const Seq &s2 = v.GetSeq(uSeqIndex2); |
|---|
| 222 | const unsigned uSeqLength2 = s2.Length(); |
|---|
| 223 | |
|---|
| 224 | SeqToLetters(s2, Seq2Letters); |
|---|
| 225 | |
|---|
| 226 | unsigned uCommonKmerCount = CommonKmerCount(Seq1Letters, uSeqLength1, |
|---|
| 227 | KmerCounts, Seq2Letters, uSeqLength2); |
|---|
| 228 | |
|---|
| 229 | unsigned uMinLength = MIN(uSeqLength1, uSeqLength2); |
|---|
| 230 | double F = (double) uCommonKmerCount / (uMinLength - K + 1); |
|---|
| 231 | if (0.0 == F) |
|---|
| 232 | F = 0.01; |
|---|
| 233 | double Y = log(0.02 + F); |
|---|
| 234 | double EstimatedPctId = Y/4.12 + 0.995; |
|---|
| 235 | double KD = KimuraDist(EstimatedPctId); |
|---|
| 236 | // DF.SetDist(uSeqIndex1, uSeqIndex2, (float) KD); |
|---|
| 237 | DF.SetDist(uSeqIndex1, uSeqIndex2, (float) (1 - F)); |
|---|
| 238 | #if TRACE |
|---|
| 239 | Log("CommonCount=%u, MinLength=%u, F=%6.4f Y=%6.4f, %%id=%6.4f, KimuraDist=%8.4f\n", |
|---|
| 240 | uCommonKmerCount, uMinLength, F, Y, EstimatedPctId, KD); |
|---|
| 241 | #endif |
|---|
| 242 | } |
|---|
| 243 | } |
|---|
| 244 | |
|---|
| 245 | delete[] Seq1Letters; |
|---|
| 246 | delete[] Seq2Letters; |
|---|
| 247 | } |
|---|