| 1 | #include "muscle.h" |
|---|
| 2 | #include "distfunc.h" |
|---|
| 3 | #include "seqvect.h" |
|---|
| 4 | #include <math.h> |
|---|
| 5 | |
|---|
| 6 | #define MIN(x, y) ((x) < (y) ? (x) : (y)) |
|---|
| 7 | |
|---|
| 8 | static void SetKmerBitVector(const Seq &s, byte Bits[]) |
|---|
| 9 | { |
|---|
| 10 | const unsigned uLength = s.Length(); |
|---|
| 11 | const unsigned k = 3; // kmer length |
|---|
| 12 | unsigned i = 0; |
|---|
| 13 | unsigned c = 0; |
|---|
| 14 | unsigned h = 0; |
|---|
| 15 | for (unsigned j = 0; j < k - 1; ++j) |
|---|
| 16 | { |
|---|
| 17 | unsigned x = CharToLetterEx(s[i++]); |
|---|
| 18 | if (x <= AX_Y) |
|---|
| 19 | c = c*20 + x; |
|---|
| 20 | else |
|---|
| 21 | { |
|---|
| 22 | c = 0; |
|---|
| 23 | h = j + 1; |
|---|
| 24 | } |
|---|
| 25 | } |
|---|
| 26 | for ( ; i < uLength; ++i) |
|---|
| 27 | { |
|---|
| 28 | unsigned x = CharToLetterEx(s[i++]); |
|---|
| 29 | if (x <= AX_Y) |
|---|
| 30 | c = (c*20 + x)%8000; |
|---|
| 31 | else |
|---|
| 32 | { |
|---|
| 33 | c = 0; |
|---|
| 34 | h = i + k; |
|---|
| 35 | } |
|---|
| 36 | if (i >= h) |
|---|
| 37 | { |
|---|
| 38 | unsigned ByteOffset = c/8; |
|---|
| 39 | unsigned BitOffset = c%8; |
|---|
| 40 | Bits[ByteOffset] |= (1 << BitOffset); |
|---|
| 41 | } |
|---|
| 42 | } |
|---|
| 43 | } |
|---|
| 44 | |
|---|
| 45 | static unsigned CommonBitCount(const byte Bits1[], const byte Bits2[]) |
|---|
| 46 | { |
|---|
| 47 | const byte * const p1end = Bits1 + 1000; |
|---|
| 48 | const byte *p2 = Bits2; |
|---|
| 49 | |
|---|
| 50 | unsigned uCount = 0; |
|---|
| 51 | for (const byte *p1 = Bits1; p1 != p1end; ++p1) |
|---|
| 52 | { |
|---|
| 53 | // Here is a cute trick for efficiently counting the |
|---|
| 54 | // bits common between two bytes by combining them into |
|---|
| 55 | // a single word. |
|---|
| 56 | unsigned b = *p1 | (*p2 << 8); |
|---|
| 57 | while (b != 0) |
|---|
| 58 | { |
|---|
| 59 | if (b & 0x101) |
|---|
| 60 | ++uCount; |
|---|
| 61 | b >>= 1; |
|---|
| 62 | } |
|---|
| 63 | ++p2; |
|---|
| 64 | } |
|---|
| 65 | return uCount; |
|---|
| 66 | } |
|---|
| 67 | |
|---|
| 68 | void DistKbit20_3(const SeqVect &v, DistFunc &DF) |
|---|
| 69 | { |
|---|
| 70 | const unsigned uSeqCount = v.Length(); |
|---|
| 71 | DF.SetCount(uSeqCount); |
|---|
| 72 | |
|---|
| 73 | // There are 20^3 = 8,000 distinct kmers in the 20-letter alphabet. |
|---|
| 74 | // For each sequence, we create a bit vector of length 8,000, i.e. |
|---|
| 75 | // 1,000 bytes, having one bit per kmer. The bit is set to 1 if the |
|---|
| 76 | // kmer is present in the sequence. |
|---|
| 77 | const unsigned uBytes = uSeqCount*1000; |
|---|
| 78 | byte *BitVector = new byte[uBytes]; |
|---|
| 79 | memset(BitVector, 0, uBytes); |
|---|
| 80 | |
|---|
| 81 | SetProgressDesc("K-bit distance matrix"); |
|---|
| 82 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
|---|
| 83 | SetKmerBitVector(*v[uSeqIndex], BitVector + uSeqIndex*1000); |
|---|
| 84 | |
|---|
| 85 | unsigned uDone = 0; |
|---|
| 86 | const unsigned uTotal = (uSeqCount*(uSeqCount - 1))/2; |
|---|
| 87 | for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1) |
|---|
| 88 | { |
|---|
| 89 | const byte *Bits1 = BitVector + uSeqIndex1*1000; |
|---|
| 90 | const unsigned uLength1 = v[uSeqIndex1]->Length(); |
|---|
| 91 | for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqIndex1; ++uSeqIndex2) |
|---|
| 92 | { |
|---|
| 93 | const byte *Bits2 = BitVector + uSeqIndex2*1000; |
|---|
| 94 | const unsigned uLength2 = v[uSeqIndex2]->Length(); |
|---|
| 95 | const float fCount = (float) CommonBitCount(Bits1, Bits2); |
|---|
| 96 | |
|---|
| 97 | // Distance measure = K / min(L1, L2) |
|---|
| 98 | // K is number of distinct kmers that are found in both sequences |
|---|
| 99 | const float fDist = fCount / MIN(uLength1, uLength2); |
|---|
| 100 | DF.SetDist(uSeqIndex1, uSeqIndex2, fDist); |
|---|
| 101 | if (uDone%10000 == 0) |
|---|
| 102 | Progress(uDone, uTotal); |
|---|
| 103 | ++uDone; |
|---|
| 104 | } |
|---|
| 105 | } |
|---|
| 106 | ProgressStepsDone(); |
|---|
| 107 | |
|---|
| 108 | delete[] BitVector; |
|---|
| 109 | } |
|---|