1 | #include "muscle.h" |
---|
2 | #include "distfunc.h" |
---|
3 | #include "seqvect.h" |
---|
4 | #include <math.h> |
---|
5 | |
---|
6 | #define MIN(x, y) ((x) < (y) ? (x) : (y)) |
---|
7 | |
---|
8 | static void SetKmerBitVector(const Seq &s, byte Bits[]) |
---|
9 | { |
---|
10 | const unsigned uLength = s.Length(); |
---|
11 | const unsigned k = 3; // kmer length |
---|
12 | unsigned i = 0; |
---|
13 | unsigned c = 0; |
---|
14 | unsigned h = 0; |
---|
15 | for (unsigned j = 0; j < k - 1; ++j) |
---|
16 | { |
---|
17 | unsigned x = CharToLetterEx(s[i++]); |
---|
18 | if (x <= AX_Y) |
---|
19 | c = c*20 + x; |
---|
20 | else |
---|
21 | { |
---|
22 | c = 0; |
---|
23 | h = j + 1; |
---|
24 | } |
---|
25 | } |
---|
26 | for ( ; i < uLength; ++i) |
---|
27 | { |
---|
28 | unsigned x = CharToLetterEx(s[i++]); |
---|
29 | if (x <= AX_Y) |
---|
30 | c = (c*20 + x)%8000; |
---|
31 | else |
---|
32 | { |
---|
33 | c = 0; |
---|
34 | h = i + k; |
---|
35 | } |
---|
36 | if (i >= h) |
---|
37 | { |
---|
38 | unsigned ByteOffset = c/8; |
---|
39 | unsigned BitOffset = c%8; |
---|
40 | Bits[ByteOffset] |= (1 << BitOffset); |
---|
41 | } |
---|
42 | } |
---|
43 | } |
---|
44 | |
---|
45 | static unsigned CommonBitCount(const byte Bits1[], const byte Bits2[]) |
---|
46 | { |
---|
47 | const byte * const p1end = Bits1 + 1000; |
---|
48 | const byte *p2 = Bits2; |
---|
49 | |
---|
50 | unsigned uCount = 0; |
---|
51 | for (const byte *p1 = Bits1; p1 != p1end; ++p1) |
---|
52 | { |
---|
53 | // Here is a cute trick for efficiently counting the |
---|
54 | // bits common between two bytes by combining them into |
---|
55 | // a single word. |
---|
56 | unsigned b = *p1 | (*p2 << 8); |
---|
57 | while (b != 0) |
---|
58 | { |
---|
59 | if (b & 0x101) |
---|
60 | ++uCount; |
---|
61 | b >>= 1; |
---|
62 | } |
---|
63 | ++p2; |
---|
64 | } |
---|
65 | return uCount; |
---|
66 | } |
---|
67 | |
---|
68 | void DistKbit20_3(const SeqVect &v, DistFunc &DF) |
---|
69 | { |
---|
70 | const unsigned uSeqCount = v.Length(); |
---|
71 | DF.SetCount(uSeqCount); |
---|
72 | |
---|
73 | // There are 20^3 = 8,000 distinct kmers in the 20-letter alphabet. |
---|
74 | // For each sequence, we create a bit vector of length 8,000, i.e. |
---|
75 | // 1,000 bytes, having one bit per kmer. The bit is set to 1 if the |
---|
76 | // kmer is present in the sequence. |
---|
77 | const unsigned uBytes = uSeqCount*1000; |
---|
78 | byte *BitVector = new byte[uBytes]; |
---|
79 | memset(BitVector, 0, uBytes); |
---|
80 | |
---|
81 | SetProgressDesc("K-bit distance matrix"); |
---|
82 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
---|
83 | SetKmerBitVector(*v[uSeqIndex], BitVector + uSeqIndex*1000); |
---|
84 | |
---|
85 | unsigned uDone = 0; |
---|
86 | const unsigned uTotal = (uSeqCount*(uSeqCount - 1))/2; |
---|
87 | for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount; ++uSeqIndex1) |
---|
88 | { |
---|
89 | const byte *Bits1 = BitVector + uSeqIndex1*1000; |
---|
90 | const unsigned uLength1 = v[uSeqIndex1]->Length(); |
---|
91 | for (unsigned uSeqIndex2 = 0; uSeqIndex2 < uSeqIndex1; ++uSeqIndex2) |
---|
92 | { |
---|
93 | const byte *Bits2 = BitVector + uSeqIndex2*1000; |
---|
94 | const unsigned uLength2 = v[uSeqIndex2]->Length(); |
---|
95 | const float fCount = (float) CommonBitCount(Bits1, Bits2); |
---|
96 | |
---|
97 | // Distance measure = K / min(L1, L2) |
---|
98 | // K is number of distinct kmers that are found in both sequences |
---|
99 | const float fDist = fCount / MIN(uLength1, uLength2); |
---|
100 | DF.SetDist(uSeqIndex1, uSeqIndex2, fDist); |
---|
101 | if (uDone%10000 == 0) |
---|
102 | Progress(uDone, uTotal); |
---|
103 | ++uDone; |
---|
104 | } |
---|
105 | } |
---|
106 | ProgressStepsDone(); |
---|
107 | |
---|
108 | delete[] BitVector; |
---|
109 | } |
---|