1 | #include "muscle.h" |
---|
2 | #include "distfunc.h" |
---|
3 | #include "seqvect.h" |
---|
4 | #include <math.h> |
---|
5 | |
---|
6 | const unsigned TRIPLE_COUNT = 20*20*20; |
---|
7 | |
---|
8 | struct TripleCount |
---|
9 | { |
---|
10 | unsigned m_uSeqCount; // How many sequences have this triple? |
---|
11 | unsigned short *m_Counts; // m_Counts[s] = nr of times triple found in seq s |
---|
12 | }; |
---|
13 | static TripleCount *TripleCounts; |
---|
14 | |
---|
15 | // WARNING: Sequences MUST be stripped of gaps and upper case! |
---|
16 | void DistKmer20_3(const SeqVect &v, DistFunc &DF) |
---|
17 | { |
---|
18 | const unsigned uSeqCount = v.Length(); |
---|
19 | |
---|
20 | DF.SetCount(uSeqCount); |
---|
21 | if (0 == uSeqCount) |
---|
22 | return; |
---|
23 | for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) |
---|
24 | { |
---|
25 | DF.SetDist(uSeq1, uSeq1, 0); |
---|
26 | for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) |
---|
27 | DF.SetDist(uSeq1, uSeq2, 0); |
---|
28 | } |
---|
29 | |
---|
30 | const unsigned uTripleArrayBytes = TRIPLE_COUNT*sizeof(TripleCount); |
---|
31 | TripleCounts = (TripleCount *) malloc(uTripleArrayBytes); |
---|
32 | if (0 == TripleCounts) |
---|
33 | Quit("Not enough memory (TripleCounts)"); |
---|
34 | memset(TripleCounts, 0, uTripleArrayBytes); |
---|
35 | |
---|
36 | for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord) |
---|
37 | { |
---|
38 | TripleCount &tc = *(TripleCounts + uWord); |
---|
39 | const unsigned uBytes = uSeqCount*sizeof(short); |
---|
40 | tc.m_Counts = (unsigned short *) malloc(uBytes); |
---|
41 | memset(tc.m_Counts, 0, uBytes); |
---|
42 | } |
---|
43 | |
---|
44 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
---|
45 | { |
---|
46 | Seq &s = *(v[uSeqIndex]); |
---|
47 | const unsigned uSeqLength = s.Length(); |
---|
48 | for (unsigned uPos = 0; uPos < uSeqLength - 2; ++uPos) |
---|
49 | { |
---|
50 | const unsigned uLetter1 = CharToLetterEx(s[uPos]); |
---|
51 | if (uLetter1 >= 20) |
---|
52 | continue; |
---|
53 | const unsigned uLetter2 = CharToLetterEx(s[uPos+1]); |
---|
54 | if (uLetter2 >= 20) |
---|
55 | continue; |
---|
56 | const unsigned uLetter3 = CharToLetterEx(s[uPos+2]); |
---|
57 | if (uLetter3 >= 20) |
---|
58 | continue; |
---|
59 | |
---|
60 | const unsigned uWord = uLetter1 + uLetter2*20 + uLetter3*20*20; |
---|
61 | assert(uWord < TRIPLE_COUNT); |
---|
62 | |
---|
63 | TripleCount &tc = *(TripleCounts + uWord); |
---|
64 | const unsigned uOldCount = tc.m_Counts[uSeqIndex]; |
---|
65 | if (0 == uOldCount) |
---|
66 | ++(tc.m_uSeqCount); |
---|
67 | |
---|
68 | ++(tc.m_Counts[uSeqIndex]); |
---|
69 | } |
---|
70 | } |
---|
71 | |
---|
72 | #if TRACE |
---|
73 | { |
---|
74 | Log("TripleCounts\n"); |
---|
75 | unsigned uGrandTotal = 0; |
---|
76 | for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord) |
---|
77 | { |
---|
78 | const TripleCount &tc = *(TripleCounts + uWord); |
---|
79 | if (0 == tc.m_uSeqCount) |
---|
80 | continue; |
---|
81 | |
---|
82 | const unsigned uLetter3 = uWord/(20*20); |
---|
83 | const unsigned uLetter2 = (uWord - uLetter3*20*20)/20; |
---|
84 | const unsigned uLetter1 = uWord%20; |
---|
85 | Log("Word %6u %c%c%c %6u", |
---|
86 | uWord, |
---|
87 | LetterToCharAmino(uLetter1), |
---|
88 | LetterToCharAmino(uLetter2), |
---|
89 | LetterToCharAmino(uLetter3), |
---|
90 | tc.m_uSeqCount); |
---|
91 | |
---|
92 | unsigned uSeqCountWithThisWord = 0; |
---|
93 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
---|
94 | { |
---|
95 | const unsigned uCount = tc.m_Counts[uSeqIndex]; |
---|
96 | if (uCount > 0) |
---|
97 | { |
---|
98 | ++uSeqCountWithThisWord; |
---|
99 | Log(" %u=%u", uSeqIndex, uCount); |
---|
100 | uGrandTotal += uCount; |
---|
101 | } |
---|
102 | } |
---|
103 | if (uSeqCountWithThisWord != tc.m_uSeqCount) |
---|
104 | Log(" *** SQ ERROR *** %u %u", tc.m_uSeqCount, uSeqCountWithThisWord); |
---|
105 | Log("\n"); |
---|
106 | } |
---|
107 | |
---|
108 | unsigned uTotalBySeqLength = 0; |
---|
109 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
---|
110 | { |
---|
111 | Seq &s = *(v[uSeqIndex]); |
---|
112 | const unsigned uSeqLength = s.Length(); |
---|
113 | uTotalBySeqLength += uSeqLength - 2; |
---|
114 | } |
---|
115 | if (uGrandTotal != uTotalBySeqLength) |
---|
116 | Log("*** TOTALS DISAGREE *** %u %u\n", uGrandTotal, uTotalBySeqLength); |
---|
117 | } |
---|
118 | #endif |
---|
119 | |
---|
120 | const unsigned uSeqListBytes = uSeqCount*sizeof(unsigned); |
---|
121 | unsigned short *SeqList = (unsigned short *) malloc(uSeqListBytes); |
---|
122 | |
---|
123 | for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord) |
---|
124 | { |
---|
125 | const TripleCount &tc = *(TripleCounts + uWord); |
---|
126 | if (0 == tc.m_uSeqCount) |
---|
127 | continue; |
---|
128 | |
---|
129 | unsigned uSeqCountFound = 0; |
---|
130 | memset(SeqList, 0, uSeqListBytes); |
---|
131 | |
---|
132 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
---|
133 | { |
---|
134 | if (tc.m_Counts[uSeqIndex] > 0) |
---|
135 | { |
---|
136 | SeqList[uSeqCountFound] = uSeqIndex; |
---|
137 | ++uSeqCountFound; |
---|
138 | if (uSeqCountFound == tc.m_uSeqCount) |
---|
139 | break; |
---|
140 | } |
---|
141 | } |
---|
142 | assert(uSeqCountFound == tc.m_uSeqCount); |
---|
143 | |
---|
144 | for (unsigned uSeq1 = 0; uSeq1 < uSeqCountFound; ++uSeq1) |
---|
145 | { |
---|
146 | const unsigned uSeqIndex1 = SeqList[uSeq1]; |
---|
147 | const unsigned uCount1 = tc.m_Counts[uSeqIndex1]; |
---|
148 | for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) |
---|
149 | { |
---|
150 | const unsigned uSeqIndex2 = SeqList[uSeq2]; |
---|
151 | const unsigned uCount2 = tc.m_Counts[uSeqIndex2]; |
---|
152 | const unsigned uMinCount = uCount1 < uCount2 ? uCount1 : uCount2; |
---|
153 | const double d = DF.GetDist(uSeqIndex1, uSeqIndex2); |
---|
154 | DF.SetDist(uSeqIndex1, uSeqIndex2, (float) (d + uMinCount)); |
---|
155 | } |
---|
156 | } |
---|
157 | } |
---|
158 | delete[] SeqList; |
---|
159 | free(TripleCounts); |
---|
160 | |
---|
161 | unsigned uDone = 0; |
---|
162 | const unsigned uTotal = (uSeqCount*(uSeqCount - 1))/2; |
---|
163 | for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) |
---|
164 | { |
---|
165 | DF.SetDist(uSeq1, uSeq1, 0.0); |
---|
166 | |
---|
167 | const Seq &s1 = *(v[uSeq1]); |
---|
168 | const unsigned uLength1 = s1.Length(); |
---|
169 | |
---|
170 | for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) |
---|
171 | { |
---|
172 | const Seq &s2 = *(v[uSeq2]); |
---|
173 | const unsigned uLength2 = s2.Length(); |
---|
174 | unsigned uMinLength = uLength1 < uLength2 ? uLength1 : uLength2; |
---|
175 | if (uMinLength < 3) |
---|
176 | { |
---|
177 | DF.SetDist(uSeq1, uSeq2, 1.0); |
---|
178 | continue; |
---|
179 | } |
---|
180 | |
---|
181 | const double dTripleCount = DF.GetDist(uSeq1, uSeq2); |
---|
182 | if (dTripleCount == 0) |
---|
183 | { |
---|
184 | DF.SetDist(uSeq1, uSeq2, 1.0); |
---|
185 | continue; |
---|
186 | } |
---|
187 | double dNormalizedTripletScore = dTripleCount/(uMinLength - 2); |
---|
188 | //double dEstimatedPairwiseIdentity = exp(0.3912*log(dNormalizedTripletScore)); |
---|
189 | //if (dEstimatedPairwiseIdentity > 1) |
---|
190 | // dEstimatedPairwiseIdentity = 1; |
---|
191 | // DF.SetDist(uSeq1, uSeq2, (float) (1.0 - dEstimatedPairwiseIdentity)); |
---|
192 | DF.SetDist(uSeq1, uSeq2, (float) dNormalizedTripletScore); |
---|
193 | |
---|
194 | #if TRACE |
---|
195 | { |
---|
196 | Log("%s - %s Triplet count = %g Lengths %u, %u Estimated pwid = %g\n", |
---|
197 | s1.GetName(), s2.GetName(), dTripleCount, uLength1, uLength2, |
---|
198 | dEstimatedPairwiseIdentity); |
---|
199 | } |
---|
200 | #endif |
---|
201 | if (uDone%1000 == 0) |
---|
202 | Progress(uDone, uTotal); |
---|
203 | } |
---|
204 | } |
---|
205 | ProgressStepsDone(); |
---|
206 | } |
---|