| 1 | #include "muscle.h" |
|---|
| 2 | #include "distfunc.h" |
|---|
| 3 | #include "seqvect.h" |
|---|
| 4 | #include <math.h> |
|---|
| 5 | |
|---|
| 6 | const unsigned TRIPLE_COUNT = 20*20*20; |
|---|
| 7 | |
|---|
| 8 | struct TripleCount |
|---|
| 9 | { |
|---|
| 10 | unsigned m_uSeqCount; // How many sequences have this triple? |
|---|
| 11 | unsigned short *m_Counts; // m_Counts[s] = nr of times triple found in seq s |
|---|
| 12 | }; |
|---|
| 13 | static TripleCount *TripleCounts; |
|---|
| 14 | |
|---|
| 15 | // WARNING: Sequences MUST be stripped of gaps and upper case! |
|---|
| 16 | void DistKmer20_3(const SeqVect &v, DistFunc &DF) |
|---|
| 17 | { |
|---|
| 18 | const unsigned uSeqCount = v.Length(); |
|---|
| 19 | |
|---|
| 20 | DF.SetCount(uSeqCount); |
|---|
| 21 | if (0 == uSeqCount) |
|---|
| 22 | return; |
|---|
| 23 | for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) |
|---|
| 24 | { |
|---|
| 25 | DF.SetDist(uSeq1, uSeq1, 0); |
|---|
| 26 | for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) |
|---|
| 27 | DF.SetDist(uSeq1, uSeq2, 0); |
|---|
| 28 | } |
|---|
| 29 | |
|---|
| 30 | const unsigned uTripleArrayBytes = TRIPLE_COUNT*sizeof(TripleCount); |
|---|
| 31 | TripleCounts = (TripleCount *) malloc(uTripleArrayBytes); |
|---|
| 32 | if (0 == TripleCounts) |
|---|
| 33 | Quit("Not enough memory (TripleCounts)"); |
|---|
| 34 | memset(TripleCounts, 0, uTripleArrayBytes); |
|---|
| 35 | |
|---|
| 36 | for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord) |
|---|
| 37 | { |
|---|
| 38 | TripleCount &tc = *(TripleCounts + uWord); |
|---|
| 39 | const unsigned uBytes = uSeqCount*sizeof(short); |
|---|
| 40 | tc.m_Counts = (unsigned short *) malloc(uBytes); |
|---|
| 41 | memset(tc.m_Counts, 0, uBytes); |
|---|
| 42 | } |
|---|
| 43 | |
|---|
| 44 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
|---|
| 45 | { |
|---|
| 46 | Seq &s = *(v[uSeqIndex]); |
|---|
| 47 | const unsigned uSeqLength = s.Length(); |
|---|
| 48 | for (unsigned uPos = 0; uPos < uSeqLength - 2; ++uPos) |
|---|
| 49 | { |
|---|
| 50 | const unsigned uLetter1 = CharToLetterEx(s[uPos]); |
|---|
| 51 | if (uLetter1 >= 20) |
|---|
| 52 | continue; |
|---|
| 53 | const unsigned uLetter2 = CharToLetterEx(s[uPos+1]); |
|---|
| 54 | if (uLetter2 >= 20) |
|---|
| 55 | continue; |
|---|
| 56 | const unsigned uLetter3 = CharToLetterEx(s[uPos+2]); |
|---|
| 57 | if (uLetter3 >= 20) |
|---|
| 58 | continue; |
|---|
| 59 | |
|---|
| 60 | const unsigned uWord = uLetter1 + uLetter2*20 + uLetter3*20*20; |
|---|
| 61 | assert(uWord < TRIPLE_COUNT); |
|---|
| 62 | |
|---|
| 63 | TripleCount &tc = *(TripleCounts + uWord); |
|---|
| 64 | const unsigned uOldCount = tc.m_Counts[uSeqIndex]; |
|---|
| 65 | if (0 == uOldCount) |
|---|
| 66 | ++(tc.m_uSeqCount); |
|---|
| 67 | |
|---|
| 68 | ++(tc.m_Counts[uSeqIndex]); |
|---|
| 69 | } |
|---|
| 70 | } |
|---|
| 71 | |
|---|
| 72 | #if TRACE |
|---|
| 73 | { |
|---|
| 74 | Log("TripleCounts\n"); |
|---|
| 75 | unsigned uGrandTotal = 0; |
|---|
| 76 | for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord) |
|---|
| 77 | { |
|---|
| 78 | const TripleCount &tc = *(TripleCounts + uWord); |
|---|
| 79 | if (0 == tc.m_uSeqCount) |
|---|
| 80 | continue; |
|---|
| 81 | |
|---|
| 82 | const unsigned uLetter3 = uWord/(20*20); |
|---|
| 83 | const unsigned uLetter2 = (uWord - uLetter3*20*20)/20; |
|---|
| 84 | const unsigned uLetter1 = uWord%20; |
|---|
| 85 | Log("Word %6u %c%c%c %6u", |
|---|
| 86 | uWord, |
|---|
| 87 | LetterToCharAmino(uLetter1), |
|---|
| 88 | LetterToCharAmino(uLetter2), |
|---|
| 89 | LetterToCharAmino(uLetter3), |
|---|
| 90 | tc.m_uSeqCount); |
|---|
| 91 | |
|---|
| 92 | unsigned uSeqCountWithThisWord = 0; |
|---|
| 93 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
|---|
| 94 | { |
|---|
| 95 | const unsigned uCount = tc.m_Counts[uSeqIndex]; |
|---|
| 96 | if (uCount > 0) |
|---|
| 97 | { |
|---|
| 98 | ++uSeqCountWithThisWord; |
|---|
| 99 | Log(" %u=%u", uSeqIndex, uCount); |
|---|
| 100 | uGrandTotal += uCount; |
|---|
| 101 | } |
|---|
| 102 | } |
|---|
| 103 | if (uSeqCountWithThisWord != tc.m_uSeqCount) |
|---|
| 104 | Log(" *** SQ ERROR *** %u %u", tc.m_uSeqCount, uSeqCountWithThisWord); |
|---|
| 105 | Log("\n"); |
|---|
| 106 | } |
|---|
| 107 | |
|---|
| 108 | unsigned uTotalBySeqLength = 0; |
|---|
| 109 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
|---|
| 110 | { |
|---|
| 111 | Seq &s = *(v[uSeqIndex]); |
|---|
| 112 | const unsigned uSeqLength = s.Length(); |
|---|
| 113 | uTotalBySeqLength += uSeqLength - 2; |
|---|
| 114 | } |
|---|
| 115 | if (uGrandTotal != uTotalBySeqLength) |
|---|
| 116 | Log("*** TOTALS DISAGREE *** %u %u\n", uGrandTotal, uTotalBySeqLength); |
|---|
| 117 | } |
|---|
| 118 | #endif |
|---|
| 119 | |
|---|
| 120 | const unsigned uSeqListBytes = uSeqCount*sizeof(unsigned); |
|---|
| 121 | unsigned short *SeqList = (unsigned short *) malloc(uSeqListBytes); |
|---|
| 122 | |
|---|
| 123 | for (unsigned uWord = 0; uWord < TRIPLE_COUNT; ++uWord) |
|---|
| 124 | { |
|---|
| 125 | const TripleCount &tc = *(TripleCounts + uWord); |
|---|
| 126 | if (0 == tc.m_uSeqCount) |
|---|
| 127 | continue; |
|---|
| 128 | |
|---|
| 129 | unsigned uSeqCountFound = 0; |
|---|
| 130 | memset(SeqList, 0, uSeqListBytes); |
|---|
| 131 | |
|---|
| 132 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
|---|
| 133 | { |
|---|
| 134 | if (tc.m_Counts[uSeqIndex] > 0) |
|---|
| 135 | { |
|---|
| 136 | SeqList[uSeqCountFound] = uSeqIndex; |
|---|
| 137 | ++uSeqCountFound; |
|---|
| 138 | if (uSeqCountFound == tc.m_uSeqCount) |
|---|
| 139 | break; |
|---|
| 140 | } |
|---|
| 141 | } |
|---|
| 142 | assert(uSeqCountFound == tc.m_uSeqCount); |
|---|
| 143 | |
|---|
| 144 | for (unsigned uSeq1 = 0; uSeq1 < uSeqCountFound; ++uSeq1) |
|---|
| 145 | { |
|---|
| 146 | const unsigned uSeqIndex1 = SeqList[uSeq1]; |
|---|
| 147 | const unsigned uCount1 = tc.m_Counts[uSeqIndex1]; |
|---|
| 148 | for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) |
|---|
| 149 | { |
|---|
| 150 | const unsigned uSeqIndex2 = SeqList[uSeq2]; |
|---|
| 151 | const unsigned uCount2 = tc.m_Counts[uSeqIndex2]; |
|---|
| 152 | const unsigned uMinCount = uCount1 < uCount2 ? uCount1 : uCount2; |
|---|
| 153 | const double d = DF.GetDist(uSeqIndex1, uSeqIndex2); |
|---|
| 154 | DF.SetDist(uSeqIndex1, uSeqIndex2, (float) (d + uMinCount)); |
|---|
| 155 | } |
|---|
| 156 | } |
|---|
| 157 | } |
|---|
| 158 | delete[] SeqList; |
|---|
| 159 | free(TripleCounts); |
|---|
| 160 | |
|---|
| 161 | unsigned uDone = 0; |
|---|
| 162 | const unsigned uTotal = (uSeqCount*(uSeqCount - 1))/2; |
|---|
| 163 | for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1) |
|---|
| 164 | { |
|---|
| 165 | DF.SetDist(uSeq1, uSeq1, 0.0); |
|---|
| 166 | |
|---|
| 167 | const Seq &s1 = *(v[uSeq1]); |
|---|
| 168 | const unsigned uLength1 = s1.Length(); |
|---|
| 169 | |
|---|
| 170 | for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2) |
|---|
| 171 | { |
|---|
| 172 | const Seq &s2 = *(v[uSeq2]); |
|---|
| 173 | const unsigned uLength2 = s2.Length(); |
|---|
| 174 | unsigned uMinLength = uLength1 < uLength2 ? uLength1 : uLength2; |
|---|
| 175 | if (uMinLength < 3) |
|---|
| 176 | { |
|---|
| 177 | DF.SetDist(uSeq1, uSeq2, 1.0); |
|---|
| 178 | continue; |
|---|
| 179 | } |
|---|
| 180 | |
|---|
| 181 | const double dTripleCount = DF.GetDist(uSeq1, uSeq2); |
|---|
| 182 | if (dTripleCount == 0) |
|---|
| 183 | { |
|---|
| 184 | DF.SetDist(uSeq1, uSeq2, 1.0); |
|---|
| 185 | continue; |
|---|
| 186 | } |
|---|
| 187 | double dNormalizedTripletScore = dTripleCount/(uMinLength - 2); |
|---|
| 188 | //double dEstimatedPairwiseIdentity = exp(0.3912*log(dNormalizedTripletScore)); |
|---|
| 189 | //if (dEstimatedPairwiseIdentity > 1) |
|---|
| 190 | // dEstimatedPairwiseIdentity = 1; |
|---|
| 191 | // DF.SetDist(uSeq1, uSeq2, (float) (1.0 - dEstimatedPairwiseIdentity)); |
|---|
| 192 | DF.SetDist(uSeq1, uSeq2, (float) dNormalizedTripletScore); |
|---|
| 193 | |
|---|
| 194 | #if TRACE |
|---|
| 195 | { |
|---|
| 196 | Log("%s - %s Triplet count = %g Lengths %u, %u Estimated pwid = %g\n", |
|---|
| 197 | s1.GetName(), s2.GetName(), dTripleCount, uLength1, uLength2, |
|---|
| 198 | dEstimatedPairwiseIdentity); |
|---|
| 199 | } |
|---|
| 200 | #endif |
|---|
| 201 | if (uDone%1000 == 0) |
|---|
| 202 | Progress(uDone, uTotal); |
|---|
| 203 | } |
|---|
| 204 | } |
|---|
| 205 | ProgressStepsDone(); |
|---|
| 206 | } |
|---|