| 1 | #include "muscle.h" |
|---|
| 2 | #include "msa.h" |
|---|
| 3 | #include "objscore.h" |
|---|
| 4 | |
|---|
| 5 | #define TRACE 0 |
|---|
| 6 | |
|---|
| 7 | static void WindowSmooth(const SCORE Score[], unsigned uCount, unsigned uWindowLength, |
|---|
| 8 | SCORE SmoothScore[], double dCeil) |
|---|
| 9 | { |
|---|
| 10 | #define Ceil(x) ((SCORE) ((x) > dCeil ? dCeil : (x))) |
|---|
| 11 | |
|---|
| 12 | if (1 != uWindowLength%2) |
|---|
| 13 | Quit("WindowSmooth=%u must be odd", uWindowLength); |
|---|
| 14 | |
|---|
| 15 | if (uCount <= uWindowLength) |
|---|
| 16 | { |
|---|
| 17 | for (unsigned i = 0; i < uCount; ++i) |
|---|
| 18 | SmoothScore[i] = 0; |
|---|
| 19 | return; |
|---|
| 20 | } |
|---|
| 21 | |
|---|
| 22 | const unsigned w2 = uWindowLength/2; |
|---|
| 23 | for (unsigned i = 0; i < w2; ++i) |
|---|
| 24 | { |
|---|
| 25 | SmoothScore[i] = 0; |
|---|
| 26 | SmoothScore[uCount - i - 1] = 0; |
|---|
| 27 | } |
|---|
| 28 | |
|---|
| 29 | SCORE scoreWindowTotal = 0; |
|---|
| 30 | for (unsigned i = 0; i < uWindowLength; ++i) |
|---|
| 31 | { |
|---|
| 32 | scoreWindowTotal += Ceil(Score[i]); |
|---|
| 33 | } |
|---|
| 34 | |
|---|
| 35 | for (unsigned i = w2; ; ++i) |
|---|
| 36 | { |
|---|
| 37 | SmoothScore[i] = scoreWindowTotal/uWindowLength; |
|---|
| 38 | if (i == uCount - w2 - 1) |
|---|
| 39 | break; |
|---|
| 40 | |
|---|
| 41 | scoreWindowTotal -= Ceil(Score[i - w2]); |
|---|
| 42 | scoreWindowTotal += Ceil(Score[i + w2 + 1]); |
|---|
| 43 | } |
|---|
| 44 | #undef Ceil |
|---|
| 45 | } |
|---|
| 46 | |
|---|
| 47 | // Find columns that score above the given threshold. |
|---|
| 48 | // A range of scores is defined between the average |
|---|
| 49 | // and the maximum. The threshold is a fraction 0.0 .. 1.0 |
|---|
| 50 | // within that range, where 0.0 is the average score |
|---|
| 51 | // and 1.0 is the maximum score. |
|---|
| 52 | // "Grade" is by analogy with grading on a curve. |
|---|
| 53 | static void FindBestColsGrade(const SCORE Score[], unsigned uCount, |
|---|
| 54 | double dThreshold, unsigned BestCols[], unsigned *ptruBestColCount) |
|---|
| 55 | { |
|---|
| 56 | SCORE scoreTotal = 0; |
|---|
| 57 | for (unsigned uIndex = 0; uIndex < uCount; ++uIndex) |
|---|
| 58 | scoreTotal += Score[uIndex]; |
|---|
| 59 | const SCORE scoreAvg = scoreTotal / uCount; |
|---|
| 60 | |
|---|
| 61 | SCORE scoreMax = MINUS_INFINITY; |
|---|
| 62 | for (unsigned uIndex = 0; uIndex < uCount; ++uIndex) |
|---|
| 63 | if (Score[uIndex] > scoreMax) |
|---|
| 64 | scoreMax = Score[uIndex]; |
|---|
| 65 | |
|---|
| 66 | unsigned uBestColCount = 0; |
|---|
| 67 | for (unsigned uIndex = 0; uIndex < uCount; ++uIndex) |
|---|
| 68 | { |
|---|
| 69 | const SCORE s = Score[uIndex]; |
|---|
| 70 | const double dHeight = (s - scoreAvg)/(scoreMax - scoreAvg); |
|---|
| 71 | if (dHeight >= dThreshold) |
|---|
| 72 | { |
|---|
| 73 | BestCols[uBestColCount] = uIndex; |
|---|
| 74 | ++uBestColCount; |
|---|
| 75 | } |
|---|
| 76 | } |
|---|
| 77 | *ptruBestColCount = uBestColCount; |
|---|
| 78 | } |
|---|
| 79 | |
|---|
| 80 | // Best col only if all following criteria satisfied: |
|---|
| 81 | // (1) Score >= min |
|---|
| 82 | // (2) Smoothed score >= min |
|---|
| 83 | // (3) No gaps. |
|---|
| 84 | static void FindBestColsCombo(const MSA &msa, const SCORE Score[], |
|---|
| 85 | const SCORE SmoothScore[], double dMinScore, double dMinSmoothScore, |
|---|
| 86 | unsigned BestCols[], unsigned *ptruBestColCount) |
|---|
| 87 | { |
|---|
| 88 | const unsigned uColCount = msa.GetColCount(); |
|---|
| 89 | |
|---|
| 90 | unsigned uBestColCount = 0; |
|---|
| 91 | for (unsigned uIndex = 0; uIndex < uColCount; ++uIndex) |
|---|
| 92 | { |
|---|
| 93 | if (Score[uIndex] < dMinScore) |
|---|
| 94 | continue; |
|---|
| 95 | if (SmoothScore[uIndex] < dMinSmoothScore) |
|---|
| 96 | continue; |
|---|
| 97 | if (msa.ColumnHasGap(uIndex)) |
|---|
| 98 | continue; |
|---|
| 99 | BestCols[uBestColCount] = uIndex; |
|---|
| 100 | ++uBestColCount; |
|---|
| 101 | } |
|---|
| 102 | *ptruBestColCount = uBestColCount; |
|---|
| 103 | } |
|---|
| 104 | |
|---|
| 105 | static void ListBestCols(const MSA &msa, const SCORE Score[], const SCORE SmoothScore[], |
|---|
| 106 | unsigned BestCols[], unsigned uBestColCount) |
|---|
| 107 | { |
|---|
| 108 | const unsigned uColCount = msa.GetColCount(); |
|---|
| 109 | const unsigned uSeqCount = msa.GetSeqCount(); |
|---|
| 110 | |
|---|
| 111 | Log("Col "); |
|---|
| 112 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
|---|
| 113 | Log("%u", uSeqIndex%10); |
|---|
| 114 | Log(" "); |
|---|
| 115 | |
|---|
| 116 | for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) |
|---|
| 117 | { |
|---|
| 118 | Log("%3u ", uColIndex); |
|---|
| 119 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
|---|
| 120 | Log("%c", msa.GetChar(uSeqIndex, uColIndex)); |
|---|
| 121 | |
|---|
| 122 | Log(" %10.3f", Score[uColIndex]); |
|---|
| 123 | Log(" %10.3f", SmoothScore[uColIndex]); |
|---|
| 124 | |
|---|
| 125 | for (unsigned i = 0; i < uBestColCount; ++i) |
|---|
| 126 | if (BestCols[i] == uColIndex) |
|---|
| 127 | Log(" <-- Best"); |
|---|
| 128 | Log("\n"); |
|---|
| 129 | } |
|---|
| 130 | } |
|---|
| 131 | |
|---|
| 132 | // If two best columns are found within a window, choose |
|---|
| 133 | // the highest-scoring. If more than two, choose the one |
|---|
| 134 | // closest to the center of the window. |
|---|
| 135 | static void MergeBestCols(const SCORE Scores[], const unsigned BestCols[], |
|---|
| 136 | unsigned uBestColCount, unsigned uWindowLength, unsigned AnchorCols[], |
|---|
| 137 | unsigned *ptruAnchorColCount) |
|---|
| 138 | { |
|---|
| 139 | unsigned uAnchorColCount = 0; |
|---|
| 140 | for (unsigned n = 0; n < uBestColCount; /* update inside loop */) |
|---|
| 141 | { |
|---|
| 142 | unsigned uBestColIndex = BestCols[n]; |
|---|
| 143 | unsigned uCountWithinWindow = 0; |
|---|
| 144 | for (unsigned i = n + 1; i < uBestColCount; ++i) |
|---|
| 145 | { |
|---|
| 146 | unsigned uBestColIndex2 = BestCols[i]; |
|---|
| 147 | if (uBestColIndex2 - uBestColIndex >= uWindowLength) |
|---|
| 148 | break; |
|---|
| 149 | ++uCountWithinWindow; |
|---|
| 150 | } |
|---|
| 151 | unsigned uAnchorCol = uBestColIndex; |
|---|
| 152 | if (1 == uCountWithinWindow) |
|---|
| 153 | { |
|---|
| 154 | unsigned uBestColIndex2 = BestCols[n+1]; |
|---|
| 155 | if (Scores[uBestColIndex] > Scores[uBestColIndex2]) |
|---|
| 156 | uAnchorCol = uBestColIndex; |
|---|
| 157 | else |
|---|
| 158 | uAnchorCol = uBestColIndex2; |
|---|
| 159 | } |
|---|
| 160 | else if (uCountWithinWindow > 1) |
|---|
| 161 | { |
|---|
| 162 | unsigned uWindowCenter = uBestColIndex + uWindowLength/2; |
|---|
| 163 | int iClosestDist = uWindowLength; |
|---|
| 164 | unsigned uClosestCol = uBestColIndex; |
|---|
| 165 | for (unsigned i = n + 1; i < n + uCountWithinWindow; ++i) |
|---|
| 166 | { |
|---|
| 167 | unsigned uColIndex = BestCols[i]; |
|---|
| 168 | int iDist = uColIndex - uBestColIndex; |
|---|
| 169 | if (iDist < 0) |
|---|
| 170 | iDist = -iDist; |
|---|
| 171 | if (iDist < iClosestDist) |
|---|
| 172 | { |
|---|
| 173 | uClosestCol = uColIndex; |
|---|
| 174 | iClosestDist = iDist; |
|---|
| 175 | } |
|---|
| 176 | } |
|---|
| 177 | uAnchorCol = uClosestCol; |
|---|
| 178 | } |
|---|
| 179 | AnchorCols[uAnchorColCount] = uAnchorCol; |
|---|
| 180 | ++uAnchorColCount; |
|---|
| 181 | n += uCountWithinWindow + 1; |
|---|
| 182 | } |
|---|
| 183 | *ptruAnchorColCount = uAnchorColCount; |
|---|
| 184 | } |
|---|
| 185 | |
|---|
| 186 | void FindAnchorCols(const MSA &msa, unsigned AnchorCols[], |
|---|
| 187 | unsigned *ptruAnchorColCount) |
|---|
| 188 | { |
|---|
| 189 | const unsigned uColCount = msa.GetColCount(); |
|---|
| 190 | if (uColCount < 16) |
|---|
| 191 | { |
|---|
| 192 | *ptruAnchorColCount = 0; |
|---|
| 193 | return; |
|---|
| 194 | } |
|---|
| 195 | |
|---|
| 196 | SCORE *MatchScore = new SCORE[uColCount]; |
|---|
| 197 | SCORE *SmoothScore = new SCORE[uColCount]; |
|---|
| 198 | unsigned *BestCols = new unsigned[uColCount]; |
|---|
| 199 | |
|---|
| 200 | GetLetterScores(msa, MatchScore); |
|---|
| 201 | WindowSmooth(MatchScore, uColCount, g_uSmoothWindowLength, SmoothScore, |
|---|
| 202 | g_dSmoothScoreCeil); |
|---|
| 203 | |
|---|
| 204 | unsigned uBestColCount; |
|---|
| 205 | FindBestColsCombo(msa, MatchScore, SmoothScore, g_dMinBestColScore, g_dMinSmoothScore, |
|---|
| 206 | BestCols, &uBestColCount); |
|---|
| 207 | |
|---|
| 208 | #if TRACE |
|---|
| 209 | ListBestCols(msa, MatchScore, SmoothScore, BestCols, uBestColCount); |
|---|
| 210 | #endif |
|---|
| 211 | |
|---|
| 212 | MergeBestCols(MatchScore, BestCols, uBestColCount, g_uAnchorSpacing, AnchorCols, |
|---|
| 213 | ptruAnchorColCount); |
|---|
| 214 | |
|---|
| 215 | delete[] MatchScore; |
|---|
| 216 | delete[] SmoothScore; |
|---|
| 217 | delete[] BestCols; |
|---|
| 218 | } |
|---|