| 1 | #include "muscle.h" |
|---|
| 2 | #include <math.h> |
|---|
| 3 | #include <stdio.h> // for sprintf |
|---|
| 4 | #include "pwpath.h" |
|---|
| 5 | #include "profile.h" |
|---|
| 6 | #include "gapscoredimer.h" |
|---|
| 7 | |
|---|
| 8 | #define TRACE 0 |
|---|
| 9 | |
|---|
| 10 | static SCORE TraceBackDimer( const SCORE *DPM_, const SCORE *DPD_, const SCORE *DPI_, |
|---|
| 11 | const char *TBM_, const char *TBD_, const char *TBI_, |
|---|
| 12 | unsigned uLengthA, unsigned uLengthB, PWPath &Path); |
|---|
| 13 | |
|---|
| 14 | static const char *LocalScoreToStr(SCORE s) |
|---|
| 15 | { |
|---|
| 16 | static char str[16]; |
|---|
| 17 | if (MINUS_INFINITY == s) |
|---|
| 18 | return " *"; |
|---|
| 19 | sprintf(str, "%6.3g", s); |
|---|
| 20 | return str; |
|---|
| 21 | } |
|---|
| 22 | |
|---|
| 23 | #if TRACE |
|---|
| 24 | static void ListDP(const SCORE *DPM_, const ProfPos *PA, const ProfPos *PB, |
|---|
| 25 | unsigned uPrefixCountA, unsigned uPrefixCountB) |
|---|
| 26 | { |
|---|
| 27 | Log(" "); |
|---|
| 28 | for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) |
|---|
| 29 | { |
|---|
| 30 | char c = ' '; |
|---|
| 31 | if (uPrefixLengthB > 0) |
|---|
| 32 | c = ConsensusChar(PB[uPrefixLengthB - 1]); |
|---|
| 33 | Log(" %4u:%c", uPrefixLengthB, c); |
|---|
| 34 | } |
|---|
| 35 | Log("\n"); |
|---|
| 36 | for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) |
|---|
| 37 | { |
|---|
| 38 | char c = ' '; |
|---|
| 39 | if (uPrefixLengthA > 0) |
|---|
| 40 | c = ConsensusChar(PA[uPrefixLengthA - 1]); |
|---|
| 41 | Log("%4u:%c ", uPrefixLengthA, c); |
|---|
| 42 | for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) |
|---|
| 43 | Log(" %s", LocalScoreToStr(DPM(uPrefixLengthA, uPrefixLengthB))); |
|---|
| 44 | Log("\n"); |
|---|
| 45 | } |
|---|
| 46 | } |
|---|
| 47 | |
|---|
| 48 | static void ListTB(const char *TBM_, const ProfPos *PA, const ProfPos *PB, |
|---|
| 49 | unsigned uPrefixCountA, unsigned uPrefixCountB) |
|---|
| 50 | { |
|---|
| 51 | Log(" "); |
|---|
| 52 | for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) |
|---|
| 53 | Log("%2d", uPrefixLengthB); |
|---|
| 54 | Log("\n"); |
|---|
| 55 | Log(" "); |
|---|
| 56 | for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) |
|---|
| 57 | { |
|---|
| 58 | char c = ' '; |
|---|
| 59 | if (uPrefixLengthB > 0) |
|---|
| 60 | c = ConsensusChar(PB[uPrefixLengthB - 1]); |
|---|
| 61 | Log(" %c", c); |
|---|
| 62 | } |
|---|
| 63 | Log("\n"); |
|---|
| 64 | for (unsigned uPrefixLengthA = 0; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) |
|---|
| 65 | { |
|---|
| 66 | char c = ' '; |
|---|
| 67 | if (uPrefixLengthA > 0) |
|---|
| 68 | c = ConsensusChar(PA[uPrefixLengthA - 1]); |
|---|
| 69 | Log("%4u:%c ", uPrefixLengthA, c); |
|---|
| 70 | for (unsigned uPrefixLengthB = 0; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) |
|---|
| 71 | Log(" %c", TBM(uPrefixLengthA, uPrefixLengthB)); |
|---|
| 72 | Log("\n"); |
|---|
| 73 | } |
|---|
| 74 | } |
|---|
| 75 | #endif // TRACE |
|---|
| 76 | |
|---|
| 77 | static ProfPos PPTerm; |
|---|
| 78 | static bool InitializePPTerm() |
|---|
| 79 | { |
|---|
| 80 | PPTerm.m_bAllGaps = false; |
|---|
| 81 | PPTerm.m_LL = 1; |
|---|
| 82 | PPTerm.m_LG = 0; |
|---|
| 83 | PPTerm.m_GL = 0; |
|---|
| 84 | PPTerm.m_GG = 0; |
|---|
| 85 | PPTerm.m_fOcc = 1; |
|---|
| 86 | return true; |
|---|
| 87 | } |
|---|
| 88 | static bool PPTermInitialized = InitializePPTerm(); |
|---|
| 89 | |
|---|
| 90 | static SCORE ScoreProfPosDimerLE(const ProfPos &PPA, const ProfPos &PPB) |
|---|
| 91 | { |
|---|
| 92 | SCORE Score = 0; |
|---|
| 93 | for (unsigned n = 0; n < 20; ++n) |
|---|
| 94 | { |
|---|
| 95 | const unsigned uLetter = PPA.m_uSortOrder[n]; |
|---|
| 96 | const FCOUNT fcLetter = PPA.m_fcCounts[uLetter]; |
|---|
| 97 | if (0 == fcLetter) |
|---|
| 98 | break; |
|---|
| 99 | Score += fcLetter*PPB.m_AAScores[uLetter]; |
|---|
| 100 | } |
|---|
| 101 | if (0 == Score) |
|---|
| 102 | return -2.5; |
|---|
| 103 | SCORE logScore = logf(Score); |
|---|
| 104 | return (SCORE) (logScore*(PPA.m_fOcc * PPB.m_fOcc)); |
|---|
| 105 | } |
|---|
| 106 | |
|---|
| 107 | static SCORE ScoreProfPosDimerPSP(const ProfPos &PPA, const ProfPos &PPB) |
|---|
| 108 | { |
|---|
| 109 | SCORE Score = 0; |
|---|
| 110 | for (unsigned n = 0; n < 20; ++n) |
|---|
| 111 | { |
|---|
| 112 | const unsigned uLetter = PPA.m_uSortOrder[n]; |
|---|
| 113 | const FCOUNT fcLetter = PPA.m_fcCounts[uLetter]; |
|---|
| 114 | if (0 == fcLetter) |
|---|
| 115 | break; |
|---|
| 116 | Score += fcLetter*PPB.m_AAScores[uLetter]; |
|---|
| 117 | } |
|---|
| 118 | return Score; |
|---|
| 119 | } |
|---|
| 120 | |
|---|
| 121 | static SCORE ScoreProfPosDimer(const ProfPos &PPA, const ProfPos &PPB) |
|---|
| 122 | { |
|---|
| 123 | switch (g_PPScore) |
|---|
| 124 | { |
|---|
| 125 | case PPSCORE_LE: |
|---|
| 126 | return ScoreProfPosDimerLE(PPA, PPB); |
|---|
| 127 | |
|---|
| 128 | case PPSCORE_SP: |
|---|
| 129 | case PPSCORE_SV: |
|---|
| 130 | return ScoreProfPosDimerPSP(PPA, PPB); |
|---|
| 131 | } |
|---|
| 132 | Quit("Invalid g_PPScore"); |
|---|
| 133 | return 0; |
|---|
| 134 | } |
|---|
| 135 | |
|---|
| 136 | // Global alignment dynamic programming |
|---|
| 137 | // This variant optimizes the profile-profile SP score under the |
|---|
| 138 | // dimer approximation. |
|---|
| 139 | SCORE GlobalAlignDimer(const ProfPos *PA, unsigned uLengthA, const ProfPos *PB, |
|---|
| 140 | unsigned uLengthB, PWPath &Path) |
|---|
| 141 | { |
|---|
| 142 | assert(uLengthB > 0 && uLengthA > 0); |
|---|
| 143 | |
|---|
| 144 | const unsigned uPrefixCountA = uLengthA + 1; |
|---|
| 145 | const unsigned uPrefixCountB = uLengthB + 1; |
|---|
| 146 | |
|---|
| 147 | // Allocate DP matrices |
|---|
| 148 | const size_t LM = uPrefixCountA*uPrefixCountB; |
|---|
| 149 | SCORE *DPM_ = new SCORE[LM]; |
|---|
| 150 | SCORE *DPD_ = new SCORE[LM]; |
|---|
| 151 | SCORE *DPI_ = new SCORE[LM]; |
|---|
| 152 | |
|---|
| 153 | char *TBM_ = new char[LM]; |
|---|
| 154 | char *TBD_ = new char[LM]; |
|---|
| 155 | char *TBI_ = new char[LM]; |
|---|
| 156 | |
|---|
| 157 | DPM(0, 0) = 0; |
|---|
| 158 | DPD(0, 0) = MINUS_INFINITY; |
|---|
| 159 | DPI(0, 0) = MINUS_INFINITY; |
|---|
| 160 | |
|---|
| 161 | TBM(0, 0) = 'S'; |
|---|
| 162 | TBD(0, 0) = '?'; |
|---|
| 163 | TBI(0, 0) = '?'; |
|---|
| 164 | |
|---|
| 165 | DPM(1, 0) = MINUS_INFINITY; |
|---|
| 166 | DPD(1, 0) = GapScoreMD(PA[0], PPTerm); |
|---|
| 167 | DPI(1, 0) = MINUS_INFINITY; |
|---|
| 168 | |
|---|
| 169 | TBM(1, 0) = '?'; |
|---|
| 170 | TBD(1, 0) = 'S'; |
|---|
| 171 | TBI(1, 0) = '?'; |
|---|
| 172 | |
|---|
| 173 | DPM(0, 1) = MINUS_INFINITY; |
|---|
| 174 | DPD(0, 1) = MINUS_INFINITY; |
|---|
| 175 | DPI(0, 1) = GapScoreMI(PPTerm, PB[0]); |
|---|
| 176 | |
|---|
| 177 | TBM(0, 1) = '?'; |
|---|
| 178 | TBD(0, 1) = '?'; |
|---|
| 179 | TBI(0, 1) = 'S'; |
|---|
| 180 | |
|---|
| 181 | // Empty prefix of B is special case |
|---|
| 182 | for (unsigned uPrefixLengthA = 2; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) |
|---|
| 183 | { |
|---|
| 184 | // M=LetterA+LetterB, impossible with empty prefix |
|---|
| 185 | DPM(uPrefixLengthA, 0) = MINUS_INFINITY; |
|---|
| 186 | TBM(uPrefixLengthA, 0) = '?'; |
|---|
| 187 | |
|---|
| 188 | // D=LetterA+GapB |
|---|
| 189 | DPD(uPrefixLengthA, 0) = DPD(uPrefixLengthA - 1, 0) + |
|---|
| 190 | GapScoreDD(PA[uPrefixLengthA - 1], PPTerm); |
|---|
| 191 | TBD(uPrefixLengthA, 0) = 'D'; |
|---|
| 192 | |
|---|
| 193 | // I=GapA+LetterB, impossible with empty prefix |
|---|
| 194 | DPI(uPrefixLengthA, 0) = MINUS_INFINITY; |
|---|
| 195 | TBI(uPrefixLengthA, 0) = '?'; |
|---|
| 196 | } |
|---|
| 197 | |
|---|
| 198 | // Empty prefix of A is special case |
|---|
| 199 | for (unsigned uPrefixLengthB = 2; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) |
|---|
| 200 | { |
|---|
| 201 | // M=LetterA+LetterB, impossible with empty prefix |
|---|
| 202 | DPM(0, uPrefixLengthB) = MINUS_INFINITY; |
|---|
| 203 | TBM(0, uPrefixLengthB) = '?'; |
|---|
| 204 | |
|---|
| 205 | // D=LetterA+GapB, impossible with empty prefix |
|---|
| 206 | DPD(0, uPrefixLengthB) = MINUS_INFINITY; |
|---|
| 207 | TBD(0, uPrefixLengthB) = '?'; |
|---|
| 208 | |
|---|
| 209 | // I=GapA+LetterB |
|---|
| 210 | DPI(0, uPrefixLengthB) = DPI(0, uPrefixLengthB - 1) + |
|---|
| 211 | GapScoreII(PPTerm, PB[uPrefixLengthB - 1]); |
|---|
| 212 | TBI(0, uPrefixLengthB) = 'I'; |
|---|
| 213 | } |
|---|
| 214 | |
|---|
| 215 | // ============ |
|---|
| 216 | // Main DP loop |
|---|
| 217 | // ============ |
|---|
| 218 | for (unsigned uPrefixLengthB = 1; uPrefixLengthB < uPrefixCountB; ++uPrefixLengthB) |
|---|
| 219 | { |
|---|
| 220 | const ProfPos &PPB = PB[uPrefixLengthB - 1]; |
|---|
| 221 | for (unsigned uPrefixLengthA = 1; uPrefixLengthA < uPrefixCountA; ++uPrefixLengthA) |
|---|
| 222 | { |
|---|
| 223 | const ProfPos &PPA = PA[uPrefixLengthA - 1]; |
|---|
| 224 | { |
|---|
| 225 | // Match M=LetterA+LetterB |
|---|
| 226 | SCORE scoreLL = ScoreProfPosDimer(PPA, PPB); |
|---|
| 227 | |
|---|
| 228 | SCORE scoreMM = DPM(uPrefixLengthA-1, uPrefixLengthB-1) + GapScoreMM(PPA, PPB); |
|---|
| 229 | SCORE scoreDM = DPD(uPrefixLengthA-1, uPrefixLengthB-1) + GapScoreDM(PPA, PPB); |
|---|
| 230 | SCORE scoreIM = DPI(uPrefixLengthA-1, uPrefixLengthB-1) + GapScoreIM(PPA, PPB); |
|---|
| 231 | |
|---|
| 232 | SCORE scoreBest = scoreMM; |
|---|
| 233 | char c = 'M'; |
|---|
| 234 | if (scoreDM > scoreBest) |
|---|
| 235 | { |
|---|
| 236 | scoreBest = scoreDM; |
|---|
| 237 | c = 'D'; |
|---|
| 238 | } |
|---|
| 239 | if (scoreIM > scoreBest) |
|---|
| 240 | { |
|---|
| 241 | scoreBest = scoreIM; |
|---|
| 242 | c = 'I'; |
|---|
| 243 | } |
|---|
| 244 | |
|---|
| 245 | DPM(uPrefixLengthA, uPrefixLengthB) = scoreBest + scoreLL; |
|---|
| 246 | TBM(uPrefixLengthA, uPrefixLengthB) = c; |
|---|
| 247 | } |
|---|
| 248 | { |
|---|
| 249 | // Delete D=LetterA+GapB |
|---|
| 250 | SCORE scoreMD = DPM(uPrefixLengthA-1, uPrefixLengthB) + GapScoreMD(PPA, PPB); |
|---|
| 251 | SCORE scoreDD = DPD(uPrefixLengthA-1, uPrefixLengthB) + GapScoreDD(PPA, PPB); |
|---|
| 252 | SCORE scoreID = DPI(uPrefixLengthA-1, uPrefixLengthB) + GapScoreID(PPA, PPB); |
|---|
| 253 | |
|---|
| 254 | SCORE scoreBest = scoreMD; |
|---|
| 255 | char c = 'M'; |
|---|
| 256 | if (scoreDD > scoreBest) |
|---|
| 257 | { |
|---|
| 258 | scoreBest = scoreDD; |
|---|
| 259 | c = 'D'; |
|---|
| 260 | } |
|---|
| 261 | if (scoreID > scoreBest) |
|---|
| 262 | { |
|---|
| 263 | scoreBest = scoreID; |
|---|
| 264 | c = 'I'; |
|---|
| 265 | } |
|---|
| 266 | |
|---|
| 267 | DPD(uPrefixLengthA, uPrefixLengthB) = scoreBest; |
|---|
| 268 | TBD(uPrefixLengthA, uPrefixLengthB) = c; |
|---|
| 269 | } |
|---|
| 270 | { |
|---|
| 271 | // Insert I=GapA+LetterB |
|---|
| 272 | SCORE scoreMI = DPM(uPrefixLengthA, uPrefixLengthB-1) + GapScoreMI(PPA, PPB); |
|---|
| 273 | SCORE scoreDI = DPD(uPrefixLengthA, uPrefixLengthB-1) + GapScoreDI(PPA, PPB); |
|---|
| 274 | SCORE scoreII = DPI(uPrefixLengthA, uPrefixLengthB-1) + GapScoreII(PPA, PPB); |
|---|
| 275 | |
|---|
| 276 | SCORE scoreBest = scoreMI; |
|---|
| 277 | char c = 'M'; |
|---|
| 278 | if (scoreDI > scoreBest) |
|---|
| 279 | { |
|---|
| 280 | scoreBest = scoreDI; |
|---|
| 281 | c = 'D'; |
|---|
| 282 | } |
|---|
| 283 | if (scoreII > scoreBest) |
|---|
| 284 | { |
|---|
| 285 | scoreBest = scoreII; |
|---|
| 286 | c = 'I'; |
|---|
| 287 | } |
|---|
| 288 | |
|---|
| 289 | DPI(uPrefixLengthA, uPrefixLengthB) = scoreBest; |
|---|
| 290 | TBI(uPrefixLengthA, uPrefixLengthB) = c; |
|---|
| 291 | } |
|---|
| 292 | } |
|---|
| 293 | } |
|---|
| 294 | |
|---|
| 295 | #if TRACE |
|---|
| 296 | Log("DPM:\n"); |
|---|
| 297 | ListDP(DPM_, PA, PB, uPrefixCountA, uPrefixCountB); |
|---|
| 298 | Log("DPD:\n"); |
|---|
| 299 | ListDP(DPD_, PA, PB, uPrefixCountA, uPrefixCountB); |
|---|
| 300 | Log("DPI:\n"); |
|---|
| 301 | ListDP(DPI_, PA, PB, uPrefixCountA, uPrefixCountB); |
|---|
| 302 | Log("TBM:\n"); |
|---|
| 303 | ListTB(TBM_, PA, PB, uPrefixCountA, uPrefixCountB); |
|---|
| 304 | Log("TBD:\n"); |
|---|
| 305 | ListTB(TBD_, PA, PB, uPrefixCountA, uPrefixCountB); |
|---|
| 306 | Log("TBI:\n"); |
|---|
| 307 | ListTB(TBI_, PA, PB, uPrefixCountA, uPrefixCountB); |
|---|
| 308 | #endif |
|---|
| 309 | |
|---|
| 310 | SCORE Score = TraceBackDimer(DPM_, DPD_, DPI_, TBM_, TBD_, TBI_, |
|---|
| 311 | uLengthA, uLengthB, Path); |
|---|
| 312 | |
|---|
| 313 | #if TRACE |
|---|
| 314 | Log("GlobalAlignDimer score = %.3g\n", Score); |
|---|
| 315 | #endif |
|---|
| 316 | |
|---|
| 317 | delete[] DPM_; |
|---|
| 318 | delete[] DPD_; |
|---|
| 319 | delete[] DPI_; |
|---|
| 320 | |
|---|
| 321 | delete[] TBM_; |
|---|
| 322 | delete[] TBD_; |
|---|
| 323 | delete[] TBI_; |
|---|
| 324 | |
|---|
| 325 | return Score; |
|---|
| 326 | } |
|---|
| 327 | |
|---|
| 328 | static SCORE TraceBackDimer( const SCORE *DPM_, const SCORE *DPD_, const SCORE *DPI_, |
|---|
| 329 | const char *TBM_, const char *TBD_, const char *TBI_, |
|---|
| 330 | unsigned uLengthA, unsigned uLengthB, PWPath &Path) |
|---|
| 331 | { |
|---|
| 332 | const unsigned uPrefixCountA = uLengthA + 1; |
|---|
| 333 | |
|---|
| 334 | unsigned uPrefixLengthA = uLengthA; |
|---|
| 335 | unsigned uPrefixLengthB = uLengthB; |
|---|
| 336 | |
|---|
| 337 | char cEdge = 'M'; |
|---|
| 338 | SCORE scoreMax = DPM(uLengthA, uLengthB); |
|---|
| 339 | if (DPD(uLengthA, uLengthB) > scoreMax) |
|---|
| 340 | { |
|---|
| 341 | scoreMax = DPD(uLengthA, uLengthB); |
|---|
| 342 | cEdge = 'D'; |
|---|
| 343 | } |
|---|
| 344 | if (DPI(uLengthA, uLengthB) > scoreMax) |
|---|
| 345 | { |
|---|
| 346 | scoreMax = DPI(uLengthA, uLengthB); |
|---|
| 347 | cEdge = 'I'; |
|---|
| 348 | } |
|---|
| 349 | |
|---|
| 350 | for (;;) |
|---|
| 351 | { |
|---|
| 352 | if (0 == uPrefixLengthA && 0 == uPrefixLengthB) |
|---|
| 353 | break; |
|---|
| 354 | |
|---|
| 355 | PWEdge Edge; |
|---|
| 356 | Edge.cType = cEdge; |
|---|
| 357 | Edge.uPrefixLengthA = uPrefixLengthA; |
|---|
| 358 | Edge.uPrefixLengthB = uPrefixLengthB; |
|---|
| 359 | Path.PrependEdge(Edge); |
|---|
| 360 | |
|---|
| 361 | #if TRACE |
|---|
| 362 | Log("PLA=%u PLB=%u Edge=%c\n", uPrefixLengthA, uPrefixLengthB, cEdge); |
|---|
| 363 | #endif |
|---|
| 364 | switch (cEdge) |
|---|
| 365 | { |
|---|
| 366 | case 'M': |
|---|
| 367 | assert(uPrefixLengthA > 0 && uPrefixLengthB > 0); |
|---|
| 368 | cEdge = TBM(uPrefixLengthA, uPrefixLengthB); |
|---|
| 369 | --uPrefixLengthA; |
|---|
| 370 | --uPrefixLengthB; |
|---|
| 371 | break; |
|---|
| 372 | case 'D': |
|---|
| 373 | assert(uPrefixLengthA > 0); |
|---|
| 374 | cEdge = TBD(uPrefixLengthA, uPrefixLengthB); |
|---|
| 375 | --uPrefixLengthA; |
|---|
| 376 | break; |
|---|
| 377 | case 'I': |
|---|
| 378 | assert(uPrefixLengthB > 0); |
|---|
| 379 | cEdge = TBI(uPrefixLengthA, uPrefixLengthB); |
|---|
| 380 | --uPrefixLengthB; |
|---|
| 381 | break; |
|---|
| 382 | default: |
|---|
| 383 | Quit("Invalid edge PLA=%u PLB=%u %c", uPrefixLengthA, uPrefixLengthB, cEdge); |
|---|
| 384 | } |
|---|
| 385 | } |
|---|
| 386 | #if TRACE |
|---|
| 387 | Path.LogMe(); |
|---|
| 388 | #endif |
|---|
| 389 | return scoreMax; |
|---|
| 390 | } |
|---|