| 1 | ///////////////////////////////////////////////////////////////// |
|---|
| 2 | // ProbabilisticModel.h |
|---|
| 3 | // |
|---|
| 4 | // Routines for (1) posterior probability computations |
|---|
| 5 | // (2) chained anchoring |
|---|
| 6 | // (3) maximum weight trace alignment |
|---|
| 7 | ///////////////////////////////////////////////////////////////// |
|---|
| 8 | |
|---|
| 9 | #ifndef PROBABILISTICMODEL_H |
|---|
| 10 | #define PROBABILISTICMODEL_H |
|---|
| 11 | |
|---|
| 12 | #include <list> |
|---|
| 13 | #include <cmath> |
|---|
| 14 | #include <cstdio> |
|---|
| 15 | #include "SafeVector.h" |
|---|
| 16 | #include "ScoreType.h" |
|---|
| 17 | #include "SparseMatrix.h" |
|---|
| 18 | #include "MultiSequence.h" |
|---|
| 19 | #include "StemCandidate.hpp" |
|---|
| 20 | #include "scarna.hpp" |
|---|
| 21 | #include "nrutil.h" |
|---|
| 22 | #include <vector> |
|---|
| 23 | |
|---|
| 24 | using namespace std; |
|---|
| 25 | |
|---|
| 26 | const int NumMatchStates = 1; // note that in this version the number |
|---|
| 27 | // of match states is fixed at 1...will |
|---|
| 28 | // change in future versions |
|---|
| 29 | const int NumMatrixTypes = NumMatchStates + NumInsertStates * 2; |
|---|
| 30 | |
|---|
| 31 | ///////////////////////////////////////////////////////////////// |
|---|
| 32 | // ProbabilisticModel |
|---|
| 33 | // |
|---|
| 34 | // Class for storing the parameters of a probabilistic model and |
|---|
| 35 | // performing different computations based on those parameters. |
|---|
| 36 | // In particular, this class handles the computation of |
|---|
| 37 | // posterior probabilities that may be used in alignment. |
|---|
| 38 | ///////////////////////////////////////////////////////////////// |
|---|
| 39 | namespace MXSCARNA { |
|---|
| 40 | class ProbabilisticModel { |
|---|
| 41 | |
|---|
| 42 | float initialDistribution[NumMatrixTypes]; // holds the initial probabilities for each state |
|---|
| 43 | float transProb[NumMatrixTypes][NumMatrixTypes]; // holds all state-to-state transition probabilities |
|---|
| 44 | float matchProb[256][256]; // emission probabilities for match states |
|---|
| 45 | float insProb[256][NumMatrixTypes]; // emission probabilities for insert states |
|---|
| 46 | NRMat<float> WM; |
|---|
| 47 | |
|---|
| 48 | public: |
|---|
| 49 | |
|---|
| 50 | ///////////////////////////////////////////////////////////////// |
|---|
| 51 | // ProbabilisticModel::ProbabilisticModel() |
|---|
| 52 | // |
|---|
| 53 | // Constructor. Builds a new probabilistic model using the |
|---|
| 54 | // given parameters. |
|---|
| 55 | ///////////////////////////////////////////////////////////////// |
|---|
| 56 | |
|---|
| 57 | ProbabilisticModel (const VF &initDistribMat, const VF &gapOpen, const VF &gapExtend, |
|---|
| 58 | const VVF &emitPairs, const VF &emitSingle){ |
|---|
| 59 | |
|---|
| 60 | // build transition matrix |
|---|
| 61 | VVF transMat (NumMatrixTypes, VF (NumMatrixTypes, 0.0f)); |
|---|
| 62 | transMat[0][0] = 1; |
|---|
| 63 | for (int i = 0; i < NumInsertStates; i++){ |
|---|
| 64 | transMat[0][2*i+1] = gapOpen[2*i]; |
|---|
| 65 | transMat[0][2*i+2] = gapOpen[2*i+1]; |
|---|
| 66 | transMat[0][0] -= (gapOpen[2*i] + gapOpen[2*i+1]); |
|---|
| 67 | assert (transMat[0][0] > 0); |
|---|
| 68 | transMat[2*i+1][2*i+1] = gapExtend[2*i]; |
|---|
| 69 | transMat[2*i+2][2*i+2] = gapExtend[2*i+1]; |
|---|
| 70 | transMat[2*i+1][2*i+2] = 0; |
|---|
| 71 | transMat[2*i+2][2*i+1] = 0; |
|---|
| 72 | transMat[2*i+1][0] = 1 - gapExtend[2*i]; |
|---|
| 73 | transMat[2*i+2][0] = 1 - gapExtend[2*i+1]; |
|---|
| 74 | } |
|---|
| 75 | |
|---|
| 76 | // create initial and transition probability matrices |
|---|
| 77 | for (int i = 0; i < NumMatrixTypes; i++){ |
|---|
| 78 | initialDistribution[i] = LOG (initDistribMat[i]); |
|---|
| 79 | for (int j = 0; j < NumMatrixTypes; j++) |
|---|
| 80 | transProb[i][j] = LOG (transMat[i][j]); |
|---|
| 81 | } |
|---|
| 82 | |
|---|
| 83 | // create insertion and match probability matrices |
|---|
| 84 | for (int i = 0; i < 256; i++){ |
|---|
| 85 | for (int j = 0; j < NumMatrixTypes; j++) |
|---|
| 86 | insProb[i][j] = LOG (emitSingle[i]); |
|---|
| 87 | for (int j = 0; j < 256; j++) |
|---|
| 88 | matchProb[i][j] = LOG (emitPairs[i][j]); |
|---|
| 89 | } |
|---|
| 90 | } |
|---|
| 91 | |
|---|
| 92 | NRMat<float> weightMatchScore(std::vector<StemCandidate> *pscs1, std::vector<StemCandidate> *pscs2, |
|---|
| 93 | std::vector<int> *matchPSCS1, std::vector<int> *matchPSCS2, NRMat<float> WM) { |
|---|
| 94 | int len = WORDLENGTH; |
|---|
| 95 | int size = matchPSCS1->size(); |
|---|
| 96 | float weight = 1000; |
|---|
| 97 | |
|---|
| 98 | for(int iter = 0; iter < size; iter++) { |
|---|
| 99 | int i = matchPSCS1->at(iter); |
|---|
| 100 | int j = matchPSCS2->at(iter); |
|---|
| 101 | |
|---|
| 102 | const StemCandidate &sc1 = pscs1->at(i); |
|---|
| 103 | const StemCandidate &sc2 = pscs2->at(j); |
|---|
| 104 | |
|---|
| 105 | for(int k = 0; k < len; k++) { |
|---|
| 106 | WM[sc1.GetPosition() + k][sc2.GetPosition() + k] += weight; |
|---|
| 107 | // sumWeight += weight; |
|---|
| 108 | } |
|---|
| 109 | } |
|---|
| 110 | return WM; |
|---|
| 111 | } |
|---|
| 112 | |
|---|
| 113 | ///////////////////////////////////////////////////////////////// |
|---|
| 114 | // ProbabilisticModel::ComputeForwardMatrix() |
|---|
| 115 | // |
|---|
| 116 | // Computes a set of forward probability matrices for aligning |
|---|
| 117 | // seq1 and seq2. |
|---|
| 118 | // |
|---|
| 119 | // For efficiency reasons, a single-dimensional floating-point |
|---|
| 120 | // array is used here, with the following indexing scheme: |
|---|
| 121 | // |
|---|
| 122 | // forward[i + NumMatrixTypes * (j * (seq2Length+1) + k)] |
|---|
| 123 | // refers to the probability of aligning through j characters |
|---|
| 124 | // of the first sequence, k characters of the second sequence, |
|---|
| 125 | // and ending in state i. |
|---|
| 126 | ///////////////////////////////////////////////////////////////// |
|---|
| 127 | |
|---|
| 128 | VF *ComputeForwardMatrix (Sequence *seq1, Sequence *seq2) const { |
|---|
| 129 | |
|---|
| 130 | assert (seq1); |
|---|
| 131 | assert (seq2); |
|---|
| 132 | |
|---|
| 133 | const int seq1Length = seq1->GetLength(); |
|---|
| 134 | const int seq2Length = seq2->GetLength(); |
|---|
| 135 | |
|---|
| 136 | // retrieve the points to the beginning of each sequence |
|---|
| 137 | SafeVector<char>::iterator iter1 = seq1->GetDataPtr(); |
|---|
| 138 | SafeVector<char>::iterator iter2 = seq2->GetDataPtr(); |
|---|
| 139 | |
|---|
| 140 | // create matrix |
|---|
| 141 | VF *forwardPtr = new VF (NumMatrixTypes * (seq1Length+1) * (seq2Length+1), LOG_ZERO); |
|---|
| 142 | assert (forwardPtr); |
|---|
| 143 | VF &forward = *forwardPtr; |
|---|
| 144 | |
|---|
| 145 | // initialization condition |
|---|
| 146 | forward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)] = |
|---|
| 147 | initialDistribution[0] + matchProb[(unsigned char) iter1[1]][(unsigned char) iter2[1]]; |
|---|
| 148 | |
|---|
| 149 | for (int k = 0; k < NumInsertStates; k++){ |
|---|
| 150 | forward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)] = |
|---|
| 151 | initialDistribution[2*k+1] + insProb[(unsigned char) iter1[1]][k]; |
|---|
| 152 | forward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)] = |
|---|
| 153 | initialDistribution[2*k+2] + insProb[(unsigned char) iter2[1]][k]; |
|---|
| 154 | } |
|---|
| 155 | |
|---|
| 156 | // remember offset for each index combination |
|---|
| 157 | int ij = 0; |
|---|
| 158 | int i1j = -seq2Length - 1; |
|---|
| 159 | int ij1 = -1; |
|---|
| 160 | int i1j1 = -seq2Length - 2; |
|---|
| 161 | |
|---|
| 162 | ij *= NumMatrixTypes; |
|---|
| 163 | i1j *= NumMatrixTypes; |
|---|
| 164 | ij1 *= NumMatrixTypes; |
|---|
| 165 | i1j1 *= NumMatrixTypes; |
|---|
| 166 | |
|---|
| 167 | // compute forward scores |
|---|
| 168 | for (int i = 0; i <= seq1Length; i++){ |
|---|
| 169 | unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i]; |
|---|
| 170 | for (int j = 0; j <= seq2Length; j++){ |
|---|
| 171 | unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j]; |
|---|
| 172 | |
|---|
| 173 | if (i > 1 || j > 1){ |
|---|
| 174 | if (i > 0 && j > 0){ |
|---|
| 175 | forward[0 + ij] = forward[0 + i1j1] + transProb[0][0]; |
|---|
| 176 | for (int k = 1; k < NumMatrixTypes; k++) |
|---|
| 177 | LOG_PLUS_EQUALS (forward[0 + ij], forward[k + i1j1] + transProb[k][0]); |
|---|
| 178 | forward[0 + ij] += matchProb[c1][c2]; |
|---|
| 179 | } |
|---|
| 180 | if (i > 0){ |
|---|
| 181 | for (int k = 0; k < NumInsertStates; k++) |
|---|
| 182 | forward[2*k+1 + ij] = insProb[c1][k] + |
|---|
| 183 | LOG_ADD (forward[0 + i1j] + transProb[0][2*k+1], |
|---|
| 184 | forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1]); |
|---|
| 185 | } |
|---|
| 186 | if (j > 0){ |
|---|
| 187 | for (int k = 0; k < NumInsertStates; k++) |
|---|
| 188 | forward[2*k+2 + ij] = insProb[c2][k] + |
|---|
| 189 | LOG_ADD (forward[0 + ij1] + transProb[0][2*k+2], |
|---|
| 190 | forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2]); |
|---|
| 191 | } |
|---|
| 192 | } |
|---|
| 193 | |
|---|
| 194 | ij += NumMatrixTypes; |
|---|
| 195 | i1j += NumMatrixTypes; |
|---|
| 196 | ij1 += NumMatrixTypes; |
|---|
| 197 | i1j1 += NumMatrixTypes; |
|---|
| 198 | } |
|---|
| 199 | } |
|---|
| 200 | |
|---|
| 201 | return forwardPtr; |
|---|
| 202 | } |
|---|
| 203 | |
|---|
| 204 | ///////////////////////////////////////////////////////////////// |
|---|
| 205 | // ProbabilisticModel::ComputeBackwardMatrix() |
|---|
| 206 | // |
|---|
| 207 | // Computes a set of backward probability matrices for aligning |
|---|
| 208 | // seq1 and seq2. |
|---|
| 209 | // |
|---|
| 210 | // For efficiency reasons, a single-dimensional floating-point |
|---|
| 211 | // array is used here, with the following indexing scheme: |
|---|
| 212 | // |
|---|
| 213 | // backward[i + NumMatrixTypes * (j * (seq2Length+1) + k)] |
|---|
| 214 | // refers to the probability of starting in state i and |
|---|
| 215 | // aligning from character j+1 to the end of the first |
|---|
| 216 | // sequence and from character k+1 to the end of the second |
|---|
| 217 | // sequence. |
|---|
| 218 | ///////////////////////////////////////////////////////////////// |
|---|
| 219 | |
|---|
| 220 | VF *ComputeBackwardMatrix (Sequence *seq1, Sequence *seq2) const { |
|---|
| 221 | |
|---|
| 222 | assert (seq1); |
|---|
| 223 | assert (seq2); |
|---|
| 224 | |
|---|
| 225 | const int seq1Length = seq1->GetLength(); |
|---|
| 226 | const int seq2Length = seq2->GetLength(); |
|---|
| 227 | SafeVector<char>::iterator iter1 = seq1->GetDataPtr(); |
|---|
| 228 | SafeVector<char>::iterator iter2 = seq2->GetDataPtr(); |
|---|
| 229 | |
|---|
| 230 | // create matrix |
|---|
| 231 | VF *backwardPtr = new VF (NumMatrixTypes * (seq1Length+1) * (seq2Length+1), LOG_ZERO); |
|---|
| 232 | assert (backwardPtr); |
|---|
| 233 | VF &backward = *backwardPtr; |
|---|
| 234 | |
|---|
| 235 | // initialization condition |
|---|
| 236 | for (int k = 0; k < NumMatrixTypes; k++) |
|---|
| 237 | backward[NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1) + k] = initialDistribution[k]; |
|---|
| 238 | |
|---|
| 239 | // remember offset for each index combination |
|---|
| 240 | int ij = (seq1Length+1) * (seq2Length+1) - 1; |
|---|
| 241 | int i1j = ij + seq2Length + 1; |
|---|
| 242 | int ij1 = ij + 1; |
|---|
| 243 | int i1j1 = ij + seq2Length + 2; |
|---|
| 244 | |
|---|
| 245 | ij *= NumMatrixTypes; |
|---|
| 246 | i1j *= NumMatrixTypes; |
|---|
| 247 | ij1 *= NumMatrixTypes; |
|---|
| 248 | i1j1 *= NumMatrixTypes; |
|---|
| 249 | |
|---|
| 250 | // compute backward scores |
|---|
| 251 | for (int i = seq1Length; i >= 0; i--){ |
|---|
| 252 | unsigned char c1 = (i == seq1Length) ? '~' : (unsigned char) iter1[i+1]; |
|---|
| 253 | for (int j = seq2Length; j >= 0; j--){ |
|---|
| 254 | unsigned char c2 = (j == seq2Length) ? '~' : (unsigned char) iter2[j+1]; |
|---|
| 255 | |
|---|
| 256 | if (i < seq1Length && j < seq2Length){ |
|---|
| 257 | const float ProbXY = backward[0 + i1j1] + matchProb[c1][c2]; |
|---|
| 258 | for (int k = 0; k < NumMatrixTypes; k++) |
|---|
| 259 | LOG_PLUS_EQUALS (backward[k + ij], ProbXY + transProb[k][0]); |
|---|
| 260 | } |
|---|
| 261 | if (i < seq1Length){ |
|---|
| 262 | for (int k = 0; k < NumInsertStates; k++){ |
|---|
| 263 | LOG_PLUS_EQUALS (backward[0 + ij], backward[2*k+1 + i1j] + insProb[c1][k] + transProb[0][2*k+1]); |
|---|
| 264 | LOG_PLUS_EQUALS (backward[2*k+1 + ij], backward[2*k+1 + i1j] + insProb[c1][k] + transProb[2*k+1][2*k+1]); |
|---|
| 265 | } |
|---|
| 266 | } |
|---|
| 267 | if (j < seq2Length){ |
|---|
| 268 | for (int k = 0; k < NumInsertStates; k++){ |
|---|
| 269 | LOG_PLUS_EQUALS (backward[0 + ij], backward[2*k+2 + ij1] + insProb[c2][k] + transProb[0][2*k+2]); |
|---|
| 270 | LOG_PLUS_EQUALS (backward[2*k+2 + ij], backward[2*k+2 + ij1] + insProb[c2][k] + transProb[2*k+2][2*k+2]); |
|---|
| 271 | } |
|---|
| 272 | } |
|---|
| 273 | |
|---|
| 274 | ij -= NumMatrixTypes; |
|---|
| 275 | i1j -= NumMatrixTypes; |
|---|
| 276 | ij1 -= NumMatrixTypes; |
|---|
| 277 | i1j1 -= NumMatrixTypes; |
|---|
| 278 | } |
|---|
| 279 | } |
|---|
| 280 | |
|---|
| 281 | return backwardPtr; |
|---|
| 282 | } |
|---|
| 283 | |
|---|
| 284 | ///////////////////////////////////////////////////////////////// |
|---|
| 285 | // ProbabilisticModel::ComputeTotalProbability() |
|---|
| 286 | // |
|---|
| 287 | // Computes the total probability of an alignment given |
|---|
| 288 | // the forward and backward matrices. |
|---|
| 289 | ///////////////////////////////////////////////////////////////// |
|---|
| 290 | |
|---|
| 291 | float ComputeTotalProbability (int seq1Length, int seq2Length, |
|---|
| 292 | const VF &forward, const VF &backward) const { |
|---|
| 293 | |
|---|
| 294 | // compute total probability |
|---|
| 295 | float totalForwardProb = LOG_ZERO; |
|---|
| 296 | float totalBackwardProb = LOG_ZERO; |
|---|
| 297 | for (int k = 0; k < NumMatrixTypes; k++){ |
|---|
| 298 | LOG_PLUS_EQUALS (totalForwardProb, |
|---|
| 299 | forward[k + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] + |
|---|
| 300 | backward[k + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]); |
|---|
| 301 | } |
|---|
| 302 | |
|---|
| 303 | totalBackwardProb = |
|---|
| 304 | forward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)] + |
|---|
| 305 | backward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)]; |
|---|
| 306 | |
|---|
| 307 | for (int k = 0; k < NumInsertStates; k++){ |
|---|
| 308 | LOG_PLUS_EQUALS (totalBackwardProb, |
|---|
| 309 | forward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)] + |
|---|
| 310 | backward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)]); |
|---|
| 311 | LOG_PLUS_EQUALS (totalBackwardProb, |
|---|
| 312 | forward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)] + |
|---|
| 313 | backward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)]); |
|---|
| 314 | } |
|---|
| 315 | |
|---|
| 316 | // cerr << totalForwardProb << " " << totalBackwardProb << endl; |
|---|
| 317 | |
|---|
| 318 | return (totalForwardProb + totalBackwardProb) / 2; |
|---|
| 319 | } |
|---|
| 320 | |
|---|
| 321 | ///////////////////////////////////////////////////////////////// |
|---|
| 322 | // ProbabilisticModel::ComputePosteriorMatrix() |
|---|
| 323 | // |
|---|
| 324 | // Computes the posterior probability matrix based on |
|---|
| 325 | // the forward and backward matrices. |
|---|
| 326 | ///////////////////////////////////////////////////////////////// |
|---|
| 327 | |
|---|
| 328 | VF *ComputePosteriorMatrix (Sequence *seq1, Sequence *seq2, |
|---|
| 329 | const VF &forward, const VF &backward) const { |
|---|
| 330 | |
|---|
| 331 | assert (seq1); |
|---|
| 332 | assert (seq2); |
|---|
| 333 | |
|---|
| 334 | const int seq1Length = seq1->GetLength(); |
|---|
| 335 | const int seq2Length = seq2->GetLength(); |
|---|
| 336 | |
|---|
| 337 | float totalProb = ComputeTotalProbability (seq1Length, seq2Length, |
|---|
| 338 | forward, backward); |
|---|
| 339 | |
|---|
| 340 | // compute posterior matrices |
|---|
| 341 | VF *posteriorPtr = new VF((seq1Length+1) * (seq2Length+1)); assert (posteriorPtr); |
|---|
| 342 | VF &posterior = *posteriorPtr; |
|---|
| 343 | |
|---|
| 344 | int ij = 0; |
|---|
| 345 | VF::iterator ptr = posterior.begin(); |
|---|
| 346 | |
|---|
| 347 | for (int i = 0; i <= seq1Length; i++){ |
|---|
| 348 | for (int j = 0; j <= seq2Length; j++){ |
|---|
| 349 | *(ptr++) = EXP (min (LOG_ONE, forward[ij] + backward[ij] - totalProb)); |
|---|
| 350 | ij += NumMatrixTypes; |
|---|
| 351 | } |
|---|
| 352 | } |
|---|
| 353 | |
|---|
| 354 | posterior[0] = 0; |
|---|
| 355 | |
|---|
| 356 | return posteriorPtr; |
|---|
| 357 | } |
|---|
| 358 | |
|---|
| 359 | /* |
|---|
| 360 | ///////////////////////////////////////////////////////////////// |
|---|
| 361 | // ProbabilisticModel::ComputeExpectedCounts() |
|---|
| 362 | // |
|---|
| 363 | // Computes the expected counts for the various transitions. |
|---|
| 364 | ///////////////////////////////////////////////////////////////// |
|---|
| 365 | |
|---|
| 366 | VVF *ComputeExpectedCounts () const { |
|---|
| 367 | |
|---|
| 368 | assert (seq1); |
|---|
| 369 | assert (seq2); |
|---|
| 370 | |
|---|
| 371 | const int seq1Length = seq1->GetLength(); |
|---|
| 372 | const int seq2Length = seq2->GetLength(); |
|---|
| 373 | SafeVector<char>::iterator iter1 = seq1->GetDataPtr(); |
|---|
| 374 | SafeVector<char>::iterator iter2 = seq2->GetDataPtr(); |
|---|
| 375 | |
|---|
| 376 | // compute total probability |
|---|
| 377 | float totalProb = ComputeTotalProbability (seq1Length, seq2Length, |
|---|
| 378 | forward, backward); |
|---|
| 379 | |
|---|
| 380 | // initialize expected counts |
|---|
| 381 | VVF *countsPtr = new VVF(NumMatrixTypes + 1, VF(NumMatrixTypes, LOG_ZERO)); assert (countsPtr); |
|---|
| 382 | VVF &counts = *countsPtr; |
|---|
| 383 | |
|---|
| 384 | // remember offset for each index combination |
|---|
| 385 | int ij = 0; |
|---|
| 386 | int i1j = -seq2Length - 1; |
|---|
| 387 | int ij1 = -1; |
|---|
| 388 | int i1j1 = -seq2Length - 2; |
|---|
| 389 | |
|---|
| 390 | ij *= NumMatrixTypes; |
|---|
| 391 | i1j *= NumMatrixTypes; |
|---|
| 392 | ij1 *= NumMatrixTypes; |
|---|
| 393 | i1j1 *= NumMatrixTypes; |
|---|
| 394 | |
|---|
| 395 | // compute expected counts |
|---|
| 396 | for (int i = 0; i <= seq1Length; i++){ |
|---|
| 397 | unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i]; |
|---|
| 398 | for (int j = 0; j <= seq2Length; j++){ |
|---|
| 399 | unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j]; |
|---|
| 400 | |
|---|
| 401 | if (i > 0 && j > 0){ |
|---|
| 402 | for (int k = 0; k < NumMatrixTypes; k++) |
|---|
| 403 | LOG_PLUS_EQUALS (counts[k][0], |
|---|
| 404 | forward[k + i1j1] + transProb[k][0] + |
|---|
| 405 | matchProb[c1][c2] + backward[0 + ij]); |
|---|
| 406 | } |
|---|
| 407 | if (i > 0){ |
|---|
| 408 | for (int k = 0; k < NumInsertStates; k++){ |
|---|
| 409 | LOG_PLUS_EQUALS (counts[0][2*k+1], |
|---|
| 410 | forward[0 + i1j] + transProb[0][2*k+1] + |
|---|
| 411 | insProb[c1][k] + backward[2*k+1 + ij]); |
|---|
| 412 | LOG_PLUS_EQUALS (counts[2*k+1][2*k+1], |
|---|
| 413 | forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1] + |
|---|
| 414 | insProb[c1][k] + backward[2*k+1 + ij]); |
|---|
| 415 | } |
|---|
| 416 | } |
|---|
| 417 | if (j > 0){ |
|---|
| 418 | for (int k = 0; k < NumInsertStates; k++){ |
|---|
| 419 | LOG_PLUS_EQUALS (counts[0][2*k+2], |
|---|
| 420 | forward[0 + ij1] + transProb[0][2*k+2] + |
|---|
| 421 | insProb[c2][k] + backward[2*k+2 + ij]); |
|---|
| 422 | LOG_PLUS_EQUALS (counts[2*k+2][2*k+2], |
|---|
| 423 | forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2] + |
|---|
| 424 | insProb[c2][k] + backward[2*k+2 + ij]); |
|---|
| 425 | } |
|---|
| 426 | } |
|---|
| 427 | |
|---|
| 428 | ij += NumMatrixTypes; |
|---|
| 429 | i1j += NumMatrixTypes; |
|---|
| 430 | ij1 += NumMatrixTypes; |
|---|
| 431 | i1j1 += NumMatrixTypes; |
|---|
| 432 | } |
|---|
| 433 | } |
|---|
| 434 | |
|---|
| 435 | // scale all expected counts appropriately |
|---|
| 436 | for (int i = 0; i < NumMatrixTypes; i++) |
|---|
| 437 | for (int j = 0; j < NumMatrixTypes; j++) |
|---|
| 438 | counts[i][j] -= totalProb; |
|---|
| 439 | |
|---|
| 440 | } |
|---|
| 441 | */ |
|---|
| 442 | |
|---|
| 443 | ///////////////////////////////////////////////////////////////// |
|---|
| 444 | // ProbabilisticModel::ComputeNewParameters() |
|---|
| 445 | // |
|---|
| 446 | // Computes a new parameter set based on the expected counts |
|---|
| 447 | // given. |
|---|
| 448 | ///////////////////////////////////////////////////////////////// |
|---|
| 449 | |
|---|
| 450 | void ComputeNewParameters (Sequence *seq1, Sequence *seq2, |
|---|
| 451 | const VF &forward, const VF &backward, |
|---|
| 452 | VF &initDistribMat, VF &gapOpen, |
|---|
| 453 | VF &gapExtend, VVF &emitPairs, VF &emitSingle, bool enableTrainEmissions) const { |
|---|
| 454 | |
|---|
| 455 | assert (seq1); |
|---|
| 456 | assert (seq2); |
|---|
| 457 | |
|---|
| 458 | const int seq1Length = seq1->GetLength(); |
|---|
| 459 | const int seq2Length = seq2->GetLength(); |
|---|
| 460 | SafeVector<char>::iterator iter1 = seq1->GetDataPtr(); |
|---|
| 461 | SafeVector<char>::iterator iter2 = seq2->GetDataPtr(); |
|---|
| 462 | |
|---|
| 463 | // compute total probability |
|---|
| 464 | float totalProb = ComputeTotalProbability (seq1Length, seq2Length, |
|---|
| 465 | forward, backward); |
|---|
| 466 | |
|---|
| 467 | // initialize expected counts |
|---|
| 468 | VVF transCounts (NumMatrixTypes, VF (NumMatrixTypes, LOG_ZERO)); |
|---|
| 469 | VF initCounts (NumMatrixTypes, LOG_ZERO); |
|---|
| 470 | VVF pairCounts (256, VF (256, LOG_ZERO)); |
|---|
| 471 | VF singleCounts (256, LOG_ZERO); |
|---|
| 472 | |
|---|
| 473 | // remember offset for each index combination |
|---|
| 474 | int ij = 0; |
|---|
| 475 | int i1j = -seq2Length - 1; |
|---|
| 476 | int ij1 = -1; |
|---|
| 477 | int i1j1 = -seq2Length - 2; |
|---|
| 478 | |
|---|
| 479 | ij *= NumMatrixTypes; |
|---|
| 480 | i1j *= NumMatrixTypes; |
|---|
| 481 | ij1 *= NumMatrixTypes; |
|---|
| 482 | i1j1 *= NumMatrixTypes; |
|---|
| 483 | |
|---|
| 484 | // compute initial distribution posteriors |
|---|
| 485 | initCounts[0] = LOG_ADD (forward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)] + |
|---|
| 486 | backward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)], |
|---|
| 487 | forward[0 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] + |
|---|
| 488 | backward[0 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]); |
|---|
| 489 | for (int k = 0; k < NumInsertStates; k++){ |
|---|
| 490 | initCounts[2*k+1] = LOG_ADD (forward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)] + |
|---|
| 491 | backward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)], |
|---|
| 492 | forward[2*k+1 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] + |
|---|
| 493 | backward[2*k+1 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]); |
|---|
| 494 | initCounts[2*k+2] = LOG_ADD (forward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)] + |
|---|
| 495 | backward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)], |
|---|
| 496 | forward[2*k+2 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] + |
|---|
| 497 | backward[2*k+2 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]); |
|---|
| 498 | } |
|---|
| 499 | |
|---|
| 500 | // compute expected counts |
|---|
| 501 | for (int i = 0; i <= seq1Length; i++){ |
|---|
| 502 | unsigned char c1 = (i == 0) ? '~' : (unsigned char) toupper(iter1[i]); |
|---|
| 503 | for (int j = 0; j <= seq2Length; j++){ |
|---|
| 504 | unsigned char c2 = (j == 0) ? '~' : (unsigned char) toupper(iter2[j]); |
|---|
| 505 | |
|---|
| 506 | if (i > 0 && j > 0){ |
|---|
| 507 | if (enableTrainEmissions && i == 1 && j == 1){ |
|---|
| 508 | LOG_PLUS_EQUALS (pairCounts[c1][c2], |
|---|
| 509 | initialDistribution[0] + matchProb[c1][c2] + backward[0 + ij]); |
|---|
| 510 | LOG_PLUS_EQUALS (pairCounts[c2][c1], |
|---|
| 511 | initialDistribution[0] + matchProb[c2][c1] + backward[0 + ij]); |
|---|
| 512 | } |
|---|
| 513 | |
|---|
| 514 | for (int k = 0; k < NumMatrixTypes; k++){ |
|---|
| 515 | LOG_PLUS_EQUALS (transCounts[k][0], |
|---|
| 516 | forward[k + i1j1] + transProb[k][0] + |
|---|
| 517 | matchProb[c1][c2] + backward[0 + ij]); |
|---|
| 518 | if (enableTrainEmissions && i != 1 || j != 1){ |
|---|
| 519 | LOG_PLUS_EQUALS (pairCounts[c1][c2], |
|---|
| 520 | forward[k + i1j1] + transProb[k][0] + |
|---|
| 521 | matchProb[c1][c2] + backward[0 + ij]); |
|---|
| 522 | LOG_PLUS_EQUALS (pairCounts[c2][c1], |
|---|
| 523 | forward[k + i1j1] + transProb[k][0] + |
|---|
| 524 | matchProb[c2][c1] + backward[0 + ij]); |
|---|
| 525 | } |
|---|
| 526 | } |
|---|
| 527 | } |
|---|
| 528 | if (i > 0){ |
|---|
| 529 | for (int k = 0; k < NumInsertStates; k++){ |
|---|
| 530 | LOG_PLUS_EQUALS (transCounts[0][2*k+1], |
|---|
| 531 | forward[0 + i1j] + transProb[0][2*k+1] + |
|---|
| 532 | insProb[c1][k] + backward[2*k+1 + ij]); |
|---|
| 533 | LOG_PLUS_EQUALS (transCounts[2*k+1][2*k+1], |
|---|
| 534 | forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1] + |
|---|
| 535 | insProb[c1][k] + backward[2*k+1 + ij]); |
|---|
| 536 | if (enableTrainEmissions){ |
|---|
| 537 | if (i == 1 && j == 0){ |
|---|
| 538 | LOG_PLUS_EQUALS (singleCounts[c1], |
|---|
| 539 | initialDistribution[2*k+1] + insProb[c1][k] + backward[2*k+1 + ij]); |
|---|
| 540 | } |
|---|
| 541 | else { |
|---|
| 542 | LOG_PLUS_EQUALS (singleCounts[c1], |
|---|
| 543 | forward[0 + i1j] + transProb[0][2*k+1] + |
|---|
| 544 | insProb[c1][k] + backward[2*k+1 + ij]); |
|---|
| 545 | LOG_PLUS_EQUALS (singleCounts[c1], |
|---|
| 546 | forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1] + |
|---|
| 547 | insProb[c1][k] + backward[2*k+1 + ij]); |
|---|
| 548 | } |
|---|
| 549 | } |
|---|
| 550 | } |
|---|
| 551 | } |
|---|
| 552 | if (j > 0){ |
|---|
| 553 | for (int k = 0; k < NumInsertStates; k++){ |
|---|
| 554 | LOG_PLUS_EQUALS (transCounts[0][2*k+2], |
|---|
| 555 | forward[0 + ij1] + transProb[0][2*k+2] + |
|---|
| 556 | insProb[c2][k] + backward[2*k+2 + ij]); |
|---|
| 557 | LOG_PLUS_EQUALS (transCounts[2*k+2][2*k+2], |
|---|
| 558 | forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2] + |
|---|
| 559 | insProb[c2][k] + backward[2*k+2 + ij]); |
|---|
| 560 | if (enableTrainEmissions){ |
|---|
| 561 | if (i == 0 && j == 1){ |
|---|
| 562 | LOG_PLUS_EQUALS (singleCounts[c2], |
|---|
| 563 | initialDistribution[2*k+2] + insProb[c2][k] + backward[2*k+2 + ij]); |
|---|
| 564 | } |
|---|
| 565 | else { |
|---|
| 566 | LOG_PLUS_EQUALS (singleCounts[c2], |
|---|
| 567 | forward[0 + ij1] + transProb[0][2*k+2] + |
|---|
| 568 | insProb[c2][k] + backward[2*k+2 + ij]); |
|---|
| 569 | LOG_PLUS_EQUALS (singleCounts[c2], |
|---|
| 570 | forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2] + |
|---|
| 571 | insProb[c2][k] + backward[2*k+2 + ij]); |
|---|
| 572 | } |
|---|
| 573 | } |
|---|
| 574 | } |
|---|
| 575 | } |
|---|
| 576 | |
|---|
| 577 | ij += NumMatrixTypes; |
|---|
| 578 | i1j += NumMatrixTypes; |
|---|
| 579 | ij1 += NumMatrixTypes; |
|---|
| 580 | i1j1 += NumMatrixTypes; |
|---|
| 581 | } |
|---|
| 582 | } |
|---|
| 583 | |
|---|
| 584 | // scale all expected counts appropriately |
|---|
| 585 | for (int i = 0; i < NumMatrixTypes; i++){ |
|---|
| 586 | initCounts[i] -= totalProb; |
|---|
| 587 | for (int j = 0; j < NumMatrixTypes; j++) |
|---|
| 588 | transCounts[i][j] -= totalProb; |
|---|
| 589 | } |
|---|
| 590 | if (enableTrainEmissions){ |
|---|
| 591 | for (int i = 0; i < 256; i++){ |
|---|
| 592 | for (int j = 0; j < 256; j++) |
|---|
| 593 | pairCounts[i][j] -= totalProb; |
|---|
| 594 | singleCounts[i] -= totalProb; |
|---|
| 595 | } |
|---|
| 596 | } |
|---|
| 597 | |
|---|
| 598 | // compute new initial distribution |
|---|
| 599 | float totalInitDistribCounts = 0; |
|---|
| 600 | for (int i = 0; i < NumMatrixTypes; i++) |
|---|
| 601 | totalInitDistribCounts += exp (initCounts[i]); // should be 2 |
|---|
| 602 | initDistribMat[0] = min (1.0f, max (0.0f, (float) exp (initCounts[0]) / totalInitDistribCounts)); |
|---|
| 603 | for (int k = 0; k < NumInsertStates; k++){ |
|---|
| 604 | float val = (exp (initCounts[2*k+1]) + exp (initCounts[2*k+2])) / 2; |
|---|
| 605 | initDistribMat[2*k+1] = initDistribMat[2*k+2] = min (1.0f, max (0.0f, val / totalInitDistribCounts)); |
|---|
| 606 | } |
|---|
| 607 | |
|---|
| 608 | // compute total counts for match state |
|---|
| 609 | float inMatchStateCounts = 0; |
|---|
| 610 | for (int i = 0; i < NumMatrixTypes; i++) |
|---|
| 611 | inMatchStateCounts += exp (transCounts[0][i]); |
|---|
| 612 | for (int i = 0; i < NumInsertStates; i++){ |
|---|
| 613 | |
|---|
| 614 | // compute total counts for gap state |
|---|
| 615 | float inGapStateCounts = |
|---|
| 616 | exp (transCounts[2*i+1][0]) + |
|---|
| 617 | exp (transCounts[2*i+1][2*i+1]) + |
|---|
| 618 | exp (transCounts[2*i+2][0]) + |
|---|
| 619 | exp (transCounts[2*i+2][2*i+2]); |
|---|
| 620 | |
|---|
| 621 | gapOpen[2*i] = gapOpen[2*i+1] = |
|---|
| 622 | (exp (transCounts[0][2*i+1]) + |
|---|
| 623 | exp (transCounts[0][2*i+2])) / |
|---|
| 624 | (2 * inMatchStateCounts); |
|---|
| 625 | |
|---|
| 626 | gapExtend[2*i] = gapExtend[2*i+1] = |
|---|
| 627 | (exp (transCounts[2*i+1][2*i+1]) + |
|---|
| 628 | exp (transCounts[2*i+2][2*i+2])) / |
|---|
| 629 | inGapStateCounts; |
|---|
| 630 | } |
|---|
| 631 | |
|---|
| 632 | if (enableTrainEmissions){ |
|---|
| 633 | float totalPairCounts = 0; |
|---|
| 634 | float totalSingleCounts = 0; |
|---|
| 635 | for (int i = 0; i < 256; i++){ |
|---|
| 636 | for (int j = 0; j <= i; j++) |
|---|
| 637 | totalPairCounts += exp (pairCounts[j][i]); |
|---|
| 638 | totalSingleCounts += exp (singleCounts[i]); |
|---|
| 639 | } |
|---|
| 640 | |
|---|
| 641 | for (int i = 0; i < 256; i++) if (!islower ((char) i)){ |
|---|
| 642 | int li = (int)((unsigned char) tolower ((char) i)); |
|---|
| 643 | for (int j = 0; j <= i; j++) if (!islower ((char) j)){ |
|---|
| 644 | int lj = (int)((unsigned char) tolower ((char) j)); |
|---|
| 645 | emitPairs[i][j] = emitPairs[i][lj] = emitPairs[li][j] = emitPairs[li][lj] = |
|---|
| 646 | emitPairs[j][i] = emitPairs[j][li] = emitPairs[lj][i] = emitPairs[lj][li] = exp(pairCounts[j][i]) / totalPairCounts; |
|---|
| 647 | } |
|---|
| 648 | emitSingle[i] = emitSingle[li] = exp(singleCounts[i]) / totalSingleCounts; |
|---|
| 649 | } |
|---|
| 650 | } |
|---|
| 651 | } |
|---|
| 652 | |
|---|
| 653 | ///////////////////////////////////////////////////////////////// |
|---|
| 654 | // ProbabilisticModel::ComputeAlignment() |
|---|
| 655 | // |
|---|
| 656 | // Computes an alignment based on the given posterior matrix. |
|---|
| 657 | // This is done by finding the maximum summing path (or |
|---|
| 658 | // maximum weight trace) through the posterior matrix. The |
|---|
| 659 | // final alignment is returned as a pair consisting of: |
|---|
| 660 | // (1) a string (e.g., XXXBBXXXBBBBBBYYYYBBB) where X's and |
|---|
| 661 | // denote insertions in one of the two sequences and |
|---|
| 662 | // B's denote that both sequences are present (i.e. |
|---|
| 663 | // matches). |
|---|
| 664 | // (2) a float indicating the sum achieved |
|---|
| 665 | ///////////////////////////////////////////////////////////////// |
|---|
| 666 | |
|---|
| 667 | pair<SafeVector<char> *, float> ComputeAlignment (int seq1Length, int seq2Length, const VF &posterior) const { |
|---|
| 668 | |
|---|
| 669 | float *twoRows = new float[(seq2Length+1)*2]; assert (twoRows); |
|---|
| 670 | float *oldRow = twoRows; |
|---|
| 671 | float *newRow = twoRows + seq2Length + 1; |
|---|
| 672 | |
|---|
| 673 | char *tracebackMatrix = new char[(seq1Length+1)*(seq2Length+1)]; assert (tracebackMatrix); |
|---|
| 674 | char *tracebackPtr = tracebackMatrix; |
|---|
| 675 | |
|---|
| 676 | VF::const_iterator posteriorPtr = posterior.begin() + seq2Length + 1; |
|---|
| 677 | |
|---|
| 678 | // initialization |
|---|
| 679 | for (int i = 0; i <= seq2Length; i++){ |
|---|
| 680 | oldRow[i] = 0; |
|---|
| 681 | *(tracebackPtr++) = 'L'; |
|---|
| 682 | } |
|---|
| 683 | |
|---|
| 684 | // fill in matrix |
|---|
| 685 | for (int i = 1; i <= seq1Length; i++){ |
|---|
| 686 | |
|---|
| 687 | // initialize left column |
|---|
| 688 | newRow[0] = 0; |
|---|
| 689 | posteriorPtr++; |
|---|
| 690 | *(tracebackPtr++) = 'U'; |
|---|
| 691 | |
|---|
| 692 | // fill in rest of row |
|---|
| 693 | for (int j = 1; j <= seq2Length; j++){ |
|---|
| 694 | ChooseBestOfThree (*(posteriorPtr++) + oldRow[j-1], newRow[j-1], oldRow[j], |
|---|
| 695 | 'D', 'L', 'U', &newRow[j], tracebackPtr++); // Match, insert, delete |
|---|
| 696 | } |
|---|
| 697 | |
|---|
| 698 | // swap rows |
|---|
| 699 | float *temp = oldRow; |
|---|
| 700 | oldRow = newRow; |
|---|
| 701 | newRow = temp; |
|---|
| 702 | } |
|---|
| 703 | |
|---|
| 704 | // store best score |
|---|
| 705 | float total = oldRow[seq2Length]; |
|---|
| 706 | delete [] twoRows; |
|---|
| 707 | |
|---|
| 708 | // compute traceback |
|---|
| 709 | SafeVector<char> *alignment = new SafeVector<char>; assert (alignment); |
|---|
| 710 | int r = seq1Length, c = seq2Length; |
|---|
| 711 | while (r != 0 || c != 0){ |
|---|
| 712 | char ch = tracebackMatrix[r*(seq2Length+1) + c]; |
|---|
| 713 | switch (ch){ |
|---|
| 714 | case 'L': c--; alignment->push_back ('Y'); break; |
|---|
| 715 | case 'U': r--; alignment->push_back ('X'); break; |
|---|
| 716 | case 'D': c--; r--; alignment->push_back ('B'); break; |
|---|
| 717 | default: assert (false); |
|---|
| 718 | } |
|---|
| 719 | } |
|---|
| 720 | |
|---|
| 721 | delete [] tracebackMatrix; |
|---|
| 722 | |
|---|
| 723 | reverse (alignment->begin(), alignment->end()); |
|---|
| 724 | |
|---|
| 725 | return make_pair(alignment, total); |
|---|
| 726 | } |
|---|
| 727 | |
|---|
| 728 | ///////////////////////////////////////////////////////////////// |
|---|
| 729 | // ProbabilisticModel::ComputeAlignment2() |
|---|
| 730 | // |
|---|
| 731 | // Computes an alignment based on the given posterior matrix. |
|---|
| 732 | // This is done by finding the maximum summing path (or |
|---|
| 733 | // maximum weight trace) through the posterior matrix. The |
|---|
| 734 | // final alignment is returned as a pair consisting of: |
|---|
| 735 | // (1) a string (e.g., XXXBBXXXBBBBBBYYYYBBB) where X's and |
|---|
| 736 | // denote insertions in one of the two sequences and |
|---|
| 737 | // B's denote that both sequences are present (i.e. |
|---|
| 738 | // matches). |
|---|
| 739 | // (2) a float indicating the sum achieved |
|---|
| 740 | ///////////////////////////////////////////////////////////////// |
|---|
| 741 | |
|---|
| 742 | pair<SafeVector<char> *, float> ComputeAlignment2 (int seq1Length, int seq2Length, |
|---|
| 743 | const VF &posterior, std::vector<StemCandidate> *pscs1, std::vector<StemCandidate> *pscs2, |
|---|
| 744 | std::vector<int> *matchPSCS1, std::vector<int> *matchPSCS2) const { |
|---|
| 745 | NRMat<float> WM(seq1Length + 1, seq2Length + 1); |
|---|
| 746 | for (int i = 0; i <= seq1Length; i++) { |
|---|
| 747 | for (int j = 0; j <= seq2Length; j++) { |
|---|
| 748 | WM[i][j] = 0; |
|---|
| 749 | } |
|---|
| 750 | } |
|---|
| 751 | |
|---|
| 752 | int len = WORDLENGTH; |
|---|
| 753 | int size = matchPSCS1->size(); |
|---|
| 754 | float weight = 1000; |
|---|
| 755 | |
|---|
| 756 | for(int iter = 0; iter < size; iter++) { |
|---|
| 757 | int i = matchPSCS1->at(iter); |
|---|
| 758 | int j = matchPSCS2->at(iter); |
|---|
| 759 | |
|---|
| 760 | const StemCandidate &sc1 = pscs1->at(i); |
|---|
| 761 | const StemCandidate &sc2 = pscs2->at(j); |
|---|
| 762 | for(int k = 0; k < len; k++) { |
|---|
| 763 | WM[sc1.GetPosition() + k][sc2.GetPosition() + k] += weight; |
|---|
| 764 | } |
|---|
| 765 | } |
|---|
| 766 | float *twoRows = new float[(seq2Length+1)*2]; assert (twoRows); |
|---|
| 767 | float *oldRow = twoRows; |
|---|
| 768 | float *newRow = twoRows + seq2Length + 1; |
|---|
| 769 | |
|---|
| 770 | char *tracebackMatrix = new char[(seq1Length+1)*(seq2Length+1)]; assert (tracebackMatrix); |
|---|
| 771 | char *tracebackPtr = tracebackMatrix; |
|---|
| 772 | |
|---|
| 773 | VF::const_iterator posteriorPtr = posterior.begin() + seq2Length + 1; |
|---|
| 774 | |
|---|
| 775 | // initialization |
|---|
| 776 | for (int i = 0; i <= seq2Length; i++){ |
|---|
| 777 | oldRow[i] = 0; |
|---|
| 778 | *(tracebackPtr++) = 'L'; |
|---|
| 779 | } |
|---|
| 780 | |
|---|
| 781 | // fill in matrix |
|---|
| 782 | for (int i = 1; i <= seq1Length; i++){ |
|---|
| 783 | |
|---|
| 784 | // initialize left column |
|---|
| 785 | newRow[0] = 0; |
|---|
| 786 | posteriorPtr++; |
|---|
| 787 | *(tracebackPtr++) = 'U'; |
|---|
| 788 | |
|---|
| 789 | // fill in rest of row |
|---|
| 790 | for (int j = 1; j <= seq2Length; j++){ |
|---|
| 791 | ChooseBestOfThree (*(posteriorPtr++) + oldRow[j-1] + WM[i][j], newRow[j-1], oldRow[j], |
|---|
| 792 | 'D', 'L', 'U', &newRow[j], tracebackPtr++); |
|---|
| 793 | } |
|---|
| 794 | |
|---|
| 795 | // swap rows |
|---|
| 796 | float *temp = oldRow; |
|---|
| 797 | oldRow = newRow; |
|---|
| 798 | newRow = temp; |
|---|
| 799 | } |
|---|
| 800 | |
|---|
| 801 | // store best score |
|---|
| 802 | float total = oldRow[seq2Length]; |
|---|
| 803 | delete [] twoRows; |
|---|
| 804 | |
|---|
| 805 | // compute traceback |
|---|
| 806 | SafeVector<char> *alignment = new SafeVector<char>; assert (alignment); |
|---|
| 807 | int r = seq1Length, c = seq2Length; |
|---|
| 808 | while (r != 0 || c != 0){ |
|---|
| 809 | char ch = tracebackMatrix[r*(seq2Length+1) + c]; |
|---|
| 810 | switch (ch){ |
|---|
| 811 | case 'L': c--; alignment->push_back ('Y'); break; |
|---|
| 812 | case 'U': r--; alignment->push_back ('X'); break; |
|---|
| 813 | case 'D': c--; r--; alignment->push_back ('B'); break; |
|---|
| 814 | default: assert (false); |
|---|
| 815 | } |
|---|
| 816 | } |
|---|
| 817 | |
|---|
| 818 | delete [] tracebackMatrix; |
|---|
| 819 | |
|---|
| 820 | reverse (alignment->begin(), alignment->end()); |
|---|
| 821 | |
|---|
| 822 | return make_pair(alignment, total); |
|---|
| 823 | } |
|---|
| 824 | |
|---|
| 825 | ///////////////////////////////////////////////////////////////// |
|---|
| 826 | // ProbabilisticModel::ComputeAlignmentWithGapPenalties() |
|---|
| 827 | // |
|---|
| 828 | // Similar to ComputeAlignment() except with gap penalties. |
|---|
| 829 | ///////////////////////////////////////////////////////////////// |
|---|
| 830 | |
|---|
| 831 | pair<SafeVector<char> *, float> ComputeAlignmentWithGapPenalties (MultiSequence *align1, |
|---|
| 832 | MultiSequence *align2, |
|---|
| 833 | const VF &posterior, int numSeqs1, |
|---|
| 834 | int numSeqs2, |
|---|
| 835 | float gapOpenPenalty, |
|---|
| 836 | float gapContinuePenalty) const { |
|---|
| 837 | int seq1Length = align1->GetSequence(0)->GetLength(); |
|---|
| 838 | int seq2Length = align2->GetSequence(0)->GetLength(); |
|---|
| 839 | SafeVector<SafeVector<char>::iterator > dataPtrs1 (align1->GetNumSequences()); |
|---|
| 840 | SafeVector<SafeVector<char>::iterator > dataPtrs2 (align2->GetNumSequences()); |
|---|
| 841 | |
|---|
| 842 | // grab character data |
|---|
| 843 | for (int i = 0; i < align1->GetNumSequences(); i++) |
|---|
| 844 | dataPtrs1[i] = align1->GetSequence(i)->GetDataPtr(); |
|---|
| 845 | for (int i = 0; i < align2->GetNumSequences(); i++) |
|---|
| 846 | dataPtrs2[i] = align2->GetSequence(i)->GetDataPtr(); |
|---|
| 847 | |
|---|
| 848 | // the number of active sequences at any given column is defined to be the |
|---|
| 849 | // number of non-gap characters in that column; the number of gap opens at |
|---|
| 850 | // any given column is defined to be the number of gap characters in that |
|---|
| 851 | // column where the previous character in the respective sequence was not |
|---|
| 852 | // a gap |
|---|
| 853 | SafeVector<int> numActive1 (seq1Length+1), numGapOpens1 (seq1Length+1); |
|---|
| 854 | SafeVector<int> numActive2 (seq2Length+1), numGapOpens2 (seq2Length+1); |
|---|
| 855 | |
|---|
| 856 | // compute number of active sequences and gap opens for each group |
|---|
| 857 | for (int i = 0; i < align1->GetNumSequences(); i++){ |
|---|
| 858 | SafeVector<char>::iterator dataPtr = align1->GetSequence(i)->GetDataPtr(); |
|---|
| 859 | numActive1[0] = numGapOpens1[0] = 0; |
|---|
| 860 | for (int j = 1; j <= seq1Length; j++){ |
|---|
| 861 | if (dataPtr[j] != '-'){ |
|---|
| 862 | numActive1[j]++; |
|---|
| 863 | numGapOpens1[j] += (j != 1 && dataPtr[j-1] != '-'); |
|---|
| 864 | } |
|---|
| 865 | } |
|---|
| 866 | } |
|---|
| 867 | for (int i = 0; i < align2->GetNumSequences(); i++){ |
|---|
| 868 | SafeVector<char>::iterator dataPtr = align2->GetSequence(i)->GetDataPtr(); |
|---|
| 869 | numActive2[0] = numGapOpens2[0] = 0; |
|---|
| 870 | for (int j = 1; j <= seq2Length; j++){ |
|---|
| 871 | if (dataPtr[j] != '-'){ |
|---|
| 872 | numActive2[j]++; |
|---|
| 873 | numGapOpens2[j] += (j != 1 && dataPtr[j-1] != '-'); |
|---|
| 874 | } |
|---|
| 875 | } |
|---|
| 876 | } |
|---|
| 877 | |
|---|
| 878 | VVF openingPenalty1 (numSeqs1+1, VF (numSeqs2+1)); |
|---|
| 879 | VF continuingPenalty1 (numSeqs1+1); |
|---|
| 880 | VVF openingPenalty2 (numSeqs1+1, VF (numSeqs2+1)); |
|---|
| 881 | VF continuingPenalty2 (numSeqs2+1); |
|---|
| 882 | |
|---|
| 883 | // precompute penalties |
|---|
| 884 | for (int i = 0; i <= numSeqs1; i++) |
|---|
| 885 | for (int j = 0; j <= numSeqs2; j++) |
|---|
| 886 | openingPenalty1[i][j] = i * (gapOpenPenalty * j + gapContinuePenalty * (numSeqs2 - j)); |
|---|
| 887 | for (int i = 0; i <= numSeqs1; i++) |
|---|
| 888 | continuingPenalty1[i] = i * gapContinuePenalty * numSeqs2; |
|---|
| 889 | for (int i = 0; i <= numSeqs2; i++) |
|---|
| 890 | for (int j = 0; j <= numSeqs1; j++) |
|---|
| 891 | openingPenalty2[i][j] = i * (gapOpenPenalty * j + gapContinuePenalty * (numSeqs1 - j)); |
|---|
| 892 | for (int i = 0; i <= numSeqs2; i++) |
|---|
| 893 | continuingPenalty2[i] = i * gapContinuePenalty * numSeqs1; |
|---|
| 894 | |
|---|
| 895 | float *twoRows = new float[6*(seq2Length+1)]; assert (twoRows); |
|---|
| 896 | float *oldRowMatch = twoRows; |
|---|
| 897 | float *newRowMatch = twoRows + (seq2Length+1); |
|---|
| 898 | float *oldRowInsertX = twoRows + 2*(seq2Length+1); |
|---|
| 899 | float *newRowInsertX = twoRows + 3*(seq2Length+1); |
|---|
| 900 | float *oldRowInsertY = twoRows + 4*(seq2Length+1); |
|---|
| 901 | float *newRowInsertY = twoRows + 5*(seq2Length+1); |
|---|
| 902 | |
|---|
| 903 | char *tracebackMatrix = new char[3*(seq1Length+1)*(seq2Length+1)]; assert (tracebackMatrix); |
|---|
| 904 | char *tracebackPtr = tracebackMatrix; |
|---|
| 905 | |
|---|
| 906 | VF::const_iterator posteriorPtr = posterior.begin() + seq2Length + 1; |
|---|
| 907 | |
|---|
| 908 | // initialization |
|---|
| 909 | for (int i = 0; i <= seq2Length; i++){ |
|---|
| 910 | oldRowMatch[i] = oldRowInsertX[i] = (i == 0) ? 0 : LOG_ZERO; |
|---|
| 911 | oldRowInsertY[i] = (i == 0) ? 0 : oldRowInsertY[i-1] + continuingPenalty2[numActive2[i]]; |
|---|
| 912 | *(tracebackPtr) = *(tracebackPtr+1) = *(tracebackPtr+2) = 'Y'; |
|---|
| 913 | tracebackPtr += 3; |
|---|
| 914 | } |
|---|
| 915 | |
|---|
| 916 | // fill in matrix |
|---|
| 917 | for (int i = 1; i <= seq1Length; i++){ |
|---|
| 918 | |
|---|
| 919 | // initialize left column |
|---|
| 920 | newRowMatch[0] = newRowInsertY[0] = LOG_ZERO; |
|---|
| 921 | newRowInsertX[0] = oldRowInsertX[0] + continuingPenalty1[numActive1[i]]; |
|---|
| 922 | posteriorPtr++; |
|---|
| 923 | *(tracebackPtr) = *(tracebackPtr+1) = *(tracebackPtr+2) = 'X'; |
|---|
| 924 | tracebackPtr += 3; |
|---|
| 925 | |
|---|
| 926 | // fill in rest of row |
|---|
| 927 | for (int j = 1; j <= seq2Length; j++){ |
|---|
| 928 | |
|---|
| 929 | // going to MATCH state |
|---|
| 930 | ChooseBestOfThree (oldRowMatch[j-1], |
|---|
| 931 | oldRowInsertX[j-1], |
|---|
| 932 | oldRowInsertY[j-1], |
|---|
| 933 | 'M', 'X', 'Y', &newRowMatch[j], tracebackPtr++); |
|---|
| 934 | newRowMatch[j] += *(posteriorPtr++); |
|---|
| 935 | |
|---|
| 936 | // going to INSERT X state |
|---|
| 937 | ChooseBestOfThree (oldRowMatch[j] + openingPenalty1[numActive1[i]][numGapOpens2[j]], |
|---|
| 938 | oldRowInsertX[j] + continuingPenalty1[numActive1[i]], |
|---|
| 939 | oldRowInsertY[j] + openingPenalty1[numActive1[i]][numGapOpens2[j]], |
|---|
| 940 | 'M', 'X', 'Y', &newRowInsertX[j], tracebackPtr++); |
|---|
| 941 | |
|---|
| 942 | // going to INSERT Y state |
|---|
| 943 | ChooseBestOfThree (newRowMatch[j-1] + openingPenalty2[numActive2[j]][numGapOpens1[i]], |
|---|
| 944 | newRowInsertX[j-1] + openingPenalty2[numActive2[j]][numGapOpens1[i]], |
|---|
| 945 | newRowInsertY[j-1] + continuingPenalty2[numActive2[j]], |
|---|
| 946 | 'M', 'X', 'Y', &newRowInsertY[j], tracebackPtr++); |
|---|
| 947 | } |
|---|
| 948 | |
|---|
| 949 | // swap rows |
|---|
| 950 | float *temp; |
|---|
| 951 | temp = oldRowMatch; oldRowMatch = newRowMatch; newRowMatch = temp; |
|---|
| 952 | temp = oldRowInsertX; oldRowInsertX = newRowInsertX; newRowInsertX = temp; |
|---|
| 953 | temp = oldRowInsertY; oldRowInsertY = newRowInsertY; newRowInsertY = temp; |
|---|
| 954 | } |
|---|
| 955 | |
|---|
| 956 | // store best score |
|---|
| 957 | float total; |
|---|
| 958 | char matrix; |
|---|
| 959 | ChooseBestOfThree (oldRowMatch[seq2Length], oldRowInsertX[seq2Length], oldRowInsertY[seq2Length], |
|---|
| 960 | 'M', 'X', 'Y', &total, &matrix); |
|---|
| 961 | |
|---|
| 962 | delete [] twoRows; |
|---|
| 963 | |
|---|
| 964 | // compute traceback |
|---|
| 965 | SafeVector<char> *alignment = new SafeVector<char>; assert (alignment); |
|---|
| 966 | int r = seq1Length, c = seq2Length; |
|---|
| 967 | while (r != 0 || c != 0){ |
|---|
| 968 | |
|---|
| 969 | int offset = (matrix == 'M') ? 0 : (matrix == 'X') ? 1 : 2; |
|---|
| 970 | char ch = tracebackMatrix[(r*(seq2Length+1) + c) * 3 + offset]; |
|---|
| 971 | switch (matrix){ |
|---|
| 972 | case 'Y': c--; alignment->push_back ('Y'); break; |
|---|
| 973 | case 'X': r--; alignment->push_back ('X'); break; |
|---|
| 974 | case 'M': c--; r--; alignment->push_back ('B'); break; |
|---|
| 975 | default: assert (false); |
|---|
| 976 | } |
|---|
| 977 | matrix = ch; |
|---|
| 978 | } |
|---|
| 979 | |
|---|
| 980 | delete [] tracebackMatrix; |
|---|
| 981 | |
|---|
| 982 | reverse (alignment->begin(), alignment->end()); |
|---|
| 983 | |
|---|
| 984 | return make_pair(alignment, 1.0f); |
|---|
| 985 | } |
|---|
| 986 | |
|---|
| 987 | |
|---|
| 988 | ///////////////////////////////////////////////////////////////// |
|---|
| 989 | // ProbabilisticModel::ComputeViterbiAlignment() |
|---|
| 990 | // |
|---|
| 991 | // Computes the highest probability pairwise alignment using the |
|---|
| 992 | // probabilistic model. The final alignment is returned as a |
|---|
| 993 | // pair consisting of: |
|---|
| 994 | // (1) a string (e.g., XXXBBXXXBBBBBBYYYYBBB) where X's and |
|---|
| 995 | // denote insertions in one of the two sequences and |
|---|
| 996 | // B's denote that both sequences are present (i.e. |
|---|
| 997 | // matches). |
|---|
| 998 | // (2) a float containing the log probability of the best |
|---|
| 999 | // alignment (not used) |
|---|
| 1000 | ///////////////////////////////////////////////////////////////// |
|---|
| 1001 | |
|---|
| 1002 | pair<SafeVector<char> *, float> ComputeViterbiAlignment (Sequence *seq1, Sequence *seq2) const { |
|---|
| 1003 | |
|---|
| 1004 | assert (seq1); |
|---|
| 1005 | assert (seq2); |
|---|
| 1006 | |
|---|
| 1007 | const int seq1Length = seq1->GetLength(); |
|---|
| 1008 | const int seq2Length = seq2->GetLength(); |
|---|
| 1009 | |
|---|
| 1010 | // retrieve the points to the beginning of each sequence |
|---|
| 1011 | SafeVector<char>::iterator iter1 = seq1->GetDataPtr(); |
|---|
| 1012 | SafeVector<char>::iterator iter2 = seq2->GetDataPtr(); |
|---|
| 1013 | |
|---|
| 1014 | // create viterbi matrix |
|---|
| 1015 | VF *viterbiPtr = new VF (NumMatrixTypes * (seq1Length+1) * (seq2Length+1), LOG_ZERO); |
|---|
| 1016 | assert (viterbiPtr); |
|---|
| 1017 | VF &viterbi = *viterbiPtr; |
|---|
| 1018 | |
|---|
| 1019 | // create traceback matrix |
|---|
| 1020 | VI *tracebackPtr = new VI (NumMatrixTypes * (seq1Length+1) * (seq2Length+1), -1); |
|---|
| 1021 | assert (tracebackPtr); |
|---|
| 1022 | VI &traceback = *tracebackPtr; |
|---|
| 1023 | |
|---|
| 1024 | // initialization condition |
|---|
| 1025 | for (int k = 0; k < NumMatrixTypes; k++) |
|---|
| 1026 | viterbi[k] = initialDistribution[k]; |
|---|
| 1027 | |
|---|
| 1028 | // remember offset for each index combination |
|---|
| 1029 | int ij = 0; |
|---|
| 1030 | int i1j = -seq2Length - 1; |
|---|
| 1031 | int ij1 = -1; |
|---|
| 1032 | int i1j1 = -seq2Length - 2; |
|---|
| 1033 | |
|---|
| 1034 | ij *= NumMatrixTypes; |
|---|
| 1035 | i1j *= NumMatrixTypes; |
|---|
| 1036 | ij1 *= NumMatrixTypes; |
|---|
| 1037 | i1j1 *= NumMatrixTypes; |
|---|
| 1038 | |
|---|
| 1039 | // compute viterbi scores |
|---|
| 1040 | for (int i = 0; i <= seq1Length; i++){ |
|---|
| 1041 | unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i]; |
|---|
| 1042 | for (int j = 0; j <= seq2Length; j++){ |
|---|
| 1043 | unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j]; |
|---|
| 1044 | |
|---|
| 1045 | if (i > 0 && j > 0){ |
|---|
| 1046 | for (int k = 0; k < NumMatrixTypes; k++){ |
|---|
| 1047 | float newVal = viterbi[k + i1j1] + transProb[k][0] + matchProb[c1][c2]; |
|---|
| 1048 | if (viterbi[0 + ij] < newVal){ |
|---|
| 1049 | viterbi[0 + ij] = newVal; |
|---|
| 1050 | traceback[0 + ij] = k; |
|---|
| 1051 | } |
|---|
| 1052 | } |
|---|
| 1053 | } |
|---|
| 1054 | if (i > 0){ |
|---|
| 1055 | for (int k = 0; k < NumInsertStates; k++){ |
|---|
| 1056 | float valFromMatch = insProb[c1][k] + viterbi[0 + i1j] + transProb[0][2*k+1]; |
|---|
| 1057 | float valFromIns = insProb[c1][k] + viterbi[2*k+1 + i1j] + transProb[2*k+1][2*k+1]; |
|---|
| 1058 | if (valFromMatch >= valFromIns){ |
|---|
| 1059 | viterbi[2*k+1 + ij] = valFromMatch; |
|---|
| 1060 | traceback[2*k+1 + ij] = 0; |
|---|
| 1061 | } |
|---|
| 1062 | else { |
|---|
| 1063 | viterbi[2*k+1 + ij] = valFromIns; |
|---|
| 1064 | traceback[2*k+1 + ij] = 2*k+1; |
|---|
| 1065 | } |
|---|
| 1066 | } |
|---|
| 1067 | } |
|---|
| 1068 | if (j > 0){ |
|---|
| 1069 | for (int k = 0; k < NumInsertStates; k++){ |
|---|
| 1070 | float valFromMatch = insProb[c2][k] + viterbi[0 + ij1] + transProb[0][2*k+2]; |
|---|
| 1071 | float valFromIns = insProb[c2][k] + viterbi[2*k+2 + ij1] + transProb[2*k+2][2*k+2]; |
|---|
| 1072 | if (valFromMatch >= valFromIns){ |
|---|
| 1073 | viterbi[2*k+2 + ij] = valFromMatch; |
|---|
| 1074 | traceback[2*k+2 + ij] = 0; |
|---|
| 1075 | } |
|---|
| 1076 | else { |
|---|
| 1077 | viterbi[2*k+2 + ij] = valFromIns; |
|---|
| 1078 | traceback[2*k+2 + ij] = 2*k+2; |
|---|
| 1079 | } |
|---|
| 1080 | } |
|---|
| 1081 | } |
|---|
| 1082 | |
|---|
| 1083 | ij += NumMatrixTypes; |
|---|
| 1084 | i1j += NumMatrixTypes; |
|---|
| 1085 | ij1 += NumMatrixTypes; |
|---|
| 1086 | i1j1 += NumMatrixTypes; |
|---|
| 1087 | } |
|---|
| 1088 | } |
|---|
| 1089 | |
|---|
| 1090 | // figure out best terminating cell |
|---|
| 1091 | float bestProb = LOG_ZERO; |
|---|
| 1092 | int state = -1; |
|---|
| 1093 | for (int k = 0; k < NumMatrixTypes; k++){ |
|---|
| 1094 | float thisProb = viterbi[k + NumMatrixTypes * ((seq1Length+1)*(seq2Length+1) - 1)] + initialDistribution[k]; |
|---|
| 1095 | if (bestProb < thisProb){ |
|---|
| 1096 | bestProb = thisProb; |
|---|
| 1097 | state = k; |
|---|
| 1098 | } |
|---|
| 1099 | } |
|---|
| 1100 | assert (state != -1); |
|---|
| 1101 | |
|---|
| 1102 | delete viterbiPtr; |
|---|
| 1103 | |
|---|
| 1104 | // compute traceback |
|---|
| 1105 | SafeVector<char> *alignment = new SafeVector<char>; assert (alignment); |
|---|
| 1106 | int r = seq1Length, c = seq2Length; |
|---|
| 1107 | while (r != 0 || c != 0){ |
|---|
| 1108 | int newState = traceback[state + NumMatrixTypes * (r * (seq2Length+1) + c)]; |
|---|
| 1109 | |
|---|
| 1110 | if (state == 0){ c--; r--; alignment->push_back ('B'); } |
|---|
| 1111 | else if (state % 2 == 1){ r--; alignment->push_back ('X'); } |
|---|
| 1112 | else { c--; alignment->push_back ('Y'); } |
|---|
| 1113 | |
|---|
| 1114 | state = newState; |
|---|
| 1115 | } |
|---|
| 1116 | |
|---|
| 1117 | delete tracebackPtr; |
|---|
| 1118 | |
|---|
| 1119 | reverse (alignment->begin(), alignment->end()); |
|---|
| 1120 | |
|---|
| 1121 | return make_pair(alignment, bestProb); |
|---|
| 1122 | } |
|---|
| 1123 | |
|---|
| 1124 | ///////////////////////////////////////////////////////////////// |
|---|
| 1125 | // ProbabilisticModel::BuildPosterior() |
|---|
| 1126 | // |
|---|
| 1127 | // Builds a posterior probability matrix needed to align a pair |
|---|
| 1128 | // of alignments. Mathematically, the returned matrix M is |
|---|
| 1129 | // defined as follows: |
|---|
| 1130 | // M[i,j] = sum sum f(s,t,i,j) |
|---|
| 1131 | // s in align1 t in align2 |
|---|
| 1132 | // where |
|---|
| 1133 | // [ P(s[i'] <--> t[j']) |
|---|
| 1134 | // [ if s[i'] is a letter in the ith column of align1 and |
|---|
| 1135 | // [ t[j'] it a letter in the jth column of align2 |
|---|
| 1136 | // f(s,t,i,j) = [ |
|---|
| 1137 | // [ 0 otherwise |
|---|
| 1138 | // |
|---|
| 1139 | ///////////////////////////////////////////////////////////////// |
|---|
| 1140 | |
|---|
| 1141 | VF *BuildPosterior (MultiSequence *align1, MultiSequence *align2, |
|---|
| 1142 | const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices, |
|---|
| 1143 | float cutoff = 0.0f) const { |
|---|
| 1144 | const int seq1Length = align1->GetSequence(0)->GetLength(); |
|---|
| 1145 | const int seq2Length = align2->GetSequence(0)->GetLength(); |
|---|
| 1146 | |
|---|
| 1147 | VF *posteriorPtr = new VF((seq1Length+1) * (seq2Length+1), 0); assert (posteriorPtr); |
|---|
| 1148 | VF &posterior = *posteriorPtr; |
|---|
| 1149 | VF::iterator postPtr = posterior.begin(); |
|---|
| 1150 | |
|---|
| 1151 | // for each s in align1 |
|---|
| 1152 | for (int i = 0; i < align1->GetNumSequences(); i++){ |
|---|
| 1153 | int first = align1->GetSequence(i)->GetLabel(); |
|---|
| 1154 | SafeVector<int> *mapping1 = align1->GetSequence(i)->GetMapping(); |
|---|
| 1155 | |
|---|
| 1156 | // for each t in align2 |
|---|
| 1157 | for (int j = 0; j < align2->GetNumSequences(); j++){ |
|---|
| 1158 | int second = align2->GetSequence(j)->GetLabel(); |
|---|
| 1159 | SafeVector<int> *mapping2 = align2->GetSequence(j)->GetMapping(); |
|---|
| 1160 | if (first < second){ |
|---|
| 1161 | |
|---|
| 1162 | // get the associated sparse matrix |
|---|
| 1163 | SparseMatrix *matrix = sparseMatrices[first][second]; |
|---|
| 1164 | |
|---|
| 1165 | for (int ii = 1; ii <= matrix->GetSeq1Length(); ii++){ |
|---|
| 1166 | SafeVector<PIF>::iterator row = matrix->GetRowPtr(ii); |
|---|
| 1167 | int base = (*mapping1)[ii] * (seq2Length+1); |
|---|
| 1168 | int rowSize = matrix->GetRowSize(ii); |
|---|
| 1169 | // add in all relevant values |
|---|
| 1170 | for (int jj = 0; jj < rowSize; jj++) |
|---|
| 1171 | posterior[base + (*mapping2)[row[jj].first]] += row[jj].second; |
|---|
| 1172 | |
|---|
| 1173 | // subtract cutoff |
|---|
| 1174 | for (int jj = 0; jj < matrix->GetSeq2Length(); jj++) { |
|---|
| 1175 | posterior[base + (*mapping2)[jj]] -= cutoff; |
|---|
| 1176 | } |
|---|
| 1177 | |
|---|
| 1178 | } |
|---|
| 1179 | |
|---|
| 1180 | } else { |
|---|
| 1181 | // get the associated sparse matrix |
|---|
| 1182 | SparseMatrix *matrix = sparseMatrices[second][first]; |
|---|
| 1183 | |
|---|
| 1184 | for (int jj = 1; jj <= matrix->GetSeq1Length(); jj++){ |
|---|
| 1185 | SafeVector<PIF>::iterator row = matrix->GetRowPtr(jj); |
|---|
| 1186 | int base = (*mapping2)[jj]; |
|---|
| 1187 | int rowSize = matrix->GetRowSize(jj); |
|---|
| 1188 | |
|---|
| 1189 | // add in all relevant values |
|---|
| 1190 | for (int ii = 0; ii < rowSize; ii++) |
|---|
| 1191 | posterior[base + (*mapping1)[row[ii].first] * (seq2Length + 1)] += row[ii].second; |
|---|
| 1192 | |
|---|
| 1193 | // subtract cutoff |
|---|
| 1194 | for (int ii = 0; ii < matrix->GetSeq2Length(); ii++) |
|---|
| 1195 | posterior[base + (*mapping1)[ii] * (seq2Length + 1)] -= cutoff; |
|---|
| 1196 | } |
|---|
| 1197 | |
|---|
| 1198 | } |
|---|
| 1199 | |
|---|
| 1200 | |
|---|
| 1201 | delete mapping2; |
|---|
| 1202 | } |
|---|
| 1203 | |
|---|
| 1204 | delete mapping1; |
|---|
| 1205 | } |
|---|
| 1206 | |
|---|
| 1207 | return posteriorPtr; |
|---|
| 1208 | } |
|---|
| 1209 | }; |
|---|
| 1210 | } |
|---|
| 1211 | #endif |
|---|