1 | ///////////////////////////////////////////////////////////////// |
---|
2 | // ProbabilisticModel.h |
---|
3 | // |
---|
4 | // Routines for (1) posterior probability computations |
---|
5 | // (2) chained anchoring |
---|
6 | // (3) maximum weight trace alignment |
---|
7 | ///////////////////////////////////////////////////////////////// |
---|
8 | |
---|
9 | #ifndef PROBABILISTICMODEL_H |
---|
10 | #define PROBABILISTICMODEL_H |
---|
11 | |
---|
12 | #include <list> |
---|
13 | #include <cmath> |
---|
14 | #include <cstdio> |
---|
15 | #include "SafeVector.h" |
---|
16 | #include "ScoreType.h" |
---|
17 | #include "SparseMatrix.h" |
---|
18 | #include "MultiSequence.h" |
---|
19 | |
---|
20 | using namespace std; |
---|
21 | |
---|
22 | const int NumMatchStates = 1; // note that in this version the number |
---|
23 | // of match states is fixed at 1...will |
---|
24 | // change in future versions |
---|
25 | const int NumMatrixTypes = NumMatchStates + NumInsertStates * 2; |
---|
26 | |
---|
27 | ///////////////////////////////////////////////////////////////// |
---|
28 | // ProbabilisticModel |
---|
29 | // |
---|
30 | // Class for storing the parameters of a probabilistic model and |
---|
31 | // performing different computations based on those parameters. |
---|
32 | // In particular, this class handles the computation of |
---|
33 | // posterior probabilities that may be used in alignment. |
---|
34 | ///////////////////////////////////////////////////////////////// |
---|
35 | |
---|
36 | class ProbabilisticModel { |
---|
37 | |
---|
38 | float initialDistribution[NumMatrixTypes]; // holds the initial probabilities for each state |
---|
39 | float transProb[NumMatrixTypes][NumMatrixTypes]; // holds all state-to-state transition probabilities |
---|
40 | float matchProb[256][256]; // emission probabilities for match states |
---|
41 | float insProb[256][NumMatrixTypes]; // emission probabilities for insert states |
---|
42 | |
---|
43 | public: |
---|
44 | |
---|
45 | ///////////////////////////////////////////////////////////////// |
---|
46 | // ProbabilisticModel::ProbabilisticModel() |
---|
47 | // |
---|
48 | // Constructor. Builds a new probabilistic model using the |
---|
49 | // given parameters. |
---|
50 | ///////////////////////////////////////////////////////////////// |
---|
51 | |
---|
52 | ProbabilisticModel (const VF &initDistribMat, const VF &gapOpen, const VF &gapExtend, |
---|
53 | const VVF &emitPairs, const VF &emitSingle){ |
---|
54 | |
---|
55 | // build transition matrix |
---|
56 | VVF transMat (NumMatrixTypes, VF (NumMatrixTypes, 0.0f)); |
---|
57 | transMat[0][0] = 1; |
---|
58 | for (int i = 0; i < NumInsertStates; i++){ |
---|
59 | transMat[0][2*i+1] = gapOpen[2*i]; |
---|
60 | transMat[0][2*i+2] = gapOpen[2*i+1]; |
---|
61 | transMat[0][0] -= (gapOpen[2*i] + gapOpen[2*i+1]); |
---|
62 | assert (transMat[0][0] > 0); |
---|
63 | transMat[2*i+1][2*i+1] = gapExtend[2*i]; |
---|
64 | transMat[2*i+2][2*i+2] = gapExtend[2*i+1]; |
---|
65 | transMat[2*i+1][2*i+2] = 0; |
---|
66 | transMat[2*i+2][2*i+1] = 0; |
---|
67 | transMat[2*i+1][0] = 1 - gapExtend[2*i]; |
---|
68 | transMat[2*i+2][0] = 1 - gapExtend[2*i+1]; |
---|
69 | } |
---|
70 | |
---|
71 | // create initial and transition probability matrices |
---|
72 | for (int i = 0; i < NumMatrixTypes; i++){ |
---|
73 | initialDistribution[i] = LOG (initDistribMat[i]); |
---|
74 | for (int j = 0; j < NumMatrixTypes; j++) |
---|
75 | transProb[i][j] = LOG (transMat[i][j]); |
---|
76 | } |
---|
77 | |
---|
78 | // create insertion and match probability matrices |
---|
79 | for (int i = 0; i < 256; i++){ |
---|
80 | for (int j = 0; j < NumMatrixTypes; j++) |
---|
81 | insProb[i][j] = LOG (emitSingle[i]); |
---|
82 | for (int j = 0; j < 256; j++) |
---|
83 | matchProb[i][j] = LOG (emitPairs[i][j]); |
---|
84 | } |
---|
85 | } |
---|
86 | |
---|
87 | ///////////////////////////////////////////////////////////////// |
---|
88 | // ProbabilisticModel::ComputeForwardMatrix() |
---|
89 | // |
---|
90 | // Computes a set of forward probability matrices for aligning |
---|
91 | // seq1 and seq2. |
---|
92 | // |
---|
93 | // For efficiency reasons, a single-dimensional floating-point |
---|
94 | // array is used here, with the following indexing scheme: |
---|
95 | // |
---|
96 | // forward[i + NumMatrixTypes * (j * (seq2Length+1) + k)] |
---|
97 | // refers to the probability of aligning through j characters |
---|
98 | // of the first sequence, k characters of the second sequence, |
---|
99 | // and ending in state i. |
---|
100 | ///////////////////////////////////////////////////////////////// |
---|
101 | |
---|
102 | VF *ComputeForwardMatrix (Sequence *seq1, Sequence *seq2) const { |
---|
103 | |
---|
104 | assert (seq1); |
---|
105 | assert (seq2); |
---|
106 | |
---|
107 | const int seq1Length = seq1->GetLength(); |
---|
108 | const int seq2Length = seq2->GetLength(); |
---|
109 | |
---|
110 | // retrieve the points to the beginning of each sequence |
---|
111 | SafeVector<char>::iterator iter1 = seq1->GetDataPtr(); |
---|
112 | SafeVector<char>::iterator iter2 = seq2->GetDataPtr(); |
---|
113 | |
---|
114 | // create matrix |
---|
115 | VF *forwardPtr = new VF (NumMatrixTypes * (seq1Length+1) * (seq2Length+1), LOG_ZERO); |
---|
116 | assert (forwardPtr); |
---|
117 | VF &forward = *forwardPtr; |
---|
118 | |
---|
119 | // initialization condition |
---|
120 | forward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)] = |
---|
121 | initialDistribution[0] + matchProb[(unsigned char) iter1[1]][(unsigned char) iter2[1]]; |
---|
122 | |
---|
123 | for (int k = 0; k < NumInsertStates; k++){ |
---|
124 | forward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)] = |
---|
125 | initialDistribution[2*k+1] + insProb[(unsigned char) iter1[1]][k]; |
---|
126 | forward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)] = |
---|
127 | initialDistribution[2*k+2] + insProb[(unsigned char) iter2[1]][k]; |
---|
128 | } |
---|
129 | |
---|
130 | // remember offset for each index combination |
---|
131 | int ij = 0; |
---|
132 | int i1j = -seq2Length - 1; |
---|
133 | int ij1 = -1; |
---|
134 | int i1j1 = -seq2Length - 2; |
---|
135 | |
---|
136 | ij *= NumMatrixTypes; |
---|
137 | i1j *= NumMatrixTypes; |
---|
138 | ij1 *= NumMatrixTypes; |
---|
139 | i1j1 *= NumMatrixTypes; |
---|
140 | |
---|
141 | // compute forward scores |
---|
142 | for (int i = 0; i <= seq1Length; i++){ |
---|
143 | unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i]; |
---|
144 | for (int j = 0; j <= seq2Length; j++){ |
---|
145 | unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j]; |
---|
146 | |
---|
147 | if (i > 1 || j > 1){ |
---|
148 | if (i > 0 && j > 0){ |
---|
149 | forward[0 + ij] = forward[0 + i1j1] + transProb[0][0]; |
---|
150 | for (int k = 1; k < NumMatrixTypes; k++) |
---|
151 | LOG_PLUS_EQUALS (forward[0 + ij], forward[k + i1j1] + transProb[k][0]); |
---|
152 | forward[0 + ij] += matchProb[c1][c2]; |
---|
153 | } |
---|
154 | if (i > 0){ |
---|
155 | for (int k = 0; k < NumInsertStates; k++) |
---|
156 | forward[2*k+1 + ij] = insProb[c1][k] + |
---|
157 | LOG_ADD (forward[0 + i1j] + transProb[0][2*k+1], |
---|
158 | forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1]); |
---|
159 | } |
---|
160 | if (j > 0){ |
---|
161 | for (int k = 0; k < NumInsertStates; k++) |
---|
162 | forward[2*k+2 + ij] = insProb[c2][k] + |
---|
163 | LOG_ADD (forward[0 + ij1] + transProb[0][2*k+2], |
---|
164 | forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2]); |
---|
165 | } |
---|
166 | } |
---|
167 | |
---|
168 | ij += NumMatrixTypes; |
---|
169 | i1j += NumMatrixTypes; |
---|
170 | ij1 += NumMatrixTypes; |
---|
171 | i1j1 += NumMatrixTypes; |
---|
172 | } |
---|
173 | } |
---|
174 | |
---|
175 | return forwardPtr; |
---|
176 | } |
---|
177 | |
---|
178 | ///////////////////////////////////////////////////////////////// |
---|
179 | // ProbabilisticModel::ComputeBackwardMatrix() |
---|
180 | // |
---|
181 | // Computes a set of backward probability matrices for aligning |
---|
182 | // seq1 and seq2. |
---|
183 | // |
---|
184 | // For efficiency reasons, a single-dimensional floating-point |
---|
185 | // array is used here, with the following indexing scheme: |
---|
186 | // |
---|
187 | // backward[i + NumMatrixTypes * (j * (seq2Length+1) + k)] |
---|
188 | // refers to the probability of starting in state i and |
---|
189 | // aligning from character j+1 to the end of the first |
---|
190 | // sequence and from character k+1 to the end of the second |
---|
191 | // sequence. |
---|
192 | ///////////////////////////////////////////////////////////////// |
---|
193 | |
---|
194 | VF *ComputeBackwardMatrix (Sequence *seq1, Sequence *seq2) const { |
---|
195 | |
---|
196 | assert (seq1); |
---|
197 | assert (seq2); |
---|
198 | |
---|
199 | const int seq1Length = seq1->GetLength(); |
---|
200 | const int seq2Length = seq2->GetLength(); |
---|
201 | SafeVector<char>::iterator iter1 = seq1->GetDataPtr(); |
---|
202 | SafeVector<char>::iterator iter2 = seq2->GetDataPtr(); |
---|
203 | |
---|
204 | // create matrix |
---|
205 | VF *backwardPtr = new VF (NumMatrixTypes * (seq1Length+1) * (seq2Length+1), LOG_ZERO); |
---|
206 | assert (backwardPtr); |
---|
207 | VF &backward = *backwardPtr; |
---|
208 | |
---|
209 | // initialization condition |
---|
210 | for (int k = 0; k < NumMatrixTypes; k++) |
---|
211 | backward[NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1) + k] = initialDistribution[k]; |
---|
212 | |
---|
213 | // remember offset for each index combination |
---|
214 | int ij = (seq1Length+1) * (seq2Length+1) - 1; |
---|
215 | int i1j = ij + seq2Length + 1; |
---|
216 | int ij1 = ij + 1; |
---|
217 | int i1j1 = ij + seq2Length + 2; |
---|
218 | |
---|
219 | ij *= NumMatrixTypes; |
---|
220 | i1j *= NumMatrixTypes; |
---|
221 | ij1 *= NumMatrixTypes; |
---|
222 | i1j1 *= NumMatrixTypes; |
---|
223 | |
---|
224 | // compute backward scores |
---|
225 | for (int i = seq1Length; i >= 0; i--){ |
---|
226 | unsigned char c1 = (i == seq1Length) ? '~' : (unsigned char) iter1[i+1]; |
---|
227 | for (int j = seq2Length; j >= 0; j--){ |
---|
228 | unsigned char c2 = (j == seq2Length) ? '~' : (unsigned char) iter2[j+1]; |
---|
229 | |
---|
230 | if (i < seq1Length && j < seq2Length){ |
---|
231 | const float ProbXY = backward[0 + i1j1] + matchProb[c1][c2]; |
---|
232 | for (int k = 0; k < NumMatrixTypes; k++) |
---|
233 | LOG_PLUS_EQUALS (backward[k + ij], ProbXY + transProb[k][0]); |
---|
234 | } |
---|
235 | if (i < seq1Length){ |
---|
236 | for (int k = 0; k < NumInsertStates; k++){ |
---|
237 | LOG_PLUS_EQUALS (backward[0 + ij], backward[2*k+1 + i1j] + insProb[c1][k] + transProb[0][2*k+1]); |
---|
238 | LOG_PLUS_EQUALS (backward[2*k+1 + ij], backward[2*k+1 + i1j] + insProb[c1][k] + transProb[2*k+1][2*k+1]); |
---|
239 | } |
---|
240 | } |
---|
241 | if (j < seq2Length){ |
---|
242 | for (int k = 0; k < NumInsertStates; k++){ |
---|
243 | LOG_PLUS_EQUALS (backward[0 + ij], backward[2*k+2 + ij1] + insProb[c2][k] + transProb[0][2*k+2]); |
---|
244 | LOG_PLUS_EQUALS (backward[2*k+2 + ij], backward[2*k+2 + ij1] + insProb[c2][k] + transProb[2*k+2][2*k+2]); |
---|
245 | } |
---|
246 | } |
---|
247 | |
---|
248 | ij -= NumMatrixTypes; |
---|
249 | i1j -= NumMatrixTypes; |
---|
250 | ij1 -= NumMatrixTypes; |
---|
251 | i1j1 -= NumMatrixTypes; |
---|
252 | } |
---|
253 | } |
---|
254 | |
---|
255 | return backwardPtr; |
---|
256 | } |
---|
257 | |
---|
258 | ///////////////////////////////////////////////////////////////// |
---|
259 | // ProbabilisticModel::ComputeTotalProbability() |
---|
260 | // |
---|
261 | // Computes the total probability of an alignment given |
---|
262 | // the forward and backward matrices. |
---|
263 | ///////////////////////////////////////////////////////////////// |
---|
264 | |
---|
265 | float ComputeTotalProbability (int seq1Length, int seq2Length, |
---|
266 | const VF &forward, const VF &backward) const { |
---|
267 | |
---|
268 | // compute total probability |
---|
269 | float totalForwardProb = LOG_ZERO; |
---|
270 | float totalBackwardProb = LOG_ZERO; |
---|
271 | for (int k = 0; k < NumMatrixTypes; k++){ |
---|
272 | LOG_PLUS_EQUALS (totalForwardProb, |
---|
273 | forward[k + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] + |
---|
274 | backward[k + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]); |
---|
275 | } |
---|
276 | |
---|
277 | totalBackwardProb = |
---|
278 | forward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)] + |
---|
279 | backward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)]; |
---|
280 | |
---|
281 | for (int k = 0; k < NumInsertStates; k++){ |
---|
282 | LOG_PLUS_EQUALS (totalBackwardProb, |
---|
283 | forward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)] + |
---|
284 | backward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)]); |
---|
285 | LOG_PLUS_EQUALS (totalBackwardProb, |
---|
286 | forward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)] + |
---|
287 | backward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)]); |
---|
288 | } |
---|
289 | |
---|
290 | // cerr << totalForwardProb << " " << totalBackwardProb << endl; |
---|
291 | |
---|
292 | return (totalForwardProb + totalBackwardProb) / 2; |
---|
293 | } |
---|
294 | |
---|
295 | ///////////////////////////////////////////////////////////////// |
---|
296 | // ProbabilisticModel::ComputePosteriorMatrix() |
---|
297 | // |
---|
298 | // Computes the posterior probability matrix based on |
---|
299 | // the forward and backward matrices. |
---|
300 | ///////////////////////////////////////////////////////////////// |
---|
301 | |
---|
302 | VF *ComputePosteriorMatrix (Sequence *seq1, Sequence *seq2, |
---|
303 | const VF &forward, const VF &backward) const { |
---|
304 | |
---|
305 | assert (seq1); |
---|
306 | assert (seq2); |
---|
307 | |
---|
308 | const int seq1Length = seq1->GetLength(); |
---|
309 | const int seq2Length = seq2->GetLength(); |
---|
310 | |
---|
311 | float totalProb = ComputeTotalProbability (seq1Length, seq2Length, |
---|
312 | forward, backward); |
---|
313 | |
---|
314 | // compute posterior matrices |
---|
315 | VF *posteriorPtr = new VF((seq1Length+1) * (seq2Length+1)); assert (posteriorPtr); |
---|
316 | VF &posterior = *posteriorPtr; |
---|
317 | |
---|
318 | int ij = 0; |
---|
319 | VF::iterator ptr = posterior.begin(); |
---|
320 | |
---|
321 | for (int i = 0; i <= seq1Length; i++){ |
---|
322 | for (int j = 0; j <= seq2Length; j++){ |
---|
323 | *(ptr++) = EXP (min (LOG_ONE, forward[ij] + backward[ij] - totalProb)); |
---|
324 | ij += NumMatrixTypes; |
---|
325 | } |
---|
326 | } |
---|
327 | |
---|
328 | posterior[0] = 0; |
---|
329 | |
---|
330 | return posteriorPtr; |
---|
331 | } |
---|
332 | |
---|
333 | /* |
---|
334 | ///////////////////////////////////////////////////////////////// |
---|
335 | // ProbabilisticModel::ComputeExpectedCounts() |
---|
336 | // |
---|
337 | // Computes the expected counts for the various transitions. |
---|
338 | ///////////////////////////////////////////////////////////////// |
---|
339 | |
---|
340 | VVF *ComputeExpectedCounts () const { |
---|
341 | |
---|
342 | assert (seq1); |
---|
343 | assert (seq2); |
---|
344 | |
---|
345 | const int seq1Length = seq1->GetLength(); |
---|
346 | const int seq2Length = seq2->GetLength(); |
---|
347 | SafeVector<char>::iterator iter1 = seq1->GetDataPtr(); |
---|
348 | SafeVector<char>::iterator iter2 = seq2->GetDataPtr(); |
---|
349 | |
---|
350 | // compute total probability |
---|
351 | float totalProb = ComputeTotalProbability (seq1Length, seq2Length, |
---|
352 | forward, backward); |
---|
353 | |
---|
354 | // initialize expected counts |
---|
355 | VVF *countsPtr = new VVF(NumMatrixTypes + 1, VF(NumMatrixTypes, LOG_ZERO)); assert (countsPtr); |
---|
356 | VVF &counts = *countsPtr; |
---|
357 | |
---|
358 | // remember offset for each index combination |
---|
359 | int ij = 0; |
---|
360 | int i1j = -seq2Length - 1; |
---|
361 | int ij1 = -1; |
---|
362 | int i1j1 = -seq2Length - 2; |
---|
363 | |
---|
364 | ij *= NumMatrixTypes; |
---|
365 | i1j *= NumMatrixTypes; |
---|
366 | ij1 *= NumMatrixTypes; |
---|
367 | i1j1 *= NumMatrixTypes; |
---|
368 | |
---|
369 | // compute expected counts |
---|
370 | for (int i = 0; i <= seq1Length; i++){ |
---|
371 | unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i]; |
---|
372 | for (int j = 0; j <= seq2Length; j++){ |
---|
373 | unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j]; |
---|
374 | |
---|
375 | if (i > 0 && j > 0){ |
---|
376 | for (int k = 0; k < NumMatrixTypes; k++) |
---|
377 | LOG_PLUS_EQUALS (counts[k][0], |
---|
378 | forward[k + i1j1] + transProb[k][0] + |
---|
379 | matchProb[c1][c2] + backward[0 + ij]); |
---|
380 | } |
---|
381 | if (i > 0){ |
---|
382 | for (int k = 0; k < NumInsertStates; k++){ |
---|
383 | LOG_PLUS_EQUALS (counts[0][2*k+1], |
---|
384 | forward[0 + i1j] + transProb[0][2*k+1] + |
---|
385 | insProb[c1][k] + backward[2*k+1 + ij]); |
---|
386 | LOG_PLUS_EQUALS (counts[2*k+1][2*k+1], |
---|
387 | forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1] + |
---|
388 | insProb[c1][k] + backward[2*k+1 + ij]); |
---|
389 | } |
---|
390 | } |
---|
391 | if (j > 0){ |
---|
392 | for (int k = 0; k < NumInsertStates; k++){ |
---|
393 | LOG_PLUS_EQUALS (counts[0][2*k+2], |
---|
394 | forward[0 + ij1] + transProb[0][2*k+2] + |
---|
395 | insProb[c2][k] + backward[2*k+2 + ij]); |
---|
396 | LOG_PLUS_EQUALS (counts[2*k+2][2*k+2], |
---|
397 | forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2] + |
---|
398 | insProb[c2][k] + backward[2*k+2 + ij]); |
---|
399 | } |
---|
400 | } |
---|
401 | |
---|
402 | ij += NumMatrixTypes; |
---|
403 | i1j += NumMatrixTypes; |
---|
404 | ij1 += NumMatrixTypes; |
---|
405 | i1j1 += NumMatrixTypes; |
---|
406 | } |
---|
407 | } |
---|
408 | |
---|
409 | // scale all expected counts appropriately |
---|
410 | for (int i = 0; i < NumMatrixTypes; i++) |
---|
411 | for (int j = 0; j < NumMatrixTypes; j++) |
---|
412 | counts[i][j] -= totalProb; |
---|
413 | |
---|
414 | } |
---|
415 | */ |
---|
416 | |
---|
417 | ///////////////////////////////////////////////////////////////// |
---|
418 | // ProbabilisticModel::ComputeNewParameters() |
---|
419 | // |
---|
420 | // Computes a new parameter set based on the expected counts |
---|
421 | // given. |
---|
422 | ///////////////////////////////////////////////////////////////// |
---|
423 | |
---|
424 | void ComputeNewParameters (Sequence *seq1, Sequence *seq2, |
---|
425 | const VF &forward, const VF &backward, |
---|
426 | VF &initDistribMat, VF &gapOpen, |
---|
427 | VF &gapExtend, VVF &emitPairs, VF &emitSingle, bool enableTrainEmissions) const { |
---|
428 | |
---|
429 | assert (seq1); |
---|
430 | assert (seq2); |
---|
431 | |
---|
432 | const int seq1Length = seq1->GetLength(); |
---|
433 | const int seq2Length = seq2->GetLength(); |
---|
434 | SafeVector<char>::iterator iter1 = seq1->GetDataPtr(); |
---|
435 | SafeVector<char>::iterator iter2 = seq2->GetDataPtr(); |
---|
436 | |
---|
437 | // compute total probability |
---|
438 | float totalProb = ComputeTotalProbability (seq1Length, seq2Length, |
---|
439 | forward, backward); |
---|
440 | |
---|
441 | // initialize expected counts |
---|
442 | VVF transCounts (NumMatrixTypes, VF (NumMatrixTypes, LOG_ZERO)); |
---|
443 | VF initCounts (NumMatrixTypes, LOG_ZERO); |
---|
444 | VVF pairCounts (256, VF (256, LOG_ZERO)); |
---|
445 | VF singleCounts (256, LOG_ZERO); |
---|
446 | |
---|
447 | // remember offset for each index combination |
---|
448 | int ij = 0; |
---|
449 | int i1j = -seq2Length - 1; |
---|
450 | int ij1 = -1; |
---|
451 | int i1j1 = -seq2Length - 2; |
---|
452 | |
---|
453 | ij *= NumMatrixTypes; |
---|
454 | i1j *= NumMatrixTypes; |
---|
455 | ij1 *= NumMatrixTypes; |
---|
456 | i1j1 *= NumMatrixTypes; |
---|
457 | |
---|
458 | // compute initial distribution posteriors |
---|
459 | initCounts[0] = LOG_ADD (forward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)] + |
---|
460 | backward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)], |
---|
461 | forward[0 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] + |
---|
462 | backward[0 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]); |
---|
463 | for (int k = 0; k < NumInsertStates; k++){ |
---|
464 | initCounts[2*k+1] = LOG_ADD (forward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)] + |
---|
465 | backward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)], |
---|
466 | forward[2*k+1 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] + |
---|
467 | backward[2*k+1 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]); |
---|
468 | initCounts[2*k+2] = LOG_ADD (forward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)] + |
---|
469 | backward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)], |
---|
470 | forward[2*k+2 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] + |
---|
471 | backward[2*k+2 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]); |
---|
472 | } |
---|
473 | |
---|
474 | // compute expected counts |
---|
475 | for (int i = 0; i <= seq1Length; i++){ |
---|
476 | unsigned char c1 = (i == 0) ? '~' : (unsigned char) toupper(iter1[i]); |
---|
477 | for (int j = 0; j <= seq2Length; j++){ |
---|
478 | unsigned char c2 = (j == 0) ? '~' : (unsigned char) toupper(iter2[j]); |
---|
479 | |
---|
480 | if (i > 0 && j > 0){ |
---|
481 | if (enableTrainEmissions && i == 1 && j == 1){ |
---|
482 | LOG_PLUS_EQUALS (pairCounts[c1][c2], |
---|
483 | initialDistribution[0] + matchProb[c1][c2] + backward[0 + ij]); |
---|
484 | LOG_PLUS_EQUALS (pairCounts[c2][c1], |
---|
485 | initialDistribution[0] + matchProb[c2][c1] + backward[0 + ij]); |
---|
486 | } |
---|
487 | |
---|
488 | for (int k = 0; k < NumMatrixTypes; k++){ |
---|
489 | LOG_PLUS_EQUALS (transCounts[k][0], |
---|
490 | forward[k + i1j1] + transProb[k][0] + |
---|
491 | matchProb[c1][c2] + backward[0 + ij]); |
---|
492 | if ((enableTrainEmissions && i != 1) || j != 1){ // parenthesis inserted to silence warning. previously was: (enableTrainEmissions && i != 1 || j != 1). possible bug? |
---|
493 | LOG_PLUS_EQUALS (pairCounts[c1][c2], |
---|
494 | forward[k + i1j1] + transProb[k][0] + |
---|
495 | matchProb[c1][c2] + backward[0 + ij]); |
---|
496 | LOG_PLUS_EQUALS (pairCounts[c2][c1], |
---|
497 | forward[k + i1j1] + transProb[k][0] + |
---|
498 | matchProb[c2][c1] + backward[0 + ij]); |
---|
499 | } |
---|
500 | } |
---|
501 | } |
---|
502 | if (i > 0){ |
---|
503 | for (int k = 0; k < NumInsertStates; k++){ |
---|
504 | LOG_PLUS_EQUALS (transCounts[0][2*k+1], |
---|
505 | forward[0 + i1j] + transProb[0][2*k+1] + |
---|
506 | insProb[c1][k] + backward[2*k+1 + ij]); |
---|
507 | LOG_PLUS_EQUALS (transCounts[2*k+1][2*k+1], |
---|
508 | forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1] + |
---|
509 | insProb[c1][k] + backward[2*k+1 + ij]); |
---|
510 | if (enableTrainEmissions){ |
---|
511 | if (i == 1 && j == 0){ |
---|
512 | LOG_PLUS_EQUALS (singleCounts[c1], |
---|
513 | initialDistribution[2*k+1] + insProb[c1][k] + backward[2*k+1 + ij]); |
---|
514 | } |
---|
515 | else { |
---|
516 | LOG_PLUS_EQUALS (singleCounts[c1], |
---|
517 | forward[0 + i1j] + transProb[0][2*k+1] + |
---|
518 | insProb[c1][k] + backward[2*k+1 + ij]); |
---|
519 | LOG_PLUS_EQUALS (singleCounts[c1], |
---|
520 | forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1] + |
---|
521 | insProb[c1][k] + backward[2*k+1 + ij]); |
---|
522 | } |
---|
523 | } |
---|
524 | } |
---|
525 | } |
---|
526 | if (j > 0){ |
---|
527 | for (int k = 0; k < NumInsertStates; k++){ |
---|
528 | LOG_PLUS_EQUALS (transCounts[0][2*k+2], |
---|
529 | forward[0 + ij1] + transProb[0][2*k+2] + |
---|
530 | insProb[c2][k] + backward[2*k+2 + ij]); |
---|
531 | LOG_PLUS_EQUALS (transCounts[2*k+2][2*k+2], |
---|
532 | forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2] + |
---|
533 | insProb[c2][k] + backward[2*k+2 + ij]); |
---|
534 | if (enableTrainEmissions){ |
---|
535 | if (i == 0 && j == 1){ |
---|
536 | LOG_PLUS_EQUALS (singleCounts[c2], |
---|
537 | initialDistribution[2*k+2] + insProb[c2][k] + backward[2*k+2 + ij]); |
---|
538 | } |
---|
539 | else { |
---|
540 | LOG_PLUS_EQUALS (singleCounts[c2], |
---|
541 | forward[0 + ij1] + transProb[0][2*k+2] + |
---|
542 | insProb[c2][k] + backward[2*k+2 + ij]); |
---|
543 | LOG_PLUS_EQUALS (singleCounts[c2], |
---|
544 | forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2] + |
---|
545 | insProb[c2][k] + backward[2*k+2 + ij]); |
---|
546 | } |
---|
547 | } |
---|
548 | } |
---|
549 | } |
---|
550 | |
---|
551 | ij += NumMatrixTypes; |
---|
552 | i1j += NumMatrixTypes; |
---|
553 | ij1 += NumMatrixTypes; |
---|
554 | i1j1 += NumMatrixTypes; |
---|
555 | } |
---|
556 | } |
---|
557 | |
---|
558 | // scale all expected counts appropriately |
---|
559 | for (int i = 0; i < NumMatrixTypes; i++){ |
---|
560 | initCounts[i] -= totalProb; |
---|
561 | for (int j = 0; j < NumMatrixTypes; j++) |
---|
562 | transCounts[i][j] -= totalProb; |
---|
563 | } |
---|
564 | if (enableTrainEmissions){ |
---|
565 | for (int i = 0; i < 256; i++){ |
---|
566 | for (int j = 0; j < 256; j++) |
---|
567 | pairCounts[i][j] -= totalProb; |
---|
568 | singleCounts[i] -= totalProb; |
---|
569 | } |
---|
570 | } |
---|
571 | |
---|
572 | // compute new initial distribution |
---|
573 | float totalInitDistribCounts = 0; |
---|
574 | for (int i = 0; i < NumMatrixTypes; i++) |
---|
575 | totalInitDistribCounts += exp (initCounts[i]); // should be 2 |
---|
576 | initDistribMat[0] = min (1.0f, max (0.0f, (float) exp (initCounts[0]) / totalInitDistribCounts)); |
---|
577 | for (int k = 0; k < NumInsertStates; k++){ |
---|
578 | float val = (exp (initCounts[2*k+1]) + exp (initCounts[2*k+2])) / 2; |
---|
579 | initDistribMat[2*k+1] = initDistribMat[2*k+2] = min (1.0f, max (0.0f, val / totalInitDistribCounts)); |
---|
580 | } |
---|
581 | |
---|
582 | // compute total counts for match state |
---|
583 | float inMatchStateCounts = 0; |
---|
584 | for (int i = 0; i < NumMatrixTypes; i++) |
---|
585 | inMatchStateCounts += exp (transCounts[0][i]); |
---|
586 | for (int i = 0; i < NumInsertStates; i++){ |
---|
587 | |
---|
588 | // compute total counts for gap state |
---|
589 | float inGapStateCounts = |
---|
590 | exp (transCounts[2*i+1][0]) + |
---|
591 | exp (transCounts[2*i+1][2*i+1]) + |
---|
592 | exp (transCounts[2*i+2][0]) + |
---|
593 | exp (transCounts[2*i+2][2*i+2]); |
---|
594 | |
---|
595 | gapOpen[2*i] = gapOpen[2*i+1] = |
---|
596 | (exp (transCounts[0][2*i+1]) + |
---|
597 | exp (transCounts[0][2*i+2])) / |
---|
598 | (2 * inMatchStateCounts); |
---|
599 | |
---|
600 | gapExtend[2*i] = gapExtend[2*i+1] = |
---|
601 | (exp (transCounts[2*i+1][2*i+1]) + |
---|
602 | exp (transCounts[2*i+2][2*i+2])) / |
---|
603 | inGapStateCounts; |
---|
604 | } |
---|
605 | |
---|
606 | if (enableTrainEmissions){ |
---|
607 | float totalPairCounts = 0; |
---|
608 | float totalSingleCounts = 0; |
---|
609 | for (int i = 0; i < 256; i++){ |
---|
610 | for (int j = 0; j <= i; j++) |
---|
611 | totalPairCounts += exp (pairCounts[j][i]); |
---|
612 | totalSingleCounts += exp (singleCounts[i]); |
---|
613 | } |
---|
614 | |
---|
615 | for (int i = 0; i < 256; i++) if (!islower ((char) i)){ |
---|
616 | int li = (int)((unsigned char) tolower ((char) i)); |
---|
617 | for (int j = 0; j <= i; j++) if (!islower ((char) j)){ |
---|
618 | int lj = (int)((unsigned char) tolower ((char) j)); |
---|
619 | emitPairs[i][j] = emitPairs[i][lj] = emitPairs[li][j] = emitPairs[li][lj] = |
---|
620 | emitPairs[j][i] = emitPairs[j][li] = emitPairs[lj][i] = emitPairs[lj][li] = exp(pairCounts[j][i]) / totalPairCounts; |
---|
621 | } |
---|
622 | emitSingle[i] = emitSingle[li] = exp(singleCounts[i]) / totalSingleCounts; |
---|
623 | } |
---|
624 | } |
---|
625 | } |
---|
626 | |
---|
627 | ///////////////////////////////////////////////////////////////// |
---|
628 | // ProbabilisticModel::ComputeAlignment() |
---|
629 | // |
---|
630 | // Computes an alignment based on the given posterior matrix. |
---|
631 | // This is done by finding the maximum summing path (or |
---|
632 | // maximum weight trace) through the posterior matrix. The |
---|
633 | // final alignment is returned as a pair consisting of: |
---|
634 | // (1) a string (e.g., XXXBBXXXBBBBBBYYYYBBB) where X's and |
---|
635 | // denote insertions in one of the two sequences and |
---|
636 | // B's denote that both sequences are present (i.e. |
---|
637 | // matches). |
---|
638 | // (2) a float indicating the sum achieved |
---|
639 | ///////////////////////////////////////////////////////////////// |
---|
640 | |
---|
641 | pair<SafeVector<char> *, float> ComputeAlignment (int seq1Length, int seq2Length, |
---|
642 | const VF &posterior) const { |
---|
643 | |
---|
644 | float *twoRows = new float[(seq2Length+1)*2]; assert (twoRows); |
---|
645 | float *oldRow = twoRows; |
---|
646 | float *newRow = twoRows + seq2Length + 1; |
---|
647 | |
---|
648 | char *tracebackMatrix = new char[(seq1Length+1)*(seq2Length+1)]; assert (tracebackMatrix); |
---|
649 | char *tracebackPtr = tracebackMatrix; |
---|
650 | |
---|
651 | VF::const_iterator posteriorPtr = posterior.begin() + seq2Length + 1; |
---|
652 | |
---|
653 | // initialization |
---|
654 | for (int i = 0; i <= seq2Length; i++){ |
---|
655 | oldRow[i] = 0; |
---|
656 | *(tracebackPtr++) = 'L'; |
---|
657 | } |
---|
658 | |
---|
659 | // fill in matrix |
---|
660 | for (int i = 1; i <= seq1Length; i++){ |
---|
661 | |
---|
662 | // initialize left column |
---|
663 | newRow[0] = 0; |
---|
664 | posteriorPtr++; |
---|
665 | *(tracebackPtr++) = 'U'; |
---|
666 | |
---|
667 | // fill in rest of row |
---|
668 | for (int j = 1; j <= seq2Length; j++){ |
---|
669 | ChooseBestOfThree (*(posteriorPtr++) + oldRow[j-1], newRow[j-1], oldRow[j], |
---|
670 | 'D', 'L', 'U', &newRow[j], tracebackPtr++); |
---|
671 | } |
---|
672 | |
---|
673 | // swap rows |
---|
674 | float *temp = oldRow; |
---|
675 | oldRow = newRow; |
---|
676 | newRow = temp; |
---|
677 | } |
---|
678 | |
---|
679 | // store best score |
---|
680 | float total = oldRow[seq2Length]; |
---|
681 | delete [] twoRows; |
---|
682 | |
---|
683 | // compute traceback |
---|
684 | SafeVector<char> *alignment = new SafeVector<char>; assert (alignment); |
---|
685 | int r = seq1Length, c = seq2Length; |
---|
686 | while (r != 0 || c != 0){ |
---|
687 | char ch = tracebackMatrix[r*(seq2Length+1) + c]; |
---|
688 | switch (ch){ |
---|
689 | case 'L': c--; alignment->push_back ('Y'); break; |
---|
690 | case 'U': r--; alignment->push_back ('X'); break; |
---|
691 | case 'D': c--; r--; alignment->push_back ('B'); break; |
---|
692 | default: assert (false); |
---|
693 | } |
---|
694 | } |
---|
695 | |
---|
696 | delete [] tracebackMatrix; |
---|
697 | |
---|
698 | reverse (alignment->begin(), alignment->end()); |
---|
699 | |
---|
700 | return make_pair(alignment, total); |
---|
701 | } |
---|
702 | |
---|
703 | ///////////////////////////////////////////////////////////////// |
---|
704 | // ProbabilisticModel::ComputeAlignmentWithGapPenalties() |
---|
705 | // |
---|
706 | // Similar to ComputeAlignment() except with gap penalties. |
---|
707 | ///////////////////////////////////////////////////////////////// |
---|
708 | |
---|
709 | pair<SafeVector<char> *, float> ComputeAlignmentWithGapPenalties (MultiSequence *align1, |
---|
710 | MultiSequence *align2, |
---|
711 | const VF &posterior, int numSeqs1, |
---|
712 | int numSeqs2, |
---|
713 | float gapOpenPenalty, |
---|
714 | float gapContinuePenalty) const { |
---|
715 | int seq1Length = align1->GetSequence(0)->GetLength(); |
---|
716 | int seq2Length = align2->GetSequence(0)->GetLength(); |
---|
717 | SafeVector<SafeVector<char>::iterator > dataPtrs1 (align1->GetNumSequences()); |
---|
718 | SafeVector<SafeVector<char>::iterator > dataPtrs2 (align2->GetNumSequences()); |
---|
719 | |
---|
720 | // grab character data |
---|
721 | for (int i = 0; i < align1->GetNumSequences(); i++) |
---|
722 | dataPtrs1[i] = align1->GetSequence(i)->GetDataPtr(); |
---|
723 | for (int i = 0; i < align2->GetNumSequences(); i++) |
---|
724 | dataPtrs2[i] = align2->GetSequence(i)->GetDataPtr(); |
---|
725 | |
---|
726 | // the number of active sequences at any given column is defined to be the |
---|
727 | // number of non-gap characters in that column; the number of gap opens at |
---|
728 | // any given column is defined to be the number of gap characters in that |
---|
729 | // column where the previous character in the respective sequence was not |
---|
730 | // a gap |
---|
731 | SafeVector<int> numActive1 (seq1Length+1), numGapOpens1 (seq1Length+1); |
---|
732 | SafeVector<int> numActive2 (seq2Length+1), numGapOpens2 (seq2Length+1); |
---|
733 | |
---|
734 | // compute number of active sequences and gap opens for each group |
---|
735 | for (int i = 0; i < align1->GetNumSequences(); i++){ |
---|
736 | SafeVector<char>::iterator dataPtr = align1->GetSequence(i)->GetDataPtr(); |
---|
737 | numActive1[0] = numGapOpens1[0] = 0; |
---|
738 | for (int j = 1; j <= seq1Length; j++){ |
---|
739 | if (dataPtr[j] != '-'){ |
---|
740 | numActive1[j]++; |
---|
741 | numGapOpens1[j] += (j != 1 && dataPtr[j-1] != '-'); |
---|
742 | } |
---|
743 | } |
---|
744 | } |
---|
745 | for (int i = 0; i < align2->GetNumSequences(); i++){ |
---|
746 | SafeVector<char>::iterator dataPtr = align2->GetSequence(i)->GetDataPtr(); |
---|
747 | numActive2[0] = numGapOpens2[0] = 0; |
---|
748 | for (int j = 1; j <= seq2Length; j++){ |
---|
749 | if (dataPtr[j] != '-'){ |
---|
750 | numActive2[j]++; |
---|
751 | numGapOpens2[j] += (j != 1 && dataPtr[j-1] != '-'); |
---|
752 | } |
---|
753 | } |
---|
754 | } |
---|
755 | |
---|
756 | VVF openingPenalty1 (numSeqs1+1, VF (numSeqs2+1)); |
---|
757 | VF continuingPenalty1 (numSeqs1+1); |
---|
758 | VVF openingPenalty2 (numSeqs1+1, VF (numSeqs2+1)); |
---|
759 | VF continuingPenalty2 (numSeqs2+1); |
---|
760 | |
---|
761 | // precompute penalties |
---|
762 | for (int i = 0; i <= numSeqs1; i++) |
---|
763 | for (int j = 0; j <= numSeqs2; j++) |
---|
764 | openingPenalty1[i][j] = i * (gapOpenPenalty * j + gapContinuePenalty * (numSeqs2 - j)); |
---|
765 | for (int i = 0; i <= numSeqs1; i++) |
---|
766 | continuingPenalty1[i] = i * gapContinuePenalty * numSeqs2; |
---|
767 | for (int i = 0; i <= numSeqs2; i++) |
---|
768 | for (int j = 0; j <= numSeqs1; j++) |
---|
769 | openingPenalty2[i][j] = i * (gapOpenPenalty * j + gapContinuePenalty * (numSeqs1 - j)); |
---|
770 | for (int i = 0; i <= numSeqs2; i++) |
---|
771 | continuingPenalty2[i] = i * gapContinuePenalty * numSeqs1; |
---|
772 | |
---|
773 | float *twoRows = new float[6*(seq2Length+1)]; assert (twoRows); |
---|
774 | float *oldRowMatch = twoRows; |
---|
775 | float *newRowMatch = twoRows + (seq2Length+1); |
---|
776 | float *oldRowInsertX = twoRows + 2*(seq2Length+1); |
---|
777 | float *newRowInsertX = twoRows + 3*(seq2Length+1); |
---|
778 | float *oldRowInsertY = twoRows + 4*(seq2Length+1); |
---|
779 | float *newRowInsertY = twoRows + 5*(seq2Length+1); |
---|
780 | |
---|
781 | char *tracebackMatrix = new char[3*(seq1Length+1)*(seq2Length+1)]; assert (tracebackMatrix); |
---|
782 | char *tracebackPtr = tracebackMatrix; |
---|
783 | |
---|
784 | VF::const_iterator posteriorPtr = posterior.begin() + seq2Length + 1; |
---|
785 | |
---|
786 | // initialization |
---|
787 | for (int i = 0; i <= seq2Length; i++){ |
---|
788 | oldRowMatch[i] = oldRowInsertX[i] = (i == 0) ? 0 : LOG_ZERO; |
---|
789 | oldRowInsertY[i] = (i == 0) ? 0 : oldRowInsertY[i-1] + continuingPenalty2[numActive2[i]]; |
---|
790 | *(tracebackPtr) = *(tracebackPtr+1) = *(tracebackPtr+2) = 'Y'; |
---|
791 | tracebackPtr += 3; |
---|
792 | } |
---|
793 | |
---|
794 | // fill in matrix |
---|
795 | for (int i = 1; i <= seq1Length; i++){ |
---|
796 | |
---|
797 | // initialize left column |
---|
798 | newRowMatch[0] = newRowInsertY[0] = LOG_ZERO; |
---|
799 | newRowInsertX[0] = oldRowInsertX[0] + continuingPenalty1[numActive1[i]]; |
---|
800 | posteriorPtr++; |
---|
801 | *(tracebackPtr) = *(tracebackPtr+1) = *(tracebackPtr+2) = 'X'; |
---|
802 | tracebackPtr += 3; |
---|
803 | |
---|
804 | // fill in rest of row |
---|
805 | for (int j = 1; j <= seq2Length; j++){ |
---|
806 | |
---|
807 | // going to MATCH state |
---|
808 | ChooseBestOfThree (oldRowMatch[j-1], |
---|
809 | oldRowInsertX[j-1], |
---|
810 | oldRowInsertY[j-1], |
---|
811 | 'M', 'X', 'Y', &newRowMatch[j], tracebackPtr++); |
---|
812 | newRowMatch[j] += *(posteriorPtr++); |
---|
813 | |
---|
814 | // going to INSERT X state |
---|
815 | ChooseBestOfThree (oldRowMatch[j] + openingPenalty1[numActive1[i]][numGapOpens2[j]], |
---|
816 | oldRowInsertX[j] + continuingPenalty1[numActive1[i]], |
---|
817 | oldRowInsertY[j] + openingPenalty1[numActive1[i]][numGapOpens2[j]], |
---|
818 | 'M', 'X', 'Y', &newRowInsertX[j], tracebackPtr++); |
---|
819 | |
---|
820 | // going to INSERT Y state |
---|
821 | ChooseBestOfThree (newRowMatch[j-1] + openingPenalty2[numActive2[j]][numGapOpens1[i]], |
---|
822 | newRowInsertX[j-1] + openingPenalty2[numActive2[j]][numGapOpens1[i]], |
---|
823 | newRowInsertY[j-1] + continuingPenalty2[numActive2[j]], |
---|
824 | 'M', 'X', 'Y', &newRowInsertY[j], tracebackPtr++); |
---|
825 | } |
---|
826 | |
---|
827 | // swap rows |
---|
828 | float *temp; |
---|
829 | temp = oldRowMatch; oldRowMatch = newRowMatch; newRowMatch = temp; |
---|
830 | temp = oldRowInsertX; oldRowInsertX = newRowInsertX; newRowInsertX = temp; |
---|
831 | temp = oldRowInsertY; oldRowInsertY = newRowInsertY; newRowInsertY = temp; |
---|
832 | } |
---|
833 | |
---|
834 | // store best score |
---|
835 | float total; |
---|
836 | char matrix; |
---|
837 | ChooseBestOfThree (oldRowMatch[seq2Length], oldRowInsertX[seq2Length], oldRowInsertY[seq2Length], |
---|
838 | 'M', 'X', 'Y', &total, &matrix); |
---|
839 | |
---|
840 | delete [] twoRows; |
---|
841 | |
---|
842 | // compute traceback |
---|
843 | SafeVector<char> *alignment = new SafeVector<char>; assert (alignment); |
---|
844 | int r = seq1Length, c = seq2Length; |
---|
845 | while (r != 0 || c != 0){ |
---|
846 | |
---|
847 | int offset = (matrix == 'M') ? 0 : (matrix == 'X') ? 1 : 2; |
---|
848 | char ch = tracebackMatrix[(r*(seq2Length+1) + c) * 3 + offset]; |
---|
849 | switch (matrix){ |
---|
850 | case 'Y': c--; alignment->push_back ('Y'); break; |
---|
851 | case 'X': r--; alignment->push_back ('X'); break; |
---|
852 | case 'M': c--; r--; alignment->push_back ('B'); break; |
---|
853 | default: assert (false); |
---|
854 | } |
---|
855 | matrix = ch; |
---|
856 | } |
---|
857 | |
---|
858 | delete [] tracebackMatrix; |
---|
859 | |
---|
860 | reverse (alignment->begin(), alignment->end()); |
---|
861 | |
---|
862 | return make_pair(alignment, 1.0f); |
---|
863 | } |
---|
864 | |
---|
865 | ///////////////////////////////////////////////////////////////// |
---|
866 | // ProbabilisticModel::ComputeViterbiAlignment() |
---|
867 | // |
---|
868 | // Computes the highest probability pairwise alignment using the |
---|
869 | // probabilistic model. The final alignment is returned as a |
---|
870 | // pair consisting of: |
---|
871 | // (1) a string (e.g., XXXBBXXXBBBBBBYYYYBBB) where X's and |
---|
872 | // denote insertions in one of the two sequences and |
---|
873 | // B's denote that both sequences are present (i.e. |
---|
874 | // matches). |
---|
875 | // (2) a float containing the log probability of the best |
---|
876 | // alignment (not used) |
---|
877 | ///////////////////////////////////////////////////////////////// |
---|
878 | |
---|
879 | pair<SafeVector<char> *, float> ComputeViterbiAlignment (Sequence *seq1, Sequence *seq2) const { |
---|
880 | |
---|
881 | assert (seq1); |
---|
882 | assert (seq2); |
---|
883 | |
---|
884 | const int seq1Length = seq1->GetLength(); |
---|
885 | const int seq2Length = seq2->GetLength(); |
---|
886 | |
---|
887 | // retrieve the points to the beginning of each sequence |
---|
888 | SafeVector<char>::iterator iter1 = seq1->GetDataPtr(); |
---|
889 | SafeVector<char>::iterator iter2 = seq2->GetDataPtr(); |
---|
890 | |
---|
891 | // create viterbi matrix |
---|
892 | VF *viterbiPtr = new VF (NumMatrixTypes * (seq1Length+1) * (seq2Length+1), LOG_ZERO); |
---|
893 | assert (viterbiPtr); |
---|
894 | VF &viterbi = *viterbiPtr; |
---|
895 | |
---|
896 | // create traceback matrix |
---|
897 | VI *tracebackPtr = new VI (NumMatrixTypes * (seq1Length+1) * (seq2Length+1), -1); |
---|
898 | assert (tracebackPtr); |
---|
899 | VI &traceback = *tracebackPtr; |
---|
900 | |
---|
901 | // initialization condition |
---|
902 | for (int k = 0; k < NumMatrixTypes; k++) |
---|
903 | viterbi[k] = initialDistribution[k]; |
---|
904 | |
---|
905 | // remember offset for each index combination |
---|
906 | int ij = 0; |
---|
907 | int i1j = -seq2Length - 1; |
---|
908 | int ij1 = -1; |
---|
909 | int i1j1 = -seq2Length - 2; |
---|
910 | |
---|
911 | ij *= NumMatrixTypes; |
---|
912 | i1j *= NumMatrixTypes; |
---|
913 | ij1 *= NumMatrixTypes; |
---|
914 | i1j1 *= NumMatrixTypes; |
---|
915 | |
---|
916 | // compute viterbi scores |
---|
917 | for (int i = 0; i <= seq1Length; i++){ |
---|
918 | unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i]; |
---|
919 | for (int j = 0; j <= seq2Length; j++){ |
---|
920 | unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j]; |
---|
921 | |
---|
922 | if (i > 0 && j > 0){ |
---|
923 | for (int k = 0; k < NumMatrixTypes; k++){ |
---|
924 | float newVal = viterbi[k + i1j1] + transProb[k][0] + matchProb[c1][c2]; |
---|
925 | if (viterbi[0 + ij] < newVal){ |
---|
926 | viterbi[0 + ij] = newVal; |
---|
927 | traceback[0 + ij] = k; |
---|
928 | } |
---|
929 | } |
---|
930 | } |
---|
931 | if (i > 0){ |
---|
932 | for (int k = 0; k < NumInsertStates; k++){ |
---|
933 | float valFromMatch = insProb[c1][k] + viterbi[0 + i1j] + transProb[0][2*k+1]; |
---|
934 | float valFromIns = insProb[c1][k] + viterbi[2*k+1 + i1j] + transProb[2*k+1][2*k+1]; |
---|
935 | if (valFromMatch >= valFromIns){ |
---|
936 | viterbi[2*k+1 + ij] = valFromMatch; |
---|
937 | traceback[2*k+1 + ij] = 0; |
---|
938 | } |
---|
939 | else { |
---|
940 | viterbi[2*k+1 + ij] = valFromIns; |
---|
941 | traceback[2*k+1 + ij] = 2*k+1; |
---|
942 | } |
---|
943 | } |
---|
944 | } |
---|
945 | if (j > 0){ |
---|
946 | for (int k = 0; k < NumInsertStates; k++){ |
---|
947 | float valFromMatch = insProb[c2][k] + viterbi[0 + ij1] + transProb[0][2*k+2]; |
---|
948 | float valFromIns = insProb[c2][k] + viterbi[2*k+2 + ij1] + transProb[2*k+2][2*k+2]; |
---|
949 | if (valFromMatch >= valFromIns){ |
---|
950 | viterbi[2*k+2 + ij] = valFromMatch; |
---|
951 | traceback[2*k+2 + ij] = 0; |
---|
952 | } |
---|
953 | else { |
---|
954 | viterbi[2*k+2 + ij] = valFromIns; |
---|
955 | traceback[2*k+2 + ij] = 2*k+2; |
---|
956 | } |
---|
957 | } |
---|
958 | } |
---|
959 | |
---|
960 | ij += NumMatrixTypes; |
---|
961 | i1j += NumMatrixTypes; |
---|
962 | ij1 += NumMatrixTypes; |
---|
963 | i1j1 += NumMatrixTypes; |
---|
964 | } |
---|
965 | } |
---|
966 | |
---|
967 | // figure out best terminating cell |
---|
968 | float bestProb = LOG_ZERO; |
---|
969 | int state = -1; |
---|
970 | for (int k = 0; k < NumMatrixTypes; k++){ |
---|
971 | float thisProb = viterbi[k + NumMatrixTypes * ((seq1Length+1)*(seq2Length+1) - 1)] + initialDistribution[k]; |
---|
972 | if (bestProb < thisProb){ |
---|
973 | bestProb = thisProb; |
---|
974 | state = k; |
---|
975 | } |
---|
976 | } |
---|
977 | assert (state != -1); |
---|
978 | |
---|
979 | delete viterbiPtr; |
---|
980 | |
---|
981 | // compute traceback |
---|
982 | SafeVector<char> *alignment = new SafeVector<char>; assert (alignment); |
---|
983 | int r = seq1Length, c = seq2Length; |
---|
984 | while (r != 0 || c != 0){ |
---|
985 | int newState = traceback[state + NumMatrixTypes * (r * (seq2Length+1) + c)]; |
---|
986 | |
---|
987 | if (state == 0){ c--; r--; alignment->push_back ('B'); } |
---|
988 | else if (state % 2 == 1){ r--; alignment->push_back ('X'); } |
---|
989 | else { c--; alignment->push_back ('Y'); } |
---|
990 | |
---|
991 | state = newState; |
---|
992 | } |
---|
993 | |
---|
994 | delete tracebackPtr; |
---|
995 | |
---|
996 | reverse (alignment->begin(), alignment->end()); |
---|
997 | |
---|
998 | return make_pair(alignment, bestProb); |
---|
999 | } |
---|
1000 | |
---|
1001 | ///////////////////////////////////////////////////////////////// |
---|
1002 | // ProbabilisticModel::BuildPosterior() |
---|
1003 | // |
---|
1004 | // Builds a posterior probability matrix needed to align a pair |
---|
1005 | // of alignments. Mathematically, the returned matrix M is |
---|
1006 | // defined as follows: |
---|
1007 | // M[i,j] = sum sum f(s,t,i,j) |
---|
1008 | // s in align1 t in align2 |
---|
1009 | // where |
---|
1010 | // [ P(s[i'] <--> t[j']) |
---|
1011 | // [ if s[i'] is a letter in the ith column of align1 and |
---|
1012 | // [ t[j'] it a letter in the jth column of align2 |
---|
1013 | // f(s,t,i,j) = [ |
---|
1014 | // [ 0 otherwise |
---|
1015 | // |
---|
1016 | ///////////////////////////////////////////////////////////////// |
---|
1017 | |
---|
1018 | VF *BuildPosterior (MultiSequence *align1, MultiSequence *align2, |
---|
1019 | const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices, |
---|
1020 | float cutoff = 0.0f) const { |
---|
1021 | const int seq1Length = align1->GetSequence(0)->GetLength(); |
---|
1022 | const int seq2Length = align2->GetSequence(0)->GetLength(); |
---|
1023 | |
---|
1024 | VF *posteriorPtr = new VF((seq1Length+1) * (seq2Length+1), 0); assert (posteriorPtr); |
---|
1025 | VF &posterior = *posteriorPtr; |
---|
1026 | // VF::iterator postPtr = posterior.begin(); |
---|
1027 | |
---|
1028 | // for each s in align1 |
---|
1029 | for (int i = 0; i < align1->GetNumSequences(); i++){ |
---|
1030 | int first = align1->GetSequence(i)->GetLabel(); |
---|
1031 | SafeVector<int> *mapping1 = align1->GetSequence(i)->GetMapping(); |
---|
1032 | |
---|
1033 | // for each t in align2 |
---|
1034 | for (int j = 0; j < align2->GetNumSequences(); j++){ |
---|
1035 | int second = align2->GetSequence(j)->GetLabel(); |
---|
1036 | SafeVector<int> *mapping2 = align2->GetSequence(j)->GetMapping(); |
---|
1037 | |
---|
1038 | if (first < second){ |
---|
1039 | |
---|
1040 | // get the associated sparse matrix |
---|
1041 | SparseMatrix *matrix = sparseMatrices[first][second]; |
---|
1042 | |
---|
1043 | for (int ii = 1; ii <= matrix->GetSeq1Length(); ii++){ |
---|
1044 | SafeVector<PIF>::iterator row = matrix->GetRowPtr(ii); |
---|
1045 | int base = (*mapping1)[ii] * (seq2Length+1); |
---|
1046 | int rowSize = matrix->GetRowSize(ii); |
---|
1047 | |
---|
1048 | // add in all relevant values |
---|
1049 | for (int jj = 0; jj < rowSize; jj++) |
---|
1050 | posterior[base + (*mapping2)[row[jj].first]] += row[jj].second; |
---|
1051 | |
---|
1052 | // subtract cutoff |
---|
1053 | for (int jj = 0; jj < matrix->GetSeq2Length(); jj++) |
---|
1054 | posterior[base + (*mapping2)[jj]] -= cutoff; |
---|
1055 | } |
---|
1056 | |
---|
1057 | } else { |
---|
1058 | |
---|
1059 | // get the associated sparse matrix |
---|
1060 | SparseMatrix *matrix = sparseMatrices[second][first]; |
---|
1061 | |
---|
1062 | for (int jj = 1; jj <= matrix->GetSeq1Length(); jj++){ |
---|
1063 | SafeVector<PIF>::iterator row = matrix->GetRowPtr(jj); |
---|
1064 | int base = (*mapping2)[jj]; |
---|
1065 | int rowSize = matrix->GetRowSize(jj); |
---|
1066 | |
---|
1067 | // add in all relevant values |
---|
1068 | for (int ii = 0; ii < rowSize; ii++) |
---|
1069 | posterior[base + (*mapping1)[row[ii].first] * (seq2Length + 1)] += row[ii].second; |
---|
1070 | |
---|
1071 | // subtract cutoff |
---|
1072 | for (int ii = 0; ii < matrix->GetSeq2Length(); ii++) |
---|
1073 | posterior[base + (*mapping1)[ii] * (seq2Length + 1)] -= cutoff; |
---|
1074 | } |
---|
1075 | |
---|
1076 | } |
---|
1077 | |
---|
1078 | |
---|
1079 | delete mapping2; |
---|
1080 | } |
---|
1081 | |
---|
1082 | delete mapping1; |
---|
1083 | } |
---|
1084 | |
---|
1085 | return posteriorPtr; |
---|
1086 | } |
---|
1087 | }; |
---|
1088 | |
---|
1089 | #endif |
---|