1 | ///////////////////////////////////////////////////////////////// |
---|
2 | // ProbabilisticModel.h |
---|
3 | // |
---|
4 | // Routines for (1) posterior probability computations |
---|
5 | // (2) chained anchoring |
---|
6 | // (3) maximum weight trace alignment |
---|
7 | ///////////////////////////////////////////////////////////////// |
---|
8 | |
---|
9 | #ifndef PROBABILISTICMODEL_H |
---|
10 | #define PROBABILISTICMODEL_H |
---|
11 | |
---|
12 | #include <list> |
---|
13 | #include <cmath> |
---|
14 | #include <cstdio> |
---|
15 | #include "SafeVector.h" |
---|
16 | #include "ScoreType.h" |
---|
17 | #include "SparseMatrix.h" |
---|
18 | #include "MultiSequence.h" |
---|
19 | #include "StemCandidate.hpp" |
---|
20 | #include "scarna.hpp" |
---|
21 | #include "nrutil.h" |
---|
22 | #include <vector> |
---|
23 | |
---|
24 | using namespace std; |
---|
25 | |
---|
26 | const int NumMatchStates = 1; // note that in this version the number |
---|
27 | // of match states is fixed at 1...will |
---|
28 | // change in future versions |
---|
29 | const int NumMatrixTypes = NumMatchStates + NumInsertStates * 2; |
---|
30 | |
---|
31 | ///////////////////////////////////////////////////////////////// |
---|
32 | // ProbabilisticModel |
---|
33 | // |
---|
34 | // Class for storing the parameters of a probabilistic model and |
---|
35 | // performing different computations based on those parameters. |
---|
36 | // In particular, this class handles the computation of |
---|
37 | // posterior probabilities that may be used in alignment. |
---|
38 | ///////////////////////////////////////////////////////////////// |
---|
39 | namespace MXSCARNA { |
---|
40 | class ProbabilisticModel { |
---|
41 | |
---|
42 | float initialDistribution[NumMatrixTypes]; // holds the initial probabilities for each state |
---|
43 | float transProb[NumMatrixTypes][NumMatrixTypes]; // holds all state-to-state transition probabilities |
---|
44 | float matchProb[256][256]; // emission probabilities for match states |
---|
45 | float insProb[256][NumMatrixTypes]; // emission probabilities for insert states |
---|
46 | NRMat<float> WM; |
---|
47 | |
---|
48 | public: |
---|
49 | |
---|
50 | ///////////////////////////////////////////////////////////////// |
---|
51 | // ProbabilisticModel::ProbabilisticModel() |
---|
52 | // |
---|
53 | // Constructor. Builds a new probabilistic model using the |
---|
54 | // given parameters. |
---|
55 | ///////////////////////////////////////////////////////////////// |
---|
56 | |
---|
57 | ProbabilisticModel (const VF &initDistribMat, const VF &gapOpen, const VF &gapExtend, |
---|
58 | const VVF &emitPairs, const VF &emitSingle){ |
---|
59 | |
---|
60 | // build transition matrix |
---|
61 | VVF transMat (NumMatrixTypes, VF (NumMatrixTypes, 0.0f)); |
---|
62 | transMat[0][0] = 1; |
---|
63 | for (int i = 0; i < NumInsertStates; i++){ |
---|
64 | transMat[0][2*i+1] = gapOpen[2*i]; |
---|
65 | transMat[0][2*i+2] = gapOpen[2*i+1]; |
---|
66 | transMat[0][0] -= (gapOpen[2*i] + gapOpen[2*i+1]); |
---|
67 | assert (transMat[0][0] > 0); |
---|
68 | transMat[2*i+1][2*i+1] = gapExtend[2*i]; |
---|
69 | transMat[2*i+2][2*i+2] = gapExtend[2*i+1]; |
---|
70 | transMat[2*i+1][2*i+2] = 0; |
---|
71 | transMat[2*i+2][2*i+1] = 0; |
---|
72 | transMat[2*i+1][0] = 1 - gapExtend[2*i]; |
---|
73 | transMat[2*i+2][0] = 1 - gapExtend[2*i+1]; |
---|
74 | } |
---|
75 | |
---|
76 | // create initial and transition probability matrices |
---|
77 | for (int i = 0; i < NumMatrixTypes; i++){ |
---|
78 | initialDistribution[i] = LOG (initDistribMat[i]); |
---|
79 | for (int j = 0; j < NumMatrixTypes; j++) |
---|
80 | transProb[i][j] = LOG (transMat[i][j]); |
---|
81 | } |
---|
82 | |
---|
83 | // create insertion and match probability matrices |
---|
84 | for (int i = 0; i < 256; i++){ |
---|
85 | for (int j = 0; j < NumMatrixTypes; j++) |
---|
86 | insProb[i][j] = LOG (emitSingle[i]); |
---|
87 | for (int j = 0; j < 256; j++) |
---|
88 | matchProb[i][j] = LOG (emitPairs[i][j]); |
---|
89 | } |
---|
90 | } |
---|
91 | |
---|
92 | NRMat<float> weightMatchScore(std::vector<StemCandidate> *pscs1, std::vector<StemCandidate> *pscs2, |
---|
93 | std::vector<int> *matchPSCS1, std::vector<int> *matchPSCS2, NRMat<float> WM) { |
---|
94 | int len = WORDLENGTH; |
---|
95 | int size = matchPSCS1->size(); |
---|
96 | float weight = 1000; |
---|
97 | |
---|
98 | for(int iter = 0; iter < size; iter++) { |
---|
99 | int i = matchPSCS1->at(iter); |
---|
100 | int j = matchPSCS2->at(iter); |
---|
101 | |
---|
102 | const StemCandidate &sc1 = pscs1->at(i); |
---|
103 | const StemCandidate &sc2 = pscs2->at(j); |
---|
104 | |
---|
105 | for(int k = 0; k < len; k++) { |
---|
106 | WM[sc1.GetPosition() + k][sc2.GetPosition() + k] += weight; |
---|
107 | // sumWeight += weight; |
---|
108 | } |
---|
109 | } |
---|
110 | return WM; |
---|
111 | } |
---|
112 | |
---|
113 | ///////////////////////////////////////////////////////////////// |
---|
114 | // ProbabilisticModel::ComputeForwardMatrix() |
---|
115 | // |
---|
116 | // Computes a set of forward probability matrices for aligning |
---|
117 | // seq1 and seq2. |
---|
118 | // |
---|
119 | // For efficiency reasons, a single-dimensional floating-point |
---|
120 | // array is used here, with the following indexing scheme: |
---|
121 | // |
---|
122 | // forward[i + NumMatrixTypes * (j * (seq2Length+1) + k)] |
---|
123 | // refers to the probability of aligning through j characters |
---|
124 | // of the first sequence, k characters of the second sequence, |
---|
125 | // and ending in state i. |
---|
126 | ///////////////////////////////////////////////////////////////// |
---|
127 | |
---|
128 | VF *ComputeForwardMatrix (Sequence *seq1, Sequence *seq2) const { |
---|
129 | |
---|
130 | assert (seq1); |
---|
131 | assert (seq2); |
---|
132 | |
---|
133 | const int seq1Length = seq1->GetLength(); |
---|
134 | const int seq2Length = seq2->GetLength(); |
---|
135 | |
---|
136 | // retrieve the points to the beginning of each sequence |
---|
137 | SafeVector<char>::iterator iter1 = seq1->GetDataPtr(); |
---|
138 | SafeVector<char>::iterator iter2 = seq2->GetDataPtr(); |
---|
139 | |
---|
140 | // create matrix |
---|
141 | VF *forwardPtr = new VF (NumMatrixTypes * (seq1Length+1) * (seq2Length+1), LOG_ZERO); |
---|
142 | assert (forwardPtr); |
---|
143 | VF &forward = *forwardPtr; |
---|
144 | |
---|
145 | // initialization condition |
---|
146 | forward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)] = |
---|
147 | initialDistribution[0] + matchProb[(unsigned char) iter1[1]][(unsigned char) iter2[1]]; |
---|
148 | |
---|
149 | for (int k = 0; k < NumInsertStates; k++){ |
---|
150 | forward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)] = |
---|
151 | initialDistribution[2*k+1] + insProb[(unsigned char) iter1[1]][k]; |
---|
152 | forward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)] = |
---|
153 | initialDistribution[2*k+2] + insProb[(unsigned char) iter2[1]][k]; |
---|
154 | } |
---|
155 | |
---|
156 | // remember offset for each index combination |
---|
157 | int ij = 0; |
---|
158 | int i1j = -seq2Length - 1; |
---|
159 | int ij1 = -1; |
---|
160 | int i1j1 = -seq2Length - 2; |
---|
161 | |
---|
162 | ij *= NumMatrixTypes; |
---|
163 | i1j *= NumMatrixTypes; |
---|
164 | ij1 *= NumMatrixTypes; |
---|
165 | i1j1 *= NumMatrixTypes; |
---|
166 | |
---|
167 | // compute forward scores |
---|
168 | for (int i = 0; i <= seq1Length; i++){ |
---|
169 | unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i]; |
---|
170 | for (int j = 0; j <= seq2Length; j++){ |
---|
171 | unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j]; |
---|
172 | |
---|
173 | if (i > 1 || j > 1){ |
---|
174 | if (i > 0 && j > 0){ |
---|
175 | forward[0 + ij] = forward[0 + i1j1] + transProb[0][0]; |
---|
176 | for (int k = 1; k < NumMatrixTypes; k++) |
---|
177 | LOG_PLUS_EQUALS (forward[0 + ij], forward[k + i1j1] + transProb[k][0]); |
---|
178 | forward[0 + ij] += matchProb[c1][c2]; |
---|
179 | } |
---|
180 | if (i > 0){ |
---|
181 | for (int k = 0; k < NumInsertStates; k++) |
---|
182 | forward[2*k+1 + ij] = insProb[c1][k] + |
---|
183 | LOG_ADD (forward[0 + i1j] + transProb[0][2*k+1], |
---|
184 | forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1]); |
---|
185 | } |
---|
186 | if (j > 0){ |
---|
187 | for (int k = 0; k < NumInsertStates; k++) |
---|
188 | forward[2*k+2 + ij] = insProb[c2][k] + |
---|
189 | LOG_ADD (forward[0 + ij1] + transProb[0][2*k+2], |
---|
190 | forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2]); |
---|
191 | } |
---|
192 | } |
---|
193 | |
---|
194 | ij += NumMatrixTypes; |
---|
195 | i1j += NumMatrixTypes; |
---|
196 | ij1 += NumMatrixTypes; |
---|
197 | i1j1 += NumMatrixTypes; |
---|
198 | } |
---|
199 | } |
---|
200 | |
---|
201 | return forwardPtr; |
---|
202 | } |
---|
203 | |
---|
204 | ///////////////////////////////////////////////////////////////// |
---|
205 | // ProbabilisticModel::ComputeBackwardMatrix() |
---|
206 | // |
---|
207 | // Computes a set of backward probability matrices for aligning |
---|
208 | // seq1 and seq2. |
---|
209 | // |
---|
210 | // For efficiency reasons, a single-dimensional floating-point |
---|
211 | // array is used here, with the following indexing scheme: |
---|
212 | // |
---|
213 | // backward[i + NumMatrixTypes * (j * (seq2Length+1) + k)] |
---|
214 | // refers to the probability of starting in state i and |
---|
215 | // aligning from character j+1 to the end of the first |
---|
216 | // sequence and from character k+1 to the end of the second |
---|
217 | // sequence. |
---|
218 | ///////////////////////////////////////////////////////////////// |
---|
219 | |
---|
220 | VF *ComputeBackwardMatrix (Sequence *seq1, Sequence *seq2) const { |
---|
221 | |
---|
222 | assert (seq1); |
---|
223 | assert (seq2); |
---|
224 | |
---|
225 | const int seq1Length = seq1->GetLength(); |
---|
226 | const int seq2Length = seq2->GetLength(); |
---|
227 | SafeVector<char>::iterator iter1 = seq1->GetDataPtr(); |
---|
228 | SafeVector<char>::iterator iter2 = seq2->GetDataPtr(); |
---|
229 | |
---|
230 | // create matrix |
---|
231 | VF *backwardPtr = new VF (NumMatrixTypes * (seq1Length+1) * (seq2Length+1), LOG_ZERO); |
---|
232 | assert (backwardPtr); |
---|
233 | VF &backward = *backwardPtr; |
---|
234 | |
---|
235 | // initialization condition |
---|
236 | for (int k = 0; k < NumMatrixTypes; k++) |
---|
237 | backward[NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1) + k] = initialDistribution[k]; |
---|
238 | |
---|
239 | // remember offset for each index combination |
---|
240 | int ij = (seq1Length+1) * (seq2Length+1) - 1; |
---|
241 | int i1j = ij + seq2Length + 1; |
---|
242 | int ij1 = ij + 1; |
---|
243 | int i1j1 = ij + seq2Length + 2; |
---|
244 | |
---|
245 | ij *= NumMatrixTypes; |
---|
246 | i1j *= NumMatrixTypes; |
---|
247 | ij1 *= NumMatrixTypes; |
---|
248 | i1j1 *= NumMatrixTypes; |
---|
249 | |
---|
250 | // compute backward scores |
---|
251 | for (int i = seq1Length; i >= 0; i--){ |
---|
252 | unsigned char c1 = (i == seq1Length) ? '~' : (unsigned char) iter1[i+1]; |
---|
253 | for (int j = seq2Length; j >= 0; j--){ |
---|
254 | unsigned char c2 = (j == seq2Length) ? '~' : (unsigned char) iter2[j+1]; |
---|
255 | |
---|
256 | if (i < seq1Length && j < seq2Length){ |
---|
257 | const float ProbXY = backward[0 + i1j1] + matchProb[c1][c2]; |
---|
258 | for (int k = 0; k < NumMatrixTypes; k++) |
---|
259 | LOG_PLUS_EQUALS (backward[k + ij], ProbXY + transProb[k][0]); |
---|
260 | } |
---|
261 | if (i < seq1Length){ |
---|
262 | for (int k = 0; k < NumInsertStates; k++){ |
---|
263 | LOG_PLUS_EQUALS (backward[0 + ij], backward[2*k+1 + i1j] + insProb[c1][k] + transProb[0][2*k+1]); |
---|
264 | LOG_PLUS_EQUALS (backward[2*k+1 + ij], backward[2*k+1 + i1j] + insProb[c1][k] + transProb[2*k+1][2*k+1]); |
---|
265 | } |
---|
266 | } |
---|
267 | if (j < seq2Length){ |
---|
268 | for (int k = 0; k < NumInsertStates; k++){ |
---|
269 | LOG_PLUS_EQUALS (backward[0 + ij], backward[2*k+2 + ij1] + insProb[c2][k] + transProb[0][2*k+2]); |
---|
270 | LOG_PLUS_EQUALS (backward[2*k+2 + ij], backward[2*k+2 + ij1] + insProb[c2][k] + transProb[2*k+2][2*k+2]); |
---|
271 | } |
---|
272 | } |
---|
273 | |
---|
274 | ij -= NumMatrixTypes; |
---|
275 | i1j -= NumMatrixTypes; |
---|
276 | ij1 -= NumMatrixTypes; |
---|
277 | i1j1 -= NumMatrixTypes; |
---|
278 | } |
---|
279 | } |
---|
280 | |
---|
281 | return backwardPtr; |
---|
282 | } |
---|
283 | |
---|
284 | ///////////////////////////////////////////////////////////////// |
---|
285 | // ProbabilisticModel::ComputeTotalProbability() |
---|
286 | // |
---|
287 | // Computes the total probability of an alignment given |
---|
288 | // the forward and backward matrices. |
---|
289 | ///////////////////////////////////////////////////////////////// |
---|
290 | |
---|
291 | float ComputeTotalProbability (int seq1Length, int seq2Length, |
---|
292 | const VF &forward, const VF &backward) const { |
---|
293 | |
---|
294 | // compute total probability |
---|
295 | float totalForwardProb = LOG_ZERO; |
---|
296 | float totalBackwardProb = LOG_ZERO; |
---|
297 | for (int k = 0; k < NumMatrixTypes; k++){ |
---|
298 | LOG_PLUS_EQUALS (totalForwardProb, |
---|
299 | forward[k + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] + |
---|
300 | backward[k + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]); |
---|
301 | } |
---|
302 | |
---|
303 | totalBackwardProb = |
---|
304 | forward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)] + |
---|
305 | backward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)]; |
---|
306 | |
---|
307 | for (int k = 0; k < NumInsertStates; k++){ |
---|
308 | LOG_PLUS_EQUALS (totalBackwardProb, |
---|
309 | forward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)] + |
---|
310 | backward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)]); |
---|
311 | LOG_PLUS_EQUALS (totalBackwardProb, |
---|
312 | forward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)] + |
---|
313 | backward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)]); |
---|
314 | } |
---|
315 | |
---|
316 | // cerr << totalForwardProb << " " << totalBackwardProb << endl; |
---|
317 | |
---|
318 | return (totalForwardProb + totalBackwardProb) / 2; |
---|
319 | } |
---|
320 | |
---|
321 | ///////////////////////////////////////////////////////////////// |
---|
322 | // ProbabilisticModel::ComputePosteriorMatrix() |
---|
323 | // |
---|
324 | // Computes the posterior probability matrix based on |
---|
325 | // the forward and backward matrices. |
---|
326 | ///////////////////////////////////////////////////////////////// |
---|
327 | |
---|
328 | VF *ComputePosteriorMatrix (Sequence *seq1, Sequence *seq2, |
---|
329 | const VF &forward, const VF &backward) const { |
---|
330 | |
---|
331 | assert (seq1); |
---|
332 | assert (seq2); |
---|
333 | |
---|
334 | const int seq1Length = seq1->GetLength(); |
---|
335 | const int seq2Length = seq2->GetLength(); |
---|
336 | |
---|
337 | float totalProb = ComputeTotalProbability (seq1Length, seq2Length, |
---|
338 | forward, backward); |
---|
339 | |
---|
340 | // compute posterior matrices |
---|
341 | VF *posteriorPtr = new VF((seq1Length+1) * (seq2Length+1)); assert (posteriorPtr); |
---|
342 | VF &posterior = *posteriorPtr; |
---|
343 | |
---|
344 | int ij = 0; |
---|
345 | VF::iterator ptr = posterior.begin(); |
---|
346 | |
---|
347 | for (int i = 0; i <= seq1Length; i++){ |
---|
348 | for (int j = 0; j <= seq2Length; j++){ |
---|
349 | *(ptr++) = EXP (min (LOG_ONE, forward[ij] + backward[ij] - totalProb)); |
---|
350 | ij += NumMatrixTypes; |
---|
351 | } |
---|
352 | } |
---|
353 | |
---|
354 | posterior[0] = 0; |
---|
355 | |
---|
356 | return posteriorPtr; |
---|
357 | } |
---|
358 | |
---|
359 | /* |
---|
360 | ///////////////////////////////////////////////////////////////// |
---|
361 | // ProbabilisticModel::ComputeExpectedCounts() |
---|
362 | // |
---|
363 | // Computes the expected counts for the various transitions. |
---|
364 | ///////////////////////////////////////////////////////////////// |
---|
365 | |
---|
366 | VVF *ComputeExpectedCounts () const { |
---|
367 | |
---|
368 | assert (seq1); |
---|
369 | assert (seq2); |
---|
370 | |
---|
371 | const int seq1Length = seq1->GetLength(); |
---|
372 | const int seq2Length = seq2->GetLength(); |
---|
373 | SafeVector<char>::iterator iter1 = seq1->GetDataPtr(); |
---|
374 | SafeVector<char>::iterator iter2 = seq2->GetDataPtr(); |
---|
375 | |
---|
376 | // compute total probability |
---|
377 | float totalProb = ComputeTotalProbability (seq1Length, seq2Length, |
---|
378 | forward, backward); |
---|
379 | |
---|
380 | // initialize expected counts |
---|
381 | VVF *countsPtr = new VVF(NumMatrixTypes + 1, VF(NumMatrixTypes, LOG_ZERO)); assert (countsPtr); |
---|
382 | VVF &counts = *countsPtr; |
---|
383 | |
---|
384 | // remember offset for each index combination |
---|
385 | int ij = 0; |
---|
386 | int i1j = -seq2Length - 1; |
---|
387 | int ij1 = -1; |
---|
388 | int i1j1 = -seq2Length - 2; |
---|
389 | |
---|
390 | ij *= NumMatrixTypes; |
---|
391 | i1j *= NumMatrixTypes; |
---|
392 | ij1 *= NumMatrixTypes; |
---|
393 | i1j1 *= NumMatrixTypes; |
---|
394 | |
---|
395 | // compute expected counts |
---|
396 | for (int i = 0; i <= seq1Length; i++){ |
---|
397 | unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i]; |
---|
398 | for (int j = 0; j <= seq2Length; j++){ |
---|
399 | unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j]; |
---|
400 | |
---|
401 | if (i > 0 && j > 0){ |
---|
402 | for (int k = 0; k < NumMatrixTypes; k++) |
---|
403 | LOG_PLUS_EQUALS (counts[k][0], |
---|
404 | forward[k + i1j1] + transProb[k][0] + |
---|
405 | matchProb[c1][c2] + backward[0 + ij]); |
---|
406 | } |
---|
407 | if (i > 0){ |
---|
408 | for (int k = 0; k < NumInsertStates; k++){ |
---|
409 | LOG_PLUS_EQUALS (counts[0][2*k+1], |
---|
410 | forward[0 + i1j] + transProb[0][2*k+1] + |
---|
411 | insProb[c1][k] + backward[2*k+1 + ij]); |
---|
412 | LOG_PLUS_EQUALS (counts[2*k+1][2*k+1], |
---|
413 | forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1] + |
---|
414 | insProb[c1][k] + backward[2*k+1 + ij]); |
---|
415 | } |
---|
416 | } |
---|
417 | if (j > 0){ |
---|
418 | for (int k = 0; k < NumInsertStates; k++){ |
---|
419 | LOG_PLUS_EQUALS (counts[0][2*k+2], |
---|
420 | forward[0 + ij1] + transProb[0][2*k+2] + |
---|
421 | insProb[c2][k] + backward[2*k+2 + ij]); |
---|
422 | LOG_PLUS_EQUALS (counts[2*k+2][2*k+2], |
---|
423 | forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2] + |
---|
424 | insProb[c2][k] + backward[2*k+2 + ij]); |
---|
425 | } |
---|
426 | } |
---|
427 | |
---|
428 | ij += NumMatrixTypes; |
---|
429 | i1j += NumMatrixTypes; |
---|
430 | ij1 += NumMatrixTypes; |
---|
431 | i1j1 += NumMatrixTypes; |
---|
432 | } |
---|
433 | } |
---|
434 | |
---|
435 | // scale all expected counts appropriately |
---|
436 | for (int i = 0; i < NumMatrixTypes; i++) |
---|
437 | for (int j = 0; j < NumMatrixTypes; j++) |
---|
438 | counts[i][j] -= totalProb; |
---|
439 | |
---|
440 | } |
---|
441 | */ |
---|
442 | |
---|
443 | ///////////////////////////////////////////////////////////////// |
---|
444 | // ProbabilisticModel::ComputeNewParameters() |
---|
445 | // |
---|
446 | // Computes a new parameter set based on the expected counts |
---|
447 | // given. |
---|
448 | ///////////////////////////////////////////////////////////////// |
---|
449 | |
---|
450 | void ComputeNewParameters (Sequence *seq1, Sequence *seq2, |
---|
451 | const VF &forward, const VF &backward, |
---|
452 | VF &initDistribMat, VF &gapOpen, |
---|
453 | VF &gapExtend, VVF &emitPairs, VF &emitSingle, bool enableTrainEmissions) const { |
---|
454 | |
---|
455 | assert (seq1); |
---|
456 | assert (seq2); |
---|
457 | |
---|
458 | const int seq1Length = seq1->GetLength(); |
---|
459 | const int seq2Length = seq2->GetLength(); |
---|
460 | SafeVector<char>::iterator iter1 = seq1->GetDataPtr(); |
---|
461 | SafeVector<char>::iterator iter2 = seq2->GetDataPtr(); |
---|
462 | |
---|
463 | // compute total probability |
---|
464 | float totalProb = ComputeTotalProbability (seq1Length, seq2Length, |
---|
465 | forward, backward); |
---|
466 | |
---|
467 | // initialize expected counts |
---|
468 | VVF transCounts (NumMatrixTypes, VF (NumMatrixTypes, LOG_ZERO)); |
---|
469 | VF initCounts (NumMatrixTypes, LOG_ZERO); |
---|
470 | VVF pairCounts (256, VF (256, LOG_ZERO)); |
---|
471 | VF singleCounts (256, LOG_ZERO); |
---|
472 | |
---|
473 | // remember offset for each index combination |
---|
474 | int ij = 0; |
---|
475 | int i1j = -seq2Length - 1; |
---|
476 | int ij1 = -1; |
---|
477 | int i1j1 = -seq2Length - 2; |
---|
478 | |
---|
479 | ij *= NumMatrixTypes; |
---|
480 | i1j *= NumMatrixTypes; |
---|
481 | ij1 *= NumMatrixTypes; |
---|
482 | i1j1 *= NumMatrixTypes; |
---|
483 | |
---|
484 | // compute initial distribution posteriors |
---|
485 | initCounts[0] = LOG_ADD (forward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)] + |
---|
486 | backward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)], |
---|
487 | forward[0 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] + |
---|
488 | backward[0 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]); |
---|
489 | for (int k = 0; k < NumInsertStates; k++){ |
---|
490 | initCounts[2*k+1] = LOG_ADD (forward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)] + |
---|
491 | backward[2*k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)], |
---|
492 | forward[2*k+1 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] + |
---|
493 | backward[2*k+1 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]); |
---|
494 | initCounts[2*k+2] = LOG_ADD (forward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)] + |
---|
495 | backward[2*k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)], |
---|
496 | forward[2*k+2 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] + |
---|
497 | backward[2*k+2 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]); |
---|
498 | } |
---|
499 | |
---|
500 | // compute expected counts |
---|
501 | for (int i = 0; i <= seq1Length; i++){ |
---|
502 | unsigned char c1 = (i == 0) ? '~' : (unsigned char) toupper(iter1[i]); |
---|
503 | for (int j = 0; j <= seq2Length; j++){ |
---|
504 | unsigned char c2 = (j == 0) ? '~' : (unsigned char) toupper(iter2[j]); |
---|
505 | |
---|
506 | if (i > 0 && j > 0){ |
---|
507 | if (enableTrainEmissions && i == 1 && j == 1){ |
---|
508 | LOG_PLUS_EQUALS (pairCounts[c1][c2], |
---|
509 | initialDistribution[0] + matchProb[c1][c2] + backward[0 + ij]); |
---|
510 | LOG_PLUS_EQUALS (pairCounts[c2][c1], |
---|
511 | initialDistribution[0] + matchProb[c2][c1] + backward[0 + ij]); |
---|
512 | } |
---|
513 | |
---|
514 | for (int k = 0; k < NumMatrixTypes; k++){ |
---|
515 | LOG_PLUS_EQUALS (transCounts[k][0], |
---|
516 | forward[k + i1j1] + transProb[k][0] + |
---|
517 | matchProb[c1][c2] + backward[0 + ij]); |
---|
518 | if (enableTrainEmissions && i != 1 || j != 1){ |
---|
519 | LOG_PLUS_EQUALS (pairCounts[c1][c2], |
---|
520 | forward[k + i1j1] + transProb[k][0] + |
---|
521 | matchProb[c1][c2] + backward[0 + ij]); |
---|
522 | LOG_PLUS_EQUALS (pairCounts[c2][c1], |
---|
523 | forward[k + i1j1] + transProb[k][0] + |
---|
524 | matchProb[c2][c1] + backward[0 + ij]); |
---|
525 | } |
---|
526 | } |
---|
527 | } |
---|
528 | if (i > 0){ |
---|
529 | for (int k = 0; k < NumInsertStates; k++){ |
---|
530 | LOG_PLUS_EQUALS (transCounts[0][2*k+1], |
---|
531 | forward[0 + i1j] + transProb[0][2*k+1] + |
---|
532 | insProb[c1][k] + backward[2*k+1 + ij]); |
---|
533 | LOG_PLUS_EQUALS (transCounts[2*k+1][2*k+1], |
---|
534 | forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1] + |
---|
535 | insProb[c1][k] + backward[2*k+1 + ij]); |
---|
536 | if (enableTrainEmissions){ |
---|
537 | if (i == 1 && j == 0){ |
---|
538 | LOG_PLUS_EQUALS (singleCounts[c1], |
---|
539 | initialDistribution[2*k+1] + insProb[c1][k] + backward[2*k+1 + ij]); |
---|
540 | } |
---|
541 | else { |
---|
542 | LOG_PLUS_EQUALS (singleCounts[c1], |
---|
543 | forward[0 + i1j] + transProb[0][2*k+1] + |
---|
544 | insProb[c1][k] + backward[2*k+1 + ij]); |
---|
545 | LOG_PLUS_EQUALS (singleCounts[c1], |
---|
546 | forward[2*k+1 + i1j] + transProb[2*k+1][2*k+1] + |
---|
547 | insProb[c1][k] + backward[2*k+1 + ij]); |
---|
548 | } |
---|
549 | } |
---|
550 | } |
---|
551 | } |
---|
552 | if (j > 0){ |
---|
553 | for (int k = 0; k < NumInsertStates; k++){ |
---|
554 | LOG_PLUS_EQUALS (transCounts[0][2*k+2], |
---|
555 | forward[0 + ij1] + transProb[0][2*k+2] + |
---|
556 | insProb[c2][k] + backward[2*k+2 + ij]); |
---|
557 | LOG_PLUS_EQUALS (transCounts[2*k+2][2*k+2], |
---|
558 | forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2] + |
---|
559 | insProb[c2][k] + backward[2*k+2 + ij]); |
---|
560 | if (enableTrainEmissions){ |
---|
561 | if (i == 0 && j == 1){ |
---|
562 | LOG_PLUS_EQUALS (singleCounts[c2], |
---|
563 | initialDistribution[2*k+2] + insProb[c2][k] + backward[2*k+2 + ij]); |
---|
564 | } |
---|
565 | else { |
---|
566 | LOG_PLUS_EQUALS (singleCounts[c2], |
---|
567 | forward[0 + ij1] + transProb[0][2*k+2] + |
---|
568 | insProb[c2][k] + backward[2*k+2 + ij]); |
---|
569 | LOG_PLUS_EQUALS (singleCounts[c2], |
---|
570 | forward[2*k+2 + ij1] + transProb[2*k+2][2*k+2] + |
---|
571 | insProb[c2][k] + backward[2*k+2 + ij]); |
---|
572 | } |
---|
573 | } |
---|
574 | } |
---|
575 | } |
---|
576 | |
---|
577 | ij += NumMatrixTypes; |
---|
578 | i1j += NumMatrixTypes; |
---|
579 | ij1 += NumMatrixTypes; |
---|
580 | i1j1 += NumMatrixTypes; |
---|
581 | } |
---|
582 | } |
---|
583 | |
---|
584 | // scale all expected counts appropriately |
---|
585 | for (int i = 0; i < NumMatrixTypes; i++){ |
---|
586 | initCounts[i] -= totalProb; |
---|
587 | for (int j = 0; j < NumMatrixTypes; j++) |
---|
588 | transCounts[i][j] -= totalProb; |
---|
589 | } |
---|
590 | if (enableTrainEmissions){ |
---|
591 | for (int i = 0; i < 256; i++){ |
---|
592 | for (int j = 0; j < 256; j++) |
---|
593 | pairCounts[i][j] -= totalProb; |
---|
594 | singleCounts[i] -= totalProb; |
---|
595 | } |
---|
596 | } |
---|
597 | |
---|
598 | // compute new initial distribution |
---|
599 | float totalInitDistribCounts = 0; |
---|
600 | for (int i = 0; i < NumMatrixTypes; i++) |
---|
601 | totalInitDistribCounts += exp (initCounts[i]); // should be 2 |
---|
602 | initDistribMat[0] = min (1.0f, max (0.0f, (float) exp (initCounts[0]) / totalInitDistribCounts)); |
---|
603 | for (int k = 0; k < NumInsertStates; k++){ |
---|
604 | float val = (exp (initCounts[2*k+1]) + exp (initCounts[2*k+2])) / 2; |
---|
605 | initDistribMat[2*k+1] = initDistribMat[2*k+2] = min (1.0f, max (0.0f, val / totalInitDistribCounts)); |
---|
606 | } |
---|
607 | |
---|
608 | // compute total counts for match state |
---|
609 | float inMatchStateCounts = 0; |
---|
610 | for (int i = 0; i < NumMatrixTypes; i++) |
---|
611 | inMatchStateCounts += exp (transCounts[0][i]); |
---|
612 | for (int i = 0; i < NumInsertStates; i++){ |
---|
613 | |
---|
614 | // compute total counts for gap state |
---|
615 | float inGapStateCounts = |
---|
616 | exp (transCounts[2*i+1][0]) + |
---|
617 | exp (transCounts[2*i+1][2*i+1]) + |
---|
618 | exp (transCounts[2*i+2][0]) + |
---|
619 | exp (transCounts[2*i+2][2*i+2]); |
---|
620 | |
---|
621 | gapOpen[2*i] = gapOpen[2*i+1] = |
---|
622 | (exp (transCounts[0][2*i+1]) + |
---|
623 | exp (transCounts[0][2*i+2])) / |
---|
624 | (2 * inMatchStateCounts); |
---|
625 | |
---|
626 | gapExtend[2*i] = gapExtend[2*i+1] = |
---|
627 | (exp (transCounts[2*i+1][2*i+1]) + |
---|
628 | exp (transCounts[2*i+2][2*i+2])) / |
---|
629 | inGapStateCounts; |
---|
630 | } |
---|
631 | |
---|
632 | if (enableTrainEmissions){ |
---|
633 | float totalPairCounts = 0; |
---|
634 | float totalSingleCounts = 0; |
---|
635 | for (int i = 0; i < 256; i++){ |
---|
636 | for (int j = 0; j <= i; j++) |
---|
637 | totalPairCounts += exp (pairCounts[j][i]); |
---|
638 | totalSingleCounts += exp (singleCounts[i]); |
---|
639 | } |
---|
640 | |
---|
641 | for (int i = 0; i < 256; i++) if (!islower ((char) i)){ |
---|
642 | int li = (int)((unsigned char) tolower ((char) i)); |
---|
643 | for (int j = 0; j <= i; j++) if (!islower ((char) j)){ |
---|
644 | int lj = (int)((unsigned char) tolower ((char) j)); |
---|
645 | emitPairs[i][j] = emitPairs[i][lj] = emitPairs[li][j] = emitPairs[li][lj] = |
---|
646 | emitPairs[j][i] = emitPairs[j][li] = emitPairs[lj][i] = emitPairs[lj][li] = exp(pairCounts[j][i]) / totalPairCounts; |
---|
647 | } |
---|
648 | emitSingle[i] = emitSingle[li] = exp(singleCounts[i]) / totalSingleCounts; |
---|
649 | } |
---|
650 | } |
---|
651 | } |
---|
652 | |
---|
653 | ///////////////////////////////////////////////////////////////// |
---|
654 | // ProbabilisticModel::ComputeAlignment() |
---|
655 | // |
---|
656 | // Computes an alignment based on the given posterior matrix. |
---|
657 | // This is done by finding the maximum summing path (or |
---|
658 | // maximum weight trace) through the posterior matrix. The |
---|
659 | // final alignment is returned as a pair consisting of: |
---|
660 | // (1) a string (e.g., XXXBBXXXBBBBBBYYYYBBB) where X's and |
---|
661 | // denote insertions in one of the two sequences and |
---|
662 | // B's denote that both sequences are present (i.e. |
---|
663 | // matches). |
---|
664 | // (2) a float indicating the sum achieved |
---|
665 | ///////////////////////////////////////////////////////////////// |
---|
666 | |
---|
667 | pair<SafeVector<char> *, float> ComputeAlignment (int seq1Length, int seq2Length, const VF &posterior) const { |
---|
668 | |
---|
669 | float *twoRows = new float[(seq2Length+1)*2]; assert (twoRows); |
---|
670 | float *oldRow = twoRows; |
---|
671 | float *newRow = twoRows + seq2Length + 1; |
---|
672 | |
---|
673 | char *tracebackMatrix = new char[(seq1Length+1)*(seq2Length+1)]; assert (tracebackMatrix); |
---|
674 | char *tracebackPtr = tracebackMatrix; |
---|
675 | |
---|
676 | VF::const_iterator posteriorPtr = posterior.begin() + seq2Length + 1; |
---|
677 | |
---|
678 | // initialization |
---|
679 | for (int i = 0; i <= seq2Length; i++){ |
---|
680 | oldRow[i] = 0; |
---|
681 | *(tracebackPtr++) = 'L'; |
---|
682 | } |
---|
683 | |
---|
684 | // fill in matrix |
---|
685 | for (int i = 1; i <= seq1Length; i++){ |
---|
686 | |
---|
687 | // initialize left column |
---|
688 | newRow[0] = 0; |
---|
689 | posteriorPtr++; |
---|
690 | *(tracebackPtr++) = 'U'; |
---|
691 | |
---|
692 | // fill in rest of row |
---|
693 | for (int j = 1; j <= seq2Length; j++){ |
---|
694 | ChooseBestOfThree (*(posteriorPtr++) + oldRow[j-1], newRow[j-1], oldRow[j], |
---|
695 | 'D', 'L', 'U', &newRow[j], tracebackPtr++); // Match, insert, delete |
---|
696 | } |
---|
697 | |
---|
698 | // swap rows |
---|
699 | float *temp = oldRow; |
---|
700 | oldRow = newRow; |
---|
701 | newRow = temp; |
---|
702 | } |
---|
703 | |
---|
704 | // store best score |
---|
705 | float total = oldRow[seq2Length]; |
---|
706 | delete [] twoRows; |
---|
707 | |
---|
708 | // compute traceback |
---|
709 | SafeVector<char> *alignment = new SafeVector<char>; assert (alignment); |
---|
710 | int r = seq1Length, c = seq2Length; |
---|
711 | while (r != 0 || c != 0){ |
---|
712 | char ch = tracebackMatrix[r*(seq2Length+1) + c]; |
---|
713 | switch (ch){ |
---|
714 | case 'L': c--; alignment->push_back ('Y'); break; |
---|
715 | case 'U': r--; alignment->push_back ('X'); break; |
---|
716 | case 'D': c--; r--; alignment->push_back ('B'); break; |
---|
717 | default: assert (false); |
---|
718 | } |
---|
719 | } |
---|
720 | |
---|
721 | delete [] tracebackMatrix; |
---|
722 | |
---|
723 | reverse (alignment->begin(), alignment->end()); |
---|
724 | |
---|
725 | return make_pair(alignment, total); |
---|
726 | } |
---|
727 | |
---|
728 | ///////////////////////////////////////////////////////////////// |
---|
729 | // ProbabilisticModel::ComputeAlignment2() |
---|
730 | // |
---|
731 | // Computes an alignment based on the given posterior matrix. |
---|
732 | // This is done by finding the maximum summing path (or |
---|
733 | // maximum weight trace) through the posterior matrix. The |
---|
734 | // final alignment is returned as a pair consisting of: |
---|
735 | // (1) a string (e.g., XXXBBXXXBBBBBBYYYYBBB) where X's and |
---|
736 | // denote insertions in one of the two sequences and |
---|
737 | // B's denote that both sequences are present (i.e. |
---|
738 | // matches). |
---|
739 | // (2) a float indicating the sum achieved |
---|
740 | ///////////////////////////////////////////////////////////////// |
---|
741 | |
---|
742 | pair<SafeVector<char> *, float> ComputeAlignment2 (int seq1Length, int seq2Length, |
---|
743 | const VF &posterior, std::vector<StemCandidate> *pscs1, std::vector<StemCandidate> *pscs2, |
---|
744 | std::vector<int> *matchPSCS1, std::vector<int> *matchPSCS2) const { |
---|
745 | NRMat<float> WM(seq1Length + 1, seq2Length + 1); |
---|
746 | for (int i = 0; i <= seq1Length; i++) { |
---|
747 | for (int j = 0; j <= seq2Length; j++) { |
---|
748 | WM[i][j] = 0; |
---|
749 | } |
---|
750 | } |
---|
751 | |
---|
752 | int len = WORDLENGTH; |
---|
753 | int size = matchPSCS1->size(); |
---|
754 | float weight = 1000; |
---|
755 | |
---|
756 | for(int iter = 0; iter < size; iter++) { |
---|
757 | int i = matchPSCS1->at(iter); |
---|
758 | int j = matchPSCS2->at(iter); |
---|
759 | |
---|
760 | const StemCandidate &sc1 = pscs1->at(i); |
---|
761 | const StemCandidate &sc2 = pscs2->at(j); |
---|
762 | for(int k = 0; k < len; k++) { |
---|
763 | WM[sc1.GetPosition() + k][sc2.GetPosition() + k] += weight; |
---|
764 | } |
---|
765 | } |
---|
766 | float *twoRows = new float[(seq2Length+1)*2]; assert (twoRows); |
---|
767 | float *oldRow = twoRows; |
---|
768 | float *newRow = twoRows + seq2Length + 1; |
---|
769 | |
---|
770 | char *tracebackMatrix = new char[(seq1Length+1)*(seq2Length+1)]; assert (tracebackMatrix); |
---|
771 | char *tracebackPtr = tracebackMatrix; |
---|
772 | |
---|
773 | VF::const_iterator posteriorPtr = posterior.begin() + seq2Length + 1; |
---|
774 | |
---|
775 | // initialization |
---|
776 | for (int i = 0; i <= seq2Length; i++){ |
---|
777 | oldRow[i] = 0; |
---|
778 | *(tracebackPtr++) = 'L'; |
---|
779 | } |
---|
780 | |
---|
781 | // fill in matrix |
---|
782 | for (int i = 1; i <= seq1Length; i++){ |
---|
783 | |
---|
784 | // initialize left column |
---|
785 | newRow[0] = 0; |
---|
786 | posteriorPtr++; |
---|
787 | *(tracebackPtr++) = 'U'; |
---|
788 | |
---|
789 | // fill in rest of row |
---|
790 | for (int j = 1; j <= seq2Length; j++){ |
---|
791 | ChooseBestOfThree (*(posteriorPtr++) + oldRow[j-1] + WM[i][j], newRow[j-1], oldRow[j], |
---|
792 | 'D', 'L', 'U', &newRow[j], tracebackPtr++); |
---|
793 | } |
---|
794 | |
---|
795 | // swap rows |
---|
796 | float *temp = oldRow; |
---|
797 | oldRow = newRow; |
---|
798 | newRow = temp; |
---|
799 | } |
---|
800 | |
---|
801 | // store best score |
---|
802 | float total = oldRow[seq2Length]; |
---|
803 | delete [] twoRows; |
---|
804 | |
---|
805 | // compute traceback |
---|
806 | SafeVector<char> *alignment = new SafeVector<char>; assert (alignment); |
---|
807 | int r = seq1Length, c = seq2Length; |
---|
808 | while (r != 0 || c != 0){ |
---|
809 | char ch = tracebackMatrix[r*(seq2Length+1) + c]; |
---|
810 | switch (ch){ |
---|
811 | case 'L': c--; alignment->push_back ('Y'); break; |
---|
812 | case 'U': r--; alignment->push_back ('X'); break; |
---|
813 | case 'D': c--; r--; alignment->push_back ('B'); break; |
---|
814 | default: assert (false); |
---|
815 | } |
---|
816 | } |
---|
817 | |
---|
818 | delete [] tracebackMatrix; |
---|
819 | |
---|
820 | reverse (alignment->begin(), alignment->end()); |
---|
821 | |
---|
822 | return make_pair(alignment, total); |
---|
823 | } |
---|
824 | |
---|
825 | ///////////////////////////////////////////////////////////////// |
---|
826 | // ProbabilisticModel::ComputeAlignmentWithGapPenalties() |
---|
827 | // |
---|
828 | // Similar to ComputeAlignment() except with gap penalties. |
---|
829 | ///////////////////////////////////////////////////////////////// |
---|
830 | |
---|
831 | pair<SafeVector<char> *, float> ComputeAlignmentWithGapPenalties (MultiSequence *align1, |
---|
832 | MultiSequence *align2, |
---|
833 | const VF &posterior, int numSeqs1, |
---|
834 | int numSeqs2, |
---|
835 | float gapOpenPenalty, |
---|
836 | float gapContinuePenalty) const { |
---|
837 | int seq1Length = align1->GetSequence(0)->GetLength(); |
---|
838 | int seq2Length = align2->GetSequence(0)->GetLength(); |
---|
839 | SafeVector<SafeVector<char>::iterator > dataPtrs1 (align1->GetNumSequences()); |
---|
840 | SafeVector<SafeVector<char>::iterator > dataPtrs2 (align2->GetNumSequences()); |
---|
841 | |
---|
842 | // grab character data |
---|
843 | for (int i = 0; i < align1->GetNumSequences(); i++) |
---|
844 | dataPtrs1[i] = align1->GetSequence(i)->GetDataPtr(); |
---|
845 | for (int i = 0; i < align2->GetNumSequences(); i++) |
---|
846 | dataPtrs2[i] = align2->GetSequence(i)->GetDataPtr(); |
---|
847 | |
---|
848 | // the number of active sequences at any given column is defined to be the |
---|
849 | // number of non-gap characters in that column; the number of gap opens at |
---|
850 | // any given column is defined to be the number of gap characters in that |
---|
851 | // column where the previous character in the respective sequence was not |
---|
852 | // a gap |
---|
853 | SafeVector<int> numActive1 (seq1Length+1), numGapOpens1 (seq1Length+1); |
---|
854 | SafeVector<int> numActive2 (seq2Length+1), numGapOpens2 (seq2Length+1); |
---|
855 | |
---|
856 | // compute number of active sequences and gap opens for each group |
---|
857 | for (int i = 0; i < align1->GetNumSequences(); i++){ |
---|
858 | SafeVector<char>::iterator dataPtr = align1->GetSequence(i)->GetDataPtr(); |
---|
859 | numActive1[0] = numGapOpens1[0] = 0; |
---|
860 | for (int j = 1; j <= seq1Length; j++){ |
---|
861 | if (dataPtr[j] != '-'){ |
---|
862 | numActive1[j]++; |
---|
863 | numGapOpens1[j] += (j != 1 && dataPtr[j-1] != '-'); |
---|
864 | } |
---|
865 | } |
---|
866 | } |
---|
867 | for (int i = 0; i < align2->GetNumSequences(); i++){ |
---|
868 | SafeVector<char>::iterator dataPtr = align2->GetSequence(i)->GetDataPtr(); |
---|
869 | numActive2[0] = numGapOpens2[0] = 0; |
---|
870 | for (int j = 1; j <= seq2Length; j++){ |
---|
871 | if (dataPtr[j] != '-'){ |
---|
872 | numActive2[j]++; |
---|
873 | numGapOpens2[j] += (j != 1 && dataPtr[j-1] != '-'); |
---|
874 | } |
---|
875 | } |
---|
876 | } |
---|
877 | |
---|
878 | VVF openingPenalty1 (numSeqs1+1, VF (numSeqs2+1)); |
---|
879 | VF continuingPenalty1 (numSeqs1+1); |
---|
880 | VVF openingPenalty2 (numSeqs1+1, VF (numSeqs2+1)); |
---|
881 | VF continuingPenalty2 (numSeqs2+1); |
---|
882 | |
---|
883 | // precompute penalties |
---|
884 | for (int i = 0; i <= numSeqs1; i++) |
---|
885 | for (int j = 0; j <= numSeqs2; j++) |
---|
886 | openingPenalty1[i][j] = i * (gapOpenPenalty * j + gapContinuePenalty * (numSeqs2 - j)); |
---|
887 | for (int i = 0; i <= numSeqs1; i++) |
---|
888 | continuingPenalty1[i] = i * gapContinuePenalty * numSeqs2; |
---|
889 | for (int i = 0; i <= numSeqs2; i++) |
---|
890 | for (int j = 0; j <= numSeqs1; j++) |
---|
891 | openingPenalty2[i][j] = i * (gapOpenPenalty * j + gapContinuePenalty * (numSeqs1 - j)); |
---|
892 | for (int i = 0; i <= numSeqs2; i++) |
---|
893 | continuingPenalty2[i] = i * gapContinuePenalty * numSeqs1; |
---|
894 | |
---|
895 | float *twoRows = new float[6*(seq2Length+1)]; assert (twoRows); |
---|
896 | float *oldRowMatch = twoRows; |
---|
897 | float *newRowMatch = twoRows + (seq2Length+1); |
---|
898 | float *oldRowInsertX = twoRows + 2*(seq2Length+1); |
---|
899 | float *newRowInsertX = twoRows + 3*(seq2Length+1); |
---|
900 | float *oldRowInsertY = twoRows + 4*(seq2Length+1); |
---|
901 | float *newRowInsertY = twoRows + 5*(seq2Length+1); |
---|
902 | |
---|
903 | char *tracebackMatrix = new char[3*(seq1Length+1)*(seq2Length+1)]; assert (tracebackMatrix); |
---|
904 | char *tracebackPtr = tracebackMatrix; |
---|
905 | |
---|
906 | VF::const_iterator posteriorPtr = posterior.begin() + seq2Length + 1; |
---|
907 | |
---|
908 | // initialization |
---|
909 | for (int i = 0; i <= seq2Length; i++){ |
---|
910 | oldRowMatch[i] = oldRowInsertX[i] = (i == 0) ? 0 : LOG_ZERO; |
---|
911 | oldRowInsertY[i] = (i == 0) ? 0 : oldRowInsertY[i-1] + continuingPenalty2[numActive2[i]]; |
---|
912 | *(tracebackPtr) = *(tracebackPtr+1) = *(tracebackPtr+2) = 'Y'; |
---|
913 | tracebackPtr += 3; |
---|
914 | } |
---|
915 | |
---|
916 | // fill in matrix |
---|
917 | for (int i = 1; i <= seq1Length; i++){ |
---|
918 | |
---|
919 | // initialize left column |
---|
920 | newRowMatch[0] = newRowInsertY[0] = LOG_ZERO; |
---|
921 | newRowInsertX[0] = oldRowInsertX[0] + continuingPenalty1[numActive1[i]]; |
---|
922 | posteriorPtr++; |
---|
923 | *(tracebackPtr) = *(tracebackPtr+1) = *(tracebackPtr+2) = 'X'; |
---|
924 | tracebackPtr += 3; |
---|
925 | |
---|
926 | // fill in rest of row |
---|
927 | for (int j = 1; j <= seq2Length; j++){ |
---|
928 | |
---|
929 | // going to MATCH state |
---|
930 | ChooseBestOfThree (oldRowMatch[j-1], |
---|
931 | oldRowInsertX[j-1], |
---|
932 | oldRowInsertY[j-1], |
---|
933 | 'M', 'X', 'Y', &newRowMatch[j], tracebackPtr++); |
---|
934 | newRowMatch[j] += *(posteriorPtr++); |
---|
935 | |
---|
936 | // going to INSERT X state |
---|
937 | ChooseBestOfThree (oldRowMatch[j] + openingPenalty1[numActive1[i]][numGapOpens2[j]], |
---|
938 | oldRowInsertX[j] + continuingPenalty1[numActive1[i]], |
---|
939 | oldRowInsertY[j] + openingPenalty1[numActive1[i]][numGapOpens2[j]], |
---|
940 | 'M', 'X', 'Y', &newRowInsertX[j], tracebackPtr++); |
---|
941 | |
---|
942 | // going to INSERT Y state |
---|
943 | ChooseBestOfThree (newRowMatch[j-1] + openingPenalty2[numActive2[j]][numGapOpens1[i]], |
---|
944 | newRowInsertX[j-1] + openingPenalty2[numActive2[j]][numGapOpens1[i]], |
---|
945 | newRowInsertY[j-1] + continuingPenalty2[numActive2[j]], |
---|
946 | 'M', 'X', 'Y', &newRowInsertY[j], tracebackPtr++); |
---|
947 | } |
---|
948 | |
---|
949 | // swap rows |
---|
950 | float *temp; |
---|
951 | temp = oldRowMatch; oldRowMatch = newRowMatch; newRowMatch = temp; |
---|
952 | temp = oldRowInsertX; oldRowInsertX = newRowInsertX; newRowInsertX = temp; |
---|
953 | temp = oldRowInsertY; oldRowInsertY = newRowInsertY; newRowInsertY = temp; |
---|
954 | } |
---|
955 | |
---|
956 | // store best score |
---|
957 | float total; |
---|
958 | char matrix; |
---|
959 | ChooseBestOfThree (oldRowMatch[seq2Length], oldRowInsertX[seq2Length], oldRowInsertY[seq2Length], |
---|
960 | 'M', 'X', 'Y', &total, &matrix); |
---|
961 | |
---|
962 | delete [] twoRows; |
---|
963 | |
---|
964 | // compute traceback |
---|
965 | SafeVector<char> *alignment = new SafeVector<char>; assert (alignment); |
---|
966 | int r = seq1Length, c = seq2Length; |
---|
967 | while (r != 0 || c != 0){ |
---|
968 | |
---|
969 | int offset = (matrix == 'M') ? 0 : (matrix == 'X') ? 1 : 2; |
---|
970 | char ch = tracebackMatrix[(r*(seq2Length+1) + c) * 3 + offset]; |
---|
971 | switch (matrix){ |
---|
972 | case 'Y': c--; alignment->push_back ('Y'); break; |
---|
973 | case 'X': r--; alignment->push_back ('X'); break; |
---|
974 | case 'M': c--; r--; alignment->push_back ('B'); break; |
---|
975 | default: assert (false); |
---|
976 | } |
---|
977 | matrix = ch; |
---|
978 | } |
---|
979 | |
---|
980 | delete [] tracebackMatrix; |
---|
981 | |
---|
982 | reverse (alignment->begin(), alignment->end()); |
---|
983 | |
---|
984 | return make_pair(alignment, 1.0f); |
---|
985 | } |
---|
986 | |
---|
987 | |
---|
988 | ///////////////////////////////////////////////////////////////// |
---|
989 | // ProbabilisticModel::ComputeViterbiAlignment() |
---|
990 | // |
---|
991 | // Computes the highest probability pairwise alignment using the |
---|
992 | // probabilistic model. The final alignment is returned as a |
---|
993 | // pair consisting of: |
---|
994 | // (1) a string (e.g., XXXBBXXXBBBBBBYYYYBBB) where X's and |
---|
995 | // denote insertions in one of the two sequences and |
---|
996 | // B's denote that both sequences are present (i.e. |
---|
997 | // matches). |
---|
998 | // (2) a float containing the log probability of the best |
---|
999 | // alignment (not used) |
---|
1000 | ///////////////////////////////////////////////////////////////// |
---|
1001 | |
---|
1002 | pair<SafeVector<char> *, float> ComputeViterbiAlignment (Sequence *seq1, Sequence *seq2) const { |
---|
1003 | |
---|
1004 | assert (seq1); |
---|
1005 | assert (seq2); |
---|
1006 | |
---|
1007 | const int seq1Length = seq1->GetLength(); |
---|
1008 | const int seq2Length = seq2->GetLength(); |
---|
1009 | |
---|
1010 | // retrieve the points to the beginning of each sequence |
---|
1011 | SafeVector<char>::iterator iter1 = seq1->GetDataPtr(); |
---|
1012 | SafeVector<char>::iterator iter2 = seq2->GetDataPtr(); |
---|
1013 | |
---|
1014 | // create viterbi matrix |
---|
1015 | VF *viterbiPtr = new VF (NumMatrixTypes * (seq1Length+1) * (seq2Length+1), LOG_ZERO); |
---|
1016 | assert (viterbiPtr); |
---|
1017 | VF &viterbi = *viterbiPtr; |
---|
1018 | |
---|
1019 | // create traceback matrix |
---|
1020 | VI *tracebackPtr = new VI (NumMatrixTypes * (seq1Length+1) * (seq2Length+1), -1); |
---|
1021 | assert (tracebackPtr); |
---|
1022 | VI &traceback = *tracebackPtr; |
---|
1023 | |
---|
1024 | // initialization condition |
---|
1025 | for (int k = 0; k < NumMatrixTypes; k++) |
---|
1026 | viterbi[k] = initialDistribution[k]; |
---|
1027 | |
---|
1028 | // remember offset for each index combination |
---|
1029 | int ij = 0; |
---|
1030 | int i1j = -seq2Length - 1; |
---|
1031 | int ij1 = -1; |
---|
1032 | int i1j1 = -seq2Length - 2; |
---|
1033 | |
---|
1034 | ij *= NumMatrixTypes; |
---|
1035 | i1j *= NumMatrixTypes; |
---|
1036 | ij1 *= NumMatrixTypes; |
---|
1037 | i1j1 *= NumMatrixTypes; |
---|
1038 | |
---|
1039 | // compute viterbi scores |
---|
1040 | for (int i = 0; i <= seq1Length; i++){ |
---|
1041 | unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i]; |
---|
1042 | for (int j = 0; j <= seq2Length; j++){ |
---|
1043 | unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j]; |
---|
1044 | |
---|
1045 | if (i > 0 && j > 0){ |
---|
1046 | for (int k = 0; k < NumMatrixTypes; k++){ |
---|
1047 | float newVal = viterbi[k + i1j1] + transProb[k][0] + matchProb[c1][c2]; |
---|
1048 | if (viterbi[0 + ij] < newVal){ |
---|
1049 | viterbi[0 + ij] = newVal; |
---|
1050 | traceback[0 + ij] = k; |
---|
1051 | } |
---|
1052 | } |
---|
1053 | } |
---|
1054 | if (i > 0){ |
---|
1055 | for (int k = 0; k < NumInsertStates; k++){ |
---|
1056 | float valFromMatch = insProb[c1][k] + viterbi[0 + i1j] + transProb[0][2*k+1]; |
---|
1057 | float valFromIns = insProb[c1][k] + viterbi[2*k+1 + i1j] + transProb[2*k+1][2*k+1]; |
---|
1058 | if (valFromMatch >= valFromIns){ |
---|
1059 | viterbi[2*k+1 + ij] = valFromMatch; |
---|
1060 | traceback[2*k+1 + ij] = 0; |
---|
1061 | } |
---|
1062 | else { |
---|
1063 | viterbi[2*k+1 + ij] = valFromIns; |
---|
1064 | traceback[2*k+1 + ij] = 2*k+1; |
---|
1065 | } |
---|
1066 | } |
---|
1067 | } |
---|
1068 | if (j > 0){ |
---|
1069 | for (int k = 0; k < NumInsertStates; k++){ |
---|
1070 | float valFromMatch = insProb[c2][k] + viterbi[0 + ij1] + transProb[0][2*k+2]; |
---|
1071 | float valFromIns = insProb[c2][k] + viterbi[2*k+2 + ij1] + transProb[2*k+2][2*k+2]; |
---|
1072 | if (valFromMatch >= valFromIns){ |
---|
1073 | viterbi[2*k+2 + ij] = valFromMatch; |
---|
1074 | traceback[2*k+2 + ij] = 0; |
---|
1075 | } |
---|
1076 | else { |
---|
1077 | viterbi[2*k+2 + ij] = valFromIns; |
---|
1078 | traceback[2*k+2 + ij] = 2*k+2; |
---|
1079 | } |
---|
1080 | } |
---|
1081 | } |
---|
1082 | |
---|
1083 | ij += NumMatrixTypes; |
---|
1084 | i1j += NumMatrixTypes; |
---|
1085 | ij1 += NumMatrixTypes; |
---|
1086 | i1j1 += NumMatrixTypes; |
---|
1087 | } |
---|
1088 | } |
---|
1089 | |
---|
1090 | // figure out best terminating cell |
---|
1091 | float bestProb = LOG_ZERO; |
---|
1092 | int state = -1; |
---|
1093 | for (int k = 0; k < NumMatrixTypes; k++){ |
---|
1094 | float thisProb = viterbi[k + NumMatrixTypes * ((seq1Length+1)*(seq2Length+1) - 1)] + initialDistribution[k]; |
---|
1095 | if (bestProb < thisProb){ |
---|
1096 | bestProb = thisProb; |
---|
1097 | state = k; |
---|
1098 | } |
---|
1099 | } |
---|
1100 | assert (state != -1); |
---|
1101 | |
---|
1102 | delete viterbiPtr; |
---|
1103 | |
---|
1104 | // compute traceback |
---|
1105 | SafeVector<char> *alignment = new SafeVector<char>; assert (alignment); |
---|
1106 | int r = seq1Length, c = seq2Length; |
---|
1107 | while (r != 0 || c != 0){ |
---|
1108 | int newState = traceback[state + NumMatrixTypes * (r * (seq2Length+1) + c)]; |
---|
1109 | |
---|
1110 | if (state == 0){ c--; r--; alignment->push_back ('B'); } |
---|
1111 | else if (state % 2 == 1){ r--; alignment->push_back ('X'); } |
---|
1112 | else { c--; alignment->push_back ('Y'); } |
---|
1113 | |
---|
1114 | state = newState; |
---|
1115 | } |
---|
1116 | |
---|
1117 | delete tracebackPtr; |
---|
1118 | |
---|
1119 | reverse (alignment->begin(), alignment->end()); |
---|
1120 | |
---|
1121 | return make_pair(alignment, bestProb); |
---|
1122 | } |
---|
1123 | |
---|
1124 | ///////////////////////////////////////////////////////////////// |
---|
1125 | // ProbabilisticModel::BuildPosterior() |
---|
1126 | // |
---|
1127 | // Builds a posterior probability matrix needed to align a pair |
---|
1128 | // of alignments. Mathematically, the returned matrix M is |
---|
1129 | // defined as follows: |
---|
1130 | // M[i,j] = sum sum f(s,t,i,j) |
---|
1131 | // s in align1 t in align2 |
---|
1132 | // where |
---|
1133 | // [ P(s[i'] <--> t[j']) |
---|
1134 | // [ if s[i'] is a letter in the ith column of align1 and |
---|
1135 | // [ t[j'] it a letter in the jth column of align2 |
---|
1136 | // f(s,t,i,j) = [ |
---|
1137 | // [ 0 otherwise |
---|
1138 | // |
---|
1139 | ///////////////////////////////////////////////////////////////// |
---|
1140 | |
---|
1141 | VF *BuildPosterior (MultiSequence *align1, MultiSequence *align2, |
---|
1142 | const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices, |
---|
1143 | float cutoff = 0.0f) const { |
---|
1144 | const int seq1Length = align1->GetSequence(0)->GetLength(); |
---|
1145 | const int seq2Length = align2->GetSequence(0)->GetLength(); |
---|
1146 | |
---|
1147 | VF *posteriorPtr = new VF((seq1Length+1) * (seq2Length+1), 0); assert (posteriorPtr); |
---|
1148 | VF &posterior = *posteriorPtr; |
---|
1149 | VF::iterator postPtr = posterior.begin(); |
---|
1150 | |
---|
1151 | // for each s in align1 |
---|
1152 | for (int i = 0; i < align1->GetNumSequences(); i++){ |
---|
1153 | int first = align1->GetSequence(i)->GetLabel(); |
---|
1154 | SafeVector<int> *mapping1 = align1->GetSequence(i)->GetMapping(); |
---|
1155 | |
---|
1156 | // for each t in align2 |
---|
1157 | for (int j = 0; j < align2->GetNumSequences(); j++){ |
---|
1158 | int second = align2->GetSequence(j)->GetLabel(); |
---|
1159 | SafeVector<int> *mapping2 = align2->GetSequence(j)->GetMapping(); |
---|
1160 | if (first < second){ |
---|
1161 | |
---|
1162 | // get the associated sparse matrix |
---|
1163 | SparseMatrix *matrix = sparseMatrices[first][second]; |
---|
1164 | |
---|
1165 | for (int ii = 1; ii <= matrix->GetSeq1Length(); ii++){ |
---|
1166 | SafeVector<PIF>::iterator row = matrix->GetRowPtr(ii); |
---|
1167 | int base = (*mapping1)[ii] * (seq2Length+1); |
---|
1168 | int rowSize = matrix->GetRowSize(ii); |
---|
1169 | // add in all relevant values |
---|
1170 | for (int jj = 0; jj < rowSize; jj++) |
---|
1171 | posterior[base + (*mapping2)[row[jj].first]] += row[jj].second; |
---|
1172 | |
---|
1173 | // subtract cutoff |
---|
1174 | for (int jj = 0; jj < matrix->GetSeq2Length(); jj++) { |
---|
1175 | posterior[base + (*mapping2)[jj]] -= cutoff; |
---|
1176 | } |
---|
1177 | |
---|
1178 | } |
---|
1179 | |
---|
1180 | } else { |
---|
1181 | // get the associated sparse matrix |
---|
1182 | SparseMatrix *matrix = sparseMatrices[second][first]; |
---|
1183 | |
---|
1184 | for (int jj = 1; jj <= matrix->GetSeq1Length(); jj++){ |
---|
1185 | SafeVector<PIF>::iterator row = matrix->GetRowPtr(jj); |
---|
1186 | int base = (*mapping2)[jj]; |
---|
1187 | int rowSize = matrix->GetRowSize(jj); |
---|
1188 | |
---|
1189 | // add in all relevant values |
---|
1190 | for (int ii = 0; ii < rowSize; ii++) |
---|
1191 | posterior[base + (*mapping1)[row[ii].first] * (seq2Length + 1)] += row[ii].second; |
---|
1192 | |
---|
1193 | // subtract cutoff |
---|
1194 | for (int ii = 0; ii < matrix->GetSeq2Length(); ii++) |
---|
1195 | posterior[base + (*mapping1)[ii] * (seq2Length + 1)] -= cutoff; |
---|
1196 | } |
---|
1197 | |
---|
1198 | } |
---|
1199 | |
---|
1200 | |
---|
1201 | delete mapping2; |
---|
1202 | } |
---|
1203 | |
---|
1204 | delete mapping1; |
---|
1205 | } |
---|
1206 | |
---|
1207 | return posteriorPtr; |
---|
1208 | } |
---|
1209 | }; |
---|
1210 | } |
---|
1211 | #endif |
---|