Context Navigation

ProbabilisticModel.h

Visit:

Last change on this file was 10371, checked in by aboeckma, 12 years ago
updated mafft version. Added extensions (no svn ignore, yet)
File size: 43.1 KB

Line
1	/////////////////////////////////////////////////////////////////
2	// ProbabilisticModel.h
3	//
4	// Routines for (1) posterior probability computations
5	// (2) chained anchoring
6	// (3) maximum weight trace alignment
7	/////////////////////////////////////////////////////////////////
8
9	#ifndef PROBABILISTICMODEL_H
10	#define PROBABILISTICMODEL_H
11
12	#include <list>
13	#include <cmath>
14	#include <cstdio>
15	#include "SafeVector.h"
16	#include "ScoreType.h"
17	#include "SparseMatrix.h"
18	#include "MultiSequence.h"
19	#include "StemCandidate.hpp"
20	#include "scarna.hpp"
21	#include "nrutil.h"
22	#include <vector>
23
24	using namespace std;
25
26	const int NumMatchStates = 1; // note that in this version the number
27	// of match states is fixed at 1...will
28	// change in future versions
29	const int NumMatrixTypes = NumMatchStates + NumInsertStates * 2;
30
31	/////////////////////////////////////////////////////////////////
32	// ProbabilisticModel
33	//
34	// Class for storing the parameters of a probabilistic model and
35	// performing different computations based on those parameters.
36	// In particular, this class handles the computation of
37	// posterior probabilities that may be used in alignment.
38	/////////////////////////////////////////////////////////////////
39	namespace MXSCARNA {
40	class ProbabilisticModel {
41
42	float initialDistribution[NumMatrixTypes]; // holds the initial probabilities for each state
43	float transProb[NumMatrixTypes][NumMatrixTypes]; // holds all state-to-state transition probabilities
44	float matchProb[256][256]; // emission probabilities for match states
45	float insProb[256][NumMatrixTypes]; // emission probabilities for insert states
46	NRMat<float> WM;
47
48	public:
49
50	/////////////////////////////////////////////////////////////////
51	// ProbabilisticModel::ProbabilisticModel()
52	//
53	// Constructor. Builds a new probabilistic model using the
54	// given parameters.
55	/////////////////////////////////////////////////////////////////
56
57	ProbabilisticModel (const VF &initDistribMat, const VF &gapOpen, const VF &gapExtend,
58	const VVF &emitPairs, const VF &emitSingle){
59
60	// build transition matrix
61	VVF transMat (NumMatrixTypes, VF (NumMatrixTypes, 0.0f));
62	transMat[0][0] = 1;
63	for (int i = 0; i < NumInsertStates; i++){
64	transMat[0][2i+1] = gapOpen[2i];
65	transMat[0][2i+2] = gapOpen[2i+1];
66	transMat[0][0] -= (gapOpen[2i] + gapOpen[2i+1]);
67	assert (transMat[0][0] > 0);
68	transMat[2i+1][2i+1] = gapExtend[2*i];
69	transMat[2i+2][2i+2] = gapExtend[2*i+1];
70	transMat[2i+1][2i+2] = 0;
71	transMat[2i+2][2i+1] = 0;
72	transMat[2i+1][0] = 1 - gapExtend[2i];
73	transMat[2i+2][0] = 1 - gapExtend[2i+1];
74	}
75
76	// create initial and transition probability matrices
77	for (int i = 0; i < NumMatrixTypes; i++){
78	initialDistribution[i] = LOG (initDistribMat[i]);
79	for (int j = 0; j < NumMatrixTypes; j++)
80	transProb[i][j] = LOG (transMat[i][j]);
81	}
82
83	// create insertion and match probability matrices
84	for (int i = 0; i < 256; i++){
85	for (int j = 0; j < NumMatrixTypes; j++)
86	insProb[i][j] = LOG (emitSingle[i]);
87	for (int j = 0; j < 256; j++)
88	matchProb[i][j] = LOG (emitPairs[i][j]);
89	}
90	}
91
92	NRMat<float> weightMatchScore(std::vector<StemCandidate> pscs1, std::vector<StemCandidate> pscs2,
93	std::vector<int> matchPSCS1, std::vector<int> matchPSCS2, NRMat<float> WM) {
94	int len = WORDLENGTH;
95	int size = matchPSCS1->size();
96	float weight = 1000;
97
98	for(int iter = 0; iter < size; iter++) {
99	int i = matchPSCS1->at(iter);
100	int j = matchPSCS2->at(iter);
101
102	const StemCandidate &sc1 = pscs1->at(i);
103	const StemCandidate &sc2 = pscs2->at(j);
104
105	for(int k = 0; k < len; k++) {
106	WM[sc1.GetPosition() + k][sc2.GetPosition() + k] += weight;
107	// sumWeight += weight;
108	}
109	}
110	return WM;
111	}
112
113	/////////////////////////////////////////////////////////////////
114	// ProbabilisticModel::ComputeForwardMatrix()
115	//
116	// Computes a set of forward probability matrices for aligning
117	// seq1 and seq2.
118	//
119	// For efficiency reasons, a single-dimensional floating-point
120	// array is used here, with the following indexing scheme:
121	//
122	// forward[i + NumMatrixTypes * (j * (seq2Length+1) + k)]
123	// refers to the probability of aligning through j characters
124	// of the first sequence, k characters of the second sequence,
125	// and ending in state i.
126	/////////////////////////////////////////////////////////////////
127
128	VF ComputeForwardMatrix (Sequence seq1, Sequence *seq2) const {
129
130	assert (seq1);
131	assert (seq2);
132
133	const int seq1Length = seq1->GetLength();
134	const int seq2Length = seq2->GetLength();
135
136	// retrieve the points to the beginning of each sequence
137	SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
138	SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
139
140	// create matrix
141	VF forwardPtr = new VF (NumMatrixTypes (seq1Length+1) * (seq2Length+1), LOG_ZERO);
142	assert (forwardPtr);
143	VF &forward = *forwardPtr;
144
145	// initialization condition
146	forward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)] =
147	initialDistribution[0] + matchProb[(unsigned char) iter1[1]][(unsigned char) iter2[1]];
148
149	for (int k = 0; k < NumInsertStates; k++){
150	forward[2k+1 + NumMatrixTypes (1 * (seq2Length+1) + 0)] =
151	initialDistribution[2*k+1] + insProb[(unsigned char) iter1[1]][k];
152	forward[2k+2 + NumMatrixTypes (0 * (seq2Length+1) + 1)] =
153	initialDistribution[2*k+2] + insProb[(unsigned char) iter2[1]][k];
154	}
155
156	// remember offset for each index combination
157	int ij = 0;
158	int i1j = -seq2Length - 1;
159	int ij1 = -1;
160	int i1j1 = -seq2Length - 2;
161
162	ij *= NumMatrixTypes;
163	i1j *= NumMatrixTypes;
164	ij1 *= NumMatrixTypes;
165	i1j1 *= NumMatrixTypes;
166
167	// compute forward scores
168	for (int i = 0; i <= seq1Length; i++){
169	unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i];
170	for (int j = 0; j <= seq2Length; j++){
171	unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j];
172
173	if (i > 1 \|\| j > 1){
174	if (i > 0 && j > 0){
175	forward[0 + ij] = forward[0 + i1j1] + transProb[0][0];
176	for (int k = 1; k < NumMatrixTypes; k++)
177	LOG_PLUS_EQUALS (forward[0 + ij], forward[k + i1j1] + transProb[k][0]);
178	forward[0 + ij] += matchProb[c1][c2];
179	}
180	if (i > 0){
181	for (int k = 0; k < NumInsertStates; k++)
182	forward[2*k+1 + ij] = insProb[c1][k] +
183	LOG_ADD (forward[0 + i1j] + transProb[0][2*k+1],
184	forward[2k+1 + i1j] + transProb[2k+1][2*k+1]);
185	}
186	if (j > 0){
187	for (int k = 0; k < NumInsertStates; k++)
188	forward[2*k+2 + ij] = insProb[c2][k] +
189	LOG_ADD (forward[0 + ij1] + transProb[0][2*k+2],
190	forward[2k+2 + ij1] + transProb[2k+2][2*k+2]);
191	}
192	}
193
194	ij += NumMatrixTypes;
195	i1j += NumMatrixTypes;
196	ij1 += NumMatrixTypes;
197	i1j1 += NumMatrixTypes;
198	}
199	}
200
201	return forwardPtr;
202	}
203
204	/////////////////////////////////////////////////////////////////
205	// ProbabilisticModel::ComputeBackwardMatrix()
206	//
207	// Computes a set of backward probability matrices for aligning
208	// seq1 and seq2.
209	//
210	// For efficiency reasons, a single-dimensional floating-point
211	// array is used here, with the following indexing scheme:
212	//
213	// backward[i + NumMatrixTypes * (j * (seq2Length+1) + k)]
214	// refers to the probability of starting in state i and
215	// aligning from character j+1 to the end of the first
216	// sequence and from character k+1 to the end of the second
217	// sequence.
218	/////////////////////////////////////////////////////////////////
219
220	VF ComputeBackwardMatrix (Sequence seq1, Sequence *seq2) const {
221
222	assert (seq1);
223	assert (seq2);
224
225	const int seq1Length = seq1->GetLength();
226	const int seq2Length = seq2->GetLength();
227	SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
228	SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
229
230	// create matrix
231	VF backwardPtr = new VF (NumMatrixTypes (seq1Length+1) * (seq2Length+1), LOG_ZERO);
232	assert (backwardPtr);
233	VF &backward = *backwardPtr;
234
235	// initialization condition
236	for (int k = 0; k < NumMatrixTypes; k++)
237	backward[NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1) + k] = initialDistribution[k];
238
239	// remember offset for each index combination
240	int ij = (seq1Length+1) * (seq2Length+1) - 1;
241	int i1j = ij + seq2Length + 1;
242	int ij1 = ij + 1;
243	int i1j1 = ij + seq2Length + 2;
244
245	ij *= NumMatrixTypes;
246	i1j *= NumMatrixTypes;
247	ij1 *= NumMatrixTypes;
248	i1j1 *= NumMatrixTypes;
249
250	// compute backward scores
251	for (int i = seq1Length; i >= 0; i--){
252	unsigned char c1 = (i == seq1Length) ? '~' : (unsigned char) iter1[i+1];
253	for (int j = seq2Length; j >= 0; j--){
254	unsigned char c2 = (j == seq2Length) ? '~' : (unsigned char) iter2[j+1];
255
256	if (i < seq1Length && j < seq2Length){
257	const float ProbXY = backward[0 + i1j1] + matchProb[c1][c2];
258	for (int k = 0; k < NumMatrixTypes; k++)
259	LOG_PLUS_EQUALS (backward[k + ij], ProbXY + transProb[k][0]);
260	}
261	if (i < seq1Length){
262	for (int k = 0; k < NumInsertStates; k++){
263	LOG_PLUS_EQUALS (backward[0 + ij], backward[2k+1 + i1j] + insProb[c1][k] + transProb[0][2k+1]);
264	LOG_PLUS_EQUALS (backward[2k+1 + ij], backward[2k+1 + i1j] + insProb[c1][k] + transProb[2k+1][2k+1]);
265	}
266	}
267	if (j < seq2Length){
268	for (int k = 0; k < NumInsertStates; k++){
269	LOG_PLUS_EQUALS (backward[0 + ij], backward[2k+2 + ij1] + insProb[c2][k] + transProb[0][2k+2]);
270	LOG_PLUS_EQUALS (backward[2k+2 + ij], backward[2k+2 + ij1] + insProb[c2][k] + transProb[2k+2][2k+2]);
271	}
272	}
273
274	ij -= NumMatrixTypes;
275	i1j -= NumMatrixTypes;
276	ij1 -= NumMatrixTypes;
277	i1j1 -= NumMatrixTypes;
278	}
279	}
280
281	return backwardPtr;
282	}
283
284	/////////////////////////////////////////////////////////////////
285	// ProbabilisticModel::ComputeTotalProbability()
286	//
287	// Computes the total probability of an alignment given
288	// the forward and backward matrices.
289	/////////////////////////////////////////////////////////////////
290
291	float ComputeTotalProbability (int seq1Length, int seq2Length,
292	const VF &forward, const VF &backward) const {
293
294	// compute total probability
295	float totalForwardProb = LOG_ZERO;
296	float totalBackwardProb = LOG_ZERO;
297	for (int k = 0; k < NumMatrixTypes; k++){
298	LOG_PLUS_EQUALS (totalForwardProb,
299	forward[k + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] +
300	backward[k + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]);
301	}
302
303	totalBackwardProb =
304	forward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)] +
305	backward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)];
306
307	for (int k = 0; k < NumInsertStates; k++){
308	LOG_PLUS_EQUALS (totalBackwardProb,
309	forward[2k+1 + NumMatrixTypes (1 * (seq2Length+1) + 0)] +
310	backward[2k+1 + NumMatrixTypes (1 * (seq2Length+1) + 0)]);
311	LOG_PLUS_EQUALS (totalBackwardProb,
312	forward[2k+2 + NumMatrixTypes (0 * (seq2Length+1) + 1)] +
313	backward[2k+2 + NumMatrixTypes (0 * (seq2Length+1) + 1)]);
314	}
315
316	// cerr << totalForwardProb << " " << totalBackwardProb << endl;
317
318	return (totalForwardProb + totalBackwardProb) / 2;
319	}
320
321	/////////////////////////////////////////////////////////////////
322	// ProbabilisticModel::ComputePosteriorMatrix()
323	//
324	// Computes the posterior probability matrix based on
325	// the forward and backward matrices.
326	/////////////////////////////////////////////////////////////////
327
328	VF ComputePosteriorMatrix (Sequence seq1, Sequence *seq2,
329	const VF &forward, const VF &backward) const {
330
331	assert (seq1);
332	assert (seq2);
333
334	const int seq1Length = seq1->GetLength();
335	const int seq2Length = seq2->GetLength();
336
337	float totalProb = ComputeTotalProbability (seq1Length, seq2Length,
338	forward, backward);
339
340	// compute posterior matrices
341	VF posteriorPtr = new VF((seq1Length+1) (seq2Length+1)); assert (posteriorPtr);
342	VF &posterior = *posteriorPtr;
343
344	int ij = 0;
345	VF::iterator ptr = posterior.begin();
346
347	for (int i = 0; i <= seq1Length; i++){
348	for (int j = 0; j <= seq2Length; j++){
349	*(ptr++) = EXP (min (LOG_ONE, forward[ij] + backward[ij] - totalProb));
350	ij += NumMatrixTypes;
351	}
352	}
353
354	posterior[0] = 0;
355
356	return posteriorPtr;
357	}
358
359	/*
360	/////////////////////////////////////////////////////////////////
361	// ProbabilisticModel::ComputeExpectedCounts()
362	//
363	// Computes the expected counts for the various transitions.
364	/////////////////////////////////////////////////////////////////
365
366	VVF *ComputeExpectedCounts () const {
367
368	assert (seq1);
369	assert (seq2);
370
371	const int seq1Length = seq1->GetLength();
372	const int seq2Length = seq2->GetLength();
373	SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
374	SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
375
376	// compute total probability
377	float totalProb = ComputeTotalProbability (seq1Length, seq2Length,
378	forward, backward);
379
380	// initialize expected counts
381	VVF *countsPtr = new VVF(NumMatrixTypes + 1, VF(NumMatrixTypes, LOG_ZERO)); assert (countsPtr);
382	VVF &counts = *countsPtr;
383
384	// remember offset for each index combination
385	int ij = 0;
386	int i1j = -seq2Length - 1;
387	int ij1 = -1;
388	int i1j1 = -seq2Length - 2;
389
390	ij *= NumMatrixTypes;
391	i1j *= NumMatrixTypes;
392	ij1 *= NumMatrixTypes;
393	i1j1 *= NumMatrixTypes;
394
395	// compute expected counts
396	for (int i = 0; i <= seq1Length; i++){
397	unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i];
398	for (int j = 0; j <= seq2Length; j++){
399	unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j];
400
401	if (i > 0 && j > 0){
402	for (int k = 0; k < NumMatrixTypes; k++)
403	LOG_PLUS_EQUALS (counts[k][0],
404	forward[k + i1j1] + transProb[k][0] +
405	matchProb[c1][c2] + backward[0 + ij]);
406	}
407	if (i > 0){
408	for (int k = 0; k < NumInsertStates; k++){
409	LOG_PLUS_EQUALS (counts[0][2*k+1],
410	forward[0 + i1j] + transProb[0][2*k+1] +
411	insProb[c1][k] + backward[2*k+1 + ij]);
412	LOG_PLUS_EQUALS (counts[2k+1][2k+1],
413	forward[2k+1 + i1j] + transProb[2k+1][2*k+1] +
414	insProb[c1][k] + backward[2*k+1 + ij]);
415	}
416	}
417	if (j > 0){
418	for (int k = 0; k < NumInsertStates; k++){
419	LOG_PLUS_EQUALS (counts[0][2*k+2],
420	forward[0 + ij1] + transProb[0][2*k+2] +
421	insProb[c2][k] + backward[2*k+2 + ij]);
422	LOG_PLUS_EQUALS (counts[2k+2][2k+2],
423	forward[2k+2 + ij1] + transProb[2k+2][2*k+2] +
424	insProb[c2][k] + backward[2*k+2 + ij]);
425	}
426	}
427
428	ij += NumMatrixTypes;
429	i1j += NumMatrixTypes;
430	ij1 += NumMatrixTypes;
431	i1j1 += NumMatrixTypes;
432	}
433	}
434
435	// scale all expected counts appropriately
436	for (int i = 0; i < NumMatrixTypes; i++)
437	for (int j = 0; j < NumMatrixTypes; j++)
438	counts[i][j] -= totalProb;
439
440	}
441	*/
442
443	/////////////////////////////////////////////////////////////////
444	// ProbabilisticModel::ComputeNewParameters()
445	//
446	// Computes a new parameter set based on the expected counts
447	// given.
448	/////////////////////////////////////////////////////////////////
449
450	void ComputeNewParameters (Sequence seq1, Sequence seq2,
451	const VF &forward, const VF &backward,
452	VF &initDistribMat, VF &gapOpen,
453	VF &gapExtend, VVF &emitPairs, VF &emitSingle, bool enableTrainEmissions) const {
454
455	assert (seq1);
456	assert (seq2);
457
458	const int seq1Length = seq1->GetLength();
459	const int seq2Length = seq2->GetLength();
460	SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
461	SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
462
463	// compute total probability
464	float totalProb = ComputeTotalProbability (seq1Length, seq2Length,
465	forward, backward);
466
467	// initialize expected counts
468	VVF transCounts (NumMatrixTypes, VF (NumMatrixTypes, LOG_ZERO));
469	VF initCounts (NumMatrixTypes, LOG_ZERO);
470	VVF pairCounts (256, VF (256, LOG_ZERO));
471	VF singleCounts (256, LOG_ZERO);
472
473	// remember offset for each index combination
474	int ij = 0;
475	int i1j = -seq2Length - 1;
476	int ij1 = -1;
477	int i1j1 = -seq2Length - 2;
478
479	ij *= NumMatrixTypes;
480	i1j *= NumMatrixTypes;
481	ij1 *= NumMatrixTypes;
482	i1j1 *= NumMatrixTypes;
483
484	// compute initial distribution posteriors
485	initCounts[0] = LOG_ADD (forward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)] +
486	backward[0 + NumMatrixTypes * (1 * (seq2Length+1) + 1)],
487	forward[0 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)] +
488	backward[0 + NumMatrixTypes * ((seq1Length+1) * (seq2Length+1) - 1)]);
489	for (int k = 0; k < NumInsertStates; k++){
490	initCounts[2k+1] = LOG_ADD (forward[2k+1 + NumMatrixTypes * (1 * (seq2Length+1) + 0)] +
491	backward[2k+1 + NumMatrixTypes (1 * (seq2Length+1) + 0)],
492	forward[2k+1 + NumMatrixTypes ((seq1Length+1) * (seq2Length+1) - 1)] +
493	backward[2k+1 + NumMatrixTypes ((seq1Length+1) * (seq2Length+1) - 1)]);
494	initCounts[2k+2] = LOG_ADD (forward[2k+2 + NumMatrixTypes * (0 * (seq2Length+1) + 1)] +
495	backward[2k+2 + NumMatrixTypes (0 * (seq2Length+1) + 1)],
496	forward[2k+2 + NumMatrixTypes ((seq1Length+1) * (seq2Length+1) - 1)] +
497	backward[2k+2 + NumMatrixTypes ((seq1Length+1) * (seq2Length+1) - 1)]);
498	}
499
500	// compute expected counts
501	for (int i = 0; i <= seq1Length; i++){
502	unsigned char c1 = (i == 0) ? '~' : (unsigned char) toupper(iter1[i]);
503	for (int j = 0; j <= seq2Length; j++){
504	unsigned char c2 = (j == 0) ? '~' : (unsigned char) toupper(iter2[j]);
505
506	if (i > 0 && j > 0){
507	if (enableTrainEmissions && i == 1 && j == 1){
508	LOG_PLUS_EQUALS (pairCounts[c1][c2],
509	initialDistribution[0] + matchProb[c1][c2] + backward[0 + ij]);
510	LOG_PLUS_EQUALS (pairCounts[c2][c1],
511	initialDistribution[0] + matchProb[c2][c1] + backward[0 + ij]);
512	}
513
514	for (int k = 0; k < NumMatrixTypes; k++){
515	LOG_PLUS_EQUALS (transCounts[k][0],
516	forward[k + i1j1] + transProb[k][0] +
517	matchProb[c1][c2] + backward[0 + ij]);
518	if (enableTrainEmissions && i != 1 \|\| j != 1){
519	LOG_PLUS_EQUALS (pairCounts[c1][c2],
520	forward[k + i1j1] + transProb[k][0] +
521	matchProb[c1][c2] + backward[0 + ij]);
522	LOG_PLUS_EQUALS (pairCounts[c2][c1],
523	forward[k + i1j1] + transProb[k][0] +
524	matchProb[c2][c1] + backward[0 + ij]);
525	}
526	}
527	}
528	if (i > 0){
529	for (int k = 0; k < NumInsertStates; k++){
530	LOG_PLUS_EQUALS (transCounts[0][2*k+1],
531	forward[0 + i1j] + transProb[0][2*k+1] +
532	insProb[c1][k] + backward[2*k+1 + ij]);
533	LOG_PLUS_EQUALS (transCounts[2k+1][2k+1],
534	forward[2k+1 + i1j] + transProb[2k+1][2*k+1] +
535	insProb[c1][k] + backward[2*k+1 + ij]);
536	if (enableTrainEmissions){
537	if (i == 1 && j == 0){
538	LOG_PLUS_EQUALS (singleCounts[c1],
539	initialDistribution[2k+1] + insProb[c1][k] + backward[2k+1 + ij]);
540	}
541	else {
542	LOG_PLUS_EQUALS (singleCounts[c1],
543	forward[0 + i1j] + transProb[0][2*k+1] +
544	insProb[c1][k] + backward[2*k+1 + ij]);
545	LOG_PLUS_EQUALS (singleCounts[c1],
546	forward[2k+1 + i1j] + transProb[2k+1][2*k+1] +
547	insProb[c1][k] + backward[2*k+1 + ij]);
548	}
549	}
550	}
551	}
552	if (j > 0){
553	for (int k = 0; k < NumInsertStates; k++){
554	LOG_PLUS_EQUALS (transCounts[0][2*k+2],
555	forward[0 + ij1] + transProb[0][2*k+2] +
556	insProb[c2][k] + backward[2*k+2 + ij]);
557	LOG_PLUS_EQUALS (transCounts[2k+2][2k+2],
558	forward[2k+2 + ij1] + transProb[2k+2][2*k+2] +
559	insProb[c2][k] + backward[2*k+2 + ij]);
560	if (enableTrainEmissions){
561	if (i == 0 && j == 1){
562	LOG_PLUS_EQUALS (singleCounts[c2],
563	initialDistribution[2k+2] + insProb[c2][k] + backward[2k+2 + ij]);
564	}
565	else {
566	LOG_PLUS_EQUALS (singleCounts[c2],
567	forward[0 + ij1] + transProb[0][2*k+2] +
568	insProb[c2][k] + backward[2*k+2 + ij]);
569	LOG_PLUS_EQUALS (singleCounts[c2],
570	forward[2k+2 + ij1] + transProb[2k+2][2*k+2] +
571	insProb[c2][k] + backward[2*k+2 + ij]);
572	}
573	}
574	}
575	}
576
577	ij += NumMatrixTypes;
578	i1j += NumMatrixTypes;
579	ij1 += NumMatrixTypes;
580	i1j1 += NumMatrixTypes;
581	}
582	}
583
584	// scale all expected counts appropriately
585	for (int i = 0; i < NumMatrixTypes; i++){
586	initCounts[i] -= totalProb;
587	for (int j = 0; j < NumMatrixTypes; j++)
588	transCounts[i][j] -= totalProb;
589	}
590	if (enableTrainEmissions){
591	for (int i = 0; i < 256; i++){
592	for (int j = 0; j < 256; j++)
593	pairCounts[i][j] -= totalProb;
594	singleCounts[i] -= totalProb;
595	}
596	}
597
598	// compute new initial distribution
599	float totalInitDistribCounts = 0;
600	for (int i = 0; i < NumMatrixTypes; i++)
601	totalInitDistribCounts += exp (initCounts[i]); // should be 2
602	initDistribMat[0] = min (1.0f, max (0.0f, (float) exp (initCounts[0]) / totalInitDistribCounts));
603	for (int k = 0; k < NumInsertStates; k++){
604	float val = (exp (initCounts[2k+1]) + exp (initCounts[2k+2])) / 2;
605	initDistribMat[2k+1] = initDistribMat[2k+2] = min (1.0f, max (0.0f, val / totalInitDistribCounts));
606	}
607
608	// compute total counts for match state
609	float inMatchStateCounts = 0;
610	for (int i = 0; i < NumMatrixTypes; i++)
611	inMatchStateCounts += exp (transCounts[0][i]);
612	for (int i = 0; i < NumInsertStates; i++){
613
614	// compute total counts for gap state
615	float inGapStateCounts =
616	exp (transCounts[2*i+1][0]) +
617	exp (transCounts[2i+1][2i+1]) +
618	exp (transCounts[2*i+2][0]) +
619	exp (transCounts[2i+2][2i+2]);
620
621	gapOpen[2i] = gapOpen[2i+1] =
622	(exp (transCounts[0][2*i+1]) +
623	exp (transCounts[0][2*i+2])) /
624	(2 * inMatchStateCounts);
625
626	gapExtend[2i] = gapExtend[2i+1] =
627	(exp (transCounts[2i+1][2i+1]) +
628	exp (transCounts[2i+2][2i+2])) /
629	inGapStateCounts;
630	}
631
632	if (enableTrainEmissions){
633	float totalPairCounts = 0;
634	float totalSingleCounts = 0;
635	for (int i = 0; i < 256; i++){
636	for (int j = 0; j <= i; j++)
637	totalPairCounts += exp (pairCounts[j][i]);
638	totalSingleCounts += exp (singleCounts[i]);
639	}
640
641	for (int i = 0; i < 256; i++) if (!islower ((char) i)){
642	int li = (int)((unsigned char) tolower ((char) i));
643	for (int j = 0; j <= i; j++) if (!islower ((char) j)){
644	int lj = (int)((unsigned char) tolower ((char) j));
645	emitPairs[i][j] = emitPairs[i][lj] = emitPairs[li][j] = emitPairs[li][lj] =
646	emitPairs[j][i] = emitPairs[j][li] = emitPairs[lj][i] = emitPairs[lj][li] = exp(pairCounts[j][i]) / totalPairCounts;
647	}
648	emitSingle[i] = emitSingle[li] = exp(singleCounts[i]) / totalSingleCounts;
649	}
650	}
651	}
652
653	/////////////////////////////////////////////////////////////////
654	// ProbabilisticModel::ComputeAlignment()
655	//
656	// Computes an alignment based on the given posterior matrix.
657	// This is done by finding the maximum summing path (or
658	// maximum weight trace) through the posterior matrix. The
659	// final alignment is returned as a pair consisting of:
660	// (1) a string (e.g., XXXBBXXXBBBBBBYYYYBBB) where X's and
661	// denote insertions in one of the two sequences and
662	// B's denote that both sequences are present (i.e.
663	// matches).
664	// (2) a float indicating the sum achieved
665	/////////////////////////////////////////////////////////////////
666
667	pair<SafeVector<char> *, float> ComputeAlignment (int seq1Length, int seq2Length, const VF &posterior) const {
668
669	float twoRows = new float[(seq2Length+1)2]; assert (twoRows);
670	float *oldRow = twoRows;
671	float *newRow = twoRows + seq2Length + 1;
672
673	char tracebackMatrix = new char[(seq1Length+1)(seq2Length+1)]; assert (tracebackMatrix);
674	char *tracebackPtr = tracebackMatrix;
675
676	VF::const_iterator posteriorPtr = posterior.begin() + seq2Length + 1;
677
678	// initialization
679	for (int i = 0; i <= seq2Length; i++){
680	oldRow[i] = 0;
681	*(tracebackPtr++) = 'L';
682	}
683
684	// fill in matrix
685	for (int i = 1; i <= seq1Length; i++){
686
687	// initialize left column
688	newRow[0] = 0;
689	posteriorPtr++;
690	*(tracebackPtr++) = 'U';
691
692	// fill in rest of row
693	for (int j = 1; j <= seq2Length; j++){
694	ChooseBestOfThree (*(posteriorPtr++) + oldRow[j-1], newRow[j-1], oldRow[j],
695	'D', 'L', 'U', &newRow[j], tracebackPtr++); // Match, insert, delete
696	}
697
698	// swap rows
699	float *temp = oldRow;
700	oldRow = newRow;
701	newRow = temp;
702	}
703
704	// store best score
705	float total = oldRow[seq2Length];
706	delete [] twoRows;
707
708	// compute traceback
709	SafeVector<char> *alignment = new SafeVector<char>; assert (alignment);
710	int r = seq1Length, c = seq2Length;
711	while (r != 0 \|\| c != 0){
712	char ch = tracebackMatrix[r*(seq2Length+1) + c];
713	switch (ch){
714	case 'L': c--; alignment->push_back ('Y'); break;
715	case 'U': r--; alignment->push_back ('X'); break;
716	case 'D': c--; r--; alignment->push_back ('B'); break;
717	default: assert (false);
718	}
719	}
720
721	delete [] tracebackMatrix;
722
723	reverse (alignment->begin(), alignment->end());
724
725	return make_pair(alignment, total);
726	}
727
728	/////////////////////////////////////////////////////////////////
729	// ProbabilisticModel::ComputeAlignment2()
730	//
731	// Computes an alignment based on the given posterior matrix.
732	// This is done by finding the maximum summing path (or
733	// maximum weight trace) through the posterior matrix. The
734	// final alignment is returned as a pair consisting of:
735	// (1) a string (e.g., XXXBBXXXBBBBBBYYYYBBB) where X's and
736	// denote insertions in one of the two sequences and
737	// B's denote that both sequences are present (i.e.
738	// matches).
739	// (2) a float indicating the sum achieved
740	/////////////////////////////////////////////////////////////////
741
742	pair<SafeVector<char> *, float> ComputeAlignment2 (int seq1Length, int seq2Length,
743	const VF &posterior, std::vector<StemCandidate> pscs1, std::vector<StemCandidate> pscs2,
744	std::vector<int> matchPSCS1, std::vector<int> matchPSCS2) const {
745	NRMat<float> WM(seq1Length + 1, seq2Length + 1);
746	for (int i = 0; i <= seq1Length; i++) {
747	for (int j = 0; j <= seq2Length; j++) {
748	WM[i][j] = 0;
749	}
750	}
751
752	int len = WORDLENGTH;
753	int size = matchPSCS1->size();
754	float weight = 1000;
755
756	for(int iter = 0; iter < size; iter++) {
757	int i = matchPSCS1->at(iter);
758	int j = matchPSCS2->at(iter);
759
760	const StemCandidate &sc1 = pscs1->at(i);
761	const StemCandidate &sc2 = pscs2->at(j);
762	for(int k = 0; k < len; k++) {
763	WM[sc1.GetPosition() + k][sc2.GetPosition() + k] += weight;
764	}
765	}
766	float twoRows = new float[(seq2Length+1)2]; assert (twoRows);
767	float *oldRow = twoRows;
768	float *newRow = twoRows + seq2Length + 1;
769
770	char tracebackMatrix = new char[(seq1Length+1)(seq2Length+1)]; assert (tracebackMatrix);
771	char *tracebackPtr = tracebackMatrix;
772
773	VF::const_iterator posteriorPtr = posterior.begin() + seq2Length + 1;
774
775	// initialization
776	for (int i = 0; i <= seq2Length; i++){
777	oldRow[i] = 0;
778	*(tracebackPtr++) = 'L';
779	}
780
781	// fill in matrix
782	for (int i = 1; i <= seq1Length; i++){
783
784	// initialize left column
785	newRow[0] = 0;
786	posteriorPtr++;
787	*(tracebackPtr++) = 'U';
788
789	// fill in rest of row
790	for (int j = 1; j <= seq2Length; j++){
791	ChooseBestOfThree (*(posteriorPtr++) + oldRow[j-1] + WM[i][j], newRow[j-1], oldRow[j],
792	'D', 'L', 'U', &newRow[j], tracebackPtr++);
793	}
794
795	// swap rows
796	float *temp = oldRow;
797	oldRow = newRow;
798	newRow = temp;
799	}
800
801	// store best score
802	float total = oldRow[seq2Length];
803	delete [] twoRows;
804
805	// compute traceback
806	SafeVector<char> *alignment = new SafeVector<char>; assert (alignment);
807	int r = seq1Length, c = seq2Length;
808	while (r != 0 \|\| c != 0){
809	char ch = tracebackMatrix[r*(seq2Length+1) + c];
810	switch (ch){
811	case 'L': c--; alignment->push_back ('Y'); break;
812	case 'U': r--; alignment->push_back ('X'); break;
813	case 'D': c--; r--; alignment->push_back ('B'); break;
814	default: assert (false);
815	}
816	}
817
818	delete [] tracebackMatrix;
819
820	reverse (alignment->begin(), alignment->end());
821
822	return make_pair(alignment, total);
823	}
824
825	/////////////////////////////////////////////////////////////////
826	// ProbabilisticModel::ComputeAlignmentWithGapPenalties()
827	//
828	// Similar to ComputeAlignment() except with gap penalties.
829	/////////////////////////////////////////////////////////////////
830
831	pair<SafeVector<char> , float> ComputeAlignmentWithGapPenalties (MultiSequence align1,
832	MultiSequence *align2,
833	const VF &posterior, int numSeqs1,
834	int numSeqs2,
835	float gapOpenPenalty,
836	float gapContinuePenalty) const {
837	int seq1Length = align1->GetSequence(0)->GetLength();
838	int seq2Length = align2->GetSequence(0)->GetLength();
839	SafeVector<SafeVector<char>::iterator > dataPtrs1 (align1->GetNumSequences());
840	SafeVector<SafeVector<char>::iterator > dataPtrs2 (align2->GetNumSequences());
841
842	// grab character data
843	for (int i = 0; i < align1->GetNumSequences(); i++)
844	dataPtrs1[i] = align1->GetSequence(i)->GetDataPtr();
845	for (int i = 0; i < align2->GetNumSequences(); i++)
846	dataPtrs2[i] = align2->GetSequence(i)->GetDataPtr();
847
848	// the number of active sequences at any given column is defined to be the
849	// number of non-gap characters in that column; the number of gap opens at
850	// any given column is defined to be the number of gap characters in that
851	// column where the previous character in the respective sequence was not
852	// a gap
853	SafeVector<int> numActive1 (seq1Length+1), numGapOpens1 (seq1Length+1);
854	SafeVector<int> numActive2 (seq2Length+1), numGapOpens2 (seq2Length+1);
855
856	// compute number of active sequences and gap opens for each group
857	for (int i = 0; i < align1->GetNumSequences(); i++){
858	SafeVector<char>::iterator dataPtr = align1->GetSequence(i)->GetDataPtr();
859	numActive1[0] = numGapOpens1[0] = 0;
860	for (int j = 1; j <= seq1Length; j++){
861	if (dataPtr[j] != '-'){
862	numActive1[j]++;
863	numGapOpens1[j] += (j != 1 && dataPtr[j-1] != '-');
864	}
865	}
866	}
867	for (int i = 0; i < align2->GetNumSequences(); i++){
868	SafeVector<char>::iterator dataPtr = align2->GetSequence(i)->GetDataPtr();
869	numActive2[0] = numGapOpens2[0] = 0;
870	for (int j = 1; j <= seq2Length; j++){
871	if (dataPtr[j] != '-'){
872	numActive2[j]++;
873	numGapOpens2[j] += (j != 1 && dataPtr[j-1] != '-');
874	}
875	}
876	}
877
878	VVF openingPenalty1 (numSeqs1+1, VF (numSeqs2+1));
879	VF continuingPenalty1 (numSeqs1+1);
880	VVF openingPenalty2 (numSeqs1+1, VF (numSeqs2+1));
881	VF continuingPenalty2 (numSeqs2+1);
882
883	// precompute penalties
884	for (int i = 0; i <= numSeqs1; i++)
885	for (int j = 0; j <= numSeqs2; j++)
886	openingPenalty1[i][j] = i * (gapOpenPenalty * j + gapContinuePenalty * (numSeqs2 - j));
887	for (int i = 0; i <= numSeqs1; i++)
888	continuingPenalty1[i] = i * gapContinuePenalty * numSeqs2;
889	for (int i = 0; i <= numSeqs2; i++)
890	for (int j = 0; j <= numSeqs1; j++)
891	openingPenalty2[i][j] = i * (gapOpenPenalty * j + gapContinuePenalty * (numSeqs1 - j));
892	for (int i = 0; i <= numSeqs2; i++)
893	continuingPenalty2[i] = i * gapContinuePenalty * numSeqs1;
894
895	float twoRows = new float[6(seq2Length+1)]; assert (twoRows);
896	float *oldRowMatch = twoRows;
897	float *newRowMatch = twoRows + (seq2Length+1);
898	float oldRowInsertX = twoRows + 2(seq2Length+1);
899	float newRowInsertX = twoRows + 3(seq2Length+1);
900	float oldRowInsertY = twoRows + 4(seq2Length+1);
901	float newRowInsertY = twoRows + 5(seq2Length+1);
902
903	char tracebackMatrix = new char[3(seq1Length+1)*(seq2Length+1)]; assert (tracebackMatrix);
904	char *tracebackPtr = tracebackMatrix;
905
906	VF::const_iterator posteriorPtr = posterior.begin() + seq2Length + 1;
907
908	// initialization
909	for (int i = 0; i <= seq2Length; i++){
910	oldRowMatch[i] = oldRowInsertX[i] = (i == 0) ? 0 : LOG_ZERO;
911	oldRowInsertY[i] = (i == 0) ? 0 : oldRowInsertY[i-1] + continuingPenalty2[numActive2[i]];
912	(tracebackPtr) = (tracebackPtr+1) = *(tracebackPtr+2) = 'Y';
913	tracebackPtr += 3;
914	}
915
916	// fill in matrix
917	for (int i = 1; i <= seq1Length; i++){
918
919	// initialize left column
920	newRowMatch[0] = newRowInsertY[0] = LOG_ZERO;
921	newRowInsertX[0] = oldRowInsertX[0] + continuingPenalty1[numActive1[i]];
922	posteriorPtr++;
923	(tracebackPtr) = (tracebackPtr+1) = *(tracebackPtr+2) = 'X';
924	tracebackPtr += 3;
925
926	// fill in rest of row
927	for (int j = 1; j <= seq2Length; j++){
928
929	// going to MATCH state
930	ChooseBestOfThree (oldRowMatch[j-1],
931	oldRowInsertX[j-1],
932	oldRowInsertY[j-1],
933	'M', 'X', 'Y', &newRowMatch[j], tracebackPtr++);
934	newRowMatch[j] += *(posteriorPtr++);
935
936	// going to INSERT X state
937	ChooseBestOfThree (oldRowMatch[j] + openingPenalty1[numActive1[i]][numGapOpens2[j]],
938	oldRowInsertX[j] + continuingPenalty1[numActive1[i]],
939	oldRowInsertY[j] + openingPenalty1[numActive1[i]][numGapOpens2[j]],
940	'M', 'X', 'Y', &newRowInsertX[j], tracebackPtr++);
941
942	// going to INSERT Y state
943	ChooseBestOfThree (newRowMatch[j-1] + openingPenalty2[numActive2[j]][numGapOpens1[i]],
944	newRowInsertX[j-1] + openingPenalty2[numActive2[j]][numGapOpens1[i]],
945	newRowInsertY[j-1] + continuingPenalty2[numActive2[j]],
946	'M', 'X', 'Y', &newRowInsertY[j], tracebackPtr++);
947	}
948
949	// swap rows
950	float *temp;
951	temp = oldRowMatch; oldRowMatch = newRowMatch; newRowMatch = temp;
952	temp = oldRowInsertX; oldRowInsertX = newRowInsertX; newRowInsertX = temp;
953	temp = oldRowInsertY; oldRowInsertY = newRowInsertY; newRowInsertY = temp;
954	}
955
956	// store best score
957	float total;
958	char matrix;
959	ChooseBestOfThree (oldRowMatch[seq2Length], oldRowInsertX[seq2Length], oldRowInsertY[seq2Length],
960	'M', 'X', 'Y', &total, &matrix);
961
962	delete [] twoRows;
963
964	// compute traceback
965	SafeVector<char> *alignment = new SafeVector<char>; assert (alignment);
966	int r = seq1Length, c = seq2Length;
967	while (r != 0 \|\| c != 0){
968
969	int offset = (matrix == 'M') ? 0 : (matrix == 'X') ? 1 : 2;
970	char ch = tracebackMatrix[(r(seq2Length+1) + c) 3 + offset];
971	switch (matrix){
972	case 'Y': c--; alignment->push_back ('Y'); break;
973	case 'X': r--; alignment->push_back ('X'); break;
974	case 'M': c--; r--; alignment->push_back ('B'); break;
975	default: assert (false);
976	}
977	matrix = ch;
978	}
979
980	delete [] tracebackMatrix;
981
982	reverse (alignment->begin(), alignment->end());
983
984	return make_pair(alignment, 1.0f);
985	}
986
987
988	/////////////////////////////////////////////////////////////////
989	// ProbabilisticModel::ComputeViterbiAlignment()
990	//
991	// Computes the highest probability pairwise alignment using the
992	// probabilistic model. The final alignment is returned as a
993	// pair consisting of:
994	// (1) a string (e.g., XXXBBXXXBBBBBBYYYYBBB) where X's and
995	// denote insertions in one of the two sequences and
996	// B's denote that both sequences are present (i.e.
997	// matches).
998	// (2) a float containing the log probability of the best
999	// alignment (not used)
1000	/////////////////////////////////////////////////////////////////
1001
1002	pair<SafeVector<char> , float> ComputeViterbiAlignment (Sequence seq1, Sequence *seq2) const {
1003
1004	assert (seq1);
1005	assert (seq2);
1006
1007	const int seq1Length = seq1->GetLength();
1008	const int seq2Length = seq2->GetLength();
1009
1010	// retrieve the points to the beginning of each sequence
1011	SafeVector<char>::iterator iter1 = seq1->GetDataPtr();
1012	SafeVector<char>::iterator iter2 = seq2->GetDataPtr();
1013
1014	// create viterbi matrix
1015	VF viterbiPtr = new VF (NumMatrixTypes (seq1Length+1) * (seq2Length+1), LOG_ZERO);
1016	assert (viterbiPtr);
1017	VF &viterbi = *viterbiPtr;
1018
1019	// create traceback matrix
1020	VI tracebackPtr = new VI (NumMatrixTypes (seq1Length+1) * (seq2Length+1), -1);
1021	assert (tracebackPtr);
1022	VI &traceback = *tracebackPtr;
1023
1024	// initialization condition
1025	for (int k = 0; k < NumMatrixTypes; k++)
1026	viterbi[k] = initialDistribution[k];
1027
1028	// remember offset for each index combination
1029	int ij = 0;
1030	int i1j = -seq2Length - 1;
1031	int ij1 = -1;
1032	int i1j1 = -seq2Length - 2;
1033
1034	ij *= NumMatrixTypes;
1035	i1j *= NumMatrixTypes;
1036	ij1 *= NumMatrixTypes;
1037	i1j1 *= NumMatrixTypes;
1038
1039	// compute viterbi scores
1040	for (int i = 0; i <= seq1Length; i++){
1041	unsigned char c1 = (i == 0) ? '~' : (unsigned char) iter1[i];
1042	for (int j = 0; j <= seq2Length; j++){
1043	unsigned char c2 = (j == 0) ? '~' : (unsigned char) iter2[j];
1044
1045	if (i > 0 && j > 0){
1046	for (int k = 0; k < NumMatrixTypes; k++){
1047	float newVal = viterbi[k + i1j1] + transProb[k][0] + matchProb[c1][c2];
1048	if (viterbi[0 + ij] < newVal){
1049	viterbi[0 + ij] = newVal;
1050	traceback[0 + ij] = k;
1051	}
1052	}
1053	}
1054	if (i > 0){
1055	for (int k = 0; k < NumInsertStates; k++){
1056	float valFromMatch = insProb[c1][k] + viterbi[0 + i1j] + transProb[0][2*k+1];
1057	float valFromIns = insProb[c1][k] + viterbi[2k+1 + i1j] + transProb[2k+1][2*k+1];
1058	if (valFromMatch >= valFromIns){
1059	viterbi[2*k+1 + ij] = valFromMatch;
1060	traceback[2*k+1 + ij] = 0;
1061	}
1062	else {
1063	viterbi[2*k+1 + ij] = valFromIns;
1064	traceback[2k+1 + ij] = 2k+1;
1065	}
1066	}
1067	}
1068	if (j > 0){
1069	for (int k = 0; k < NumInsertStates; k++){
1070	float valFromMatch = insProb[c2][k] + viterbi[0 + ij1] + transProb[0][2*k+2];
1071	float valFromIns = insProb[c2][k] + viterbi[2k+2 + ij1] + transProb[2k+2][2*k+2];
1072	if (valFromMatch >= valFromIns){
1073	viterbi[2*k+2 + ij] = valFromMatch;
1074	traceback[2*k+2 + ij] = 0;
1075	}
1076	else {
1077	viterbi[2*k+2 + ij] = valFromIns;
1078	traceback[2k+2 + ij] = 2k+2;
1079	}
1080	}
1081	}
1082
1083	ij += NumMatrixTypes;
1084	i1j += NumMatrixTypes;
1085	ij1 += NumMatrixTypes;
1086	i1j1 += NumMatrixTypes;
1087	}
1088	}
1089
1090	// figure out best terminating cell
1091	float bestProb = LOG_ZERO;
1092	int state = -1;
1093	for (int k = 0; k < NumMatrixTypes; k++){
1094	float thisProb = viterbi[k + NumMatrixTypes * ((seq1Length+1)*(seq2Length+1) - 1)] + initialDistribution[k];
1095	if (bestProb < thisProb){
1096	bestProb = thisProb;
1097	state = k;
1098	}
1099	}
1100	assert (state != -1);
1101
1102	delete viterbiPtr;
1103
1104	// compute traceback
1105	SafeVector<char> *alignment = new SafeVector<char>; assert (alignment);
1106	int r = seq1Length, c = seq2Length;
1107	while (r != 0 \|\| c != 0){
1108	int newState = traceback[state + NumMatrixTypes * (r * (seq2Length+1) + c)];
1109
1110	if (state == 0){ c--; r--; alignment->push_back ('B'); }
1111	else if (state % 2 == 1){ r--; alignment->push_back ('X'); }
1112	else { c--; alignment->push_back ('Y'); }
1113
1114	state = newState;
1115	}
1116
1117	delete tracebackPtr;
1118
1119	reverse (alignment->begin(), alignment->end());
1120
1121	return make_pair(alignment, bestProb);
1122	}
1123
1124	/////////////////////////////////////////////////////////////////
1125	// ProbabilisticModel::BuildPosterior()
1126	//
1127	// Builds a posterior probability matrix needed to align a pair
1128	// of alignments. Mathematically, the returned matrix M is
1129	// defined as follows:
1130	// M[i,j] = sum sum f(s,t,i,j)
1131	// s in align1 t in align2
1132	// where
1133	// [ P(s[i'] <--> t[j'])
1134	// [ if s[i'] is a letter in the ith column of align1 and
1135	// [ t[j'] it a letter in the jth column of align2
1136	// f(s,t,i,j) = [
1137	// [ 0 otherwise
1138	//
1139	/////////////////////////////////////////////////////////////////
1140
1141	VF BuildPosterior (MultiSequence align1, MultiSequence *align2,
1142	const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1143	float cutoff = 0.0f) const {
1144	const int seq1Length = align1->GetSequence(0)->GetLength();
1145	const int seq2Length = align2->GetSequence(0)->GetLength();
1146
1147	VF posteriorPtr = new VF((seq1Length+1) (seq2Length+1), 0); assert (posteriorPtr);
1148	VF &posterior = *posteriorPtr;
1149	VF::iterator postPtr = posterior.begin();
1150
1151	// for each s in align1
1152	for (int i = 0; i < align1->GetNumSequences(); i++){
1153	int first = align1->GetSequence(i)->GetLabel();
1154	SafeVector<int> *mapping1 = align1->GetSequence(i)->GetMapping();
1155
1156	// for each t in align2
1157	for (int j = 0; j < align2->GetNumSequences(); j++){
1158	int second = align2->GetSequence(j)->GetLabel();
1159	SafeVector<int> *mapping2 = align2->GetSequence(j)->GetMapping();
1160	if (first < second){
1161
1162	// get the associated sparse matrix
1163	SparseMatrix *matrix = sparseMatrices[first][second];
1164
1165	for (int ii = 1; ii <= matrix->GetSeq1Length(); ii++){
1166	SafeVector<PIF>::iterator row = matrix->GetRowPtr(ii);
1167	int base = (mapping1)[ii] (seq2Length+1);
1168	int rowSize = matrix->GetRowSize(ii);
1169	// add in all relevant values
1170	for (int jj = 0; jj < rowSize; jj++)
1171	posterior[base + (*mapping2)[row[jj].first]] += row[jj].second;
1172
1173	// subtract cutoff
1174	for (int jj = 0; jj < matrix->GetSeq2Length(); jj++) {
1175	posterior[base + (*mapping2)[jj]] -= cutoff;
1176	}
1177
1178	}
1179
1180	} else {
1181	// get the associated sparse matrix
1182	SparseMatrix *matrix = sparseMatrices[second][first];
1183
1184	for (int jj = 1; jj <= matrix->GetSeq1Length(); jj++){
1185	SafeVector<PIF>::iterator row = matrix->GetRowPtr(jj);
1186	int base = (*mapping2)[jj];
1187	int rowSize = matrix->GetRowSize(jj);
1188
1189	// add in all relevant values
1190	for (int ii = 0; ii < rowSize; ii++)
1191	posterior[base + (mapping1)[row[ii].first] (seq2Length + 1)] += row[ii].second;
1192
1193	// subtract cutoff
1194	for (int ii = 0; ii < matrix->GetSeq2Length(); ii++)
1195	posterior[base + (mapping1)[ii] (seq2Length + 1)] -= cutoff;
1196	}
1197
1198	}
1199
1200
1201	delete mapping2;
1202	}
1203
1204	delete mapping1;
1205	}
1206
1207	return posteriorPtr;
1208	}
1209	};
1210	}
1211	#endif

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/GDE/MAFFT/mafft-7.055-with-extensions/extensions/mxscarna_src/probconsRNA/ProbabilisticModel.h

Download in other formats: