Context Navigation

source: branches/ali/GDE/PROBCONS/probcons/Main.cc

Visit:

Last change on this file was 10411, checked in by aboeckma, 12 years ago
make probcons compile on linux (added cstring include )
File size: 51.8 KB

Line
1	/////////////////////////////////////////////////////////////////
2	// Main.cc
3	//
4	// Main routines for PROBCONS program.
5	/////////////////////////////////////////////////////////////////
6
7	#include "SafeVector.h"
8	#include "MultiSequence.h"
9	#include "Defaults.h"
10	#include "ScoreType.h"
11	#include "ProbabilisticModel.h"
12	#include "EvolutionaryTree.h"
13	#include "SparseMatrix.h"
14	#include <string>
15	#include <sstream>
16	#include <iomanip>
17	#include <iostream>
18	#include <list>
19	#include <set>
20	#include <algorithm>
21	#include <climits>
22	#include <cstdio>
23	#include <cstdlib>
24	#include <cerrno>
25	#include <iomanip>
26	#include <cstring>
27
28	string parametersInputFilename = "";
29	string parametersOutputFilename = "no training";
30	string annotationFilename = "";
31
32	bool enableTraining = false;
33	bool enableVerbose = false;
34	bool enableAllPairs = false;
35	bool enableAnnotation = false;
36	bool enableViterbi = false;
37	bool enableClustalWOutput = false;
38	bool enableTrainEmissions = false;
39	bool enableAlignOrder = false;
40	int numConsistencyReps = 2;
41	int numPreTrainingReps = 0;
42	int numIterativeRefinementReps = 100;
43
44	float cutoff = 0;
45	float gapOpenPenalty = 0;
46	float gapContinuePenalty = 0;
47
48	VF initDistrib (NumMatrixTypes);
49	VF gapOpen (2*NumInsertStates);
50	VF gapExtend (2*NumInsertStates);
51	VVF emitPairs (256, VF (256, 1e-10));
52	VF emitSingle (256, 1e-5);
53
54	string alphabet = alphabetDefault;
55
56	const int MIN_PRETRAINING_REPS = 0;
57	const int MAX_PRETRAINING_REPS = 20;
58	const int MIN_CONSISTENCY_REPS = 0;
59	const int MAX_CONSISTENCY_REPS = 5;
60	const int MIN_ITERATIVE_REFINEMENT_REPS = 0;
61	const int MAX_ITERATIVE_REFINEMENT_REPS = 1000;
62
63	/////////////////////////////////////////////////////////////////
64	// Function prototypes
65	/////////////////////////////////////////////////////////////////
66
67	void PrintHeading();
68	void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen,
69	const VF &gapExtend, const VVF &emitPairs, const VF &emitSingle, const char *filename);
70	MultiSequence DoAlign (MultiSequence sequence, const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen, VF &gapExtend,
71	VVF &emitPairs, VF &emitSingle);
72	SafeVector<string> ParseParams (int argc, char **argv);
73	void ReadParameters ();
74	MultiSequence ComputeFinalAlignment (const TreeNode tree, MultiSequence *sequences,
75	const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
76	const ProbabilisticModel &model);
77	MultiSequence AlignAlignments (MultiSequence align1, MultiSequence *align2,
78	const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
79	const ProbabilisticModel &model);
80	SafeVector<SafeVector<SparseMatrix > > DoRelaxation (MultiSequence sequences,
81	SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
82	void Relax (SparseMatrix matXZ, SparseMatrix matZY, VF &posterior);
83	void Relax1 (SparseMatrix matXZ, SparseMatrix matZY, VF &posterior);
84
85	set<int> GetSubtree (const TreeNode *tree);
86	void TreeBasedBiPartitioning (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
87	const ProbabilisticModel &model, MultiSequence* &alignment,
88	const TreeNode *tree);
89	void DoIterativeRefinement (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
90	const ProbabilisticModel &model, MultiSequence* &alignment);
91	void WriteAnnotation (MultiSequence *alignment,
92	const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
93	int ComputeScore (const SafeVector<pair<int, int> > &active,
94	const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
95
96
97	/////////////////////////////////////////////////////////////////
98	// main()
99	//
100	// Calls all initialization routines and runs the PROBCONS
101	// aligner.
102	/////////////////////////////////////////////////////////////////
103
104	int main (int argc, char **argv){
105
106	// print PROBCONS heading
107	PrintHeading();
108
109	// parse program parameters
110	SafeVector<string> sequenceNames = ParseParams (argc, argv);
111	ReadParameters();
112	PrintParameters ("Using parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);
113
114	// now, we'll process all the files given as input. If we are given
115	// several filenames as input, then we'll load all of those sequences
116	// simultaneously, as long as we're not training. On the other hand,
117	// if we are training, then we'll treat each file as a separate
118	// training instance
119
120	// if we are training
121	if (enableTraining){
122
123	// build new model for aligning
124	ProbabilisticModel model (initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);
125
126	// prepare to average parameters
127	for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] = 0;
128	for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] = 0;
129	for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] = 0;
130	if (enableTrainEmissions){
131	for (int i = 0; i < (int) emitPairs.size(); i++)
132	for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] = 0;
133	for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] = 0;
134	}
135
136	// align each file individually
137	for (int i = 0; i < (int) sequenceNames.size(); i++){
138
139	VF thisInitDistrib (NumMatrixTypes);
140	VF thisGapOpen (2*NumInsertStates);
141	VF thisGapExtend (2*NumInsertStates);
142	VVF thisEmitPairs (256, VF (256, 1e-10));
143	VF thisEmitSingle (256, 1e-5);
144
145	// load sequence file
146	MultiSequence *sequences = new MultiSequence(); assert (sequences);
147	cerr << "Loading sequence file: " << sequenceNames[i] << endl;
148	sequences->LoadMFA (sequenceNames[i], true);
149
150	// align sequences
151	DoAlign (sequences, model, thisInitDistrib, thisGapOpen, thisGapExtend, thisEmitPairs, thisEmitSingle);
152
153	// add in contribution of the derived parameters
154	for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] += thisInitDistrib[i];
155	for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] += thisGapOpen[i];
156	for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] += thisGapExtend[i];
157	if (enableTrainEmissions){
158	for (int i = 0; i < (int) emitPairs.size(); i++)
159	for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] += thisEmitPairs[i][j];
160	for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] += thisEmitSingle[i];
161	}
162
163	delete sequences;
164	}
165
166	// compute new parameters and print them out
167	for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] /= (int) sequenceNames.size();
168	for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] /= (int) sequenceNames.size();
169	for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] /= (int) sequenceNames.size();
170	if (enableTrainEmissions){
171	for (int i = 0; i < (int) emitPairs.size(); i++)
172	for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] /= (int) sequenceNames.size();
173	for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] /= sequenceNames.size();
174	}
175
176	PrintParameters ("Trained parameter set:",
177	initDistrib, gapOpen, gapExtend, emitPairs, emitSingle,
178	parametersOutputFilename.c_str());
179	}
180
181	// if we are not training, we must simply want to align some sequences
182	else {
183
184	// load all files together
185	MultiSequence *sequences = new MultiSequence(); assert (sequences);
186	for (int i = 0; i < (int) sequenceNames.size(); i++){
187	cerr << "Loading sequence file: " << sequenceNames[i] << endl;
188	sequences->LoadMFA (sequenceNames[i], true);
189	}
190
191	// do all "pre-training" repetitions first
192	for (int ct = 0; ct < numPreTrainingReps; ct++){
193	enableTraining = true;
194
195	// build new model for aligning
196	ProbabilisticModel model (initDistrib, gapOpen, gapExtend,
197	emitPairs, emitSingle);
198
199	// do initial alignments
200	DoAlign (sequences, model, initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);
201
202	// print new parameters
203	PrintParameters ("Recomputed parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);
204
205	enableTraining = false;
206	}
207
208	// now, we can perform the alignments and write them out
209	MultiSequence *alignment = DoAlign (sequences,
210	ProbabilisticModel (initDistrib, gapOpen, gapExtend, emitPairs, emitSingle),
211	initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);
212
213	if (!enableAllPairs){
214	if (enableClustalWOutput)
215	alignment->WriteALN (cout);
216	else
217	alignment->WriteMFA (cout);
218	}
219	delete alignment;
220	delete sequences;
221
222	}
223	}
224
225	/////////////////////////////////////////////////////////////////
226	// PrintHeading()
227	//
228	// Prints heading for PROBCONS program.
229	/////////////////////////////////////////////////////////////////
230
231	void PrintHeading (){
232	cerr << endl
233	<< "PROBCONS version " << VERSION << " - align multiple protein sequences and print to standard output" << endl
234	<< "Written by Chuong Do" << endl
235	<< endl;
236	}
237
238	/////////////////////////////////////////////////////////////////
239	// PrintParameters()
240	//
241	// Prints PROBCONS parameters to STDERR. If a filename is
242	// specified, then the parameters are also written to the file.
243	/////////////////////////////////////////////////////////////////
244
245	void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen,
246	const VF &gapExtend, const VVF &emitPairs, const VF &emitSingle, const char *filename){
247
248	// print parameters to the screen
249	cerr << message << endl
250	<< " initDistrib[] = { ";
251	for (int i = 0; i < NumMatrixTypes; i++) cerr << setprecision (10) << initDistrib[i] << " ";
252	cerr << "}" << endl
253	<< " gapOpen[] = { ";
254	for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapOpen[i] << " ";
255	cerr << "}" << endl
256	<< " gapExtend[] = { ";
257	for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapExtend[i] << " ";
258	cerr << "}" << endl
259	<< endl;
260
261	/*
262	for (int i = 0; i < 5; i++){
263	for (int j = 0; j <= i; j++){
264	cerr << emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]] << " ";
265	}
266	cerr << endl;
267	}*/
268
269	// if a file name is specified
270	if (filename){
271
272	// attempt to open the file for writing
273	FILE *file = fopen (filename, "w");
274	if (!file){
275	cerr << "ERROR: Unable to write parameter file: " << filename << endl;
276	exit (1);
277	}
278
279	// if successful, then write the parameters to the file
280	for (int i = 0; i < NumMatrixTypes; i++) fprintf (file, "%.10f ", initDistrib[i]); fprintf (file, "\n");
281	for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapOpen[i]); fprintf (file, "\n");
282	for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapExtend[i]); fprintf (file, "\n");
283	fprintf (file, "%s\n", alphabet.c_str());
284	for (int i = 0; i < (int) alphabet.size(); i++){
285	for (int j = 0; j <= i; j++)
286	fprintf (file, "%.10f ", emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]]);
287	fprintf (file, "\n");
288	}
289	for (int i = 0; i < (int) alphabet.size(); i++)
290	fprintf (file, "%.10f ", emitSingle[(unsigned char) alphabet[i]]);
291	fprintf (file, "\n");
292	fclose (file);
293	}
294	}
295
296	/////////////////////////////////////////////////////////////////
297	// DoAlign()
298	//
299	// First computes all pairwise posterior probability matrices.
300	// Then, computes new parameters if training, or a final
301	// alignment, otherwise.
302	/////////////////////////////////////////////////////////////////
303
304	MultiSequence DoAlign (MultiSequence sequences, const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen,
305	VF &gapExtend, VVF &emitPairs, VF &emitSingle){
306
307	assert (sequences);
308
309	const int numSeqs = sequences->GetNumSequences();
310	VVF distances (numSeqs, VF (numSeqs, 0));
311	SafeVector<SafeVector<SparseMatrix > > sparseMatrices (numSeqs, SafeVector<SparseMatrix >(numSeqs, NULL));
312
313
314
315	if (enableTraining){
316	// prepare to average parameters
317	for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] = 0;
318	for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] = 0;
319	for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] = 0;
320	if (enableTrainEmissions){
321	for (int i = 0; i < (int) emitPairs.size(); i++)
322	for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] = 0;
323	for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] = 0;
324	}
325	}
326
327	// skip posterior calculations if we just want to do Viterbi alignments
328	if (!enableViterbi){
329
330	// do all pairwise alignments for posterior probability matrices
331	for (int a = 0; a < numSeqs-1; a++){
332	for (int b = a+1; b < numSeqs; b++){
333	Sequence *seq1 = sequences->GetSequence (a);
334	Sequence *seq2 = sequences->GetSequence (b);
335
336	// verbose output
337	if (enableVerbose)
338	cerr << "Computing posterior matrix: (" << a+1 << ") " << seq1->GetHeader() << " vs. "
339	<< "(" << b+1 << ") " << seq2->GetHeader() << " -- ";
340
341	// compute forward and backward probabilities
342	VF *forward = model.ComputeForwardMatrix (seq1, seq2); assert (forward);
343	VF *backward = model.ComputeBackwardMatrix (seq1, seq2); assert (backward);
344
345	// if we are training, then we'll simply want to compute the
346	// expected counts for each region within the matrix separately;
347	// otherwise, we'll need to put all of the regions together and
348	// assemble a posterior probability match matrix
349
350	// so, if we're training
351	if (enableTraining){
352
353	// compute new parameters
354	VF thisInitDistrib (NumMatrixTypes);
355	VF thisGapOpen (2*NumInsertStates);
356	VF thisGapExtend (2*NumInsertStates);
357	VVF thisEmitPairs (256, VF (256, 1e-10));
358	VF thisEmitSingle (256, 1e-5);
359
360	model.ComputeNewParameters (seq1, seq2, forward, backward, thisInitDistrib, thisGapOpen, thisGapExtend, thisEmitPairs, thisEmitSingle, enableTrainEmissions);
361
362	// add in contribution of the derived parameters
363	for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] += thisInitDistrib[i];
364	for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] += thisGapOpen[i];
365	for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] += thisGapExtend[i];
366	if (enableTrainEmissions){
367	for (int i = 0; i < (int) emitPairs.size(); i++)
368	for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] += thisEmitPairs[i][j];
369	for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] += thisEmitSingle[i];
370	}
371
372	// let us know that we're done.
373	if (enableVerbose) cerr << "done." << endl;
374	}
375	else {
376
377	// compute posterior probability matrix
378	VF posterior = model.ComputePosteriorMatrix (seq1, seq2, forward, *backward); assert (posterior);
379
380	// compute sparse representations
381	sparseMatrices[a][b] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), *posterior);
382	sparseMatrices[b][a] = NULL;
383
384	if (!enableAllPairs){
385	// perform the pairwise sequence alignment
386	pair<SafeVector<char> *, float> alignment = model.ComputeAlignment (seq1->GetLength(),
387	seq2->GetLength(),
388	*posterior);
389
390	// compute "expected accuracy" distance for evolutionary tree computation
391	float distance = alignment.second / min (seq1->GetLength(), seq2->GetLength());
392	distances[a][b] = distances[b][a] = distance;
393
394	if (enableVerbose)
395	cerr << setprecision (10) << distance << endl;
396
397	delete alignment.first;
398	}
399	else {
400	// let us know that we're done.
401	if (enableVerbose) cerr << "done." << endl;
402	}
403
404	delete posterior;
405	}
406
407	delete forward;
408	delete backward;
409	}
410	}
411	}
412
413	// now average out parameters derived
414	if (enableTraining){
415
416	// compute new parameters
417	for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] /= numSeqs * (numSeqs - 1) / 2;
418	for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] /= numSeqs * (numSeqs - 1) / 2;
419	for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] /= numSeqs * (numSeqs - 1) / 2;
420
421	if (enableTrainEmissions){
422	for (int i = 0; i < (int) emitPairs.size(); i++)
423	for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] /= numSeqs * (numSeqs - 1) / 2;
424	for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] /= numSeqs * (numSeqs - 1) / 2;
425	}
426	}
427
428	// see if we still want to do some alignments
429	else {
430
431	if (!enableViterbi){
432
433	// perform the consistency transformation the desired number of times
434	for (int r = 0; r < numConsistencyReps; r++){
435	SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices = DoRelaxation (sequences, sparseMatrices);
436
437	// now replace the old posterior matrices
438	for (int i = 0; i < numSeqs; i++){
439	for (int j = 0; j < numSeqs; j++){
440	delete sparseMatrices[i][j];
441	sparseMatrices[i][j] = newSparseMatrices[i][j];
442	}
443	}
444	}
445	}
446
447	MultiSequence *finalAlignment = NULL;
448
449	if (enableAllPairs){
450	for (int a = 0; a < numSeqs-1; a++){
451	for (int b = a+1; b < numSeqs; b++){
452	Sequence *seq1 = sequences->GetSequence (a);
453	Sequence *seq2 = sequences->GetSequence (b);
454
455	if (enableVerbose)
456	cerr << "Performing pairwise alignment: (" << a+1 << ") " << seq1->GetHeader() << " vs. "
457	<< "(" << b+1 << ") " << seq2->GetHeader() << " -- ";
458
459
460	// perform the pairwise sequence alignment
461	pair<SafeVector<char> *, float> alignment;
462	if (enableViterbi)
463	alignment = model.ComputeViterbiAlignment (seq1, seq2);
464	else {
465
466	// build posterior matrix
467	VF *posterior = sparseMatrices[a][b]->GetPosterior(); assert (posterior);
468	int length = (seq1->GetLength() + 1) * (seq2->GetLength() + 1);
469	for (int i = 0; i < length; i++) (*posterior)[i] -= cutoff;
470
471	alignment = model.ComputeAlignment (seq1->GetLength(), seq2->GetLength(), *posterior);
472	delete posterior;
473	}
474
475	// write pairwise alignments
476	string name = seq1->GetHeader() + "-" + seq2->GetHeader() + (enableClustalWOutput ? ".aln" : ".fasta");
477	ofstream outfile (name.c_str());
478
479	MultiSequence *result = new MultiSequence();
480	result->AddSequence (seq1->AddGaps(alignment.first, 'X'));
481	result->AddSequence (seq2->AddGaps(alignment.first, 'Y'));
482	if (enableClustalWOutput)
483	result->WriteALN (outfile);
484	else
485	result->WriteMFA (outfile);
486
487	outfile.close();
488
489	delete alignment.first;
490	}
491	}
492	}
493
494	// now if we still need to do a final multiple alignment
495	else {
496
497	if (enableVerbose)
498	cerr << endl;
499
500	// compute the evolutionary tree
501	TreeNode *tree = TreeNode::ComputeTree (distances);
502
503	tree->Print (cerr, sequences);
504	cerr << endl;
505
506	// make the final alignment
507	finalAlignment = ComputeFinalAlignment (tree, sequences, sparseMatrices, model);
508
509	// build annotation
510	if (enableAnnotation){
511	WriteAnnotation (finalAlignment, sparseMatrices);
512	}
513
514	delete tree;
515	}
516
517	if (!enableViterbi){
518	// delete sparse matrices
519	for (int a = 0; a < numSeqs-1; a++){
520	for (int b = a+1; b < numSeqs; b++){
521	delete sparseMatrices[a][b];
522	delete sparseMatrices[b][a];
523	}
524	}
525	}
526
527	return finalAlignment;
528	}
529
530	return NULL;
531	}
532
533	/////////////////////////////////////////////////////////////////
534	// GetInteger()
535	//
536	// Attempts to parse an integer from the character string given.
537	// Returns true only if no parsing error occurs.
538	/////////////////////////////////////////////////////////////////
539
540	bool GetInteger (char data, int val){
541	char *endPtr;
542	long int retVal;
543
544	assert (val);
545
546	errno = 0;
547	retVal = strtol (data, &endPtr, 0);
548	if (retVal == 0 && (errno != 0 \|\| data == endPtr)) return false;
549	if (errno != 0 && (retVal == LONG_MAX \|\| retVal == LONG_MIN)) return false;
550	if (retVal < (long) INT_MIN \|\| retVal > (long) INT_MAX) return false;
551	*val = (int) retVal;
552	return true;
553	}
554
555	/////////////////////////////////////////////////////////////////
556	// GetFloat()
557	//
558	// Attempts to parse a float from the character string given.
559	// Returns true only if no parsing error occurs.
560	/////////////////////////////////////////////////////////////////
561
562	bool GetFloat (char data, float val){
563	char *endPtr;
564	double retVal;
565
566	assert (val);
567
568	errno = 0;
569	retVal = strtod (data, &endPtr);
570	if (retVal == 0 && (errno != 0 \|\| data == endPtr)) return false;
571	if (errno != 0 && (retVal >= 1000000.0 \|\| retVal <= -1000000.0)) return false;
572	*val = (float) retVal;
573	return true;
574	}
575
576	/////////////////////////////////////////////////////////////////
577	// ParseParams()
578	//
579	// Parse all command-line options.
580	/////////////////////////////////////////////////////////////////
581
582	SafeVector<string> ParseParams (int argc, char **argv){
583
584	if (argc < 2){
585
586	cerr << "PROBCONS comes with ABSOLUTELY NO WARRANTY. This is free software, and" << endl
587	<< "you are welcome to redistribute it under certain conditions. See the" << endl
588	<< "file COPYING.txt for details." << endl
589	<< endl
590	<< "Usage:" << endl
591	<< " probcons [OPTION]... [MFAFILE]..." << endl
592	<< endl
593	<< "Description:" << endl
594	<< " Align sequences in MFAFILE(s) and print result to standard output" << endl
595	<< endl
596	<< " -clustalw" << endl
597	<< " use CLUSTALW output format instead of MFA" << endl
598	<< endl
599	<< " -c, --consistency REPS" << endl
600	<< " use " << MIN_CONSISTENCY_REPS << " <= REPS <= " << MAX_CONSISTENCY_REPS
601	<< " (default: " << numConsistencyReps << ") passes of consistency transformation" << endl
602	<< endl
603	<< " -ir, --iterative-refinement REPS" << endl
604	<< " use " << MIN_ITERATIVE_REFINEMENT_REPS << " <= REPS <= " << MAX_ITERATIVE_REFINEMENT_REPS
605	<< " (default: " << numIterativeRefinementReps << ") passes of iterative-refinement" << endl
606	<< endl
607	<< " -pre, --pre-training REPS" << endl
608	<< " use " << MIN_PRETRAINING_REPS << " <= REPS <= " << MAX_PRETRAINING_REPS
609	<< " (default: " << numPreTrainingReps << ") rounds of pretraining" << endl
610	<< endl
611	<< " -pairs" << endl
612	<< " generate all-pairs pairwise alignments" << endl
613	<< endl
614	<< " -viterbi" << endl
615	<< " use Viterbi algorithm to generate all pairs (automatically enables -pairs)" << endl
616	<< endl
617	<< " -v, --verbose" << endl
618	<< " report progress while aligning (default: " << (enableVerbose ? "on" : "off") << ")" << endl
619	<< endl
620	<< " -annot FILENAME" << endl
621	<< " write annotation for multiple alignment to FILENAME" << endl
622	<< endl
623	<< " -t, --train FILENAME" << endl
624	<< " compute EM transition probabilities, store in FILENAME (default: "
625	<< parametersOutputFilename << ")" << endl
626	<< endl
627	<< " -e, --emissions" << endl
628	<< " also reestimate emission probabilities (default: "
629	<< (enableTrainEmissions ? "on" : "off") << ")" << endl
630	<< endl
631	<< " -p, --paramfile FILENAME" << endl
632	<< " read parameters from FILENAME (default: "
633	<< parametersInputFilename << ")" << endl
634	<< endl
635	<< " -a, --alignment-order" << endl
636	<< " print sequences in alignment order rather than input order (default: "
637	<< (enableAlignOrder ? "on" : "off") << ")" << endl
638	<< endl;
639	// << " -go, --gap-open VALUE" << endl
640	// << " gap opening penalty of VALUE <= 0 (default: " << gapOpenPenalty << ")" << endl
641	// << endl
642	// << " -ge, --gap-extension VALUE" << endl
643	// << " gap extension penalty of VALUE <= 0 (default: " << gapContinuePenalty << ")" << endl
644	// << endl
645	// << " -co, --cutoff CUTOFF" << endl
646	// << " subtract 0 <= CUTOFF <= 1 (default: " << cutoff << ") from all posterior values before final alignment" << endl
647	// << endl
648
649	exit (1);
650	}
651
652	SafeVector<string> sequenceNames;
653	int tempInt;
654	float tempFloat;
655
656	for (int i = 1; i < argc; i++){
657	if (argv[i][0] == '-'){
658
659	// training
660	if (!strcmp (argv[i], "-t") \|\| !strcmp (argv[i], "--train")){
661	enableTraining = true;
662	if (i < argc - 1)
663	parametersOutputFilename = string (argv[++i]);
664	else {
665	cerr << "ERROR: Filename expected for option " << argv[i] << endl;
666	exit (1);
667	}
668	}
669
670	// emission training
671	else if (!strcmp (argv[i], "-e") \|\| !strcmp (argv[i], "--emissions")){
672	enableTrainEmissions = true;
673	}
674
675	// parameter file
676	else if (!strcmp (argv[i], "-p") \|\| !strcmp (argv[i], "--paramfile")){
677	if (i < argc - 1)
678	parametersInputFilename = string (argv[++i]);
679	else {
680	cerr << "ERROR: Filename expected for option " << argv[i] << endl;
681	exit (1);
682	}
683	}
684
685	// number of consistency transformations
686	else if (!strcmp (argv[i], "-c") \|\| !strcmp (argv[i], "--consistency")){
687	if (i < argc - 1){
688	if (!GetInteger (argv[++i], &tempInt)){
689	cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
690	exit (1);
691	}
692	else {
693	if (tempInt < MIN_CONSISTENCY_REPS \|\| tempInt > MAX_CONSISTENCY_REPS){
694	cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
695	<< MIN_CONSISTENCY_REPS << " and " << MAX_CONSISTENCY_REPS << "." << endl;
696	exit (1);
697	}
698	else
699	numConsistencyReps = tempInt;
700	}
701	}
702	else {
703	cerr << "ERROR: Integer expected for option " << argv[i] << endl;
704	exit (1);
705	}
706	}
707
708	// number of randomized partitioning iterative refinement passes
709	else if (!strcmp (argv[i], "-ir") \|\| !strcmp (argv[i], "--iterative-refinement")){
710	if (i < argc - 1){
711	if (!GetInteger (argv[++i], &tempInt)){
712	cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
713	exit (1);
714	}
715	else {
716	if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS \|\| tempInt > MAX_ITERATIVE_REFINEMENT_REPS){
717	cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
718	<< MIN_ITERATIVE_REFINEMENT_REPS << " and " << MAX_ITERATIVE_REFINEMENT_REPS << "." << endl;
719	exit (1);
720	}
721	else
722	numIterativeRefinementReps = tempInt;
723	}
724	}
725	else {
726	cerr << "ERROR: Integer expected for option " << argv[i] << endl;
727	exit (1);
728	}
729	}
730
731	// number of EM pre-training rounds
732	else if (!strcmp (argv[i], "-pre") \|\| !strcmp (argv[i], "--pre-training")){
733	if (i < argc - 1){
734	if (!GetInteger (argv[++i], &tempInt)){
735	cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
736	exit (1);
737	}
738	else {
739	if (tempInt < MIN_PRETRAINING_REPS \|\| tempInt > MAX_PRETRAINING_REPS){
740	cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
741	<< MIN_PRETRAINING_REPS << " and " << MAX_PRETRAINING_REPS << "." << endl;
742	exit (1);
743	}
744	else
745	numPreTrainingReps = tempInt;
746	}
747	}
748	else {
749	cerr << "ERROR: Integer expected for option " << argv[i] << endl;
750	exit (1);
751	}
752	}
753
754	// gap open penalty
755	else if (!strcmp (argv[i], "-go") \|\| !strcmp (argv[i], "--gap-open")){
756	if (i < argc - 1){
757	if (!GetFloat (argv[++i], &tempFloat)){
758	cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
759	exit (1);
760	}
761	else {
762	if (tempFloat > 0){
763	cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl;
764	exit (1);
765	}
766	else
767	gapOpenPenalty = tempFloat;
768	}
769	}
770	else {
771	cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
772	exit (1);
773	}
774	}
775
776	// gap extension penalty
777	else if (!strcmp (argv[i], "-ge") \|\| !strcmp (argv[i], "--gap-extension")){
778	if (i < argc - 1){
779	if (!GetFloat (argv[++i], &tempFloat)){
780	cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
781	exit (1);
782	}
783	else {
784	if (tempFloat > 0){
785	cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl;
786	exit (1);
787	}
788	else
789	gapContinuePenalty = tempFloat;
790	}
791	}
792	else {
793	cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
794	exit (1);
795	}
796	}
797
798	// all-pairs pairwise alignments
799	else if (!strcmp (argv[i], "-pairs")){
800	enableAllPairs = true;
801	}
802
803	// all-pairs pairwise Viterbi alignments
804	else if (!strcmp (argv[i], "-viterbi")){
805	enableAllPairs = true;
806	enableViterbi = true;
807	}
808
809	// annotation files
810	else if (!strcmp (argv[i], "-annot")){
811	enableAnnotation = true;
812	if (i < argc - 1)
813	annotationFilename = argv[++i];
814	else {
815	cerr << "ERROR: FILENAME expected for option " << argv[i] << endl;
816	exit (1);
817	}
818	}
819
820	// clustalw output format
821	else if (!strcmp (argv[i], "-clustalw")){
822	enableClustalWOutput = true;
823	}
824
825	// cutoff
826	else if (!strcmp (argv[i], "-co") \|\| !strcmp (argv[i], "--cutoff")){
827	if (i < argc - 1){
828	if (!GetFloat (argv[++i], &tempFloat)){
829	cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
830	exit (1);
831	}
832	else {
833	if (tempFloat < 0 \|\| tempFloat > 1){
834	cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must be between 0 and 1." << endl;
835	exit (1);
836	}
837	else
838	cutoff = tempFloat;
839	}
840	}
841	else {
842	cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
843	exit (1);
844	}
845	}
846
847	// verbose reporting
848	else if (!strcmp (argv[i], "-v") \|\| !strcmp (argv[i], "--verbose")){
849	enableVerbose = true;
850	}
851
852	// alignment order
853	else if (!strcmp (argv[i], "-a") \|\| !strcmp (argv[i], "--alignment-order")){
854	enableAlignOrder = true;
855	}
856
857	// bad arguments
858	else {
859	cerr << "ERROR: Unrecognized option: " << argv[i] << endl;
860	exit (1);
861	}
862	}
863	else {
864	sequenceNames.push_back (string (argv[i]));
865	}
866	}
867
868	if (enableTrainEmissions && !enableTraining){
869	cerr << "ERROR: Training emissions (-e) requires training (-t)" << endl;
870	exit (1);
871	}
872
873	return sequenceNames;
874	}
875
876	/////////////////////////////////////////////////////////////////
877	// ReadParameters()
878	//
879	// Read initial distribution, transition, and emission
880	// parameters from a file.
881	/////////////////////////////////////////////////////////////////
882
883	void ReadParameters (){
884
885	ifstream data;
886
887	emitPairs = VVF (256, VF (256, 1e-10));
888	emitSingle = VF (256, 1e-5);
889
890	// read initial state distribution and transition parameters
891	if (parametersInputFilename == string ("")){
892	if (NumInsertStates == 1){
893	for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib1Default[i];
894	for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen1Default[i];
895	for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend1Default[i];
896	}
897	else if (NumInsertStates == 2){
898	for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib2Default[i];
899	for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen2Default[i];
900	for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend2Default[i];
901	}
902	else {
903	cerr << "ERROR: No default initial distribution/parameter settings exist" << endl
904	<< " for " << NumInsertStates << " pairs of insert states. Use --paramfile." << endl;
905	exit (1);
906	}
907
908	alphabet = alphabetDefault;
909
910	for (int i = 0; i < (int) alphabet.length(); i++){
911	emitSingle[(unsigned char) tolower(alphabet[i])] = emitSingleDefault[i];
912	emitSingle[(unsigned char) toupper(alphabet[i])] = emitSingleDefault[i];
913	for (int j = 0; j <= i; j++){
914	emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(alphabet[j])] = emitPairsDefault[i][j];
915	emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(alphabet[j])] = emitPairsDefault[i][j];
916	emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(alphabet[j])] = emitPairsDefault[i][j];
917	emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(alphabet[j])] = emitPairsDefault[i][j];
918	emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(alphabet[i])] = emitPairsDefault[i][j];
919	emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(alphabet[i])] = emitPairsDefault[i][j];
920	emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(alphabet[i])] = emitPairsDefault[i][j];
921	emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(alphabet[i])] = emitPairsDefault[i][j];
922	}
923	}
924	}
925	else {
926	data.open (parametersInputFilename.c_str());
927	if (data.fail()){
928	cerr << "ERROR: Unable to read parameter file: " << parametersInputFilename << endl;
929	exit (1);
930	}
931
932	string line[3];
933	for (int i = 0; i < 3; i++){
934	if (!getline (data, line[i])){
935	cerr << "ERROR: Unable to read transition parameters from parameter file: " << parametersInputFilename << endl;
936	exit (1);
937	}
938	}
939	istringstream data2;
940	data2.clear(); data2.str (line[0]); for (int i = 0; i < NumMatrixTypes; i++) data2 >> initDistrib[i];
941	data2.clear(); data2.str (line[1]); for (int i = 0; i < 2*NumInsertStates; i++) data2 >> gapOpen[i];
942	data2.clear(); data2.str (line[2]); for (int i = 0; i < 2*NumInsertStates; i++) data2 >> gapExtend[i];
943
944	if (!getline (data, line[0])){
945	cerr << "ERROR: Unable to read alphabet from scoring matrix file: " << parametersInputFilename << endl;
946	exit (1);
947	}
948
949	// read alphabet as concatenation of all characters on alphabet line
950	alphabet = "";
951	string token;
952	data2.clear(); data2.str (line[0]); while (data2 >> token) alphabet += token;
953
954	for (int i = 0; i < (int) alphabet.size(); i++){
955	for (int j = 0; j <= i; j++){
956	float val;
957	data >> val;
958	emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(alphabet[j])] = val;
959	emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(alphabet[j])] = val;
960	emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(alphabet[j])] = val;
961	emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(alphabet[j])] = val;
962	emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(alphabet[i])] = val;
963	emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(alphabet[i])] = val;
964	emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(alphabet[i])] = val;
965	emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(alphabet[i])] = val;
966	}
967	}
968
969	for (int i = 0; i < (int) alphabet.size(); i++){
970	float val;
971	data >> val;
972	emitSingle[(unsigned char) tolower(alphabet[i])] = val;
973	emitSingle[(unsigned char) toupper(alphabet[i])] = val;
974	}
975	data.close();
976	}
977	}
978
979	/////////////////////////////////////////////////////////////////
980	// ProcessTree()
981	//
982	// Process the tree recursively. Returns the aligned sequences
983	// corresponding to a node or leaf of the tree.
984	/////////////////////////////////////////////////////////////////
985
986	MultiSequence ProcessTree (const TreeNode tree, MultiSequence *sequences,
987	const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
988	const ProbabilisticModel &model){
989	MultiSequence *result;
990
991	// check if this is a node of the alignment tree
992	if (tree->GetSequenceLabel() == -1){
993	MultiSequence *alignLeft = ProcessTree (tree->GetLeftChild(), sequences, sparseMatrices, model);
994	MultiSequence *alignRight = ProcessTree (tree->GetRightChild(), sequences, sparseMatrices, model);
995
996	assert (alignLeft);
997	assert (alignRight);
998
999	result = AlignAlignments (alignLeft, alignRight, sparseMatrices, model);
1000	assert (result);
1001
1002	delete alignLeft;
1003	delete alignRight;
1004	}
1005
1006	// otherwise, this is a leaf of the alignment tree
1007	else {
1008	result = new MultiSequence(); assert (result);
1009	result->AddSequence (sequences->GetSequence(tree->GetSequenceLabel())->Clone());
1010	}
1011
1012	return result;
1013	}
1014
1015	/////////////////////////////////////////////////////////////////
1016	// ComputeFinalAlignment()
1017	//
1018	// Compute the final alignment by calling ProcessTree(), then
1019	// performing iterative refinement as needed.
1020	/////////////////////////////////////////////////////////////////
1021
1022	MultiSequence ComputeFinalAlignment (const TreeNode tree, MultiSequence *sequences,
1023	const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1024	const ProbabilisticModel &model){
1025
1026	MultiSequence *alignment = ProcessTree (tree, sequences, sparseMatrices, model);
1027
1028	SafeVector<int> oldOrdering;
1029	if (enableAlignOrder){
1030	for (int i = 0; i < alignment->GetNumSequences(); i++)
1031	oldOrdering.push_back (alignment->GetSequence(i)->GetSortLabel());
1032	alignment->SaveOrdering();
1033	enableAlignOrder = false;
1034	}
1035
1036	// tree-based refinement
1037	// TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree);
1038
1039	// iterative refinement
1040	for (int i = 0; i < numIterativeRefinementReps; i++)
1041	DoIterativeRefinement (sparseMatrices, model, alignment);
1042
1043	cerr << endl;
1044
1045	if (oldOrdering.size() > 0){
1046	for (int i = 0; i < (int) oldOrdering.size(); i++){
1047	alignment->GetSequence(i)->SetSortLabel(oldOrdering[i]);
1048	}
1049	}
1050
1051	// return final alignment
1052	return alignment;
1053	}
1054
1055	/////////////////////////////////////////////////////////////////
1056	// AlignAlignments()
1057	//
1058	// Returns the alignment of two MultiSequence objects.
1059	/////////////////////////////////////////////////////////////////
1060
1061	MultiSequence AlignAlignments (MultiSequence align1, MultiSequence *align2,
1062	const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1063	const ProbabilisticModel &model){
1064
1065	// print some info about the alignment
1066	if (enableVerbose){
1067	for (int i = 0; i < align1->GetNumSequences(); i++)
1068	cerr << ((i==0) ? "[" : ",") << align1->GetSequence(i)->GetLabel();
1069	cerr << "] vs. ";
1070	for (int i = 0; i < align2->GetNumSequences(); i++)
1071	cerr << ((i==0) ? "[" : ",") << align2->GetSequence(i)->GetLabel();
1072	cerr << "]: ";
1073	}
1074
1075	VF *posterior = model.BuildPosterior (align1, align2, sparseMatrices, cutoff);
1076	pair<SafeVector<char> *, float> alignment;
1077
1078	// choose the alignment routine depending on the "cosmetic" gap penalties used
1079	if (gapOpenPenalty == 0 && gapContinuePenalty == 0)
1080	alignment = model.ComputeAlignment (align1->GetSequence(0)->GetLength(), align2->GetSequence(0)->GetLength(), *posterior);
1081	else
1082	alignment = model.ComputeAlignmentWithGapPenalties (align1, align2,
1083	*posterior, align1->GetNumSequences(), align2->GetNumSequences(),
1084	gapOpenPenalty, gapContinuePenalty);
1085
1086	delete posterior;
1087
1088	if (enableVerbose){
1089
1090	// compute total length of sequences
1091	int totLength = 0;
1092	for (int i = 0; i < align1->GetNumSequences(); i++)
1093	for (int j = 0; j < align2->GetNumSequences(); j++)
1094	totLength += min (align1->GetSequence(i)->GetLength(), align2->GetSequence(j)->GetLength());
1095
1096	// give an "accuracy" measure for the alignment
1097	cerr << alignment.second / totLength << endl;
1098	}
1099
1100	// now build final alignment
1101	MultiSequence *result = new MultiSequence();
1102	for (int i = 0; i < align1->GetNumSequences(); i++)
1103	result->AddSequence (align1->GetSequence(i)->AddGaps(alignment.first, 'X'));
1104	for (int i = 0; i < align2->GetNumSequences(); i++)
1105	result->AddSequence (align2->GetSequence(i)->AddGaps(alignment.first, 'Y'));
1106	if (!enableAlignOrder)
1107	result->SortByLabel();
1108
1109	// free temporary alignment
1110	delete alignment.first;
1111
1112	return result;
1113	}
1114
1115	/////////////////////////////////////////////////////////////////
1116	// DoRelaxation()
1117	//
1118	// Performs one round of the consistency transformation. The
1119	// formula used is:
1120	// 1
1121	// P'(x[i]-y[j]) = --- sum sum P(x[i]-z[k]) P(z[k]-y[j])
1122	// \|S\| z in S k
1123	//
1124	// where S = {x, y, all other sequences...}
1125	//
1126	/////////////////////////////////////////////////////////////////
1127
1128	SafeVector<SafeVector<SparseMatrix > > DoRelaxation (MultiSequence sequences,
1129	SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
1130	const int numSeqs = sequences->GetNumSequences();
1131
1132	SafeVector<SafeVector<SparseMatrix > > newSparseMatrices (numSeqs, SafeVector<SparseMatrix >(numSeqs, NULL));
1133
1134	// for every pair of sequences
1135	for (int i = 0; i < numSeqs; i++){
1136	for (int j = i+1; j < numSeqs; j++){
1137	Sequence *seq1 = sequences->GetSequence (i);
1138	Sequence *seq2 = sequences->GetSequence (j);
1139
1140	if (enableVerbose)
1141	cerr << "Relaxing (" << i+1 << ") " << seq1->GetHeader() << " vs. "
1142	<< "(" << j+1 << ") " << seq2->GetHeader() << ": ";
1143
1144	// get the original posterior matrix
1145	VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior(); assert (posteriorPtr);
1146	VF &posterior = *posteriorPtr;
1147
1148	const int seq1Length = seq1->GetLength();
1149	const int seq2Length = seq2->GetLength();
1150
1151	// contribution from the summation where z = x and z = y
1152	for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] += posterior[k];
1153
1154	if (enableVerbose)
1155	cerr << sparseMatrices[i][j]->GetNumCells() << " --> ";
1156
1157	// contribution from all other sequences
1158	for (int k = 0; k < numSeqs; k++) if (k != i && k != j){
1159	if (k < i)
1160	Relax1 (sparseMatrices[k][i], sparseMatrices[k][j], posterior);
1161	else if (k > i && k < j)
1162	Relax (sparseMatrices[i][k], sparseMatrices[k][j], posterior);
1163	else {
1164	SparseMatrix *temp = sparseMatrices[j][k]->ComputeTranspose();
1165	Relax (sparseMatrices[i][k], temp, posterior);
1166	delete temp;
1167	}
1168	}
1169
1170	// now renormalization
1171	for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] /= numSeqs;
1172
1173	// mask out positions not originally in the posterior matrix
1174	SparseMatrix *matXY = sparseMatrices[i][j];
1175	for (int y = 0; y <= seq2Length; y++) posterior[y] = 0;
1176	for (int x = 1; x <= seq1Length; x++){
1177	SafeVector<PIF>::iterator XYptr = matXY->GetRowPtr(x);
1178	SafeVector<PIF>::iterator XYend = XYptr + matXY->GetRowSize(x);
1179	VF::iterator base = posterior.begin() + x * (seq2Length + 1);
1180	int curr = 0;
1181	while (XYptr != XYend){
1182
1183	// zero out all cells until the first filled column
1184	while (curr < XYptr->first){
1185	base[curr] = 0;
1186	curr++;
1187	}
1188
1189	// now, skip over this column
1190	curr++;
1191	++XYptr;
1192	}
1193
1194	// zero out cells after last column
1195	while (curr <= seq2Length){
1196	base[curr] = 0;
1197	curr++;
1198	}
1199	}
1200
1201	// save the new posterior matrix
1202	newSparseMatrices[i][j] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), posterior);
1203	newSparseMatrices[j][i] = NULL;
1204
1205	if (enableVerbose)
1206	cerr << newSparseMatrices[i][j]->GetNumCells() << " -- ";
1207
1208	delete posteriorPtr;
1209
1210	if (enableVerbose)
1211	cerr << "done." << endl;
1212	}
1213	}
1214
1215	return newSparseMatrices;
1216	}
1217
1218	/////////////////////////////////////////////////////////////////
1219	// Relax()
1220	//
1221	// Computes the consistency transformation for a single sequence
1222	// z, and adds the transformed matrix to "posterior".
1223	/////////////////////////////////////////////////////////////////
1224
1225	void Relax (SparseMatrix matXZ, SparseMatrix matZY, VF &posterior){
1226
1227	assert (matXZ);
1228	assert (matZY);
1229
1230	int lengthX = matXZ->GetSeq1Length();
1231	int lengthY = matZY->GetSeq2Length();
1232	assert (matXZ->GetSeq2Length() == matZY->GetSeq1Length());
1233
1234	// for every x[i]
1235	for (int i = 1; i <= lengthX; i++){
1236	SafeVector<PIF>::iterator XZptr = matXZ->GetRowPtr(i);
1237	SafeVector<PIF>::iterator XZend = XZptr + matXZ->GetRowSize(i);
1238
1239	VF::iterator base = posterior.begin() + i * (lengthY + 1);
1240
1241	// iterate through all x[i]-z[k]
1242	while (XZptr != XZend){
1243	SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(XZptr->first);
1244	SafeVector<PIF>::iterator ZYend = ZYptr + matZY->GetRowSize(XZptr->first);
1245	const float XZval = XZptr->second;
1246
1247	// iterate through all z[k]-y[j]
1248	while (ZYptr != ZYend){
1249	base[ZYptr->first] += XZval * ZYptr->second;
1250	ZYptr++;
1251	}
1252	XZptr++;
1253	}
1254	}
1255	}
1256
1257	/////////////////////////////////////////////////////////////////
1258	// Relax1()
1259	//
1260	// Computes the consistency transformation for a single sequence
1261	// z, and adds the transformed matrix to "posterior".
1262	/////////////////////////////////////////////////////////////////
1263
1264	void Relax1 (SparseMatrix matZX, SparseMatrix matZY, VF &posterior){
1265
1266	assert (matZX);
1267	assert (matZY);
1268
1269	int lengthZ = matZX->GetSeq1Length();
1270	int lengthY = matZY->GetSeq2Length();
1271
1272	// for every z[k]
1273	for (int k = 1; k <= lengthZ; k++){
1274	SafeVector<PIF>::iterator ZXptr = matZX->GetRowPtr(k);
1275	SafeVector<PIF>::iterator ZXend = ZXptr + matZX->GetRowSize(k);
1276
1277	// iterate through all z[k]-x[i]
1278	while (ZXptr != ZXend){
1279	SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(k);
1280	SafeVector<PIF>::iterator ZYend = ZYptr + matZY->GetRowSize(k);
1281	const float ZXval = ZXptr->second;
1282	VF::iterator base = posterior.begin() + ZXptr->first * (lengthY + 1);
1283
1284	// iterate through all z[k]-y[j]
1285	while (ZYptr != ZYend){
1286	base[ZYptr->first] += ZXval * ZYptr->second;
1287	ZYptr++;
1288	}
1289	ZXptr++;
1290	}
1291	}
1292	}
1293
1294	/////////////////////////////////////////////////////////////////
1295	// GetSubtree
1296	//
1297	// Returns set containing all leaf labels of the current subtree.
1298	/////////////////////////////////////////////////////////////////
1299
1300	set<int> GetSubtree (const TreeNode *tree){
1301	set<int> s;
1302
1303	if (tree->GetSequenceLabel() == -1){
1304	s = GetSubtree (tree->GetLeftChild());
1305	set<int> t = GetSubtree (tree->GetRightChild());
1306
1307	for (set<int>::iterator iter = t.begin(); iter != t.end(); ++iter)
1308	s.insert (*iter);
1309	}
1310	else {
1311	s.insert (tree->GetSequenceLabel());
1312	}
1313
1314	return s;
1315	}
1316
1317	/////////////////////////////////////////////////////////////////
1318	// TreeBasedBiPartitioning
1319	//
1320	// Uses the iterative refinement scheme from MUSCLE.
1321	/////////////////////////////////////////////////////////////////
1322
1323	void TreeBasedBiPartitioning (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1324	const ProbabilisticModel &model, MultiSequence* &alignment,
1325	const TreeNode *tree){
1326	// check if this is a node of the alignment tree
1327	if (tree->GetSequenceLabel() == -1){
1328	TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree->GetLeftChild());
1329	TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree->GetRightChild());
1330
1331	set<int> leftSubtree = GetSubtree (tree->GetLeftChild());
1332	set<int> rightSubtree = GetSubtree (tree->GetRightChild());
1333	set<int> leftSubtreeComplement, rightSubtreeComplement;
1334
1335	// calculate complement of each subtree
1336	for (int i = 0; i < alignment->GetNumSequences(); i++){
1337	if (leftSubtree.find(i) == leftSubtree.end()) leftSubtreeComplement.insert (i);
1338	if (rightSubtree.find(i) == rightSubtree.end()) rightSubtreeComplement.insert (i);
1339	}
1340
1341	// perform realignments for edge to left child
1342	if (!leftSubtree.empty() && !leftSubtreeComplement.empty()){
1343	MultiSequence *groupOneSeqs = alignment->Project (leftSubtree); assert (groupOneSeqs);
1344	MultiSequence *groupTwoSeqs = alignment->Project (leftSubtreeComplement); assert (groupTwoSeqs);
1345	delete alignment;
1346	alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model);
1347	}
1348
1349	// perform realignments for edge to right child
1350	if (!rightSubtree.empty() && !rightSubtreeComplement.empty()){
1351	MultiSequence *groupOneSeqs = alignment->Project (rightSubtree); assert (groupOneSeqs);
1352	MultiSequence *groupTwoSeqs = alignment->Project (rightSubtreeComplement); assert (groupTwoSeqs);
1353	delete alignment;
1354	alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model);
1355	}
1356	}
1357	}
1358
1359	/////////////////////////////////////////////////////////////////
1360	// DoIterativeRefinement()
1361	//
1362	// Performs a single round of randomized partionining iterative
1363	// refinement.
1364	/////////////////////////////////////////////////////////////////
1365
1366	void DoIterativeRefinement (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1367	const ProbabilisticModel &model, MultiSequence* &alignment){
1368	set<int> groupOne, groupTwo;
1369
1370	// create two separate groups
1371	for (int i = 0; i < alignment->GetNumSequences(); i++){
1372	if (rand() % 2)
1373	groupOne.insert (i);
1374	else
1375	groupTwo.insert (i);
1376	}
1377
1378	if (groupOne.empty() \|\| groupTwo.empty()) return;
1379
1380	// project into the two groups
1381	MultiSequence *groupOneSeqs = alignment->Project (groupOne); assert (groupOneSeqs);
1382	MultiSequence *groupTwoSeqs = alignment->Project (groupTwo); assert (groupTwoSeqs);
1383	delete alignment;
1384
1385	// realign
1386	alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model);
1387
1388	delete groupOneSeqs;
1389	delete groupTwoSeqs;
1390	}
1391
1392	/////////////////////////////////////////////////////////////////
1393	// WriteAnnotation()
1394	//
1395	// Computes annotation for multiple alignment and write values
1396	// to a file.
1397	/////////////////////////////////////////////////////////////////
1398
1399	void WriteAnnotation (MultiSequence *alignment,
1400	const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
1401	ofstream outfile (annotationFilename.c_str());
1402
1403	if (outfile.fail()){
1404	cerr << "ERROR: Unable to write annotation file." << endl;
1405	exit (1);
1406	}
1407
1408	const int alignLength = alignment->GetSequence(0)->GetLength();
1409	const int numSeqs = alignment->GetNumSequences();
1410
1411	SafeVector<int> position (numSeqs, 0);
1412	SafeVector<SafeVector<char>::iterator> seqs (numSeqs);
1413	for (int i = 0; i < numSeqs; i++) seqs[i] = alignment->GetSequence(i)->GetDataPtr();
1414	SafeVector<pair<int,int> > active;
1415	active.reserve (numSeqs);
1416
1417	SafeVector<int> lab;
1418	for (int i = 0; i < numSeqs; i++) lab.push_back(alignment->GetSequence(i)->GetSortLabel());
1419
1420	// for every column
1421	for (int i = 1; i <= alignLength; i++){
1422
1423	// find all aligned residues in this particular column
1424	active.clear();
1425	for (int j = 0; j < numSeqs; j++){
1426	if (seqs[j][i] != '-'){
1427	active.push_back (make_pair(lab[j], ++position[j]));
1428	}
1429	}
1430
1431	sort (active.begin(), active.end());
1432	outfile << setw(4) << ComputeScore (active, sparseMatrices) << endl;
1433	}
1434
1435	outfile.close();
1436	}
1437
1438	/////////////////////////////////////////////////////////////////
1439	// ComputeScore()
1440	//
1441	// Computes the annotation score for a particular column.
1442	/////////////////////////////////////////////////////////////////
1443
1444	int ComputeScore (const SafeVector<pair<int, int> > &active,
1445	const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
1446
1447	if (active.size() <= 1) return 0;
1448
1449	// ALTERNATIVE #1: Compute the average alignment score.
1450
1451	float val = 0;
1452	for (int i = 0; i < (int) active.size(); i++){
1453	for (int j = i+1; j < (int) active.size(); j++){
1454	val += sparseMatrices[active[i].first][active[j].first]->GetValue(active[i].second, active[j].second);
1455	}
1456	}
1457
1458	return (int) (200 * val / ((int) active.size() * ((int) active.size() - 1)));
1459
1460	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: