Context Navigation

Main.cc

Visit:

Last change on this file was 10371, checked in by aboeckma, 12 years ago
updated mafft version. Added extensions (no svn ignore, yet)
File size: 65.8 KB

Line
1	/////////////////////////////////////////////////////////////////
2	// Main.cc
3	//
4	// Main routines for MXSCARNA program.
5	/////////////////////////////////////////////////////////////////
6
7	#include "scarna.hpp"
8	#include "SafeVector.h"
9	#include "MultiSequence.h"
10	#include "Defaults.h"
11	#include "ScoreType.h"
12	#include "ProbabilisticModel.h"
13	#include "EvolutionaryTree.h"
14	#include "SparseMatrix.h"
15	#include "BPPMatrix.hpp"
16	#include "StemCandidate.hpp"
17	#include "Globaldp.hpp"
18	#include "nrutil.h"
19	#include "AlifoldMEA.h"
20	#include <string>
21	#include <sstream>
22	#include <iomanip>
23	#include <iostream>
24	#include <list>
25	#include <set>
26	#include <algorithm>
27	#include <cstdio>
28	#include <cstdlib>
29	#include <cerrno>
30	#include <iomanip>
31	#include <fstream>
32
33	#include "RfoldWrapper.hpp"
34	//static RFOLD::Rfold folder;
35
36	using namespace::MXSCARNA;
37	using namespace::RFOLD;
38
39	string parametersInputFilename = "";
40	string parametersOutputFilename = "no training";
41	string annotationFilename = "";
42	string weboutputFileName = "";
43
44	ofstream *outputFile;
45
46	bool enableTraining = false;
47	bool enableVerbose = false;
48	bool enableAllPairs = false;
49	bool enableAnnotation = false;
50	bool enableViterbi = false;
51	bool enableClustalWOutput = false;
52	bool enableTrainEmissions = false;
53	bool enableAlignOrder = false;
54	bool enableWebOutput = false;
55	bool enableStockholmOutput = false;
56	bool enableMXSCARNAOutput = false;
57	bool enableMcCaskillMEAMode = false;
58	int numConsistencyReps = 2;
59	int numPreTrainingReps = 0;
60	int numIterativeRefinementReps = 100;
61	int scsLength = SCSLENGTH;
62	float cutoff = 0;
63	float gapOpenPenalty = 0;
64	float gapContinuePenalty = 0;
65	float threshhold = 1.0;
66	float BaseProbThreshold = BASEPROBTHRESHOLD;
67	float BasePairConst = BASEPAIRCONST;
68	int BandWidth = BANDWIDTH;
69	bool useRfold = USERFOLD;
70	SafeVector<string> sequenceNames;
71
72	VF initDistrib (NumMatrixTypes);
73	VF gapOpen (2*NumInsertStates);
74	VF gapExtend (2*NumInsertStates);
75	VVF emitPairs (256, VF (256, 1e-10));
76	VF emitSingle (256, 1e-5);
77
78	string alphabet = alphabetDefault;
79
80	string *ssCons = NULL;
81
82	const int MIN_PRETRAINING_REPS = 0;
83	const int MAX_PRETRAINING_REPS = 20;
84	const int MIN_CONSISTENCY_REPS = 0;
85	const int MAX_CONSISTENCY_REPS = 5;
86	const int MIN_ITERATIVE_REFINEMENT_REPS = 0;
87	const int MAX_ITERATIVE_REFINEMENT_REPS = 1000;
88
89	/////////////////////////////////////////////////////////////////
90	// Function prototypes
91	/////////////////////////////////////////////////////////////////
92
93	void PrintHeading();
94	void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen,
95	const VF &gapExtend, const VVF &emitPairs, const VF &emitSingle, const char *filename);
96	MultiSequence DoAlign (MultiSequence sequence, const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen, VF &gapExtend, VVF &emitPairs, VF &emitSingle);
97	SafeVector<string> ParseParams (int argc, char **argv);
98	void ReadParameters ();
99	MultiSequence ComputeFinalAlignment (const TreeNode tree, MultiSequence *sequences,
100	const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
101	const ProbabilisticModel &model,
102	SafeVector<BPPMatrix*> &BPPMatrices);
103	MultiSequence AlignAlignments (MultiSequence align1, MultiSequence *align2,
104	const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
105	const ProbabilisticModel &model, SafeVector<BPPMatrix*> &BPPMatrices, float identity);
106	SafeVector<SafeVector<SparseMatrix > > DoRelaxation (MultiSequence sequences,
107	SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
108	void Relax (SparseMatrix matXZ, SparseMatrix matZY, VF &posterior);
109	void Relax1 (SparseMatrix matXZ, SparseMatrix matZY, VF &posterior);
110	void DoBasePairProbabilityRelaxation (MultiSequence *sequences,
111	SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
112	SafeVector<BPPMatrix*> &BPPMatrices);
113	set<int> GetSubtree (const TreeNode *tree);
114	void TreeBasedBiPartitioning (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
115	const ProbabilisticModel &model, MultiSequence* &alignment,
116	const TreeNode tree, SafeVector<BPPMatrix> &BPPMatrices);
117	void DoIterativeRefinement (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
118	const ProbabilisticModel &model, MultiSequence* &alignment);
119	void WriteAnnotation (MultiSequence *alignment,
120	const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
121	int ComputeScore (const SafeVector<pair<int, int> > &active,
122	const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
123	std::vector<StemCandidate>* seq2scs(MultiSequence Sequences, SafeVector<BPPMatrix> &BPPMatrices, int BandWidth);
124	void removeConflicts(std::vector<StemCandidate> pscs1, std::vector<StemCandidate> pscs2, std::vector<int> matchPSCS1, std::vector<int> matchPSCS2);
125
126	struct prob {
127	int i;
128	int j;
129	float p;
130	};
131
132	/////////////////////////////////////////////////////////////////
133	// main()
134	//
135	// Calls all initialization routines and runs the MXSCARNA
136	// aligner.
137	/////////////////////////////////////////////////////////////////
138
139
140	int main (int argc, char **argv){
141
142	// print MXSCARNA heading
143	PrintHeading();
144
145	// parse program parameters
146	sequenceNames = ParseParams (argc, argv);
147	ReadParameters();
148	PrintParameters ("Using parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);
149
150	// now, we'll process all the files given as input. If we are given
151	// several filenames as input, then we'll load all of those sequences
152	// simultaneously, as long as we're not training. On the other hand,
153	// if we are training, then we'll treat each file as a separate
154	// training instance
155
156	if (enableMcCaskillMEAMode) {
157	MultiSequence *sequences = new MultiSequence(); assert (sequences);
158	for (int i = 0; i < (int) sequenceNames.size(); i++){
159	cerr << "Loading sequence file: " << sequenceNames[i] << endl;
160	sequences->LoadMFA (sequenceNames[i], true);
161	}
162
163	const int numSeqs = sequences->GetNumSequences();
164	SafeVector<BPPMatrix*> BPPMatrices;
165
166	// compute the base pairing matrices for each sequences
167	for(int i = 0; i < numSeqs; i++) {
168	Sequence *tmpSeq = sequences->GetSequence(i);
169	string seq = tmpSeq->GetString();
170	int n_seq = tmpSeq->GetLength();
171	BPPMatrix *bppmat = new BPPMatrix(seq, n_seq);
172	BPPMatrices.push_back(bppmat);
173	}
174	AlifoldMEA alifold(sequences, BPPMatrices, BasePairConst);
175	alifold.Run();
176	ssCons = alifold.getSScons();
177
178	if (enableStockholmOutput) {
179	sequences->WriteSTOCKHOLM (cout, ssCons);
180	}
181	else if (enableMXSCARNAOutput){
182	sequences->WriteMXSCARNA (cout, ssCons);
183	}
184	else {
185	sequences->WriteMFA (cout, ssCons);
186	}
187
188	delete sequences;
189	}
190	// if we are training
191	else if (enableTraining){
192
193	// build new model for aligning
194	ProbabilisticModel model (initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);
195
196	// prepare to average parameters
197	for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] = 0;
198	for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] = 0;
199	for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] = 0;
200	if (enableTrainEmissions){
201	for (int i = 0; i < (int) emitPairs.size(); i++)
202	for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] = 0;
203	for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] = 0;
204	}
205
206	// align each file individually
207	for (int i = 0; i < (int) sequenceNames.size(); i++){
208
209	VF thisInitDistrib (NumMatrixTypes);
210	VF thisGapOpen (2*NumInsertStates);
211	VF thisGapExtend (2*NumInsertStates);
212	VVF thisEmitPairs (256, VF (256, 1e-10));
213	VF thisEmitSingle (256, 1e-5);
214
215	// load sequence file
216	MultiSequence *sequences = new MultiSequence(); assert (sequences);
217	cerr << "Loading sequence file: " << sequenceNames[i] << endl;
218	sequences->LoadMFA (sequenceNames[i], true);
219
220	// align sequences
221	DoAlign (sequences, model, thisInitDistrib, thisGapOpen, thisGapExtend, thisEmitPairs, thisEmitSingle);
222
223	// add in contribution of the derived parameters
224	for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] += thisInitDistrib[i];
225	for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] += thisGapOpen[i];
226	for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] += thisGapExtend[i];
227	if (enableTrainEmissions){
228	for (int i = 0; i < (int) emitPairs.size(); i++)
229	for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] += thisEmitPairs[i][j];
230	for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] += thisEmitSingle[i];
231	}
232
233	delete sequences;
234	}
235
236	// compute new parameters and print them out
237	for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] /= (int) sequenceNames.size();
238	for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] /= (int) sequenceNames.size();
239	for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] /= (int) sequenceNames.size();
240	if (enableTrainEmissions){
241	for (int i = 0; i < (int) emitPairs.size(); i++)
242	for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] /= (int) sequenceNames.size();
243	for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] /= sequenceNames.size();
244	}
245
246	PrintParameters ("Trained parameter set:",
247	initDistrib, gapOpen, gapExtend, emitPairs, emitSingle,
248	parametersOutputFilename.c_str());
249	}
250	// pass
251	// if we are not training, we must simply want to align some sequences
252	else {
253	// load all files together
254	MultiSequence *sequences = new MultiSequence(); assert (sequences);
255	for (int i = 0; i < (int) sequenceNames.size(); i++){
256	cerr << "Loading sequence file: " << sequenceNames[i] << endl;
257
258	sequences->LoadMFA (sequenceNames[i], true);
259	}
260
261	// do all "pre-training" repetitions first
262	// NOT execute
263	for (int ct = 0; ct < numPreTrainingReps; ct++){
264	enableTraining = true;
265
266	// build new model for aligning
267	ProbabilisticModel model (initDistrib, gapOpen, gapExtend,
268	emitPairs, emitSingle);
269
270	// do initial alignments
271	DoAlign (sequences, model, initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);
272
273	// print new parameters
274	PrintParameters ("Recomputed parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);
275
276	enableTraining = false;
277	}
278
279	// now, we can perform the alignments and write them out
280	if (enableWebOutput) {
281	outputFile = new ofstream(weboutputFileName.c_str());
282	if (!outputFile) {
283	cerr << "cannot open output file." << weboutputFileName << endl;
284	exit(1);
285	}
286	*outputFile << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" << endl;
287	*outputFile << "<result>" << endl;
288	}
289	MultiSequence *alignment = DoAlign (sequences,
290	ProbabilisticModel (initDistrib, gapOpen, gapExtend, emitPairs, emitSingle),
291	initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);
292
293
294	if (!enableAllPairs){
295	if (enableClustalWOutput) {
296	alignment->WriteALN (cout);
297	}
298	else if (enableWebOutput) {
299	alignment->WriteWEB (*outputFile, ssCons);
300	// computeStructureWithAlifold ();
301	}
302	else if (enableStockholmOutput) {
303	alignment->WriteSTOCKHOLM (cout, ssCons);
304	}
305	else if (enableMXSCARNAOutput) {
306	alignment->WriteMXSCARNA (cout, ssCons);
307	}
308	else {
309	alignment->WriteMFA (cout, ssCons);
310	}
311	}
312
313	if (enableWebOutput) {
314	*outputFile << "</result>" << endl;
315	delete outputFile;
316	}
317
318	delete ssCons;
319	delete alignment;
320	delete sequences;
321
322	}
323	}
324
325	/////////////////////////////////////////////////////////////////
326	// PrintHeading()
327	//
328	// Prints heading for PROBCONS program.
329	/////////////////////////////////////////////////////////////////
330
331	void PrintHeading (){
332	cerr << endl
333	<< "Multiplex SCARNA"<< endl
334	<< "version " << VERSION << " - align multiple RNA sequences and print to standard output" << endl
335	<< "Written by Yasuo Tabei" << endl
336	<< endl;
337	}
338
339	/////////////////////////////////////////////////////////////////
340	// PrintParameters()
341	//
342	// Prints PROBCONS parameters to STDERR. If a filename is
343	// specified, then the parameters are also written to the file.
344	/////////////////////////////////////////////////////////////////
345
346	void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen,
347	const VF &gapExtend, const VVF &emitPairs, const VF &emitSingle, const char *filename){
348
349	// print parameters to the screen
350	cerr << message << endl
351	<< " initDistrib[] = { ";
352	for (int i = 0; i < NumMatrixTypes; i++) cerr << setprecision (10) << initDistrib[i] << " ";
353	cerr << "}" << endl
354	<< " gapOpen[] = { ";
355	for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapOpen[i] << " ";
356	cerr << "}" << endl
357	<< " gapExtend[] = { ";
358	for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapExtend[i] << " ";
359	cerr << "}" << endl
360	<< endl;
361
362	/*
363	for (int i = 0; i < 5; i++){
364	for (int j = 0; j <= i; j++){
365	cerr << emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]] << " ";
366	}
367	cerr << endl;
368	}*/
369
370	// if a file name is specified
371	if (filename){
372
373	// attempt to open the file for writing
374	FILE *file = fopen (filename, "w");
375	if (!file){
376	cerr << "ERROR: Unable to write parameter file: " << filename << endl;
377	exit (1);
378	}
379
380	// if successful, then write the parameters to the file
381	for (int i = 0; i < NumMatrixTypes; i++) fprintf (file, "%.10f ", initDistrib[i]); fprintf (file, "\n");
382	for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapOpen[i]); fprintf (file, "\n");
383	for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapExtend[i]); fprintf (file, "\n");
384	fprintf (file, "%s\n", alphabet.c_str());
385	for (int i = 0; i < (int) alphabet.size(); i++){
386	for (int j = 0; j <= i; j++)
387	fprintf (file, "%.10f ", emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]]);
388	fprintf (file, "\n");
389	}
390	for (int i = 0; i < (int) alphabet.size(); i++)
391	fprintf (file, "%.10f ", emitSingle[(unsigned char) alphabet[i]]);
392	fprintf (file, "\n");
393	fclose (file);
394	}
395	}
396
397	/////////////////////////////////////////////////////////////////
398	// DoAlign()
399	//
400	// First computes all pairwise posterior probability matrices.
401	// Then, computes new parameters if training, or a final
402	// alignment, otherwise.
403	/////////////////////////////////////////////////////////////////
404	MultiSequence DoAlign (MultiSequence sequences, const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen, VF &gapExtend, VVF &emitPairs, VF &emitSingle){
405
406	assert (sequences);
407
408	const int numSeqs = sequences->GetNumSequences();
409	VVF distances (numSeqs, VF (numSeqs, 0));
410	VVF identities (numSeqs, VF (numSeqs, 0));
411	SafeVector<SafeVector<SparseMatrix > > sparseMatrices (numSeqs, SafeVector<SparseMatrix >(numSeqs, NULL));
412
413	SafeVector<BPPMatrix*> BPPMatrices;
414
415	RfoldWrapper rfoldWrapper;
416	if (useRfold) {
417	BPPMatrices
418	= rfoldWrapper.getProb(sequenceNames, sequences, BASEPROBTHRESHOLD, BandWidth);
419	}
420	else {
421	for(int i = 0; i < numSeqs; i++) {
422	Sequence *tmpSeq = sequences->GetSequence(i);
423	string seq = tmpSeq->GetString();
424	int n_seq = tmpSeq->GetLength();
425	BPPMatrix *bppmat = new BPPMatrix(seq, n_seq, BASEPROBTHRESHOLD);
426	BPPMatrices.push_back(bppmat);
427	}
428	}
429
430	if (enableTraining){
431	// prepare to average parameters
432	for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] = 0;
433	for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] = 0;
434	for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] = 0;
435	if (enableTrainEmissions){
436	for (int i = 0; i < (int) emitPairs.size(); i++)
437	for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] = 0;
438	for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] = 0;
439	}
440	}
441
442	// skip posterior calculations if we just want to do Viterbi alignments
443	if (!enableViterbi){
444
445	// do all pairwise alignments for posterior probability matrices
446	for (int a = 0; a < numSeqs-1; a++){
447	for (int b = a+1; b < numSeqs; b++){
448	Sequence *seq1 = sequences->GetSequence (a);
449	Sequence *seq2 = sequences->GetSequence (b);
450
451	// verbose output
452	if (enableVerbose)
453	cerr << "Computing posterior matrix: (" << a+1 << ") " << seq1->GetHeader() << " vs. "
454	<< "(" << b+1 << ") " << seq2->GetHeader() << " -- ";
455
456	// compute forward and backward probabilities
457	VF *forward = model.ComputeForwardMatrix (seq1, seq2); assert (forward);
458	VF *backward = model.ComputeBackwardMatrix (seq1, seq2); assert (backward);
459
460	// if we are training, then we'll simply want to compute the
461	// expected counts for each region within the matrix separately;
462	// otherwise, we'll need to put all of the regions together and
463	// assemble a posterior probability match matrix
464
465	// so, if we're training
466	if (enableTraining){
467
468	// compute new parameters
469	VF thisInitDistrib (NumMatrixTypes);
470	VF thisGapOpen (2*NumInsertStates);
471	VF thisGapExtend (2*NumInsertStates);
472	VVF thisEmitPairs (256, VF (256, 1e-10));
473	VF thisEmitSingle (256, 1e-5);
474
475	model.ComputeNewParameters (seq1, seq2, forward, backward, thisInitDistrib, thisGapOpen, thisGapExtend, thisEmitPairs, thisEmitSingle, enableTrainEmissions);
476
477	// add in contribution of the derived parameters
478	for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] += thisInitDistrib[i];
479	for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] += thisGapOpen[i];
480	for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] += thisGapExtend[i];
481	if (enableTrainEmissions){
482	for (int i = 0; i < (int) emitPairs.size(); i++)
483	for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] += thisEmitPairs[i][j];
484	for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] += thisEmitSingle[i];
485	}
486
487	// let us know that we're done.
488	if (enableVerbose) cerr << "done." << endl;
489	}
490	// pass
491	else {
492
493	// compute posterior probability matrix
494	VF posterior = model.ComputePosteriorMatrix (seq1, seq2, forward, *backward); assert (posterior);
495
496	// compute sparse representations
497	sparseMatrices[a][b] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), *posterior);
498	sparseMatrices[b][a] = NULL;
499
500	if (!enableAllPairs){
501	// perform the pairwise sequence alignment
502	pair<SafeVector<char> *, float> alignment = model.ComputeAlignment (seq1->GetLength(),
503	seq2->GetLength(),
504	*posterior);
505
506	Sequence *tmpSeq1 = seq1->AddGaps (alignment.first, 'X');
507	Sequence *tmpSeq2 = seq2->AddGaps (alignment.first, 'Y');
508
509	// compute sequence identity for each pair of sequenceses
510	int length = tmpSeq1->GetLength();
511	int matchCount = 0;
512	int misMatchCount = 0;
513	for (int k = 1; k <= length; k++) {
514	int ch1 = tmpSeq1->GetPosition(k);
515	int ch2 = tmpSeq2->GetPosition(k);
516	if (ch1 == ch2 && ch1 != '-' && ch2 != '-') { ++matchCount; }
517	else if (ch1 != ch2 && ch1 != '-' && ch2 != '-') { ++misMatchCount; }
518	}
519
520	identities[a][b] = identities[b][a] = (float)matchCount/(float)(matchCount + misMatchCount);
521
522	// compute "expected accuracy" distance for evolutionary tree computation
523	float distance = alignment.second / min (seq1->GetLength(), seq2->GetLength());
524	distances[a][b] = distances[b][a] = distance;
525
526	if (enableVerbose)
527	cerr << setprecision (10) << distance << endl;
528
529	delete alignment.first;
530	}
531	else {
532	// let us know that we're done.
533	if (enableVerbose) cerr << "done." << endl;
534	}
535
536	delete posterior;
537	}
538
539	delete forward;
540	delete backward;
541	}
542	}
543	}
544
545	// now average out parameters derived
546	if (enableTraining){
547	// compute new parameters
548	for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] /= numSeqs * (numSeqs - 1) / 2;
549	for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] /= numSeqs * (numSeqs - 1) / 2;
550	for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] /= numSeqs * (numSeqs - 1) / 2;
551
552	if (enableTrainEmissions){
553	for (int i = 0; i < (int) emitPairs.size(); i++)
554	for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] /= numSeqs * (numSeqs - 1) / 2;
555	for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] /= numSeqs * (numSeqs - 1) / 2;
556	}
557	}
558
559	// see if we still want to do some alignments
560	else {
561	// pass
562	if (!enableViterbi){
563
564	// perform the consistency transformation the desired number of times
565	for (int r = 0; r < numConsistencyReps; r++){
566	SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices = DoRelaxation (sequences, sparseMatrices);
567
568	// now replace the old posterior matrices
569	for (int i = 0; i < numSeqs; i++){
570	for (int j = 0; j < numSeqs; j++){
571	delete sparseMatrices[i][j];
572	sparseMatrices[i][j] = newSparseMatrices[i][j];
573	}
574	}
575	}
576	if (numSeqs > 8) {
577	for (int r = 0; r < 1; r++)
578	DoBasePairProbabilityRelaxation(sequences, sparseMatrices, BPPMatrices);
579	}
580	}
581
582	MultiSequence *finalAlignment = NULL;
583
584	if (enableAllPairs){
585	for (int a = 0; a < numSeqs-1; a++){
586	for (int b = a+1; b < numSeqs; b++){
587	Sequence *seq1 = sequences->GetSequence (a);
588	Sequence *seq2 = sequences->GetSequence (b);
589
590	if (enableVerbose)
591	cerr << "Performing pairwise alignment: (" << a+1 << ") " << seq1->GetHeader() << " vs. "
592	<< "(" << b+1 << ") " << seq2->GetHeader() << " -- ";
593
594
595	// perform the pairwise sequence alignment
596	pair<SafeVector<char> *, float> alignment;
597	if (enableViterbi)
598	alignment = model.ComputeViterbiAlignment (seq1, seq2);
599	else {
600
601	// build posterior matrix
602	VF *posterior = sparseMatrices[a][b]->GetPosterior(); assert (posterior);
603	int length = (seq1->GetLength() + 1) * (seq2->GetLength() + 1);
604	for (int i = 0; i < length; i++) (*posterior)[i] -= cutoff;
605
606	alignment = model.ComputeAlignment (seq1->GetLength(), seq2->GetLength(), *posterior);
607	delete posterior;
608	}
609
610	// write pairwise alignments
611	string name = seq1->GetHeader() + "-" + seq2->GetHeader() + (enableClustalWOutput ? ".aln" : ".fasta");
612	ofstream outfile (name.c_str());
613
614	MultiSequence *result = new MultiSequence();
615	result->AddSequence (seq1->AddGaps(alignment.first, 'X'));
616	result->AddSequence (seq2->AddGaps(alignment.first, 'Y'));
617	if (enableClustalWOutput)
618	result->WriteALN (outfile);
619	else
620	result->WriteMFA (outfile);
621
622	outfile.close();
623
624	delete alignment.first;
625	}
626	}
627	}
628
629	// now if we still need to do a final multiple alignment
630	else {
631
632	if (enableVerbose)
633	cerr << endl;
634
635	// compute the evolutionary tree
636	TreeNode *tree = TreeNode::ComputeTree (distances, identities);
637
638	if (enableWebOutput) {
639	*outputFile << "<tree>" << endl;
640	tree->Print (*outputFile, sequences);
641	*outputFile << "</tree>" << endl;
642	}
643	else {
644	tree->Print (cerr, sequences);
645	cerr << endl;
646	}
647	// make the final alignment
648	finalAlignment = ComputeFinalAlignment (tree, sequences, sparseMatrices, model, BPPMatrices);
649
650	// build annotation
651	if (enableAnnotation){
652	WriteAnnotation (finalAlignment, sparseMatrices);
653	}
654
655	delete tree;
656	}
657
658	if (!enableViterbi){
659	// delete sparse matrices
660	for (int a = 0; a < numSeqs-1; a++){
661	for (int b = a+1; b < numSeqs; b++){
662	delete sparseMatrices[a][b];
663	delete sparseMatrices[b][a];
664	}
665	}
666	}
667
668	AlifoldMEA alifold(finalAlignment, BPPMatrices, BasePairConst);
669	alifold.Run();
670	ssCons = alifold.getSScons();
671
672	return finalAlignment;
673
674	}
675
676	return NULL;
677	}
678
679	/////////////////////////////////////////////////////////////////
680	// GetInteger()
681	//
682	// Attempts to parse an integer from the character string given.
683	// Returns true only if no parsing error occurs.
684	/////////////////////////////////////////////////////////////////
685
686	bool GetInteger (char data, int val){
687	char *endPtr;
688	long int retVal;
689
690	assert (val);
691
692	errno = 0;
693	retVal = strtol (data, &endPtr, 0);
694	if (retVal == 0 && (errno != 0 \|\| data == endPtr)) return false;
695	if (errno != 0 && (retVal == LONG_MAX \|\| retVal == LONG_MIN)) return false;
696	if (retVal < (long) INT_MIN \|\| retVal > (long) INT_MAX) return false;
697	*val = (int) retVal;
698	return true;
699	}
700
701	/////////////////////////////////////////////////////////////////
702	// GetFloat()
703	//
704	// Attempts to parse a float from the character string given.
705	// Returns true only if no parsing error occurs.
706	/////////////////////////////////////////////////////////////////
707
708	bool GetFloat (char data, float val){
709	char *endPtr;
710	double retVal;
711
712	assert (val);
713
714	errno = 0;
715	retVal = strtod (data, &endPtr);
716	if (retVal == 0 && (errno != 0 \|\| data == endPtr)) return false;
717	if (errno != 0 && (retVal >= 1000000.0 \|\| retVal <= -1000000.0)) return false;
718	*val = (float) retVal;
719	return true;
720	}
721
722	/////////////////////////////////////////////////////////////////
723	// ParseParams()
724	//
725	// Parse all command-line options.
726	/////////////////////////////////////////////////////////////////
727
728	SafeVector<string> ParseParams (int argc, char **argv){
729
730	if (argc < 2){
731
732	cerr << "MXSCARNA comes with ABSOLUTELY NO WARRANTY. This is free software, and" << endl
733	<< "you are welcome to redistribute it under certain conditions. See the" << endl
734	<< "file COPYING.txt for details." << endl
735	<< endl
736	<< "Usage:" << endl
737	<< " mxscarna [OPTION]... [MFAFILE]..." << endl
738	<< endl
739	<< "Description:" << endl
740	<< " Align sequences in MFAFILE(s) and print result to standard output" << endl
741	<< endl
742	<< " -clustalw" << endl
743	<< " use CLUSTALW output format instead of MFA" << endl
744	<< endl
745	<< " -stockholm" << endl
746	<< " use STOCKHOLM output format instead of MFA" << endl
747	<< endl
748	<< " -mxscarna" << endl
749	<< " use MXSCARNA output format instead of MFA" << endl
750	<< endl
751	<< " -weboutput /<output_path>/<outputfilename>" << endl
752	<< " use web output format" << endl
753	<< endl
754	<< " -c, --consistency REPS" << endl
755	<< " use " << MIN_CONSISTENCY_REPS << " <= REPS <= " << MAX_CONSISTENCY_REPS
756	<< " (default: " << numConsistencyReps << ") passes of consistency transformation" << endl
757	<< endl
758	<< " -ir, --iterative-refinement REPS" << endl
759	<< " use " << MIN_ITERATIVE_REFINEMENT_REPS << " <= REPS <= " << MAX_ITERATIVE_REFINEMENT_REPS
760	<< " (default: " << numIterativeRefinementReps << ") passes of iterative-refinement" << endl
761	<< endl
762	<< " -pre, --pre-training REPS" << endl
763	<< " use " << MIN_PRETRAINING_REPS << " <= REPS <= " << MAX_PRETRAINING_REPS
764	<< " (default: " << numPreTrainingReps << ") rounds of pretraining" << endl
765	<< endl
766	<< " -pairs" << endl
767	<< " generate all-pairs pairwise alignments" << endl
768	<< endl
769	<< " -viterbi" << endl
770	<< " use Viterbi algorithm to generate all pairs (automatically enables -pairs)" << endl
771	<< endl
772	<< " -v, --verbose" << endl
773	<< " report progress while aligning (default: " << (enableVerbose ? "on" : "off") << ")" << endl
774	<< endl
775	<< " -annot FILENAME" << endl
776	<< " write annotation for multiple alignment to FILENAME" << endl
777	<< endl
778	<< " -t, --train FILENAME" << endl
779	<< " compute EM transition probabilities, store in FILENAME (default: "
780	<< parametersOutputFilename << ")" << endl
781	<< endl
782	<< " -e, --emissions" << endl
783	<< " also reestimate emission probabilities (default: "
784	<< (enableTrainEmissions ? "on" : "off") << ")" << endl
785	<< endl
786	<< " -p, --paramfile FILENAME" << endl
787	<< " read parameters from FILENAME (default: "
788	<< parametersInputFilename << ")" << endl
789	<< endl
790	<< " -a, --alignment-order" << endl
791	<< " print sequences in alignment order rather than input order (default: "
792	<< (enableAlignOrder ? "on" : "off") << ")" << endl
793	<< endl
794	<< " -s THRESHOLD" << endl
795	<< " the threshold of SCS alignment" << endl
796	<< endl
797	<< " In default, for less than " << threshhold << ", the SCS aligment is applied. " << endl
798	<< " -l SCSLENGTH" << endl
799	<< " the length of stem candidates " << SCSLENGTH << endl
800	<< endl
801	<< " -b BASEPROBTRHESHHOLD" << endl
802	<< " the threshold of base pairing probability " << BASEPROBTHRESHOLD << endl
803	<< endl
804	<< " -m, --mccaskillmea" << endl
805	<< " McCaskill MEA MODE: input the clustalw format file and output the secondary structure predicted by McCaskill MEA" << endl
806	<< endl
807	<< " -g BASEPAIRSCORECONST" << endl
808	<< " the control parameter of the prediction of base pairs, default:" << BasePairConst << endl
809	<< endl
810	<< " -w BANDWIDTH" << endl
811	<< " the control parameter of the distance of stem candidates, default:" << BANDWIDTH << endl
812	<< " -rfold" << endl
813	<< " use Rfold instead of global McCaskill algorithm to calcurate base paring probality matrices, default: (" << (useRfold? "on" : "off") << ")" << endl
814	<< endl;
815
816
817	// << " -go, --gap-open VALUE" << endl
818	// << " gap opening penalty of VALUE <= 0 (default: " << gapOpenPenalty << ")" << endl
819	// << endl
820	// << " -ge, --gap-extension VALUE" << endl
821	// << " gap extension penalty of VALUE <= 0 (default: " << gapContinuePenalty << ")" << endl
822	// << endl
823	// << " -co, --cutoff CUTOFF" << endl
824	// << " subtract 0 <= CUTOFF <= 1 (default: " << cutoff << ") from all posterior values before final alignment" << endl
825	// << endl
826
827	exit (1);
828	}
829
830	SafeVector<string> sequenceNames;
831	int tempInt;
832	float tempFloat;
833
834	for (int i = 1; i < argc; i++){
835	if (argv[i][0] == '-'){
836
837	// training
838	if (!strcmp (argv[i], "-t") \|\| !strcmp (argv[i], "--train")){
839	enableTraining = true;
840	if (i < argc - 1)
841	parametersOutputFilename = string (argv[++i]);
842	else {
843	cerr << "ERROR: Filename expected for option " << argv[i] << endl;
844	exit (1);
845	}
846	}
847
848	// emission training
849	else if (!strcmp (argv[i], "-e") \|\| !strcmp (argv[i], "--emissions")){
850	enableTrainEmissions = true;
851	}
852
853	// parameter file
854	else if (!strcmp (argv[i], "-p") \|\| !strcmp (argv[i], "--paramfile")){
855	if (i < argc - 1)
856	parametersInputFilename = string (argv[++i]);
857	else {
858	cerr << "ERROR: Filename expected for option " << argv[i] << endl;
859	exit (1);
860	}
861	}
862	else if (! strcmp (argv[i], "-s")) {
863	if (i < argc - 1){
864	if (!GetFloat (argv[++i], &tempFloat)){
865	cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
866	exit (1);
867	}
868	else {
869	if (tempFloat < 0){
870	cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be nagative." << endl;
871	exit (1);
872	}
873	else
874	threshhold = tempFloat;
875	}
876	}
877	else {
878	cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
879	exit (1);
880	}
881	}
882
883	else if (! strcmp (argv[i], "-l")) {
884	if (i < argc - 1) {
885	if (!GetInteger (argv[++i], &tempInt)){
886	cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
887	exit (1);
888	}
889	else {
890	if (tempInt <= 0 \|\| 6 <= tempInt) {
891	cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
892	<< "1 and 6" << "." << endl;
893	exit (1);
894	}
895	else
896	scsLength = tempInt;
897	}
898	}
899	}
900	else if (! strcmp (argv[i], "-b")) {
901	if (i < argc - 1) {
902	if (!GetFloat (argv[++i], &tempFloat)){
903	cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
904	exit (1);
905	}
906	else {
907	if (tempFloat < 0 && 1 < tempFloat) {
908	cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be nagative." << endl;
909	exit (1);
910	}
911	else
912	BaseProbThreshold = tempFloat;
913	}
914	}
915	}
916	else if (! strcmp (argv[i], "-g")) {
917	if (i < argc - 1) {
918	if (!GetFloat (argv[++i], &tempFloat)){
919	cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
920	exit (1);
921	}
922	else {
923	if (tempFloat < 0 && 1 < tempFloat) {
924	cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be nagative." << endl;
925	exit (1);
926	}
927	else
928	BasePairConst = tempFloat;
929	}
930	}
931	}
932
933	// number of consistency transformations
934	else if (!strcmp (argv[i], "-c") \|\| !strcmp (argv[i], "--consistency")){
935	if (i < argc - 1){
936	if (!GetInteger (argv[++i], &tempInt)){
937	cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
938	exit (1);
939	}
940	else {
941	if (tempInt < MIN_CONSISTENCY_REPS \|\| tempInt > MAX_CONSISTENCY_REPS){
942	cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
943	<< MIN_CONSISTENCY_REPS << " and " << MAX_CONSISTENCY_REPS << "." << endl;
944	exit (1);
945	}
946	else
947	numConsistencyReps = tempInt;
948	}
949	}
950	else {
951	cerr << "ERROR: Integer expected for option " << argv[i] << endl;
952	exit (1);
953	}
954	}
955
956	// number of randomized partitioning iterative refinement passes
957	else if (!strcmp (argv[i], "-ir") \|\| !strcmp (argv[i], "--iterative-refinement")){
958	if (i < argc - 1){
959	if (!GetInteger (argv[++i], &tempInt)){
960	cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
961	exit (1);
962	}
963	else {
964	if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS \|\| tempInt > MAX_ITERATIVE_REFINEMENT_REPS){
965	cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
966	<< MIN_ITERATIVE_REFINEMENT_REPS << " and " << MAX_ITERATIVE_REFINEMENT_REPS << "." << endl;
967	exit (1);
968	}
969	else
970	numIterativeRefinementReps = tempInt;
971	}
972	}
973	else {
974	cerr << "ERROR: Integer expected for option " << argv[i] << endl;
975	exit (1);
976	}
977	}
978	else if (!strcmp (argv[i], "-rfold")){
979	useRfold = true;
980	}
981	// number of EM pre-training rounds
982	else if (!strcmp (argv[i], "-pre") \|\| !strcmp (argv[i], "--pre-training")){
983	if (i < argc - 1){
984	if (!GetInteger (argv[++i], &tempInt)){
985	cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
986	exit (1);
987	}
988	else {
989	if (tempInt < MIN_PRETRAINING_REPS \|\| tempInt > MAX_PRETRAINING_REPS){
990	cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
991	<< MIN_PRETRAINING_REPS << " and " << MAX_PRETRAINING_REPS << "." << endl;
992	exit (1);
993	}
994	else
995	numPreTrainingReps = tempInt;
996	}
997	}
998	else {
999	cerr << "ERROR: Integer expected for option " << argv[i] << endl;
1000	exit (1);
1001	}
1002	}
1003
1004	// the distance of stem candidate
1005	else if (!strcmp (argv[i], "-w")){
1006	if (i < argc - 1){
1007	if (!GetInteger (argv[++i], &tempInt)){
1008	cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
1009	exit (1);
1010	}
1011	else {
1012	BandWidth = tempInt;
1013	}
1014	}
1015	else {
1016	cerr << "ERROR: Integer expected for option " << argv[i] << endl;
1017	exit (1);
1018	}
1019	}
1020
1021	// gap open penalty
1022	else if (!strcmp (argv[i], "-go") \|\| !strcmp (argv[i], "--gap-open")){
1023	if (i < argc - 1){
1024	if (!GetFloat (argv[++i], &tempFloat)){
1025	cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
1026	exit (1);
1027	}
1028	else {
1029	if (tempFloat > 0){
1030	cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl;
1031	exit (1);
1032	}
1033	else
1034	gapOpenPenalty = tempFloat;
1035	}
1036	}
1037	else {
1038	cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
1039	exit (1);
1040	}
1041	}
1042
1043	// gap extension penalty
1044	else if (!strcmp (argv[i], "-ge") \|\| !strcmp (argv[i], "--gap-extension")){
1045	if (i < argc - 1){
1046	if (!GetFloat (argv[++i], &tempFloat)){
1047	cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
1048	exit (1);
1049	}
1050	else {
1051	if (tempFloat > 0){
1052	cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl;
1053	exit (1);
1054	}
1055	else
1056	gapContinuePenalty = tempFloat;
1057	}
1058	}
1059	else {
1060	cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
1061	exit (1);
1062	}
1063	}
1064
1065	// all-pairs pairwise alignments
1066	else if (!strcmp (argv[i], "-pairs")){
1067	enableAllPairs = true;
1068	}
1069
1070	// all-pairs pairwise Viterbi alignments
1071	else if (!strcmp (argv[i], "-viterbi")){
1072	enableAllPairs = true;
1073	enableViterbi = true;
1074	}
1075
1076	// annotation files
1077	else if (!strcmp (argv[i], "-annot")){
1078	enableAnnotation = true;
1079	if (i < argc - 1)
1080	annotationFilename = argv[++i];
1081	else {
1082	cerr << "ERROR: FILENAME expected for option " << argv[i] << endl;
1083	exit (1);
1084	}
1085	}
1086
1087	// clustalw output format
1088	else if (!strcmp (argv[i], "-clustalw")){
1089	enableClustalWOutput = true;
1090	}
1091	// mxscarna output format
1092	else if (!strcmp (argv[i], "-mxscarna")) {
1093	enableMXSCARNAOutput = true;
1094	}
1095	// stockholm output format
1096	else if (!strcmp (argv[i], "-stockholm")) {
1097	enableStockholmOutput = true;
1098	}
1099	// web output format
1100	else if (!strcmp (argv[i], "-weboutput")) {
1101	if (i < argc - 1) {
1102	weboutputFileName = string(argv[++i]);
1103	}
1104	else {
1105	cerr << "ERROR: Invalid following option " << argv[i-1] << ": " << argv[i] << endl;
1106	exit (1);
1107	}
1108
1109	enableWebOutput = true;
1110	}
1111
1112	// cutoff
1113	else if (!strcmp (argv[i], "-co") \|\| !strcmp (argv[i], "--cutoff")){
1114	if (i < argc - 1){
1115	if (!GetFloat (argv[++i], &tempFloat)){
1116	cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
1117	exit (1);
1118	}
1119	else {
1120	if (tempFloat < 0 \|\| tempFloat > 1){
1121	cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must be between 0 and 1." << endl;
1122	exit (1);
1123	}
1124	else
1125	cutoff = tempFloat;
1126	}
1127	}
1128	else {
1129	cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
1130	exit (1);
1131	}
1132	}
1133
1134	// verbose reporting
1135	else if (!strcmp (argv[i], "-v") \|\| !strcmp (argv[i], "--verbose")){
1136	enableVerbose = true;
1137	}
1138
1139	// alignment order
1140	else if (!strcmp (argv[i], "-a") \|\| !strcmp (argv[i], "--alignment-order")){
1141	enableAlignOrder = true;
1142	}
1143	// McCaskill MEA MODE
1144	else if (!strcmp (argv[i], "-m") \|\| !strcmp (argv[i], "--mccaskillmea")){
1145	enableMcCaskillMEAMode = true;
1146	}
1147	// bad arguments
1148	else {
1149	cerr << "ERROR: Unrecognized option: " << argv[i] << endl;
1150	exit (1);
1151	}
1152	}
1153	else {
1154	sequenceNames.push_back (string (argv[i]));
1155	}
1156	}
1157
1158	if (enableTrainEmissions && !enableTraining){
1159	cerr << "ERROR: Training emissions (-e) requires training (-t)" << endl;
1160	exit (1);
1161	}
1162
1163	return sequenceNames;
1164	}
1165
1166	/////////////////////////////////////////////////////////////////
1167	// ReadParameters()
1168	//
1169	// Read initial distribution, transition, and emission
1170	// parameters from a file.
1171	/////////////////////////////////////////////////////////////////
1172
1173	void ReadParameters (){
1174
1175	ifstream data;
1176
1177	emitPairs = VVF (256, VF (256, 1e-10));
1178	emitSingle = VF (256, 1e-5);
1179
1180	// read initial state distribution and transition parameters
1181	// pass
1182	if (parametersInputFilename == string ("")){
1183	if (NumInsertStates == 1){
1184	for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib1Default[i];
1185	for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen1Default[i];
1186	for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend1Default[i];
1187	}
1188	else if (NumInsertStates == 2){
1189	for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib2Default[i];
1190	for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen2Default[i];
1191	for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend2Default[i];
1192	}
1193	else {
1194	cerr << "ERROR: No default initial distribution/parameter settings exist" << endl
1195	<< " for " << NumInsertStates << " pairs of insert states. Use --paramfile." << endl;
1196	exit (1);
1197	}
1198
1199	alphabet = alphabetDefault;
1200
1201	for (int i = 0; i < (int) alphabet.length(); i++){
1202	emitSingle[(unsigned char) tolower(alphabet[i])] = emitSingleDefault[i];
1203	emitSingle[(unsigned char) toupper(alphabet[i])] = emitSingleDefault[i];
1204	for (int j = 0; j <= i; j++){
1205	emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(alphabet[j])] = emitPairsDefault[i][j];
1206	emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(alphabet[j])] = emitPairsDefault[i][j];
1207	emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(alphabet[j])] = emitPairsDefault[i][j];
1208	emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(alphabet[j])] = emitPairsDefault[i][j];
1209	emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(alphabet[i])] = emitPairsDefault[i][j];
1210	emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(alphabet[i])] = emitPairsDefault[i][j];
1211	emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(alphabet[i])] = emitPairsDefault[i][j];
1212	emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(alphabet[i])] = emitPairsDefault[i][j];
1213	}
1214	}
1215	}
1216	else {
1217	data.open (parametersInputFilename.c_str());
1218	if (data.fail()){
1219	cerr << "ERROR: Unable to read parameter file: " << parametersInputFilename << endl;
1220	exit (1);
1221	}
1222
1223	string line[3];
1224	for (int i = 0; i < 3; i++){
1225	if (!getline (data, line[i])){
1226	cerr << "ERROR: Unable to read transition parameters from parameter file: " << parametersInputFilename << endl;
1227	exit (1);
1228	}
1229	}
1230	istringstream data2;
1231	data2.clear(); data2.str (line[0]); for (int i = 0; i < NumMatrixTypes; i++) data2 >> initDistrib[i];
1232	data2.clear(); data2.str (line[1]); for (int i = 0; i < 2*NumInsertStates; i++) data2 >> gapOpen[i];
1233	data2.clear(); data2.str (line[2]); for (int i = 0; i < 2*NumInsertStates; i++) data2 >> gapExtend[i];
1234
1235	if (!getline (data, line[0])){
1236	cerr << "ERROR: Unable to read alphabet from scoring matrix file: " << parametersInputFilename << endl;
1237	exit (1);
1238	}
1239
1240	// read alphabet as concatenation of all characters on alphabet line
1241	alphabet = "";
1242	string token;
1243	data2.clear(); data2.str (line[0]); while (data2 >> token) alphabet += token;
1244
1245	for (int i = 0; i < (int) alphabet.size(); i++){
1246	for (int j = 0; j <= i; j++){
1247	float val;
1248	data >> val;
1249	emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(alphabet[j])] = val;
1250	emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(alphabet[j])] = val;
1251	emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(alphabet[j])] = val;
1252	emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(alphabet[j])] = val;
1253	emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(alphabet[i])] = val;
1254	emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(alphabet[i])] = val;
1255	emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(alphabet[i])] = val;
1256	emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(alphabet[i])] = val;
1257	}
1258	}
1259
1260	for (int i = 0; i < (int) alphabet.size(); i++){
1261	float val;
1262	data >> val;
1263	emitSingle[(unsigned char) tolower(alphabet[i])] = val;
1264	emitSingle[(unsigned char) toupper(alphabet[i])] = val;
1265	}
1266	data.close();
1267	}
1268	}
1269
1270	/////////////////////////////////////////////////////////////////
1271	// ProcessTree()
1272	//
1273	// Process the tree recursively. Returns the aligned sequences
1274	// corresponding to a node or leaf of the tree.
1275	/////////////////////////////////////////////////////////////////
1276	float ide;
1277	MultiSequence ProcessTree (const TreeNode tree, MultiSequence *sequences,
1278	const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1279	const ProbabilisticModel &model, SafeVector<BPPMatrix*> &BPPMatrices) {
1280	MultiSequence *result;
1281
1282	// check if this is a node of the alignment tree
1283	if (tree->GetSequenceLabel() == -1){
1284	MultiSequence *alignLeft = ProcessTree (tree->GetLeftChild(), sequences, sparseMatrices, model, BPPMatrices);
1285	MultiSequence *alignRight = ProcessTree (tree->GetRightChild(), sequences, sparseMatrices, model, BPPMatrices);
1286
1287	assert (alignLeft);
1288	assert (alignRight);
1289
1290	result = AlignAlignments (alignLeft, alignRight, sparseMatrices, model, BPPMatrices, tree->GetIdentity());
1291	assert (result);
1292
1293	delete alignLeft;
1294	delete alignRight;
1295	}
1296
1297	// otherwise, this is a leaf of the alignment tree
1298	else {
1299	result = new MultiSequence(); assert (result);
1300	result->AddSequence (sequences->GetSequence(tree->GetSequenceLabel())->Clone());
1301	}
1302
1303	return result;
1304	}
1305
1306	/////////////////////////////////////////////////////////////////
1307	// ComputeFinalAlignment()
1308	//
1309	// Compute the final alignment by calling ProcessTree(), then
1310	// performing iterative refinement as needed.
1311	/////////////////////////////////////////////////////////////////
1312
1313	MultiSequence ComputeFinalAlignment (const TreeNode tree, MultiSequence *sequences,
1314	const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1315	const ProbabilisticModel &model,
1316	SafeVector<BPPMatrix*> &BPPMatrices) {
1317
1318	MultiSequence *alignment = ProcessTree (tree, sequences, sparseMatrices, model, BPPMatrices);
1319
1320	if (enableAlignOrder){
1321	alignment->SaveOrdering();
1322	enableAlignOrder = false;
1323	}
1324
1325	// tree-based refinement
1326	// if you use the function, you can degrade the quality of the software.
1327	// TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree, BPPMatrices);
1328
1329	// iterative refinement
1330	/*
1331	for (int i = 0; i < numIterativeRefinementReps; i++)
1332	DoIterativeRefinement (sparseMatrices, model, alignment);
1333
1334	cerr << endl;
1335	*/
1336	// return final alignment
1337	return alignment;
1338	}
1339
1340	/////////////////////////////////////////////////////////////////
1341	// AlignAlignments()
1342	//
1343	// Returns the alignment of two MultiSequence objects.
1344	/////////////////////////////////////////////////////////////////
1345
1346	MultiSequence AlignAlignments (MultiSequence align1, MultiSequence *align2,
1347	const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1348	const ProbabilisticModel &model,
1349	SafeVector<BPPMatrix*> &BPPMatrices, float identity){
1350
1351	// print some info about the alignment
1352	if (enableVerbose){
1353	for (int i = 0; i < align1->GetNumSequences(); i++)
1354	cerr << ((i==0) ? "[" : ",") << align1->GetSequence(i)->GetLabel();
1355	cerr << "] vs. ";
1356	for (int i = 0; i < align2->GetNumSequences(); i++)
1357	cerr << ((i==0) ? "[" : ",") << align2->GetSequence(i)->GetLabel();
1358	cerr << "]: ";
1359	}
1360
1361	VF *posterior = model.BuildPosterior (align1, align2, sparseMatrices, cutoff);
1362
1363	pair<SafeVector<char> *, float> alignment;
1364	// choose the alignment routine depending on the "cosmetic" gap penalties used
1365	if (gapOpenPenalty == 0 && gapContinuePenalty == 0) {
1366
1367	if(identity < threshhold) {
1368	std::vector<StemCandidate> pscs1, pscs2;
1369	pscs1 = seq2scs(align1, BPPMatrices, BandWidth);
1370	pscs2 = seq2scs(align2, BPPMatrices, BandWidth);
1371	std::vector<int> *matchPSCS1 = new std::vector<int>;
1372	std::vector<int> *matchPSCS2 = new std::vector<int>;
1373
1374	Globaldp globaldp(pscs1, pscs2, align1, align2, matchPSCS1, matchPSCS2, posterior, BPPMatrices);
1375	//float scsScore = globaldp.Run();
1376
1377	globaldp.Run();
1378
1379	removeConflicts(pscs1, pscs2, matchPSCS1, matchPSCS2);
1380
1381	alignment = model.ComputeAlignment2 (align1->GetSequence(0)->GetLength(), align2->GetSequence(0)->GetLength(), *posterior, pscs1, pscs2, matchPSCS1, matchPSCS2);
1382
1383	} else {
1384	alignment = model.ComputeAlignment (align1->GetSequence(0)->GetLength(), align2->GetSequence(0)->GetLength(), *posterior);
1385	}
1386	}
1387	else {
1388	alignment = model.ComputeAlignmentWithGapPenalties (align1, align2,
1389	*posterior, align1->GetNumSequences(), align2->GetNumSequences(),
1390	gapOpenPenalty, gapContinuePenalty);
1391	}
1392
1393	delete posterior;
1394
1395	if (enableVerbose){
1396
1397	// compute total length of sequences
1398	int totLength = 0;
1399	for (int i = 0; i < align1->GetNumSequences(); i++)
1400	for (int j = 0; j < align2->GetNumSequences(); j++)
1401	totLength += min (align1->GetSequence(i)->GetLength(), align2->GetSequence(j)->GetLength());
1402
1403	// give an "accuracy" measure for the alignment
1404	cerr << alignment.second / totLength << endl;
1405	}
1406
1407	// now build final alignment
1408	MultiSequence *result = new MultiSequence();
1409	for (int i = 0; i < align1->GetNumSequences(); i++)
1410	result->AddSequence (align1->GetSequence(i)->AddGaps(alignment.first, 'X'));
1411	for (int i = 0; i < align2->GetNumSequences(); i++)
1412	result->AddSequence (align2->GetSequence(i)->AddGaps(alignment.first, 'Y'));
1413	if (!enableAlignOrder)
1414	result->SortByLabel();
1415
1416	// free temporary alignment
1417	delete alignment.first;
1418
1419	return result;
1420	}
1421
1422	/////////////////////////////////////////////////////////////////
1423	// DoRelaxation()
1424	//
1425	// Performs one round of the consistency transformation. The
1426	// formula used is:
1427	// 1
1428	// P'(x[i]-y[j]) = --- sum sum P(x[i]-z[k]) P(z[k]-y[j])
1429	// \|S\| z in S k
1430	//
1431	// where S = {x, y, all other sequences...}
1432	//
1433	/////////////////////////////////////////////////////////////////
1434
1435	SafeVector<SafeVector<SparseMatrix > > DoRelaxation (MultiSequence sequences,
1436	SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
1437	const int numSeqs = sequences->GetNumSequences();
1438
1439	SafeVector<SafeVector<SparseMatrix > > newSparseMatrices (numSeqs, SafeVector<SparseMatrix >(numSeqs, NULL));
1440
1441	// for every pair of sequences
1442	for (int i = 0; i < numSeqs; i++){
1443	for (int j = i+1; j < numSeqs; j++){
1444	Sequence *seq1 = sequences->GetSequence (i);
1445	Sequence *seq2 = sequences->GetSequence (j);
1446
1447	if (enableVerbose)
1448	cerr << "Relaxing (" << i+1 << ") " << seq1->GetHeader() << " vs. "
1449	<< "(" << j+1 << ") " << seq2->GetHeader() << ": ";
1450
1451	// get the original posterior matrix
1452	VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior(); assert (posteriorPtr);
1453	VF &posterior = *posteriorPtr;
1454
1455	const int seq1Length = seq1->GetLength();
1456	const int seq2Length = seq2->GetLength();
1457
1458	// contribution from the summation where z = x and z = y
1459	for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] += posterior[k];
1460
1461	if (enableVerbose)
1462	cerr << sparseMatrices[i][j]->GetNumCells() << " --> ";
1463
1464	// contribution from all other sequences
1465	for (int k = 0; k < numSeqs; k++) if (k != i && k != j){
1466	if (k < i)
1467	Relax1 (sparseMatrices[k][i], sparseMatrices[k][j], posterior);
1468	else if (k > i && k < j)
1469	Relax (sparseMatrices[i][k], sparseMatrices[k][j], posterior);
1470	else {
1471	SparseMatrix *temp = sparseMatrices[j][k]->ComputeTranspose();
1472	Relax (sparseMatrices[i][k], temp, posterior);
1473	delete temp;
1474	}
1475	}
1476
1477	// now renormalization
1478	for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] /= numSeqs;
1479
1480	// mask out positions not originally in the posterior matrix
1481	SparseMatrix *matXY = sparseMatrices[i][j];
1482	for (int y = 0; y <= seq2Length; y++) posterior[y] = 0;
1483	for (int x = 1; x <= seq1Length; x++){
1484	SafeVector<PIF>::iterator XYptr = matXY->GetRowPtr(x);
1485	SafeVector<PIF>::iterator XYend = XYptr + matXY->GetRowSize(x);
1486	VF::iterator base = posterior.begin() + x * (seq2Length + 1);
1487	int curr = 0;
1488	while (XYptr != XYend){
1489
1490	// zero out all cells until the first filled column
1491	while (curr < XYptr->first){
1492	base[curr] = 0;
1493	curr++;
1494	}
1495
1496	// now, skip over this column
1497	curr++;
1498	++XYptr;
1499	}
1500
1501	// zero out cells after last column
1502	while (curr <= seq2Length){
1503	base[curr] = 0;
1504	curr++;
1505	}
1506	}
1507
1508	// save the new posterior matrix
1509	newSparseMatrices[i][j] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), posterior);
1510	newSparseMatrices[j][i] = NULL;
1511
1512	if (enableVerbose)
1513	cerr << newSparseMatrices[i][j]->GetNumCells() << " -- ";
1514
1515	delete posteriorPtr;
1516
1517	if (enableVerbose)
1518	cerr << "done." << endl;
1519	}
1520	}
1521
1522	return newSparseMatrices;
1523	}
1524
1525	/////////////////////////////////////////////////////////////////
1526	// Relax()
1527	//
1528	// Computes the consistency transformation for a single sequence
1529	// z, and adds the transformed matrix to "posterior".
1530	/////////////////////////////////////////////////////////////////
1531
1532	void Relax (SparseMatrix matXZ, SparseMatrix matZY, VF &posterior){
1533
1534	assert (matXZ);
1535	assert (matZY);
1536
1537	int lengthX = matXZ->GetSeq1Length();
1538	int lengthY = matZY->GetSeq2Length();
1539	assert (matXZ->GetSeq2Length() == matZY->GetSeq1Length());
1540
1541	// for every x[i]
1542	for (int i = 1; i <= lengthX; i++){
1543	SafeVector<PIF>::iterator XZptr = matXZ->GetRowPtr(i);
1544	SafeVector<PIF>::iterator XZend = XZptr + matXZ->GetRowSize(i);
1545
1546	VF::iterator base = posterior.begin() + i * (lengthY + 1);
1547
1548	// iterate through all x[i]-z[k]
1549	while (XZptr != XZend){
1550	SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(XZptr->first);
1551	SafeVector<PIF>::iterator ZYend = ZYptr + matZY->GetRowSize(XZptr->first);
1552	const float XZval = XZptr->second;
1553
1554	// iterate through all z[k]-y[j]
1555	while (ZYptr != ZYend){
1556	base[ZYptr->first] += XZval * ZYptr->second;
1557	ZYptr++;
1558	}
1559	XZptr++;
1560	}
1561	}
1562	}
1563
1564	/////////////////////////////////////////////////////////////////
1565	// Relax1()
1566	//
1567	// Computes the consistency transformation for a single sequence
1568	// z, and adds the transformed matrix to "posterior".
1569	/////////////////////////////////////////////////////////////////
1570
1571	void Relax1 (SparseMatrix matZX, SparseMatrix matZY, VF &posterior){
1572
1573	assert (matZX);
1574	assert (matZY);
1575
1576	int lengthZ = matZX->GetSeq1Length();
1577	int lengthY = matZY->GetSeq2Length();
1578
1579	// for every z[k]
1580	for (int k = 1; k <= lengthZ; k++){
1581	SafeVector<PIF>::iterator ZXptr = matZX->GetRowPtr(k);
1582	SafeVector<PIF>::iterator ZXend = ZXptr + matZX->GetRowSize(k);
1583
1584	// iterate through all z[k]-x[i]
1585	while (ZXptr != ZXend){
1586	SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(k);
1587	SafeVector<PIF>::iterator ZYend = ZYptr + matZY->GetRowSize(k);
1588	const float ZXval = ZXptr->second;
1589	VF::iterator base = posterior.begin() + ZXptr->first * (lengthY + 1);
1590
1591	// iterate through all z[k]-y[j]
1592	while (ZYptr != ZYend){
1593	base[ZYptr->first] += ZXval * ZYptr->second;
1594	ZYptr++;
1595	}
1596	ZXptr++;
1597	}
1598	}
1599	}
1600
1601	void DoBasePairProbabilityRelaxation (MultiSequence *sequences,
1602	SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1603	SafeVector<BPPMatrix*> &BPPMatrices) {
1604	const int numSeqs = sequences->GetNumSequences();
1605
1606	for (int i = 0; i < numSeqs; i++) {
1607	Sequence *seq1 = sequences->GetSequence (i);
1608	BPPMatrix *seq1BppMatrix = BPPMatrices[seq1->GetLabel()];
1609	Trimat<float> consBppMat(seq1->GetLength() + 1);
1610	int seq1Length = seq1->GetLength();
1611
1612	for (int k = 1; k <= seq1Length; k++) {
1613	for (int l = k; l <= seq1Length; l++) {
1614	consBppMat.ref(k, l) = seq1BppMatrix->GetProb(k, l);
1615	}
1616	}
1617
1618	for (int j = i + 1; j < numSeqs; j++) {
1619
1620	// VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior()
1621	Sequence *seq2 = sequences->GetSequence (j);
1622	BPPMatrix *seq2BppMatrix = BPPMatrices[seq2->GetLabel()];
1623	// int seq2Length = seq2->GetLength();
1624	SparseMatrix *matchProb = sparseMatrices[i][j];
1625
1626	// vector<PIF2> &probs1 = seq1BppMatrix->bppMat.data2;
1627	for(int k = 1; k <= seq1Length; k++) {
1628	for(int m = k, n = k; n <= k + 200 && m >= 1 && n <= seq1Length; m--, n++) {
1629
1630	// for (int k = 0; k < (int)probs1.size(); k++) {
1631	// float tmpProb1 = probs1[k].prob;
1632	// int tmp1I = probs1[k].i;
1633	// int tmp1J = probs1[k].j;
1634
1635	float sumProb = 0;
1636	vector<PIF2> &probs2 = seq2BppMatrix->bppMat.data2;
1637	for(int l = 0; l < (int)probs2.size(); l++) {
1638	float tmpProb2 = probs2[l].prob;
1639	int tmp2I = probs2[l].i;
1640	int tmp2J = probs2[l].j;
1641	sumProb += matchProb->GetValue(m, tmp2I)matchProb->GetValue(n, tmp2J)tmpProb2;
1642	}
1643
1644	consBppMat.ref(m, n) += sumProb;
1645	}
1646
1647	for(int m = k, n = k + 1; n <= k + 200 && m >= 1 && n <= seq1Length; m--, n++) {
1648
1649	// for (int k = 0; k < (int)probs1.size(); k++) {
1650	// float tmpProb1 = probs1[k].prob;
1651	// int tmp1I = probs1[k].i;
1652	// int tmp1J = probs1[k].j;
1653
1654	float sumProb = 0;
1655	vector<PIF2> &probs2 = seq2BppMatrix->bppMat.data2;
1656	for(int l = 0; l < (int)probs2.size(); l++) {
1657	float tmpProb2 = probs2[l].prob;
1658	int tmp2I = probs2[l].i;
1659	int tmp2J = probs2[l].j;
1660	sumProb += matchProb->GetValue(m, tmp2I)matchProb->GetValue(n, tmp2J)tmpProb2;
1661	}
1662
1663	consBppMat.ref(m, n) += sumProb;
1664	}
1665	}
1666	}
1667
1668	/*
1669	for(int k = 1; k <= seq1Length; k++) {
1670	for(int m = k, n = k; n <= k + 30 && m >= 1 && n <= seq1Length; m--, n++) {
1671	float tmpProb = seq1BppMatrix->GetProb(m, n);
1672	for(int l = 1; l <= seq2Length; l++) {
1673	for(int s = l, t = l; t <= l + 30 && s >= 1 && t <= seq2Length; s--, t++) {
1674	tmpProb += matchProb->GetValue(m,s)matchProb->GetValue(n,t)seq2BppMatrix->GetProb(s,t);
1675	}
1676	for(int s = l, t = l + 1; t <= l + 31 && s >= 1 && t <= seq2Length; s--, t++) {
1677	tmpProb += matchProb->GetValue(m,s)matchProb->GetValue(n,t)seq2BppMatrix->GetProb(s,t);
1678	}
1679	}
1680	consBppMat.ref(m, n) += tmpProb;
1681	}
1682
1683	for(int m = k, n = k + 1; n <= k + 31 && m >= 1 && n <= seq1Length; m--, n++) {
1684	float tmpProb = seq1BppMatrix->GetProb(m, n);
1685	for(int l = 1; l <= seq2Length; l++) {
1686	for(int s = l, t = l; t <= l + 30 && s >= 1 && t <= seq2Length; s--, t++) {
1687	tmpProb += matchProb->GetValue(m,s)matchProb->GetValue(n,t)seq2BppMatrix->GetProb(s,t);
1688	}
1689	for(int s = l, t = l + 1; t <= l + 31 && s >= 1 && t <= seq2Length; s--, t++) {
1690	tmpProb += matchProb->GetValue(m,s)matchProb->GetValue(n,t)seq2BppMatrix->GetProb(s,t);
1691	}
1692	}
1693	consBppMat.ref(m,n) += tmpProb;
1694	}
1695	}
1696	}
1697	*/
1698	for (int m = 1; m <= seq1Length; m++) {
1699	for (int n = m + 4; n <= seq1Length; n++) {
1700	consBppMat.ref(m,n) = consBppMat.ref(m,n)/(float)numSeqs;
1701	}
1702	}
1703	seq1BppMatrix->updateBPPMatrix(consBppMat);
1704	}
1705	}
1706
1707	/////////////////////////////////////////////////////////////////
1708	// GetSubtree
1709	//
1710	// Returns set containing all leaf labels of the current subtree.
1711	/////////////////////////////////////////////////////////////////
1712
1713	set<int> GetSubtree (const TreeNode *tree){
1714	set<int> s;
1715
1716	if (tree->GetSequenceLabel() == -1){
1717	s = GetSubtree (tree->GetLeftChild());
1718	set<int> t = GetSubtree (tree->GetRightChild());
1719
1720	for (set<int>::iterator iter = t.begin(); iter != t.end(); ++iter)
1721	s.insert (*iter);
1722	}
1723	else {
1724	s.insert (tree->GetSequenceLabel());
1725	}
1726
1727	return s;
1728	}
1729
1730	/////////////////////////////////////////////////////////////////
1731	// TreeBasedBiPartitioning
1732	//
1733	// Uses the iterative refinement scheme from MUSCLE.
1734	/////////////////////////////////////////////////////////////////
1735
1736	void TreeBasedBiPartitioning (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1737	const ProbabilisticModel &model, MultiSequence* &alignment,
1738	const TreeNode tree, SafeVector<BPPMatrix> &BPPMatrices){
1739	// check if this is a node of the alignment tree
1740	if (tree->GetSequenceLabel() == -1){
1741	TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree->GetLeftChild(), BPPMatrices);
1742	TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree->GetRightChild(), BPPMatrices);
1743
1744	set<int> leftSubtree = GetSubtree (tree->GetLeftChild());
1745	set<int> rightSubtree = GetSubtree (tree->GetRightChild());
1746	set<int> leftSubtreeComplement, rightSubtreeComplement;
1747
1748	// calculate complement of each subtree
1749	for (int i = 0; i < alignment->GetNumSequences(); i++){
1750	if (leftSubtree.find(i) == leftSubtree.end()) leftSubtreeComplement.insert (i);
1751	if (rightSubtree.find(i) == rightSubtree.end()) rightSubtreeComplement.insert (i);
1752	}
1753
1754	// perform realignments for edge to left child
1755	if (!leftSubtree.empty() && !leftSubtreeComplement.empty()){
1756	MultiSequence *groupOneSeqs = alignment->Project (leftSubtree); assert (groupOneSeqs);
1757	MultiSequence *groupTwoSeqs = alignment->Project (leftSubtreeComplement); assert (groupTwoSeqs);
1758	delete alignment;
1759	alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model, BPPMatrices, tree->GetLeftChild()->GetIdentity());
1760	}
1761
1762	// perform realignments for edge to right child
1763	if (!rightSubtree.empty() && !rightSubtreeComplement.empty()){
1764	MultiSequence *groupOneSeqs = alignment->Project (rightSubtree); assert (groupOneSeqs);
1765	MultiSequence *groupTwoSeqs = alignment->Project (rightSubtreeComplement); assert (groupTwoSeqs);
1766	delete alignment;
1767	alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model, BPPMatrices, tree->GetRightChild()->GetIdentity());
1768	}
1769	}
1770	}
1771
1772	/////////////////////////////////////////////////////////////////
1773	// DoterativeRefinement()
1774	//
1775	// Performs a single round of randomized partionining iterative
1776	// refinement.
1777	/////////////////////////////////////////////////////////////////
1778	/*
1779	void DoIterativeRefinement (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1780	const ProbabilisticModel &model, MultiSequence* &alignment){
1781	set<int> groupOne, groupTwo;
1782
1783	// create two separate groups
1784	for (int i = 0; i < alignment->GetNumSequences(); i++){
1785	if (rand() % 2)
1786	groupOne.insert (i);
1787	else
1788	groupTwo.insert (i);
1789	}
1790
1791	if (groupOne.empty() \|\| groupTwo.empty()) return;
1792
1793	// project into the two groups
1794	MultiSequence *groupOneSeqs = alignment->Project (groupOne); assert (groupOneSeqs);
1795	MultiSequence *groupTwoSeqs = alignment->Project (groupTwo); assert (groupTwoSeqs);
1796	delete alignment;
1797
1798	// realign
1799	alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model);
1800
1801	delete groupOneSeqs;
1802	delete groupTwoSeqs;
1803	}
1804	*/
1805
1806	/////////////////////////////////////////////////////////////////
1807	// WriteAnnotation()
1808	//
1809	// Computes annotation for multiple alignment and write values
1810	// to a file.
1811	/////////////////////////////////////////////////////////////////
1812
1813	void WriteAnnotation (MultiSequence *alignment,
1814	const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
1815	ofstream outfile (annotationFilename.c_str());
1816
1817	if (outfile.fail()){
1818	cerr << "ERROR: Unable to write annotation file." << endl;
1819	exit (1);
1820	}
1821
1822	const int alignLength = alignment->GetSequence(0)->GetLength();
1823	const int numSeqs = alignment->GetNumSequences();
1824
1825	SafeVector<int> position (numSeqs, 0);
1826	SafeVector<SafeVector<char>::iterator> seqs (numSeqs);
1827	for (int i = 0; i < numSeqs; i++) seqs[i] = alignment->GetSequence(i)->GetDataPtr();
1828	SafeVector<pair<int,int> > active;
1829	active.reserve (numSeqs);
1830
1831	// for every column
1832	for (int i = 1; i <= alignLength; i++){
1833
1834	// find all aligned residues in this particular column
1835	active.clear();
1836	for (int j = 0; j < numSeqs; j++){
1837	if (seqs[j][i] != '-'){
1838	active.push_back (make_pair(j, ++position[j]));
1839	}
1840	}
1841
1842	outfile << setw(4) << ComputeScore (active, sparseMatrices) << endl;
1843	}
1844
1845	outfile.close();
1846	}
1847
1848	/////////////////////////////////////////////////////////////////
1849	// ComputeScore()
1850	//
1851	// Computes the annotation score for a particular column.
1852	/////////////////////////////////////////////////////////////////
1853
1854	int ComputeScore (const SafeVector<pair<int, int> > &active,
1855	const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
1856
1857	if (active.size() <= 1) return 0;
1858
1859	// ALTERNATIVE #1: Compute the average alignment score.
1860
1861	float val = 0;
1862	for (int i = 0; i < (int) active.size(); i++){
1863	for (int j = i+1; j < (int) active.size(); j++){
1864	val += sparseMatrices[active[i].first][active[j].first]->GetValue(active[i].second, active[j].second);
1865	}
1866	}
1867
1868	return (int) (200 * val / ((int) active.size() * ((int) active.size() - 1)));
1869
1870	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/GDE/MAFFT/mafft-7.055-with-extensions/extensions/mxscarna_src/probconsRNA/Main.cc

Download in other formats: