Context Navigation

Main.cc

Visit:

Last change on this file was 10371, checked in by aboeckma, 11 years ago
updated mafft version. Added extensions (no svn ignore, yet)
File size: 65.7 KB

Line
1	/////////////////////////////////////////////////////////////////
2	// Main.cc
3	//
4	// Main routines for MXSCARNA program.
5	/////////////////////////////////////////////////////////////////
6
7	#include "scarna.hpp"
8	#include "SafeVector.h"
9	#include "MultiSequence.h"
10	#include "Defaults.h"
11	#include "ScoreType.h"
12	#include "ProbabilisticModel.h"
13	#include "EvolutionaryTree.h"
14	#include "SparseMatrix.h"
15	#include "BPPMatrix.hpp"
16	#include "StemCandidate.hpp"
17	#include "Globaldp.hpp"
18	#include "nrutil.h"
19	#include "AlifoldMEA.h"
20	#include <string>
21	#include <sstream>
22	#include <iomanip>
23	#include <iostream>
24	#include <list>
25	#include <set>
26	#include <algorithm>
27	#include <cstdio>
28	#include <cstdlib>
29	#include <cerrno>
30	#include <iomanip>
31	#include <fstream>
32
33	//#include "RfoldWrapper.hpp"
34	//static RFOLD::Rfold folder;
35
36	using namespace::MXSCARNA;
37
38	string parametersInputFilename = "";
39	string parametersOutputFilename = "no training";
40	string annotationFilename = "";
41	string weboutputFileName = "";
42
43	ofstream *outputFile;
44
45	bool enableTraining = false;
46	bool enableVerbose = false;
47	bool enableAllPairs = false;
48	bool enableAnnotation = false;
49	bool enableViterbi = false;
50	bool enableClustalWOutput = false;
51	bool enableTrainEmissions = false;
52	bool enableAlignOrder = false;
53	bool enableWebOutput = false;
54	bool enableStockholmOutput = false;
55	bool enableMXSCARNAOutput = false;
56	bool enableMcCaskillMEAMode = false;
57	char bppmode = 's'; // by katoh
58	int numConsistencyReps = 2;
59	int numPreTrainingReps = 0;
60	int numIterativeRefinementReps = 100;
61	int scsLength = SCSLENGTH;
62	float cutoff = 0;
63	float gapOpenPenalty = 0;
64	float gapContinuePenalty = 0;
65	float threshhold = 1.0;
66	float BaseProbThreshold = BASEPROBTHRESHOLD;
67	float BasePairConst = BASEPAIRCONST;
68	int BandWidth = BANDWIDTH;
69
70	SafeVector<string> sequenceNames;
71
72	VF initDistrib (NumMatrixTypes);
73	VF gapOpen (2*NumInsertStates);
74	VF gapExtend (2*NumInsertStates);
75	VVF emitPairs (256, VF (256, 1e-10));
76	VF emitSingle (256, 1e-5);
77
78	string alphabet = alphabetDefault;
79
80	string *ssCons = NULL;
81
82	const int MIN_PRETRAINING_REPS = 0;
83	const int MAX_PRETRAINING_REPS = 20;
84	const int MIN_CONSISTENCY_REPS = 0;
85	const int MAX_CONSISTENCY_REPS = 5;
86	const int MIN_ITERATIVE_REFINEMENT_REPS = 0;
87	const int MAX_ITERATIVE_REFINEMENT_REPS = 1000;
88
89	/////////////////////////////////////////////////////////////////
90	// Function prototypes
91	/////////////////////////////////////////////////////////////////
92
93	void PrintHeading();
94	void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen,
95	const VF &gapExtend, const VVF &emitPairs, const VF &emitSingle, const char *filename);
96	MultiSequence DoAlign (MultiSequence sequence, const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen, VF &gapExtend, VVF &emitPairs, VF &emitSingle);
97	SafeVector<string> ParseParams (int argc, char **argv);
98	void ReadParameters ();
99	MultiSequence ComputeFinalAlignment (const TreeNode tree, MultiSequence *sequences,
100	const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
101	const ProbabilisticModel &model,
102	SafeVector<BPPMatrix*> &BPPMatrices);
103	MultiSequence AlignAlignments (MultiSequence align1, MultiSequence *align2,
104	const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
105	const ProbabilisticModel &model, SafeVector<BPPMatrix*> &BPPMatrices, float identity);
106	SafeVector<SafeVector<SparseMatrix > > DoRelaxation (MultiSequence sequences,
107	SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
108	void Relax (SparseMatrix matXZ, SparseMatrix matZY, VF &posterior);
109	void Relax1 (SparseMatrix matXZ, SparseMatrix matZY, VF &posterior);
110	void DoBasePairProbabilityRelaxation (MultiSequence *sequences,
111	SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
112	SafeVector<BPPMatrix*> &BPPMatrices);
113	set<int> GetSubtree (const TreeNode *tree);
114	void TreeBasedBiPartitioning (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
115	const ProbabilisticModel &model, MultiSequence* &alignment,
116	const TreeNode tree, SafeVector<BPPMatrix> &BPPMatrices);
117	void DoIterativeRefinement (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
118	const ProbabilisticModel &model, MultiSequence* &alignment);
119	void WriteAnnotation (MultiSequence *alignment,
120	const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
121	int ComputeScore (const SafeVector<pair<int, int> > &active,
122	const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices);
123	std::vector<StemCandidate>* seq2scs(MultiSequence Sequences, SafeVector<BPPMatrix> &BPPMatrices, int BandWidth);
124	void removeConflicts(std::vector<StemCandidate> pscs1, std::vector<StemCandidate> pscs2, std::vector<int> matchPSCS1, std::vector<int> matchPSCS2);
125
126	struct prob {
127	int i;
128	int j;
129	float p;
130	};
131
132	/////////////////////////////////////////////////////////////////
133	// main()
134	//
135	// Calls all initialization routines and runs the MXSCARNA
136	// aligner.
137	/////////////////////////////////////////////////////////////////
138
139
140	int main (int argc, char **argv){
141
142	// print MXSCARNA heading
143	PrintHeading();
144
145	// parse program parameters
146	sequenceNames = ParseParams (argc, argv);
147	ReadParameters();
148	PrintParameters ("Using parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);
149
150	// now, we'll process all the files given as input. If we are given
151	// several filenames as input, then we'll load all of those sequences
152	// simultaneously, as long as we're not training. On the other hand,
153	// if we are training, then we'll treat each file as a separate
154	// training instance
155
156	if (enableMcCaskillMEAMode) {
157	MultiSequence *sequences = new MultiSequence(); assert (sequences);
158	for (int i = 0; i < (int) sequenceNames.size(); i++){
159	cerr << "Loading sequence file: " << sequenceNames[i] << endl;
160	sequences->LoadMFA (sequenceNames[i], true);
161	}
162
163	const int numSeqs = sequences->GetNumSequences();
164	SafeVector<BPPMatrix*> BPPMatrices;
165
166	// compute the base pairing matrices for each sequences
167	for(int i = 0; i < numSeqs; i++) {
168	Sequence *tmpSeq = sequences->GetSequence(i);
169	string seq = tmpSeq->GetString();
170	int n_seq = tmpSeq->GetLength();
171	BPPMatrix *bppmat = new BPPMatrix(bppmode, seq, n_seq); // modified by katoh
172	BPPMatrices.push_back(bppmat);
173	}
174	if (bppmode=='w') exit( 0 );
175
176	AlifoldMEA alifold(sequences, BPPMatrices, BasePairConst);
177	alifold.Run();
178	ssCons = alifold.getSScons();
179
180	if (enableStockholmOutput) {
181	sequences->WriteSTOCKHOLM (cout, ssCons);
182	}
183	else if (enableMXSCARNAOutput){
184	sequences->WriteMXSCARNA (cout, ssCons);
185	}
186	else {
187	sequences->WriteMFA (cout, ssCons);
188	}
189
190	delete sequences;
191	}
192	// if we are training
193	else if (enableTraining){
194
195	// build new model for aligning
196	ProbabilisticModel model (initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);
197
198	// prepare to average parameters
199	for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] = 0;
200	for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] = 0;
201	for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] = 0;
202	if (enableTrainEmissions){
203	for (int i = 0; i < (int) emitPairs.size(); i++)
204	for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] = 0;
205	for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] = 0;
206	}
207
208	// align each file individually
209	for (int i = 0; i < (int) sequenceNames.size(); i++){
210
211	VF thisInitDistrib (NumMatrixTypes);
212	VF thisGapOpen (2*NumInsertStates);
213	VF thisGapExtend (2*NumInsertStates);
214	VVF thisEmitPairs (256, VF (256, 1e-10));
215	VF thisEmitSingle (256, 1e-5);
216
217	// load sequence file
218	MultiSequence *sequences = new MultiSequence(); assert (sequences);
219	cerr << "Loading sequence file: " << sequenceNames[i] << endl;
220	sequences->LoadMFA (sequenceNames[i], true);
221
222	// align sequences
223	DoAlign (sequences, model, thisInitDistrib, thisGapOpen, thisGapExtend, thisEmitPairs, thisEmitSingle);
224
225	// add in contribution of the derived parameters
226	for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] += thisInitDistrib[i];
227	for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] += thisGapOpen[i];
228	for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] += thisGapExtend[i];
229	if (enableTrainEmissions){
230	for (int i = 0; i < (int) emitPairs.size(); i++)
231	for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] += thisEmitPairs[i][j];
232	for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] += thisEmitSingle[i];
233	}
234
235	delete sequences;
236	}
237
238	// compute new parameters and print them out
239	for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] /= (int) sequenceNames.size();
240	for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] /= (int) sequenceNames.size();
241	for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] /= (int) sequenceNames.size();
242	if (enableTrainEmissions){
243	for (int i = 0; i < (int) emitPairs.size(); i++)
244	for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] /= (int) sequenceNames.size();
245	for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] /= sequenceNames.size();
246	}
247
248	PrintParameters ("Trained parameter set:",
249	initDistrib, gapOpen, gapExtend, emitPairs, emitSingle,
250	parametersOutputFilename.c_str());
251	}
252	// pass
253	// if we are not training, we must simply want to align some sequences
254	else {
255	// load all files together
256	MultiSequence *sequences = new MultiSequence(); assert (sequences);
257	for (int i = 0; i < (int) sequenceNames.size(); i++){
258	cerr << "Loading sequence file: " << sequenceNames[i] << endl;
259
260	sequences->LoadMFA (sequenceNames[i], true);
261	}
262
263	// do all "pre-training" repetitions first
264	// NOT execute
265	for (int ct = 0; ct < numPreTrainingReps; ct++){
266	enableTraining = true;
267
268	// build new model for aligning
269	ProbabilisticModel model (initDistrib, gapOpen, gapExtend,
270	emitPairs, emitSingle);
271
272	// do initial alignments
273	DoAlign (sequences, model, initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);
274
275	// print new parameters
276	PrintParameters ("Recomputed parameter set:", initDistrib, gapOpen, gapExtend, emitPairs, emitSingle, NULL);
277
278	enableTraining = false;
279	}
280
281	// now, we can perform the alignments and write them out
282	if (enableWebOutput) {
283	outputFile = new ofstream(weboutputFileName.c_str());
284	if (!outputFile) {
285	cerr << "cannot open output file." << weboutputFileName << endl;
286	exit(1);
287	}
288	*outputFile << "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" << endl;
289	*outputFile << "<result>" << endl;
290	}
291	MultiSequence *alignment = DoAlign (sequences,
292	ProbabilisticModel (initDistrib, gapOpen, gapExtend, emitPairs, emitSingle),
293	initDistrib, gapOpen, gapExtend, emitPairs, emitSingle);
294
295
296	if (!enableAllPairs){
297	if (enableClustalWOutput) {
298	alignment->WriteALN (cout);
299	}
300	else if (enableWebOutput) {
301	alignment->WriteWEB (*outputFile, ssCons);
302	// computeStructureWithAlifold ();
303	}
304	else if (enableStockholmOutput) {
305	alignment->WriteSTOCKHOLM (cout, ssCons);
306	}
307	else if (enableMXSCARNAOutput) {
308	alignment->WriteMXSCARNA (cout, ssCons);
309	}
310	else {
311	alignment->WriteMFA (cout, ssCons);
312	}
313	}
314
315	if (enableWebOutput) {
316	*outputFile << "</result>" << endl;
317	delete outputFile;
318	}
319
320	delete ssCons;
321	delete alignment;
322	delete sequences;
323
324	}
325	}
326
327	/////////////////////////////////////////////////////////////////
328	// PrintHeading()
329	//
330	// Prints heading for PROBCONS program.
331	/////////////////////////////////////////////////////////////////
332
333	void PrintHeading (){
334	cerr << endl
335	<< "Multiplex SCARNA"<< endl
336	<< "version " << VERSION << " - align multiple RNA sequences and print to standard output" << endl
337	<< "Written by Yasuo Tabei" << endl
338	<< endl;
339	}
340
341	/////////////////////////////////////////////////////////////////
342	// PrintParameters()
343	//
344	// Prints PROBCONS parameters to STDERR. If a filename is
345	// specified, then the parameters are also written to the file.
346	/////////////////////////////////////////////////////////////////
347
348	void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen,
349	const VF &gapExtend, const VVF &emitPairs, const VF &emitSingle, const char *filename){
350
351	// print parameters to the screen
352	cerr << message << endl
353	<< " initDistrib[] = { ";
354	for (int i = 0; i < NumMatrixTypes; i++) cerr << setprecision (10) << initDistrib[i] << " ";
355	cerr << "}" << endl
356	<< " gapOpen[] = { ";
357	for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapOpen[i] << " ";
358	cerr << "}" << endl
359	<< " gapExtend[] = { ";
360	for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapExtend[i] << " ";
361	cerr << "}" << endl
362	<< endl;
363
364	/*
365	for (int i = 0; i < 5; i++){
366	for (int j = 0; j <= i; j++){
367	cerr << emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]] << " ";
368	}
369	cerr << endl;
370	}*/
371
372	// if a file name is specified
373	if (filename){
374
375	// attempt to open the file for writing
376	FILE *file = fopen (filename, "w");
377	if (!file){
378	cerr << "ERROR: Unable to write parameter file: " << filename << endl;
379	exit (1);
380	}
381
382	// if successful, then write the parameters to the file
383	for (int i = 0; i < NumMatrixTypes; i++) fprintf (file, "%.10f ", initDistrib[i]); fprintf (file, "\n");
384	for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapOpen[i]); fprintf (file, "\n");
385	for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapExtend[i]); fprintf (file, "\n");
386	fprintf (file, "%s\n", alphabet.c_str());
387	for (int i = 0; i < (int) alphabet.size(); i++){
388	for (int j = 0; j <= i; j++)
389	fprintf (file, "%.10f ", emitPairs[(unsigned char) alphabet[i]][(unsigned char) alphabet[j]]);
390	fprintf (file, "\n");
391	}
392	for (int i = 0; i < (int) alphabet.size(); i++)
393	fprintf (file, "%.10f ", emitSingle[(unsigned char) alphabet[i]]);
394	fprintf (file, "\n");
395	fclose (file);
396	}
397	}
398
399	/////////////////////////////////////////////////////////////////
400	// DoAlign()
401	//
402	// First computes all pairwise posterior probability matrices.
403	// Then, computes new parameters if training, or a final
404	// alignment, otherwise.
405	/////////////////////////////////////////////////////////////////
406	MultiSequence DoAlign (MultiSequence sequences, const ProbabilisticModel &model, VF &initDistrib, VF &gapOpen, VF &gapExtend, VVF &emitPairs, VF &emitSingle){
407
408
409	assert (sequences);
410
411	const int numSeqs = sequences->GetNumSequences();
412	VVF distances (numSeqs, VF (numSeqs, 0));
413	VVF identities (numSeqs, VF (numSeqs, 0));
414	SafeVector<SafeVector<SparseMatrix > > sparseMatrices (numSeqs, SafeVector<SparseMatrix >(numSeqs, NULL));
415
416	SafeVector<BPPMatrix*> BPPMatrices;
417
418	for(int i = 0; i < numSeqs; i++) {
419	Sequence *tmpSeq = sequences->GetSequence(i);
420	string seq = tmpSeq->GetString();
421	int n_seq = tmpSeq->GetLength();
422	BPPMatrix *bppmat = new BPPMatrix(bppmode, seq, n_seq, BASEPROBTHRESHOLD); // modified by katoh
423	BPPMatrices.push_back(bppmat);
424	}
425
426	if (enableTraining){
427	// prepare to average parameters
428	for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] = 0;
429	for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] = 0;
430	for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] = 0;
431	if (enableTrainEmissions){
432	for (int i = 0; i < (int) emitPairs.size(); i++)
433	for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] = 0;
434	for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] = 0;
435	}
436	}
437
438	// skip posterior calculations if we just want to do Viterbi alignments
439	if (!enableViterbi){
440
441	// do all pairwise alignments for posterior probability matrices
442	for (int a = 0; a < numSeqs-1; a++){
443	for (int b = a+1; b < numSeqs; b++){
444	Sequence *seq1 = sequences->GetSequence (a);
445	Sequence *seq2 = sequences->GetSequence (b);
446
447	// verbose output
448	if (enableVerbose)
449	cerr << "Computing posterior matrix: (" << a+1 << ") " << seq1->GetHeader() << " vs. "
450	<< "(" << b+1 << ") " << seq2->GetHeader() << " -- ";
451
452	// compute forward and backward probabilities
453	VF *forward = model.ComputeForwardMatrix (seq1, seq2); assert (forward);
454	VF *backward = model.ComputeBackwardMatrix (seq1, seq2); assert (backward);
455
456	// if we are training, then we'll simply want to compute the
457	// expected counts for each region within the matrix separately;
458	// otherwise, we'll need to put all of the regions together and
459	// assemble a posterior probability match matrix
460
461	// so, if we're training
462	if (enableTraining){
463
464	// compute new parameters
465	VF thisInitDistrib (NumMatrixTypes);
466	VF thisGapOpen (2*NumInsertStates);
467	VF thisGapExtend (2*NumInsertStates);
468	VVF thisEmitPairs (256, VF (256, 1e-10));
469	VF thisEmitSingle (256, 1e-5);
470
471	model.ComputeNewParameters (seq1, seq2, forward, backward, thisInitDistrib, thisGapOpen, thisGapExtend, thisEmitPairs, thisEmitSingle, enableTrainEmissions);
472
473	// add in contribution of the derived parameters
474	for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] += thisInitDistrib[i];
475	for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] += thisGapOpen[i];
476	for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] += thisGapExtend[i];
477	if (enableTrainEmissions){
478	for (int i = 0; i < (int) emitPairs.size(); i++)
479	for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] += thisEmitPairs[i][j];
480	for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] += thisEmitSingle[i];
481	}
482
483	// let us know that we're done.
484	if (enableVerbose) cerr << "done." << endl;
485	}
486	// pass
487	else {
488
489	// compute posterior probability matrix
490	VF posterior = model.ComputePosteriorMatrix (seq1, seq2, forward, *backward); assert (posterior);
491
492	// compute sparse representations
493	sparseMatrices[a][b] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), *posterior);
494	sparseMatrices[b][a] = NULL;
495
496	if (!enableAllPairs){
497	// perform the pairwise sequence alignment
498	pair<SafeVector<char> *, float> alignment = model.ComputeAlignment (seq1->GetLength(),
499	seq2->GetLength(),
500	*posterior);
501
502	Sequence *tmpSeq1 = seq1->AddGaps (alignment.first, 'X');
503	Sequence *tmpSeq2 = seq2->AddGaps (alignment.first, 'Y');
504
505	// compute sequence identity for each pair of sequenceses
506	int length = tmpSeq1->GetLength();
507	int matchCount = 0;
508	int misMatchCount = 0;
509	for (int k = 1; k <= length; k++) {
510	int ch1 = tmpSeq1->GetPosition(k);
511	int ch2 = tmpSeq2->GetPosition(k);
512	if (ch1 == ch2 && ch1 != '-' && ch2 != '-') { ++matchCount; }
513	else if (ch1 != ch2 && ch1 != '-' && ch2 != '-') { ++misMatchCount; }
514	}
515
516	identities[a][b] = identities[b][a] = (float)matchCount/(float)(matchCount + misMatchCount);
517
518	// compute "expected accuracy" distance for evolutionary tree computation
519	float distance = alignment.second / min (seq1->GetLength(), seq2->GetLength());
520	distances[a][b] = distances[b][a] = distance;
521
522	if (enableVerbose)
523	cerr << setprecision (10) << distance << endl;
524
525	delete alignment.first;
526	}
527	else {
528	// let us know that we're done.
529	if (enableVerbose) cerr << "done." << endl;
530	}
531
532	delete posterior;
533	}
534
535	delete forward;
536	delete backward;
537	}
538	}
539	}
540
541
542	// now average out parameters derived
543	if (enableTraining){
544	// compute new parameters
545	for (int i = 0; i < (int) initDistrib.size(); i++) initDistrib[i] /= numSeqs * (numSeqs - 1) / 2;
546	for (int i = 0; i < (int) gapOpen.size(); i++) gapOpen[i] /= numSeqs * (numSeqs - 1) / 2;
547	for (int i = 0; i < (int) gapExtend.size(); i++) gapExtend[i] /= numSeqs * (numSeqs - 1) / 2;
548
549	if (enableTrainEmissions){
550	for (int i = 0; i < (int) emitPairs.size(); i++)
551	for (int j = 0; j < (int) emitPairs[i].size(); j++) emitPairs[i][j] /= numSeqs * (numSeqs - 1) / 2;
552	for (int i = 0; i < (int) emitSingle.size(); i++) emitSingle[i] /= numSeqs * (numSeqs - 1) / 2;
553	}
554	}
555
556	// see if we still want to do some alignments
557	else {
558	// pass
559	if (!enableViterbi){
560
561	// perform the consistency transformation the desired number of times
562	for (int r = 0; r < numConsistencyReps; r++){
563	SafeVector<SafeVector<SparseMatrix *> > newSparseMatrices = DoRelaxation (sequences, sparseMatrices);
564
565	// now replace the old posterior matrices
566	for (int i = 0; i < numSeqs; i++){
567	for (int j = 0; j < numSeqs; j++){
568	delete sparseMatrices[i][j];
569	sparseMatrices[i][j] = newSparseMatrices[i][j];
570	}
571	}
572	}
573	//if (numSeqs > 8) {
574	// for (int r = 0; r < 1; r++)
575	// DoBasePairProbabilityRelaxation(sequences, sparseMatrices, BPPMatrices);
576	//}
577	}
578
579	MultiSequence *finalAlignment = NULL;
580
581	if (enableAllPairs){
582	for (int a = 0; a < numSeqs-1; a++){
583	for (int b = a+1; b < numSeqs; b++){
584	Sequence *seq1 = sequences->GetSequence (a);
585	Sequence *seq2 = sequences->GetSequence (b);
586
587	if (enableVerbose)
588	cerr << "Performing pairwise alignment: (" << a+1 << ") " << seq1->GetHeader() << " vs. "
589	<< "(" << b+1 << ") " << seq2->GetHeader() << " -- ";
590
591
592	// perform the pairwise sequence alignment
593	pair<SafeVector<char> *, float> alignment;
594	if (enableViterbi)
595	alignment = model.ComputeViterbiAlignment (seq1, seq2);
596	else {
597
598	// build posterior matrix
599	VF *posterior = sparseMatrices[a][b]->GetPosterior(); assert (posterior);
600	int length = (seq1->GetLength() + 1) * (seq2->GetLength() + 1);
601	for (int i = 0; i < length; i++) (*posterior)[i] -= cutoff;
602
603	alignment = model.ComputeAlignment (seq1->GetLength(), seq2->GetLength(), *posterior);
604	delete posterior;
605	}
606
607
608	// write pairwise alignments
609	string name = seq1->GetHeader() + "-" + seq2->GetHeader() + (enableClustalWOutput ? ".aln" : ".fasta");
610	ofstream outfile (name.c_str());
611
612	MultiSequence *result = new MultiSequence();
613	result->AddSequence (seq1->AddGaps(alignment.first, 'X'));
614	result->AddSequence (seq2->AddGaps(alignment.first, 'Y'));
615	result->WriteMFAseq (outfile); // by katoh
616
617	outfile.close();
618
619	delete alignment.first;
620	}
621	}
622	exit( 0 );
623	}
624
625	// now if we still need to do a final multiple alignment
626	else {
627
628	if (enableVerbose)
629	cerr << endl;
630
631	// compute the evolutionary tree
632	TreeNode *tree = TreeNode::ComputeTree (distances, identities);
633
634	if (enableWebOutput) {
635	*outputFile << "<tree>" << endl;
636	tree->Print (*outputFile, sequences);
637	*outputFile << "</tree>" << endl;
638	}
639	else {
640	tree->Print (cerr, sequences);
641	cerr << endl;
642	}
643	// make the final alignment
644	finalAlignment = ComputeFinalAlignment (tree, sequences, sparseMatrices, model, BPPMatrices);
645
646	// build annotation
647	if (enableAnnotation){
648	WriteAnnotation (finalAlignment, sparseMatrices);
649	}
650
651	delete tree;
652	}
653
654	if (!enableViterbi){
655	// delete sparse matrices
656	for (int a = 0; a < numSeqs-1; a++){
657	for (int b = a+1; b < numSeqs; b++){
658	delete sparseMatrices[a][b];
659	delete sparseMatrices[b][a];
660	}
661	}
662	}
663
664	//AlifoldMEA alifold(finalAlignment, BPPMatrices, BasePairConst);
665	//alifold.Run();
666	//ssCons = alifold.getSScons();
667
668	return finalAlignment;
669
670	}
671
672	return NULL;
673	}
674
675	/////////////////////////////////////////////////////////////////
676	// GetInteger()
677	//
678	// Attempts to parse an integer from the character string given.
679	// Returns true only if no parsing error occurs.
680	/////////////////////////////////////////////////////////////////
681
682	bool GetInteger (char data, int val){
683	char *endPtr;
684	long int retVal;
685
686	assert (val);
687
688	errno = 0;
689	retVal = strtol (data, &endPtr, 0);
690	if (retVal == 0 && (errno != 0 \|\| data == endPtr)) return false;
691	if (errno != 0 && (retVal == LONG_MAX \|\| retVal == LONG_MIN)) return false;
692	if (retVal < (long) INT_MIN \|\| retVal > (long) INT_MAX) return false;
693	*val = (int) retVal;
694	return true;
695	}
696
697	/////////////////////////////////////////////////////////////////
698	// GetFloat()
699	//
700	// Attempts to parse a float from the character string given.
701	// Returns true only if no parsing error occurs.
702	/////////////////////////////////////////////////////////////////
703
704	bool GetFloat (char data, float val){
705	char *endPtr;
706	double retVal;
707
708	assert (val);
709
710	errno = 0;
711	retVal = strtod (data, &endPtr);
712	if (retVal == 0 && (errno != 0 \|\| data == endPtr)) return false;
713	if (errno != 0 && (retVal >= 1000000.0 \|\| retVal <= -1000000.0)) return false;
714	*val = (float) retVal;
715	return true;
716	}
717
718	/////////////////////////////////////////////////////////////////
719	// ParseParams()
720	//
721	// Parse all command-line options.
722	/////////////////////////////////////////////////////////////////
723
724	SafeVector<string> ParseParams (int argc, char **argv){
725
726	if (argc < 2){
727
728	cerr << "MXSCARNA comes with ABSOLUTELY NO WARRANTY. This is free software, and" << endl
729	<< "you are welcome to redistribute it under certain conditions. See the" << endl
730	<< "file COPYING.txt for details." << endl
731	<< endl
732	<< "Usage:" << endl
733	<< " mxscarna [OPTION]... [MFAFILE]..." << endl
734	<< endl
735	<< "Description:" << endl
736	<< " Align sequences in MFAFILE(s) and print result to standard output" << endl
737	<< endl
738	<< " -clustalw" << endl
739	<< " use CLUSTALW output format instead of MFA" << endl
740	<< endl
741	<< " -stockholm" << endl
742	<< " use STOCKHOLM output format instead of MFA" << endl
743	<< endl
744	<< " -mxscarna" << endl
745	<< " use MXSCARNA output format instead of MFA" << endl
746	<< endl
747	<< " -weboutput /<output_path>/<outputfilename>" << endl
748	<< " use web output format" << endl
749	<< endl
750	<< " -c, --consistency REPS" << endl
751	<< " use " << MIN_CONSISTENCY_REPS << " <= REPS <= " << MAX_CONSISTENCY_REPS
752	<< " (default: " << numConsistencyReps << ") passes of consistency transformation" << endl
753	<< endl
754	<< " -ir, --iterative-refinement REPS" << endl
755	<< " use " << MIN_ITERATIVE_REFINEMENT_REPS << " <= REPS <= " << MAX_ITERATIVE_REFINEMENT_REPS
756	<< " (default: " << numIterativeRefinementReps << ") passes of iterative-refinement" << endl
757	<< endl
758	<< " -pre, --pre-training REPS" << endl
759	<< " use " << MIN_PRETRAINING_REPS << " <= REPS <= " << MAX_PRETRAINING_REPS
760	<< " (default: " << numPreTrainingReps << ") rounds of pretraining" << endl
761	<< endl
762	<< " -pairs" << endl
763	<< " generate all-pairs pairwise alignments" << endl
764	<< endl
765	<< " -viterbi" << endl
766	<< " use Viterbi algorithm to generate all pairs (automatically enables -pairs)" << endl
767	<< endl
768	<< " -v, --verbose" << endl
769	<< " report progress while aligning (default: " << (enableVerbose ? "on" : "off") << ")" << endl
770	<< endl
771	<< " -annot FILENAME" << endl
772	<< " write annotation for multiple alignment to FILENAME" << endl
773	<< endl
774	<< " -t, --train FILENAME" << endl
775	<< " compute EM transition probabilities, store in FILENAME (default: "
776	<< parametersOutputFilename << ")" << endl
777	<< endl
778	<< " -e, --emissions" << endl
779	<< " also reestimate emission probabilities (default: "
780	<< (enableTrainEmissions ? "on" : "off") << ")" << endl
781	<< endl
782	<< " -p, --paramfile FILENAME" << endl
783	<< " read parameters from FILENAME (default: "
784	<< parametersInputFilename << ")" << endl
785	<< endl
786	<< " -a, --alignment-order" << endl
787	<< " print sequences in alignment order rather than input order (default: "
788	<< (enableAlignOrder ? "on" : "off") << ")" << endl
789	<< endl
790	<< " -s THRESHOLD" << endl
791	<< " the threshold of SCS alignment" << endl
792	<< endl
793	<< " In default, for less than " << threshhold << ", the SCS aligment is applied. " << endl
794	<< " -l SCSLENGTH" << endl
795	<< " the length of stem candidates " << SCSLENGTH << endl
796	<< endl
797	<< " -b BASEPROBTRHESHHOLD" << endl
798	<< " the threshold of base pairing probability " << BASEPROBTHRESHOLD << endl
799	<< endl
800	<< " -m, --mccaskillmea" << endl
801	<< " McCaskill MEA MODE: input the clustalw format file and output the secondary structure predicted by McCaskill MEA" << endl
802	<< endl
803	<< " -g BASEPAIRSCORECONST" << endl
804	<< " the control parameter of the prediction of base pairs, default:" << BasePairConst << endl
805	<< endl
806	<< " -w BANDWIDTH" << endl
807	<< " the control parameter of the distance of stem candidates, default:" << BANDWIDTH << endl
808	<< endl;
809
810
811	// << " -go, --gap-open VALUE" << endl
812	// << " gap opening penalty of VALUE <= 0 (default: " << gapOpenPenalty << ")" << endl
813	// << endl
814	// << " -ge, --gap-extension VALUE" << endl
815	// << " gap extension penalty of VALUE <= 0 (default: " << gapContinuePenalty << ")" << endl
816	// << endl
817	// << " -co, --cutoff CUTOFF" << endl
818	// << " subtract 0 <= CUTOFF <= 1 (default: " << cutoff << ") from all posterior values before final alignment" << endl
819	// << endl
820
821	exit (1);
822	}
823
824	SafeVector<string> sequenceNames;
825	int tempInt;
826	float tempFloat;
827
828	for (int i = 1; i < argc; i++){
829	if (argv[i][0] == '-'){
830
831	// training
832	if (!strcmp (argv[i], "-t") \|\| !strcmp (argv[i], "--train")){
833	enableTraining = true;
834	if (i < argc - 1)
835	parametersOutputFilename = string (argv[++i]);
836	else {
837	cerr << "ERROR: Filename expected for option " << argv[i] << endl;
838	exit (1);
839	}
840	}
841
842	// emission training
843	else if (!strcmp (argv[i], "-e") \|\| !strcmp (argv[i], "--emissions")){
844	enableTrainEmissions = true;
845	}
846
847	// parameter file
848	else if (!strcmp (argv[i], "-p") \|\| !strcmp (argv[i], "--paramfile")){
849	if (i < argc - 1)
850	parametersInputFilename = string (argv[++i]);
851	else {
852	cerr << "ERROR: Filename expected for option " << argv[i] << endl;
853	exit (1);
854	}
855	}
856	else if (! strcmp (argv[i], "-s")) {
857	if (i < argc - 1){
858	if (!GetFloat (argv[++i], &tempFloat)){
859	cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
860	exit (1);
861	}
862	else {
863	if (tempFloat < 0){
864	cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be nagative." << endl;
865	exit (1);
866	}
867	else
868	threshhold = tempFloat;
869	}
870	}
871	else {
872	cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
873	exit (1);
874	}
875	}
876
877	else if (! strcmp (argv[i], "-l")) {
878	if (i < argc - 1) {
879	if (!GetInteger (argv[++i], &tempInt)){
880	cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
881	exit (1);
882	}
883	else {
884	if (tempInt <= 0 \|\| 6 <= tempInt) {
885	cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
886	<< "1 and 6" << "." << endl;
887	exit (1);
888	}
889	else
890	scsLength = tempInt;
891	}
892	}
893	}
894	else if (! strcmp (argv[i], "-b")) {
895	if (i < argc - 1) {
896	if (!GetFloat (argv[++i], &tempFloat)){
897	cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
898	exit (1);
899	}
900	else {
901	if (tempFloat < 0 && 1 < tempFloat) {
902	cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be nagative." << endl;
903	exit (1);
904	}
905	else
906	BaseProbThreshold = tempFloat;
907	}
908	}
909	}
910	else if (! strcmp (argv[i], "-g")) {
911	if (i < argc - 1) {
912	if (!GetFloat (argv[++i], &tempFloat)){
913	cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
914	exit (1);
915	}
916	else {
917	if (tempFloat < 0 && 1 < tempFloat) {
918	cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be nagative." << endl;
919	exit (1);
920	}
921	else
922	BasePairConst = tempFloat;
923	}
924	}
925	}
926
927	// number of consistency transformations
928	else if (!strcmp (argv[i], "-c") \|\| !strcmp (argv[i], "--consistency")){
929	if (i < argc - 1){
930	if (!GetInteger (argv[++i], &tempInt)){
931	cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
932	exit (1);
933	}
934	else {
935	if (tempInt < MIN_CONSISTENCY_REPS \|\| tempInt > MAX_CONSISTENCY_REPS){
936	cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
937	<< MIN_CONSISTENCY_REPS << " and " << MAX_CONSISTENCY_REPS << "." << endl;
938	exit (1);
939	}
940	else
941	numConsistencyReps = tempInt;
942	}
943	}
944	else {
945	cerr << "ERROR: Integer expected for option " << argv[i] << endl;
946	exit (1);
947	}
948	}
949
950	// number of randomized partitioning iterative refinement passes
951	else if (!strcmp (argv[i], "-ir") \|\| !strcmp (argv[i], "--iterative-refinement")){
952	if (i < argc - 1){
953	if (!GetInteger (argv[++i], &tempInt)){
954	cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
955	exit (1);
956	}
957	else {
958	if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS \|\| tempInt > MAX_ITERATIVE_REFINEMENT_REPS){
959	cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
960	<< MIN_ITERATIVE_REFINEMENT_REPS << " and " << MAX_ITERATIVE_REFINEMENT_REPS << "." << endl;
961	exit (1);
962	}
963	else
964	numIterativeRefinementReps = tempInt;
965	}
966	}
967	else {
968	cerr << "ERROR: Integer expected for option " << argv[i] << endl;
969	exit (1);
970	}
971	}
972	// number of EM pre-training rounds
973	else if (!strcmp (argv[i], "-pre") \|\| !strcmp (argv[i], "--pre-training")){
974	if (i < argc - 1){
975	if (!GetInteger (argv[++i], &tempInt)){
976	cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
977	exit (1);
978	}
979	else {
980	if (tempInt < MIN_PRETRAINING_REPS \|\| tempInt > MAX_PRETRAINING_REPS){
981	cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
982	<< MIN_PRETRAINING_REPS << " and " << MAX_PRETRAINING_REPS << "." << endl;
983	exit (1);
984	}
985	else
986	numPreTrainingReps = tempInt;
987	}
988	}
989	else {
990	cerr << "ERROR: Integer expected for option " << argv[i] << endl;
991	exit (1);
992	}
993	}
994
995	// the distance of stem candidate
996	else if (!strcmp (argv[i], "-w")){
997	if (i < argc - 1){
998	if (!GetInteger (argv[++i], &tempInt)){
999	cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
1000	exit (1);
1001	}
1002	else {
1003	BandWidth = tempInt;
1004	}
1005	}
1006	else {
1007	cerr << "ERROR: Integer expected for option " << argv[i] << endl;
1008	exit (1);
1009	}
1010	}
1011
1012	// gap open penalty
1013	else if (!strcmp (argv[i], "-go") \|\| !strcmp (argv[i], "--gap-open")){
1014	if (i < argc - 1){
1015	if (!GetFloat (argv[++i], &tempFloat)){
1016	cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
1017	exit (1);
1018	}
1019	else {
1020	if (tempFloat > 0){
1021	cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl;
1022	exit (1);
1023	}
1024	else
1025	gapOpenPenalty = tempFloat;
1026	}
1027	}
1028	else {
1029	cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
1030	exit (1);
1031	}
1032	}
1033
1034	// gap extension penalty
1035	else if (!strcmp (argv[i], "-ge") \|\| !strcmp (argv[i], "--gap-extension")){
1036	if (i < argc - 1){
1037	if (!GetFloat (argv[++i], &tempFloat)){
1038	cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
1039	exit (1);
1040	}
1041	else {
1042	if (tempFloat > 0){
1043	cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl;
1044	exit (1);
1045	}
1046	else
1047	gapContinuePenalty = tempFloat;
1048	}
1049	}
1050	else {
1051	cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
1052	exit (1);
1053	}
1054	}
1055
1056	// all-pairs pairwise alignments
1057	else if (!strcmp (argv[i], "-pairs")){
1058	enableAllPairs = true;
1059	}
1060
1061	// all-pairs pairwise Viterbi alignments
1062	else if (!strcmp (argv[i], "-viterbi")){
1063	enableAllPairs = true;
1064	enableViterbi = true;
1065	}
1066
1067	// read base-pairing probability from the '_bpp' file, by katoh
1068	else if (!strcmp (argv[i], "-readbpp")){
1069	bppmode = 'r';
1070	}
1071
1072	// write base-pairing probability to stdout, by katoh
1073	else if (!strcmp (argv[i], "-writebpp")){
1074	bppmode = 'w';
1075	}
1076
1077	// annotation files
1078	else if (!strcmp (argv[i], "-annot")){
1079	enableAnnotation = true;
1080	if (i < argc - 1)
1081	annotationFilename = argv[++i];
1082	else {
1083	cerr << "ERROR: FILENAME expected for option " << argv[i] << endl;
1084	exit (1);
1085	}
1086	}
1087
1088	// clustalw output format
1089	else if (!strcmp (argv[i], "-clustalw")){
1090	enableClustalWOutput = true;
1091	}
1092	// mxscarna output format
1093	else if (!strcmp (argv[i], "-mxscarna")) {
1094	enableMXSCARNAOutput = true;
1095	}
1096	// stockholm output format
1097	else if (!strcmp (argv[i], "-stockholm")) {
1098	enableStockholmOutput = true;
1099	}
1100	// web output format
1101	else if (!strcmp (argv[i], "-weboutput")) {
1102	if (i < argc - 1) {
1103	weboutputFileName = string(argv[++i]);
1104	}
1105	else {
1106	cerr << "ERROR: Invalid following option " << argv[i-1] << ": " << argv[i] << endl;
1107	exit (1);
1108	}
1109
1110	enableWebOutput = true;
1111	}
1112
1113	// cutoff
1114	else if (!strcmp (argv[i], "-co") \|\| !strcmp (argv[i], "--cutoff")){
1115	if (i < argc - 1){
1116	if (!GetFloat (argv[++i], &tempFloat)){
1117	cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
1118	exit (1);
1119	}
1120	else {
1121	if (tempFloat < 0 \|\| tempFloat > 1){
1122	cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must be between 0 and 1." << endl;
1123	exit (1);
1124	}
1125	else
1126	cutoff = tempFloat;
1127	}
1128	}
1129	else {
1130	cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
1131	exit (1);
1132	}
1133	}
1134
1135	// verbose reporting
1136	else if (!strcmp (argv[i], "-v") \|\| !strcmp (argv[i], "--verbose")){
1137	enableVerbose = true;
1138	}
1139
1140	// alignment order
1141	else if (!strcmp (argv[i], "-a") \|\| !strcmp (argv[i], "--alignment-order")){
1142	enableAlignOrder = true;
1143	}
1144	// McCaskill MEA MODE
1145	else if (!strcmp (argv[i], "-m") \|\| !strcmp (argv[i], "--mccaskillmea")){
1146	enableMcCaskillMEAMode = true;
1147	}
1148	// bad arguments
1149	else {
1150	cerr << "ERROR: Unrecognized option: " << argv[i] << endl;
1151	exit (1);
1152	}
1153	}
1154	else {
1155	sequenceNames.push_back (string (argv[i]));
1156	}
1157	}
1158
1159	if (enableTrainEmissions && !enableTraining){
1160	cerr << "ERROR: Training emissions (-e) requires training (-t)" << endl;
1161	exit (1);
1162	}
1163
1164	return sequenceNames;
1165	}
1166
1167	/////////////////////////////////////////////////////////////////
1168	// ReadParameters()
1169	//
1170	// Read initial distribution, transition, and emission
1171	// parameters from a file.
1172	/////////////////////////////////////////////////////////////////
1173
1174	void ReadParameters (){
1175
1176	ifstream data;
1177
1178	emitPairs = VVF (256, VF (256, 1e-10));
1179	emitSingle = VF (256, 1e-5);
1180
1181	// read initial state distribution and transition parameters
1182	// pass
1183	if (parametersInputFilename == string ("")){
1184	if (NumInsertStates == 1){
1185	for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib1Default[i];
1186	for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen1Default[i];
1187	for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend1Default[i];
1188	}
1189	else if (NumInsertStates == 2){
1190	for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib2Default[i];
1191	for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen2Default[i];
1192	for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend2Default[i];
1193	}
1194	else {
1195	cerr << "ERROR: No default initial distribution/parameter settings exist" << endl
1196	<< " for " << NumInsertStates << " pairs of insert states. Use --paramfile." << endl;
1197	exit (1);
1198	}
1199
1200	alphabet = alphabetDefault;
1201
1202	for (int i = 0; i < (int) alphabet.length(); i++){
1203	emitSingle[(unsigned char) tolower(alphabet[i])] = emitSingleDefault[i];
1204	emitSingle[(unsigned char) toupper(alphabet[i])] = emitSingleDefault[i];
1205	for (int j = 0; j <= i; j++){
1206	emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(alphabet[j])] = emitPairsDefault[i][j];
1207	emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(alphabet[j])] = emitPairsDefault[i][j];
1208	emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(alphabet[j])] = emitPairsDefault[i][j];
1209	emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(alphabet[j])] = emitPairsDefault[i][j];
1210	emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(alphabet[i])] = emitPairsDefault[i][j];
1211	emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(alphabet[i])] = emitPairsDefault[i][j];
1212	emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(alphabet[i])] = emitPairsDefault[i][j];
1213	emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(alphabet[i])] = emitPairsDefault[i][j];
1214	}
1215	}
1216	}
1217	else {
1218	data.open (parametersInputFilename.c_str());
1219	if (data.fail()){
1220	cerr << "ERROR: Unable to read parameter file: " << parametersInputFilename << endl;
1221	exit (1);
1222	}
1223
1224	string line[3];
1225	for (int i = 0; i < 3; i++){
1226	if (!getline (data, line[i])){
1227	cerr << "ERROR: Unable to read transition parameters from parameter file: " << parametersInputFilename << endl;
1228	exit (1);
1229	}
1230	}
1231	istringstream data2;
1232	data2.clear(); data2.str (line[0]); for (int i = 0; i < NumMatrixTypes; i++) data2 >> initDistrib[i];
1233	data2.clear(); data2.str (line[1]); for (int i = 0; i < 2*NumInsertStates; i++) data2 >> gapOpen[i];
1234	data2.clear(); data2.str (line[2]); for (int i = 0; i < 2*NumInsertStates; i++) data2 >> gapExtend[i];
1235
1236	if (!getline (data, line[0])){
1237	cerr << "ERROR: Unable to read alphabet from scoring matrix file: " << parametersInputFilename << endl;
1238	exit (1);
1239	}
1240
1241	// read alphabet as concatenation of all characters on alphabet line
1242	alphabet = "";
1243	string token;
1244	data2.clear(); data2.str (line[0]); while (data2 >> token) alphabet += token;
1245
1246	for (int i = 0; i < (int) alphabet.size(); i++){
1247	for (int j = 0; j <= i; j++){
1248	float val;
1249	data >> val;
1250	emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) tolower(alphabet[j])] = val;
1251	emitPairs[(unsigned char) tolower(alphabet[i])][(unsigned char) toupper(alphabet[j])] = val;
1252	emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) tolower(alphabet[j])] = val;
1253	emitPairs[(unsigned char) toupper(alphabet[i])][(unsigned char) toupper(alphabet[j])] = val;
1254	emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) tolower(alphabet[i])] = val;
1255	emitPairs[(unsigned char) tolower(alphabet[j])][(unsigned char) toupper(alphabet[i])] = val;
1256	emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) tolower(alphabet[i])] = val;
1257	emitPairs[(unsigned char) toupper(alphabet[j])][(unsigned char) toupper(alphabet[i])] = val;
1258	}
1259	}
1260
1261	for (int i = 0; i < (int) alphabet.size(); i++){
1262	float val;
1263	data >> val;
1264	emitSingle[(unsigned char) tolower(alphabet[i])] = val;
1265	emitSingle[(unsigned char) toupper(alphabet[i])] = val;
1266	}
1267	data.close();
1268	}
1269	}
1270
1271	/////////////////////////////////////////////////////////////////
1272	// ProcessTree()
1273	//
1274	// Process the tree recursively. Returns the aligned sequences
1275	// corresponding to a node or leaf of the tree.
1276	/////////////////////////////////////////////////////////////////
1277	float ide;
1278	MultiSequence ProcessTree (const TreeNode tree, MultiSequence *sequences,
1279	const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1280	const ProbabilisticModel &model, SafeVector<BPPMatrix*> &BPPMatrices) {
1281	MultiSequence *result;
1282
1283	// check if this is a node of the alignment tree
1284	if (tree->GetSequenceLabel() == -1){
1285	MultiSequence *alignLeft = ProcessTree (tree->GetLeftChild(), sequences, sparseMatrices, model, BPPMatrices);
1286	MultiSequence *alignRight = ProcessTree (tree->GetRightChild(), sequences, sparseMatrices, model, BPPMatrices);
1287
1288	assert (alignLeft);
1289	assert (alignRight);
1290
1291	result = AlignAlignments (alignLeft, alignRight, sparseMatrices, model, BPPMatrices, tree->GetIdentity());
1292	assert (result);
1293
1294	delete alignLeft;
1295	delete alignRight;
1296	}
1297
1298	// otherwise, this is a leaf of the alignment tree
1299	else {
1300	result = new MultiSequence(); assert (result);
1301	result->AddSequence (sequences->GetSequence(tree->GetSequenceLabel())->Clone());
1302	}
1303
1304	return result;
1305	}
1306
1307	/////////////////////////////////////////////////////////////////
1308	// ComputeFinalAlignment()
1309	//
1310	// Compute the final alignment by calling ProcessTree(), then
1311	// performing iterative refinement as needed.
1312	/////////////////////////////////////////////////////////////////
1313
1314	MultiSequence ComputeFinalAlignment (const TreeNode tree, MultiSequence *sequences,
1315	const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1316	const ProbabilisticModel &model,
1317	SafeVector<BPPMatrix*> &BPPMatrices) {
1318
1319	MultiSequence *alignment = ProcessTree (tree, sequences, sparseMatrices, model, BPPMatrices);
1320
1321	if (enableAlignOrder){
1322	alignment->SaveOrdering();
1323	enableAlignOrder = false;
1324	}
1325
1326	// tree-based refinement
1327	// if you use the function, you can degrade the quality of the software.
1328	// TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree, BPPMatrices);
1329
1330	// iterative refinement
1331	/*
1332	for (int i = 0; i < numIterativeRefinementReps; i++)
1333	DoIterativeRefinement (sparseMatrices, model, alignment);
1334
1335	cerr << endl;
1336	*/
1337	// return final alignment
1338	return alignment;
1339	}
1340
1341	/////////////////////////////////////////////////////////////////
1342	// AlignAlignments()
1343	//
1344	// Returns the alignment of two MultiSequence objects.
1345	/////////////////////////////////////////////////////////////////
1346
1347	MultiSequence AlignAlignments (MultiSequence align1, MultiSequence *align2,
1348	const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1349	const ProbabilisticModel &model,
1350	SafeVector<BPPMatrix*> &BPPMatrices, float identity){
1351
1352	// print some info about the alignment
1353	if (enableVerbose){
1354	for (int i = 0; i < align1->GetNumSequences(); i++)
1355	cerr << ((i==0) ? "[" : ",") << align1->GetSequence(i)->GetLabel();
1356	cerr << "] vs. ";
1357	for (int i = 0; i < align2->GetNumSequences(); i++)
1358	cerr << ((i==0) ? "[" : ",") << align2->GetSequence(i)->GetLabel();
1359	cerr << "]: ";
1360	}
1361
1362	VF *posterior = model.BuildPosterior (align1, align2, sparseMatrices, cutoff);
1363
1364	pair<SafeVector<char> *, float> alignment;
1365	// choose the alignment routine depending on the "cosmetic" gap penalties used
1366	if (gapOpenPenalty == 0 && gapContinuePenalty == 0) {
1367
1368	if(identity <= threshhold) {
1369	std::vector<StemCandidate> pscs1, pscs2;
1370	pscs1 = seq2scs(align1, BPPMatrices, BandWidth);
1371	pscs2 = seq2scs(align2, BPPMatrices, BandWidth);
1372	std::vector<int> *matchPSCS1 = new std::vector<int>;
1373	std::vector<int> *matchPSCS2 = new std::vector<int>;
1374
1375	Globaldp globaldp(pscs1, pscs2, align1, align2, matchPSCS1, matchPSCS2, posterior, BPPMatrices);
1376	//float scsScore = globaldp.Run();
1377
1378	globaldp.Run();
1379
1380	removeConflicts(pscs1, pscs2, matchPSCS1, matchPSCS2);
1381
1382	alignment = model.ComputeAlignment2 (align1->GetSequence(0)->GetLength(), align2->GetSequence(0)->GetLength(), *posterior, pscs1, pscs2, matchPSCS1, matchPSCS2);
1383	delete matchPSCS1;
1384	delete matchPSCS2;
1385	} else {
1386	alignment = model.ComputeAlignment (align1->GetSequence(0)->GetLength(), align2->GetSequence(0)->GetLength(), *posterior);
1387	}
1388	}
1389	else {
1390	alignment = model.ComputeAlignmentWithGapPenalties (align1, align2,
1391	*posterior, align1->GetNumSequences(), align2->GetNumSequences(),
1392	gapOpenPenalty, gapContinuePenalty);
1393	}
1394
1395	delete posterior;
1396
1397	if (enableVerbose){
1398
1399	// compute total length of sequences
1400	int totLength = 0;
1401	for (int i = 0; i < align1->GetNumSequences(); i++)
1402	for (int j = 0; j < align2->GetNumSequences(); j++)
1403	totLength += min (align1->GetSequence(i)->GetLength(), align2->GetSequence(j)->GetLength());
1404
1405	// give an "accuracy" measure for the alignment
1406	cerr << alignment.second / totLength << endl;
1407	}
1408
1409	// now build final alignment
1410	MultiSequence *result = new MultiSequence();
1411	for (int i = 0; i < align1->GetNumSequences(); i++)
1412	result->AddSequence (align1->GetSequence(i)->AddGaps(alignment.first, 'X'));
1413	for (int i = 0; i < align2->GetNumSequences(); i++)
1414	result->AddSequence (align2->GetSequence(i)->AddGaps(alignment.first, 'Y'));
1415	if (!enableAlignOrder)
1416	result->SortByLabel();
1417
1418	// free temporary alignment
1419	delete alignment.first;
1420
1421	return result;
1422	}
1423
1424	/////////////////////////////////////////////////////////////////
1425	// DoRelaxation()
1426	//
1427	// Performs one round of the consistency transformation. The
1428	// formula used is:
1429	// 1
1430	// P'(x[i]-y[j]) = --- sum sum P(x[i]-z[k]) P(z[k]-y[j])
1431	// \|S\| z in S k
1432	//
1433	// where S = {x, y, all other sequences...}
1434	//
1435	/////////////////////////////////////////////////////////////////
1436
1437	SafeVector<SafeVector<SparseMatrix > > DoRelaxation (MultiSequence sequences,
1438	SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
1439	const int numSeqs = sequences->GetNumSequences();
1440
1441	SafeVector<SafeVector<SparseMatrix > > newSparseMatrices (numSeqs, SafeVector<SparseMatrix >(numSeqs, NULL));
1442
1443	// for every pair of sequences
1444	for (int i = 0; i < numSeqs; i++){
1445	for (int j = i+1; j < numSeqs; j++){
1446	Sequence *seq1 = sequences->GetSequence (i);
1447	Sequence *seq2 = sequences->GetSequence (j);
1448
1449	if (enableVerbose)
1450	cerr << "Relaxing (" << i+1 << ") " << seq1->GetHeader() << " vs. "
1451	<< "(" << j+1 << ") " << seq2->GetHeader() << ": ";
1452
1453	// get the original posterior matrix
1454	VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior(); assert (posteriorPtr);
1455	VF &posterior = *posteriorPtr;
1456
1457	const int seq1Length = seq1->GetLength();
1458	const int seq2Length = seq2->GetLength();
1459
1460	// contribution from the summation where z = x and z = y
1461	for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] += posterior[k];
1462
1463	if (enableVerbose)
1464	cerr << sparseMatrices[i][j]->GetNumCells() << " --> ";
1465
1466	// contribution from all other sequences
1467	for (int k = 0; k < numSeqs; k++) if (k != i && k != j){
1468	if (k < i)
1469	Relax1 (sparseMatrices[k][i], sparseMatrices[k][j], posterior);
1470	else if (k > i && k < j)
1471	Relax (sparseMatrices[i][k], sparseMatrices[k][j], posterior);
1472	else {
1473	SparseMatrix *temp = sparseMatrices[j][k]->ComputeTranspose();
1474	Relax (sparseMatrices[i][k], temp, posterior);
1475	delete temp;
1476	}
1477	}
1478
1479	// now renormalization
1480	for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] /= numSeqs;
1481
1482	// mask out positions not originally in the posterior matrix
1483	SparseMatrix *matXY = sparseMatrices[i][j];
1484	for (int y = 0; y <= seq2Length; y++) posterior[y] = 0;
1485	for (int x = 1; x <= seq1Length; x++){
1486	SafeVector<PIF>::iterator XYptr = matXY->GetRowPtr(x);
1487	SafeVector<PIF>::iterator XYend = XYptr + matXY->GetRowSize(x);
1488	VF::iterator base = posterior.begin() + x * (seq2Length + 1);
1489	int curr = 0;
1490	while (XYptr != XYend){
1491
1492	// zero out all cells until the first filled column
1493	while (curr < XYptr->first){
1494	base[curr] = 0;
1495	curr++;
1496	}
1497
1498	// now, skip over this column
1499	curr++;
1500	++XYptr;
1501	}
1502
1503	// zero out cells after last column
1504	while (curr <= seq2Length){
1505	base[curr] = 0;
1506	curr++;
1507	}
1508	}
1509
1510	// save the new posterior matrix
1511	newSparseMatrices[i][j] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), posterior);
1512	newSparseMatrices[j][i] = NULL;
1513
1514	if (enableVerbose)
1515	cerr << newSparseMatrices[i][j]->GetNumCells() << " -- ";
1516
1517	delete posteriorPtr;
1518
1519	if (enableVerbose)
1520	cerr << "done." << endl;
1521	}
1522	}
1523
1524	return newSparseMatrices;
1525	}
1526
1527	/////////////////////////////////////////////////////////////////
1528	// Relax()
1529	//
1530	// Computes the consistency transformation for a single sequence
1531	// z, and adds the transformed matrix to "posterior".
1532	/////////////////////////////////////////////////////////////////
1533
1534	void Relax (SparseMatrix matXZ, SparseMatrix matZY, VF &posterior){
1535
1536	assert (matXZ);
1537	assert (matZY);
1538
1539	int lengthX = matXZ->GetSeq1Length();
1540	int lengthY = matZY->GetSeq2Length();
1541	assert (matXZ->GetSeq2Length() == matZY->GetSeq1Length());
1542
1543	// for every x[i]
1544	for (int i = 1; i <= lengthX; i++){
1545	SafeVector<PIF>::iterator XZptr = matXZ->GetRowPtr(i);
1546	SafeVector<PIF>::iterator XZend = XZptr + matXZ->GetRowSize(i);
1547
1548	VF::iterator base = posterior.begin() + i * (lengthY + 1);
1549
1550	// iterate through all x[i]-z[k]
1551	while (XZptr != XZend){
1552	SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(XZptr->first);
1553	SafeVector<PIF>::iterator ZYend = ZYptr + matZY->GetRowSize(XZptr->first);
1554	const float XZval = XZptr->second;
1555
1556	// iterate through all z[k]-y[j]
1557	while (ZYptr != ZYend){
1558	base[ZYptr->first] += XZval * ZYptr->second;
1559	ZYptr++;
1560	}
1561	XZptr++;
1562	}
1563	}
1564	}
1565
1566	/////////////////////////////////////////////////////////////////
1567	// Relax1()
1568	//
1569	// Computes the consistency transformation for a single sequence
1570	// z, and adds the transformed matrix to "posterior".
1571	/////////////////////////////////////////////////////////////////
1572
1573	void Relax1 (SparseMatrix matZX, SparseMatrix matZY, VF &posterior){
1574
1575	assert (matZX);
1576	assert (matZY);
1577
1578	int lengthZ = matZX->GetSeq1Length();
1579	int lengthY = matZY->GetSeq2Length();
1580
1581	// for every z[k]
1582	for (int k = 1; k <= lengthZ; k++){
1583	SafeVector<PIF>::iterator ZXptr = matZX->GetRowPtr(k);
1584	SafeVector<PIF>::iterator ZXend = ZXptr + matZX->GetRowSize(k);
1585
1586	// iterate through all z[k]-x[i]
1587	while (ZXptr != ZXend){
1588	SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(k);
1589	SafeVector<PIF>::iterator ZYend = ZYptr + matZY->GetRowSize(k);
1590	const float ZXval = ZXptr->second;
1591	VF::iterator base = posterior.begin() + ZXptr->first * (lengthY + 1);
1592
1593	// iterate through all z[k]-y[j]
1594	while (ZYptr != ZYend){
1595	base[ZYptr->first] += ZXval * ZYptr->second;
1596	ZYptr++;
1597	}
1598	ZXptr++;
1599	}
1600	}
1601	}
1602
1603	void DoBasePairProbabilityRelaxation (MultiSequence *sequences,
1604	SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1605	SafeVector<BPPMatrix*> &BPPMatrices) {
1606	const int numSeqs = sequences->GetNumSequences();
1607
1608	for (int i = 0; i < numSeqs; i++) {
1609	Sequence *seq1 = sequences->GetSequence (i);
1610	BPPMatrix *seq1BppMatrix = BPPMatrices[seq1->GetLabel()];
1611	Trimat<float> consBppMat(seq1->GetLength() + 1);
1612	int seq1Length = seq1->GetLength();
1613
1614	for (int k = 1; k <= seq1Length; k++) {
1615	for (int l = k; l <= seq1Length; l++) {
1616	consBppMat.ref(k, l) = seq1BppMatrix->GetProb(k, l);
1617	}
1618	}
1619
1620	for (int j = i + 1; j < numSeqs; j++) {
1621
1622	// VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior()
1623	Sequence *seq2 = sequences->GetSequence (j);
1624	BPPMatrix *seq2BppMatrix = BPPMatrices[seq2->GetLabel()];
1625	// int seq2Length = seq2->GetLength();
1626	SparseMatrix *matchProb = sparseMatrices[i][j];
1627
1628	// vector<PIF2> &probs1 = seq1BppMatrix->bppMat.data2;
1629	for(int k = 1; k <= seq1Length; k++) {
1630	for(int m = k, n = k; n <= k + 200 && m >= 1 && n <= seq1Length; m--, n++) {
1631
1632	// for (int k = 0; k < (int)probs1.size(); k++) {
1633	// float tmpProb1 = probs1[k].prob;
1634	// int tmp1I = probs1[k].i;
1635	// int tmp1J = probs1[k].j;
1636
1637	float sumProb = 0;
1638	vector<PIF2> &probs2 = seq2BppMatrix->bppMat.data2;
1639	for(int l = 0; l < (int)probs2.size(); l++) {
1640	float tmpProb2 = probs2[l].prob;
1641	int tmp2I = probs2[l].i;
1642	int tmp2J = probs2[l].j;
1643	sumProb += matchProb->GetValue(m, tmp2I)matchProb->GetValue(n, tmp2J)tmpProb2;
1644	}
1645
1646	consBppMat.ref(m, n) += sumProb;
1647	}
1648
1649	for(int m = k, n = k + 1; n <= k + 200 && m >= 1 && n <= seq1Length; m--, n++) {
1650
1651	// for (int k = 0; k < (int)probs1.size(); k++) {
1652	// float tmpProb1 = probs1[k].prob;
1653	// int tmp1I = probs1[k].i;
1654	// int tmp1J = probs1[k].j;
1655
1656	float sumProb = 0;
1657	vector<PIF2> &probs2 = seq2BppMatrix->bppMat.data2;
1658	for(int l = 0; l < (int)probs2.size(); l++) {
1659	float tmpProb2 = probs2[l].prob;
1660	int tmp2I = probs2[l].i;
1661	int tmp2J = probs2[l].j;
1662	sumProb += matchProb->GetValue(m, tmp2I)matchProb->GetValue(n, tmp2J)tmpProb2;
1663	}
1664
1665	consBppMat.ref(m, n) += sumProb;
1666	}
1667	}
1668	}
1669
1670	/*
1671	for(int k = 1; k <= seq1Length; k++) {
1672	for(int m = k, n = k; n <= k + 30 && m >= 1 && n <= seq1Length; m--, n++) {
1673	float tmpProb = seq1BppMatrix->GetProb(m, n);
1674	for(int l = 1; l <= seq2Length; l++) {
1675	for(int s = l, t = l; t <= l + 30 && s >= 1 && t <= seq2Length; s--, t++) {
1676	tmpProb += matchProb->GetValue(m,s)matchProb->GetValue(n,t)seq2BppMatrix->GetProb(s,t);
1677	}
1678	for(int s = l, t = l + 1; t <= l + 31 && s >= 1 && t <= seq2Length; s--, t++) {
1679	tmpProb += matchProb->GetValue(m,s)matchProb->GetValue(n,t)seq2BppMatrix->GetProb(s,t);
1680	}
1681	}
1682	consBppMat.ref(m, n) += tmpProb;
1683	}
1684
1685	for(int m = k, n = k + 1; n <= k + 31 && m >= 1 && n <= seq1Length; m--, n++) {
1686	float tmpProb = seq1BppMatrix->GetProb(m, n);
1687	for(int l = 1; l <= seq2Length; l++) {
1688	for(int s = l, t = l; t <= l + 30 && s >= 1 && t <= seq2Length; s--, t++) {
1689	tmpProb += matchProb->GetValue(m,s)matchProb->GetValue(n,t)seq2BppMatrix->GetProb(s,t);
1690	}
1691	for(int s = l, t = l + 1; t <= l + 31 && s >= 1 && t <= seq2Length; s--, t++) {
1692	tmpProb += matchProb->GetValue(m,s)matchProb->GetValue(n,t)seq2BppMatrix->GetProb(s,t);
1693	}
1694	}
1695	consBppMat.ref(m,n) += tmpProb;
1696	}
1697	}
1698	}
1699	*/
1700	for (int m = 1; m <= seq1Length; m++) {
1701	for (int n = m + 4; n <= seq1Length; n++) {
1702	consBppMat.ref(m,n) = consBppMat.ref(m,n)/(float)numSeqs;
1703	}
1704	}
1705	seq1BppMatrix->updateBPPMatrix(consBppMat);
1706	}
1707	}
1708
1709	/////////////////////////////////////////////////////////////////
1710	// GetSubtree
1711	//
1712	// Returns set containing all leaf labels of the current subtree.
1713	/////////////////////////////////////////////////////////////////
1714
1715	set<int> GetSubtree (const TreeNode *tree){
1716	set<int> s;
1717
1718	if (tree->GetSequenceLabel() == -1){
1719	s = GetSubtree (tree->GetLeftChild());
1720	set<int> t = GetSubtree (tree->GetRightChild());
1721
1722	for (set<int>::iterator iter = t.begin(); iter != t.end(); ++iter)
1723	s.insert (*iter);
1724	}
1725	else {
1726	s.insert (tree->GetSequenceLabel());
1727	}
1728
1729	return s;
1730	}
1731
1732	/////////////////////////////////////////////////////////////////
1733	// TreeBasedBiPartitioning
1734	//
1735	// Uses the iterative refinement scheme from MUSCLE.
1736	/////////////////////////////////////////////////////////////////
1737
1738	void TreeBasedBiPartitioning (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1739	const ProbabilisticModel &model, MultiSequence* &alignment,
1740	const TreeNode tree, SafeVector<BPPMatrix> &BPPMatrices){
1741	// check if this is a node of the alignment tree
1742	if (tree->GetSequenceLabel() == -1){
1743	TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree->GetLeftChild(), BPPMatrices);
1744	TreeBasedBiPartitioning (sparseMatrices, model, alignment, tree->GetRightChild(), BPPMatrices);
1745
1746	set<int> leftSubtree = GetSubtree (tree->GetLeftChild());
1747	set<int> rightSubtree = GetSubtree (tree->GetRightChild());
1748	set<int> leftSubtreeComplement, rightSubtreeComplement;
1749
1750	// calculate complement of each subtree
1751	for (int i = 0; i < alignment->GetNumSequences(); i++){
1752	if (leftSubtree.find(i) == leftSubtree.end()) leftSubtreeComplement.insert (i);
1753	if (rightSubtree.find(i) == rightSubtree.end()) rightSubtreeComplement.insert (i);
1754	}
1755
1756	// perform realignments for edge to left child
1757	if (!leftSubtree.empty() && !leftSubtreeComplement.empty()){
1758	MultiSequence *groupOneSeqs = alignment->Project (leftSubtree); assert (groupOneSeqs);
1759	MultiSequence *groupTwoSeqs = alignment->Project (leftSubtreeComplement); assert (groupTwoSeqs);
1760	delete alignment;
1761	alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model, BPPMatrices, tree->GetLeftChild()->GetIdentity());
1762	}
1763
1764	// perform realignments for edge to right child
1765	if (!rightSubtree.empty() && !rightSubtreeComplement.empty()){
1766	MultiSequence *groupOneSeqs = alignment->Project (rightSubtree); assert (groupOneSeqs);
1767	MultiSequence *groupTwoSeqs = alignment->Project (rightSubtreeComplement); assert (groupTwoSeqs);
1768	delete alignment;
1769	alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model, BPPMatrices, tree->GetRightChild()->GetIdentity());
1770	}
1771	}
1772	}
1773
1774	/////////////////////////////////////////////////////////////////
1775	// DoterativeRefinement()
1776	//
1777	// Performs a single round of randomized partionining iterative
1778	// refinement.
1779	/////////////////////////////////////////////////////////////////
1780	/*
1781	void DoIterativeRefinement (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
1782	const ProbabilisticModel &model, MultiSequence* &alignment){
1783	set<int> groupOne, groupTwo;
1784
1785	// create two separate groups
1786	for (int i = 0; i < alignment->GetNumSequences(); i++){
1787	if (rand() % 2)
1788	groupOne.insert (i);
1789	else
1790	groupTwo.insert (i);
1791	}
1792
1793	if (groupOne.empty() \|\| groupTwo.empty()) return;
1794
1795	// project into the two groups
1796	MultiSequence *groupOneSeqs = alignment->Project (groupOne); assert (groupOneSeqs);
1797	MultiSequence *groupTwoSeqs = alignment->Project (groupTwo); assert (groupTwoSeqs);
1798	delete alignment;
1799
1800	// realign
1801	alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model);
1802
1803	delete groupOneSeqs;
1804	delete groupTwoSeqs;
1805	}
1806	*/
1807
1808	/////////////////////////////////////////////////////////////////
1809	// WriteAnnotation()
1810	//
1811	// Computes annotation for multiple alignment and write values
1812	// to a file.
1813	/////////////////////////////////////////////////////////////////
1814
1815	void WriteAnnotation (MultiSequence *alignment,
1816	const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
1817	ofstream outfile (annotationFilename.c_str());
1818
1819	if (outfile.fail()){
1820	cerr << "ERROR: Unable to write annotation file." << endl;
1821	exit (1);
1822	}
1823
1824	const int alignLength = alignment->GetSequence(0)->GetLength();
1825	const int numSeqs = alignment->GetNumSequences();
1826
1827	SafeVector<int> position (numSeqs, 0);
1828	SafeVector<SafeVector<char>::iterator> seqs (numSeqs);
1829	for (int i = 0; i < numSeqs; i++) seqs[i] = alignment->GetSequence(i)->GetDataPtr();
1830	SafeVector<pair<int,int> > active;
1831	active.reserve (numSeqs);
1832
1833	// for every column
1834	for (int i = 1; i <= alignLength; i++){
1835
1836	// find all aligned residues in this particular column
1837	active.clear();
1838	for (int j = 0; j < numSeqs; j++){
1839	if (seqs[j][i] != '-'){
1840	active.push_back (make_pair(j, ++position[j]));
1841	}
1842	}
1843
1844	outfile << setw(4) << ComputeScore (active, sparseMatrices) << endl;
1845	}
1846
1847	outfile.close();
1848	}
1849
1850	/////////////////////////////////////////////////////////////////
1851	// ComputeScore()
1852	//
1853	// Computes the annotation score for a particular column.
1854	/////////////////////////////////////////////////////////////////
1855
1856	int ComputeScore (const SafeVector<pair<int, int> > &active,
1857	const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices){
1858
1859	if (active.size() <= 1) return 0;
1860
1861	// ALTERNATIVE #1: Compute the average alignment score.
1862
1863	float val = 0;
1864	for (int i = 0; i < (int) active.size(); i++){
1865	for (int j = i+1; j < (int) active.size(); j++){
1866	val += sparseMatrices[active[i].first][active[j].first]->GetValue(active[i].second, active[j].second);
1867	}
1868	}
1869
1870	return (int) (200 * val / ((int) active.size() * ((int) active.size() - 1)));
1871
1872	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: tags/arb-6.0/GDE/MAFFT/mafft-7.055-with-extensions/extensions/mxscarna_src/Main.cc

Download in other formats: