Context Navigation

FixRef.cc

Visit:

Last change on this file was 10371, checked in by aboeckma, 12 years ago
updated mafft version. Added extensions (no svn ignore, yet)
File size: 34.7 KB

Line
1	/////////////////////////////////////////////////////////////////
2	// Main.cc
3	/////////////////////////////////////////////////////////////////
4
5	#include "SafeVector.h"
6	#include "MultiSequence.h"
7	#include "Defaults.h"
8	#include "ScoreType.h"
9	#include "ProbabilisticModel.h"
10	#include "EvolutionaryTree.h"
11	#include "SparseMatrix.h"
12	#include <string>
13	#include <iomanip>
14	#include <iostream>
15	#include <list>
16	#include <set>
17	#include <algorithm>
18	#include <cstdio>
19	#include <cstdlib>
20	#include <cerrno>
21	#include <iomanip>
22
23	string matrixFilename = "";
24	string parametersInputFilename = "";
25	string parametersOutputFilename = "no training";
26
27	bool enableTraining = false;
28	bool enableVerbose = false;
29	int numConsistencyReps = 2;
30	int numPreTrainingReps = 0;
31	int numIterativeRefinementReps = 100;
32
33	float gapOpenPenalty = 0;
34	float gapContinuePenalty = 0;
35	VF initDistrib (NumMatrixTypes);
36	VF gapOpen (2*NumInsertStates);
37	VF gapExtend (2*NumInsertStates);
38	SafeVector<char> alphabet;
39	VVF emitPairs;
40	VF emitSingle;
41
42	const int MIN_PRETRAINING_REPS = 0;
43	const int MAX_PRETRAINING_REPS = 20;
44	const int MIN_CONSISTENCY_REPS = 0;
45	const int MAX_CONSISTENCY_REPS = 5;
46	const int MIN_ITERATIVE_REFINEMENT_REPS = 0;
47	const int MAX_ITERATIVE_REFINEMENT_REPS = 1000;
48
49	/////////////////////////////////////////////////////////////////
50	// Function prototypes
51	/////////////////////////////////////////////////////////////////
52
53	void PrintHeading();
54	void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen,
55	const VF &gapExtend, const char *filename);
56	MultiSequence DoAlign (MultiSequence sequence, const ProbabilisticModel &model);
57	SafeVector<string> ParseParams (int argc, char **argv);
58	void ReadParameters ();
59	MultiSequence ComputeFinalAlignment (const TreeNode tree, MultiSequence *sequences,
60	const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
61	const ProbabilisticModel &model);
62	MultiSequence AlignAlignments (MultiSequence align1, MultiSequence *align2,
63	const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
64	const ProbabilisticModel &model);
65	void DoRelaxation (MultiSequence sequences, SafeVector<SafeVector<SparseMatrix > > &sparseMatrices);
66	void Relax (SparseMatrix matXZ, SparseMatrix matZY, VF &posterior);
67	void DoIterativeRefinement (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
68	const ProbabilisticModel &model, MultiSequence* &alignment);
69	//float ScoreAlignment (MultiSequence alignment, MultiSequence sequences, SparseMatrix **sparseMatrices, const int numSeqs);
70
71	/////////////////////////////////////////////////////////////////
72	// main()
73	//
74	// Calls all initialization routines and runs the PROBCONS
75	// aligner.
76	/////////////////////////////////////////////////////////////////
77
78	int main (int argc, char **argv){
79
80	if (argc != 3){
81	cerr << "Usage: FixRef inputfile reffile" << endl;
82	exit (1);
83	}
84
85	string inputFilename = string (argv[1]);
86	string refFilename = string (argv[2]);
87
88	ReadParameters();
89
90	// build new model for aligning
91	ProbabilisticModel model (initDistrib, gapOpen, gapExtend,
92	alphabet, emitPairs, emitSingle);
93
94	MultiSequence *inputSeq = new MultiSequence(); inputSeq->LoadMFA (inputFilename);
95	MultiSequence *refSeq = new MultiSequence(); refSeq->LoadMFA (refFilename);
96
97	SafeVector<char> *ali = new SafeVector<char>;
98
99	if (refSeq->GetNumSequences() != 2){
100	cerr << "ERROR: Expected two sequences in reference alignment." << endl;
101	exit (1);
102	}
103	set<int> s; s.insert (0);
104	MultiSequence *ref1 = refSeq->Project (s);
105	s.clear(); s.insert (1);
106	MultiSequence *ref2 = refSeq->Project (s);
107
108	for (int i = 0; i < inputSeq->GetNumSequences(); i++){
109	if (inputSeq->GetSequence(i)->GetHeader() == ref1->GetSequence(0)->GetHeader()){
110	ref1->AddSequence (inputSeq->GetSequence(i)->Clone());
111	}
112	if (inputSeq->GetSequence(i)->GetHeader() == ref2->GetSequence(0)->GetHeader())
113	ref2->AddSequence (inputSeq->GetSequence(i)->Clone());
114	}
115	if (ref1->GetNumSequences() != 2){
116	cerr << "ERROR: Expected two sequences in reference1 alignment." << endl;
117	exit (1);
118	}
119	if (ref2->GetNumSequences() != 2){
120	cerr << "ERROR: Expected two sequences in reference2 alignment." << endl;
121	exit (1);
122	}
123
124	ref1->GetSequence(0)->SetLabel(0);
125	ref2->GetSequence(0)->SetLabel(0);
126	ref1->GetSequence(1)->SetLabel(1);
127	ref2->GetSequence(1)->SetLabel(1);
128
129	// cerr << "Aligning..." << endl;
130
131	// now, we can perform the alignments and write them out
132	MultiSequence *alignment1 = DoAlign (ref1,
133	ProbabilisticModel (initDistrib, gapOpen, gapExtend,
134	alphabet, emitPairs, emitSingle));
135
136	//cerr << "Aligning second..." << endl;
137	MultiSequence *alignment2 = DoAlign (ref2,
138	ProbabilisticModel (initDistrib, gapOpen, gapExtend,
139	alphabet, emitPairs, emitSingle));
140
141	SafeVector<char>::iterator iter1 = alignment1->GetSequence(0)->GetDataPtr();
142	SafeVector<char>::iterator iter2 = alignment1->GetSequence(1)->GetDataPtr();
143	for (int i = 1; i <= alignment1->GetSequence(0)->GetLength(); i++){
144	if (islower(iter1[i])) iter2[i] = tolower(iter2[i]);
145	if (isupper(iter1[i])) iter2[i] = toupper(iter2[i]);
146	}
147	iter1 = alignment2->GetSequence(0)->GetDataPtr();
148	iter2 = alignment2->GetSequence(1)->GetDataPtr();
149	for (int i = 1; i <= alignment2->GetSequence(0)->GetLength(); i++){
150	if (islower(iter1[i])) iter2[i] = tolower(iter2[i]);
151	if (isupper(iter1[i])) iter2[i] = toupper(iter2[i]);
152	}
153	//alignment1->WriteMFA (cout);
154	//alignment2->WriteMFA (cout);
155
156	int a1 = 0, a = 0;
157	int b1 = 0, b = 0;
158
159	for (int i = 1; i <= refSeq->GetSequence(0)->GetLength(); i++){
160
161	// catch up in filler sequences
162	if (refSeq->GetSequence(0)->GetPosition(i) != '-'){
163	while (true){
164	a++;
165	if (alignment1->GetSequence(0)->GetPosition(a) != '-') break;
166	ali->push_back ('X');
167	}
168	}
169	if (refSeq->GetSequence(1)->GetPosition(i) != '-'){
170	while (true){
171	b++;
172	if (alignment2->GetSequence(0)->GetPosition(b) != '-') break;
173	ali->push_back ('Y');
174	}
175	}
176
177	if (refSeq->GetSequence(0)->GetPosition(i) != '-' &&
178	refSeq->GetSequence(1)->GetPosition(i) != '-'){
179	//cerr << "M: " << refSeq->GetSequence(0)->GetPosition(i) << refSeq->GetSequence(1)->GetPosition(i) << endl;
180	ali->push_back ('B');
181	}
182	else if (refSeq->GetSequence(0)->GetPosition(i) != '-'){
183	//cerr << "X" << endl;
184	ali->push_back ('X');
185	}
186	else if (refSeq->GetSequence(1)->GetPosition(i) != '-'){
187	//cerr << "Y" << endl;
188	ali->push_back ('Y');
189	}
190	}
191
192	while (a < alignment1->GetSequence(0)->GetLength()){
193	a++;
194	ali->push_back ('X');
195	if (alignment1->GetSequence(0)->GetPosition(a) != '-') a1++;
196	}
197	while (b < alignment2->GetSequence(0)->GetLength()){
198	b++;
199	ali->push_back ('Y');
200	if (alignment2->GetSequence(0)->GetPosition(b) != '-') b1++;
201	}
202
203	Sequence *seq1 = alignment1->GetSequence(1)->AddGaps (ali, 'X');
204	Sequence *seq2 = alignment2->GetSequence(1)->AddGaps (ali, 'Y');
205	seq1->WriteMFA (cout, 60);
206	seq2->WriteMFA (cout, 60);
207
208	delete seq1;
209	delete seq2;
210
211	delete ali;
212	delete alignment1;
213	delete alignment2;
214	delete inputSeq;
215	delete refSeq;
216	}
217
218	/////////////////////////////////////////////////////////////////
219	// PrintHeading()
220	//
221	// Prints heading for PROBCONS program.
222	/////////////////////////////////////////////////////////////////
223
224	void PrintHeading (){
225	cerr << endl
226	<< "PROBCONS version 1.02 - align multiple protein sequences and print to standard output" << endl
227	<< "Copyright (C) 2004 Chuong Ba Do" << endl
228	<< endl;
229	}
230
231	/////////////////////////////////////////////////////////////////
232	// PrintParameters()
233	//
234	// Prints PROBCONS parameters to STDERR. If a filename is
235	// specified, then the parameters are also written to the file.
236	/////////////////////////////////////////////////////////////////
237
238	void PrintParameters (const char *message, const VF &initDistrib, const VF &gapOpen,
239	const VF &gapExtend, const char *filename){
240
241	// print parameters to the screen
242	cerr << message << endl
243	<< " initDistrib[] = { ";
244	for (int i = 0; i < NumMatrixTypes; i++) cerr << setprecision (10) << initDistrib[i] << " ";
245	cerr << "}" << endl
246	<< " gapOpen[] = { ";
247	for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapOpen[i] << " ";
248	cerr << "}" << endl
249	<< " gapExtend[] = { ";
250	for (int i = 0; i < NumInsertStates*2; i++) cerr << setprecision (10) << gapExtend[i] << " ";
251	cerr << "}" << endl
252	<< endl;
253
254	// if a file name is specified
255	if (filename){
256
257	// attempt to open the file for writing
258	FILE *file = fopen (filename, "w");
259	if (!file){
260	cerr << "ERROR: Unable to write parameter file: " << filename << endl;
261	exit (1);
262	}
263
264	// if successful, then write the parameters to the file
265	for (int i = 0; i < NumMatrixTypes; i++) fprintf (file, "%.10f ", initDistrib[i]); fprintf (file, "\n");
266	for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapOpen[i]); fprintf (file, "\n");
267	for (int i = 0; i < 2*NumInsertStates; i++) fprintf (file, "%.10f ", gapExtend[i]); fprintf (file, "\n");
268	fclose (file);
269	}
270	}
271
272	/////////////////////////////////////////////////////////////////
273	// DoAlign()
274	//
275	// First computes all pairwise posterior probability matrices.
276	// Then, computes new parameters if training, or a final
277	// alignment, otherwise.
278	/////////////////////////////////////////////////////////////////
279
280	MultiSequence DoAlign (MultiSequence sequences, const ProbabilisticModel &model){
281
282	assert (sequences);
283
284	const int numSeqs = sequences->GetNumSequences();
285	VVF distances (numSeqs, VF (numSeqs, 0));
286	SafeVector<SafeVector<SparseMatrix > > sparseMatrices (numSeqs, SafeVector<SparseMatrix >(numSeqs, NULL));
287
288	// do all pairwise alignments
289	for (int a = 0; a < numSeqs-1; a++){
290	for (int b = a+1; b < numSeqs; b++){
291	Sequence *seq1 = sequences->GetSequence (a);
292	Sequence *seq2 = sequences->GetSequence (b);
293
294	// verbose output
295	if (enableVerbose)
296	cerr << "(" << a+1 << ") " << seq1->GetHeader() << " vs. "
297	<< "(" << b+1 << ") " << seq2->GetHeader() << ": ";
298
299	// compute forward and backward probabilities
300	VF *forward = model.ComputeForwardMatrix (seq1, seq2); assert (forward);
301	VF *backward = model.ComputeBackwardMatrix (seq1, seq2); assert (backward);
302
303	// if we are training, then we'll simply want to compute the
304	// expected counts for each region within the matrix separately;
305	// otherwise, we'll need to put all of the regions together and
306	// assemble a posterior probability match matrix
307
308	// compute posterior probability matrix
309	VF posterior = model.ComputePosteriorMatrix (seq1, seq2, forward, *backward); assert (posterior);
310
311	// compute "expected accuracy" distance for evolutionary tree computation
312	pair<SafeVector<char> *, float> alignment = model.ComputeAlignment (seq1->GetLength(),
313	seq2->GetLength(),
314	*posterior);
315
316	float distance = alignment.second / min (seq1->GetLength(), seq2->GetLength());
317
318	if (enableVerbose)
319	cerr << setprecision (10) << distance << endl;
320
321	// save posterior probability matrices in sparse format
322	distances[a][b] = distances[b][a] = distance;
323	sparseMatrices[a][b] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), *posterior);
324	sparseMatrices[b][a] = sparseMatrices[a][b]->ComputeTranspose();
325
326	delete alignment.first;
327	delete posterior;
328
329	delete forward;
330	delete backward;
331	}
332	}
333
334	if (!enableTraining){
335	if (enableVerbose)
336	cerr << endl;
337
338	// now, perform the consistency transformation the desired number of times
339	for (int i = 0; i < numConsistencyReps; i++)
340	DoRelaxation (sequences, sparseMatrices);
341
342	// compute the evolutionary tree
343	TreeNode *tree = TreeNode::ComputeTree (distances);
344
345	//tree->Print (cerr, sequences);
346	//cerr << endl;
347
348	// make the final alignment
349	MultiSequence *alignment = ComputeFinalAlignment (tree, sequences, sparseMatrices, model);
350	delete tree;
351
352	return alignment;
353	}
354
355	return NULL;
356	}
357
358	/////////////////////////////////////////////////////////////////
359	// GetInteger()
360	//
361	// Attempts to parse an integer from the character string given.
362	// Returns true only if no parsing error occurs.
363	/////////////////////////////////////////////////////////////////
364
365	bool GetInteger (char data, int val){
366	char *endPtr;
367	long int retVal;
368
369	assert (val);
370
371	errno = 0;
372	retVal = strtol (data, &endPtr, 0);
373	if (retVal == 0 && (errno != 0 \|\| data == endPtr)) return false;
374	if (errno != 0 && (retVal == LONG_MAX \|\| retVal == LONG_MIN)) return false;
375	if (retVal < (long) INT_MIN \|\| retVal > (long) INT_MAX) return false;
376	*val = (int) retVal;
377	return true;
378	}
379
380	/////////////////////////////////////////////////////////////////
381	// GetFloat()
382	//
383	// Attempts to parse a float from the character string given.
384	// Returns true only if no parsing error occurs.
385	/////////////////////////////////////////////////////////////////
386
387	bool GetFloat (char data, float val){
388	char *endPtr;
389	double retVal;
390
391	assert (val);
392
393	errno = 0;
394	retVal = strtod (data, &endPtr);
395	if (retVal == 0 && (errno != 0 \|\| data == endPtr)) return false;
396	if (errno != 0 && (retVal >= 1000000.0 \|\| retVal <= -1000000.0)) return false;
397	*val = (float) retVal;
398	return true;
399	}
400
401	/////////////////////////////////////////////////////////////////
402	// ParseParams()
403	//
404	// Parse all command-line options.
405	/////////////////////////////////////////////////////////////////
406
407	SafeVector<string> ParseParams (int argc, char **argv){
408
409	if (argc < 2){
410
411	cerr << "PROBCONS comes with ABSOLUTELY NO WARRANTY. This is free software, and" << endl
412	<< "you are welcome to redistribute it under certain conditions. See the" << endl
413	<< "file COPYING.txt for details." << endl
414	<< endl
415	<< "Usage:" << endl
416	<< " probcons [OPTION]... [MFAFILE]..." << endl
417	<< endl
418	<< "Description:" << endl
419	<< " Align sequences in MFAFILE(s) and print result to standard output" << endl
420	<< endl
421	<< " -t, --train FILENAME" << endl
422	<< " compute EM transition probabilities, store in FILENAME (default: "
423	<< parametersOutputFilename << ")" << endl
424	<< endl
425	<< " -m, --matrixfile FILENAME" << endl
426	<< " read transition parameters from FILENAME (default: "
427	<< matrixFilename << ")" << endl
428	<< endl
429	<< " -p, --paramfile FILENAME" << endl
430	<< " read scoring matrix probabilities from FILENAME (default: "
431	<< parametersInputFilename << ")" << endl
432	<< endl
433	<< " -c, --consistency REPS" << endl
434	<< " use " << MIN_CONSISTENCY_REPS << " <= REPS <= " << MAX_CONSISTENCY_REPS
435	<< " (default: " << numConsistencyReps << ") passes of consistency transformation" << endl
436	<< endl
437	<< " -ir, --iterative-refinement REPS" << endl
438	<< " use " << MIN_ITERATIVE_REFINEMENT_REPS << " <= REPS <= " << MAX_ITERATIVE_REFINEMENT_REPS
439	<< " (default: " << numIterativeRefinementReps << ") passes of iterative-refinement" << endl
440	<< endl
441	<< " -pre, --pre-training REPS" << endl
442	<< " use " << MIN_PRETRAINING_REPS << " <= REPS <= " << MAX_PRETRAINING_REPS
443	<< " (default: " << numPreTrainingReps << ") rounds of pretraining" << endl
444	<< endl
445	<< " -go, --gap-open VALUE" << endl
446	<< " gap opening penalty of VALUE <= 0 (default: " << gapOpenPenalty << ")" << endl
447	<< endl
448	<< " -ge, --gap-extension VALUE" << endl
449	<< " gap extension penalty of VALUE <= 0 (default: " << gapContinuePenalty << ")" << endl
450	<< endl
451	<< " -v, --verbose" << endl
452	<< " report progress while aligning (default: " << (enableVerbose ? "on" : "off") << ")" << endl
453	<< endl;
454
455	exit (1);
456	}
457
458	SafeVector<string> sequenceNames;
459	int tempInt;
460	float tempFloat;
461
462	for (int i = 1; i < argc; i++){
463	if (argv[i][0] == '-'){
464
465	// training
466	if (!strcmp (argv[i], "-t") \|\| !strcmp (argv[i], "--train")){
467	enableTraining = true;
468	if (i < argc - 1)
469	parametersOutputFilename = string (argv[++i]);
470	else {
471	cerr << "ERROR: Filename expected for option " << argv[i] << endl;
472	exit (1);
473	}
474	}
475
476	// scoring matrix file
477	else if (!strcmp (argv[i], "-m") \|\| !strcmp (argv[i], "--matrixfile")){
478	if (i < argc - 1)
479	matrixFilename = string (argv[++i]);
480	else {
481	cerr << "ERROR: Filename expected for option " << argv[i] << endl;
482	exit (1);
483	}
484	}
485
486	// transition/initial distribution parameter file
487	else if (!strcmp (argv[i], "-p") \|\| !strcmp (argv[i], "--paramfile")){
488	if (i < argc - 1)
489	parametersInputFilename = string (argv[++i]);
490	else {
491	cerr << "ERROR: Filename expected for option " << argv[i] << endl;
492	exit (1);
493	}
494	}
495
496	// number of consistency transformations
497	else if (!strcmp (argv[i], "-c") \|\| !strcmp (argv[i], "--consistency")){
498	if (i < argc - 1){
499	if (!GetInteger (argv[++i], &tempInt)){
500	cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
501	exit (1);
502	}
503	else {
504	if (tempInt < MIN_CONSISTENCY_REPS \|\| tempInt > MAX_CONSISTENCY_REPS){
505	cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
506	<< MIN_CONSISTENCY_REPS << " and " << MAX_CONSISTENCY_REPS << "." << endl;
507	exit (1);
508	}
509	else
510	numConsistencyReps = tempInt;
511	}
512	}
513	else {
514	cerr << "ERROR: Integer expected for option " << argv[i] << endl;
515	exit (1);
516	}
517	}
518
519	// number of randomized partitioning iterative refinement passes
520	else if (!strcmp (argv[i], "-ir") \|\| !strcmp (argv[i], "--iterative-refinement")){
521	if (i < argc - 1){
522	if (!GetInteger (argv[++i], &tempInt)){
523	cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
524	exit (1);
525	}
526	else {
527	if (tempInt < MIN_ITERATIVE_REFINEMENT_REPS \|\| tempInt > MAX_ITERATIVE_REFINEMENT_REPS){
528	cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
529	<< MIN_ITERATIVE_REFINEMENT_REPS << " and " << MAX_ITERATIVE_REFINEMENT_REPS << "." << endl;
530	exit (1);
531	}
532	else
533	numIterativeRefinementReps = tempInt;
534	}
535	}
536	else {
537	cerr << "ERROR: Integer expected for option " << argv[i] << endl;
538	exit (1);
539	}
540	}
541
542	// number of EM pre-training rounds
543	else if (!strcmp (argv[i], "-pre") \|\| !strcmp (argv[i], "--pre-training")){
544	if (i < argc - 1){
545	if (!GetInteger (argv[++i], &tempInt)){
546	cerr << "ERROR: Invalid integer following option " << argv[i-1] << ": " << argv[i] << endl;
547	exit (1);
548	}
549	else {
550	if (tempInt < MIN_PRETRAINING_REPS \|\| tempInt > MAX_PRETRAINING_REPS){
551	cerr << "ERROR: For option " << argv[i-1] << ", integer must be between "
552	<< MIN_PRETRAINING_REPS << " and " << MAX_PRETRAINING_REPS << "." << endl;
553	exit (1);
554	}
555	else
556	numPreTrainingReps = tempInt;
557	}
558	}
559	else {
560	cerr << "ERROR: Integer expected for option " << argv[i] << endl;
561	exit (1);
562	}
563	}
564
565	// gap open penalty
566	else if (!strcmp (argv[i], "-go") \|\| !strcmp (argv[i], "--gap-open")){
567	if (i < argc - 1){
568	if (!GetFloat (argv[++i], &tempFloat)){
569	cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
570	exit (1);
571	}
572	else {
573	if (tempFloat > 0){
574	cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl;
575	exit (1);
576	}
577	else
578	gapOpenPenalty = tempFloat;
579	}
580	}
581	else {
582	cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
583	exit (1);
584	}
585	}
586
587	// gap extension penalty
588	else if (!strcmp (argv[i], "-ge") \|\| !strcmp (argv[i], "--gap-extension")){
589	if (i < argc - 1){
590	if (!GetFloat (argv[++i], &tempFloat)){
591	cerr << "ERROR: Invalid floating-point value following option " << argv[i-1] << ": " << argv[i] << endl;
592	exit (1);
593	}
594	else {
595	if (tempFloat > 0){
596	cerr << "ERROR: For option " << argv[i-1] << ", floating-point value must not be positive." << endl;
597	exit (1);
598	}
599	else
600	gapContinuePenalty = tempFloat;
601	}
602	}
603	else {
604	cerr << "ERROR: Floating-point value expected for option " << argv[i] << endl;
605	exit (1);
606	}
607	}
608
609	// verbose reporting
610	else if (!strcmp (argv[i], "-v") \|\| !strcmp (argv[i], "--verbose")){
611	enableVerbose = true;
612	}
613
614	// bad arguments
615	else {
616	cerr << "ERROR: Unrecognized option: " << argv[i] << endl;
617	exit (1);
618	}
619	}
620	else {
621	sequenceNames.push_back (string (argv[i]));
622	}
623	}
624
625	return sequenceNames;
626	}
627
628	/////////////////////////////////////////////////////////////////
629	// ReadParameters()
630	//
631	// Read initial distribution, transition, and emission
632	// parameters from a file.
633	/////////////////////////////////////////////////////////////////
634
635	void ReadParameters (){
636
637	ifstream data;
638
639	// read initial state distribution and transition parameters
640	if (parametersInputFilename == string ("")){
641	if (NumInsertStates == 1){
642	for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib1Default[i];
643	for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen1Default[i];
644	for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend1Default[i];
645	}
646	else if (NumInsertStates == 2){
647	for (int i = 0; i < NumMatrixTypes; i++) initDistrib[i] = initDistrib2Default[i];
648	for (int i = 0; i < 2*NumInsertStates; i++) gapOpen[i] = gapOpen2Default[i];
649	for (int i = 0; i < 2*NumInsertStates; i++) gapExtend[i] = gapExtend2Default[i];
650	}
651	else {
652	cerr << "ERROR: No default initial distribution/parameter settings exist" << endl
653	<< " for " << NumInsertStates << " pairs of insert states. Use --paramfile." << endl;
654	exit (1);
655	}
656	}
657	else {
658	data.open (parametersInputFilename.c_str());
659	if (data.fail()){
660	cerr << "ERROR: Unable to read parameter file: " << parametersInputFilename << endl;
661	exit (1);
662	}
663	for (int i = 0; i < NumMatrixTypes; i++) data >> initDistrib[i];
664	for (int i = 0; i < 2*NumInsertStates; i++) data >> gapOpen[i];
665	for (int i = 0; i < 2*NumInsertStates; i++) data >> gapExtend[i];
666	data.close();
667	}
668
669	// read emission parameters
670	int alphabetSize = 20;
671
672	// allocate memory
673	alphabet = SafeVector<char>(alphabetSize);
674	emitPairs = VVF (alphabetSize, VF (alphabetSize, 0));
675	emitSingle = VF (alphabetSize);
676
677	if (matrixFilename == string ("")){
678	for (int i = 0; i < alphabetSize; i++) alphabet[i] = alphabetDefault[i];
679	for (int i = 0; i < alphabetSize; i++){
680	emitSingle[i] = emitSingleDefault[i];
681	for (int j = 0; j <= i; j++){
682	emitPairs[i][j] = emitPairs[j][i] = (i == j);
683	}
684	}
685	}
686	else {
687	data.open (matrixFilename.c_str());
688	if (data.fail()){
689	cerr << "ERROR: Unable to read scoring matrix file: " << matrixFilename << endl;
690	exit (1);
691	}
692
693	for (int i = 0; i < alphabetSize; i++) data >> alphabet[i];
694	for (int i = 0; i < alphabetSize; i++){
695	for (int j = 0; j <= i; j++){
696	data >> emitPairs[i][j];
697	emitPairs[j][i] = emitPairs[i][j];
698	}
699	}
700	for (int i = 0; i < alphabetSize; i++){
701	char ch;
702	data >> ch;
703	assert (ch == alphabet[i]);
704	}
705	for (int i = 0; i < alphabetSize; i++) data >> emitSingle[i];
706	data.close();
707	}
708	}
709
710	/////////////////////////////////////////////////////////////////
711	// ProcessTree()
712	//
713	// Process the tree recursively. Returns the aligned sequences
714	// corresponding to a node or leaf of the tree.
715	/////////////////////////////////////////////////////////////////
716
717	MultiSequence ProcessTree (const TreeNode tree, MultiSequence *sequences,
718	const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
719	const ProbabilisticModel &model){
720	MultiSequence *result;
721
722	// check if this is a node of the alignment tree
723	if (tree->GetSequenceLabel() == -1){
724	MultiSequence *alignLeft = ProcessTree (tree->GetLeftChild(), sequences, sparseMatrices, model);
725	MultiSequence *alignRight = ProcessTree (tree->GetRightChild(), sequences, sparseMatrices, model);
726
727	assert (alignLeft);
728	assert (alignRight);
729
730	result = AlignAlignments (alignLeft, alignRight, sparseMatrices, model);
731	assert (result);
732
733	delete alignLeft;
734	delete alignRight;
735	}
736
737	// otherwise, this is a leaf of the alignment tree
738	else {
739	result = new MultiSequence(); assert (result);
740	result->AddSequence (sequences->GetSequence(tree->GetSequenceLabel())->Clone());
741	}
742
743	return result;
744	}
745
746	/////////////////////////////////////////////////////////////////
747	// ComputeFinalAlignment()
748	//
749	// Compute the final alignment by calling ProcessTree(), then
750	// performing iterative refinement as needed.
751	/////////////////////////////////////////////////////////////////
752
753	MultiSequence ComputeFinalAlignment (const TreeNode tree, MultiSequence *sequences,
754	const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
755	const ProbabilisticModel &model){
756
757	MultiSequence *alignment = ProcessTree (tree, sequences, sparseMatrices, model);
758
759	// iterative refinement
760	for (int i = 0; i < numIterativeRefinementReps; i++)
761	DoIterativeRefinement (sparseMatrices, model, alignment);
762
763	cerr << endl;
764
765	// return final alignment
766	return alignment;
767	}
768
769	/////////////////////////////////////////////////////////////////
770	// AlignAlignments()
771	//
772	// Returns the alignment of two MultiSequence objects.
773	/////////////////////////////////////////////////////////////////
774
775	MultiSequence AlignAlignments (MultiSequence align1, MultiSequence *align2,
776	const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
777	const ProbabilisticModel &model){
778
779	// print some info about the alignment
780	if (enableVerbose){
781	for (int i = 0; i < align1->GetNumSequences(); i++)
782	cerr << ((i==0) ? "[" : ",") << align1->GetSequence(i)->GetLabel();
783	cerr << "] vs. ";
784	for (int i = 0; i < align2->GetNumSequences(); i++)
785	cerr << ((i==0) ? "[" : ",") << align2->GetSequence(i)->GetLabel();
786	cerr << "]: ";
787	}
788
789	VF *posterior = model.BuildPosterior (align1, align2, sparseMatrices);
790	pair<SafeVector<char> *, float> alignment;
791
792	// choose the alignment routine depending on the "cosmetic" gap penalties used
793	if (gapOpenPenalty == 0 && gapContinuePenalty == 0)
794	alignment = model.ComputeAlignment (align1->GetSequence(0)->GetLength(), align2->GetSequence(0)->GetLength(), *posterior);
795	else
796	alignment = model.ComputeAlignmentWithGapPenalties (align1, align2,
797	*posterior, align1->GetNumSequences(), align2->GetNumSequences(),
798	gapOpenPenalty, gapContinuePenalty);
799
800	delete posterior;
801
802	if (enableVerbose){
803
804	// compute total length of sequences
805	int totLength = 0;
806	for (int i = 0; i < align1->GetNumSequences(); i++)
807	for (int j = 0; j < align2->GetNumSequences(); j++)
808	totLength += min (align1->GetSequence(i)->GetLength(), align2->GetSequence(j)->GetLength());
809
810	// give an "accuracy" measure for the alignment
811	cerr << alignment.second / totLength << endl;
812	}
813
814	// now build final alignment
815	MultiSequence *result = new MultiSequence();
816	for (int i = 0; i < align1->GetNumSequences(); i++)
817	result->AddSequence (align1->GetSequence(i)->AddGaps(alignment.first, 'X'));
818	for (int i = 0; i < align2->GetNumSequences(); i++)
819	result->AddSequence (align2->GetSequence(i)->AddGaps(alignment.first, 'Y'));
820	result->SortByLabel();
821
822	// free temporary alignment
823	delete alignment.first;
824
825	return result;
826	}
827
828	/////////////////////////////////////////////////////////////////
829	// DoRelaxation()
830	//
831	// Performs one round of the consistency transformation. The
832	// formula used is:
833	// 1
834	// P'(x[i]-y[j]) = --- sum sum P(x[i]-z[k]) P(z[k]-y[j])
835	// \|S\| z in S k
836	//
837	// where S = {x, y, all other sequences...}
838	//
839	/////////////////////////////////////////////////////////////////
840
841	void DoRelaxation (MultiSequence sequences, SafeVector<SafeVector<SparseMatrix > > &sparseMatrices){
842	const int numSeqs = sequences->GetNumSequences();
843
844	SafeVector<SafeVector<SparseMatrix > > newSparseMatrices (numSeqs, SafeVector<SparseMatrix >(numSeqs, NULL));
845
846	// for every pair of sequences
847	for (int i = 0; i < numSeqs; i++){
848	for (int j = i+1; j < numSeqs; j++){
849	Sequence *seq1 = sequences->GetSequence (i);
850	Sequence *seq2 = sequences->GetSequence (j);
851
852	if (enableVerbose)
853	cerr << "Relaxing (" << i+1 << ") " << seq1->GetHeader() << " vs. "
854	<< "(" << j+1 << ") " << seq2->GetHeader() << ": ";
855
856	// get the original posterior matrix
857	VF *posteriorPtr = sparseMatrices[i][j]->GetPosterior(); assert (posteriorPtr);
858	VF &posterior = *posteriorPtr;
859
860	const int seq1Length = seq1->GetLength();
861	const int seq2Length = seq2->GetLength();
862
863	// contribution from the summation where z = x and z = y
864	for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] += posterior[k];
865
866	if (enableVerbose)
867	cerr << sparseMatrices[i][j]->GetNumCells() << " --> ";
868
869	// contribution from all other sequences
870	for (int k = 0; k < numSeqs; k++) if (k != i && k != j){
871	Relax (sparseMatrices[i][k], sparseMatrices[k][j], posterior);
872	}
873
874	// now renormalization
875	for (int k = 0; k < (seq1Length+1) * (seq2Length+1); k++) posterior[k] /= numSeqs;
876
877	// save the new posterior matrix
878	newSparseMatrices[i][j] = new SparseMatrix (seq1->GetLength(), seq2->GetLength(), posterior);
879	newSparseMatrices[j][i] = newSparseMatrices[i][j]->ComputeTranspose();
880
881	if (enableVerbose)
882	cerr << newSparseMatrices[i][j]->GetNumCells() << " -- ";
883
884	delete posteriorPtr;
885
886	if (enableVerbose)
887	cerr << "done." << endl;
888	}
889	}
890
891	// now replace the old posterior matrices
892	for (int i = 0; i < numSeqs; i++){
893	for (int j = 0; j < numSeqs; j++){
894	delete sparseMatrices[i][j];
895	sparseMatrices[i][j] = newSparseMatrices[i][j];
896	}
897	}
898	}
899
900	/////////////////////////////////////////////////////////////////
901	// DoRelaxation()
902	//
903	// Computes the consistency transformation for a single sequence
904	// z, and adds the transformed matrix to "posterior".
905	/////////////////////////////////////////////////////////////////
906
907	void Relax (SparseMatrix matXZ, SparseMatrix matZY, VF &posterior){
908
909	assert (matXZ);
910	assert (matZY);
911
912	int lengthX = matXZ->GetSeq1Length();
913	int lengthY = matZY->GetSeq2Length();
914	assert (matXZ->GetSeq2Length() == matZY->GetSeq1Length());
915
916	// for every x[i]
917	for (int i = 1; i <= lengthX; i++){
918	SafeVector<PIF>::iterator XZptr = matXZ->GetRowPtr(i);
919	SafeVector<PIF>::iterator XZend = XZptr + matXZ->GetRowSize(i);
920
921	VF::iterator base = posterior.begin() + i * (lengthY + 1);
922
923	// iterate through all x[i]-z[k]
924	while (XZptr != XZend){
925	SafeVector<PIF>::iterator ZYptr = matZY->GetRowPtr(XZptr->first);
926	SafeVector<PIF>::iterator ZYend = ZYptr + matZY->GetRowSize(XZptr->first);
927	const float XZval = XZptr->second;
928
929	// iterate through all z[k]-y[j]
930	while (ZYptr != ZYend){
931	base[ZYptr->first] += XZval * ZYptr->second;;
932	ZYptr++;
933	}
934	XZptr++;
935	}
936	}
937	}
938
939	/////////////////////////////////////////////////////////////////
940	// DoIterativeRefinement()
941	//
942	// Performs a single round of randomized partionining iterative
943	// refinement.
944	/////////////////////////////////////////////////////////////////
945
946	void DoIterativeRefinement (const SafeVector<SafeVector<SparseMatrix *> > &sparseMatrices,
947	const ProbabilisticModel &model, MultiSequence* &alignment){
948	set<int> groupOne, groupTwo;
949
950	// create two separate groups
951	for (int i = 0; i < alignment->GetNumSequences(); i++){
952	if (random() % 2)
953	groupOne.insert (i);
954	else
955	groupTwo.insert (i);
956	}
957
958	if (groupOne.empty() \|\| groupTwo.empty()) return;
959
960	// project into the two groups
961	MultiSequence *groupOneSeqs = alignment->Project (groupOne); assert (groupOneSeqs);
962	MultiSequence *groupTwoSeqs = alignment->Project (groupTwo); assert (groupTwoSeqs);
963	delete alignment;
964
965	// realign
966	alignment = AlignAlignments (groupOneSeqs, groupTwoSeqs, sparseMatrices, model);
967	}
968
969	/*
970	float ScoreAlignment (MultiSequence alignment, MultiSequence sequences, SparseMatrix **sparseMatrices, const int numSeqs){
971	int totLength = 0;
972	float score = 0;
973
974	for (int a = 0; a < alignment->GetNumSequences(); a++){
975	for (int b = a+1; b < alignment->GetNumSequences(); b++){
976	Sequence *seq1 = alignment->GetSequence(a);
977	Sequence *seq2 = alignment->GetSequence(b);
978
979	const int seq1Length = sequences->GetSequence(seq1->GetLabel())->GetLength();
980	const int seq2Length = sequences->GetSequence(seq2->GetLabel())->GetLength();
981
982	totLength += min (seq1Length, seq2Length);
983
984	int pos1 = 0, pos2 = 0;
985	for (int i = 1; i <= seq1->GetLength(); i++){
986	char ch1 = seq1->GetPosition(i);
987	char ch2 = seq2->GetPosition(i);
988
989	if (ch1 != '-') pos1++;
990	if (ch2 != '-') pos2++;
991	if (ch1 != '-' && ch2 != '-'){
992	score += sparseMatrices[a * numSeqs + b]->GetValue (pos1, pos2);
993	}
994	}
995	}
996	}
997
998	return score / totLength;
999	}
1000	*/

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: branches/lib/GDE/MAFFT/mafft-7.055-with-extensions/extensions/mxscarna_src/probconsRNA/FixRef.cc

Download in other formats: