Context Navigation

fastdistmafft.cpp

Visit:

Last change on this file was 10390, checked in by aboeckma, 12 years ago
added muscle sourcles amd makefile
File size: 7.3 KB

Line
1	#include "muscle.h"
2	#include "distfunc.h"
3	#include "seqvect.h"
4	#include <math.h>
5
6	#define TRACE 0
7
8	#define MIN(x, y) (((x) < (y)) ? (x) : (y))
9	#define MAX(x, y) (((x) > (y)) ? (x) : (y))
10
11	const unsigned TUPLE_COUNT = 66666*6;
12	static unsigned char Count1[TUPLE_COUNT];
13	static unsigned char Count2[TUPLE_COUNT];
14
15	// Amino acid groups according to MAFFT (sextet5)
16	// 0 = A G P S T
17	// 1 = I L M V
18	// 2 = N D Q E B Z
19	// 3 = R H K
20	// 4 = F W Y
21	// 5 = C
22	// 6 = X . - U
23	unsigned ResidueGroup[] =
24	{
25	0, // AX_A,
26	5, // AX_C,
27	2, // AX_D,
28	2, // AX_E,
29	4, // AX_F,
30	0, // AX_G,
31	3, // AX_H,
32	1, // AX_I,
33	3, // AX_K,
34	1, // AX_L,
35	1, // AX_M,
36	2, // AX_N,
37	0, // AX_P,
38	2, // AX_Q,
39	3, // AX_R,
40	0, // AX_S,
41	0, // AX_T,
42	1, // AX_V,
43	4, // AX_W,
44	4, // AX_Y,
45
46	2, // AX_B, // D or N
47	2, // AX_Z, // E or Q
48	0, // AX_X, // Unknown // ****** TODO ***********
49	// This isn't the correct way of avoiding group 6
50	0 // AX_GAP, // ****** TODO ****************
51	};
52	unsigned uResidueGroupCount = sizeof(ResidueGroup)/sizeof(ResidueGroup[0]);
53
54	static char *TupleToStr(int t)
55	{
56	static char s[7];
57	int t1, t2, t3, t4, t5, t6;
58
59	t1 = t%6;
60	t2 = (t/6)%6;
61	t3 = (t/(6*6))%6;
62	t4 = (t/(666))%6;
63	t5 = (t/(666*6))%6;
64	t6 = (t/(66666))%6;
65
66	s[5] = '0' + t1;
67	s[4] = '0' + t2;
68	s[3] = '0' + t3;
69	s[2] = '0' + t4;
70	s[1] = '0' + t5;
71	s[0] = '0' + t6;
72	return s;
73	}
74
75	static unsigned GetTuple(const unsigned uLetters[], unsigned n)
76	{
77	assert(uLetters[n] < uResidueGroupCount);
78	assert(uLetters[n+1] < uResidueGroupCount);
79	assert(uLetters[n+2] < uResidueGroupCount);
80	assert(uLetters[n+3] < uResidueGroupCount);
81	assert(uLetters[n+4] < uResidueGroupCount);
82	assert(uLetters[n+5] < uResidueGroupCount);
83
84	unsigned u1 = ResidueGroup[uLetters[n]];
85	unsigned u2 = ResidueGroup[uLetters[n+1]];
86	unsigned u3 = ResidueGroup[uLetters[n+2]];
87	unsigned u4 = ResidueGroup[uLetters[n+3]];
88	unsigned u5 = ResidueGroup[uLetters[n+4]];
89	unsigned u6 = ResidueGroup[uLetters[n+5]];
90
91	return u6 + u56 + u466 + u3666 + u26666 + u16666*6;
92	}
93
94	static void CountTuples(const unsigned L[], unsigned uTupleCount, unsigned char Count[])
95	{
96	memset(Count, 0, TUPLE_COUNT*sizeof(unsigned char));
97	for (unsigned n = 0; n < uTupleCount; ++n)
98	{
99	const unsigned uTuple = GetTuple(L, n);
100	++(Count[uTuple]);
101	}
102	}
103
104	static void ListCount(const unsigned char Count[])
105	{
106	for (unsigned n = 0; n < TUPLE_COUNT; ++n)
107	{
108	if (0 == Count[n])
109	continue;
110	Log("%s %u\n", TupleToStr(n), Count[n]);
111	}
112	}
113
114	void DistKmer6_6(const SeqVect &v, DistFunc &DF)
115	{
116	const unsigned uSeqCount = v.Length();
117
118	DF.SetCount(uSeqCount);
119	if (0 == uSeqCount)
120	return;
121
122	// Initialize distance matrix to zero
123	for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
124	{
125	DF.SetDist(uSeq1, uSeq1, 0);
126	for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
127	DF.SetDist(uSeq1, uSeq2, 0);
128	}
129
130	// Convert to letters
131	unsigned *Letters = new unsigned [uSeqCount];
132	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
133	{
134	Seq &s = *(v[uSeqIndex]);
135	const unsigned uSeqLength = s.Length();
136	unsigned *L = new unsigned[uSeqLength];
137	Letters[uSeqIndex] = L;
138	for (unsigned n = 0; n < uSeqLength; ++n)
139	{
140	char c = s[n];
141	L[n] = CharToLetterEx(c);
142	assert(L[n] < uResidueGroupCount);
143	}
144	}
145
146	unsigned *uCommonTupleCount = new unsigned [uSeqCount];
147	for (unsigned n = 0; n < uSeqCount; ++n)
148	{
149	uCommonTupleCount[n] = new unsigned[uSeqCount];
150	memset(uCommonTupleCount[n], 0, uSeqCount*sizeof(unsigned));
151	}
152
153	const unsigned uPairCount = (uSeqCount*(uSeqCount + 1))/2;
154	unsigned uCount = 0;
155	for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
156	{
157	Seq &seq1 = *(v[uSeq1]);
158	const unsigned uSeqLength1 = seq1.Length();
159	if (uSeqLength1 < 5)
160	continue;
161
162	const unsigned uTupleCount = uSeqLength1 - 5;
163	const unsigned *L = Letters[uSeq1];
164	CountTuples(L, uTupleCount, Count1);
165	#if TRACE
166	{
167	Log("Seq1=%d\n", uSeq1);
168	Log("Groups:\n");
169	for (unsigned n = 0; n < uSeqLength1; ++n)
170	Log("%u", ResidueGroup[L[n]]);
171	Log("\n");
172
173	Log("Tuples:\n");
174	ListCount(Count1);
175	}
176	#endif
177
178	SetProgressDesc("K-mer dist pass 1");
179	for (unsigned uSeq2 = 0; uSeq2 <= uSeq1; ++uSeq2)
180	{
181	if (0 == uCount%500)
182	Progress(uCount, uPairCount);
183	++uCount;
184	Seq &seq2 = *(v[uSeq2]);
185	const unsigned uSeqLength2 = seq2.Length();
186	if (uSeqLength2 < 5)
187	{
188	if (uSeq1 == uSeq2)
189	DF.SetDist(uSeq1, uSeq2, 0);
190	else
191	DF.SetDist(uSeq1, uSeq2, 1);
192	continue;
193	}
194
195	// First pass through seq 2 to count tuples
196	const unsigned uTupleCount = uSeqLength2 - 5;
197	const unsigned *L = Letters[uSeq2];
198	CountTuples(L, uTupleCount, Count2);
199	#if TRACE
200	Log("Seq2=%d Counts=\n", uSeq2);
201	ListCount(Count2);
202	#endif
203
204	// Second pass to accumulate sum of shared tuples
205	// MAFFT defines this as the sum over unique tuples
206	// in seq2 of the minimum of the number of tuples found
207	// in the two sequences.
208	unsigned uSum = 0;
209	for (unsigned n = 0; n < uTupleCount; ++n)
210	{
211	const unsigned uTuple = GetTuple(L, n);
212	uSum += MIN(Count1[uTuple], Count2[uTuple]);
213
214	// This is a hack to make sure each unique tuple counted only once.
215	Count2[uTuple] = 0;
216	}
217	#if TRACE
218	{
219	Seq &s1 = *(v[uSeq1]);
220	Seq &s2 = *(v[uSeq2]);
221	const char *pName1 = s1.GetName();
222	const char *pName2 = s2.GetName();
223	Log("Common count %s(%d) - %s(%d) =%u\n",
224	pName1, uSeq1, pName2, uSeq2, uSum);
225	}
226	#endif
227	uCommonTupleCount[uSeq1][uSeq2] = uSum;
228	uCommonTupleCount[uSeq2][uSeq1] = uSum;
229	}
230	}
231	ProgressStepsDone();
232
233	uCount = 0;
234	SetProgressDesc("K-mer dist pass 2");
235	for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
236	{
237	Seq &s1 = *(v[uSeq1]);
238	const char *pName1 = s1.GetName();
239
240	double dCommonTupleCount11 = uCommonTupleCount[uSeq1][uSeq1];
241	if (0 == dCommonTupleCount11)
242	dCommonTupleCount11 = 1;
243
244	DF.SetDist(uSeq1, uSeq1, 0);
245	for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
246	{
247	if (0 == uCount%500)
248	Progress(uCount, uPairCount);
249	++uCount;
250
251	double dCommonTupleCount22 = uCommonTupleCount[uSeq2][uSeq2];
252	if (0 == dCommonTupleCount22)
253	dCommonTupleCount22 = 1;
254
255	const double dDist1 = 3.0*(dCommonTupleCount11 - uCommonTupleCount[uSeq1][uSeq2])
256	/dCommonTupleCount11;
257	const double dDist2 = 3.0*(dCommonTupleCount22 - uCommonTupleCount[uSeq1][uSeq2])
258	/dCommonTupleCount22;
259
260	// dMinDist is the value used for tree-building in MAFFT
261	const double dMinDist = MIN(dDist1, dDist2);
262	DF.SetDist(uSeq1, uSeq2, (float) dMinDist);
263
264	//const double dEstimatedPctId = TupleDistToEstimatedPctId(dMinDist);
265	//g_dfPwId.SetDist(uSeq1, uSeq2, dEstimatedPctId);
266	// ** TODO ** why does this make score slightly worse??
267	//const double dKimuraDist = KimuraDist(dEstimatedPctId);
268	//DF.SetDist(uSeq1, uSeq2, dKimuraDist);
269	}
270	}
271	ProgressStepsDone();
272
273	for (unsigned n = 0; n < uSeqCount; ++n)
274	delete[] uCommonTupleCount[n];
275	delete[] uCommonTupleCount;
276	delete[] Letters;
277	}
278
279	double PctIdToMAFFTDist(double dPctId)
280	{
281	if (dPctId < 0.05)
282	dPctId = 0.05;
283	double dDist = -log(dPctId);
284	return dDist;
285	}
286
287	double PctIdToHeightMAFFT(double dPctId)
288	{
289	return PctIdToMAFFTDist(dPctId);
290	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/GDE/MUSCLE/src/fastdistmafft.cpp

Download in other formats: