Context Navigation

fastdistnuc.cpp

Visit:

Last change on this file was 10390, checked in by aboeckma, 12 years ago
added muscle sourcles amd makefile
File size: 6.8 KB

Line
1	#include "muscle.h"
2	#include "distfunc.h"
3	#include "seqvect.h"
4	#include <math.h>
5
6	#define TRACE 0
7
8	#define MIN(x, y) (((x) < (y)) ? (x) : (y))
9	#define MAX(x, y) (((x) > (y)) ? (x) : (y))
10
11	const unsigned TUPLE_COUNT = 66666*6;
12	static unsigned char Count1[TUPLE_COUNT];
13	static unsigned char Count2[TUPLE_COUNT];
14
15	// Nucleotide groups according to MAFFT (sextet5)
16	// 0 = A
17	// 1 = C
18	// 2 = G
19	// 3 = T
20	// 4 = other
21
22	static unsigned ResidueGroup[] =
23	{
24	0, // NX_A,
25	1, // NX_C,
26	2, // NX_G,
27	3, // NX_T/U
28	4, // NX_N,
29	4, // NX_R,
30	4, // NX_Y,
31	4, // NX_GAP
32	};
33	static unsigned uResidueGroupCount = sizeof(ResidueGroup)/sizeof(ResidueGroup[0]);
34
35	static char *TupleToStr(int t)
36	{
37	static char s[7];
38	int t1, t2, t3, t4, t5, t6;
39
40	t1 = t%6;
41	t2 = (t/6)%6;
42	t3 = (t/(6*6))%6;
43	t4 = (t/(666))%6;
44	t5 = (t/(666*6))%6;
45	t6 = (t/(66666))%6;
46
47	s[5] = '0' + t1;
48	s[4] = '0' + t2;
49	s[3] = '0' + t3;
50	s[2] = '0' + t4;
51	s[1] = '0' + t5;
52	s[0] = '0' + t6;
53	return s;
54	}
55
56	static unsigned GetTuple(const unsigned uLetters[], unsigned n)
57	{
58	assert(uLetters[n] < uResidueGroupCount);
59	assert(uLetters[n+1] < uResidueGroupCount);
60	assert(uLetters[n+2] < uResidueGroupCount);
61	assert(uLetters[n+3] < uResidueGroupCount);
62	assert(uLetters[n+4] < uResidueGroupCount);
63	assert(uLetters[n+5] < uResidueGroupCount);
64
65	unsigned u1 = ResidueGroup[uLetters[n]];
66	unsigned u2 = ResidueGroup[uLetters[n+1]];
67	unsigned u3 = ResidueGroup[uLetters[n+2]];
68	unsigned u4 = ResidueGroup[uLetters[n+3]];
69	unsigned u5 = ResidueGroup[uLetters[n+4]];
70	unsigned u6 = ResidueGroup[uLetters[n+5]];
71
72	return u6 + u56 + u466 + u3666 + u26666 + u16666*6;
73	}
74
75	static void CountTuples(const unsigned L[], unsigned uTupleCount, unsigned char Count[])
76	{
77	memset(Count, 0, TUPLE_COUNT*sizeof(unsigned char));
78	for (unsigned n = 0; n < uTupleCount; ++n)
79	{
80	const unsigned uTuple = GetTuple(L, n);
81	++(Count[uTuple]);
82	}
83	}
84
85	static void ListCount(const unsigned char Count[])
86	{
87	for (unsigned n = 0; n < TUPLE_COUNT; ++n)
88	{
89	if (0 == Count[n])
90	continue;
91	Log("%s %u\n", TupleToStr(n), Count[n]);
92	}
93	}
94
95	void DistKmer4_6(const SeqVect &v, DistFunc &DF)
96	{
97	if (ALPHA_DNA != g_Alpha && ALPHA_RNA != g_Alpha)
98	Quit("DistKmer4_6 requires nucleo alphabet");
99
100	const unsigned uSeqCount = v.Length();
101
102	DF.SetCount(uSeqCount);
103	if (0 == uSeqCount)
104	return;
105
106	// Initialize distance matrix to zero
107	for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
108	{
109	DF.SetDist(uSeq1, uSeq1, 0);
110	for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
111	DF.SetDist(uSeq1, uSeq2, 0);
112	}
113
114	// Convert to letters
115	unsigned *Letters = new unsigned [uSeqCount];
116	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
117	{
118	Seq &s = *(v[uSeqIndex]);
119	const unsigned uSeqLength = s.Length();
120	unsigned *L = new unsigned[uSeqLength];
121	Letters[uSeqIndex] = L;
122	for (unsigned n = 0; n < uSeqLength; ++n)
123	{
124	char c = s[n];
125	L[n] = CharToLetterEx(c);
126	if (L[n] >= 4)
127	L[n] = 4;
128	}
129	}
130
131	unsigned *uCommonTupleCount = new unsigned [uSeqCount];
132	for (unsigned n = 0; n < uSeqCount; ++n)
133	{
134	uCommonTupleCount[n] = new unsigned[uSeqCount];
135	memset(uCommonTupleCount[n], 0, uSeqCount*sizeof(unsigned));
136	}
137
138	const unsigned uPairCount = (uSeqCount*(uSeqCount + 1))/2;
139	unsigned uCount = 0;
140	for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
141	{
142	Seq &seq1 = *(v[uSeq1]);
143	const unsigned uSeqLength1 = seq1.Length();
144	if (uSeqLength1 < 5)
145	continue;
146
147	const unsigned uTupleCount = uSeqLength1 - 5;
148	const unsigned *L = Letters[uSeq1];
149	CountTuples(L, uTupleCount, Count1);
150	#if TRACE
151	{
152	Log("Seq1=%d\n", uSeq1);
153	Log("Groups:\n");
154	for (unsigned n = 0; n < uSeqLength1; ++n)
155	Log("%u", ResidueGroup[L[n]]);
156	Log("\n");
157
158	Log("Tuples:\n");
159	ListCount(Count1);
160	}
161	#endif
162
163	SetProgressDesc("K-mer dist pass 1");
164	for (unsigned uSeq2 = 0; uSeq2 <= uSeq1; ++uSeq2)
165	{
166	if (0 == uCount%500)
167	Progress(uCount, uPairCount);
168	++uCount;
169	Seq &seq2 = *(v[uSeq2]);
170	const unsigned uSeqLength2 = seq2.Length();
171	if (uSeqLength2 < 5)
172	{
173	if (uSeq1 == uSeq2)
174	DF.SetDist(uSeq1, uSeq2, 0);
175	else
176	DF.SetDist(uSeq1, uSeq2, 1);
177	continue;
178	}
179
180	// First pass through seq 2 to count tuples
181	const unsigned uTupleCount = uSeqLength2 - 5;
182	const unsigned *L = Letters[uSeq2];
183	CountTuples(L, uTupleCount, Count2);
184	#if TRACE
185	Log("Seq2=%d Counts=\n", uSeq2);
186	ListCount(Count2);
187	#endif
188
189	// Second pass to accumulate sum of shared tuples
190	// MAFFT defines this as the sum over unique tuples
191	// in seq2 of the minimum of the number of tuples found
192	// in the two sequences.
193	unsigned uSum = 0;
194	for (unsigned n = 0; n < uTupleCount; ++n)
195	{
196	const unsigned uTuple = GetTuple(L, n);
197	uSum += MIN(Count1[uTuple], Count2[uTuple]);
198
199	// This is a hack to make sure each unique tuple counted only once.
200	Count2[uTuple] = 0;
201	}
202	#if TRACE
203	{
204	Seq &s1 = *(v[uSeq1]);
205	Seq &s2 = *(v[uSeq2]);
206	const char *pName1 = s1.GetName();
207	const char *pName2 = s2.GetName();
208	Log("Common count %s(%d) - %s(%d) =%u\n",
209	pName1, uSeq1, pName2, uSeq2, uSum);
210	}
211	#endif
212	uCommonTupleCount[uSeq1][uSeq2] = uSum;
213	uCommonTupleCount[uSeq2][uSeq1] = uSum;
214	}
215	}
216	ProgressStepsDone();
217
218	uCount = 0;
219	SetProgressDesc("K-mer dist pass 2");
220	for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
221	{
222	Seq &s1 = *(v[uSeq1]);
223	const char *pName1 = s1.GetName();
224
225	double dCommonTupleCount11 = uCommonTupleCount[uSeq1][uSeq1];
226	if (0 == dCommonTupleCount11)
227	dCommonTupleCount11 = 1;
228
229	DF.SetDist(uSeq1, uSeq1, 0);
230	for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
231	{
232	if (0 == uCount%500)
233	Progress(uCount, uPairCount);
234	++uCount;
235
236	double dCommonTupleCount22 = uCommonTupleCount[uSeq2][uSeq2];
237	if (0 == dCommonTupleCount22)
238	dCommonTupleCount22 = 1;
239
240	const double dDist1 = 3.0*(dCommonTupleCount11 - uCommonTupleCount[uSeq1][uSeq2])
241	/dCommonTupleCount11;
242	const double dDist2 = 3.0*(dCommonTupleCount22 - uCommonTupleCount[uSeq1][uSeq2])
243	/dCommonTupleCount22;
244
245	// dMinDist is the value used for tree-building in MAFFT
246	const double dMinDist = MIN(dDist1, dDist2);
247	DF.SetDist(uSeq1, uSeq2, (float) dMinDist);
248
249	//const double dEstimatedPctId = TupleDistToEstimatedPctId(dMinDist);
250	//g_dfPwId.SetDist(uSeq1, uSeq2, dEstimatedPctId);
251	// ** TODO ** why does this make score slightly worse??
252	//const double dKimuraDist = KimuraDist(dEstimatedPctId);
253	//DF.SetDist(uSeq1, uSeq2, dKimuraDist);
254	}
255	}
256	ProgressStepsDone();
257
258	for (unsigned n = 0; n < uSeqCount; ++n)
259	{
260	delete[] uCommonTupleCount[n];
261	delete[] Letters[n];
262	}
263	delete[] uCommonTupleCount;
264	delete[] Letters;
265	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: branches/stable/GDE/MUSCLE/src/fastdistnuc.cpp

Download in other formats: