Context Navigation

fastdistkmer.cpp

Visit:

Last change on this file was 10390, checked in by aboeckma, 12 years ago
added muscle sourcles amd makefile
File size: 6.4 KB

Line
1	#include "muscle.h"
2	#include "msa.h"
3	#include "seqvect.h"
4	#include "seq.h"
5	#include "distfunc.h"
6	#include <math.h>
7
8	#define TRACE 0
9
10	/***
11	Some candidate alphabets considered because they
12	have high correlations and small table sizes.
13	Correlation coefficent is between k-mer distance
14	and %id D measured from a CLUSTALW alignment.
15	Table size is N^k where N is size of alphabet.
16	A is standard (uncompressed) amino alphabet.
17
18	Correlation
19	Alpha N k Table Size all 25-50%
20	----- -- - ---------- ---- ------
21	A 20 3 8,000 0.943 0.575
22	A 20 4 160,000 0.962 0.685 <<
23	LiA 14 4 38,416 0.966 0.645
24	SEB 14 4 38,416 0.964 0.634
25	LiA 13 4 28,561 0.965 0.640
26	LiA 12 4 20,736 0.963 0.620
27	LiA 10 5 100,000 0.964 0.652
28
29	We select A with k=4 because it has the best
30	correlations. The only drawback is a large table
31	size, but space is readily available and the only
32	additional time cost is in resetting the table to
33	zero, which can be done quickly with memset or by
34	keeping a list of the k-mers that were found (should
35	test to see which is faster, and may vary by compiler
36	and processor type). It also has the minor advantage
37	that we don't need to convert the alphabet.
38
39	Fractional identity d is estimated as follows.
40
41	F = fractional k-mer count
42	if F is 0: F = 0.01
43	Y = log(0.02 + F)
44	d = -4.1 + 4.12*Y
45
46	The constant 0.02 was chosen to make the relationship
47	between Y and D linear. The constants -4.1 and 4.12
48	were chosen to fit a straight line to the scatterplot
49	of Y vs D.
50	***/
51
52	#define MIN(x, y) (((x) < (y)) ? (x) : (y))
53
54	const unsigned K = 4;
55	const unsigned N = 20;
56	const unsigned N_2 = 20*20;
57	const unsigned N_3 = 202020;
58	const unsigned N_4 = 202020*20;
59
60	const unsigned TABLE_SIZE = N_4;
61
62	// For debug output
63	const char *KmerToStr(unsigned Kmer)
64	{
65	static char s[5];
66
67	unsigned c3 = (Kmer/N_3)%N;
68	unsigned c2 = (Kmer/N_2)%N;
69	unsigned c1 = (Kmer/N)%N;
70	unsigned c0 = Kmer%N;
71
72	s[0] = LetterToChar(c3);
73	s[1] = LetterToChar(c2);
74	s[2] = LetterToChar(c1);
75	s[3] = LetterToChar(c0);
76	return s;
77	}
78
79	void CountKmers(const byte s[], unsigned uSeqLength, byte KmerCounts[])
80	{
81	#if TRACE
82	Log("CountKmers\n");
83	#endif
84	memset(KmerCounts, 0, TABLE_SIZE*sizeof(byte));
85
86	const byte *ptrKmerStart = s;
87	const byte *ptrKmerEnd = s + 4;
88	const byte *ptrSeqEnd = s + uSeqLength;
89
90	unsigned c3 = s[0]*N_3;
91	unsigned c2 = s[1]*N_2;
92	unsigned c1 = s[2]*N;
93	unsigned c0 = s[3];
94
95	unsigned Kmer = c3 + c2 + c1 + c0;
96
97	for (;;)
98	{
99	assert(Kmer < TABLE_SIZE);
100
101	#if TRACE
102	Log("Kmer=%d=%s\n", Kmer, KmerToStr(Kmer));
103	#endif
104	++(KmerCounts[Kmer]);
105
106	if (ptrKmerEnd == ptrSeqEnd)
107	break;
108
109	// Compute k-mer as function of previous k-mer:
110	// 1. Subtract first letter from previous k-mer.
111	// 2. Multiply by N.
112	// 3. Add next letter.
113	c3 = (ptrKmerStart++) N_3;
114	Kmer = (Kmer - c3)*N;
115	Kmer += *ptrKmerEnd++;
116	}
117	}
118
119	unsigned CommonKmerCount(const byte Seq[], unsigned uSeqLength,
120	const byte KmerCounts1[], const byte Seq2[], unsigned uSeqLength2)
121	{
122	byte KmerCounts2[TABLE_SIZE];
123	CountKmers(Seq2, uSeqLength2, KmerCounts2);
124
125	const byte *ptrKmerStart = Seq;
126	const byte *ptrKmerEnd = Seq + 4;
127	const byte *ptrSeqEnd = Seq + uSeqLength;
128
129	unsigned c3 = Seq[0]*N_3;
130	unsigned c2 = Seq[1]*N_2;
131	unsigned c1 = Seq[2]*N;
132	unsigned c0 = Seq[3];
133
134	unsigned Kmer = c3 + c2 + c1 + c0;
135
136	unsigned uCommonCount = 0;
137	for (;;)
138	{
139	assert(Kmer < TABLE_SIZE);
140
141	const byte Count1 = KmerCounts1[Kmer];
142	const byte Count2 = KmerCounts2[Kmer];
143
144	uCommonCount += MIN(Count1, Count2);
145
146	// Hack so we don't double-count
147	KmerCounts2[Kmer] = 0;
148
149	if (ptrKmerEnd == ptrSeqEnd)
150	break;
151
152	// Compute k-mer as function of previous k-mer:
153	// 1. Subtract first letter from previous k-mer.
154	// 2. Multiply by N.
155	// 3. Add next letter.
156	c3 = (ptrKmerStart++) N_3;
157	Kmer = (Kmer - c3)*N;
158	Kmer += *ptrKmerEnd++;
159	}
160	return uCommonCount;
161	}
162
163	static void SeqToLetters(const Seq &s, byte Letters[])
164	{
165	const unsigned uSeqLength = s.Length();
166	for (unsigned uCol = 0; uCol < uSeqLength; ++uCol)
167	{
168	char c = s.GetChar(uCol);
169	// Ugly hack. My k-mer counting code isn't wild-card
170	// aware. Arbitrarily replace wildcards by a specific
171	// amino acid.
172	if (IsWildcardChar(c))
173	c = 'A';
174	*Letters++ = CharToLetter(c);
175	}
176	}
177
178	void FastDistKmer(const SeqVect &v, DistFunc &DF)
179	{
180	byte KmerCounts[TABLE_SIZE];
181
182	const unsigned uSeqCount = v.GetSeqCount();
183
184	DF.SetCount(uSeqCount);
185	if (0 == uSeqCount)
186	return;
187
188	// Initialize distance matrix to zero
189	for (unsigned uSeq1 = 0; uSeq1 < uSeqCount; ++uSeq1)
190	{
191	DF.SetDist(uSeq1, uSeq1, 0);
192	for (unsigned uSeq2 = 0; uSeq2 < uSeq1; ++uSeq2)
193	DF.SetDist(uSeq1, uSeq2, 0);
194	}
195
196	unsigned uMaxLength = 0;
197	for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
198	{
199	const Seq &s = v.GetSeq(uSeqIndex);
200	unsigned uSeqLength = s.Length();
201	if (uSeqLength > uMaxLength)
202	uMaxLength = uSeqLength;
203	}
204	if (0 == uMaxLength)
205	return;
206
207	byte *Seq1Letters = new byte[uMaxLength];
208	byte *Seq2Letters = new byte[uMaxLength];
209
210	for (unsigned uSeqIndex1 = 0; uSeqIndex1 < uSeqCount - 1; ++uSeqIndex1)
211	{
212	const Seq &s1 = v.GetSeq(uSeqIndex1);
213	const unsigned uSeqLength1 = s1.Length();
214
215	SeqToLetters(s1, Seq1Letters);
216	CountKmers(Seq1Letters, uSeqLength1, KmerCounts);
217
218	for (unsigned uSeqIndex2 = uSeqIndex1 + 1; uSeqIndex2 < uSeqCount;
219	++uSeqIndex2)
220	{
221	const Seq &s2 = v.GetSeq(uSeqIndex2);
222	const unsigned uSeqLength2 = s2.Length();
223
224	SeqToLetters(s2, Seq2Letters);
225
226	unsigned uCommonKmerCount = CommonKmerCount(Seq1Letters, uSeqLength1,
227	KmerCounts, Seq2Letters, uSeqLength2);
228
229	unsigned uMinLength = MIN(uSeqLength1, uSeqLength2);
230	double F = (double) uCommonKmerCount / (uMinLength - K + 1);
231	if (0.0 == F)
232	F = 0.01;
233	double Y = log(0.02 + F);
234	double EstimatedPctId = Y/4.12 + 0.995;
235	double KD = KimuraDist(EstimatedPctId);
236	// DF.SetDist(uSeqIndex1, uSeqIndex2, (float) KD);
237	DF.SetDist(uSeqIndex1, uSeqIndex2, (float) (1 - F));
238	#if TRACE
239	Log("CommonCount=%u, MinLength=%u, F=%6.4f Y=%6.4f, %%id=%6.4f, KimuraDist=%8.4f\n",
240	uCommonKmerCount, uMinLength, F, Y, EstimatedPctId, KD);
241	#endif
242	}
243	}
244
245	delete[] Seq1Letters;
246	delete[] Seq2Letters;
247	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/GDE/MUSCLE/src/fastdistkmer.cpp

Download in other formats: