Context Navigation

source: tags/arb_5.0/AWT/AWT_codon_table.cxx

Visit:

Last change on this file was 5773, checked in by westram, 16 years ago
If not a valid IUPAC, AWT_decode_iupac() now returns "" (previously " ") for easier testing.
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 29.1 KB

Line
1	#include <stdio.h>
2	#include <stdlib.h>
3	#include <string.h>
4	#include <ctype.h>
5
6	#include "awt_iupac.hxx"
7	#include "awt_codon_table.hxx"
8
9	// const int AWAR_PROTEIN_TYPE_bacterial_code_index = 8;
10
11	#define EMBL_BACTERIAL_TABLE_INDEX 11
12
13	// Info about translation codes was taken from
14	// http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
15
16	struct AWT_Codon_Code_Definition AWT_codon_def[AWT_CODON_TABLES+1] =
17	{
18	// 0000000001111111111222222222233333333334444444444555555555566666
19	// 1234567890123456789012345678901234567890123456789012345678901234
20
21	// "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG", base1
22	// "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG", base2
23	// "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG" base3
24	{
25	" (1) Standard code",
26	"FFLLSSSSYY*CCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", // The first code in this table has to be 'Standard code'!
27	"---M---------------M---------------M----------------------------",
28	1
29	},
30	{
31	" (2) Vertebrate mitochondrial code",
32	"FFLLSSSSYYCCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSVVVVAAAADDEEGGGG",
33	"--------------------------------MMMM---------------M------------",
34	2
35	},
36	{
37	" (3) Yeast mitochondrial code",
38	"FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
39	"----------------------------------MM----------------------------",
40	3
41	},
42	{
43	" (4) Mold/Protozoan/Coelenterate mito. + Mycoplasma/Spiroplasma code",
44	"FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
45	"--MM---------------M------------MMMM---------------M------------",
46	4
47	},
48	{
49	" (5) Invertebrate mitochondrial code",
50	"FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG",
51	"---M----------------------------MMMM---------------M------------",
52	5
53	},
54	{
55	" (6) Ciliate, Dasycladacean and Hexamita nuclear code",
56	"FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
57	"-----------------------------------M----------------------------",
58	6
59	},
60	{
61	" (9) Echinoderm and Flatworm mitochondrial code",
62	"FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG",
63	"-----------------------------------M---------------M------------",
64	9
65	},
66	{
67	"(10) Euplotid nuclear code",
68	"FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
69	"-----------------------------------M----------------------------",
70	10
71	},
72	// 0000000001111111111222222222233333333334444444444555555555566666
73	// 1234567890123456789012345678901234567890123456789012345678901234
74
75	// "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG", base1
76	// "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG", base2
77	// "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG" base3
78	{
79	"(11) Bacterial and Plant Plastid code",
80	"FFLLSSSSYY*CCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
81	"---M---------------M------------MMMM---------------M------------",
82	11
83	},
84	{
85	"(12) Alternative Yeast nuclear code",
86	"FFLLSSSSYY*CCWLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
87	"-------------------M---------------M----------------------------",
88	12
89	},
90	{
91	"(13) Ascidian mitochondrial code",
92	"FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG",
93	"---M------------------------------MM---------------M------------",
94	13
95	},
96	{
97	"(14) Alternative Flatworm mitochondrial code",
98	"FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG",
99	"-----------------------------------M----------------------------",
100	14
101	},
102	{
103	"(15) Blepharisma nuclear code",
104	"FFLLSSSSYYQCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
105	"-----------------------------------M----------------------------",
106	15
107	},
108	{
109	"(16) Chlorophycean mitochondrial code",
110	"FFLLSSSSYYLCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
111	"-----------------------------------M----------------------------",
112	16
113	},
114	{
115	"(21) Trematode mitochondrial code",
116	"FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG",
117	"-----------------------------------M---------------M------------",
118	21
119	},
120	{
121	"(22) Scenedesmus obliquus mitochondrial code",
122	"FFLLSSSYYLCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
123	"-----------------------------------M----------------------------",
124	22
125	},
126	{
127	"(23) Thraustochytrium mitochondrial code",
128	"FFLSSSSYYCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
129	"--------------------------------M--M---------------M------------",
130	23
131	},
132
133	{ 0, 0, 0, 0 } // end of table-marker
134	};
135
136	#define MAX_EMBL_TRANSL_TABLE_VALUE 23 // maximum known EMBL transl_table value
137
138	int AWT_embl_transl_table_2_arb_code_nr(int embl_code_nr) {
139	// returns -1 if embl_code_nr is not known by ARB
140
141	static bool initialized = false;
142	static int arb_code_nr_table[MAX_EMBL_TRANSL_TABLE_VALUE+1]; // key: embl_code_nr, value: arb_code_nr or -1
143
144	if (!initialized) {
145	for (int embl = 0; embl <= MAX_EMBL_TRANSL_TABLE_VALUE; ++embl) {
146	arb_code_nr_table[embl] = -1; // illegal table
147	}
148	for (int arb_code_nr = 0; arb_code_nr < AWT_CODON_TABLES; ++arb_code_nr) {
149	arb_code_nr_table[AWT_codon_def[arb_code_nr].embl_feature_transl_table] = arb_code_nr;
150	}
151	// should be index of 'Bacterial and Plant Plastid code'
152	// (otherwise maybe AWAR_PROTEIN_TYPE_bacterial_code_index is wrong)
153	awt_assert(arb_code_nr_table[EMBL_BACTERIAL_TABLE_INDEX] == AWAR_PROTEIN_TYPE_bacterial_code_index);
154	awt_assert(arb_code_nr_table[1] == 0); // Standard code has to be on index zero!
155
156	initialized = true;
157	}
158
159	if (embl_code_nr<0 \|\| embl_code_nr>MAX_EMBL_TRANSL_TABLE_VALUE) return -1;
160
161	int arb_code_nr = arb_code_nr_table[embl_code_nr];
162	#ifdef DEBUG
163	if (arb_code_nr != -1) {
164	awt_assert(arb_code_nr >= 0 && arb_code_nr < AWT_CODON_TABLES);
165	awt_assert(AWT_arb_code_nr_2_embl_transl_table(arb_code_nr) == embl_code_nr);
166	}
167	#endif
168	return arb_code_nr;
169	}
170
171	int AWT_arb_code_nr_2_embl_transl_table(int arb_code_nr) {
172	awt_assert(arb_code_nr >= 0 && arb_code_nr<AWT_CODON_TABLES);
173	return AWT_codon_def[arb_code_nr].embl_feature_transl_table;
174	}
175
176
177	static bool codon_tables_initialized = false;
178	static char definite_translation[AWT_MAX_CODONS]; // contains 0 if ambiguous, otherwise it contains the definite translation
179	static char *ambiguous_codons[AWT_MAX_CODONS]; // for each ambiguous codon: contains all translations (each only once)
180
181	void AWT_initialize_codon_tables() {
182	if (codon_tables_initialized) return;
183
184	int codon_nr;
185	int code_nr;
186
187	for (codon_nr=0; codon_nr<AWT_MAX_CODONS; codon_nr++) {
188	ambiguous_codons[codon_nr] = 0;
189	}
190
191	awt_assert(AWT_CODON_TABLES>=1);
192	memcpy(definite_translation, AWT_codon_def[0].aa, AWT_MAX_CODONS); // only one translation is really definite
193
194	awt_assert(AWT_codon_def[AWT_CODON_TABLES].aa==NULL); // Error in AWT_codon_def or AWT_CODON_CODES
195
196	for (code_nr=1; code_nr<AWT_CODON_TABLES; code_nr++) {
197	const char *translation = AWT_codon_def[code_nr].aa;
198
199	for (codon_nr=0; codon_nr<AWT_MAX_CODONS; codon_nr++) {
200	if (definite_translation[codon_nr]!='?') { // is definite till now
201	if (definite_translation[codon_nr]!=translation[codon_nr]) { // we found a different translation
202	// create ambiguous_codons:
203	char amb = (char)GB_calloc(AWT_MAX_CODONS+1, sizeof(char));
204	amb[0] = definite_translation[codon_nr];
205	amb[1] = translation[codon_nr];
206
207	ambiguous_codons[codon_nr] = amb;
208	definite_translation[codon_nr] = '?';
209	#if defined(DEBUG) && 0
210	printf("amb[%i]='%s'\n", codon_nr, amb);
211	#endif
212	}
213	}
214	else { // is ambiguous
215	if (strchr(ambiguous_codons[codon_nr], translation[codon_nr])==0) { // not listed in ambiguous codons
216	// append another ambiguous codon:
217	char *amb = ambiguous_codons[codon_nr];
218	amb[strlen(amb)] = translation[codon_nr];
219	#if defined(DEBUG) && 0
220	printf("amb[%i]='%s'\n", codon_nr, amb);
221	#endif
222	}
223	}
224	}
225	}
226
227	codon_tables_initialized = true;
228	}
229
230	// return 0..3 (ok) or 4 (failure)
231	inline int dna2idx(char c) {
232	switch (c) {
233	case 'T': case 't':
234	case 'U': case 'u': return 0;
235	case 'C': case 'c': return 1;
236	case 'A': case 'a': return 2;
237	case 'G': case 'g': return 3;
238	}
239	return 4;
240	}
241
242	inline char idx2dna(int idx) {
243	awt_assert(idx>=0 && idx<4);
244	return "TCAG"[idx];
245	}
246
247	inline int calc_codon_nr(const char *dna) {
248	int i1 = dna2idx(dna[0]);
249	int i2 = dna2idx(dna[1]);
250	int i3 = dna2idx(dna[2]);
251
252	if (i1==4\|\|i2==4\|\|i3==4) return AWT_MAX_CODONS; // is not a codon
253
254	int codon_nr = i116 + i24 + i3;
255	awt_assert(codon_nr>=0 && codon_nr<=AWT_MAX_CODONS);
256	return codon_nr;
257	}
258
259	inline void build_codon(int codon_nr, char *to_buffer) {
260	awt_assert(codon_nr>=0 && codon_nr<AWT_MAX_CODONS);
261
262	to_buffer[0] = idx2dna((codon_nr>>4)&3);
263	to_buffer[1] = idx2dna((codon_nr>>2)&3);
264	to_buffer[2] = idx2dna(codon_nr&3);
265	}
266
267	const char* AWT_get_codon_code_name(int code) {
268	awt_assert(code>=0 && code<AWT_CODON_TABLES);
269	return AWT_codon_def[code].name;
270	}
271
272	static const char *protein_name[26+1] = {
273	"Ala", // A
274	"Asx", // B
275	"Cys", // C
276	"Asp", // D
277	"Glu", // E
278	"Phe", // F
279	"Gly", // G
280	"His", // H
281	"Ile", // I
282	0, // J
283	"Lys", // K
284	"Leu", // L
285	"Met", // M
286	"Asn", // N
287	0, // O
288	"Pro", // P
289	"Gln", // Q
290	"Arg", // R
291	"Ser", // S
292	"Thr", // T
293	0, // U
294	"Val", // V
295	"Trp", // W
296	"Xxx", // X
297	"Tyr", // Y
298	"Glx", // Z
299	0
300	};
301
302	const char *AWT_get_protein_name(char protein) {
303	if (protein=='*') return "End";
304	if (protein=='-') return "---";
305
306	awt_assert(protein>='A' && protein<='Z');
307	awt_assert(protein_name[protein-'A']!=0);
308	return protein_name[protein-'A'];
309	}
310
311	#ifdef DEBUG
312
313	inline char nextBase(char c) {
314	switch (c) {
315	case 'T': return 'C';
316	case 'C': return 'A';
317	case 'A': return 'G';
318	case 'G': return 0;
319	default: awt_assert(0);
320	}
321	return 0;
322	}
323
324	void AWT_dump_codons() {
325	AWT_allowedCode allowed_code;
326
327	for (char c='*'; c<='Z'; c++) {
328	printf("Codes for '%c': ", c);
329	int first_line = 1;
330	int found = 0;
331	for (char b1='T'; b1; b1=nextBase(b1)) {
332	for (char b2='T'; b2; b2=nextBase(b2)) {
333	for (char b3='T'; b3; b3=nextBase(b3)) {
334	char dna[4];
335	dna[0]=b1;
336	dna[1]=b2;
337	dna[2]=b3;
338	dna[3]=0;
339
340	AWT_allowedCode allowed_code_left;
341	if (AWT_is_codon(c, dna, allowed_code, allowed_code_left)) {
342	if (!first_line) printf("\n ");
343	first_line = 0;
344	printf("%s (", dna);
345
346	int first=1;
347	for (int code=0; code<AWT_CODON_TABLES; code++) {
348	if (allowed_code_left.is_allowed(code)) {
349	if (!first) printf(",");
350	first=0;
351	printf("%i",code);
352	}
353	}
354	printf(") ");
355
356	found = 1;
357	}
358	}
359	}
360	}
361	if (!found) printf("none");
362	printf("\n");
363	if (c=='*') c='A'-1;
364	}
365	}
366	#endif
367
368	char AWT_is_start_codon(const char *dna, int arb_code_nr) {
369	// if dna[0]..dna[2] is defined as start codon for 'arb_code_nr'
370	// return 'M' (or whatever is defined in tables)
371	// return 0 otherwise
372
373	char is_start_codon = 0;
374	int codon_nr = calc_codon_nr(dna);
375
376	awt_assert(arb_code_nr >= 0 && arb_code_nr<AWT_CODON_TABLES);
377
378	if (codon_nr != AWT_MAX_CODONS) { // dna is a clean codon (it contains no iupac-codes)
379	const char *starts = AWT_codon_def[arb_code_nr].starts;
380
381	is_start_codon = starts[codon_nr];
382	if (is_start_codon == '-') is_start_codon = 0;
383	}
384
385	return is_start_codon;
386	}
387
388
389	bool AWT_is_codon(char protein, const char dna, const AWT_allowedCode& allowed_code, AWT_allowedCode& allowed_code_left, const char *fail_reason_ptr) {
390	// return TRUE if 'dna' contains a codon of 'protein' ('dna' must not contain any gaps)
391	// allowed_code contains 1 for each allowed code and 0 otherwise
392	// allowed_code_left contains a copy of allowed_codes with all impossible codes set to zero
393
394	awt_assert(codon_tables_initialized);
395
396	const char *fail_reason = 0;
397	bool is_codon = false;
398
399	if (fail_reason_ptr) *fail_reason_ptr = 0;
400
401	protein = toupper(protein);
402	if (protein=='B') { // B is a shortcut for Asp(=D) or Asn(=N)
403	is_codon = AWT_is_codon('D', dna, allowed_code, allowed_code_left, &fail_reason);
404	if (!is_codon) {
405	awt_assert(fail_reason != 0); // if failed there should always be a failure-reason
406	char *fail1 = strdup(fail_reason);
407	is_codon = AWT_is_codon('N', dna, allowed_code, allowed_code_left, &fail_reason);
408	if (!is_codon) {
409	char *fail2 = strdup(fail_reason);
410	fail_reason = GBS_global_string("%s and %s", fail1, fail2);
411	free(fail2);
412	}
413	free(fail1);
414	}
415	}
416	else if (protein=='Z') { // Z is a shortcut for Glu(=E) or Gln(=Q)
417	is_codon = AWT_is_codon('E', dna, allowed_code, allowed_code_left, &fail_reason);
418	if (!is_codon) {
419	awt_assert(fail_reason != 0); // if failed there should always be a failure-reason
420	char *fail1 = strdup(fail_reason);
421	is_codon = AWT_is_codon('Q', dna, allowed_code, allowed_code_left, &fail_reason);
422	if (!is_codon) {
423	char *fail2 = strdup(fail_reason);
424	fail_reason = GBS_global_string("%s and %s", fail1, fail2);
425	free(fail2);
426	}
427	free(fail1);
428	}
429	}
430	else {
431	int codon_nr = calc_codon_nr(dna);
432	if (codon_nr==AWT_MAX_CODONS) { // dna is not a clean codon (it contains iupac-codes)
433	int error_positions = 0;
434	int first_error_pos = -1;
435	bool too_short = false;
436	{
437	int iupac_pos;
438	for (iupac_pos=0; iupac_pos<3 && !too_short; iupac_pos++) {
439	if (!dna[iupac_pos]) {
440	too_short = true;
441	}
442	else if (strchr("ACGTU", dna[iupac_pos]) == 0) {
443	if (first_error_pos==-1) first_error_pos = iupac_pos;
444	error_positions++;
445	}
446	}
447	}
448
449	if (too_short) {
450	fail_reason = GBS_global_string("Not enough nucleotides (got '%s')", dna);
451	}
452	else {
453	gb_assert(error_positions);
454	if (error_positions==3) { // don't accept codons with 3 errors
455	fail_reason = GBS_global_string("Three consecutive IUPAC codes '%c%c%c'", dna[0], dna[1], dna[2]);
456	}
457	else {
458	const char *decoded_iupac = AWT_decode_iupac(dna[first_error_pos], GB_AT_DNA, 0);
459
460	if (!decoded_iupac[0]) { // no valid IUPAC
461	allowed_code_left.forbidAll();
462	fail_reason = GBS_global_string("Not a valid IUPAC code:'%c'", dna[first_error_pos]);
463	}
464	else {
465	char dna_copy[4];
466	memcpy(dna_copy, dna, 3);
467	dna_copy[3] = 0;
468
469	#if defined(DEBUG) && 0
470	printf("Check if '%s' is a codon for '%c'\n", dna_copy, protein);
471	#endif
472
473	int all_are_codons = 1;
474	AWT_allowedCode allowed_code_copy;
475	allowed_code_copy = allowed_code;
476
477	for (int i=0; decoded_iupac[i]; i++) {
478	dna_copy[first_error_pos] = decoded_iupac[i];
479	if (!AWT_is_codon(protein, dna_copy, allowed_code_copy, allowed_code_left)) {
480	all_are_codons = 0;
481	break;
482	}
483	allowed_code_copy = allowed_code_left;
484	}
485
486	if (all_are_codons) {
487	allowed_code_left = allowed_code_copy;
488	is_codon = true;
489	}
490	else {
491	allowed_code_left.forbidAll();
492	fail_reason = GBS_global_string("Not all IUPAC-combinations of '%s' translate", dna_copy);
493	}
494	#if defined(DEBUG) && 0
495	printf("result = %i\n", all_are_codons);
496	#endif
497	}
498	}
499	}
500	}
501	else if (definite_translation[codon_nr]!='?') {
502	int ok = definite_translation[codon_nr]==protein;
503
504	if (ok) {
505	allowed_code_left = allowed_code;
506	is_codon = true;
507	}
508	else {
509	allowed_code_left.forbidAll();
510	fail_reason = GBS_global_string("'%c%c%c' does never translate to '%c' (1)", dna[0], dna[1], dna[2], protein);
511	}
512	}
513	else if (strchr(ambiguous_codons[codon_nr], protein)==0) {
514	allowed_code_left.forbidAll();
515	fail_reason = GBS_global_string("'%c%c%c' does never translate to '%c' (2)", dna[0], dna[1], dna[2], protein);
516	}
517	else {
518	bool correct_disallowed_translation = false;
519
520	// search for allowed correct translation possibity:
521	for (int code_nr=0; code_nr<AWT_CODON_TABLES; code_nr++) {
522	if (AWT_codon_def[code_nr].aa[codon_nr] == protein) { // does it translate correct?
523	if (allowed_code.is_allowed(code_nr)) { // is this code allowed?
524	allowed_code_left.allow(code_nr);
525	is_codon = true;
526	}
527	else {
528	allowed_code_left.forbid(code_nr); // otherwise forbid code in future
529	correct_disallowed_translation = true;
530	}
531	}
532	else {
533	allowed_code_left.forbid(code_nr); // otherwise forbid code in future
534	}
535	}
536
537	if (!is_codon) {
538	awt_assert(correct_disallowed_translation); // should be true because otherwise we shouldn't run into this else-branch
539	char left_tables[AWT_CODON_TABLES*3+1];
540	char *ltp = left_tables;
541	bool first = true;
542	for (int code_nr=0; code_nr<AWT_CODON_TABLES; code_nr++) {
543	if (allowed_code.is_allowed(code_nr)) {
544	if (!first) *ltp++ = ',';
545	ltp += sprintf(ltp, "%i", code_nr);
546	first = false;
547	}
548	}
549	fail_reason = GBS_global_string("'%c%c%c' does not translate to '%c' for any of the leftover trans-tables (%s)",
550	dna[0], dna[1], dna[2], protein, left_tables);
551	}
552	}
553	}
554
555	if (!is_codon) {
556	awt_assert(fail_reason);
557	if (fail_reason_ptr) *fail_reason_ptr = fail_reason; // set failure-reason if requested
558	}
559	return is_codon;
560	}
561
562	// -------------------------------------------------------------------------------- Codon_Group
563
564	class Codon_Group
565	{
566	char codon[64]; // index is calculated with calc_codon_nr
567
568	public:
569	Codon_Group(char protein, int code_nr);
570	~Codon_Group() {}
571
572	// static int idx(int x, int y, int z) const { return (((x<<2)+y)<<2)+z; }
573	// static int is_idx(int idx) { return idx>=0 && idx<AWT_MAX_CODONS; }
574
575	// void add_member(int idx) { awt_assert(is_idx(idx)); codon[idx] = 1; }
576	Codon_Group& operator += (const Codon_Group& other);
577	int expand(char *to_buffer) const;
578	};
579
580	Codon_Group::Codon_Group(char protein, int code_nr) {
581	protein = toupper(protein);
582	awt_assert(protein=='*' \|\| isalpha(protein));
583	awt_assert(code_nr>=0 && code_nr<AWT_CODON_TABLES);
584
585	const char *amino_table = AWT_codon_def[code_nr].aa;
586	for (int i=0; i<AWT_MAX_CODONS; i++) {
587	codon[i] = amino_table[i]==protein;
588	}
589	}
590
591	Codon_Group& Codon_Group::operator+=(const Codon_Group& other) {
592	for (int i=0; i<AWT_MAX_CODONS; i++) {
593	codon[i] = codon[i] \|\| other.codon[i];
594	}
595	return *this;
596	}
597
598	inline int legal_dna_no(int i) { return i>=0 && i<4; }
599	inline void my_memcpy(char dest, const char source, size_t length) { for (size_t l=0; l<length; l++) { dest[l] = source[l]; } }
600
601	inline const char buildMixedCodon(const char con1, const char *con2) {
602	int mismatches = 0;
603	int mismatch_index = -1;
604	static char buf[4];
605
606	for (int i=0; i<3; i++) {
607	if (con1[i]!=con2[i]) {
608	mismatches++;
609	mismatch_index = i;
610	}
611	else {
612	buf[i] = con1[i];
613	}
614	}
615
616	if (mismatches==1) { // exactly one position differs between codons
617	awt_assert(mismatch_index!=-1);
618	buf[mismatch_index] = AWT_iupac_add(con1[mismatch_index], con2[mismatch_index], GB_AT_DNA);
619	buf[3] = 0;
620	return buf;
621	}
622	return 0;
623	}
624
625	static int expandMore(const char bufferStart, int no_of_condons, char&to_buffer) {
626	int i, j;
627	const char con1, con2;
628	int added = 0;
629
630	for (i=0; i<no_of_condons; i++) {
631	con1 = bufferStart+3*i;
632
633	for (j=i+1; j<no_of_condons; j++) {
634	con2 = bufferStart+3*j;
635	const char *result = buildMixedCodon(con1, con2);
636	if (result) {
637	to_buffer[0] = 0;
638	// do we already have this codon?
639	const char *found;
640	const char *startSearch = bufferStart;
641	for (;;) {
642	found = strstr(startSearch, result);
643	if (!found) break;
644	int pos = (found-bufferStart);
645	if ((pos%3)==0) break; // yes aready here!
646	startSearch = found+1; // was misaligned -> try behind
647	}
648
649	if (!found) {
650	my_memcpy(to_buffer, result, 3); to_buffer+=3;
651	added++;
652	}
653	}
654	}
655	}
656	return no_of_condons+added;
657	}
658
659	int Codon_Group::expand(char *to_buffer) const {
660	int count = 0;
661	int i;
662	char *org_to_buffer = to_buffer;
663
664	for (i=0; i<AWT_MAX_CODONS; i++) {
665	if (codon[i]) {
666	build_codon(i, to_buffer);
667	to_buffer += 3;
668	count++;
669	}
670	}
671
672	#if defined(DEBUG) && 0
673	to_buffer[0] = 0;
674	printf("codons = '%s'\n", org_to_buffer);
675	#endif
676
677	for (;;) {
678	int new_count = expandMore(org_to_buffer, count, to_buffer);
679	if (new_count==count) break; // nothing expanded -> done
680	count = new_count;
681	#if defined(DEBUG) && 0
682	to_buffer[0] = 0;
683	printf("codons (expandedMore) = '%s'\n", org_to_buffer);
684	#endif
685	}
686
687	awt_assert(count==(int(to_buffer-org_to_buffer)/3));
688
689	return count;
690	}
691
692	// --------------------------------------------------------------------------------
693
694	static Codon_Group *get_Codon_Group(char protein, int code_nr) {
695	awt_assert(code_nr>=0 && code_nr<AWT_CODON_TABLES);
696	protein = toupper(protein);
697	awt_assert(isalpha(protein) \|\| protein=='*');
698	awt_assert(codon_tables_initialized);
699
700	Codon_Group *cgroup = 0;
701
702	if (protein=='B') {
703	cgroup = new Codon_Group('D', code_nr);
704	Codon_Group N('N', code_nr);
705	*cgroup += N;
706	}
707	else if (protein=='Z') {
708	cgroup = new Codon_Group('E', code_nr);
709	Codon_Group Q('Q', code_nr);
710	*cgroup += Q;
711	}
712	else {
713	cgroup = new Codon_Group(protein, code_nr);
714	}
715
716	awt_assert(cgroup);
717
718	return cgroup;
719	}
720
721	#define MAX_CODON_LIST_LENGTH (70*3)
722
723	// get a list of all codons ("xyzxyzxyz...") encoding 'protein' in case we use Codon-Code 'code_nr'
724	// (includes all completely contained IUPAC-encoded codons at the end of list)
725	const char *AWT_get_codons(char protein, int code_nr) {
726	Codon_Group *cgroup = get_Codon_Group(protein, code_nr);
727
728	static char buffer[MAX_CODON_LIST_LENGTH+1];
729	int offset = 3*cgroup->expand(buffer);
730	awt_assert(offset<MAX_CODON_LIST_LENGTH);
731	buffer[offset] = 0;
732
733	delete cgroup;
734
735	return buffer;
736	}
737
738	// #if defined(DEBUG)
739	// void test_AWT_get_codons() {
740	// AWT_initialize_codon_tables();
741	// for (int code_nr=0; code_nr<1; /AWT_CODON_TABLES;/ code_nr++) {
742	// printf("--------------------- Code = %i\n", code_nr);
743	// for (char c='*'; c<='Z'; c++) {
744	// const char *got_codons = AWT_get_codons(c, code_nr);
745	// printf("%c='%s'\n", c, got_codons);
746	// if (c=='*') c='A'-1;
747	// }
748	// }
749	// }
750	// #endif
751
752
753	// get a IUPAC-triple generated by mixing all codons belonging to 'protein'
754	const char *AWT_get_protein_iupac(char protein, int code_nr) {
755	if (protein == 'X') return "NNN";
756	if (protein == '.') return "...";
757	if (protein == '-') return "---";
758
759	const char *codons = AWT_get_codons(protein, code_nr);
760	static char result[] = "xxx";
761
762	awt_assert(codons && strlen(codons) >= 3);
763	memcpy(result, codons, 3);
764	for (int off = 3; codons[off]; off += 3) {
765	for (int base = 0; base<3; ++base) {
766	result[base] = AWT_iupac_add(result[base], codons[off+base], GB_AT_DNA);
767	}
768	}
769
770	return result;
771	}
772
773
774	static unsigned char protein_index_def[] = "ABCDEFGHIKLMNPQRSTVWXYZ.-*";
775	static char protein_index[256]; // index of protein in protein_2_iupac_table
776	static bool protein_index_initialized = 0;
777
778	// #define PROTEIN_TABLE_SIZE (26-3+3) // all chars - "JOU" + ".-*"
779	#define PROTEIN_TABLE_SIZE sizeof(protein_index_def)
780
781	static void initialize_protein_index() {
782	memset(protein_index, char(-1), sizeof(protein_index));
783
784	for (int i = 0; protein_index_def[i]; ++i) {
785	protein_index[protein_index_def[i]] = protein_index[tolower(protein_index_def[i])] = i*3;
786	}
787	protein_index_initialized = true;
788	}
789
790	static int protein_2_iupac_tables_initialized_4_code = -1;
791	static char protein_2_iupac_table[3*PROTEIN_TABLE_SIZE];
792
793	static void initialize_protein_2_iupac_tables(int code_nr) {
794	if (!protein_index_initialized) initialize_protein_index();
795	if (!codon_tables_initialized) AWT_initialize_codon_tables();
796
797	memset(protein_2_iupac_table, 0, sizeof(protein_2_iupac_table));
798	for (int i = 0; protein_index_def[i]; ++i) {
799	char c = protein_index_def[i];
800	const char *expanded = AWT_get_protein_iupac(c, code_nr);
801	size_t off = i*3;
802
803	for (int j = 0; j<3; ++j) {
804	protein_2_iupac_table[off+j] = expanded[j]; // write to table
805	}
806	}
807
808	protein_2_iupac_tables_initialized_4_code = code_nr;
809	}
810	// --------------------------------------------------------------------------------
811	// converts a protein sequence to a DNA sequence containing IUPAC codes
812	// Example for standard code :
813	// 'ABCZ' -> 'GCN RAY TGY SRN'
814	// if prot_len == 0 -> prot_len gets calculated
815
816	char AWT_proteinSeq_2_iupac(const char proteinSeq, size_t prot_len, int code_nr) {
817	if (protein_2_iupac_tables_initialized_4_code != code_nr) {
818	initialize_protein_2_iupac_tables(code_nr);
819	}
820	if (prot_len == 0) prot_len = strlen(proteinSeq);
821
822	size_t dna_len = prot_len*3;
823	char result = (char)malloc(dna_len+1);
824
825	size_t didx = 0;
826	for (size_t pidx = 0; pidx<prot_len; ++pidx, didx += 3) {
827	char prot_idx = protein_index[(unsigned char)proteinSeq[pidx]];
828
829	if (prot_idx == -1) { // illegal character
830	memcpy(result+didx, "???", 3);
831	}
832	else {
833	memcpy(result+didx, protein_2_iupac_table+prot_idx, 3);
834	}
835	}
836	result[didx] = 0;
837
838	return result;
839	}
840

Note: See TracBrowser for help on using the repository browser.

Download in other formats: