Context Navigation

source: tags/arb-6.0-rc3/SL/PRONUC/AP_codon_table.cxx

Visit:

Last change on this file was 8814, checked in by westram, 13 years ago
fixed some warnings in NDEBUG mode (unused warnings)
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 26.1 KB

Line
1	// =============================================================== //
2	// //
3	// File : AP_codon_table.cxx //
4	// Purpose : //
5	// //
6	// Coded by Ralf Westram (coder@reallysoft.de) in January 2010 //
7	// Institute of Microbiology (Technical University Munich) //
8	// http://www.arb-home.de/ //
9	// //
10	// =============================================================== //
11
12	#include "AP_codon_table.hxx"
13	#include "iupac.h"
14
15	#include <arbdb.h>
16
17	#include <cctype>
18
19	#define pn_assert(cond) arb_assert(cond)
20
21	#define EMBL_BACTERIAL_TABLE_INDEX 11
22
23	// Info about translation codes was taken from
24	// http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
25
26	static AWT_Codon_Code_Definition AWT_codon_def[AWT_CODON_TABLES+1] =
27	{
28	// 0000000001111111111222222222233333333334444444444555555555566666
29	// 1234567890123456789012345678901234567890123456789012345678901234
30
31	// "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG", base1
32	// "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG", base2
33	// "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG" base3
34	{
35	" (1) Standard code",
36	"FFLLSSSSYY*CCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", // The first code in this table has to be 'Standard code'!
37	"---M---------------M---------------M----------------------------",
38	1
39	},
40	{
41	" (2) Vertebrate mitochondrial code",
42	"FFLLSSSSYYCCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSVVVVAAAADDEEGGGG",
43	"--------------------------------MMMM---------------M------------",
44	2
45	},
46	{
47	" (3) Yeast mitochondrial code",
48	"FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
49	"----------------------------------MM----------------------------",
50	3
51	},
52	{
53	" (4) Mold/Protozoan/Coelenterate mito. + Mycoplasma/Spiroplasma code",
54	"FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
55	"--MM---------------M------------MMMM---------------M------------",
56	4
57	},
58	{
59	" (5) Invertebrate mitochondrial code",
60	"FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG",
61	"---M----------------------------MMMM---------------M------------",
62	5
63	},
64	{
65	" (6) Ciliate, Dasycladacean and Hexamita nuclear code",
66	"FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
67	"-----------------------------------M----------------------------",
68	6
69	},
70	{
71	" (9) Echinoderm and Flatworm mitochondrial code",
72	"FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG",
73	"-----------------------------------M---------------M------------",
74	9
75	},
76	{
77	"(10) Euplotid nuclear code",
78	"FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
79	"-----------------------------------M----------------------------",
80	10
81	},
82	// 0000000001111111111222222222233333333334444444444555555555566666
83	// 1234567890123456789012345678901234567890123456789012345678901234
84
85	// "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG", base1
86	// "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG", base2
87	// "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG" base3
88	{
89	"(11) Bacterial and Plant Plastid code",
90	"FFLLSSSSYY*CCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
91	"---M---------------M------------MMMM---------------M------------",
92	11
93	},
94	{
95	"(12) Alternative Yeast nuclear code",
96	"FFLLSSSSYY*CCWLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
97	"-------------------M---------------M----------------------------",
98	12
99	},
100	{
101	"(13) Ascidian mitochondrial code",
102	"FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG",
103	"---M------------------------------MM---------------M------------",
104	13
105	},
106	{
107	"(14) Alternative Flatworm mitochondrial code",
108	"FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG",
109	"-----------------------------------M----------------------------",
110	14
111	},
112	{
113	"(15) Blepharisma nuclear code",
114	"FFLLSSSSYYQCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
115	"-----------------------------------M----------------------------",
116	15
117	},
118	{
119	"(16) Chlorophycean mitochondrial code",
120	"FFLLSSSSYYLCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
121	"-----------------------------------M----------------------------",
122	16
123	},
124	{
125	"(21) Trematode mitochondrial code",
126	"FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG",
127	"-----------------------------------M---------------M------------",
128	21
129	},
130	{
131	"(22) Scenedesmus obliquus mitochondrial code",
132	"FFLLSSSYYLCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
133	"-----------------------------------M----------------------------",
134	22
135	},
136	{
137	"(23) Thraustochytrium mitochondrial code",
138	"FFLSSSSYYCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
139	"--------------------------------M--M---------------M------------",
140	23
141	},
142
143	{ 0, 0, 0, 0 } // end of table-marker
144	};
145
146	#define MAX_EMBL_TRANSL_TABLE_VALUE 23 // maximum known EMBL transl_table value
147
148	int AWT_embl_transl_table_2_arb_code_nr(int embl_code_nr) {
149	// returns -1 if embl_code_nr is not known by ARB
150
151	static bool initialized = false;
152	static int arb_code_nr_table[MAX_EMBL_TRANSL_TABLE_VALUE+1]; // key: embl_code_nr, value: arb_code_nr or -1
153
154	if (!initialized) {
155	for (int embl = 0; embl <= MAX_EMBL_TRANSL_TABLE_VALUE; ++embl) {
156	arb_code_nr_table[embl] = -1; // illegal table
157	}
158	for (int arb_code_nr = 0; arb_code_nr < AWT_CODON_TABLES; ++arb_code_nr) {
159	arb_code_nr_table[AWT_codon_def[arb_code_nr].embl_feature_transl_table] = arb_code_nr;
160	}
161	// should be index of 'Bacterial and Plant Plastid code'
162	// (otherwise maybe AWAR_PROTEIN_TYPE_bacterial_code_index is wrong)
163	pn_assert(arb_code_nr_table[EMBL_BACTERIAL_TABLE_INDEX] == AWAR_PROTEIN_TYPE_bacterial_code_index);
164	pn_assert(arb_code_nr_table[1] == 0); // Standard code has to be on index zero!
165
166	initialized = true;
167	}
168
169	if (embl_code_nr<0 \|\| embl_code_nr>MAX_EMBL_TRANSL_TABLE_VALUE) return -1;
170
171	int arb_code_nr = arb_code_nr_table[embl_code_nr];
172	#ifdef DEBUG
173	if (arb_code_nr != -1) {
174	pn_assert(arb_code_nr >= 0 && arb_code_nr < AWT_CODON_TABLES);
175	pn_assert(AWT_arb_code_nr_2_embl_transl_table(arb_code_nr) == embl_code_nr);
176	}
177	#endif
178	return arb_code_nr;
179	}
180
181	int AWT_arb_code_nr_2_embl_transl_table(int arb_code_nr) {
182	pn_assert(arb_code_nr >= 0 && arb_code_nr<AWT_CODON_TABLES);
183	return AWT_codon_def[arb_code_nr].embl_feature_transl_table;
184	}
185
186
187	static bool codon_tables_initialized = false;
188	static char definite_translation[AWT_MAX_CODONS]; // contains 0 if ambiguous, otherwise it contains the definite translation
189	static char *ambiguous_codons[AWT_MAX_CODONS]; // for each ambiguous codon: contains all translations (each only once)
190
191	void AP_initialize_codon_tables() {
192	if (codon_tables_initialized) return;
193
194	int codon_nr;
195	int code_nr;
196
197	for (codon_nr=0; codon_nr<AWT_MAX_CODONS; codon_nr++) {
198	ambiguous_codons[codon_nr] = 0;
199	}
200
201	pn_assert(AWT_CODON_TABLES>=1);
202	memcpy(definite_translation, AWT_codon_def[0].aa, AWT_MAX_CODONS); // only one translation is really definite
203
204	pn_assert(AWT_codon_def[AWT_CODON_TABLES].aa==NULL); // Error in AWT_codon_def or AWT_CODON_CODES
205
206	for (code_nr=1; code_nr<AWT_CODON_TABLES; code_nr++) {
207	const char *translation = AWT_codon_def[code_nr].aa;
208
209	for (codon_nr=0; codon_nr<AWT_MAX_CODONS; codon_nr++) {
210	if (definite_translation[codon_nr]!='?') { // is definite till now
211	if (definite_translation[codon_nr]!=translation[codon_nr]) { // we found a different translation
212	// create ambiguous_codons:
213	char amb = (char)GB_calloc(AWT_MAX_CODONS+1, sizeof(char));
214	amb[0] = definite_translation[codon_nr];
215	amb[1] = translation[codon_nr];
216
217	ambiguous_codons[codon_nr] = amb;
218	definite_translation[codon_nr] = '?';
219	#if defined(DEBUG) && 0
220	printf("amb[%i]='%s'\n", codon_nr, amb);
221	#endif
222	}
223	}
224	else { // is ambiguous
225	if (strchr(ambiguous_codons[codon_nr], translation[codon_nr])==0) { // not listed in ambiguous codons
226	// append another ambiguous codon:
227	char *amb = ambiguous_codons[codon_nr];
228	amb[strlen(amb)] = translation[codon_nr];
229	#if defined(DEBUG) && 0
230	printf("amb[%i]='%s'\n", codon_nr, amb);
231	#endif
232	}
233	}
234	}
235	}
236
237	codon_tables_initialized = true;
238	}
239
240	// return 0..3 (ok) or 4 (failure)
241	inline int dna2idx(char c) {
242	switch (c) {
243	case 'T': case 't':
244	case 'U': case 'u': return 0;
245	case 'C': case 'c': return 1;
246	case 'A': case 'a': return 2;
247	case 'G': case 'g': return 3;
248	}
249	return 4;
250	}
251
252	inline char idx2dna(int idx) {
253	pn_assert(idx>=0 && idx<4);
254	return "TCAG"[idx];
255	}
256
257	inline int calc_codon_nr(const char *dna) {
258	int i1 = dna2idx(dna[0]);
259	int i2 = dna2idx(dna[1]);
260	int i3 = dna2idx(dna[2]);
261
262	if (i1==4\|\|i2==4\|\|i3==4) return AWT_MAX_CODONS; // is not a codon
263
264	int codon_nr = i116 + i24 + i3;
265	pn_assert(codon_nr>=0 && codon_nr<=AWT_MAX_CODONS);
266	return codon_nr;
267	}
268
269	inline void build_codon(int codon_nr, char *to_buffer) {
270	pn_assert(codon_nr>=0 && codon_nr<AWT_MAX_CODONS);
271
272	to_buffer[0] = idx2dna((codon_nr>>4)&3);
273	to_buffer[1] = idx2dna((codon_nr>>2)&3);
274	to_buffer[2] = idx2dna(codon_nr&3);
275	}
276
277	const char* AWT_get_codon_code_name(int code) {
278	pn_assert(code>=0 && code<AWT_CODON_TABLES);
279	return AWT_codon_def[code].name;
280	}
281
282	static const char *protein_name[26+1] = {
283	"Ala", // A
284	"Asx", // B
285	"Cys", // C
286	"Asp", // D
287	"Glu", // E
288	"Phe", // F
289	"Gly", // G
290	"His", // H
291	"Ile", // I
292	0, // J
293	"Lys", // K
294	"Leu", // L
295	"Met", // M
296	"Asn", // N
297	0, // O
298	"Pro", // P
299	"Gln", // Q
300	"Arg", // R
301	"Ser", // S
302	"Thr", // T
303	0, // U
304	"Val", // V
305	"Trp", // W
306	"Xxx", // X
307	"Tyr", // Y
308	"Glx", // Z
309	0
310	};
311
312	const char *AP_get_protein_name(char protein) {
313	if (protein=='*') return "End";
314	if (protein=='-') return "---";
315
316	pn_assert(protein>='A' && protein<='Z');
317	pn_assert(protein_name[protein-'A']!=0);
318	return protein_name[protein-'A'];
319	}
320
321	#ifdef DEBUG
322
323	inline char nextBase(char c) {
324	switch (c) {
325	case 'T': return 'C';
326	case 'C': return 'A';
327	case 'A': return 'G';
328	case 'G': return 0;
329	default: pn_assert(0);
330	}
331	return 0;
332	}
333
334	void AWT_dump_codons() {
335	AWT_allowedCode allowed_code;
336
337	for (char c='*'; c<='Z'; c++) {
338	printf("Codes for '%c': ", c);
339	int first_line = 1;
340	int found = 0;
341	for (char b1='T'; b1; b1=nextBase(b1)) {
342	for (char b2='T'; b2; b2=nextBase(b2)) {
343	for (char b3='T'; b3; b3=nextBase(b3)) {
344	char dna[4];
345	dna[0]=b1;
346	dna[1]=b2;
347	dna[2]=b3;
348	dna[3]=0;
349
350	AWT_allowedCode allowed_code_left;
351	if (AWT_is_codon(c, dna, allowed_code, allowed_code_left)) {
352	if (!first_line) printf("\n ");
353	first_line = 0;
354	printf("%s (", dna);
355
356	int first=1;
357	for (int code=0; code<AWT_CODON_TABLES; code++) {
358	if (allowed_code_left.is_allowed(code)) {
359	if (!first) printf(",");
360	first=0;
361	printf("%i", code);
362	}
363	}
364	printf(") ");
365
366	found = 1;
367	}
368	}
369	}
370	}
371	if (!found) printf("none");
372	printf("\n");
373	if (c=='*') c='A'-1;
374	}
375	}
376	#endif
377
378	char AWT_is_start_codon(const char *dna, int arb_code_nr) {
379	// if dna[0]..dna[2] is defined as start codon for 'arb_code_nr'
380	// return 'M' (or whatever is defined in tables)
381	// return 0 otherwise
382
383	char is_start_codon = 0;
384	int codon_nr = calc_codon_nr(dna);
385
386	pn_assert(arb_code_nr >= 0 && arb_code_nr<AWT_CODON_TABLES);
387
388	if (codon_nr != AWT_MAX_CODONS) { // dna is a clean codon (it contains no iupac-codes)
389	const char *starts = AWT_codon_def[arb_code_nr].starts;
390
391	is_start_codon = starts[codon_nr];
392	if (is_start_codon == '-') is_start_codon = 0;
393	}
394
395	return is_start_codon;
396	}
397
398
399	bool AWT_is_codon(char protein, const char dna, const AWT_allowedCode& allowed_code, AWT_allowedCode& allowed_code_left, const char *fail_reason_ptr) {
400	// return TRUE if 'dna' contains a codon of 'protein' ('dna' must not contain any gaps)
401	// allowed_code contains 1 for each allowed code and 0 otherwise
402	// allowed_code_left contains a copy of allowed_codes with all impossible codes set to zero
403
404	pn_assert(codon_tables_initialized);
405
406	const char *fail_reason = 0;
407	bool is_codon = false;
408
409	if (fail_reason_ptr) *fail_reason_ptr = 0;
410
411	protein = toupper(protein);
412	if (protein=='B') { // B is a shortcut for Asp(=D) or Asn(=N)
413	is_codon = AWT_is_codon('D', dna, allowed_code, allowed_code_left, &fail_reason);
414	if (!is_codon) {
415	pn_assert(fail_reason != 0); // if failed there should always be a failure-reason
416	char *fail1 = strdup(fail_reason);
417	is_codon = AWT_is_codon('N', dna, allowed_code, allowed_code_left, &fail_reason);
418	if (!is_codon) {
419	char *fail2 = strdup(fail_reason);
420	fail_reason = GBS_global_string("%s and %s", fail1, fail2);
421	free(fail2);
422	}
423	free(fail1);
424	}
425	}
426	else if (protein=='Z') { // Z is a shortcut for Glu(=E) or Gln(=Q)
427	is_codon = AWT_is_codon('E', dna, allowed_code, allowed_code_left, &fail_reason);
428	if (!is_codon) {
429	pn_assert(fail_reason != 0); // if failed there should always be a failure-reason
430	char *fail1 = strdup(fail_reason);
431	is_codon = AWT_is_codon('Q', dna, allowed_code, allowed_code_left, &fail_reason);
432	if (!is_codon) {
433	char *fail2 = strdup(fail_reason);
434	fail_reason = GBS_global_string("%s and %s", fail1, fail2);
435	free(fail2);
436	}
437	free(fail1);
438	}
439	}
440	else {
441	int codon_nr = calc_codon_nr(dna);
442	if (codon_nr==AWT_MAX_CODONS) { // dna is not a clean codon (it contains iupac-codes)
443	int error_positions = 0;
444	int first_error_pos = -1;
445	bool too_short = false;
446	{
447	int iupac_pos;
448	for (iupac_pos=0; iupac_pos<3 && !too_short; iupac_pos++) {
449	if (!dna[iupac_pos]) {
450	too_short = true;
451	}
452	else if (strchr("ACGTU", dna[iupac_pos]) == 0) {
453	if (first_error_pos==-1) first_error_pos = iupac_pos;
454	error_positions++;
455	}
456	}
457	}
458
459	if (too_short) {
460	fail_reason = GBS_global_string("Not enough nucleotides (got '%s')", dna);
461	}
462	else {
463	pn_assert(error_positions);
464	if (error_positions==3) { // don't accept codons with 3 errors
465	fail_reason = GBS_global_string("Three consecutive IUPAC codes '%c%c%c'", dna[0], dna[1], dna[2]);
466	}
467	else {
468	const char *decoded_iupac = iupac::decode(dna[first_error_pos], GB_AT_DNA, 0);
469
470	if (!decoded_iupac[0]) { // no valid IUPAC
471	allowed_code_left.forbidAll();
472	fail_reason = GBS_global_string("Not a valid IUPAC code:'%c'", dna[first_error_pos]);
473	}
474	else {
475	char dna_copy[4];
476	memcpy(dna_copy, dna, 3);
477	dna_copy[3] = 0;
478
479	#if defined(DEBUG) && 0
480	printf("Check if '%s' is a codon for '%c'\n", dna_copy, protein);
481	#endif
482
483	int all_are_codons = 1;
484	AWT_allowedCode allowed_code_copy;
485	allowed_code_copy = allowed_code;
486
487	for (int i=0; decoded_iupac[i]; i++) {
488	dna_copy[first_error_pos] = decoded_iupac[i];
489	if (!AWT_is_codon(protein, dna_copy, allowed_code_copy, allowed_code_left)) {
490	all_are_codons = 0;
491	break;
492	}
493	allowed_code_copy = allowed_code_left;
494	}
495
496	if (all_are_codons) {
497	allowed_code_left = allowed_code_copy;
498	is_codon = true;
499	}
500	else {
501	allowed_code_left.forbidAll();
502	fail_reason = GBS_global_string("Not all IUPAC-combinations of '%s' translate", dna_copy);
503	}
504	#if defined(DEBUG) && 0
505	printf("result = %i\n", all_are_codons);
506	#endif
507	}
508	}
509	}
510	}
511	else if (definite_translation[codon_nr]!='?') {
512	int ok = definite_translation[codon_nr]==protein;
513
514	if (ok) {
515	allowed_code_left = allowed_code;
516	is_codon = true;
517	}
518	else {
519	allowed_code_left.forbidAll();
520	fail_reason = GBS_global_string("'%c%c%c' does never translate to '%c' (1)", dna[0], dna[1], dna[2], protein);
521	}
522	}
523	else if (strchr(ambiguous_codons[codon_nr], protein)==0) {
524	allowed_code_left.forbidAll();
525	fail_reason = GBS_global_string("'%c%c%c' does never translate to '%c' (2)", dna[0], dna[1], dna[2], protein);
526	}
527	else {
528	#if defined(ASSERTION_USED)
529	bool correct_disallowed_translation = false;
530	#endif
531
532	// search for allowed correct translation possibility:
533	for (int code_nr=0; code_nr<AWT_CODON_TABLES; code_nr++) {
534	if (AWT_codon_def[code_nr].aa[codon_nr] == protein) { // does it translate correct?
535	if (allowed_code.is_allowed(code_nr)) { // is this code allowed?
536	allowed_code_left.allow(code_nr);
537	is_codon = true;
538	}
539	else {
540	allowed_code_left.forbid(code_nr); // otherwise forbid code in future
541	#if defined(ASSERTION_USED)
542	correct_disallowed_translation = true;
543	#endif
544	}
545	}
546	else {
547	allowed_code_left.forbid(code_nr); // otherwise forbid code in future
548	}
549	}
550
551	if (!is_codon) {
552	pn_assert(correct_disallowed_translation); // should be true because otherwise we shouldn't run into this else-branch
553	char left_tables[AWT_CODON_TABLES*3+1];
554	char *ltp = left_tables;
555	bool first = true;
556	for (int code_nr=0; code_nr<AWT_CODON_TABLES; code_nr++) {
557	if (allowed_code.is_allowed(code_nr)) {
558	if (!first) *ltp++ = ',';
559	ltp += sprintf(ltp, "%i", code_nr);
560	first = false;
561	}
562	}
563	fail_reason = GBS_global_string("'%c%c%c' does not translate to '%c' for any of the leftover trans-tables (%s)",
564	dna[0], dna[1], dna[2], protein, left_tables);
565	}
566	}
567	}
568
569	if (!is_codon) {
570	pn_assert(fail_reason);
571	if (fail_reason_ptr) *fail_reason_ptr = fail_reason; // set failure-reason if requested
572	}
573	return is_codon;
574	}
575
576	// -------------------------------------------------------------------------------- Codon_Group
577
578	class Codon_Group
579	{
580	char codon[64]; // index is calculated with calc_codon_nr
581
582	public:
583	Codon_Group(char protein, int code_nr);
584	~Codon_Group() {}
585
586	Codon_Group& operator += (const Codon_Group& other);
587	int expand(char *to_buffer) const;
588	};
589
590	Codon_Group::Codon_Group(char protein, int code_nr) {
591	protein = toupper(protein);
592	pn_assert(protein=='*' \|\| isalpha(protein));
593	pn_assert(code_nr>=0 && code_nr<AWT_CODON_TABLES);
594
595	const char *amino_table = AWT_codon_def[code_nr].aa;
596	for (int i=0; i<AWT_MAX_CODONS; i++) {
597	codon[i] = amino_table[i]==protein;
598	}
599	}
600
601	Codon_Group& Codon_Group::operator+=(const Codon_Group& other) {
602	for (int i=0; i<AWT_MAX_CODONS; i++) {
603	codon[i] = codon[i] \|\| other.codon[i];
604	}
605	return *this;
606	}
607
608	inline int legal_dna_no(int i) { return i>=0 && i<4; }
609	inline void my_memcpy(char dest, const char source, size_t length) { for (size_t l=0; l<length; l++) { dest[l] = source[l]; } }
610
611	inline const char buildMixedCodon(const char con1, const char *con2) {
612	int mismatches = 0;
613	int mismatch_index = -1;
614	static char buf[4];
615
616	for (int i=0; i<3; i++) {
617	if (con1[i]!=con2[i]) {
618	mismatches++;
619	mismatch_index = i;
620	}
621	else {
622	buf[i] = con1[i];
623	}
624	}
625
626	if (mismatches==1) { // exactly one position differs between codons
627	pn_assert(mismatch_index!=-1);
628	buf[mismatch_index] = iupac::combine(con1[mismatch_index], con2[mismatch_index], GB_AT_DNA);
629	buf[3] = 0;
630	return buf;
631	}
632	return 0;
633	}
634
635	static int expandMore(const char bufferStart, int no_of_condons, char&to_buffer) {
636	int i, j;
637	const char con1, con2;
638	int added = 0;
639
640	for (i=0; i<no_of_condons; i++) {
641	con1 = bufferStart+3*i;
642
643	for (j=i+1; j<no_of_condons; j++) {
644	con2 = bufferStart+3*j;
645	const char *result = buildMixedCodon(con1, con2);
646	if (result) {
647	to_buffer[0] = 0;
648	// do we already have this codon?
649	const char *found;
650	const char *startSearch = bufferStart;
651	for (;;) {
652	found = strstr(startSearch, result);
653	if (!found) break;
654	int pos = (found-bufferStart);
655	if ((pos%3)==0) break; // yes already here!
656	startSearch = found+1; // was misaligned -> try behind
657	}
658
659	if (!found) {
660	my_memcpy(to_buffer, result, 3); to_buffer+=3;
661	added++;
662	}
663	}
664	}
665	}
666	return no_of_condons+added;
667	}
668
669	int Codon_Group::expand(char *to_buffer) const {
670	int count = 0;
671	int i;
672	char *org_to_buffer = to_buffer;
673
674	for (i=0; i<AWT_MAX_CODONS; i++) {
675	if (codon[i]) {
676	build_codon(i, to_buffer);
677	to_buffer += 3;
678	count++;
679	}
680	}
681
682	#if defined(DEBUG) && 0
683	to_buffer[0] = 0;
684	printf("codons = '%s'\n", org_to_buffer);
685	#endif
686
687	for (;;) {
688	int new_count = expandMore(org_to_buffer, count, to_buffer);
689	if (new_count==count) break; // nothing expanded -> done
690	count = new_count;
691	#if defined(DEBUG) && 0
692	to_buffer[0] = 0;
693	printf("codons (expandedMore) = '%s'\n", org_to_buffer);
694	#endif
695	}
696
697	pn_assert(count==(int(to_buffer-org_to_buffer)/3));
698
699	return count;
700	}
701
702	// --------------------------------------------------------------------------------
703
704	static Codon_Group *get_Codon_Group(char protein, int code_nr) {
705	pn_assert(code_nr>=0 && code_nr<AWT_CODON_TABLES);
706	protein = toupper(protein);
707	pn_assert(isalpha(protein) \|\| protein=='*');
708	pn_assert(codon_tables_initialized);
709
710	Codon_Group *cgroup = 0;
711
712	if (protein=='B') {
713	cgroup = new Codon_Group('D', code_nr);
714	Codon_Group N('N', code_nr);
715	*cgroup += N;
716	}
717	else if (protein=='Z') {
718	cgroup = new Codon_Group('E', code_nr);
719	Codon_Group Q('Q', code_nr);
720	*cgroup += Q;
721	}
722	else {
723	cgroup = new Codon_Group(protein, code_nr);
724	}
725
726	pn_assert(cgroup);
727
728	return cgroup;
729	}
730
731	#define MAX_CODON_LIST_LENGTH (70*3)
732
733	// get a list of all codons ("xyzxyzxyz...") encoding 'protein' in case we use Codon-Code 'code_nr'
734	// (includes all completely contained IUPAC-encoded codons at the end of list)
735	const char *AP_get_codons(char protein, int code_nr) {
736	Codon_Group *cgroup = get_Codon_Group(protein, code_nr);
737
738	static char buffer[MAX_CODON_LIST_LENGTH+1];
739	int offset = 3*cgroup->expand(buffer);
740	pn_assert(offset<MAX_CODON_LIST_LENGTH);
741	buffer[offset] = 0;
742
743	delete cgroup;
744
745	return buffer;
746	}
747

Note: See TracBrowser for help on using the repository browser.

Download in other formats: