1 | // =============================================================== // |
---|
2 | // // |
---|
3 | // File : AP_codon_table.cxx // |
---|
4 | // Purpose : codon definitions for DNA -> AA translation // |
---|
5 | // // |
---|
6 | // Coded by Ralf Westram (coder@reallysoft.de) in January 2010 // |
---|
7 | // Institute of Microbiology (Technical University Munich) // |
---|
8 | // http://www.arb-home.de/ // |
---|
9 | // // |
---|
10 | // =============================================================== // |
---|
11 | |
---|
12 | #include "AP_codon_table.hxx" |
---|
13 | #include "AP_pro_a_nucs.hxx" |
---|
14 | #include "iupac.h" |
---|
15 | |
---|
16 | #include <arb_global_defs.h> |
---|
17 | #include <arb_str.h> |
---|
18 | |
---|
19 | #include <cctype> |
---|
20 | |
---|
// assertion macro used throughout this file (forwards to arb_assert)
#define pn_assert(cond) arb_assert(cond)

// EMBL transl_table number of the 'Bacterial and Plant Plastid code'
#define EMBL_BACTERIAL_TABLE_INDEX 11

// maximum length of a code-table name
// (increasing this limit forces GUI re-layout; look4: AWT_get_codon_code_name)
#define AWT_CODON_TABLE_MAX_NAME_LENGTH 57

// all possible translations
#define VALID_PROTEIN "ABCDEFGHIJKLMNPQRSTVWXYZ*"
// same as VALID_PROTEIN w/o 'X'
#define VALID_PROTEIN_NO_X "ABCDEFGHIJKLMNPQRSTVWYZ*"
---|
28 | |
---|
29 | // ---------------------------------------------------------------------------------------------------- |
---|
30 | // |
---|
31 | // Info about translation codes was taken from |
---|
32 | // http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi |
---|
33 | // and |
---|
34 | // https://en.wikipedia.org/wiki/List_of_genetic_codes |
---|
35 | // |
---|
36 | // Whenever adding new or correcting existing code tables, please |
---|
37 | // - check data on NCBI webpage mentioned above |
---|
38 | // - document last update in ../../HELP_SOURCE/source/transl_table.hlp@LAST_UPDATE_FROM_WEBPAGE |
---|
39 | // |
---|
40 | // ---------------------------------------------------------------------------------------------------- |
---|
41 | |
---|
42 | static AWT_Codon_Code_Definition AWT_codon_def[AWT_CODON_TABLES+1] = |
---|
43 | { |
---|
44 | // 0000000000111111111122222222223333333333444444444455555555556666 codon number (0-63) |
---|
45 | // 0123456789012345678901234567890123456789012345678901234567890123 |
---|
46 | // |
---|
47 | // "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG", base1 |
---|
48 | // "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG", base2 |
---|
49 | // "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG" base3 |
---|
50 | { |
---|
51 | " (1) Standard code", |
---|
52 | "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", // The first code in this table has to be 'Standard code'! |
---|
53 | "---M------**--*----M---------------M----------------------------", |
---|
54 | 1 // arb:0 |
---|
55 | }, |
---|
56 | { |
---|
57 | " (2) Vertebrate mitochondrial code", |
---|
58 | "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG", |
---|
59 | "----------**--------------------MMMM----------**---M------------", |
---|
60 | 2 // arb:1 |
---|
61 | }, |
---|
62 | { |
---|
63 | " (3) Yeast mitochondrial code", |
---|
64 | "FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG", |
---|
65 | "----------**----------------------MM----------------------------", |
---|
66 | 3 // arb:2 |
---|
67 | }, |
---|
68 | // " (X) 6789012345678901234567890123456789012345678901234567", // max.name length (57) |
---|
69 | { |
---|
70 | " (4) Coelenterate Mitochondrial + Mycoplasma/Spiroplasma", |
---|
71 | "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", |
---|
72 | "--MM------**-------M------------MMMM---------------M------------", |
---|
73 | 4 // arb:3 |
---|
74 | }, |
---|
75 | { |
---|
76 | " (5) Invertebrate mitochondrial code", |
---|
77 | "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG", |
---|
78 | "---M------**--------------------MMMM---------------M------------", |
---|
79 | 5 // arb:4 |
---|
80 | }, |
---|
81 | { |
---|
82 | " (6) Ciliate, Dasycladacean and Hexamita nuclear code", |
---|
83 | "FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", |
---|
84 | "--------------*--------------------M----------------------------", |
---|
85 | 6 // arb:5 |
---|
86 | }, |
---|
87 | { |
---|
88 | " (9) Echinoderm and Flatworm mitochondrial code", |
---|
89 | "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG", |
---|
90 | "----------**-----------------------M---------------M------------", |
---|
91 | 9 // arb:6 |
---|
92 | }, |
---|
93 | { |
---|
94 | "(10) Euplotid nuclear code", |
---|
95 | "FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", |
---|
96 | "----------**-----------------------M----------------------------", |
---|
97 | 10 // arb:7 |
---|
98 | }, |
---|
99 | // 0000000001111111111222222222233333333334444444444555555555566666 |
---|
100 | // 1234567890123456789012345678901234567890123456789012345678901234 |
---|
101 | |
---|
102 | // "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG", base1 |
---|
103 | // "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG", base2 |
---|
104 | // "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG" base3 |
---|
105 | { |
---|
106 | "(11) Bacterial and Plant Plastid code", |
---|
107 | "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", |
---|
108 | "---M------**--*----M------------MMMM---------------M------------", |
---|
109 | 11 // arb:8 |
---|
110 | }, |
---|
111 | { |
---|
112 | "(12) Alternative Yeast nuclear code", |
---|
113 | "FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", |
---|
114 | "----------**--*----M---------------M----------------------------", |
---|
115 | 12 // arb:9 |
---|
116 | }, |
---|
117 | { |
---|
118 | "(13) Ascidian mitochondrial code", |
---|
119 | "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG", |
---|
120 | "---M------**----------------------MM---------------M------------", |
---|
121 | 13 // arb:10 |
---|
122 | }, |
---|
123 | { |
---|
124 | "(14) Alternative Flatworm mitochondrial code", |
---|
125 | "FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG", |
---|
126 | "-----------*-----------------------M----------------------------", |
---|
127 | 14 // arb:11 |
---|
128 | }, |
---|
129 | { |
---|
130 | "(15) Blepharisma nuclear code (deleted?)", // why is it no longer listed at NCBI? |
---|
131 | "FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", |
---|
132 | "----------*---*--------------------M----------------------------", // converted to new format manually (no source) |
---|
133 | 15 // arb:12 |
---|
134 | }, |
---|
135 | { |
---|
136 | "(16) Chlorophycean mitochondrial code", |
---|
137 | "FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", |
---|
138 | "----------*---*--------------------M----------------------------", |
---|
139 | 16 // arb:13 |
---|
140 | }, |
---|
141 | { |
---|
142 | "(21) Trematode mitochondrial code", |
---|
143 | "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG", |
---|
144 | "----------**-----------------------M---------------M------------", |
---|
145 | 21 // arb:14 |
---|
146 | }, |
---|
147 | { |
---|
148 | "(22) Scenedesmus obliquus mitochondrial code", |
---|
149 | "FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", |
---|
150 | "------*---*---*--------------------M----------------------------", |
---|
151 | 22 // arb:15 |
---|
152 | }, |
---|
153 | { |
---|
154 | "(23) Thraustochytrium mitochondrial code", |
---|
155 | "FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", |
---|
156 | "--*-------**--*-----------------M--M---------------M------------", |
---|
157 | 23 // arb:16 |
---|
158 | }, |
---|
159 | { |
---|
160 | "(24) Pterobranchia Mitochondrial Code", |
---|
161 | "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSSKVVVVAAAADDEEGGGG", |
---|
162 | "---M------**-------M---------------M---------------M------------", |
---|
163 | 24 // arb:17 |
---|
164 | }, |
---|
165 | { |
---|
166 | "(25) Candidate Division SR1 and Gracilibacteria Code", |
---|
167 | "FFLLSSSSYY**CCGWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", |
---|
168 | "---M------**-----------------------M---------------M------------", |
---|
169 | 25 // arb:18 |
---|
170 | }, |
---|
171 | { |
---|
172 | "(26) Pachysolen tannophilus Nuclear Code", |
---|
173 | "FFLLSSSSYY**CC*WLLLAPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", |
---|
174 | "----------**--*----M---------------M----------------------------", |
---|
175 | 26 // arb:19 |
---|
176 | }, |
---|
177 | { |
---|
178 | "(27) Karyorelict Nuclear", |
---|
179 | "FFLLSSSSYYQQCCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", |
---|
180 | "--------------*--------------------M----------------------------", |
---|
181 | 27 // arb:20 |
---|
182 | }, |
---|
183 | { |
---|
184 | "(28) Condylostoma Nuclear", |
---|
185 | "FFLLSSSSYYQQCCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", |
---|
186 | "----------**--*--------------------M----------------------------", |
---|
187 | 28 // arb:21 |
---|
188 | }, |
---|
189 | { |
---|
190 | "(29) Mesodinium Nuclear", |
---|
191 | "FFLLSSSSYYYYCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", |
---|
192 | "--------------*--------------------M----------------------------", |
---|
193 | 29 // arb:22 |
---|
194 | }, |
---|
195 | { |
---|
196 | "(30) Peritrich Nuclear", |
---|
197 | "FFLLSSSSYYEECC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", |
---|
198 | "--------------*--------------------M----------------------------", |
---|
199 | 30 // arb:23 |
---|
200 | }, |
---|
201 | { |
---|
202 | "(31) Blastocrithidia Nuclear", |
---|
203 | "FFLLSSSSYYEECCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", |
---|
204 | "----------**-----------------------M----------------------------", |
---|
205 | 31 // arb:24 |
---|
206 | }, |
---|
207 | |
---|
208 | { NULp, NULp, NULp, 0 } // end of table-marker |
---|
209 | }; |
---|
210 | |
---|
211 | // When adding new genetic code: |
---|
212 | // - increase AP_codon_table.hxx@AWT_CODON_TABLES |
---|
213 | // - increase .@MAX_EMBL_TRANSL_TABLE_VALUE |
---|
214 | // - add arb-codenr to .@ALL_TABLES |
---|
215 | |
---|
// maximum known EMBL transl_table value (has to be the number of a table defined in AWT_codon_def)
#define MAX_EMBL_TRANSL_TABLE_VALUE 31
---|
217 | |
---|
218 | // -------------------------------------------------------------------------------- |
---|
219 | |
---|
220 | int TTIT_embl2arb(int embl_code_nr) { |
---|
221 | // returns -1 if embl_code_nr is not known by ARB |
---|
222 | |
---|
223 | static bool initialized = false; |
---|
224 | static int arb_code_nr_table[MAX_EMBL_TRANSL_TABLE_VALUE+1]; // key: embl_code_nr, value: arb_code_nr or -1 |
---|
225 | |
---|
226 | if (!initialized) { |
---|
227 | for (int embl = 0; embl <= MAX_EMBL_TRANSL_TABLE_VALUE; ++embl) { |
---|
228 | arb_code_nr_table[embl] = -1; // illegal table |
---|
229 | } |
---|
230 | for (int arb_code_nr = 0; arb_code_nr < AWT_CODON_TABLES; ++arb_code_nr) { |
---|
231 | int embl = AWT_codon_def[arb_code_nr].embl_feature_transl_table; |
---|
232 | |
---|
233 | pn_assert(embl<=MAX_EMBL_TRANSL_TABLE_VALUE); // defined embl code is above limit |
---|
234 | pn_assert(arb_code_nr_table[embl] == -1); // duplicate definition of EMBL table number |
---|
235 | |
---|
236 | arb_code_nr_table[embl] = arb_code_nr; |
---|
237 | } |
---|
238 | // should be index of 'Bacterial and Plant Plastid code' |
---|
239 | // (otherwise maybe AWAR_PROTEIN_TYPE_bacterial_code_index is wrong) |
---|
240 | pn_assert(arb_code_nr_table[EMBL_BACTERIAL_TABLE_INDEX] == AWAR_PROTEIN_TYPE_bacterial_code_index); |
---|
241 | pn_assert(arb_code_nr_table[1] == 0); // Standard code has to be on index zero! |
---|
242 | pn_assert(arb_code_nr_table[MAX_EMBL_TRANSL_TABLE_VALUE] != -1); // arb_code_nr_table is defined too big |
---|
243 | |
---|
244 | initialized = true; |
---|
245 | } |
---|
246 | |
---|
247 | if (embl_code_nr<0 || embl_code_nr>MAX_EMBL_TRANSL_TABLE_VALUE) return -1; |
---|
248 | |
---|
249 | int arb_code_nr = arb_code_nr_table[embl_code_nr]; |
---|
250 | #ifdef DEBUG |
---|
251 | if (arb_code_nr != -1) { |
---|
252 | pn_assert(arb_code_nr >= 0 && arb_code_nr < AWT_CODON_TABLES); |
---|
253 | pn_assert(TTIT_arb2embl(arb_code_nr) == embl_code_nr); |
---|
254 | } |
---|
255 | #endif |
---|
256 | return arb_code_nr; |
---|
257 | } |
---|
258 | |
---|
259 | int TTIT_arb2embl(int arb_code_nr) { |
---|
260 | pn_assert(arb_code_nr >= 0 && arb_code_nr<AWT_CODON_TABLES); |
---|
261 | return AWT_codon_def[arb_code_nr].embl_feature_transl_table; |
---|
262 | } |
---|
263 | |
---|
264 | |
---|
265 | static bool codon_tables_initialized = false; |
---|
266 | static char definite_translation[AWT_MAX_CODONS]; // contains 0 if ambiguous, otherwise it contains the definite translation |
---|
267 | static char *ambiguous_codons[AWT_MAX_CODONS]; // for each ambiguous codon: contains all translations (each only once) |
---|
268 | |
---|
269 | static void addToAmbiguous(int codon_nr, char possible_translation) { |
---|
270 | static uint8_t length[AWT_MAX_CODONS]; |
---|
271 | |
---|
272 | char*& ambEntry = ambiguous_codons[codon_nr]; |
---|
273 | uint8_t& ambLen = length[codon_nr]; |
---|
274 | |
---|
275 | if (!ambEntry) { // first insert |
---|
276 | ambEntry = ARB_calloc<char>(AWT_MAX_CODONS+1); |
---|
277 | ambEntry[0] = possible_translation; |
---|
278 | ambLen = 1; |
---|
279 | } |
---|
280 | else if (!strchr(ambEntry, possible_translation)) { |
---|
281 | ambEntry[ambLen++] = possible_translation; |
---|
282 | } |
---|
283 | } |
---|
284 | |
---|
285 | void AP_initialize_codon_tables() { |
---|
286 | if (codon_tables_initialized) return; |
---|
287 | |
---|
288 | int codon_nr; |
---|
289 | int code_nr; |
---|
290 | |
---|
291 | for (codon_nr=0; codon_nr<AWT_MAX_CODONS; codon_nr++) { |
---|
292 | ambiguous_codons[codon_nr] = NULp; |
---|
293 | } |
---|
294 | |
---|
295 | pn_assert(AWT_CODON_TABLES>=1); |
---|
296 | pn_assert(!AWT_codon_def[AWT_CODON_TABLES].aa); // Error in AWT_codon_def or AWT_CODON_CODES |
---|
297 | |
---|
298 | for (code_nr=0; code_nr<AWT_CODON_TABLES; code_nr++) { |
---|
299 | const char *translation = AWT_codon_def[code_nr].aa; |
---|
300 | const char *startStop = AWT_codon_def[code_nr].startStop; |
---|
301 | |
---|
302 | pn_assert(strlen(AWT_codon_def[code_nr].name) <= AWT_CODON_TABLE_MAX_NAME_LENGTH); // GUI layout depends on max. name length |
---|
303 | |
---|
304 | for (codon_nr=0; codon_nr<AWT_MAX_CODONS; codon_nr++) { |
---|
305 | bool isOptionalStartStop = false; |
---|
306 | |
---|
307 | // check definition of 'translation' and 'startStop' is consistent: |
---|
308 | switch (startStop[codon_nr]) { |
---|
309 | case 'M': // defined as start-codon |
---|
310 | pn_assert(translation[codon_nr] != '*'); // invalid def: stop AND start |
---|
311 | isOptionalStartStop = translation[codon_nr] != 'M'; |
---|
312 | break; |
---|
313 | |
---|
314 | case '*': // defined as stop-codon (new def style) |
---|
315 | pn_assert(translation[codon_nr] != 'M'); // invalid def: start AND stop |
---|
316 | isOptionalStartStop = translation[codon_nr] != '*'; |
---|
317 | break; |
---|
318 | |
---|
319 | case '-': // neither start nor stop (new def style) not start (old def style) |
---|
320 | pn_assert(translation[codon_nr] != '*'); // invalid def: stop codons have to be marked in 'Starts' definition |
---|
321 | break; |
---|
322 | |
---|
323 | default: |
---|
324 | pn_assert(0); // invalid character in startStop |
---|
325 | break; |
---|
326 | } |
---|
327 | |
---|
328 | // detect definite/ambiguous translations: |
---|
329 | if (code_nr == 0) { // first table (no ambiguity possible yet) |
---|
330 | if (isOptionalStartStop) { |
---|
331 | addToAmbiguous(codon_nr, translation[codon_nr]); |
---|
332 | addToAmbiguous(codon_nr, startStop[codon_nr]); |
---|
333 | definite_translation[codon_nr] = 0; |
---|
334 | } |
---|
335 | else { |
---|
336 | definite_translation[codon_nr] = translation[codon_nr]; |
---|
337 | } |
---|
338 | } |
---|
339 | else if (definite_translation[codon_nr]) { // is definite till now |
---|
340 | if (definite_translation[codon_nr] != translation[codon_nr] || isOptionalStartStop) { // we found a different translation |
---|
341 | addToAmbiguous(codon_nr, definite_translation[codon_nr]); |
---|
342 | addToAmbiguous(codon_nr, translation[codon_nr]); |
---|
343 | if (isOptionalStartStop) addToAmbiguous(codon_nr, startStop[codon_nr]); |
---|
344 | definite_translation[codon_nr] = 0; |
---|
345 | } |
---|
346 | } |
---|
347 | else { // is ambiguous |
---|
348 | addToAmbiguous(codon_nr, translation[codon_nr]); |
---|
349 | if (isOptionalStartStop) addToAmbiguous(codon_nr, startStop[codon_nr]); |
---|
350 | } |
---|
351 | } |
---|
352 | } |
---|
353 | |
---|
354 | codon_tables_initialized = true; |
---|
355 | } |
---|
356 | |
---|
// Convert a nucleotide (upper or lower case; 'U' is handled like 'T')
// into its index in "TCAG".
// return 0..3 (ok) or 4 (failure)
inline int dna2idx(char c) {
    if (c == 'T' || c == 't' || c == 'U' || c == 'u') return 0;
    if (c == 'C' || c == 'c') return 1;
    if (c == 'A' || c == 'a') return 2;
    if (c == 'G' || c == 'g') return 3;
    return 4;
}
---|
368 | |
---|
369 | inline char idx2dna(int idx) { |
---|
370 | pn_assert(idx>=0 && idx<4); |
---|
371 | return "TCAG"[idx]; |
---|
372 | } |
---|
373 | |
---|
374 | inline int calc_codon_nr(const char *dna) { |
---|
375 | int i1 = dna2idx(dna[0]); if (i1 == 4) return AWT_MAX_CODONS; // is not a codon |
---|
376 | int i2 = dna2idx(dna[1]); if (i2 == 4) return AWT_MAX_CODONS; |
---|
377 | int i3 = dna2idx(dna[2]); if (i3 == 4) return AWT_MAX_CODONS; |
---|
378 | |
---|
379 | int codon_nr = i1*16 + i2*4 + i3; |
---|
380 | pn_assert(codon_nr>=0 && codon_nr<=AWT_MAX_CODONS); |
---|
381 | return codon_nr; |
---|
382 | } |
---|
383 | |
---|
384 | inline void build_codon(int codon_nr, char *to_buffer) { |
---|
385 | pn_assert(codon_nr>=0 && codon_nr<AWT_MAX_CODONS); |
---|
386 | |
---|
387 | to_buffer[0] = idx2dna((codon_nr>>4)&3); |
---|
388 | to_buffer[1] = idx2dna((codon_nr>>2)&3); |
---|
389 | to_buffer[2] = idx2dna(codon_nr&3); |
---|
390 | } |
---|
391 | |
---|
392 | const char* AWT_get_codon_code_name(int code) { |
---|
393 | pn_assert(code>=0 && code<AWT_CODON_TABLES); |
---|
394 | return AWT_codon_def[code].name; |
---|
395 | } |
---|
396 | |
---|
397 | static const char *aa_3letter_name[26+1] = { |
---|
398 | "Ala", // A |
---|
399 | "Asx", // B (= D or N) |
---|
400 | "Cys", // C |
---|
401 | "Asp", // D |
---|
402 | "Glu", // E |
---|
403 | "Phe", // F |
---|
404 | "Gly", // G |
---|
405 | "His", // H |
---|
406 | "Ile", // I |
---|
407 | "Xle", // J (= I or L) |
---|
408 | "Lys", // K |
---|
409 | "Leu", // L |
---|
410 | "Met", // M |
---|
411 | "Asn", // N |
---|
412 | NULp, // O |
---|
413 | "Pro", // P |
---|
414 | "Gln", // Q |
---|
415 | "Arg", // R |
---|
416 | "Ser", // S |
---|
417 | "Thr", // T |
---|
418 | NULp, // U |
---|
419 | "Val", // V |
---|
420 | "Trp", // W |
---|
421 | "Xaa", // X |
---|
422 | "Tyr", // Y |
---|
423 | "Glx", // Z (= E or Q) |
---|
424 | NULp |
---|
425 | }; |
---|
426 | |
---|
427 | const char *getAminoAcidAbbr(char aa) { |
---|
428 | if (aa=='*') return "End"; |
---|
429 | aa = toupper(aa); |
---|
430 | if (aa>='A' && aa<='Z') return aa_3letter_name[aa-'A']; |
---|
431 | return NULp; |
---|
432 | } |
---|
433 | |
---|
434 | #ifdef DEBUG |
---|
435 | |
---|
436 | inline char nextBase(char c) { |
---|
437 | switch (c) { |
---|
438 | case 'T': return 'C'; |
---|
439 | case 'C': return 'A'; |
---|
440 | case 'A': return 'G'; |
---|
441 | #if 0 |
---|
442 | case 'G': return 0; |
---|
443 | #else |
---|
444 | case 'G': return 'M'; |
---|
445 | case 'M': return 'R'; |
---|
446 | case 'R': return 'W'; |
---|
447 | case 'W': return 'S'; |
---|
448 | case 'S': return 'Y'; |
---|
449 | case 'Y': return 'K'; |
---|
450 | case 'K': return 'V'; |
---|
451 | case 'V': return 'H'; |
---|
452 | case 'H': return 'D'; |
---|
453 | case 'D': return 'B'; |
---|
454 | case 'B': return 'N'; |
---|
455 | case 'N': return 0; |
---|
456 | #endif |
---|
457 | default: pn_assert(0); |
---|
458 | } |
---|
459 | return 0; |
---|
460 | } |
---|
461 | |
---|
462 | void AWT_dump_codons(TranslationTableIndexType type, bool skipX) { |
---|
463 | // use for debugging |
---|
464 | |
---|
465 | const TransTables all_allowed; |
---|
466 | |
---|
467 | for (char c='*'; c<='Z'; c++) { |
---|
468 | printf("Codons for '%c': ", c); |
---|
469 | |
---|
470 | if (skipX && c == 'X') { |
---|
471 | fputs("skipped", stdout); |
---|
472 | } |
---|
473 | else { |
---|
474 | bool first_line = true; |
---|
475 | bool found = false; |
---|
476 | for (char b1='T'; b1; b1=nextBase(b1)) { |
---|
477 | for (char b2='T'; b2; b2=nextBase(b2)) { |
---|
478 | for (char b3='T'; b3; b3=nextBase(b3)) { |
---|
479 | char dna[4]; |
---|
480 | dna[0]=b1; |
---|
481 | dna[1]=b2; |
---|
482 | dna[2]=b3; |
---|
483 | dna[3]=0; |
---|
484 | |
---|
485 | TransTables remaining; |
---|
486 | if (AWT_is_codon(c, dna, all_allowed, remaining)) { |
---|
487 | if (!first_line) fputs("\n ", stdout); |
---|
488 | first_line = false; |
---|
489 | printf("%s (%s)", dna, remaining.to_string(type)); |
---|
490 | found = true; |
---|
491 | } |
---|
492 | } |
---|
493 | } |
---|
494 | } |
---|
495 | if (!found) fputs("none", stdout); |
---|
496 | } |
---|
497 | fputs("\n", stdout); |
---|
498 | if (c=='*') c='A'-1; |
---|
499 | } |
---|
500 | } |
---|
501 | #endif |
---|
502 | |
---|
503 | inline char isStartOrStopCodonNr(int codon_nr, int code_nr) { |
---|
504 | char isStartStop = 0; |
---|
505 | pn_assert(code_nr >= 0 && code_nr<AWT_CODON_TABLES); |
---|
506 | |
---|
507 | pn_assert(codon_nr != AWT_MAX_CODONS); // should not be called with IUPAC codons |
---|
508 | pn_assert(codon_nr >= 0 && codon_nr<AWT_MAX_CODONS); // (use isStartOrStopCodon, isStartCodon or isStopCodon) |
---|
509 | |
---|
510 | if (codon_nr != AWT_MAX_CODONS) { // 'codon' is a clean codon (it contains no iupac-codes) |
---|
511 | isStartStop = AWT_codon_def[code_nr].startStop[codon_nr]; |
---|
512 | if (isStartStop == '-') { |
---|
513 | isStartStop = 0; |
---|
514 | } |
---|
515 | } |
---|
516 | |
---|
517 | arb_assert(implicated(isStartStop, isStartStop == '*' || isStartStop == 'M')); |
---|
518 | return isStartStop; |
---|
519 | } |
---|
520 | |
---|
521 | char AWT_translator::isStartOrStopCodon(const char *codon) const { |
---|
522 | /*! test whether 'codon' is a start- or stop-codon. |
---|
523 | * @param codon three bases definining the codon |
---|
524 | * @return '*' for stop-codons, 'M' for start-codons, 0 otherwise |
---|
525 | */ |
---|
526 | |
---|
527 | char result = 0; |
---|
528 | int codon_nr = calc_codon_nr(codon); |
---|
529 | if (codon_nr == AWT_MAX_CODONS) { // codon contains iupac codes (rare case -> brute force implementation ok) |
---|
530 | TransTables allowed; |
---|
531 | allowed.forbidAllBut(CodeNr()); |
---|
532 | TransTables remaining = allowed; |
---|
533 | |
---|
534 | bool is_start = AWT_is_codon('M', codon, allowed, remaining, NULp); |
---|
535 | bool is_stop = is_start ? false : AWT_is_codon('*', codon, allowed, remaining, NULp); |
---|
536 | |
---|
537 | pn_assert(!(is_start && is_stop)); |
---|
538 | result = is_start ? 'M' : (is_stop ? '*' : 0); |
---|
539 | } |
---|
540 | else { // codon is a clean codon |
---|
541 | result = isStartOrStopCodonNr(calc_codon_nr(codon), code_nr); |
---|
542 | } |
---|
543 | return result; |
---|
544 | } |
---|
545 | |
---|
546 | inline bool protMatches(char p1, char p2) { |
---|
547 | /*! return true if p1 matches p2 |
---|
548 | * @param p1 "normal" protein (neither B, Z nor J) |
---|
549 | * @param p2 any protein (B, Z and J ok) |
---|
550 | * B is a shortcut for Asp(=D) or Asn(=N) |
---|
551 | * J is a shortcut for Ile(=I) or Leu(=L) |
---|
552 | * Z is a shortcut for Glu(=E) or Gln(=Q) |
---|
553 | */ |
---|
554 | pn_assert(p1 != 'B' && p1 != 'Z' && p1 != 'J'); |
---|
555 | pn_assert(p1 == toupper(p1)); |
---|
556 | pn_assert(p2 == toupper(p2)); |
---|
557 | |
---|
558 | if (p1 == p2) return true; |
---|
559 | if (p2 == 'B') return p1 == 'D' || p1 == 'N'; |
---|
560 | if (p2 == 'J') return p1 == 'I' || p1 == 'L'; |
---|
561 | if (p2 == 'Z') return p1 == 'E' || p1 == 'Q'; |
---|
562 | return false; |
---|
563 | } |
---|
564 | inline bool containsProtMatching(const char *pstr, char p) { |
---|
565 | /*! return true, if 'pstr' contains any protein that matches 'p'. |
---|
566 | * uses same logic as protMatches() |
---|
567 | */ |
---|
568 | pn_assert(p == toupper(p)); |
---|
569 | if (p == 'B') return strchr(pstr, 'D') || strchr(pstr, 'N'); |
---|
570 | if (p == 'J') return strchr(pstr, 'I') || strchr(pstr, 'L'); |
---|
571 | if (p == 'Z') return strchr(pstr, 'E') || strchr(pstr, 'Q'); |
---|
572 | return strchr(pstr, p); |
---|
573 | } |
---|
574 | inline bool isGap(char c) { return GAP::is_std_gap(c); } |
---|
575 | |
---|
576 | inline GB_ERROR neverTranslatesError(const char *dna, char protein) { |
---|
577 | if (!strchr(VALID_PROTEIN, protein)) { |
---|
578 | return GBS_global_string("'%c' is no valid amino acid", protein); |
---|
579 | } |
---|
580 | return GBS_global_string("'%c%c%c' never translates to '%c'", dna[0], dna[1], dna[2], protein); |
---|
581 | } |
---|
582 | |
---|
583 | bool AWT_is_codon(char protein, const char *const dna, const TransTables& allowed, TransTables& remaining, const char **fail_reason_ptr) { |
---|
584 | /*! test if 'dna' codes 'protein' |
---|
585 | * @param protein amino acid |
---|
586 | * @param dna three nucleotides (gaps allowed, e.g. 'A-C' can be tested vs 'X') |
---|
587 | * @param allowed allowed translation tables |
---|
588 | * @param remaining returns the remaining allowed translation tables (only if this functions returns true) |
---|
589 | * @param fail_reason_ptr if not NULp => store reason for failure here (or set it to NULp on success) |
---|
590 | * @return true if dna translates to protein |
---|
591 | */ |
---|
592 | |
---|
593 | pn_assert(allowed.any()); |
---|
594 | pn_assert(codon_tables_initialized); |
---|
595 | |
---|
596 | const char *fail_reason = NULp; |
---|
597 | if (fail_reason_ptr) *fail_reason_ptr = NULp; |
---|
598 | |
---|
599 | bool is_codon = false; |
---|
600 | int codon_nr = calc_codon_nr(dna); |
---|
601 | int first_iupac_pos = -1; |
---|
602 | int iupac_positions = 0; |
---|
603 | bool decided = false; |
---|
604 | bool general_failure = false; |
---|
605 | |
---|
606 | protein = toupper(protein); |
---|
607 | |
---|
608 | if (codon_nr==AWT_MAX_CODONS) { // dna is not a clean codon (i.e. it contains iupac-codes or gaps) |
---|
609 | bool too_short = false; |
---|
610 | int nucs_seen = 0; |
---|
611 | for (int iupac_pos=0; iupac_pos<3 && !too_short && !fail_reason; iupac_pos++) { |
---|
612 | char N = dna[iupac_pos]; |
---|
613 | |
---|
614 | if (!N) too_short = true; |
---|
615 | else if (!isGap(N)) { |
---|
616 | nucs_seen++; |
---|
617 | if (!strchr("ACGTU", N)) { |
---|
618 | if (first_iupac_pos==-1) first_iupac_pos = iupac_pos; |
---|
619 | iupac_positions++; |
---|
620 | const char *decoded_iupac = iupac::decode(N, GB_AT_DNA, 0); |
---|
621 | if (!decoded_iupac[0]) { // no valid IUPAC |
---|
622 | fail_reason = GBS_global_string("Invalid character '%c' in DNA", N); |
---|
623 | } |
---|
624 | } |
---|
625 | } |
---|
626 | } |
---|
627 | |
---|
628 | if (!fail_reason && !nucs_seen) { // got no dna |
---|
629 | fail_reason = "No nucleotides left"; |
---|
630 | } |
---|
631 | else if (nucs_seen<3) { |
---|
632 | too_short = true; |
---|
633 | } |
---|
634 | |
---|
635 | if (fail_reason) { |
---|
636 | decided = true; // fails for all proteins |
---|
637 | } |
---|
638 | else if (too_short) { |
---|
639 | decided = true; |
---|
640 | if (protein == 'X') { |
---|
641 | is_codon = true; |
---|
642 | } |
---|
643 | else { |
---|
644 | char dna_copy[4]; |
---|
645 | strncpy(dna_copy, dna, 3); |
---|
646 | dna_copy[3] = 0; |
---|
647 | |
---|
648 | fail_reason = GBS_global_string("Not enough nucleotides (got '%s')", dna_copy); |
---|
649 | } |
---|
650 | } |
---|
651 | } |
---|
652 | |
---|
653 | if (!decided) { |
---|
654 | if (protein == 'X') { |
---|
655 | TransTables allowed_copy = allowed; |
---|
656 | const char *valid_prot = VALID_PROTEIN_NO_X; |
---|
657 | |
---|
658 | for (int i = 0; valid_prot[i]; ++i) { |
---|
659 | if (AWT_is_codon(valid_prot[i], dna, allowed_copy, remaining)) { |
---|
660 | allowed_copy.forbid(remaining); |
---|
661 | if (allowed_copy.none()) break; |
---|
662 | } |
---|
663 | } |
---|
664 | |
---|
665 | if (allowed_copy.any()) { |
---|
666 | is_codon = true; |
---|
667 | remaining = allowed_copy; |
---|
668 | } |
---|
669 | else { |
---|
670 | fail_reason = neverTranslatesError(dna, protein); |
---|
671 | } |
---|
672 | } |
---|
673 | else if (codon_nr==AWT_MAX_CODONS) { // dna is a codon with one or more IUPAC codes |
---|
674 | pn_assert(iupac_positions); |
---|
675 | const char *decoded_iupac = iupac::decode(dna[first_iupac_pos], GB_AT_DNA, 0); |
---|
676 | pn_assert(decoded_iupac[0]); // already should have been catched above |
---|
677 | |
---|
678 | char dna_copy[4]; |
---|
679 | memcpy(dna_copy, dna, 3); |
---|
680 | dna_copy[3] = 0; |
---|
681 | |
---|
682 | bool all_are_codons = true; |
---|
683 | bool one_is_codon = false; |
---|
684 | |
---|
685 | TransTables allowed_copy = allowed; |
---|
686 | |
---|
687 | for (int i=0; decoded_iupac[i]; i++) { |
---|
688 | dna_copy[first_iupac_pos] = decoded_iupac[i]; |
---|
689 | const char *subfail; |
---|
690 | if (!AWT_is_codon(protein, dna_copy, allowed_copy, remaining, &subfail)) { |
---|
691 | all_are_codons = false; |
---|
692 | if (!one_is_codon && ARB_strBeginsWith(subfail, "Not all ")) one_is_codon = true; |
---|
693 | if (one_is_codon) break; |
---|
694 | } |
---|
695 | else { |
---|
696 | one_is_codon = true; |
---|
697 | allowed_copy = remaining; |
---|
698 | } |
---|
699 | } |
---|
700 | |
---|
701 | if (all_are_codons) { |
---|
702 | pn_assert(allowed_copy.any()); |
---|
703 | remaining = allowed_copy; |
---|
704 | is_codon = true; |
---|
705 | } |
---|
706 | else { |
---|
707 | remaining.forbidAll(); |
---|
708 | dna_copy[first_iupac_pos] = dna[first_iupac_pos]; |
---|
709 | if (one_is_codon) { |
---|
710 | fail_reason = GBS_global_string("Not all IUPAC-combinations of '%s' translate to '%c'", dna_copy, protein); // careful when changing this message (see above) |
---|
711 | } |
---|
712 | else { |
---|
713 | fail_reason = neverTranslatesError(dna_copy, protein); |
---|
714 | } |
---|
715 | } |
---|
716 | } |
---|
717 | else if (definite_translation[codon_nr]) { // codon has a definite translation (i.e. translates equal for all code-tables) |
---|
718 | char defTransl = definite_translation[codon_nr]; |
---|
719 | |
---|
720 | #if defined(ASSERTION_USED) |
---|
721 | bool optionalCodonExists = false; |
---|
722 | for (int code_nr=0; code_nr<AWT_CODON_TABLES && !optionalCodonExists; code_nr++) { |
---|
723 | char startStop = isStartOrStopCodonNr(codon_nr, code_nr); |
---|
724 | if (startStop && startStop != defTransl) { // got optional start/stop codon |
---|
725 | if (allowed.is_allowed(code_nr)) { |
---|
726 | pn_assert(startStop == '*' || startStop == 'M'); |
---|
727 | optionalCodonExists = true; |
---|
728 | } |
---|
729 | } |
---|
730 | } |
---|
731 | pn_assert(!optionalCodonExists); // when this fails -> definite_translation[] is wrong |
---|
732 | #endif |
---|
733 | |
---|
734 | int ok = protMatches(defTransl, protein); |
---|
735 | if (ok) { |
---|
736 | remaining = allowed; |
---|
737 | is_codon = true; |
---|
738 | } |
---|
739 | else { |
---|
740 | remaining.forbidAll(); |
---|
741 | fail_reason = GBS_global_string("'%c%c%c' translates to '%c', not to '%c'", dna[0], dna[1], dna[2], defTransl, protein); |
---|
742 | general_failure = true; |
---|
743 | } |
---|
744 | } |
---|
745 | else if (!containsProtMatching(ambiguous_codons[codon_nr], protein)) { // codon does not translate to protein in any code-table |
---|
746 | remaining.forbidAll(); |
---|
747 | fail_reason = neverTranslatesError(dna, protein); |
---|
748 | general_failure = true; |
---|
749 | } |
---|
750 | else { |
---|
751 | #if defined(ASSERTION_USED) |
---|
752 | bool correct_disallowed_translation = false; |
---|
753 | #endif |
---|
754 | |
---|
755 | // Now codon translates to protein in at least 1 code-table! |
---|
756 | // Check whether protein translates in any of the allowed code-tables and forbid rest |
---|
757 | for (int code_nr=0; code_nr<AWT_CODON_TABLES; code_nr++) { |
---|
758 | bool mayTranslate = protMatches(AWT_codon_def[code_nr].aa[codon_nr], protein); |
---|
759 | if (!mayTranslate && (protein == '*' || protein == 'M')) { |
---|
760 | char startOrStop = isStartOrStopCodonNr(codon_nr, code_nr); |
---|
761 | mayTranslate = startOrStop && protMatches(startOrStop, protein); |
---|
762 | } |
---|
763 | |
---|
764 | if (mayTranslate) { // may codon_nr translate to protein for code_nr |
---|
765 | if (allowed.is_allowed(code_nr)) { // is this code allowed? |
---|
766 | remaining.allow(code_nr); |
---|
767 | is_codon = true; |
---|
768 | } |
---|
769 | else { |
---|
770 | remaining.forbid(code_nr); // otherwise forbid code in future |
---|
771 | #if defined(ASSERTION_USED) |
---|
772 | correct_disallowed_translation = true; |
---|
773 | #endif |
---|
774 | } |
---|
775 | } |
---|
776 | else { |
---|
777 | remaining.forbid(code_nr); // otherwise forbid code in future |
---|
778 | } |
---|
779 | } |
---|
780 | |
---|
781 | if (!is_codon) { |
---|
782 | pn_assert(correct_disallowed_translation); // should be true because otherwise we shouldn't run into this else-branch |
---|
783 | fail_reason = GBS_global_string("'%c%c%c' does not translate to '%c'", dna[0], dna[1], dna[2], protein); |
---|
784 | } |
---|
785 | } |
---|
786 | } |
---|
787 | |
---|
788 | if (!is_codon) { |
---|
789 | pn_assert(fail_reason); |
---|
790 | if (fail_reason_ptr) { |
---|
791 | if (!allowed.all() && !general_failure) { |
---|
792 | int one = allowed.explicit_table(); |
---|
793 | if (one == -1) { |
---|
794 | const char *left_tables = allowed.to_string(TTIT_EMBL); |
---|
795 | pn_assert(left_tables[0]); // allowed should never be empty! |
---|
796 | |
---|
797 | fail_reason = GBS_global_string("%s (for any of the leftover trans-tables: %s)", fail_reason, left_tables); |
---|
798 | } |
---|
799 | else { |
---|
800 | int one_embl = TTIT_arb2embl(one); |
---|
801 | fail_reason = GBS_global_string("%s (for trans-table %i)", fail_reason, one_embl); |
---|
802 | } |
---|
803 | } |
---|
804 | |
---|
805 | *fail_reason_ptr = fail_reason; // set failure-reason if requested |
---|
806 | } |
---|
807 | } |
---|
808 | #if defined(ASSERTION_USED) |
---|
809 | else { |
---|
810 | pn_assert(remaining.is_subset_of(allowed)); |
---|
811 | } |
---|
812 | #endif |
---|
813 | return is_codon; |
---|
814 | } |
---|
815 | |
---|
816 | // -------------------------------------------------------------------------------- Codon_Group |
---|
817 | |
---|
818 | #if defined(DEBUG) |
---|
819 | // #define DUMP_CODON_GROUP_EXPANSION |
---|
820 | #endif |
---|
821 | |
---|
822 | class Codon_Group { |
---|
823 | char codon[64]; // index is calculated with calc_codon_nr |
---|
824 | |
---|
825 | public: |
---|
826 | Codon_Group(char protein, int code_nr); |
---|
827 | ~Codon_Group() {} |
---|
828 | |
---|
829 | Codon_Group& operator += (const Codon_Group& other); |
---|
830 | int expand(char *to_buffer) const; |
---|
831 | }; |
---|
832 | |
---|
833 | Codon_Group::Codon_Group(char protein, int code_nr) { |
---|
834 | protein = toupper(protein); |
---|
835 | pn_assert(protein=='*' || isalpha(protein)); |
---|
836 | pn_assert(code_nr>=0 && code_nr<AWT_CODON_TABLES); |
---|
837 | |
---|
838 | const char *amino_table = AWT_codon_def[code_nr].aa; |
---|
839 | for (int i=0; i<AWT_MAX_CODONS; i++) { |
---|
840 | codon[i] = amino_table[i]==protein; |
---|
841 | } |
---|
842 | } |
---|
843 | |
---|
844 | Codon_Group& Codon_Group::operator+=(const Codon_Group& other) { |
---|
845 | for (int i=0; i<AWT_MAX_CODONS; i++) { |
---|
846 | codon[i] = codon[i] || other.codon[i]; |
---|
847 | } |
---|
848 | return *this; |
---|
849 | } |
---|
850 | |
---|
// returns 1 if 'i' lies in the range [0..3], 0 otherwise
inline int legal_dna_no(int i) {
    return !(i < 0 || i > 3);
}
---|
852 | |
---|
853 | inline const char *buildMixedCodon(const char *const con1, const char *const con2) { |
---|
854 | int mismatches = 0; |
---|
855 | int mismatch_index = -1; |
---|
856 | static char buf[4]; |
---|
857 | |
---|
858 | for (int i=0; i<3; i++) { |
---|
859 | if (con1[i]!=con2[i]) { |
---|
860 | mismatches++; |
---|
861 | mismatch_index = i; |
---|
862 | } |
---|
863 | else { |
---|
864 | buf[i] = con1[i]; |
---|
865 | } |
---|
866 | } |
---|
867 | |
---|
868 | if (mismatches==1) { // exactly one position differs between codons |
---|
869 | pn_assert(mismatch_index!=-1); |
---|
870 | buf[mismatch_index] = iupac::combine(con1[mismatch_index], con2[mismatch_index], GB_AT_DNA); |
---|
871 | buf[3] = 0; |
---|
872 | |
---|
873 | if (memcmp(con1, buf, 3) == 0 || |
---|
874 | memcmp(con2, buf, 3) == 0) |
---|
875 | { |
---|
876 | return NULp; |
---|
877 | } |
---|
878 | |
---|
879 | #if defined(DUMP_CODON_GROUP_EXPANSION) |
---|
880 | printf(" buildMixedCodon('%c%c%c','%c%c%c') == '%s'\n", |
---|
881 | con1[0], con1[1], con1[2], |
---|
882 | con2[0], con2[1], con2[2], |
---|
883 | buf); |
---|
884 | #endif |
---|
885 | |
---|
886 | return buf; |
---|
887 | } |
---|
888 | return NULp; |
---|
889 | } |
---|
890 | |
---|
891 | static int expandMore(const char *bufferStart, int no_of_condons, char*&to_buffer) { |
---|
892 | int i, j; |
---|
893 | const char *con1, *con2; |
---|
894 | int added = 0; |
---|
895 | |
---|
896 | for (i=0; i<no_of_condons; i++) { |
---|
897 | con1 = bufferStart+3*i; |
---|
898 | |
---|
899 | for (j=i+1; j<no_of_condons; j++) { |
---|
900 | con2 = bufferStart+3*j; |
---|
901 | const char *result = buildMixedCodon(con1, con2); |
---|
902 | if (result) { |
---|
903 | to_buffer[0] = 0; |
---|
904 | // do we already have this codon? |
---|
905 | const char *found; |
---|
906 | const char *startSearch = bufferStart; |
---|
907 | for (;;) { |
---|
908 | found = strstr(startSearch, result); |
---|
909 | if (!found) break; |
---|
910 | int pos = (found-bufferStart); |
---|
911 | if ((pos%3)==0) break; // yes already here! |
---|
912 | startSearch = found+1; // was misaligned -> try behind |
---|
913 | } |
---|
914 | |
---|
915 | if (!found) { |
---|
916 | memmove(to_buffer, result, 3); to_buffer+=3; |
---|
917 | added++; |
---|
918 | } |
---|
919 | } |
---|
920 | } |
---|
921 | } |
---|
922 | return no_of_condons+added; |
---|
923 | } |
---|
924 | |
---|
925 | int Codon_Group::expand(char *to_buffer) const { |
---|
926 | int count = 0; |
---|
927 | int i; |
---|
928 | char *org_to_buffer = to_buffer; |
---|
929 | |
---|
930 | for (i=0; i<AWT_MAX_CODONS; i++) { |
---|
931 | if (codon[i]) { |
---|
932 | build_codon(i, to_buffer); |
---|
933 | to_buffer += 3; |
---|
934 | count++; |
---|
935 | } |
---|
936 | } |
---|
937 | |
---|
938 | #if defined(DUMP_CODON_GROUP_EXPANSION) |
---|
939 | to_buffer[0] = 0; |
---|
940 | printf("codons = '%s'\n", org_to_buffer); |
---|
941 | #endif |
---|
942 | |
---|
943 | for (;;) { |
---|
944 | int new_count = expandMore(org_to_buffer, count, to_buffer); |
---|
945 | if (new_count==count) break; // nothing expanded -> done |
---|
946 | count = new_count; |
---|
947 | #if defined(DUMP_CODON_GROUP_EXPANSION) |
---|
948 | to_buffer[0] = 0; |
---|
949 | printf("codons (expandedMore) = '%s'\n", org_to_buffer); |
---|
950 | #endif |
---|
951 | } |
---|
952 | |
---|
953 | pn_assert(count==(int(to_buffer-org_to_buffer)/3)); |
---|
954 | |
---|
955 | return count; |
---|
956 | } |
---|
957 | |
---|
958 | // -------------------------------------------------------------------------------- |
---|
959 | |
---|
960 | static Codon_Group *get_Codon_Group(char protein, int code_nr) { |
---|
961 | pn_assert(code_nr>=0 && code_nr<AWT_CODON_TABLES); |
---|
962 | protein = toupper(protein); |
---|
963 | pn_assert(isalpha(protein) || protein=='*'); |
---|
964 | pn_assert(codon_tables_initialized); |
---|
965 | |
---|
966 | Codon_Group *cgroup = NULp; |
---|
967 | |
---|
968 | if (protein=='B') { |
---|
969 | cgroup = new Codon_Group('D', code_nr); |
---|
970 | Codon_Group N('N', code_nr); |
---|
971 | *cgroup += N; |
---|
972 | } |
---|
973 | else if (protein=='Z') { |
---|
974 | cgroup = new Codon_Group('E', code_nr); |
---|
975 | Codon_Group Q('Q', code_nr); |
---|
976 | *cgroup += Q; |
---|
977 | } |
---|
978 | else { |
---|
979 | cgroup = new Codon_Group(protein, code_nr); |
---|
980 | } |
---|
981 | |
---|
982 | pn_assert(cgroup); |
---|
983 | |
---|
984 | return cgroup; |
---|
985 | } |
---|
986 | |
---|
987 | #define MAX_CODON_LIST_LENGTH (70*3) |
---|
988 | |
---|
989 | const char *AP_get_codons(char protein, int code_nr) { |
---|
990 | // get a list of all codons ("xyzxyzxyz...") encoding 'protein' in case we use Codon-Code 'code_nr' |
---|
991 | // (includes all completely contained IUPAC-encoded codons at the end of list) |
---|
992 | // |
---|
993 | // Optional start-/stop-codons are not added |
---|
994 | // (i.e. a query for 'M' or '*' may report "incomplete" results) |
---|
995 | |
---|
996 | Codon_Group *cgroup = get_Codon_Group(protein, code_nr); |
---|
997 | |
---|
998 | static char buffer[MAX_CODON_LIST_LENGTH+1]; |
---|
999 | int offset = 3*cgroup->expand(buffer); |
---|
1000 | pn_assert(offset<MAX_CODON_LIST_LENGTH); |
---|
1001 | buffer[offset] = 0; |
---|
1002 | |
---|
1003 | delete cgroup; |
---|
1004 | |
---|
1005 | return buffer; |
---|
1006 | } |
---|
1007 | |
---|
1008 | // -------------------------------------------------------------------------------- |
---|
1009 | |
---|
1010 | #ifdef UNIT_TESTS |
---|
1011 | #ifndef TEST_UNIT_H |
---|
1012 | #include <test_unit.h> |
---|
1013 | #endif |
---|
1014 | |
---|
1015 | static const char *startStopSummary() { |
---|
1016 | // returns string showing summary for start/stop |
---|
1017 | // position = codon_nr |
---|
1018 | // content: |
---|
1019 | // '*' -> translates to stop-codon for at least one code |
---|
1020 | // 'M' -> translates to start-codon for at least one code |
---|
1021 | // '2' -> both (not necessarily same code) |
---|
1022 | // '-' -> does not translate to start or stop for any code |
---|
1023 | |
---|
1024 | static char result[AWT_MAX_CODONS+1]; |
---|
1025 | |
---|
1026 | for (int codon = 0; codon<AWT_MAX_CODONS; ++codon) { |
---|
1027 | char startStop = '-'; |
---|
1028 | for (int code = 0; code<AWT_CODON_TABLES && (startStop != '2'); ++code) { |
---|
1029 | switch (isStartOrStopCodonNr(codon, code)) { |
---|
1030 | case '*': |
---|
1031 | switch (startStop) { |
---|
1032 | case '*': break; |
---|
1033 | case '-': startStop = '*'; break; |
---|
1034 | case 'M': startStop = '2'; break; |
---|
1035 | default: pn_assert(0); break; |
---|
1036 | } |
---|
1037 | break; |
---|
1038 | case 'M': |
---|
1039 | switch (startStop) { |
---|
1040 | case 'M': break; |
---|
1041 | case '-': startStop = 'M'; break; |
---|
1042 | case '*': startStop = '2'; break; |
---|
1043 | default: pn_assert(0); break; |
---|
1044 | } |
---|
1045 | break; |
---|
1046 | |
---|
1047 | case 0: break; |
---|
1048 | default: pn_assert(0); break; |
---|
1049 | } |
---|
1050 | } |
---|
1051 | result[codon] = startStop; |
---|
1052 | } |
---|
1053 | result[AWT_MAX_CODONS] = 0; |
---|
1054 | return result; |
---|
1055 | } |
---|
1056 | static const char *optionality() { |
---|
1057 | // returns string indicating whether start/stop-codon is optional |
---|
1058 | // position = codon_nr |
---|
1059 | // content: |
---|
1060 | // '-' -> only non-optional start/stop |
---|
1061 | // '!' -> only optional start/stop |
---|
1062 | // '?' -> both |
---|
1063 | // ' ' -> never start or stop |
---|
1064 | |
---|
1065 | static char result[AWT_MAX_CODONS+1]; |
---|
1066 | |
---|
1067 | for (int codon = 0; codon<AWT_MAX_CODONS; ++codon) { |
---|
1068 | char optional = ' '; |
---|
1069 | for (int code = 0; code<AWT_CODON_TABLES && (optional != '?'); ++code) { |
---|
1070 | char startStop = isStartOrStopCodonNr(codon, code); |
---|
1071 | if (startStop) { |
---|
1072 | bool is_optional = AWT_codon_def[code].aa[codon] != startStop; |
---|
1073 | |
---|
1074 | switch (optional) { |
---|
1075 | case ' ': optional = is_optional ? '!' : '-'; break; |
---|
1076 | case '-': optional = is_optional ? '?' : '-'; break; |
---|
1077 | case '!': optional = is_optional ? '!' : '?'; break; |
---|
1078 | default: pn_assert(0); break; |
---|
1079 | } |
---|
1080 | } |
---|
1081 | } |
---|
1082 | |
---|
1083 | #if defined(ASSERTION_USED) |
---|
1084 | bool sometimes_optional = optional == '!' || optional == '?'; |
---|
1085 | pn_assert(!sometimes_optional || !definite_translation[codon]); |
---|
1086 | #endif |
---|
1087 | |
---|
1088 | result[codon] = optional; |
---|
1089 | } |
---|
1090 | result[AWT_MAX_CODONS] = 0; |
---|
1091 | |
---|
1092 | return result; |
---|
1093 | } |
---|
1094 | static const char *definite() { |
---|
1095 | static char result[AWT_MAX_CODONS+1]; |
---|
1096 | for (int codon = 0; codon<AWT_MAX_CODONS; ++codon) { |
---|
1097 | result[codon] = definite_translation[codon] ? definite_translation[codon] : ' '; |
---|
1098 | } |
---|
1099 | result[AWT_MAX_CODONS] = 0; |
---|
1100 | return result; |
---|
1101 | } |
---|
1102 | static const char *ambig_count() { |
---|
1103 | static char result[AWT_MAX_CODONS+1]; |
---|
1104 | for (int codon = 0; codon<AWT_MAX_CODONS; ++codon) { |
---|
1105 | const char *amb = ambiguous_codons[codon]; |
---|
1106 | result[codon] = amb ? '0'+strlen(amb) : ' '; |
---|
1107 | } |
---|
1108 | result[AWT_MAX_CODONS] = 0; |
---|
1109 | return result; |
---|
1110 | } |
---|
1111 | |
---|
1112 | #define e2a(c) TTIT_embl2arb(c) |
---|
1113 | |
---|
1114 | void TEST_codon_check() { |
---|
1115 | AP_initialize_codon_tables(); |
---|
1116 | |
---|
1117 | // 0000000000111111111122222222223333333333444444444455555555556666 codon number (0-63) |
---|
1118 | // 0123456789012345678901234567890123456789012345678901234567890123 |
---|
1119 | // |
---|
1120 | // "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG" base1 |
---|
1121 | // "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG" base2 |
---|
1122 | // "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG" base3 |
---|
1123 | TEST_EXPECT_EQUAL(startStopSummary(), "--2M--*---**--*----M------------MMMM----------**---M------------"); |
---|
1124 | TEST_EXPECT_EQUAL(optionality (), " ?! - ?? ? ! !!?- -- ! "); |
---|
1125 | TEST_EXPECT_EQUAL(definite (), "FF SS SYY CC W PPPPHHQQRRRR MTTTTNN KSS VVV AAAADDEEGGGG"); // optional start/stop codons shall never be definite |
---|
1126 | TEST_EXPECT_EQUAL(ambig_count (), " 32 2 45 4 2225 222 2 45 2 "); // number of proteins in ambiguous_codons |
---|
1127 | |
---|
1128 | TEST_EXPECT_EQUAL(getAminoAcidAbbr('*'), "End"); |
---|
1129 | TEST_EXPECT_EQUAL(getAminoAcidAbbr('C'), "Cys"); |
---|
1130 | TEST_EXPECT_EQUAL(getAminoAcidAbbr('B'), "Asx"); |
---|
1131 | TEST_EXPECT_EQUAL(getAminoAcidAbbr('b'), "Asx"); |
---|
1132 | TEST_EXPECT_EQUAL(getAminoAcidAbbr('J'), "Xle"); |
---|
1133 | TEST_EXPECT_EQUAL(getAminoAcidAbbr('O'), NULp); |
---|
1134 | TEST_EXPECT_EQUAL(getAminoAcidAbbr('X'), "Xaa"); |
---|
1135 | TEST_EXPECT_EQUAL(getAminoAcidAbbr('x'), "Xaa"); |
---|
1136 | TEST_EXPECT_EQUAL(getAminoAcidAbbr('-'), NULp); |
---|
1137 | TEST_EXPECT_EQUAL(getAminoAcidAbbr('='), NULp); |
---|
1138 | TEST_EXPECT_EQUAL(getAminoAcidAbbr('7'), NULp); |
---|
1139 | |
---|
1140 | TEST_EXPECT(protMatches('V', 'V')); |
---|
1141 | TEST_EXPECT(protMatches('N', 'B')); |
---|
1142 | TEST_EXPECT(protMatches('E', 'Z')); |
---|
1143 | TEST_EXPECT(!protMatches('N', 'Z')); |
---|
1144 | TEST_EXPECT(!protMatches('V', 'Z')); |
---|
1145 | |
---|
1146 | TEST_EXPECT_EQUAL(AP_get_codons('D', 0), "GATGACGAY"); |
---|
1147 | TEST_EXPECT_EQUAL(AP_get_codons('N', 0), "AATAACAAY"); |
---|
1148 | TEST_EXPECT_EQUAL(AP_get_codons('B', 0), "AAT" "AAC" "GAT" "GAC" "AAY" "RAT" "RAC" "GAY" "RAY"); // 'B' = 'D' or 'N' |
---|
1149 | |
---|
1150 | TEST_EXPECT_EQUAL(AP_get_codons('L', 0), "TTATTGCTTCTCCTACTG" "TTRYTAYTGCTYCTWCTKCTMCTSCTRCTHCTBCTDCTVYTRCTN"); |
---|
1151 | TEST_EXPECT_EQUAL(AP_get_codons('L', 2), "TTATTG" "TTR"); |
---|
1152 | TEST_EXPECT_EQUAL(AP_get_codons('L', 9), "TTATTGCTTCTCCTAT" "TRYTACTYCTWCTMCTH"); |
---|
1153 | TEST_EXPECT_EQUAL(AP_get_codons('L', 13), "TTATTGTAGCTTCTCCTACTG" "TTRYTATWGYTGCTYCTWCTKCTMCTSCTRCTHCTBCTDCTVYTRCTN"); |
---|
1154 | TEST_EXPECT_EQUAL(AP_get_codons('L', 16), "TTGCTTCTCCTAC" "TGYTGCTYCTWCTKCTMCTSCTRCTHCTBCTDCTVCTN"); |
---|
1155 | |
---|
1156 | TEST_EXPECT_EQUAL(AP_get_codons('S', 0), "TCTTCCTCATCGAGTAGC" "TCYTCWTCKTCMTCSTCRAGYTCHTCBTCDTCVTCN"); |
---|
1157 | TEST_EXPECT_EQUAL(AP_get_codons('S', 4), "TCTTCCTCATCGAGTAGCAGAAGG" "TCYTCWTCKTCMTCSTCRAGYAGWAGKAGMAGSAGRTCHTCBTCDTCVAGHAGBAGDAGVTCNAGN"); |
---|
1158 | TEST_EXPECT_EQUAL(AP_get_codons('S', 9), "TCTTCCTCATCGCTGAGTAGC" "TCYTCWTCKTCMTCSTCRAGYTCHTCBTCDTCVTCN"); |
---|
1159 | TEST_EXPECT_EQUAL(AP_get_codons('S', 15), "TCTTCCTCGAGTAGC" "TCYTCKTCSAGYTCB"); |
---|
1160 | |
---|
1161 | // stop-codons: |
---|
1162 | TEST_EXPECT_EQUAL(AP_get_codons('*', e2a( 1)), "TAATAGTGA" "TARTRA"); // the 3 standard stop codons and their IUPAC covers |
---|
1163 | TEST_EXPECT_EQUAL(AP_get_codons('*', e2a( 2)), "TAATAGAGAAGG" "TARAGR"); |
---|
1164 | TEST_EXPECT_EQUAL(AP_get_codons('*', e2a( 3)), "TAATAG" "TAR"); // not TGA |
---|
1165 | TEST_EXPECT_EQUAL(AP_get_codons('*', e2a( 4)), "TAATAG" "TAR"); |
---|
1166 | TEST_EXPECT_EQUAL(AP_get_codons('*', e2a( 5)), "TAATAG" "TAR"); |
---|
1167 | TEST_EXPECT_EQUAL(AP_get_codons('*', e2a( 9)), "TAATAG" "TAR"); |
---|
1168 | TEST_EXPECT_EQUAL(AP_get_codons('*', e2a(10)), "TAATAG" "TAR"); |
---|
1169 | TEST_EXPECT_EQUAL(AP_get_codons('*', e2a(13)), "TAATAG" "TAR"); |
---|
1170 | TEST_EXPECT_EQUAL(AP_get_codons('*', e2a(21)), "TAATAG" "TAR"); |
---|
1171 | TEST_EXPECT_EQUAL(AP_get_codons('*', e2a(15)), "TAATGA" "TRA"); // not TAG |
---|
1172 | TEST_EXPECT_EQUAL(AP_get_codons('*', e2a(16)), "TAATGA" "TRA"); |
---|
1173 | TEST_EXPECT_EQUAL(AP_get_codons('*', e2a( 6)), "TGA"); // not TAA TAG |
---|
1174 | TEST_EXPECT_EQUAL(AP_get_codons('*', e2a(14)), "TAG"); // not TAA TGA |
---|
1175 | TEST_EXPECT_EQUAL(AP_get_codons('*', e2a(22)), "TCATAATGA" "TMATSATRATVA"); |
---|
1176 | TEST_EXPECT_EQUAL(AP_get_codons('*', e2a(23)), "TTATAATAGTGA" "TWATKATARTRATDA"); |
---|
1177 | |
---|
1178 | { |
---|
1179 | // Note: optional start/stop-codons are not added in Codon_Group, |
---|
1180 | // because they would introduce ambiguous mapping. |
---|
1181 | |
---|
1182 | // test optional stop-codons: |
---|
1183 | TEST_EXPECT_EQUAL(AP_get_codons('*', e2a(27)), ""); |
---|
1184 | TEST_EXPECT_EQUAL(AP_get_codons('*', e2a(28)), ""); |
---|
1185 | TEST_EXPECT_EQUAL(AP_get_codons('*', e2a(31)), ""); |
---|
1186 | |
---|
1187 | // test optional start-codons: |
---|
1188 | TEST_EXPECT_EQUAL(AP_get_codons('M', e2a( 1)), "ATG"); // 3 (start-codons listed in table-definition) |
---|
1189 | TEST_EXPECT_EQUAL(AP_get_codons('M', e2a( 2)), "ATAATG" "ATR"); // 5 |
---|
1190 | TEST_EXPECT_EQUAL(AP_get_codons('M', e2a( 3)), "ATAATG" "ATR"); // 2 |
---|
1191 | TEST_EXPECT_EQUAL(AP_get_codons('M', e2a( 4)), "ATG"); // 8 |
---|
1192 | TEST_EXPECT_EQUAL(AP_get_codons('M', e2a( 5)), "ATAATG" "ATR"); // 6 |
---|
1193 | TEST_EXPECT_EQUAL(AP_get_codons('M', e2a( 6)), "ATG"); // 1 |
---|
1194 | TEST_EXPECT_EQUAL(AP_get_codons('M', e2a(11)), "ATG"); // 7 |
---|
1195 | TEST_EXPECT_EQUAL(AP_get_codons('M', e2a(13)), "ATAATG" "ATR"); // 4 |
---|
1196 | TEST_EXPECT_EQUAL(AP_get_codons('M', e2a(24)), "ATG"); // 4 |
---|
1197 | } |
---|
1198 | |
---|
1199 | TEST_EXPECT_EQUAL(AP_get_codons('X', 0), ""); // @@@ wrong: TGR->X (or disallow call) |
---|
1200 | |
---|
1201 | const TransTables allowed; |
---|
1202 | |
---|
1203 | // --------------------------- |
---|
1204 | // test valid codons |
---|
1205 | struct test_is_codon { |
---|
1206 | char protein; |
---|
1207 | const char *codon; |
---|
1208 | const char *tables; |
---|
1209 | }; |
---|
1210 | |
---|
1211 | #define ALL_TABLES "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24" // contains arb table-numbers |
---|
1212 | |
---|
1213 | test_is_codon is_codon[] = { |
---|
1214 | { 'P', "CCC", ALL_TABLES }, |
---|
1215 | { 'P', "CCN", ALL_TABLES }, |
---|
1216 | { 'R', "CGN", ALL_TABLES }, |
---|
1217 | |
---|
1218 | { 'D', "GAY", ALL_TABLES }, |
---|
1219 | { 'N', "AAY", ALL_TABLES }, |
---|
1220 | { 'B', "AAY", ALL_TABLES }, // translates to 'N', but matches B(=D|N) for realigner |
---|
1221 | { 'B', "GAY", ALL_TABLES }, // translates to 'D', but matches B(=D|N) for realigner |
---|
1222 | { 'B', "RAY", ALL_TABLES }, // translates to 'D' or to 'N' (i.e. only matches 'B', see failing test for 'RAY' below) |
---|
1223 | { 'B', "RAT", ALL_TABLES }, |
---|
1224 | |
---|
1225 | { 'Q', "CAR", ALL_TABLES }, |
---|
1226 | { 'E', "GAR", ALL_TABLES }, |
---|
1227 | { 'Z', "SAR", ALL_TABLES }, |
---|
1228 | |
---|
1229 | { 'X', "NNN", ALL_TABLES }, |
---|
1230 | |
---|
1231 | { 'L', "TTR", "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15" ",17,18,19,20,21,22,23,24" }, { 'X', "TTR", "16" }, |
---|
1232 | { 'L', "YTA", "0,1"",3,4,5,6,7,8,9,10,11,12,13,14,15" ",17,18,19,20,21,22,23,24" }, { 'X', "YTA", "2,16" }, // Y=TC |
---|
1233 | { 'L', "CTM", "0,1"",3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24" }, { 'T', "CTM", "2" }, // M=AC |
---|
1234 | { 'L', "CTN", "0,1"",3,4,5,6,7,8"",10,11,12,13,14,15,16,17,18" ",20,21,22,23,24" }, { 'T', "CTN", "2" }, { 'X', "CTN", "9,19" }, |
---|
1235 | { 'L', "CTK", "0,1"",3,4,5,6,7,8"",10,11,12,13,14,15,16,17,18" ",20,21,22,23,24" }, { 'T', "CTK", "2" }, { 'X', "CTK", "9,19" }, // K=TG |
---|
1236 | |
---|
1237 | { 'L', "TWG", "13,15" }, // W=AT |
---|
1238 | { 'J', "TWG", "13,15" }, // translates to 'L', but matches J(=I|L) for realigner |
---|
1239 | { 'X', "TWG", "0,1,2,3,4,5,6,7,8,9,10,11,12" ",14" ",16,17,18,19,20,21,22,23,24" }, // all but 'L<->TWG' |
---|
1240 | |
---|
1241 | { 'S', "AGY", ALL_TABLES }, |
---|
1242 | { 'S', "TCY", ALL_TABLES }, |
---|
1243 | { 'S', "TCN", "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,16,17,18,19,20,21,22,23,24" }, // all but 15 (where 'TCA->*') |
---|
1244 | { 'S', "AGN", "4,6,11,14" }, |
---|
1245 | { 'S', "AGR", "4,6,11,14" }, |
---|
1246 | |
---|
1247 | { '*', "AGR", "1" }, // R=AG |
---|
1248 | { 'G', "AGR", "10" }, |
---|
1249 | { 'X', "AGR", "17" }, |
---|
1250 | { 'R', "AGR", "0,2,3,5,7,8,9,12,13,15,16,18,19,20,21,22,23,24" }, |
---|
1251 | |
---|
1252 | { 'G', "AGA", "10" }, |
---|
1253 | { 'S', "AGA", "4,6,11,14,17" }, |
---|
1254 | { 'R', "AGA", "0,2,3,5,7,8,9,12,13,15,16,18,19,20,21,22,23,24" }, |
---|
1255 | { '*', "AGA", "1" }, |
---|
1256 | |
---|
1257 | { 'K', "AGG", "17" }, |
---|
1258 | |
---|
1259 | { 'W', "TGR", "1,2,3,4,6,10,11,14,17,20,21,24" }, |
---|
1260 | { 'X', "TGR", "0,5,7,8,9,12,13,15,16,18,19,22,23" }, // all but 'W<->TGR' (e.g. code==0: TGA->* & TGG->W => TGR->X) |
---|
1261 | |
---|
1262 | { 'C', "TGW", "7" }, // W = AT |
---|
1263 | { 'X', "TGW", "0,1,2,3,4,5,6,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24" }, // all but 'C<->TGW' |
---|
1264 | |
---|
1265 | { 'C', "TGT", ALL_TABLES }, |
---|
1266 | |
---|
1267 | { 'C', "TGA", "7" }, |
---|
1268 | { 'G', "TGA", "18" }, |
---|
1269 | { 'W', "TGA", "1,2,3,4,6,10,11,14,17,20,21,24" }, |
---|
1270 | { '*', "TGA", "0,5,8,9,12,13,15,16,19,20,21,22,23" }, // standard stop codons |
---|
1271 | { '*', "TAA", "0,1,2,3,4,6,7,8,9,10,12,13,14,15,16,17,18,19,21,24" }, |
---|
1272 | { '*', "TAG", "0,1,2,3,4,6,7,8,9,10,11,14,16,17,18,19,21,24" }, |
---|
1273 | |
---|
1274 | { '*', "TRA", "0,8,9,12,13,15,16,19,21" }, // R=AG |
---|
1275 | { 'X', "TRA", "1,2,3,4,5,6,7,10,11,14,17,18,20,22,23,24" }, // all but '*<->TRA' |
---|
1276 | |
---|
1277 | { '*', "TAR", "0,1,2,3,4,6,7,8,9,10,14,16,17,18,19,21,24" }, |
---|
1278 | { 'Y', "TAR", "22" }, |
---|
1279 | { 'E', "TAR", "23,24" }, |
---|
1280 | { 'Q', "TAR", "5,20,21" }, |
---|
1281 | { 'Z', "TAR", "5,20,21,23,24" }, // Z=EQ (TAR never translates to 'E', only 'Q') |
---|
1282 | { 'X', "TAR", "11,12,13,15" }, |
---|
1283 | |
---|
1284 | { 'B', "AAW", "6,11,14" }, // W=AT |
---|
1285 | { 'N', "AAW", "6,11,14" }, |
---|
1286 | { 'X', "AAW", "0,1,2,3,4,5,7,8,9,10,12,13,15,16,17,18,19,20,21,22,23,24" }, // all but 'B<->AAW' & 'N<->AAW' |
---|
1287 | |
---|
1288 | { 'T', "CTG", "2" }, |
---|
1289 | { 'S', "CTG", "9" }, |
---|
1290 | { 'A', "CTG", "19" }, |
---|
1291 | { 'L', "CTG", "0,1,3,4,5,6,7,8,10,11,12,13,14,15,16,17,18,20,21,22,23,24" }, // all but 'T<->CTG' & 'S<->CTG' |
---|
1292 | { 'J', "CTG", "0,1,3,4,5,6,7,8,10,11,12,13,14,15,16,17,18,20,21,22,23,24" }, // same as for 'L' |
---|
1293 | { 'M', "CTG", "0,3,8,9,17,19" }, // optional start-codon |
---|
1294 | |
---|
1295 | { 'T', "CTR", "2" }, |
---|
1296 | { 'X', "CTR", "9,19" }, |
---|
1297 | { 'L', "CTR", "0,1,3,4,5,6,7,8,10,11,12,13,14,15,16,17,18,20,21,22,23,24" }, // all but 'T<->CTR' & 'X<->CTR' |
---|
1298 | |
---|
1299 | { 'E', "KAR", "23,24" }, |
---|
1300 | // Q <->KAR fails (see below) |
---|
1301 | { 'Z', "KAR", "5,20,21,23,24" }, // Z=E|Q |
---|
1302 | { 'X', "KAR", "0,1,2,3,4,6,7,8,9,10,11,12,13,14,15,16,17,18,19,22" }, |
---|
1303 | |
---|
1304 | { 'G', "KGA", "18" }, |
---|
1305 | { 'X', "KGA", "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,19,20,21,22,23,24" }, // all but G<->KGA |
---|
1306 | |
---|
1307 | { 'E', "TAG", "23,24" }, |
---|
1308 | { 'Q', "TAG", "5,12,20,21" }, |
---|
1309 | { 'L', "TAG", "13,15" }, |
---|
1310 | { 'Y', "TAG", "22" }, |
---|
1311 | { 'J', "TAG", "13,15" }, // J=I|L |
---|
1312 | { 'Z', "TAG", "5,12,20,21,23,24" }, // Z=E|Q |
---|
1313 | { '*', "TAG", "0,1,2,3,4,6,7,8,9,10,11,14,16,17,18,19,21,24" }, |
---|
1314 | |
---|
1315 | { 'J', "WTA", "0,3,5,6,7,8,9,11,12,13,15,17,18,19,20,21,22,23,24" }, |
---|
1316 | |
---|
1317 | { 'X', "A-C", ALL_TABLES }, |
---|
1318 | { 'X', ".T.", ALL_TABLES }, |
---|
1319 | |
---|
1320 | // tests to protect buffer overflows in dna |
---|
1321 | { 'X', "CG", ALL_TABLES }, |
---|
1322 | { 'X', "T", ALL_TABLES }, |
---|
1323 | |
---|
1324 | // 0000000000111111111122222222223333333333444444444455555555556666 codon number (0-63) |
---|
1325 | // 0123456789012345678901234567890123456789012345678901234567890123 |
---|
1326 | // |
---|
1327 | // "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG" base1 |
---|
1328 | // "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG" base2 |
---|
1329 | // "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG" base3 |
---|
1330 | // "--2M--*---**--*----M------------MMMM----------**---M------------" (= startStopSummary) |
---|
1331 | // " ?! - ?? ? ! !!?- -- ! " (= optionality: !=all start/stop optional; -=no start/stop optional, ?=mixed) |
---|
1332 | |
---|
1333 | // test all start codons: |
---|
1334 | { 'M', "TTA", "3" }, // start AND stop -> see ../ALILINK/TranslateRealign.cxx@TTA_AMBIGUITY |
---|
1335 | { 'M', "TTG", "0,3,4,8,10,17,18" }, |
---|
1336 | { 'L', "TTG", ALL_TABLES }, |
---|
1337 | // M <->CTG already tested above |
---|
1338 | { 'M', "ATT", "1,3,4,8,16" }, |
---|
1339 | { 'M', "ATC", "1,3,4,8" }, |
---|
1340 | { 'M', "ATA", "1,2,3,4,8,10,14" }, |
---|
1341 | { 'I', "ATA", "0,3,5,6,7,8,9,11,12,13,15,16,17,18,19,20,21,22,23,24" }, // optional for 3, 8 |
---|
1342 | { 'M', "ATG", ALL_TABLES }, // no optional start |
---|
1343 | { 'M', "ATR", "1,2,3,4,8,10,14" }, // R = AG (code=3 -> ATA->IM ATG->M) |
---|
1344 | { 'M', "ATM", "1,3,4,8" }, // M = AC |
---|
1345 | { 'M', "ATS", "1,3,4,8" }, // S = CG |
---|
1346 | { 'M', "ATY", "1,3,4,8" }, // Y = TC |
---|
1347 | { 'M', "ATK", "1,3,4,8,16" }, // K = TG |
---|
1348 | { 'M', "ATW", "1,3,4,8" }, // W = AT |
---|
1349 | { 'M', "ATV", "1,3,4,8" }, // V = ACG |
---|
1350 | { 'M', "ATB", "1,3,4,8" }, // B = TCG |
---|
1351 | { 'M', "ATD", "1,3,4,8" }, // D = ATG |
---|
1352 | |
---|
1353 | { 'M', "ATH", "1,3,4,8" }, // H = ACT |
---|
1354 | { 'I', "ATH", "0,3,5,6,7,8,9,11,12,13,15,16,17,18,19,20,21,22,23,24" }, |
---|
1355 | { 'X', "ATH", "2,10,14" }, |
---|
1356 | |
---|
1357 | { 'M', "ATN", "1,3,4,8" }, // H = ATCG |
---|
1358 | { 'M', "GTG", "1,3,4,6,8,10,14,16,17,18" }, |
---|
1359 | |
---|
1360 | // test all stop codons: |
---|
1361 | { '*', "AGA", "1" }, // (DUPTEST) |
---|
1362 | { '*', "AGG", "1" }, |
---|
1363 | { '*', "TAA", "0,1,2,3,4,6,7,8,9,10,12,13,14,15,16,17,18,19,21,24" },//(DUPTEST) |
---|
1364 | { '*', "TAG", "0,1,2,3,4,6,7,8,9,10,11,14,16,17,18,19,21,24" }, // (DUPTEST) |
---|
1365 | { '*', "TCA", "15" }, |
---|
1366 | { '*', "TGA", "0,5,8,9,12,13,15,16,19,20,21,22,23" }, // (DUPTEST) |
---|
1367 | { '*', "TTA", "16" }, |
---|
1368 | |
---|
1369 | { '*', "TWA", "16" }, // W = AT |
---|
1370 | { '*', "TMA", "15" }, // M = AC |
---|
1371 | { '*', "TAR", "0,1,2,3,4,6,7,8,9,10,14,16,17,18,19,21,24" }, // R = AG (DUPTEST) |
---|
1372 | { '*', "TRA", "0,8,9,12,13,15,16,19,21" }, // R = AG (DUPTEST) |
---|
1373 | { '*', "AGR", "1" }, // R = AG (DUPTEST) |
---|
1374 | |
---|
1375 | { 0, NULp, NULp} |
---|
1376 | }; |
---|
1377 | |
---|
1378 | for (int c = 0; is_codon[c].protein; ++c) { |
---|
1379 | const test_is_codon& C = is_codon[c]; |
---|
1380 | TEST_ANNOTATE(GBS_global_string("%c <- %s", C.protein, C.codon)); |
---|
1381 | |
---|
1382 | TransTables remaining; |
---|
1383 | const char *failure; |
---|
1384 | bool isCodon = AWT_is_codon(C.protein, C.codon, allowed, remaining, &failure); |
---|
1385 | |
---|
1386 | TEST_EXPECT_NULL(failure); |
---|
1387 | TEST_EXPECT(isCodon); |
---|
1388 | TEST_EXPECT_EQUAL(remaining.to_string(TTIT_ARB), C.tables); |
---|
1389 | } |
---|
1390 | |
---|
1391 | // ----------------------------- |
---|
1392 | // test invalid codons |
---|
1393 | struct test_not_codon { |
---|
1394 | char protein; |
---|
1395 | const char *codon; |
---|
1396 | const char *error; |
---|
1397 | }; |
---|
1398 | test_not_codon not_codon[] = { |
---|
1399 | { 'P', "SYK", "Not all IUPAC-combinations of 'SYK' translate to 'P'" }, // correct (possible translations are PAL) |
---|
1400 | { 'F', "SYK", "'SYK' never translates to 'F'" }, // correct failure |
---|
1401 | { 'P', "NNN", "Not all IUPAC-combinations of 'NNN' translate to 'P'" }, // correct failure |
---|
1402 | { 'D', "RAY", "Not all IUPAC-combinations of 'RAY' translate to 'D'" }, // correct failure |
---|
1403 | { 'E', "SAR", "Not all IUPAC-combinations of 'SAR' translate to 'E'" }, // correct failure |
---|
1404 | { 'Q', "KAR", "Not all IUPAC-combinations of 'KAR' translate to 'Q'" }, // correct failure |
---|
1405 | |
---|
1406 | { 'S', "CYT", "'CYT' never translates to 'S'" }, // correct failure |
---|
1407 | |
---|
1408 | { 'O', "RAY", "'O' is no valid amino acid" }, |
---|
1409 | { 'U', "AAA", "'U' is no valid amino acid" }, |
---|
1410 | |
---|
1411 | { 'L', "A-C", "Not enough nucleotides (got 'A-C')" }, // correct failure |
---|
1412 | { 'V', ".T.", "Not enough nucleotides (got '.T.')" }, // correct failure |
---|
1413 | { 'L', "...", "No nucleotides left" }, |
---|
1414 | { 'J', "...", "No nucleotides left" }, |
---|
1415 | |
---|
1416 | { 'I', "ATR", "Not all IUPAC-combinations of 'ATR' translate to 'I'" }, // R = AG // ok: 'ATG' translates to 'M', not to 'I' |
---|
1417 | |
---|
1418 | { '*', "TYA", "Not all IUPAC-combinations of 'TYA' translate to '*'" }, // Y = TC; TCA(code=15) TTA(code=16) -> no code for both |
---|
1419 | { '*', "TRR", "Not all IUPAC-combinations of 'TRR' translate to '*'" }, // R = AG (TGG does never translate to '*') |
---|
1420 | { '*', "WGA", "Not all IUPAC-combinations of 'WGA' translate to '*'" }, // W = AT; AGA(1) TGA(other) -> no common codes |
---|
1421 | { '*', "THA", "Not all IUPAC-combinations of 'THA' translate to '*'" }, // H = ACT; TAA(many) TCA(15) TTA(16) -> no code overlap between TCA and TTA |
---|
1422 | |
---|
1423 | { 'X', "...", "No nucleotides left" }, |
---|
1424 | { 'X', "..", "No nucleotides left" }, |
---|
1425 | { 'X', "-", "No nucleotides left" }, |
---|
1426 | { 'X', "", "No nucleotides left" }, |
---|
1427 | |
---|
1428 | // test invalid chars |
---|
1429 | { 'X', "AZA", "Invalid character 'Z' in DNA" }, |
---|
1430 | { 'X', "A@A", "Invalid character '@' in DNA" }, |
---|
1431 | { 'L', "AZA", "Invalid character 'Z' in DNA" }, |
---|
1432 | |
---|
1433 | // tests to protect buffer overflows in dna |
---|
1434 | |
---|
1435 | { 'A', "--", "No nucleotides left" }, |
---|
1436 | { 'L', ".", "No nucleotides left" }, |
---|
1437 | { 'J', ".", "No nucleotides left" }, |
---|
1438 | { 'L', "AT", "Not enough nucleotides (got 'AT')" }, |
---|
1439 | { 'L', "C", "Not enough nucleotides (got 'C')" }, |
---|
1440 | { 'L', "", "No nucleotides left" }, |
---|
1441 | |
---|
1442 | { 0, NULp, NULp} |
---|
1443 | }; |
---|
1444 | for (int c = 0; not_codon[c].protein; ++c) { |
---|
1445 | const test_not_codon& C = not_codon[c]; |
---|
1446 | TEST_ANNOTATE(GBS_global_string("%c <- %s", C.protein, C.codon)); |
---|
1447 | |
---|
1448 | TransTables remaining; |
---|
1449 | const char *failure; |
---|
1450 | bool isCodon = AWT_is_codon(C.protein, C.codon, allowed, remaining, &failure); |
---|
1451 | |
---|
1452 | if (isCodon) { // the test-case makes no sense in 'not_codon' |
---|
1453 | TEST_EXPECT_EQUAL(remaining.to_string(TTIT_ARB), ""); // -> move the failing test-case up into 'is_codon'-section |
---|
1454 | } |
---|
1455 | else { |
---|
1456 | TEST_EXPECT_EQUAL(failure, C.error); |
---|
1457 | } |
---|
1458 | TEST_EXPECT(!isCodon); |
---|
1459 | } |
---|
1460 | |
---|
1461 | // ---------------------------------- |
---|
1462 | // test uncombinable codons |
---|
1463 | struct test_uncombinable_codons { // test-case: 1st translation succeeds, 2nd fails when restricted to the tables left over by the 1st
---|
1464 | char protein1; // amino acid for the 1st AWT_is_codon call; a value of 0 terminates the test table
---|
1465 | const char *codon1; // DNA codon for the 1st AWT_is_codon call
---|
1466 | const char *tables; // expected result of remaining1.to_string(TTIT_ARB) after the 1st call
---|
1467 | char protein2; // amino acid for the 2nd AWT_is_codon call (which only allows the tables remaining after the 1st call)
---|
1468 | const char *codon2; // DNA codon for the 2nd AWT_is_codon call
---|
1469 | const char *error; // expected failure message of the 2nd call (NOTE(review): table numbers in messages seem to use another numbering than 'tables' - confirm)
---|
1470 | };
---|
1471 | test_uncombinable_codons uncomb_codons[] = { |
---|
1472 | { '*', "TTA", "16", 'E', "SAR", "Not all IUPAC-combinations of 'SAR' translate to 'E' (for trans-table 23)" }, |
---|
1473 | { '*', "TTA", "16", 'X', "TRA", "'TRA' never translates to 'X' (for trans-table 23)" }, |
---|
1474 | { 'L', "TAG", "13,15", 'X', "TRA", "'TRA' never translates to 'X' (for any of the leftover trans-tables: 16,22)" }, |
---|
1475 | { 'L', "TAG", "13,15", 'Q', "TAR", "'TAR' never translates to 'Q' (for any of the leftover trans-tables: 16,22)" }, |
---|
1476 | { '*', "TTA", "16", '*', "TCA", "'TCA' does not translate to '*' (for trans-table 23)" }, |
---|
1477 | { 'N', "AAA", "6,11,14", 'X', "AAW", "'AAW' never translates to 'X' (for any of the leftover trans-tables: 9,14,21)" }, |
---|
1478 | { 'N', "AAA", "6,11,14", 'K', "AAA", "'AAA' does not translate to 'K' (for any of the leftover trans-tables: 9,14,21)" }, |
---|
1479 | |
---|
1480 | { 0, NULp, NULp, 0, NULp, NULp} |
---|
1481 | }; |
---|
1482 | |
---|
1483 | for (int c = 0; uncomb_codons[c].protein1; ++c) { |
---|
1484 | const test_uncombinable_codons& C = uncomb_codons[c]; |
---|
1485 | TEST_ANNOTATE(GBS_global_string("%c <- %s + %c <- %s", C.protein1, C.codon1, C.protein2, C.codon2)); |
---|
1486 | |
---|
1487 | TransTables remaining1; |
---|
1488 | const char *failure; |
---|
1489 | bool isCodon = AWT_is_codon(C.protein1, C.codon1, allowed, remaining1, &failure); |
---|
1490 | |
---|
1491 | TEST_EXPECT(isCodon); |
---|
1492 | TEST_EXPECT_EQUAL(remaining1.to_string(TTIT_ARB), C.tables); |
---|
1493 | |
---|
1494 | // @@@ add separate test: show protein2/codon2 return true from AWT_is_codon if not called with remaining1 |
---|
1495 | |
---|
1496 | TransTables remaining2; |
---|
1497 | isCodon = AWT_is_codon(C.protein2, C.codon2, remaining1, remaining2, &failure); |
---|
1498 | TEST_EXPECT_EQUAL(failure, C.error); |
---|
1499 | TEST_REJECT(isCodon); |
---|
1500 | |
---|
1501 | } |
---|
1502 | } |
---|
1503 | |
---|
1504 | #endif // UNIT_TESTS |
---|
1505 | |
---|
1506 | // -------------------------------------------------------------------------------- |
---|