| 1 | // =============================================================== // |
|---|
| 2 | // // |
|---|
| 3 | // File : AP_codon_table.cxx // |
|---|
| 4 | // Purpose : codon definitions for DNA -> AA translation // |
|---|
| 5 | // // |
|---|
| 6 | // Coded by Ralf Westram (coder@reallysoft.de) in January 2010 // |
|---|
| 7 | // Institute of Microbiology (Technical University Munich) // |
|---|
| 8 | // http://www.arb-home.de/ // |
|---|
| 9 | // // |
|---|
| 10 | // =============================================================== // |
|---|
| 11 | |
|---|
| 12 | #include "AP_codon_table.hxx" |
|---|
| 13 | #include "AP_pro_a_nucs.hxx" |
|---|
| 14 | #include "iupac.h" |
|---|
| 15 | |
|---|
| 16 | #include <arb_global_defs.h> |
|---|
| 17 | #include <arb_str.h> |
|---|
| 18 | |
|---|
| 19 | #include <cctype> |
|---|
| 20 | |
|---|
| 21 | #define pn_assert(cond) arb_assert(cond) |
|---|
| 22 | |
|---|
| 23 | #define EMBL_BACTERIAL_TABLE_INDEX 11 |
|---|
| 24 | #define AWT_CODON_TABLE_MAX_NAME_LENGTH 57 // increasing this limit forces GUI re-layout (look4: AWT_get_codon_code_name) |
|---|
| 25 | |
|---|
| 26 | #define VALID_PROTEIN "ABCDEFGHIJKLMNPQRSTVWXYZ*" // all possible translations |
|---|
| 27 | #define VALID_PROTEIN_NO_X "ABCDEFGHIJKLMNPQRSTVWYZ*" // same as VALID_PROTEIN w/o 'X' |
|---|
| 28 | |
|---|
| 29 | // ---------------------------------------------------------------------------------------------------- |
|---|
| 30 | // |
|---|
| 31 | // Info about translation codes was taken from |
|---|
| 32 | // http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi |
|---|
| 33 | // and |
|---|
| 34 | // https://en.wikipedia.org/wiki/List_of_genetic_codes |
|---|
| 35 | // |
|---|
| 36 | // Whenever adding new or correcting existing code tables, please |
|---|
| 37 | // - check data on NCBI webpage mentioned above |
|---|
| 38 | // - document last update in ../../HELP_SOURCE/oldhelp/transl_table.hlp@LAST_UPDATE_FROM_WEBPAGE |
|---|
| 39 | // |
|---|
| 40 | // ---------------------------------------------------------------------------------------------------- |
|---|
| 41 | |
|---|
| 42 | static AWT_Codon_Code_Definition AWT_codon_def[AWT_CODON_TABLES+1] = |
|---|
| 43 | { |
|---|
| 44 | // 0000000000111111111122222222223333333333444444444455555555556666 codon number (0-63) |
|---|
| 45 | // 0123456789012345678901234567890123456789012345678901234567890123 |
|---|
| 46 | // |
|---|
| 47 | // "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG", base1 |
|---|
| 48 | // "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG", base2 |
|---|
| 49 | // "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG" base3 |
|---|
| 50 | { |
|---|
| 51 | " (1) Standard code", |
|---|
| 52 | "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", // The first code in this table has to be 'Standard code'! |
|---|
| 53 | "---M------**--*----M---------------M----------------------------", |
|---|
| 54 | 1 // arb:0 |
|---|
| 55 | }, |
|---|
| 56 | { |
|---|
| 57 | " (2) Vertebrate mitochondrial code", |
|---|
| 58 | "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG", |
|---|
| 59 | "----------**--------------------MMMM----------**---M------------", |
|---|
| 60 | 2 // arb:1 |
|---|
| 61 | }, |
|---|
| 62 | { |
|---|
| 63 | " (3) Yeast mitochondrial code", |
|---|
| 64 | "FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG", |
|---|
| 65 | "----------**----------------------MM----------------------------", |
|---|
| 66 | 3 // arb:2 |
|---|
| 67 | }, |
|---|
| 68 | // " (X) 6789012345678901234567890123456789012345678901234567", // max.name length (57) |
|---|
| 69 | { |
|---|
| 70 | " (4) Coelenterate Mitochondrial + Mycoplasma/Spiroplasma", |
|---|
| 71 | "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", |
|---|
| 72 | "--MM------**-------M------------MMMM---------------M------------", |
|---|
| 73 | 4 // arb:3 |
|---|
| 74 | }, |
|---|
| 75 | { |
|---|
| 76 | " (5) Invertebrate mitochondrial code", |
|---|
| 77 | "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG", |
|---|
| 78 | "---M------**--------------------MMMM---------------M------------", |
|---|
| 79 | 5 // arb:4 |
|---|
| 80 | }, |
|---|
| 81 | { |
|---|
| 82 | " (6) Ciliate, Dasycladacean and Hexamita nuclear code", |
|---|
| 83 | "FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", |
|---|
| 84 | "--------------*--------------------M----------------------------", |
|---|
| 85 | 6 // arb:5 |
|---|
| 86 | }, |
|---|
| 87 | { |
|---|
| 88 | " (9) Echinoderm and Flatworm mitochondrial code", |
|---|
| 89 | "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG", |
|---|
| 90 | "----------**-----------------------M---------------M------------", |
|---|
| 91 | 9 // arb:6 |
|---|
| 92 | }, |
|---|
| 93 | { |
|---|
| 94 | "(10) Euplotid nuclear code", |
|---|
| 95 | "FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", |
|---|
| 96 | "----------**-----------------------M----------------------------", |
|---|
| 97 | 10 // arb:7 |
|---|
| 98 | }, |
|---|
| 99 | // 0000000001111111111222222222233333333334444444444555555555566666 |
|---|
| 100 | // 1234567890123456789012345678901234567890123456789012345678901234 |
|---|
| 101 | |
|---|
| 102 | // "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG", base1 |
|---|
| 103 | // "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG", base2 |
|---|
| 104 | // "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG" base3 |
|---|
| 105 | { |
|---|
| 106 | "(11) Bacterial and Plant Plastid code", |
|---|
| 107 | "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", |
|---|
| 108 | "---M------**--*----M------------MMMM---------------M------------", |
|---|
| 109 | 11 // arb:8 |
|---|
| 110 | }, |
|---|
| 111 | { |
|---|
| 112 | "(12) Alternative Yeast nuclear code", |
|---|
| 113 | "FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", |
|---|
| 114 | "----------**--*----M---------------M----------------------------", |
|---|
| 115 | 12 // arb:9 |
|---|
| 116 | }, |
|---|
| 117 | { |
|---|
| 118 | "(13) Ascidian mitochondrial code", |
|---|
| 119 | "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG", |
|---|
| 120 | "---M------**----------------------MM---------------M------------", |
|---|
| 121 | 13 // arb:10 |
|---|
| 122 | }, |
|---|
| 123 | { |
|---|
| 124 | "(14) Alternative Flatworm mitochondrial code", |
|---|
| 125 | "FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG", |
|---|
| 126 | "-----------*-----------------------M----------------------------", |
|---|
| 127 | 14 // arb:11 |
|---|
| 128 | }, |
|---|
| 129 | { |
|---|
| 130 | "(15) Blepharisma nuclear code (deleted?)", // why is it no longer listed at NCBI? |
|---|
| 131 | "FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", |
|---|
| 132 | "----------*---*--------------------M----------------------------", // converted to new format manually (no source) |
|---|
| 133 | 15 // arb:12 |
|---|
| 134 | }, |
|---|
| 135 | { |
|---|
| 136 | "(16) Chlorophycean mitochondrial code", |
|---|
| 137 | "FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", |
|---|
| 138 | "----------*---*--------------------M----------------------------", |
|---|
| 139 | 16 // arb:13 |
|---|
| 140 | }, |
|---|
| 141 | { |
|---|
| 142 | "(21) Trematode mitochondrial code", |
|---|
| 143 | "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG", |
|---|
| 144 | "----------**-----------------------M---------------M------------", |
|---|
| 145 | 21 // arb:14 |
|---|
| 146 | }, |
|---|
| 147 | { |
|---|
| 148 | "(22) Scenedesmus obliquus mitochondrial code", |
|---|
| 149 | "FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", |
|---|
| 150 | "------*---*---*--------------------M----------------------------", |
|---|
| 151 | 22 // arb:15 |
|---|
| 152 | }, |
|---|
| 153 | { |
|---|
| 154 | "(23) Thraustochytrium mitochondrial code", |
|---|
| 155 | "FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", |
|---|
| 156 | "--*-------**--*-----------------M--M---------------M------------", |
|---|
| 157 | 23 // arb:16 |
|---|
| 158 | }, |
|---|
| 159 | { |
|---|
| 160 | "(24) Pterobranchia Mitochondrial Code", |
|---|
| 161 | "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSSKVVVVAAAADDEEGGGG", |
|---|
| 162 | "---M------**-------M---------------M---------------M------------", |
|---|
| 163 | 24 // arb:17 |
|---|
| 164 | }, |
|---|
| 165 | { |
|---|
| 166 | "(25) Candidate Division SR1 and Gracilibacteria Code", |
|---|
| 167 | "FFLLSSSSYY**CCGWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", |
|---|
| 168 | "---M------**-----------------------M---------------M------------", |
|---|
| 169 | 25 // arb:18 |
|---|
| 170 | }, |
|---|
| 171 | { |
|---|
| 172 | "(26) Pachysolen tannophilus Nuclear Code", |
|---|
| 173 | "FFLLSSSSYY**CC*WLLLAPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", |
|---|
| 174 | "----------**--*----M---------------M----------------------------", |
|---|
| 175 | 26 // arb:19 |
|---|
| 176 | }, |
|---|
| 177 | { |
|---|
| 178 | "(27) Karyorelict Nuclear", |
|---|
| 179 | "FFLLSSSSYYQQCCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", |
|---|
| 180 | "--------------*--------------------M----------------------------", |
|---|
| 181 | 27 // arb:20 |
|---|
| 182 | }, |
|---|
| 183 | { |
|---|
| 184 | "(28) Condylostoma Nuclear", |
|---|
| 185 | "FFLLSSSSYYQQCCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", |
|---|
| 186 | "----------**--*--------------------M----------------------------", |
|---|
| 187 | 28 // arb:21 |
|---|
| 188 | }, |
|---|
| 189 | { |
|---|
| 190 | "(29) Mesodinium Nuclear", |
|---|
| 191 | "FFLLSSSSYYYYCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", |
|---|
| 192 | "--------------*--------------------M----------------------------", |
|---|
| 193 | 29 // arb:22 |
|---|
| 194 | }, |
|---|
| 195 | { |
|---|
| 196 | "(30) Peritrich Nuclear", |
|---|
| 197 | "FFLLSSSSYYEECC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", |
|---|
| 198 | "--------------*--------------------M----------------------------", |
|---|
| 199 | 30 // arb:23 |
|---|
| 200 | }, |
|---|
| 201 | { |
|---|
| 202 | "(31) Blastocrithidia Nuclear", |
|---|
| 203 | "FFLLSSSSYYEECCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", |
|---|
| 204 | "----------**-----------------------M----------------------------", |
|---|
| 205 | 31 // arb:24 |
|---|
| 206 | }, |
|---|
| 207 | |
|---|
| 208 | { NULp, NULp, NULp, 0 } // end of table-marker |
|---|
| 209 | }; |
|---|
| 210 | |
|---|
| 211 | // When adding new genetic code: |
|---|
| 212 | // - increase AP_codon_table.hxx@AWT_CODON_TABLES |
|---|
| 213 | // - increase .@MAX_EMBL_TRANSL_TABLE_VALUE |
|---|
| 214 | // - add arb-codenr to .@ALL_TABLES |
|---|
| 215 | |
|---|
| 216 | #define MAX_EMBL_TRANSL_TABLE_VALUE 31 // maximum known EMBL transl_table value |
|---|
| 217 | |
|---|
| 218 | // -------------------------------------------------------------------------------- |
|---|
| 219 | |
|---|
| 220 | int TTIT_embl2arb(int embl_code_nr) { |
|---|
| 221 | // returns -1 if embl_code_nr is not known by ARB |
|---|
| 222 | |
|---|
| 223 | static bool initialized = false; |
|---|
| 224 | static int arb_code_nr_table[MAX_EMBL_TRANSL_TABLE_VALUE+1]; // key: embl_code_nr, value: arb_code_nr or -1 |
|---|
| 225 | |
|---|
| 226 | if (!initialized) { |
|---|
| 227 | for (int embl = 0; embl <= MAX_EMBL_TRANSL_TABLE_VALUE; ++embl) { |
|---|
| 228 | arb_code_nr_table[embl] = -1; // illegal table |
|---|
| 229 | } |
|---|
| 230 | for (int arb_code_nr = 0; arb_code_nr < AWT_CODON_TABLES; ++arb_code_nr) { |
|---|
| 231 | int embl = AWT_codon_def[arb_code_nr].embl_feature_transl_table; |
|---|
| 232 | |
|---|
| 233 | pn_assert(embl<=MAX_EMBL_TRANSL_TABLE_VALUE); // defined embl code is above limit |
|---|
| 234 | pn_assert(arb_code_nr_table[embl] == -1); // duplicate definition of EMBL table number |
|---|
| 235 | |
|---|
| 236 | arb_code_nr_table[embl] = arb_code_nr; |
|---|
| 237 | } |
|---|
| 238 | // should be index of 'Bacterial and Plant Plastid code' |
|---|
| 239 | // (otherwise maybe AWAR_PROTEIN_TYPE_bacterial_code_index is wrong) |
|---|
| 240 | pn_assert(arb_code_nr_table[EMBL_BACTERIAL_TABLE_INDEX] == AWAR_PROTEIN_TYPE_bacterial_code_index); |
|---|
| 241 | pn_assert(arb_code_nr_table[1] == 0); // Standard code has to be on index zero! |
|---|
| 242 | pn_assert(arb_code_nr_table[MAX_EMBL_TRANSL_TABLE_VALUE] != -1); // arb_code_nr_table is defined too big |
|---|
| 243 | |
|---|
| 244 | initialized = true; |
|---|
| 245 | } |
|---|
| 246 | |
|---|
| 247 | if (embl_code_nr<0 || embl_code_nr>MAX_EMBL_TRANSL_TABLE_VALUE) return -1; |
|---|
| 248 | |
|---|
| 249 | int arb_code_nr = arb_code_nr_table[embl_code_nr]; |
|---|
| 250 | #ifdef DEBUG |
|---|
| 251 | if (arb_code_nr != -1) { |
|---|
| 252 | pn_assert(arb_code_nr >= 0 && arb_code_nr < AWT_CODON_TABLES); |
|---|
| 253 | pn_assert(TTIT_arb2embl(arb_code_nr) == embl_code_nr); |
|---|
| 254 | } |
|---|
| 255 | #endif |
|---|
| 256 | return arb_code_nr; |
|---|
| 257 | } |
|---|
| 258 | |
|---|
| 259 | int TTIT_arb2embl(int arb_code_nr) { |
|---|
| 260 | pn_assert(arb_code_nr >= 0 && arb_code_nr<AWT_CODON_TABLES); |
|---|
| 261 | return AWT_codon_def[arb_code_nr].embl_feature_transl_table; |
|---|
| 262 | } |
|---|
| 263 | |
|---|
| 264 | |
|---|
| 265 | static bool codon_tables_initialized = false; |
|---|
| 266 | static char definite_translation[AWT_MAX_CODONS]; // contains 0 if ambiguous, otherwise it contains the definite translation |
|---|
| 267 | static char *ambiguous_codons[AWT_MAX_CODONS]; // for each ambiguous codon: contains all translations (each only once) |
|---|
| 268 | |
|---|
| 269 | static void addToAmbiguous(int codon_nr, char possible_translation) { |
|---|
| 270 | static uint8_t length[AWT_MAX_CODONS]; |
|---|
| 271 | |
|---|
| 272 | char*& ambEntry = ambiguous_codons[codon_nr]; |
|---|
| 273 | uint8_t& ambLen = length[codon_nr]; |
|---|
| 274 | |
|---|
| 275 | if (!ambEntry) { // first insert |
|---|
| 276 | ambEntry = ARB_calloc<char>(AWT_MAX_CODONS+1); |
|---|
| 277 | ambEntry[0] = possible_translation; |
|---|
| 278 | ambLen = 1; |
|---|
| 279 | } |
|---|
| 280 | else if (!strchr(ambEntry, possible_translation)) { |
|---|
| 281 | ambEntry[ambLen++] = possible_translation; |
|---|
| 282 | } |
|---|
| 283 | } |
|---|
| 284 | |
|---|
| 285 | void AP_initialize_codon_tables() { |
|---|
| 286 | if (codon_tables_initialized) return; |
|---|
| 287 | |
|---|
| 288 | int codon_nr; |
|---|
| 289 | int code_nr; |
|---|
| 290 | |
|---|
| 291 | for (codon_nr=0; codon_nr<AWT_MAX_CODONS; codon_nr++) { |
|---|
| 292 | ambiguous_codons[codon_nr] = NULp; |
|---|
| 293 | } |
|---|
| 294 | |
|---|
| 295 | pn_assert(AWT_CODON_TABLES>=1); |
|---|
| 296 | pn_assert(!AWT_codon_def[AWT_CODON_TABLES].aa); // Error in AWT_codon_def or AWT_CODON_CODES |
|---|
| 297 | |
|---|
| 298 | for (code_nr=0; code_nr<AWT_CODON_TABLES; code_nr++) { |
|---|
| 299 | const char *translation = AWT_codon_def[code_nr].aa; |
|---|
| 300 | const char *startStop = AWT_codon_def[code_nr].startStop; |
|---|
| 301 | |
|---|
| 302 | pn_assert(strlen(AWT_codon_def[code_nr].name) <= AWT_CODON_TABLE_MAX_NAME_LENGTH); // GUI layout depends on max. name length |
|---|
| 303 | |
|---|
| 304 | for (codon_nr=0; codon_nr<AWT_MAX_CODONS; codon_nr++) { |
|---|
| 305 | bool isOptionalStartStop = false; |
|---|
| 306 | |
|---|
| 307 | // check definition of 'translation' and 'startStop' is consistent: |
|---|
| 308 | switch (startStop[codon_nr]) { |
|---|
| 309 | case 'M': // defined as start-codon |
|---|
| 310 | pn_assert(translation[codon_nr] != '*'); // invalid def: stop AND start |
|---|
| 311 | isOptionalStartStop = translation[codon_nr] != 'M'; |
|---|
| 312 | break; |
|---|
| 313 | |
|---|
| 314 | case '*': // defined as stop-codon (new def style) |
|---|
| 315 | pn_assert(translation[codon_nr] != 'M'); // invalid def: start AND stop |
|---|
| 316 | isOptionalStartStop = translation[codon_nr] != '*'; |
|---|
| 317 | break; |
|---|
| 318 | |
|---|
| 319 | case '-': // neither start nor stop (new def style) not start (old def style) |
|---|
| 320 | pn_assert(translation[codon_nr] != '*'); // invalid def: stop codons have to be marked in 'Starts' definition |
|---|
| 321 | break; |
|---|
| 322 | |
|---|
| 323 | default: |
|---|
| 324 | pn_assert(0); // invalid character in startStop |
|---|
| 325 | break; |
|---|
| 326 | } |
|---|
| 327 | |
|---|
| 328 | // detect definite/ambiguous translations: |
|---|
| 329 | if (code_nr == 0) { // first table (no ambiguity possible yet) |
|---|
| 330 | if (isOptionalStartStop) { |
|---|
| 331 | addToAmbiguous(codon_nr, translation[codon_nr]); |
|---|
| 332 | addToAmbiguous(codon_nr, startStop[codon_nr]); |
|---|
| 333 | definite_translation[codon_nr] = 0; |
|---|
| 334 | } |
|---|
| 335 | else { |
|---|
| 336 | definite_translation[codon_nr] = translation[codon_nr]; |
|---|
| 337 | } |
|---|
| 338 | } |
|---|
| 339 | else if (definite_translation[codon_nr]) { // is definite till now |
|---|
| 340 | if (definite_translation[codon_nr] != translation[codon_nr] || isOptionalStartStop) { // we found a different translation |
|---|
| 341 | addToAmbiguous(codon_nr, definite_translation[codon_nr]); |
|---|
| 342 | addToAmbiguous(codon_nr, translation[codon_nr]); |
|---|
| 343 | if (isOptionalStartStop) addToAmbiguous(codon_nr, startStop[codon_nr]); |
|---|
| 344 | definite_translation[codon_nr] = 0; |
|---|
| 345 | } |
|---|
| 346 | } |
|---|
| 347 | else { // is ambiguous |
|---|
| 348 | addToAmbiguous(codon_nr, translation[codon_nr]); |
|---|
| 349 | if (isOptionalStartStop) addToAmbiguous(codon_nr, startStop[codon_nr]); |
|---|
| 350 | } |
|---|
| 351 | } |
|---|
| 352 | } |
|---|
| 353 | |
|---|
| 354 | codon_tables_initialized = true; |
|---|
| 355 | } |
|---|
| 356 | |
|---|
// Convert a nucleotide character into its index.
// Accepts upper- and lowercase letters; 'U' is treated like 'T'.
// return 0..3 (ok; T/U=0, C=1, A=2, G=3) or 4 (failure)
inline int dna2idx(char c) {
    const char upper[4] = { 'T', 'C', 'A', 'G' };
    const char lower[4] = { 't', 'c', 'a', 'g' };

    if (c == 'U' || c == 'u') return 0;

    for (int idx = 0; idx<4; ++idx) {
        if (c == upper[idx] || c == lower[idx]) return idx;
    }
    return 4;
}
| 368 | |
|---|
| 369 | inline char idx2dna(int idx) { |
|---|
| 370 | pn_assert(idx>=0 && idx<4); |
|---|
| 371 | return "TCAG"[idx]; |
|---|
| 372 | } |
|---|
| 373 | |
|---|
| 374 | inline int calc_codon_nr(const char *dna) { |
|---|
| 375 | int i1 = dna2idx(dna[0]); if (i1 == 4) return AWT_MAX_CODONS; // is not a codon |
|---|
| 376 | int i2 = dna2idx(dna[1]); if (i2 == 4) return AWT_MAX_CODONS; |
|---|
| 377 | int i3 = dna2idx(dna[2]); if (i3 == 4) return AWT_MAX_CODONS; |
|---|
| 378 | |
|---|
| 379 | int codon_nr = i1*16 + i2*4 + i3; |
|---|
| 380 | pn_assert(codon_nr>=0 && codon_nr<=AWT_MAX_CODONS); |
|---|
| 381 | return codon_nr; |
|---|
| 382 | } |
|---|
| 383 | |
|---|
| 384 | inline void build_codon(int codon_nr, char *to_buffer) { |
|---|
| 385 | pn_assert(codon_nr>=0 && codon_nr<AWT_MAX_CODONS); |
|---|
| 386 | |
|---|
| 387 | to_buffer[0] = idx2dna((codon_nr>>4)&3); |
|---|
| 388 | to_buffer[1] = idx2dna((codon_nr>>2)&3); |
|---|
| 389 | to_buffer[2] = idx2dna(codon_nr&3); |
|---|
| 390 | } |
|---|
| 391 | |
|---|
| 392 | const char* AWT_get_codon_code_name(int code) { |
|---|
| 393 | pn_assert(code>=0 && code<AWT_CODON_TABLES); |
|---|
| 394 | return AWT_codon_def[code].name; |
|---|
| 395 | } |
|---|
| 396 | |
|---|
| 397 | static const char *aa_3letter_name[26+1] = { |
|---|
| 398 | "Ala", // A |
|---|
| 399 | "Asx", // B (= D or N) |
|---|
| 400 | "Cys", // C |
|---|
| 401 | "Asp", // D |
|---|
| 402 | "Glu", // E |
|---|
| 403 | "Phe", // F |
|---|
| 404 | "Gly", // G |
|---|
| 405 | "His", // H |
|---|
| 406 | "Ile", // I |
|---|
| 407 | "Xle", // J (= I or L) |
|---|
| 408 | "Lys", // K |
|---|
| 409 | "Leu", // L |
|---|
| 410 | "Met", // M |
|---|
| 411 | "Asn", // N |
|---|
| 412 | NULp, // O |
|---|
| 413 | "Pro", // P |
|---|
| 414 | "Gln", // Q |
|---|
| 415 | "Arg", // R |
|---|
| 416 | "Ser", // S |
|---|
| 417 | "Thr", // T |
|---|
| 418 | NULp, // U |
|---|
| 419 | "Val", // V |
|---|
| 420 | "Trp", // W |
|---|
| 421 | "Xaa", // X |
|---|
| 422 | "Tyr", // Y |
|---|
| 423 | "Glx", // Z (= E or Q) |
|---|
| 424 | NULp |
|---|
| 425 | }; |
|---|
| 426 | |
|---|
| 427 | const char *getAminoAcidAbbr(char aa) { |
|---|
| 428 | if (aa=='*') return "End"; |
|---|
| 429 | aa = toupper(aa); |
|---|
| 430 | if (aa>='A' && aa<='Z') return aa_3letter_name[aa-'A']; |
|---|
| 431 | return NULp; |
|---|
| 432 | } |
|---|
| 433 | |
|---|
| 434 | #ifdef DEBUG |
|---|
| 435 | |
|---|
| 436 | inline char nextBase(char c) { |
|---|
| 437 | switch (c) { |
|---|
| 438 | case 'T': return 'C'; |
|---|
| 439 | case 'C': return 'A'; |
|---|
| 440 | case 'A': return 'G'; |
|---|
| 441 | #if 0 |
|---|
| 442 | case 'G': return 0; |
|---|
| 443 | #else |
|---|
| 444 | case 'G': return 'M'; |
|---|
| 445 | case 'M': return 'R'; |
|---|
| 446 | case 'R': return 'W'; |
|---|
| 447 | case 'W': return 'S'; |
|---|
| 448 | case 'S': return 'Y'; |
|---|
| 449 | case 'Y': return 'K'; |
|---|
| 450 | case 'K': return 'V'; |
|---|
| 451 | case 'V': return 'H'; |
|---|
| 452 | case 'H': return 'D'; |
|---|
| 453 | case 'D': return 'B'; |
|---|
| 454 | case 'B': return 'N'; |
|---|
| 455 | case 'N': return 0; |
|---|
| 456 | #endif |
|---|
| 457 | default: pn_assert(0); |
|---|
| 458 | } |
|---|
| 459 | return 0; |
|---|
| 460 | } |
|---|
| 461 | |
|---|
| 462 | void AWT_dump_codons(TranslationTableIndexType type, bool skipX) { |
|---|
| 463 | // use for debugging |
|---|
| 464 | |
|---|
| 465 | const TransTables all_allowed; |
|---|
| 466 | |
|---|
| 467 | for (char c='*'; c<='Z'; c++) { |
|---|
| 468 | printf("Codons for '%c': ", c); |
|---|
| 469 | |
|---|
| 470 | if (skipX && c == 'X') { |
|---|
| 471 | fputs("skipped", stdout); |
|---|
| 472 | } |
|---|
| 473 | else { |
|---|
| 474 | bool first_line = true; |
|---|
| 475 | bool found = false; |
|---|
| 476 | for (char b1='T'; b1; b1=nextBase(b1)) { |
|---|
| 477 | for (char b2='T'; b2; b2=nextBase(b2)) { |
|---|
| 478 | for (char b3='T'; b3; b3=nextBase(b3)) { |
|---|
| 479 | char dna[4]; |
|---|
| 480 | dna[0]=b1; |
|---|
| 481 | dna[1]=b2; |
|---|
| 482 | dna[2]=b3; |
|---|
| 483 | dna[3]=0; |
|---|
| 484 | |
|---|
| 485 | TransTables remaining; |
|---|
| 486 | if (AWT_is_codon(c, dna, all_allowed, remaining)) { |
|---|
| 487 | if (!first_line) fputs("\n ", stdout); |
|---|
| 488 | first_line = false; |
|---|
| 489 | printf("%s (%s)", dna, remaining.to_string(type)); |
|---|
| 490 | found = true; |
|---|
| 491 | } |
|---|
| 492 | } |
|---|
| 493 | } |
|---|
| 494 | } |
|---|
| 495 | if (!found) fputs("none", stdout); |
|---|
| 496 | } |
|---|
| 497 | fputs("\n", stdout); |
|---|
| 498 | if (c=='*') c='A'-1; |
|---|
| 499 | } |
|---|
| 500 | } |
|---|
| 501 | #endif |
|---|
| 502 | |
|---|
| 503 | inline char isStartOrStopCodonNr(int codon_nr, int code_nr) { |
|---|
| 504 | char isStartStop = 0; |
|---|
| 505 | pn_assert(code_nr >= 0 && code_nr<AWT_CODON_TABLES); |
|---|
| 506 | |
|---|
| 507 | pn_assert(codon_nr != AWT_MAX_CODONS); // should not be called with IUPAC codons |
|---|
| 508 | pn_assert(codon_nr >= 0 && codon_nr<AWT_MAX_CODONS); // (use isStartOrStopCodon, isStartCodon or isStopCodon) |
|---|
| 509 | |
|---|
| 510 | if (codon_nr != AWT_MAX_CODONS) { // 'codon' is a clean codon (it contains no iupac-codes) |
|---|
| 511 | isStartStop = AWT_codon_def[code_nr].startStop[codon_nr]; |
|---|
| 512 | if (isStartStop == '-') { |
|---|
| 513 | isStartStop = 0; |
|---|
| 514 | } |
|---|
| 515 | } |
|---|
| 516 | |
|---|
| 517 | arb_assert(implicated(isStartStop, isStartStop == '*' || isStartStop == 'M')); |
|---|
| 518 | return isStartStop; |
|---|
| 519 | } |
|---|
| 520 | |
|---|
| 521 | char AWT_translator::isStartOrStopCodon(const char *codon) const { |
|---|
| 522 | /*! test whether 'codon' is a start- or stop-codon. |
|---|
| 523 | * @param codon three bases definining the codon |
|---|
| 524 | * @return '*' for stop-codons, 'M' for start-codons, 0 otherwise |
|---|
| 525 | */ |
|---|
| 526 | |
|---|
| 527 | char result = 0; |
|---|
| 528 | int codon_nr = calc_codon_nr(codon); |
|---|
| 529 | if (codon_nr == AWT_MAX_CODONS) { // codon contains iupac codes (rare case -> brute force implementation ok) |
|---|
| 530 | TransTables allowed; |
|---|
| 531 | allowed.forbidAllBut(CodeNr()); |
|---|
| 532 | TransTables remaining = allowed; |
|---|
| 533 | |
|---|
| 534 | bool is_start = AWT_is_codon('M', codon, allowed, remaining, NULp); |
|---|
| 535 | bool is_stop = is_start ? false : AWT_is_codon('*', codon, allowed, remaining, NULp); |
|---|
| 536 | |
|---|
| 537 | pn_assert(!(is_start && is_stop)); |
|---|
| 538 | result = is_start ? 'M' : (is_stop ? '*' : 0); |
|---|
| 539 | } |
|---|
| 540 | else { // codon is a clean codon |
|---|
| 541 | result = isStartOrStopCodonNr(calc_codon_nr(codon), code_nr); |
|---|
| 542 | } |
|---|
| 543 | return result; |
|---|
| 544 | } |
|---|
| 545 | |
|---|
| 546 | inline bool protMatches(char p1, char p2) { |
|---|
| 547 | /*! return true if p1 matches p2 |
|---|
| 548 | * @param p1 "normal" protein (neither B, Z nor J) |
|---|
| 549 | * @param p2 any protein (B, Z and J ok) |
|---|
| 550 | * B is a shortcut for Asp(=D) or Asn(=N) |
|---|
| 551 | * J is a shortcut for Ile(=I) or Leu(=L) |
|---|
| 552 | * Z is a shortcut for Glu(=E) or Gln(=Q) |
|---|
| 553 | */ |
|---|
| 554 | pn_assert(p1 != 'B' && p1 != 'Z' && p1 != 'J'); |
|---|
| 555 | pn_assert(p1 == toupper(p1)); |
|---|
| 556 | pn_assert(p2 == toupper(p2)); |
|---|
| 557 | |
|---|
| 558 | if (p1 == p2) return true; |
|---|
| 559 | if (p2 == 'B') return p1 == 'D' || p1 == 'N'; |
|---|
| 560 | if (p2 == 'J') return p1 == 'I' || p1 == 'L'; |
|---|
| 561 | if (p2 == 'Z') return p1 == 'E' || p1 == 'Q'; |
|---|
| 562 | return false; |
|---|
| 563 | } |
|---|
| 564 | inline bool containsProtMatching(const char *pstr, char p) { |
|---|
| 565 | /*! return true, if 'pstr' contains any protein that matches 'p'. |
|---|
| 566 | * uses same logic as protMatches() |
|---|
| 567 | */ |
|---|
| 568 | pn_assert(p == toupper(p)); |
|---|
| 569 | if (p == 'B') return strchr(pstr, 'D') || strchr(pstr, 'N'); |
|---|
| 570 | if (p == 'J') return strchr(pstr, 'I') || strchr(pstr, 'L'); |
|---|
| 571 | if (p == 'Z') return strchr(pstr, 'E') || strchr(pstr, 'Q'); |
|---|
| 572 | return strchr(pstr, p); |
|---|
| 573 | } |
|---|
| 574 | inline bool isGap(char c) { return GAP::is_std_gap(c); } |
|---|
| 575 | |
|---|
| 576 | inline GB_ERROR neverTranslatesError(const char *dna, char protein) { |
|---|
| 577 | if (!strchr(VALID_PROTEIN, protein)) { |
|---|
| 578 | return GBS_global_string("'%c' is no valid amino acid", protein); |
|---|
| 579 | } |
|---|
| 580 | return GBS_global_string("'%c%c%c' never translates to '%c'", dna[0], dna[1], dna[2], protein); |
|---|
| 581 | } |
|---|
| 582 | |
|---|
| 583 | bool AWT_is_codon(char protein, const char *const dna, const TransTables& allowed, TransTables& remaining, const char **fail_reason_ptr) { |
|---|
| 584 | /*! test if 'dna' codes 'protein' |
|---|
| 585 | * @param protein amino acid |
|---|
| 586 | * @param dna three nucleotides (gaps allowed, e.g. 'A-C' can be tested vs 'X') |
|---|
| 587 | * @param allowed allowed translation tables |
|---|
| 588 | * @param remaining returns the remaining allowed translation tables (only if this functions returns true) |
|---|
| 589 | * @param fail_reason_ptr if not NULp => store reason for failure here (or set it to NULp on success) |
|---|
| 590 | * @return true if dna translates to protein |
|---|
| 591 | */ |
|---|
| 592 | |
|---|
| 593 | pn_assert(allowed.any()); |
|---|
| 594 | pn_assert(codon_tables_initialized); |
|---|
| 595 | |
|---|
| 596 | const char *fail_reason = NULp; |
|---|
| 597 | if (fail_reason_ptr) *fail_reason_ptr = NULp; |
|---|
| 598 | |
|---|
| 599 | bool is_codon = false; |
|---|
| 600 | int codon_nr = calc_codon_nr(dna); |
|---|
| 601 | int first_iupac_pos = -1; |
|---|
| 602 | int iupac_positions = 0; |
|---|
| 603 | bool decided = false; |
|---|
| 604 | bool general_failure = false; |
|---|
| 605 | |
|---|
| 606 | protein = toupper(protein); |
|---|
| 607 | |
|---|
| 608 | if (codon_nr==AWT_MAX_CODONS) { // dna is not a clean codon (i.e. it contains iupac-codes or gaps) |
|---|
| 609 | bool too_short = false; |
|---|
| 610 | int nucs_seen = 0; |
|---|
| 611 | for (int iupac_pos=0; iupac_pos<3 && !too_short && !fail_reason; iupac_pos++) { |
|---|
| 612 | char N = dna[iupac_pos]; |
|---|
| 613 | |
|---|
| 614 | if (!N) too_short = true; |
|---|
| 615 | else if (!isGap(N)) { |
|---|
| 616 | nucs_seen++; |
|---|
| 617 | if (!strchr("ACGTU", N)) { |
|---|
| 618 | if (first_iupac_pos==-1) first_iupac_pos = iupac_pos; |
|---|
| 619 | iupac_positions++; |
|---|
| 620 | const char *decoded_iupac = iupac::decode(N, GB_AT_DNA, 0); |
|---|
| 621 | if (!decoded_iupac[0]) { // no valid IUPAC |
|---|
| 622 | fail_reason = GBS_global_string("Invalid character '%c' in DNA", N); |
|---|
| 623 | } |
|---|
| 624 | } |
|---|
| 625 | } |
|---|
| 626 | } |
|---|
| 627 | |
|---|
| 628 | if (!fail_reason && !nucs_seen) { // got no dna |
|---|
| 629 | fail_reason = "No nucleotides left"; |
|---|
| 630 | } |
|---|
| 631 | else if (nucs_seen<3) { |
|---|
| 632 | too_short = true; |
|---|
| 633 | } |
|---|
| 634 | |
|---|
| 635 | if (fail_reason) { |
|---|
| 636 | decided = true; // fails for all proteins |
|---|
| 637 | } |
|---|
| 638 | else if (too_short) { |
|---|
| 639 | decided = true; |
|---|
| 640 | if (protein == 'X') { |
|---|
| 641 | is_codon = true; |
|---|
| 642 | } |
|---|
| 643 | else { |
|---|
| 644 | char dna_copy[4]; |
|---|
| 645 | strncpy(dna_copy, dna, 3); |
|---|
| 646 | dna_copy[3] = 0; |
|---|
| 647 | |
|---|
| 648 | fail_reason = GBS_global_string("Not enough nucleotides (got '%s')", dna_copy); |
|---|
| 649 | } |
|---|
| 650 | } |
|---|
| 651 | } |
|---|
| 652 | |
|---|
| 653 | if (!decided) { |
|---|
| 654 | if (protein == 'X') { |
|---|
| 655 | TransTables allowed_copy = allowed; |
|---|
| 656 | const char *valid_prot = VALID_PROTEIN_NO_X; |
|---|
| 657 | |
|---|
| 658 | for (int i = 0; valid_prot[i]; ++i) { |
|---|
| 659 | if (AWT_is_codon(valid_prot[i], dna, allowed_copy, remaining)) { |
|---|
| 660 | allowed_copy.forbid(remaining); |
|---|
| 661 | if (allowed_copy.none()) break; |
|---|
| 662 | } |
|---|
| 663 | } |
|---|
| 664 | |
|---|
| 665 | if (allowed_copy.any()) { |
|---|
| 666 | is_codon = true; |
|---|
| 667 | remaining = allowed_copy; |
|---|
| 668 | } |
|---|
| 669 | else { |
|---|
| 670 | fail_reason = neverTranslatesError(dna, protein); |
|---|
| 671 | } |
|---|
| 672 | } |
|---|
| 673 | else if (codon_nr==AWT_MAX_CODONS) { // dna is a codon with one or more IUPAC codes |
|---|
| 674 | pn_assert(iupac_positions); |
|---|
| 675 | const char *decoded_iupac = iupac::decode(dna[first_iupac_pos], GB_AT_DNA, 0); |
|---|
| 676 | pn_assert(decoded_iupac[0]); // already should have been catched above |
|---|
| 677 | |
|---|
| 678 | char dna_copy[4]; |
|---|
| 679 | memcpy(dna_copy, dna, 3); |
|---|
| 680 | dna_copy[3] = 0; |
|---|
| 681 | |
|---|
| 682 | bool all_are_codons = true; |
|---|
| 683 | bool one_is_codon = false; |
|---|
| 684 | |
|---|
| 685 | TransTables allowed_copy = allowed; |
|---|
| 686 | |
|---|
| 687 | for (int i=0; decoded_iupac[i]; i++) { |
|---|
| 688 | dna_copy[first_iupac_pos] = decoded_iupac[i]; |
|---|
| 689 | const char *subfail; |
|---|
| 690 | if (!AWT_is_codon(protein, dna_copy, allowed_copy, remaining, &subfail)) { |
|---|
| 691 | all_are_codons = false; |
|---|
| 692 | if (!one_is_codon && ARB_strBeginsWith(subfail, "Not all ")) one_is_codon = true; |
|---|
| 693 | if (one_is_codon) break; |
|---|
| 694 | } |
|---|
| 695 | else { |
|---|
| 696 | one_is_codon = true; |
|---|
| 697 | allowed_copy = remaining; |
|---|
| 698 | } |
|---|
| 699 | } |
|---|
| 700 | |
|---|
| 701 | if (all_are_codons) { |
|---|
| 702 | pn_assert(allowed_copy.any()); |
|---|
| 703 | remaining = allowed_copy; |
|---|
| 704 | is_codon = true; |
|---|
| 705 | } |
|---|
| 706 | else { |
|---|
| 707 | remaining.forbidAll(); |
|---|
| 708 | dna_copy[first_iupac_pos] = dna[first_iupac_pos]; |
|---|
| 709 | if (one_is_codon) { |
|---|
| 710 | fail_reason = GBS_global_string("Not all IUPAC-combinations of '%s' translate to '%c'", dna_copy, protein); // careful when changing this message (see above) |
|---|
| 711 | } |
|---|
| 712 | else { |
|---|
| 713 | fail_reason = neverTranslatesError(dna_copy, protein); |
|---|
| 714 | } |
|---|
| 715 | } |
|---|
| 716 | } |
|---|
| 717 | else if (definite_translation[codon_nr]) { // codon has a definite translation (i.e. translates equal for all code-tables) |
|---|
| 718 | char defTransl = definite_translation[codon_nr]; |
|---|
| 719 | |
|---|
| 720 | #if defined(ASSERTION_USED) |
|---|
| 721 | bool optionalCodonExists = false; |
|---|
| 722 | for (int code_nr=0; code_nr<AWT_CODON_TABLES && !optionalCodonExists; code_nr++) { |
|---|
| 723 | char startStop = isStartOrStopCodonNr(codon_nr, code_nr); |
|---|
| 724 | if (startStop && startStop != defTransl) { // got optional start/stop codon |
|---|
| 725 | if (allowed.is_allowed(code_nr)) { |
|---|
| 726 | pn_assert(startStop == '*' || startStop == 'M'); |
|---|
| 727 | optionalCodonExists = true; |
|---|
| 728 | } |
|---|
| 729 | } |
|---|
| 730 | } |
|---|
| 731 | pn_assert(!optionalCodonExists); // when this fails -> definite_translation[] is wrong |
|---|
| 732 | #endif |
|---|
| 733 | |
|---|
| 734 | int ok = protMatches(defTransl, protein); |
|---|
| 735 | if (ok) { |
|---|
| 736 | remaining = allowed; |
|---|
| 737 | is_codon = true; |
|---|
| 738 | } |
|---|
| 739 | else { |
|---|
| 740 | remaining.forbidAll(); |
|---|
| 741 | fail_reason = GBS_global_string("'%c%c%c' translates to '%c', not to '%c'", dna[0], dna[1], dna[2], defTransl, protein); |
|---|
| 742 | general_failure = true; |
|---|
| 743 | } |
|---|
| 744 | } |
|---|
| 745 | else if (!containsProtMatching(ambiguous_codons[codon_nr], protein)) { // codon does not translate to protein in any code-table |
|---|
| 746 | remaining.forbidAll(); |
|---|
| 747 | fail_reason = neverTranslatesError(dna, protein); |
|---|
| 748 | general_failure = true; |
|---|
| 749 | } |
|---|
| 750 | else { |
|---|
| 751 | #if defined(ASSERTION_USED) |
|---|
| 752 | bool correct_disallowed_translation = false; |
|---|
| 753 | #endif |
|---|
| 754 | |
|---|
| 755 | // Now codon translates to protein in at least 1 code-table! |
|---|
| 756 | // Check whether protein translates in any of the allowed code-tables and forbid rest |
|---|
| 757 | for (int code_nr=0; code_nr<AWT_CODON_TABLES; code_nr++) { |
|---|
| 758 | bool mayTranslate = protMatches(AWT_codon_def[code_nr].aa[codon_nr], protein); |
|---|
| 759 | if (!mayTranslate && (protein == '*' || protein == 'M')) { |
|---|
| 760 | char startOrStop = isStartOrStopCodonNr(codon_nr, code_nr); |
|---|
| 761 | mayTranslate = startOrStop && protMatches(startOrStop, protein); |
|---|
| 762 | } |
|---|
| 763 | |
|---|
| 764 | if (mayTranslate) { // may codon_nr translate to protein for code_nr |
|---|
| 765 | if (allowed.is_allowed(code_nr)) { // is this code allowed? |
|---|
| 766 | remaining.allow(code_nr); |
|---|
| 767 | is_codon = true; |
|---|
| 768 | } |
|---|
| 769 | else { |
|---|
| 770 | remaining.forbid(code_nr); // otherwise forbid code in future |
|---|
| 771 | #if defined(ASSERTION_USED) |
|---|
| 772 | correct_disallowed_translation = true; |
|---|
| 773 | #endif |
|---|
| 774 | } |
|---|
| 775 | } |
|---|
| 776 | else { |
|---|
| 777 | remaining.forbid(code_nr); // otherwise forbid code in future |
|---|
| 778 | } |
|---|
| 779 | } |
|---|
| 780 | |
|---|
| 781 | if (!is_codon) { |
|---|
| 782 | pn_assert(correct_disallowed_translation); // should be true because otherwise we shouldn't run into this else-branch |
|---|
| 783 | fail_reason = GBS_global_string("'%c%c%c' does not translate to '%c'", dna[0], dna[1], dna[2], protein); |
|---|
| 784 | } |
|---|
| 785 | } |
|---|
| 786 | } |
|---|
| 787 | |
|---|
| 788 | if (!is_codon) { |
|---|
| 789 | pn_assert(fail_reason); |
|---|
| 790 | if (fail_reason_ptr) { |
|---|
| 791 | if (!allowed.all() && !general_failure) { |
|---|
| 792 | int one = allowed.explicit_table(); |
|---|
| 793 | if (one == -1) { |
|---|
| 794 | const char *left_tables = allowed.to_string(TTIT_EMBL); |
|---|
| 795 | pn_assert(left_tables[0]); // allowed should never be empty! |
|---|
| 796 | |
|---|
| 797 | fail_reason = GBS_global_string("%s (for any of the leftover trans-tables: %s)", fail_reason, left_tables); |
|---|
| 798 | } |
|---|
| 799 | else { |
|---|
| 800 | int one_embl = TTIT_arb2embl(one); |
|---|
| 801 | fail_reason = GBS_global_string("%s (for trans-table %i)", fail_reason, one_embl); |
|---|
| 802 | } |
|---|
| 803 | } |
|---|
| 804 | |
|---|
| 805 | *fail_reason_ptr = fail_reason; // set failure-reason if requested |
|---|
| 806 | } |
|---|
| 807 | } |
|---|
| 808 | #if defined(ASSERTION_USED) |
|---|
| 809 | else { |
|---|
| 810 | pn_assert(remaining.is_subset_of(allowed)); |
|---|
| 811 | } |
|---|
| 812 | #endif |
|---|
| 813 | return is_codon; |
|---|
| 814 | } |
|---|
| 815 | |
|---|
| 816 | // -------------------------------------------------------------------------------- Codon_Group |
|---|
| 817 | |
|---|
| 818 | #if defined(DEBUG) |
|---|
| 819 | // #define DUMP_CODON_GROUP_EXPANSION |
|---|
| 820 | #endif |
|---|
| 821 | |
|---|
| 822 | class Codon_Group { |
|---|
| 823 | char codon[64]; // index is calculated with calc_codon_nr |
|---|
| 824 | |
|---|
| 825 | public: |
|---|
| 826 | Codon_Group(char protein, int code_nr); |
|---|
| 827 | ~Codon_Group() {} |
|---|
| 828 | |
|---|
| 829 | Codon_Group& operator += (const Codon_Group& other); |
|---|
| 830 | int expand(char *to_buffer) const; |
|---|
| 831 | }; |
|---|
| 832 | |
|---|
| 833 | Codon_Group::Codon_Group(char protein, int code_nr) { |
|---|
| 834 | protein = toupper(protein); |
|---|
| 835 | pn_assert(protein=='*' || isalpha(protein)); |
|---|
| 836 | pn_assert(code_nr>=0 && code_nr<AWT_CODON_TABLES); |
|---|
| 837 | |
|---|
| 838 | const char *amino_table = AWT_codon_def[code_nr].aa; |
|---|
| 839 | for (int i=0; i<AWT_MAX_CODONS; i++) { |
|---|
| 840 | codon[i] = amino_table[i]==protein; |
|---|
| 841 | } |
|---|
| 842 | } |
|---|
| 843 | |
|---|
| 844 | Codon_Group& Codon_Group::operator+=(const Codon_Group& other) { |
|---|
| 845 | for (int i=0; i<AWT_MAX_CODONS; i++) { |
|---|
| 846 | codon[i] = codon[i] || other.codon[i]; |
|---|
| 847 | } |
|---|
| 848 | return *this; |
|---|
| 849 | } |
|---|
| 850 | |
|---|
// Returns non-zero iff 'i' lies in the range [0, 3].
inline int legal_dna_no(int i) {
    return !(i<0 || i>3);
}
|---|
| 852 | |
|---|
| 853 | inline const char *buildMixedCodon(const char *const con1, const char *const con2) { |
|---|
| 854 | int mismatches = 0; |
|---|
| 855 | int mismatch_index = -1; |
|---|
| 856 | static char buf[4]; |
|---|
| 857 | |
|---|
| 858 | for (int i=0; i<3; i++) { |
|---|
| 859 | if (con1[i]!=con2[i]) { |
|---|
| 860 | mismatches++; |
|---|
| 861 | mismatch_index = i; |
|---|
| 862 | } |
|---|
| 863 | else { |
|---|
| 864 | buf[i] = con1[i]; |
|---|
| 865 | } |
|---|
| 866 | } |
|---|
| 867 | |
|---|
| 868 | if (mismatches==1) { // exactly one position differs between codons |
|---|
| 869 | pn_assert(mismatch_index!=-1); |
|---|
| 870 | buf[mismatch_index] = iupac::combine(con1[mismatch_index], con2[mismatch_index], GB_AT_DNA); |
|---|
| 871 | buf[3] = 0; |
|---|
| 872 | |
|---|
| 873 | if (memcmp(con1, buf, 3) == 0 || |
|---|
| 874 | memcmp(con2, buf, 3) == 0) |
|---|
| 875 | { |
|---|
| 876 | return NULp; |
|---|
| 877 | } |
|---|
| 878 | |
|---|
| 879 | #if defined(DUMP_CODON_GROUP_EXPANSION) |
|---|
| 880 | printf(" buildMixedCodon('%c%c%c','%c%c%c') == '%s'\n", |
|---|
| 881 | con1[0], con1[1], con1[2], |
|---|
| 882 | con2[0], con2[1], con2[2], |
|---|
| 883 | buf); |
|---|
| 884 | #endif |
|---|
| 885 | |
|---|
| 886 | return buf; |
|---|
| 887 | } |
|---|
| 888 | return NULp; |
|---|
| 889 | } |
|---|
| 890 | |
|---|
| 891 | static int expandMore(const char *bufferStart, int no_of_condons, char*&to_buffer) { |
|---|
| 892 | int i, j; |
|---|
| 893 | const char *con1, *con2; |
|---|
| 894 | int added = 0; |
|---|
| 895 | |
|---|
| 896 | for (i=0; i<no_of_condons; i++) { |
|---|
| 897 | con1 = bufferStart+3*i; |
|---|
| 898 | |
|---|
| 899 | for (j=i+1; j<no_of_condons; j++) { |
|---|
| 900 | con2 = bufferStart+3*j; |
|---|
| 901 | const char *result = buildMixedCodon(con1, con2); |
|---|
| 902 | if (result) { |
|---|
| 903 | to_buffer[0] = 0; |
|---|
| 904 | // do we already have this codon? |
|---|
| 905 | const char *found; |
|---|
| 906 | const char *startSearch = bufferStart; |
|---|
| 907 | for (;;) { |
|---|
| 908 | found = strstr(startSearch, result); |
|---|
| 909 | if (!found) break; |
|---|
| 910 | int pos = (found-bufferStart); |
|---|
| 911 | if ((pos%3)==0) break; // yes already here! |
|---|
| 912 | startSearch = found+1; // was misaligned -> try behind |
|---|
| 913 | } |
|---|
| 914 | |
|---|
| 915 | if (!found) { |
|---|
| 916 | memmove(to_buffer, result, 3); to_buffer+=3; |
|---|
| 917 | added++; |
|---|
| 918 | } |
|---|
| 919 | } |
|---|
| 920 | } |
|---|
| 921 | } |
|---|
| 922 | return no_of_condons+added; |
|---|
| 923 | } |
|---|
| 924 | |
|---|
| 925 | int Codon_Group::expand(char *to_buffer) const { |
|---|
| 926 | int count = 0; |
|---|
| 927 | int i; |
|---|
| 928 | char *org_to_buffer = to_buffer; |
|---|
| 929 | |
|---|
| 930 | for (i=0; i<AWT_MAX_CODONS; i++) { |
|---|
| 931 | if (codon[i]) { |
|---|
| 932 | build_codon(i, to_buffer); |
|---|
| 933 | to_buffer += 3; |
|---|
| 934 | count++; |
|---|
| 935 | } |
|---|
| 936 | } |
|---|
| 937 | |
|---|
| 938 | #if defined(DUMP_CODON_GROUP_EXPANSION) |
|---|
| 939 | to_buffer[0] = 0; |
|---|
| 940 | printf("codons = '%s'\n", org_to_buffer); |
|---|
| 941 | #endif |
|---|
| 942 | |
|---|
| 943 | for (;;) { |
|---|
| 944 | int new_count = expandMore(org_to_buffer, count, to_buffer); |
|---|
| 945 | if (new_count==count) break; // nothing expanded -> done |
|---|
| 946 | count = new_count; |
|---|
| 947 | #if defined(DUMP_CODON_GROUP_EXPANSION) |
|---|
| 948 | to_buffer[0] = 0; |
|---|
| 949 | printf("codons (expandedMore) = '%s'\n", org_to_buffer); |
|---|
| 950 | #endif |
|---|
| 951 | } |
|---|
| 952 | |
|---|
| 953 | pn_assert(count==(int(to_buffer-org_to_buffer)/3)); |
|---|
| 954 | |
|---|
| 955 | return count; |
|---|
| 956 | } |
|---|
| 957 | |
|---|
| 958 | // -------------------------------------------------------------------------------- |
|---|
| 959 | |
|---|
| 960 | static Codon_Group *get_Codon_Group(char protein, int code_nr) { |
|---|
| 961 | pn_assert(code_nr>=0 && code_nr<AWT_CODON_TABLES); |
|---|
| 962 | protein = toupper(protein); |
|---|
| 963 | pn_assert(isalpha(protein) || protein=='*'); |
|---|
| 964 | pn_assert(codon_tables_initialized); |
|---|
| 965 | |
|---|
| 966 | Codon_Group *cgroup = NULp; |
|---|
| 967 | |
|---|
| 968 | if (protein=='B') { |
|---|
| 969 | cgroup = new Codon_Group('D', code_nr); |
|---|
| 970 | Codon_Group N('N', code_nr); |
|---|
| 971 | *cgroup += N; |
|---|
| 972 | } |
|---|
| 973 | else if (protein=='Z') { |
|---|
| 974 | cgroup = new Codon_Group('E', code_nr); |
|---|
| 975 | Codon_Group Q('Q', code_nr); |
|---|
| 976 | *cgroup += Q; |
|---|
| 977 | } |
|---|
| 978 | else { |
|---|
| 979 | cgroup = new Codon_Group(protein, code_nr); |
|---|
| 980 | } |
|---|
| 981 | |
|---|
| 982 | pn_assert(cgroup); |
|---|
| 983 | |
|---|
| 984 | return cgroup; |
|---|
| 985 | } |
|---|
| 986 | |
|---|
| 987 | #define MAX_CODON_LIST_LENGTH (70*3) |
|---|
| 988 | |
|---|
| 989 | const char *AP_get_codons(char protein, int code_nr) { |
|---|
| 990 | // get a list of all codons ("xyzxyzxyz...") encoding 'protein' in case we use Codon-Code 'code_nr' |
|---|
| 991 | // (includes all completely contained IUPAC-encoded codons at the end of list) |
|---|
| 992 | // |
|---|
| 993 | // Optional start-/stop-codons are not added |
|---|
| 994 | // (i.e. a query for 'M' or '*' may report "incomplete" results) |
|---|
| 995 | |
|---|
| 996 | Codon_Group *cgroup = get_Codon_Group(protein, code_nr); |
|---|
| 997 | |
|---|
| 998 | static char buffer[MAX_CODON_LIST_LENGTH+1]; |
|---|
| 999 | int offset = 3*cgroup->expand(buffer); |
|---|
| 1000 | pn_assert(offset<MAX_CODON_LIST_LENGTH); |
|---|
| 1001 | buffer[offset] = 0; |
|---|
| 1002 | |
|---|
| 1003 | delete cgroup; |
|---|
| 1004 | |
|---|
| 1005 | return buffer; |
|---|
| 1006 | } |
|---|
| 1007 | |
|---|
| 1008 | // -------------------------------------------------------------------------------- |
|---|
| 1009 | |
|---|
| 1010 | #ifdef UNIT_TESTS |
|---|
| 1011 | #ifndef TEST_UNIT_H |
|---|
| 1012 | #include <test_unit.h> |
|---|
| 1013 | #endif |
|---|
| 1014 | |
|---|
| 1015 | static const char *startStopSummary() { |
|---|
| 1016 | // returns string showing summary for start/stop |
|---|
| 1017 | // position = codon_nr |
|---|
| 1018 | // content: |
|---|
| 1019 | // '*' -> translates to stop-codon for at least one code |
|---|
| 1020 | // 'M' -> translates to start-codon for at least one code |
|---|
| 1021 | // '2' -> both (not necessarily same code) |
|---|
| 1022 | // '-' -> does not translate to start or stop for any code |
|---|
| 1023 | |
|---|
| 1024 | static char result[AWT_MAX_CODONS+1]; |
|---|
| 1025 | |
|---|
| 1026 | for (int codon = 0; codon<AWT_MAX_CODONS; ++codon) { |
|---|
| 1027 | char startStop = '-'; |
|---|
| 1028 | for (int code = 0; code<AWT_CODON_TABLES && (startStop != '2'); ++code) { |
|---|
| 1029 | switch (isStartOrStopCodonNr(codon, code)) { |
|---|
| 1030 | case '*': |
|---|
| 1031 | switch (startStop) { |
|---|
| 1032 | case '*': break; |
|---|
| 1033 | case '-': startStop = '*'; break; |
|---|
| 1034 | case 'M': startStop = '2'; break; |
|---|
| 1035 | default: pn_assert(0); break; |
|---|
| 1036 | } |
|---|
| 1037 | break; |
|---|
| 1038 | case 'M': |
|---|
| 1039 | switch (startStop) { |
|---|
| 1040 | case 'M': break; |
|---|
| 1041 | case '-': startStop = 'M'; break; |
|---|
| 1042 | case '*': startStop = '2'; break; |
|---|
| 1043 | default: pn_assert(0); break; |
|---|
| 1044 | } |
|---|
| 1045 | break; |
|---|
| 1046 | |
|---|
| 1047 | case 0: break; |
|---|
| 1048 | default: pn_assert(0); break; |
|---|
| 1049 | } |
|---|
| 1050 | } |
|---|
| 1051 | result[codon] = startStop; |
|---|
| 1052 | } |
|---|
| 1053 | result[AWT_MAX_CODONS] = 0; |
|---|
| 1054 | return result; |
|---|
| 1055 | } |
|---|
| 1056 | static const char *optionality() { |
|---|
| 1057 | // returns string indicating whether start/stop-codon is optional |
|---|
| 1058 | // position = codon_nr |
|---|
| 1059 | // content: |
|---|
| 1060 | // '-' -> only non-optional start/stop |
|---|
| 1061 | // '!' -> only optional start/stop |
|---|
| 1062 | // '?' -> both |
|---|
| 1063 | // ' ' -> never start or stop |
|---|
| 1064 | |
|---|
| 1065 | static char result[AWT_MAX_CODONS+1]; |
|---|
| 1066 | |
|---|
| 1067 | for (int codon = 0; codon<AWT_MAX_CODONS; ++codon) { |
|---|
| 1068 | char optional = ' '; |
|---|
| 1069 | for (int code = 0; code<AWT_CODON_TABLES && (optional != '?'); ++code) { |
|---|
| 1070 | char startStop = isStartOrStopCodonNr(codon, code); |
|---|
| 1071 | if (startStop) { |
|---|
| 1072 | bool is_optional = AWT_codon_def[code].aa[codon] != startStop; |
|---|
| 1073 | |
|---|
| 1074 | switch (optional) { |
|---|
| 1075 | case ' ': optional = is_optional ? '!' : '-'; break; |
|---|
| 1076 | case '-': optional = is_optional ? '?' : '-'; break; |
|---|
| 1077 | case '!': optional = is_optional ? '!' : '?'; break; |
|---|
| 1078 | default: pn_assert(0); break; |
|---|
| 1079 | } |
|---|
| 1080 | } |
|---|
| 1081 | } |
|---|
| 1082 | |
|---|
| 1083 | #if defined(ASSERTION_USED) |
|---|
| 1084 | bool sometimes_optional = optional == '!' || optional == '?'; |
|---|
| 1085 | pn_assert(!sometimes_optional || !definite_translation[codon]); |
|---|
| 1086 | #endif |
|---|
| 1087 | |
|---|
| 1088 | result[codon] = optional; |
|---|
| 1089 | } |
|---|
| 1090 | result[AWT_MAX_CODONS] = 0; |
|---|
| 1091 | |
|---|
| 1092 | return result; |
|---|
| 1093 | } |
|---|
| 1094 | static const char *definite() { |
|---|
| 1095 | static char result[AWT_MAX_CODONS+1]; |
|---|
| 1096 | for (int codon = 0; codon<AWT_MAX_CODONS; ++codon) { |
|---|
| 1097 | result[codon] = definite_translation[codon] ? definite_translation[codon] : ' '; |
|---|
| 1098 | } |
|---|
| 1099 | result[AWT_MAX_CODONS] = 0; |
|---|
| 1100 | return result; |
|---|
| 1101 | } |
|---|
| 1102 | static const char *ambig_count() { |
|---|
| 1103 | static char result[AWT_MAX_CODONS+1]; |
|---|
| 1104 | for (int codon = 0; codon<AWT_MAX_CODONS; ++codon) { |
|---|
| 1105 | const char *amb = ambiguous_codons[codon]; |
|---|
| 1106 | result[codon] = amb ? '0'+strlen(amb) : ' '; |
|---|
| 1107 | } |
|---|
| 1108 | result[AWT_MAX_CODONS] = 0; |
|---|
| 1109 | return result; |
|---|
| 1110 | } |
|---|
| 1111 | |
|---|
| 1112 | #define e2a(c) TTIT_embl2arb(c) |
|---|
| 1113 | |
|---|
| 1114 | void TEST_codon_check() { |
|---|
| 1115 | AP_initialize_codon_tables(); |
|---|
| 1116 | |
|---|
| 1117 | // 0000000000111111111122222222223333333333444444444455555555556666 codon number (0-63) |
|---|
| 1118 | // 0123456789012345678901234567890123456789012345678901234567890123 |
|---|
| 1119 | // |
|---|
| 1120 | // "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG" base1 |
|---|
| 1121 | // "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG" base2 |
|---|
| 1122 | // "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG" base3 |
|---|
| 1123 | TEST_EXPECT_EQUAL(startStopSummary(), "--2M--*---**--*----M------------MMMM----------**---M------------"); |
|---|
| 1124 | TEST_EXPECT_EQUAL(optionality (), " ?! - ?? ? ! !!?- -- ! "); |
|---|
| 1125 | TEST_EXPECT_EQUAL(definite (), "FF SS SYY CC W PPPPHHQQRRRR MTTTTNN KSS VVV AAAADDEEGGGG"); // optional start/stop codons shall never be definite |
|---|
| 1126 | TEST_EXPECT_EQUAL(ambig_count (), " 32 2 45 4 2225 222 2 45 2 "); // number of proteins in ambiguous_codons |
|---|
| 1127 | |
|---|
| 1128 | TEST_EXPECT_EQUAL(getAminoAcidAbbr('*'), "End"); |
|---|
| 1129 | TEST_EXPECT_EQUAL(getAminoAcidAbbr('C'), "Cys"); |
|---|
| 1130 | TEST_EXPECT_EQUAL(getAminoAcidAbbr('B'), "Asx"); |
|---|
| 1131 | TEST_EXPECT_EQUAL(getAminoAcidAbbr('b'), "Asx"); |
|---|
| 1132 | TEST_EXPECT_EQUAL(getAminoAcidAbbr('J'), "Xle"); |
|---|
| 1133 | TEST_EXPECT_EQUAL(getAminoAcidAbbr('O'), NULp); |
|---|
| 1134 | TEST_EXPECT_EQUAL(getAminoAcidAbbr('X'), "Xaa"); |
|---|
| 1135 | TEST_EXPECT_EQUAL(getAminoAcidAbbr('x'), "Xaa"); |
|---|
| 1136 | TEST_EXPECT_EQUAL(getAminoAcidAbbr('-'), NULp); |
|---|
| 1137 | TEST_EXPECT_EQUAL(getAminoAcidAbbr('='), NULp); |
|---|
| 1138 | TEST_EXPECT_EQUAL(getAminoAcidAbbr('7'), NULp); |
|---|
| 1139 | |
|---|
| 1140 | TEST_EXPECT(protMatches('V', 'V')); |
|---|
| 1141 | TEST_EXPECT(protMatches('N', 'B')); |
|---|
| 1142 | TEST_EXPECT(protMatches('E', 'Z')); |
|---|
| 1143 | TEST_EXPECT(!protMatches('N', 'Z')); |
|---|
| 1144 | TEST_EXPECT(!protMatches('V', 'Z')); |
|---|
| 1145 | |
|---|
| 1146 | TEST_EXPECT_EQUAL(AP_get_codons('D', 0), "GATGACGAY"); |
|---|
| 1147 | TEST_EXPECT_EQUAL(AP_get_codons('N', 0), "AATAACAAY"); |
|---|
| 1148 | TEST_EXPECT_EQUAL(AP_get_codons('B', 0), "AAT" "AAC" "GAT" "GAC" "AAY" "RAT" "RAC" "GAY" "RAY"); // 'B' = 'D' or 'N' |
|---|
| 1149 | |
|---|
| 1150 | TEST_EXPECT_EQUAL(AP_get_codons('L', 0), "TTATTGCTTCTCCTACTG" "TTRYTAYTGCTYCTWCTKCTMCTSCTRCTHCTBCTDCTVYTRCTN"); |
|---|
| 1151 | TEST_EXPECT_EQUAL(AP_get_codons('L', 2), "TTATTG" "TTR"); |
|---|
| 1152 | TEST_EXPECT_EQUAL(AP_get_codons('L', 9), "TTATTGCTTCTCCTAT" "TRYTACTYCTWCTMCTH"); |
|---|
| 1153 | TEST_EXPECT_EQUAL(AP_get_codons('L', 13), "TTATTGTAGCTTCTCCTACTG" "TTRYTATWGYTGCTYCTWCTKCTMCTSCTRCTHCTBCTDCTVYTRCTN"); |
|---|
| 1154 | TEST_EXPECT_EQUAL(AP_get_codons('L', 16), "TTGCTTCTCCTAC" "TGYTGCTYCTWCTKCTMCTSCTRCTHCTBCTDCTVCTN"); |
|---|
| 1155 | |
|---|
| 1156 | TEST_EXPECT_EQUAL(AP_get_codons('S', 0), "TCTTCCTCATCGAGTAGC" "TCYTCWTCKTCMTCSTCRAGYTCHTCBTCDTCVTCN"); |
|---|
| 1157 | TEST_EXPECT_EQUAL(AP_get_codons('S', 4), "TCTTCCTCATCGAGTAGCAGAAGG" "TCYTCWTCKTCMTCSTCRAGYAGWAGKAGMAGSAGRTCHTCBTCDTCVAGHAGBAGDAGVTCNAGN"); |
|---|
| 1158 | TEST_EXPECT_EQUAL(AP_get_codons('S', 9), "TCTTCCTCATCGCTGAGTAGC" "TCYTCWTCKTCMTCSTCRAGYTCHTCBTCDTCVTCN"); |
|---|
| 1159 | TEST_EXPECT_EQUAL(AP_get_codons('S', 15), "TCTTCCTCGAGTAGC" "TCYTCKTCSAGYTCB"); |
|---|
| 1160 | |
|---|
| 1161 | // stop-codons: |
|---|
| 1162 | TEST_EXPECT_EQUAL(AP_get_codons('*', e2a( 1)), "TAATAGTGA" "TARTRA"); // the 3 standard stop codons and their IUPAC covers |
|---|
| 1163 | TEST_EXPECT_EQUAL(AP_get_codons('*', e2a( 2)), "TAATAGAGAAGG" "TARAGR"); |
|---|
| 1164 | TEST_EXPECT_EQUAL(AP_get_codons('*', e2a( 3)), "TAATAG" "TAR"); // not TGA |
|---|
| 1165 | TEST_EXPECT_EQUAL(AP_get_codons('*', e2a( 4)), "TAATAG" "TAR"); |
|---|
| 1166 | TEST_EXPECT_EQUAL(AP_get_codons('*', e2a( 5)), "TAATAG" "TAR"); |
|---|
| 1167 | TEST_EXPECT_EQUAL(AP_get_codons('*', e2a( 9)), "TAATAG" "TAR"); |
|---|
| 1168 | TEST_EXPECT_EQUAL(AP_get_codons('*', e2a(10)), "TAATAG" "TAR"); |
|---|
| 1169 | TEST_EXPECT_EQUAL(AP_get_codons('*', e2a(13)), "TAATAG" "TAR"); |
|---|
| 1170 | TEST_EXPECT_EQUAL(AP_get_codons('*', e2a(21)), "TAATAG" "TAR"); |
|---|
| 1171 | TEST_EXPECT_EQUAL(AP_get_codons('*', e2a(15)), "TAATGA" "TRA"); // not TAG |
|---|
| 1172 | TEST_EXPECT_EQUAL(AP_get_codons('*', e2a(16)), "TAATGA" "TRA"); |
|---|
| 1173 | TEST_EXPECT_EQUAL(AP_get_codons('*', e2a( 6)), "TGA"); // not TAA TAG |
|---|
| 1174 | TEST_EXPECT_EQUAL(AP_get_codons('*', e2a(14)), "TAG"); // not TAA TGA |
|---|
| 1175 | TEST_EXPECT_EQUAL(AP_get_codons('*', e2a(22)), "TCATAATGA" "TMATSATRATVA"); |
|---|
| 1176 | TEST_EXPECT_EQUAL(AP_get_codons('*', e2a(23)), "TTATAATAGTGA" "TWATKATARTRATDA"); |
|---|
| 1177 | |
|---|
| 1178 | { |
|---|
| 1179 | // Note: optional start/stop-codons are not added in Codon_Group, |
|---|
| 1180 | // because they would introduce ambiguous mapping. |
|---|
| 1181 | |
|---|
| 1182 | // test optional stop-codons: |
|---|
| 1183 | TEST_EXPECT_EQUAL(AP_get_codons('*', e2a(27)), ""); |
|---|
| 1184 | TEST_EXPECT_EQUAL(AP_get_codons('*', e2a(28)), ""); |
|---|
| 1185 | TEST_EXPECT_EQUAL(AP_get_codons('*', e2a(31)), ""); |
|---|
| 1186 | |
|---|
| 1187 | // test optional start-codons: |
|---|
| 1188 | TEST_EXPECT_EQUAL(AP_get_codons('M', e2a( 1)), "ATG"); // 3 (start-codons listed in table-definition) |
|---|
| 1189 | TEST_EXPECT_EQUAL(AP_get_codons('M', e2a( 2)), "ATAATG" "ATR"); // 5 |
|---|
| 1190 | TEST_EXPECT_EQUAL(AP_get_codons('M', e2a( 3)), "ATAATG" "ATR"); // 2 |
|---|
| 1191 | TEST_EXPECT_EQUAL(AP_get_codons('M', e2a( 4)), "ATG"); // 8 |
|---|
| 1192 | TEST_EXPECT_EQUAL(AP_get_codons('M', e2a( 5)), "ATAATG" "ATR"); // 6 |
|---|
| 1193 | TEST_EXPECT_EQUAL(AP_get_codons('M', e2a( 6)), "ATG"); // 1 |
|---|
| 1194 | TEST_EXPECT_EQUAL(AP_get_codons('M', e2a(11)), "ATG"); // 7 |
|---|
| 1195 | TEST_EXPECT_EQUAL(AP_get_codons('M', e2a(13)), "ATAATG" "ATR"); // 4 |
|---|
| 1196 | TEST_EXPECT_EQUAL(AP_get_codons('M', e2a(24)), "ATG"); // 4 |
|---|
| 1197 | } |
|---|
| 1198 | |
|---|
| 1199 | TEST_EXPECT_EQUAL(AP_get_codons('X', 0), ""); // @@@ wrong: TGR->X (or disallow call) |
|---|
| 1200 | |
|---|
| 1201 | const TransTables allowed; |
|---|
| 1202 | |
|---|
| 1203 | // --------------------------- |
|---|
| 1204 | // test valid codons |
|---|
| 1205 | struct test_is_codon { |
|---|
| 1206 | char protein; |
|---|
| 1207 | const char *codon; |
|---|
| 1208 | const char *tables; |
|---|
| 1209 | }; |
|---|
| 1210 | |
|---|
| 1211 | #define ALL_TABLES "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24" // contains arb table-numbers |
|---|
| 1212 | |
|---|
| 1213 | test_is_codon is_codon[] = { |
|---|
| 1214 | { 'P', "CCC", ALL_TABLES }, |
|---|
| 1215 | { 'P', "CCN", ALL_TABLES }, |
|---|
| 1216 | { 'R', "CGN", ALL_TABLES }, |
|---|
| 1217 | |
|---|
| 1218 | { 'D', "GAY", ALL_TABLES }, |
|---|
| 1219 | { 'N', "AAY", ALL_TABLES }, |
|---|
| 1220 | { 'B', "AAY", ALL_TABLES }, // translates to 'N', but matches B(=D|N) for realigner |
|---|
| 1221 | { 'B', "GAY", ALL_TABLES }, // translates to 'D', but matches B(=D|N) for realigner |
|---|
| 1222 | { 'B', "RAY", ALL_TABLES }, // translates to 'D' or to 'N' (i.e. only matches 'B', see failing test for 'RAY' below) |
|---|
| 1223 | { 'B', "RAT", ALL_TABLES }, |
|---|
| 1224 | |
|---|
| 1225 | { 'Q', "CAR", ALL_TABLES }, |
|---|
| 1226 | { 'E', "GAR", ALL_TABLES }, |
|---|
| 1227 | { 'Z', "SAR", ALL_TABLES }, |
|---|
| 1228 | |
|---|
| 1229 | { 'X', "NNN", ALL_TABLES }, |
|---|
| 1230 | |
|---|
| 1231 | { 'L', "TTR", "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15" ",17,18,19,20,21,22,23,24" }, { 'X', "TTR", "16" }, |
|---|
| 1232 | { 'L', "YTA", "0,1"",3,4,5,6,7,8,9,10,11,12,13,14,15" ",17,18,19,20,21,22,23,24" }, { 'X', "YTA", "2,16" }, // Y=TC |
|---|
| 1233 | { 'L', "CTM", "0,1"",3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24" }, { 'T', "CTM", "2" }, // M=AC |
|---|
| 1234 | { 'L', "CTN", "0,1"",3,4,5,6,7,8"",10,11,12,13,14,15,16,17,18" ",20,21,22,23,24" }, { 'T', "CTN", "2" }, { 'X', "CTN", "9,19" }, |
|---|
| 1235 | { 'L', "CTK", "0,1"",3,4,5,6,7,8"",10,11,12,13,14,15,16,17,18" ",20,21,22,23,24" }, { 'T', "CTK", "2" }, { 'X', "CTK", "9,19" }, // K=TG |
|---|
| 1236 | |
|---|
| 1237 | { 'L', "TWG", "13,15" }, // W=AT |
|---|
| 1238 | { 'J', "TWG", "13,15" }, // translates to 'L', but matches J(=I|L) for realigner |
|---|
| 1239 | { 'X', "TWG", "0,1,2,3,4,5,6,7,8,9,10,11,12" ",14" ",16,17,18,19,20,21,22,23,24" }, // all but 'L<->TWG' |
|---|
| 1240 | |
|---|
| 1241 | { 'S', "AGY", ALL_TABLES }, |
|---|
| 1242 | { 'S', "TCY", ALL_TABLES }, |
|---|
| 1243 | { 'S', "TCN", "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,16,17,18,19,20,21,22,23,24" }, // all but 15 (where 'TCA->*') |
|---|
| 1244 | { 'S', "AGN", "4,6,11,14" }, |
|---|
| 1245 | { 'S', "AGR", "4,6,11,14" }, |
|---|
| 1246 | |
|---|
| 1247 | { '*', "AGR", "1" }, // R=AG |
|---|
| 1248 | { 'G', "AGR", "10" }, |
|---|
| 1249 | { 'X', "AGR", "17" }, |
|---|
| 1250 | { 'R', "AGR", "0,2,3,5,7,8,9,12,13,15,16,18,19,20,21,22,23,24" }, |
|---|
| 1251 | |
|---|
| 1252 | { 'G', "AGA", "10" }, |
|---|
| 1253 | { 'S', "AGA", "4,6,11,14,17" }, |
|---|
| 1254 | { 'R', "AGA", "0,2,3,5,7,8,9,12,13,15,16,18,19,20,21,22,23,24" }, |
|---|
| 1255 | { '*', "AGA", "1" }, |
|---|
| 1256 | |
|---|
| 1257 | { 'K', "AGG", "17" }, |
|---|
| 1258 | |
|---|
| 1259 | { 'W', "TGR", "1,2,3,4,6,10,11,14,17,20,21,24" }, |
|---|
| 1260 | { 'X', "TGR", "0,5,7,8,9,12,13,15,16,18,19,22,23" }, // all but 'W<->TGR' (e.g. code==0: TGA->* & TGG->W => TGR->X) |
|---|
| 1261 | |
|---|
| 1262 | { 'C', "TGW", "7" }, // W = AT |
|---|
| 1263 | { 'X', "TGW", "0,1,2,3,4,5,6,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24" }, // all but 'C<->TGW' |
|---|
| 1264 | |
|---|
| 1265 | { 'C', "TGT", ALL_TABLES }, |
|---|
| 1266 | |
|---|
| 1267 | { 'C', "TGA", "7" }, |
|---|
| 1268 | { 'G', "TGA", "18" }, |
|---|
| 1269 | { 'W', "TGA", "1,2,3,4,6,10,11,14,17,20,21,24" }, |
|---|
| 1270 | { '*', "TGA", "0,5,8,9,12,13,15,16,19,20,21,22,23" }, // standard stop codons |
|---|
| 1271 | { '*', "TAA", "0,1,2,3,4,6,7,8,9,10,12,13,14,15,16,17,18,19,21,24" }, |
|---|
| 1272 | { '*', "TAG", "0,1,2,3,4,6,7,8,9,10,11,14,16,17,18,19,21,24" }, |
|---|
| 1273 | |
|---|
| 1274 | { '*', "TRA", "0,8,9,12,13,15,16,19,21" }, // R=AG |
|---|
| 1275 | { 'X', "TRA", "1,2,3,4,5,6,7,10,11,14,17,18,20,22,23,24" }, // all but '*<->TRA' |
|---|
| 1276 | |
|---|
| 1277 | { '*', "TAR", "0,1,2,3,4,6,7,8,9,10,14,16,17,18,19,21,24" }, |
|---|
| 1278 | { 'Y', "TAR", "22" }, |
|---|
| 1279 | { 'E', "TAR", "23,24" }, |
|---|
| 1280 | { 'Q', "TAR", "5,20,21" }, |
|---|
| 1281 | { 'Z', "TAR", "5,20,21,23,24" }, // Z=EQ (TAR never translates to 'E', only 'Q') |
|---|
| 1282 | { 'X', "TAR", "11,12,13,15" }, |
|---|
| 1283 | |
|---|
| 1284 | { 'B', "AAW", "6,11,14" }, // W=AT |
|---|
| 1285 | { 'N', "AAW", "6,11,14" }, |
|---|
| 1286 | { 'X', "AAW", "0,1,2,3,4,5,7,8,9,10,12,13,15,16,17,18,19,20,21,22,23,24" }, // all but 'B<->AAW' & 'N<->AAW' |
|---|
| 1287 | |
|---|
| 1288 | { 'T', "CTG", "2" }, |
|---|
| 1289 | { 'S', "CTG", "9" }, |
|---|
| 1290 | { 'A', "CTG", "19" }, |
|---|
| 1291 | { 'L', "CTG", "0,1,3,4,5,6,7,8,10,11,12,13,14,15,16,17,18,20,21,22,23,24" }, // all but 'T<->CTG' & 'S<->CTG' |
|---|
| 1292 | { 'J', "CTG", "0,1,3,4,5,6,7,8,10,11,12,13,14,15,16,17,18,20,21,22,23,24" }, // same as for 'L' |
|---|
| 1293 | { 'M', "CTG", "0,3,8,9,17,19" }, // optional start-codon |
|---|
| 1294 | |
|---|
| 1295 | { 'T', "CTR", "2" }, |
|---|
| 1296 | { 'X', "CTR", "9,19" }, |
|---|
| 1297 | { 'L', "CTR", "0,1,3,4,5,6,7,8,10,11,12,13,14,15,16,17,18,20,21,22,23,24" }, // all but 'T<->CTR' & 'X<->CTR' |
|---|
| 1298 | |
|---|
| 1299 | { 'E', "KAR", "23,24" }, |
|---|
| 1300 | // Q <->KAR fails (see below) |
|---|
| 1301 | { 'Z', "KAR", "5,20,21,23,24" }, // Z=E|Q |
|---|
| 1302 | { 'X', "KAR", "0,1,2,3,4,6,7,8,9,10,11,12,13,14,15,16,17,18,19,22" }, |
|---|
| 1303 | |
|---|
| 1304 | { 'G', "KGA", "18" }, |
|---|
| 1305 | { 'X', "KGA", "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,19,20,21,22,23,24" }, // all but G<->KGA |
|---|
| 1306 | |
|---|
| 1307 | { 'E', "TAG", "23,24" }, |
|---|
| 1308 | { 'Q', "TAG", "5,12,20,21" }, |
|---|
| 1309 | { 'L', "TAG", "13,15" }, |
|---|
| 1310 | { 'Y', "TAG", "22" }, |
|---|
| 1311 | { 'J', "TAG", "13,15" }, // J=I|L |
|---|
| 1312 | { 'Z', "TAG", "5,12,20,21,23,24" }, // Z=E|Q |
|---|
| 1313 | { '*', "TAG", "0,1,2,3,4,6,7,8,9,10,11,14,16,17,18,19,21,24" }, |
|---|
| 1314 | |
|---|
| 1315 | { 'J', "WTA", "0,3,5,6,7,8,9,11,12,13,15,17,18,19,20,21,22,23,24" }, |
|---|
| 1316 | |
|---|
| 1317 | { 'X', "A-C", ALL_TABLES }, |
|---|
| 1318 | { 'X', ".T.", ALL_TABLES }, |
|---|
| 1319 | |
|---|
| 1320 | // tests to protect buffer overflows in dna |
|---|
| 1321 | { 'X', "CG", ALL_TABLES }, |
|---|
| 1322 | { 'X', "T", ALL_TABLES }, |
|---|
| 1323 | |
|---|
| 1324 | // 0000000000111111111122222222223333333333444444444455555555556666 codon number (0-63) |
|---|
| 1325 | // 0123456789012345678901234567890123456789012345678901234567890123 |
|---|
| 1326 | // |
|---|
| 1327 | // "TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG" base1 |
|---|
| 1328 | // "TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG" base2 |
|---|
| 1329 | // "TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG" base3 |
|---|
| 1330 | // "--2M--*---**--*----M------------MMMM----------**---M------------" (= startStopSummary) |
|---|
| 1331 | // " ?! - ?? ? ! !!?- -- ! " (= optionality: !=all start/stop optional; -=no start/stop optional, ?=mixed) |
|---|
| 1332 | |
|---|
| 1333 | // test all start codons: |
|---|
| 1334 | { 'M', "TTA", "3" }, // start AND stop -> see ../ALILINK/TranslateRealign.cxx@TTA_AMBIGUITY |
|---|
| 1335 | { 'M', "TTG", "0,3,4,8,10,17,18" }, |
|---|
| 1336 | { 'L', "TTG", ALL_TABLES }, |
|---|
| 1337 | // M <->CTG already tested above |
|---|
| 1338 | { 'M', "ATT", "1,3,4,8,16" }, |
|---|
| 1339 | { 'M', "ATC", "1,3,4,8" }, |
|---|
| 1340 | { 'M', "ATA", "1,2,3,4,8,10,14" }, |
|---|
| 1341 | { 'I', "ATA", "0,3,5,6,7,8,9,11,12,13,15,16,17,18,19,20,21,22,23,24" }, // optional for 3, 8 |
|---|
| 1342 | { 'M', "ATG", ALL_TABLES }, // no optional start |
|---|
| 1343 | { 'M', "ATR", "1,2,3,4,8,10,14" }, // R = AG (code=3 -> ATA->IM ATG->M) |
|---|
| 1344 | { 'M', "ATM", "1,3,4,8" }, // M = AC |
|---|
| 1345 | { 'M', "ATS", "1,3,4,8" }, // S = CG |
|---|
| 1346 | { 'M', "ATY", "1,3,4,8" }, // Y = TC |
|---|
| 1347 | { 'M', "ATK", "1,3,4,8,16" }, // K = TG |
|---|
| 1348 | { 'M', "ATW", "1,3,4,8" }, // W = AT |
|---|
| 1349 | { 'M', "ATV", "1,3,4,8" }, // V = ACG |
|---|
| 1350 | { 'M', "ATB", "1,3,4,8" }, // B = TCG |
|---|
| 1351 | { 'M', "ATD", "1,3,4,8" }, // D = ATG |
|---|
| 1352 | |
|---|
| 1353 | { 'M', "ATH", "1,3,4,8" }, // H = ACT |
|---|
| 1354 | { 'I', "ATH", "0,3,5,6,7,8,9,11,12,13,15,16,17,18,19,20,21,22,23,24" }, |
|---|
| 1355 | { 'X', "ATH", "2,10,14" }, |
|---|
| 1356 | |
|---|
| 1357 | { 'M', "ATN", "1,3,4,8" }, // N = ATCG |
|---|
| 1358 | { 'M', "GTG", "1,3,4,6,8,10,14,16,17,18" }, |
|---|
| 1359 | |
|---|
| 1360 | // test all stop codons: |
|---|
| 1361 | { '*', "AGA", "1" }, // (DUPTEST) |
|---|
| 1362 | { '*', "AGG", "1" }, |
|---|
| 1363 | { '*', "TAA", "0,1,2,3,4,6,7,8,9,10,12,13,14,15,16,17,18,19,21,24" },//(DUPTEST) |
|---|
| 1364 | { '*', "TAG", "0,1,2,3,4,6,7,8,9,10,11,14,16,17,18,19,21,24" }, // (DUPTEST) |
|---|
| 1365 | { '*', "TCA", "15" }, |
|---|
| 1366 | { '*', "TGA", "0,5,8,9,12,13,15,16,19,20,21,22,23" }, // (DUPTEST) |
|---|
| 1367 | { '*', "TTA", "16" }, |
|---|
| 1368 | |
|---|
| 1369 | { '*', "TWA", "16" }, // W = AT |
|---|
| 1370 | { '*', "TMA", "15" }, // M = AC |
|---|
| 1371 | { '*', "TAR", "0,1,2,3,4,6,7,8,9,10,14,16,17,18,19,21,24" }, // R = AG (DUPTEST) |
|---|
| 1372 | { '*', "TRA", "0,8,9,12,13,15,16,19,21" }, // R = AG (DUPTEST) |
|---|
| 1373 | { '*', "AGR", "1" }, // R = AG (DUPTEST) |
|---|
| 1374 | |
|---|
| 1375 | { 0, NULp, NULp} |
|---|
| 1376 | }; |
|---|
| 1377 | |
|---|
| 1378 | for (int c = 0; is_codon[c].protein; ++c) { |
|---|
| 1379 | const test_is_codon& C = is_codon[c]; |
|---|
| 1380 | TEST_ANNOTATE(GBS_global_string("%c <- %s", C.protein, C.codon)); |
|---|
| 1381 | |
|---|
| 1382 | TransTables remaining; |
|---|
| 1383 | const char *failure; |
|---|
| 1384 | bool isCodon = AWT_is_codon(C.protein, C.codon, allowed, remaining, &failure); |
|---|
| 1385 | |
|---|
| 1386 | TEST_EXPECT_NULL(failure); |
|---|
| 1387 | TEST_EXPECT(isCodon); |
|---|
| 1388 | TEST_EXPECT_EQUAL(remaining.to_string(TTIT_ARB), C.tables); |
|---|
| 1389 | } |
|---|
| 1390 | |
|---|
| 1391 | // ----------------------------- |
|---|
| 1392 | // test invalid codons |
|---|
| 1393 | struct test_not_codon { /* one negative test case: AWT_is_codon() is expected to reject this protein/codon pair */ |
|---|
| 1394 | char protein; /* amino acid character passed to AWT_is_codon(); 0 marks the end of the table */ |
|---|
| 1395 | const char *codon; /* DNA string passed to AWT_is_codon() as the codon */ |
|---|
| 1396 | const char *error; /* failure message AWT_is_codon() is expected to report */ |
|---|
| 1397 | }; |
|---|
| 1398 | test_not_codon not_codon[] = { |
|---|
| 1399 | { 'P', "SYK", "Not all IUPAC-combinations of 'SYK' translate to 'P'" }, // correct (possible translations are PAL) |
|---|
| 1400 | { 'F', "SYK", "'SYK' never translates to 'F'" }, // correct failure |
|---|
| 1401 | { 'P', "NNN", "Not all IUPAC-combinations of 'NNN' translate to 'P'" }, // correct failure |
|---|
| 1402 | { 'D', "RAY", "Not all IUPAC-combinations of 'RAY' translate to 'D'" }, // correct failure |
|---|
| 1403 | { 'E', "SAR", "Not all IUPAC-combinations of 'SAR' translate to 'E'" }, // correct failure |
|---|
| 1404 | { 'Q', "KAR", "Not all IUPAC-combinations of 'KAR' translate to 'Q'" }, // correct failure |
|---|
| 1405 | |
|---|
| 1406 | { 'S', "CYT", "'CYT' never translates to 'S'" }, // correct failure |
|---|
| 1407 | |
|---|
| 1408 | { 'O', "RAY", "'O' is no valid amino acid" }, |
|---|
| 1409 | { 'U', "AAA", "'U' is no valid amino acid" }, |
|---|
| 1410 | |
|---|
| 1411 | { 'L', "A-C", "Not enough nucleotides (got 'A-C')" }, // correct failure |
|---|
| 1412 | { 'V', ".T.", "Not enough nucleotides (got '.T.')" }, // correct failure |
|---|
| 1413 | { 'L', "...", "No nucleotides left" }, |
|---|
| 1414 | { 'J', "...", "No nucleotides left" }, |
|---|
| 1415 | |
|---|
| 1416 | { 'I', "ATR", "Not all IUPAC-combinations of 'ATR' translate to 'I'" }, // R = AG // ok: 'ATG' translates to 'M', not to 'I' |
|---|
| 1417 | |
|---|
| 1418 | { '*', "TYA", "Not all IUPAC-combinations of 'TYA' translate to '*'" }, // Y = TC; TCA(code=15) TTA(code=16) -> no code for both |
|---|
| 1419 | { '*', "TRR", "Not all IUPAC-combinations of 'TRR' translate to '*'" }, // R = AG (TGG never translates to '*') |
|---|
| 1420 | { '*', "WGA", "Not all IUPAC-combinations of 'WGA' translate to '*'" }, // W = AT; AGA(1) TGA(other) -> no common codes |
|---|
| 1421 | { '*', "THA", "Not all IUPAC-combinations of 'THA' translate to '*'" }, // H = ACT; TAA(many) TCA(15) TTA(16) -> no code overlap between TCA and TTA |
|---|
| 1422 | |
|---|
| 1423 | { 'X', "...", "No nucleotides left" }, |
|---|
| 1424 | { 'X', "..", "No nucleotides left" }, |
|---|
| 1425 | { 'X', "-", "No nucleotides left" }, |
|---|
| 1426 | { 'X', "", "No nucleotides left" }, |
|---|
| 1427 | |
|---|
| 1428 | // test invalid chars |
|---|
| 1429 | { 'X', "AZA", "Invalid character 'Z' in DNA" }, |
|---|
| 1430 | { 'X', "A@A", "Invalid character '@' in DNA" }, |
|---|
| 1431 | { 'L', "AZA", "Invalid character 'Z' in DNA" }, |
|---|
| 1432 | |
|---|
| 1433 | // tests to protect against buffer overflows in dna |
|---|
| 1434 | |
|---|
| 1435 | { 'A', "--", "No nucleotides left" }, |
|---|
| 1436 | { 'L', ".", "No nucleotides left" }, |
|---|
| 1437 | { 'J', ".", "No nucleotides left" }, |
|---|
| 1438 | { 'L', "AT", "Not enough nucleotides (got 'AT')" }, |
|---|
| 1439 | { 'L', "C", "Not enough nucleotides (got 'C')" }, |
|---|
| 1440 | { 'L', "", "No nucleotides left" }, |
|---|
| 1441 | |
|---|
| 1442 | { 0, NULp, NULp} |
|---|
| 1443 | }; |
|---|
| 1444 | for (int c = 0; not_codon[c].protein; ++c) { |
|---|
| 1445 | const test_not_codon& C = not_codon[c]; |
|---|
| 1446 | TEST_ANNOTATE(GBS_global_string("%c <- %s", C.protein, C.codon)); |
|---|
| 1447 | |
|---|
| 1448 | TransTables remaining; |
|---|
| 1449 | const char *failure; |
|---|
| 1450 | bool isCodon = AWT_is_codon(C.protein, C.codon, allowed, remaining, &failure); |
|---|
| 1451 | |
|---|
| 1452 | if (isCodon) { // the test-case makes no sense in 'not_codon' |
|---|
| 1453 | TEST_EXPECT_EQUAL(remaining.to_string(TTIT_ARB), ""); // -> move the failing test-case up into 'is_codon'-section |
|---|
| 1454 | } |
|---|
| 1455 | else { |
|---|
| 1456 | TEST_EXPECT_EQUAL(failure, C.error); |
|---|
| 1457 | } |
|---|
| 1458 | TEST_EXPECT(!isCodon); |
|---|
| 1459 | } |
|---|
| 1460 | |
|---|
| 1461 | // ---------------------------------- |
|---|
| 1462 | // test uncombinable codons |
|---|
| 1463 | struct test_uncombinable_codons { /* one test case: an accepted translation followed by a second one that has to fail on the tables left over by the first */ |
|---|
| 1464 | char protein1; /* amino acid of the first AWT_is_codon() call (expected to succeed); 0 marks the end of the table */ |
|---|
| 1465 | const char *codon1; /* DNA string of the first call */ |
|---|
| 1466 | const char *tables; /* expected remaining tables after the first call, compared against to_string(TTIT_ARB) */ |
|---|
| 1467 | char protein2; /* amino acid of the second call, which only gets the tables remaining from the first call */ |
|---|
| 1468 | const char *codon2; /* DNA string of the second call */ |
|---|
| 1469 | const char *error; /* failure message expected from the second (rejected) call */ |
|---|
| 1470 | }; |
|---|
| 1471 | test_uncombinable_codons uncomb_codons[] = { |
|---|
| 1472 | { '*', "TTA", "16", 'E', "SAR", "Not all IUPAC-combinations of 'SAR' translate to 'E' (for trans-table 23)" }, |
|---|
| 1473 | { '*', "TTA", "16", 'X', "TRA", "'TRA' never translates to 'X' (for trans-table 23)" }, |
|---|
| 1474 | { 'L', "TAG", "13,15", 'X', "TRA", "'TRA' never translates to 'X' (for any of the leftover trans-tables: 16,22)" }, |
|---|
| 1475 | { 'L', "TAG", "13,15", 'Q', "TAR", "'TAR' never translates to 'Q' (for any of the leftover trans-tables: 16,22)" }, |
|---|
| 1476 | { '*', "TTA", "16", '*', "TCA", "'TCA' does not translate to '*' (for trans-table 23)" }, |
|---|
| 1477 | { 'N', "AAA", "6,11,14", 'X', "AAW", "'AAW' never translates to 'X' (for any of the leftover trans-tables: 9,14,21)" }, |
|---|
| 1478 | { 'N', "AAA", "6,11,14", 'K', "AAA", "'AAA' does not translate to 'K' (for any of the leftover trans-tables: 9,14,21)" }, |
|---|
| 1479 | |
|---|
| 1480 | { 0, NULp, NULp, 0, NULp, NULp} |
|---|
| 1481 | }; |
|---|
| 1482 | |
|---|
| 1483 | for (int c = 0; uncomb_codons[c].protein1; ++c) { |
|---|
| 1484 | const test_uncombinable_codons& C = uncomb_codons[c]; |
|---|
| 1485 | TEST_ANNOTATE(GBS_global_string("%c <- %s + %c <- %s", C.protein1, C.codon1, C.protein2, C.codon2)); |
|---|
| 1486 | |
|---|
| 1487 | TransTables remaining1; |
|---|
| 1488 | const char *failure; |
|---|
| 1489 | bool isCodon = AWT_is_codon(C.protein1, C.codon1, allowed, remaining1, &failure); |
|---|
| 1490 | |
|---|
| 1491 | TEST_EXPECT(isCodon); |
|---|
| 1492 | TEST_EXPECT_EQUAL(remaining1.to_string(TTIT_ARB), C.tables); |
|---|
| 1493 | |
|---|
| 1494 | // @@@ add separate test: show that AWT_is_codon returns true for protein2/codon2 if not called with remaining1 |
|---|
| 1495 | |
|---|
| 1496 | TransTables remaining2; |
|---|
| 1497 | isCodon = AWT_is_codon(C.protein2, C.codon2, remaining1, remaining2, &failure); |
|---|
| 1498 | TEST_EXPECT_EQUAL(failure, C.error); |
|---|
| 1499 | TEST_REJECT(isCodon); |
|---|
| 1500 | |
|---|
| 1501 | } |
|---|
| 1502 | } |
|---|
| 1503 | |
|---|
| 1504 | #endif // UNIT_TESTS |
|---|
| 1505 | |
|---|
| 1506 | // -------------------------------------------------------------------------------- |
|---|