| 1 | // =============================================================== // |
|---|
| 2 | // // |
|---|
| 3 | // File : GEN_translations.cxx // |
|---|
| 4 | // Purpose : supports removal of redundant translations of // |
|---|
| 5 | // gene CDS // |
|---|
| 6 | // // |
|---|
| 7 | // Coded by Ralf Westram (coder@reallysoft.de) in January 2009 // |
|---|
| 8 | // Institute of Microbiology (Technical University Munich) // |
|---|
| 9 | // http://www.arb-home.de/ // |
|---|
| 10 | // // |
|---|
| 11 | // =============================================================== // |
|---|
| 12 | |
|---|
| 13 | #include "GEN_local.hxx" |
|---|
| 14 | |
|---|
| 15 | #include <Translate.hxx> |
|---|
| 16 | #include <AP_codon_table.hxx> |
|---|
| 17 | #include <aw_question.hxx> |
|---|
| 18 | #include <arbdbt.h> |
|---|
| 19 | |
|---|
| 20 | using namespace std; |
|---|
| 21 | |
|---|
| 22 | // ------------------------------------------------- |
|---|
| 23 | // remove redundant translations from genes |
|---|
| 24 | |
|---|
| 25 | // @@@ add menu-entry to genome-NTREE ("Remove reproducible translations") |
|---|
| 26 | |
|---|
| 27 | static char *translate_gene_sequence(GBDATA *gb_gene, GB_ERROR& error, int& translated_length, char *startCodon) { |
|---|
| 28 | // return translation of gene sequence |
|---|
| 29 | // the start codon is copied into result buffer 'startCodon' (has to be sized 4 bytes) |
|---|
| 30 | |
|---|
| 31 | size_t gene_length; |
|---|
| 32 | char *gene_seq = GBT_read_gene_sequence_and_length(gb_gene, true, 0, &gene_length); |
|---|
| 33 | if (!gene_seq) error = GB_await_error(); |
|---|
| 34 | else { |
|---|
| 35 | // store start codon in result buffer: |
|---|
| 36 | memcpy(startCodon, gene_seq, 3); |
|---|
| 37 | startCodon[3] = 0; |
|---|
| 38 | |
|---|
| 39 | int arb_transl_table, codon_start; |
|---|
| 40 | error = translate_getInfo(gb_gene, arb_transl_table, codon_start); |
|---|
| 41 | |
|---|
| 42 | if (arb_transl_table == -1) arb_transl_table = TTIT_embl2arb(1); // use embl table 1 (standard code) |
|---|
| 43 | if (codon_start == -1) codon_start = 0; // default codon start |
|---|
| 44 | |
|---|
| 45 | if (!error) translate_nuc2aa(arb_transl_table, gene_seq, gene_length, codon_start, false, true, true, &translated_length); |
|---|
| 46 | |
|---|
| 47 | if (error) { |
|---|
| 48 | free(gene_seq); |
|---|
| 49 | gene_seq = NULp; |
|---|
| 50 | } |
|---|
| 51 | } |
|---|
| 52 | |
|---|
| 53 | return gene_seq; |
|---|
| 54 | } |
|---|
| 55 | |
|---|
// Outcome of checking one gene's 'translation' entry.
// GRS_NO_CHANGE and GRS_FAILED are used as plain values,
// the remaining members are single bits which may be OR-ed together.
enum GEN_remove_state {
    GRS_NO_CHANGE           = 0,      // gene has no translation
    GRS_FAILED              = 1 << 0, // an error occurred (error is set)
    GRS_TRANSLATION_REMOVED = 1 << 1, // translation existed, was reproducible and got removed
    GRS_TRANSLATION_FAILED  = 1 << 2, // translation differed (ARB translation was written to field 'ARB_translation')
    GRS_START_CODON_WRONG   = 1 << 3, // the only difference was the start codon
    GRS_NOTE_ADDED          = 1 << 4, // a note has been added
};
|---|
| 64 | |
|---|
| 65 | static GEN_remove_state remove_redundant_translation(GBDATA *gb_gene, bool ignore_start_codon_error, char *errornousCodon, GB_ERROR &error) { |
|---|
| 66 | // If translation can be re-produced by ARB, |
|---|
| 67 | // it will be removed |
|---|
| 68 | // ('ARB_translation' will be removed as well in this case) |
|---|
| 69 | // Otherwise |
|---|
| 70 | // a field 'ARB_translation' is inserted, which contains the translation generated by ARB. |
|---|
| 71 | // |
|---|
| 72 | // If result is GRS_START_CODON_WRONG, the questionable codon is copied into errornousCodon. |
|---|
| 73 | // (errornousCodon has to be a buffer with size == 4) |
|---|
| 74 | |
|---|
| 75 | // @@@ If another code or codonstart translates fine, a hint shall be written to field 'translation_hint' |
|---|
| 76 | |
|---|
| 77 | GEN_remove_state result = GRS_NO_CHANGE; |
|---|
| 78 | char *add_note = NULp; // will be added as 'ARB_translation_note' (if set) |
|---|
| 79 | error = NULp; |
|---|
| 80 | |
|---|
| 81 | #define set_result_bit(s) result = GEN_remove_state(result|s) |
|---|
| 82 | |
|---|
| 83 | GBDATA *gb_translation = GB_entry(gb_gene, "translation"); |
|---|
| 84 | if (gb_translation) { |
|---|
| 85 | int translated_length; |
|---|
| 86 | char *generated = translate_gene_sequence(gb_gene, error, translated_length, errornousCodon); |
|---|
| 87 | |
|---|
| 88 | if (!generated || translated_length<1) { |
|---|
| 89 | // insert note and continue |
|---|
| 90 | add_note = GBS_global_string_copy("Failed to translate gene-sequence (%s)", error); |
|---|
| 91 | error = NULp; |
|---|
| 92 | set_result_bit(GRS_TRANSLATION_FAILED); |
|---|
| 93 | } |
|---|
| 94 | else { |
|---|
| 95 | if (generated[translated_length-1] == '*') { |
|---|
| 96 | generated[--translated_length] = 0; // cut off stop codon |
|---|
| 97 | } |
|---|
| 98 | |
|---|
| 99 | const char *original = GB_read_char_pntr(gb_translation); |
|---|
| 100 | |
|---|
| 101 | bool remove = false; |
|---|
| 102 | if (strcmp(generated+1, original+1) == 0) { // most of translation matches |
|---|
| 103 | if (generated[0] == original[0]) { // start codon matches |
|---|
| 104 | remove = true; |
|---|
| 105 | } |
|---|
| 106 | else { // start codon differs |
|---|
| 107 | set_result_bit(GRS_START_CODON_WRONG); // report |
|---|
| 108 | remove = ignore_start_codon_error; // and delete if requested |
|---|
| 109 | } |
|---|
| 110 | } |
|---|
| 111 | |
|---|
| 112 | if (remove) { // remove translation and related entries |
|---|
| 113 | const char *to_remove[] = { |
|---|
| 114 | "translation", |
|---|
| 115 | "ARB_translation", |
|---|
| 116 | "ARB_translation_note", |
|---|
| 117 | NULp |
|---|
| 118 | }; |
|---|
| 119 | |
|---|
| 120 | GB_ERROR err = NULp; |
|---|
| 121 | int failed_field = -1; |
|---|
| 122 | |
|---|
| 123 | for (int r = 0; to_remove[r] && !err; ++r) { |
|---|
| 124 | GBDATA *gb_remove = GB_entry(gb_gene, to_remove[r]); |
|---|
| 125 | if (gb_remove) { |
|---|
| 126 | err = GB_delete(gb_remove); |
|---|
| 127 | if (err) failed_field = r; |
|---|
| 128 | } |
|---|
| 129 | } |
|---|
| 130 | if (err) error = GBS_global_string("Failed to delete field '%s' (%s)", to_remove[failed_field], err); |
|---|
| 131 | else { |
|---|
| 132 | error = GBT_write_byte(gb_gene, "ARB_translation_rm", 1); |
|---|
| 133 | if (!error) set_result_bit(GRS_TRANSLATION_REMOVED); |
|---|
| 134 | } |
|---|
| 135 | } |
|---|
| 136 | else { |
|---|
| 137 | error = GBT_write_string(gb_gene, "ARB_translation", generated); |
|---|
| 138 | if (!error) set_result_bit(GRS_TRANSLATION_FAILED); |
|---|
| 139 | } |
|---|
| 140 | } |
|---|
| 141 | free(generated); |
|---|
| 142 | } |
|---|
| 143 | |
|---|
| 144 | if (add_note && !error) { |
|---|
| 145 | error = GBT_write_string(gb_gene, "ARB_translation_note", add_note); |
|---|
| 146 | set_result_bit(GRS_NOTE_ADDED); |
|---|
| 147 | } |
|---|
| 148 | |
|---|
| 149 | if (error) result = GRS_FAILED; |
|---|
| 150 | free(add_note); |
|---|
| 151 | |
|---|
| 152 | return result; |
|---|
| 153 | |
|---|
| 154 | #undef set_result_bit |
|---|
| 155 | } |
|---|
| 156 | |
|---|
| 157 | GB_ERROR GEN_testAndRemoveTranslations(GBDATA *gb_gene_data, void (*warn)(AW_CL cd, const char *msg), AW_CL cd, AW_repeated_question *ok_to_ignore_wrong_start_codon) { |
|---|
| 158 | int ok = 0; // identical translations |
|---|
| 159 | int failed = 0; // non-identical translations |
|---|
| 160 | int wrong_start_codon = 0; // translations where start_codon differed |
|---|
| 161 | int no_entry = 0; // genes w/o 'translation' entry |
|---|
| 162 | int note_added = 0; // count gene for which a note has been added |
|---|
| 163 | GB_ERROR error = NULp; |
|---|
| 164 | |
|---|
| 165 | const int possibleCodons = 4*4*4; |
|---|
| 166 | GB_HASH *wrongStartCodons = GBS_create_hash(possibleCodons, GB_IGNORE_CASE); |
|---|
| 167 | |
|---|
| 168 | for (GBDATA *gb_gene = GB_entry(gb_gene_data, "gene"); gb_gene && !error; gb_gene = GB_nextEntry(gb_gene)) { |
|---|
| 169 | int retry = 0; |
|---|
| 170 | for (int Try = 0; Try <= retry && !error; Try++) { |
|---|
| 171 | error = NULp; |
|---|
| 172 | |
|---|
| 173 | char startCodon[4]; |
|---|
| 174 | GEN_remove_state state = remove_redundant_translation(gb_gene, Try, startCodon, error); |
|---|
| 175 | |
|---|
| 176 | switch (state) { |
|---|
| 177 | case GRS_NO_CHANGE: |
|---|
| 178 | no_entry++; |
|---|
| 179 | break; |
|---|
| 180 | |
|---|
| 181 | case GRS_FAILED: |
|---|
| 182 | gen_assert(error); |
|---|
| 183 | break; |
|---|
| 184 | |
|---|
| 185 | default: |
|---|
| 186 | if (state&GRS_TRANSLATION_REMOVED) { |
|---|
| 187 | ok++; |
|---|
| 188 | } |
|---|
| 189 | else { |
|---|
| 190 | gen_assert(state&GRS_TRANSLATION_FAILED); |
|---|
| 191 | if (Try == 0) { |
|---|
| 192 | if (state&GRS_START_CODON_WRONG) { |
|---|
| 193 | wrong_start_codon++; |
|---|
| 194 | AW_repeated_question* q = ok_to_ignore_wrong_start_codon; |
|---|
| 195 | |
|---|
| 196 | if (q->get_answer("only_start_codon_differs", |
|---|
| 197 | "Translation differs only in start codon", |
|---|
| 198 | "Ignore and remove,Keep translation", "all", false) == 0) { |
|---|
| 199 | retry++; |
|---|
| 200 | } |
|---|
| 201 | else { |
|---|
| 202 | failed++; |
|---|
| 203 | } |
|---|
| 204 | |
|---|
| 205 | GBS_incr_hash(wrongStartCodons, startCodon); |
|---|
| 206 | } |
|---|
| 207 | else if (state&GRS_NOTE_ADDED) { |
|---|
| 208 | failed++; |
|---|
| 209 | note_added++; |
|---|
| 210 | } |
|---|
| 211 | } |
|---|
| 212 | else { |
|---|
| 213 | failed++; |
|---|
| 214 | } |
|---|
| 215 | } |
|---|
| 216 | break; |
|---|
| 217 | } |
|---|
| 218 | } |
|---|
| 219 | } |
|---|
| 220 | |
|---|
| 221 | if (!error && failed>0) { |
|---|
| 222 | warn(cd, GBS_global_string("%i translations could not be reproduced by ARB", failed)); |
|---|
| 223 | static bool first_warning = true; |
|---|
| 224 | if (first_warning) { // show details once |
|---|
| 225 | warn(cd, |
|---|
| 226 | "Note: Reproducible translations were removed from database.\n" |
|---|
| 227 | " Failed translations were left in database and an additional\n" |
|---|
| 228 | " field 'ARB_translation' was added."); |
|---|
| 229 | warn(cd, GBS_global_string("- %i genes had no translation entry", no_entry)); |
|---|
| 230 | warn(cd, GBS_global_string("- %i translations were reproducible", ok)); |
|---|
| 231 | first_warning = false; |
|---|
| 232 | } |
|---|
| 233 | if (wrong_start_codon>0) { |
|---|
| 234 | char *codonInfo = GBS_hashtab_2_string(wrongStartCodons); |
|---|
| 235 | warn(cd, GBS_global_string("- %i translations had wrong start codon (%s)", wrong_start_codon, codonInfo)); |
|---|
| 236 | free(codonInfo); |
|---|
| 237 | } |
|---|
| 238 | if (note_added>0) { |
|---|
| 239 | warn(cd, GBS_global_string("- %i ARB_translation_note entries were generated. Please examine!", note_added)); |
|---|
| 240 | } |
|---|
| 241 | } |
|---|
| 242 | |
|---|
| 243 | GBS_free_hash(wrongStartCodons); |
|---|
| 244 | |
|---|
| 245 | return error; |
|---|
| 246 | } |
|---|
| 247 | |
|---|
| 248 | |
|---|
| 249 | |
|---|