1 | // =============================================================== // |
---|
2 | // // |
---|
3 | // File : GEN_translations.cxx // |
---|
4 | // Purpose : supports removal of redundant translations of // |
---|
5 | // gene CDS // |
---|
6 | // // |
---|
7 | // Coded by Ralf Westram (coder@reallysoft.de) in January 2009 // |
---|
8 | // Institute of Microbiology (Technical University Munich) // |
---|
9 | // http://www.arb-home.de/ // |
---|
10 | // // |
---|
11 | // =============================================================== // |
---|
12 | |
---|
13 | #include <arbdbt.h> |
---|
14 | #include <awt_translate.hxx> |
---|
15 | #include <awt_codon_table.hxx> |
---|
16 | #include <aw_question.hxx> |
---|
17 | |
---|
18 | #include "GEN.hxx" |
---|
19 | #include "GEN_local.hxx" |
---|
20 | |
---|
21 | using namespace std; |
---|
22 | |
---|
23 | // ------------------------------------------------- |
---|
24 | // remove redundant translations from genes |
---|
25 | |
---|
26 | #if defined(DEVEL_RALF) |
---|
27 | # warning add menu-entry to genome-NTREE ("Remove reproduceable translations") |
---|
28 | #endif // DEVEL_RALF |
---|
29 | |
---|
30 | static char *translate_gene_sequence(GBDATA *gb_gene, GB_ERROR& error, int& translated_length, char *startCodon) { |
---|
31 | // return translation of gene sequence |
---|
32 | // the start codon is copied into result buffer 'startCodon' (has to be sized 4 bytes) |
---|
33 | |
---|
34 | size_t gene_length; |
---|
35 | char *gene_seq = GBT_read_gene_sequence_and_length(gb_gene, GB_TRUE, 0, &gene_length); |
---|
36 | if (!gene_seq) error = GB_await_error(); |
---|
37 | else { |
---|
38 | // store start codon in result buffer: |
---|
39 | memcpy(startCodon, gene_seq, 3); |
---|
40 | startCodon[3] = 0; |
---|
41 | |
---|
42 | int arb_transl_table, codon_start; |
---|
43 | error = AWT_getTranslationInfo(gb_gene, arb_transl_table, codon_start); |
---|
44 | |
---|
45 | if (arb_transl_table == -1) arb_transl_table = AWT_embl_transl_table_2_arb_code_nr(1); // use embl table 1 (standard code) |
---|
46 | if (codon_start == -1) codon_start = 0; // default codon start |
---|
47 | |
---|
48 | if (!error) AWT_pro_a_nucs_convert(arb_transl_table, gene_seq, gene_length, codon_start, false, true, true, &translated_length); |
---|
49 | |
---|
50 | if (error) { |
---|
51 | free(gene_seq); |
---|
52 | gene_seq = 0; |
---|
53 | } |
---|
54 | } |
---|
55 | |
---|
56 | return gene_seq; |
---|
57 | } |
---|
58 | |
---|
enum GEN_remove_state {
    // Outcome of remove_redundant_translation().
    // GRS_NO_CHANGE and GRS_FAILED are used as plain values,
    // the remaining enumerators are single bits which get OR-ed into the result.

    GRS_NO_CHANGE           = 0,      // gene has no 'translation' entry
    GRS_FAILED              = 1 << 0, // an error occurred (error is set)
    GRS_TRANSLATION_REMOVED = 1 << 1, // translation existed, was reproducible and has been removed
    GRS_TRANSLATION_FAILED  = 1 << 2, // translation differed (ARB translation was written to field 'ARB_translation')
    GRS_START_CODON_WRONG   = 1 << 3, // translation differed only in start codon
    GRS_NOTE_ADDED          = 1 << 4, // a note has been added
};
---|
67 | |
---|
68 | static GEN_remove_state remove_redundant_translation(GBDATA *gb_gene, bool ignore_start_codon_error, char *errornousCodon, GB_ERROR &error) { |
---|
69 | // If translation can be re-produced by ARB, |
---|
70 | // it will be removed |
---|
71 | // ('ARB_translation' will be removed as well in this case) |
---|
72 | // Otherwise |
---|
73 | // a field 'ARB_translation' is inserted, which contains the translation generated by ARB. |
---|
74 | // |
---|
75 | // If result is GRS_START_CODON_WRONG, the questionable codon is copied into errornousCodon. |
---|
76 | // (errornousCodon has to be a buffer with size == 4) |
---|
77 | // |
---|
78 | // If another code or codonstart translates fine, a hint shall be written to field 'translation_hint' |
---|
79 | #if defined(DEVEL_RALF) |
---|
80 | #warning TODO: If another code or codonstart translates fine, a hint shall be written to field 'translation_hint' |
---|
81 | #endif // DEVEL_RALF |
---|
82 | |
---|
83 | GEN_remove_state result = GRS_NO_CHANGE; |
---|
84 | error = 0; |
---|
85 | char *add_note = 0; // will be added as 'ARB_translation_note' (if set) |
---|
86 | |
---|
87 | const char *to_remove[] = { |
---|
88 | "translation", |
---|
89 | "ARB_translation", |
---|
90 | "ARB_translation_note", |
---|
91 | 0 |
---|
92 | }; |
---|
93 | |
---|
94 | #define set_result_bit(s) result = GEN_remove_state(result|s) |
---|
95 | |
---|
96 | GBDATA *gb_translation = GB_entry(gb_gene, "translation"); |
---|
97 | if (gb_translation) { |
---|
98 | int translated_length; |
---|
99 | char *generated = translate_gene_sequence(gb_gene, error, translated_length, errornousCodon); |
---|
100 | |
---|
101 | if (!generated || translated_length<1) { |
---|
102 | // insert note and continue |
---|
103 | add_note = GBS_global_string_copy("Failed to translate gene-sequence (%s)", error); |
---|
104 | error = 0; |
---|
105 | set_result_bit(GRS_TRANSLATION_FAILED); |
---|
106 | } |
---|
107 | else { |
---|
108 | if (generated[translated_length-1] == '*') { |
---|
109 | generated[--translated_length] = 0; // cut off stop codon |
---|
110 | } |
---|
111 | |
---|
112 | const char *original = GB_read_char_pntr(gb_translation); |
---|
113 | |
---|
114 | bool remove = false; |
---|
115 | if (strcmp(generated+1, original+1) == 0) { // most of translation matches |
---|
116 | if (generated[0] == original[0]) { // start codon matches |
---|
117 | remove = true; |
---|
118 | } |
---|
119 | else { // start codon differs |
---|
120 | set_result_bit(GRS_START_CODON_WRONG); // report |
---|
121 | remove = ignore_start_codon_error; // and delete if requested |
---|
122 | } |
---|
123 | } |
---|
124 | |
---|
125 | if (remove) { // remove translation and related entries |
---|
126 | GB_ERROR err = 0; |
---|
127 | int failed_field = -1; |
---|
128 | for (int r = 0; to_remove[r] && !err; ++r) { |
---|
129 | GBDATA *gb_remove = GB_entry(gb_gene, to_remove[r]); |
---|
130 | if (gb_remove) { |
---|
131 | err = GB_delete(gb_remove); |
---|
132 | if (err) failed_field = r; |
---|
133 | } |
---|
134 | } |
---|
135 | if (err) error = GBS_global_string("Failed to delete field '%s' (%s)", to_remove[failed_field], err); |
---|
136 | else { |
---|
137 | error = GBT_write_byte(gb_gene, "ARB_translation_rm", 1); |
---|
138 | if (!error) set_result_bit(GRS_TRANSLATION_REMOVED); |
---|
139 | } |
---|
140 | } |
---|
141 | else { |
---|
142 | error = GBT_write_string(gb_gene, "ARB_translation", generated); |
---|
143 | if (!error) set_result_bit(GRS_TRANSLATION_FAILED); |
---|
144 | } |
---|
145 | } |
---|
146 | free(generated); |
---|
147 | } |
---|
148 | |
---|
149 | if (add_note && !error) { |
---|
150 | error = GBT_write_string(gb_gene, "ARB_translation_note", add_note); |
---|
151 | set_result_bit(GRS_NOTE_ADDED); |
---|
152 | } |
---|
153 | |
---|
154 | if (error) result = GRS_FAILED; |
---|
155 | free(add_note); |
---|
156 | |
---|
157 | return result; |
---|
158 | |
---|
159 | #undef set_result_bit |
---|
160 | } |
---|
161 | |
---|
162 | GB_ERROR GEN_testAndRemoveTranslations(GBDATA *gb_gene_data, void (*warn)(AW_CL cd, const char *msg), AW_CL cd, AW_repeated_question *ok_to_ignore_wrong_start_codon) { |
---|
163 | int ok = 0; // identical translations |
---|
164 | int failed = 0; // non-identical translations |
---|
165 | int wrong_start_codon = 0; // translations where start_codon differed |
---|
166 | int no_entry = 0; // genes w/o 'translation' entry |
---|
167 | int note_added = 0; // count gene for which a note has been added |
---|
168 | GB_ERROR error = 0; |
---|
169 | |
---|
170 | GB_HASH *wrongStartCodons = GBS_create_hash(50, GB_IGNORE_CASE); |
---|
171 | |
---|
172 | for (GBDATA *gb_gene = GB_entry(gb_gene_data, "gene"); gb_gene && !error; gb_gene = GB_nextEntry(gb_gene)) { |
---|
173 | int retry = 0; |
---|
174 | for (int Try = 0; Try <= retry && !error; Try++) { |
---|
175 | error = 0; |
---|
176 | |
---|
177 | char startCodon[4]; |
---|
178 | GEN_remove_state state = remove_redundant_translation(gb_gene, Try, startCodon, error); |
---|
179 | |
---|
180 | switch (state) { |
---|
181 | case GRS_NO_CHANGE: |
---|
182 | no_entry++; |
---|
183 | break; |
---|
184 | |
---|
185 | case GRS_FAILED: |
---|
186 | gen_assert(error); |
---|
187 | break; |
---|
188 | |
---|
189 | default: |
---|
190 | if (state&GRS_TRANSLATION_REMOVED) { |
---|
191 | ok++; |
---|
192 | } |
---|
193 | else { |
---|
194 | gen_assert(state&GRS_TRANSLATION_FAILED); |
---|
195 | if (Try == 0) { |
---|
196 | if (state&GRS_START_CODON_WRONG) { |
---|
197 | wrong_start_codon++; |
---|
198 | AW_repeated_question* q = ok_to_ignore_wrong_start_codon; |
---|
199 | |
---|
200 | if (q->get_answer("Translation differs only in start codon", |
---|
201 | "Ignore and remove,Keep translation", "all", false) == 0) { |
---|
202 | retry++; |
---|
203 | } |
---|
204 | else { |
---|
205 | failed++; |
---|
206 | } |
---|
207 | |
---|
208 | GBS_incr_hash(wrongStartCodons, startCodon); |
---|
209 | } |
---|
210 | else if (state&GRS_NOTE_ADDED) { |
---|
211 | failed++; |
---|
212 | note_added++; |
---|
213 | } |
---|
214 | } |
---|
215 | else { |
---|
216 | failed++; |
---|
217 | } |
---|
218 | } |
---|
219 | break; |
---|
220 | } |
---|
221 | } |
---|
222 | } |
---|
223 | |
---|
224 | if (!error && failed>0) { |
---|
225 | warn(cd, GBS_global_string("%i translations could not be reproduced by ARB", failed)); |
---|
226 | static bool first_warning = true; |
---|
227 | if (first_warning) { // show details once |
---|
228 | warn(cd, |
---|
229 | "Note: Reproducible translations were removed from database.\n" |
---|
230 | " Failed translations were left in database and an additional\n" |
---|
231 | " field 'ARB_translation' was added."); |
---|
232 | warn(cd, GBS_global_string("- %i genes had no translation entry", no_entry)); |
---|
233 | warn(cd, GBS_global_string("- %i translations were reproducible", ok)); |
---|
234 | first_warning = false; |
---|
235 | } |
---|
236 | if (wrong_start_codon>0) { |
---|
237 | char *codonInfo = GBS_hashtab_2_string(wrongStartCodons); |
---|
238 | warn(cd, GBS_global_string("- %i translations had wrong start codon (%s)", wrong_start_codon, codonInfo)); |
---|
239 | free(codonInfo); |
---|
240 | } |
---|
241 | if (note_added>0) { |
---|
242 | warn(cd, GBS_global_string("- %i ARB_translation_note entries were generated. Please examine!", note_added)); |
---|
243 | } |
---|
244 | } |
---|
245 | |
---|
246 | GBS_free_hash(wrongStartCodons); |
---|
247 | |
---|
248 | return error; |
---|
249 | } |
---|
250 | |
---|
251 | |
---|
252 | |
---|