| 1 | // =============================================================== // |
|---|
| 2 | // // |
|---|
| 3 | // File : AWT_translate.cxx // |
|---|
| 4 | // Purpose : // |
|---|
| 5 | // // |
|---|
| 6 | // Coded by Ralf Westram (coder@reallysoft.de) in June 2006 // |
|---|
| 7 | // Institute of Microbiology (Technical University Munich) // |
|---|
| 8 | // http://www.arb-home.de/ // |
|---|
| 9 | // // |
|---|
| 10 | // =============================================================== // |
|---|
| 11 | |
|---|
| 12 | #include "Translate.hxx" |
|---|
| 13 | |
|---|
| 14 | #include <AP_pro_a_nucs.hxx> |
|---|
| 15 | #include <AP_codon_table.hxx> |
|---|
| 16 | #include <arbdbt.h> |
|---|
| 17 | |
|---|
| 18 | #define tl_assert(cond) arb_assert(cond) |
|---|
| 19 | |
|---|
| 20 | GB_ERROR AWT_saveTranslationInfo(GBDATA *gb_species, int arb_transl_table, int codon_start) { |
|---|
| 21 | int embl_transl_table = AWT_arb_code_nr_2_embl_transl_table(arb_transl_table); |
|---|
| 22 | |
|---|
| 23 | tl_assert(codon_start >= 0 && codon_start<3); // codon_start has to be 0..2 |
|---|
| 24 | tl_assert(embl_transl_table >= 0); |
|---|
| 25 | |
|---|
| 26 | GB_ERROR error = GBT_write_string(gb_species, "transl_table", GBS_global_string("%i", embl_transl_table)); |
|---|
| 27 | if (!error) error = GBT_write_string(gb_species, "codon_start", GBS_global_string("%i", codon_start+1)); |
|---|
| 28 | |
|---|
| 29 | return error; |
|---|
| 30 | } |
|---|
| 31 | |
|---|
| 32 | GB_ERROR AWT_removeTranslationInfo(GBDATA *gb_species) { |
|---|
| 33 | GB_ERROR error = NULL; |
|---|
| 34 | |
|---|
| 35 | GBDATA *gb_transl_table = GB_entry(gb_species, "transl_table"); |
|---|
| 36 | if (gb_transl_table) error = GB_delete(gb_transl_table); |
|---|
| 37 | |
|---|
| 38 | if (!error) { |
|---|
| 39 | GBDATA *gb_codon_start = GB_entry(gb_species, "codon_start"); |
|---|
| 40 | if (gb_codon_start) error = GB_delete(gb_codon_start); |
|---|
| 41 | } |
|---|
| 42 | |
|---|
| 43 | return error; |
|---|
| 44 | } |
|---|
| 45 | |
|---|
| 46 | GB_ERROR AWT_getTranslationInfo(GBDATA *gb_item, int& arb_transl_table, int& codon_start) { |
|---|
| 47 | // looks for sub-entries 'transl_table' and 'codon_start' of species (works for genes as well) |
|---|
| 48 | // if found -> test for validity and translate 'transl_table' from EMBL to ARB table number |
|---|
| 49 | // |
|---|
| 50 | // returns: an error in case of problems |
|---|
| 51 | // |
|---|
| 52 | // 'arb_transl_table' is set to -1 if not found, otherwise it contains the arb table number |
|---|
| 53 | // 'codon_start' is set to -1 if not found, otherwise it contains the codon_start (0..2) |
|---|
| 54 | |
|---|
| 55 | arb_transl_table = -1; // not found yet |
|---|
| 56 | codon_start = -1; // not found yet |
|---|
| 57 | |
|---|
| 58 | GB_ERROR error = 0; |
|---|
| 59 | GBDATA *gb_transl_table = GB_entry(gb_item, "transl_table"); |
|---|
| 60 | |
|---|
| 61 | if (gb_transl_table) { |
|---|
| 62 | int embl_table = atoi(GB_read_char_pntr(gb_transl_table)); |
|---|
| 63 | arb_transl_table = AWT_embl_transl_table_2_arb_code_nr(embl_table); |
|---|
| 64 | if (arb_transl_table == -1) { // ill. table |
|---|
| 65 | error = GBS_global_string("Illegal (or unsupported) value (%i) in 'transl_table'", embl_table); |
|---|
| 66 | } |
|---|
| 67 | } |
|---|
| 68 | |
|---|
| 69 | if (!error) { |
|---|
| 70 | GBDATA *gb_codon_start = GB_entry(gb_item, "codon_start"); |
|---|
| 71 | if (gb_codon_start) { |
|---|
| 72 | int codon_start_value = atoi(GB_read_char_pntr(gb_codon_start)); |
|---|
| 73 | |
|---|
| 74 | if (codon_start_value<1 || codon_start_value>3) { |
|---|
| 75 | error = GBS_global_string("Illegal value (%i) in 'codon_start' (allowed: 1..3)", codon_start_value); |
|---|
| 76 | } |
|---|
| 77 | else { |
|---|
| 78 | codon_start = codon_start_value-1; // internal value is 0..2 |
|---|
| 79 | } |
|---|
| 80 | } |
|---|
| 81 | else if (arb_transl_table != -1) { |
|---|
| 82 | // default to codon_start 1 |
|---|
| 83 | error = GBT_write_string(gb_item, "codon_start", "1"); |
|---|
| 84 | if (!error) codon_start = 0; // internal value is 0..2 |
|---|
| 85 | } |
|---|
| 86 | } |
|---|
| 87 | |
|---|
| 88 | if (!error && arb_transl_table != codon_start) { |
|---|
| 89 | if (arb_transl_table == -1) error = "Found 'codon_start', but 'transl_table' is missing"; |
|---|
| 90 | else if (codon_start == -1) error = "Found 'transl_table', but 'codon_start' is missing"; |
|---|
| 91 | } |
|---|
| 92 | |
|---|
| 93 | if (error) { // append species name to error message |
|---|
| 94 | error = GBS_global_string("%s (item='%s')", error, GBT_read_name(gb_item)); |
|---|
| 95 | } |
|---|
| 96 | |
|---|
| 97 | return error; |
|---|
| 98 | } |
|---|
| 99 | |
|---|
inline void memcpy3(char *dest, const char *source) {
    // copies exactly three chars (one codon) from 'source' to 'dest'; no terminator is written
    for (int i = 0; i<3; ++i) {
        dest[i] = source[i];
    }
}
|---|
| 105 | |
|---|
| 106 | int AWT_pro_a_nucs_convert(int arb_code_nr, char *data, size_t size, size_t pos, bool translate_all, bool create_start_codon, bool append_stop_codon, int *translatedSize) { |
|---|
| 107 | // if translate_all == true -> 'pos' > 1 produces a leading 'X' in protein data |
|---|
| 108 | // (otherwise nucleotides in front of the starting pos are simply ignored) |
|---|
| 109 | // |
|---|
| 110 | // if 'create_start_codon' is true and the first generated codon is a start codon of the used |
|---|
| 111 | // code, a 'M' is inserted instead of the codon |
|---|
| 112 | // if 'append_stop_codon' is true, the stop codon is appended as '*'. This is only done, if the last |
|---|
| 113 | // character not already is a stop codon. (Note: provide data with correct size) |
|---|
| 114 | // |
|---|
| 115 | // returns: |
|---|
| 116 | // - the translated protein sequence in 'data' |
|---|
| 117 | // - the length of the translated protein sequence in 'translatedSize' (if != 0) |
|---|
| 118 | // - number of stop-codons in translated sequence as result |
|---|
| 119 | |
|---|
| 120 | arb_assert(pos <= 2); |
|---|
| 121 | |
|---|
| 122 | for (char *p = data; *p; p++) { |
|---|
| 123 | char c = *p; |
|---|
| 124 | if ((c>='a') && (c<='z')) c = c+'A'-'a'; |
|---|
| 125 | if (c=='U') c = 'T'; |
|---|
| 126 | *p = c; |
|---|
| 127 | } |
|---|
| 128 | |
|---|
| 129 | char buffer[4]; |
|---|
| 130 | buffer[3] = 0; |
|---|
| 131 | |
|---|
| 132 | char *dest = data; |
|---|
| 133 | |
|---|
| 134 | if (pos && translate_all) { |
|---|
| 135 | for (char *p = data; p<data+pos; ++p) { |
|---|
| 136 | char c = *p; |
|---|
| 137 | if (c!='.' && c!='-') { // found a nucleotide |
|---|
| 138 | *dest++ = 'X'; |
|---|
| 139 | break; |
|---|
| 140 | } |
|---|
| 141 | } |
|---|
| 142 | } |
|---|
| 143 | |
|---|
| 144 | int stops = 0; |
|---|
| 145 | size_t i = pos; |
|---|
| 146 | char startCodon = 0; |
|---|
| 147 | const GB_HASH *t2i_hash = AWT_get_translator(arb_code_nr)->T2iHash(); |
|---|
| 148 | |
|---|
| 149 | if (create_start_codon) { |
|---|
| 150 | memcpy3(buffer, data+pos); |
|---|
| 151 | startCodon = AWT_is_start_codon(buffer, arb_code_nr); |
|---|
| 152 | } |
|---|
| 153 | |
|---|
| 154 | for (char *p = data+pos; i+2<size; p+=3, i+=3) { |
|---|
| 155 | memcpy3(buffer, p); |
|---|
| 156 | int spro = (int)GBS_read_hash(t2i_hash, buffer); |
|---|
| 157 | int C; |
|---|
| 158 | if (!spro) { |
|---|
| 159 | C = 'X'; |
|---|
| 160 | } |
|---|
| 161 | else { |
|---|
| 162 | if (spro == '*') stops++; |
|---|
| 163 | C = spro; |
|---|
| 164 | if (spro == 's') C = 'S'; |
|---|
| 165 | } |
|---|
| 166 | *(dest++) = (char)C; |
|---|
| 167 | } |
|---|
| 168 | |
|---|
| 169 | int tsize = dest-data; |
|---|
| 170 | |
|---|
| 171 | if (tsize>0) { // at least 1 amino written |
|---|
| 172 | if (create_start_codon && startCodon) data[0] = startCodon; |
|---|
| 173 | if (append_stop_codon && dest[-1] != '*') { |
|---|
| 174 | *dest++ = '*'; |
|---|
| 175 | tsize++; |
|---|
| 176 | } |
|---|
| 177 | } |
|---|
| 178 | dest[0] = 0; |
|---|
| 179 | |
|---|
| 180 | if (translatedSize) *translatedSize = tsize; |
|---|
| 181 | |
|---|
| 182 | return stops; |
|---|
| 183 | } |
|---|
| 184 | |
|---|
| 185 | |
|---|
| 186 | |
|---|
| 187 | |
|---|
| 188 | |
|---|