1 | // =============================================================== // |
---|
2 | // // |
---|
3 | // File : AP_pro_a_nucs.cxx // |
---|
4 | // Purpose : // |
---|
5 | // // |
---|
6 | // Institute of Microbiology (Technical University Munich) // |
---|
7 | // http://www.arb-home.de/ // |
---|
8 | // // |
---|
9 | // =============================================================== // |
---|
10 | |
---|
11 | #include "AP_pro_a_nucs.hxx" |
---|
12 | |
---|
13 | #include <AP_codon_table.hxx> |
---|
14 | #include <arbdbt.h> |
---|
15 | #include <ad_cb.h> |
---|
16 | #include <arb_str.h> |
---|
17 | #include <algorithm> |
---|
18 | |
---|
19 | #define pn_assert(cond) arb_assert(cond) |
---|
20 | |
---|
21 | char *AP_create_dna_to_ap_bases() { |
---|
22 | int i; |
---|
23 | AP_BASES val; |
---|
24 | char *table = new char[256]; |
---|
25 | |
---|
26 | for (i=0; i<256; i++) { |
---|
27 | switch ((char)i) { |
---|
28 | case 'a': case 'A': val = AP_A; break; |
---|
29 | case 'g': case 'G': val = AP_G; break; |
---|
30 | case 'c': case 'C': val = AP_C; break; |
---|
31 | case 't': case 'T': |
---|
32 | case 'u': case 'U': val = AP_T; break; |
---|
33 | case '-': val = AP_GAP; break; |
---|
34 | case 'm': case 'M': val = AP_BASES(AP_A + AP_C); break; |
---|
35 | case 'r': case 'R': val = AP_BASES(AP_A + AP_G); break; |
---|
36 | case 'w': case 'W': val = AP_BASES(AP_A + AP_T); break; |
---|
37 | case 's': case 'S': val = AP_BASES(AP_C + AP_G); break; |
---|
38 | case 'y': case 'Y': val = AP_BASES(AP_C + AP_T); break; |
---|
39 | case 'k': case 'K': val = AP_BASES(AP_G + AP_T); break; |
---|
40 | case 'v': case 'V': val = AP_BASES(AP_A + AP_C + AP_G); break; |
---|
41 | case 'h': case 'H': val = AP_BASES(AP_A + AP_C + AP_T); break; |
---|
42 | case 'd': case 'D': val = AP_BASES(AP_A + AP_G + AP_T); break; |
---|
43 | case 'b': case 'B': val = AP_BASES(AP_C + AP_G + AP_T); break; |
---|
44 | case 'n': case 'N': val = AP_BASES(AP_A + AP_G + AP_C + AP_T); break; |
---|
45 | case '?': case '.': val = AP_BASES(AP_A + AP_G + AP_C + AP_T + AP_GAP); break; // = AP_DOT |
---|
46 | default: val = AP_DOT; break; // interpret everything else like a dot (alternative would be to abort with error) |
---|
47 | } |
---|
48 | table[i] = (char)val; |
---|
49 | } |
---|
50 | |
---|
51 | pn_assert(table[safeCharIndex('.')] == AP_DOT); // make sure a dot is a dot |
---|
52 | |
---|
53 | return table; |
---|
54 | } |
---|
55 | |
---|
56 | long *AWT_translator::create_pro_to_bits() const { |
---|
57 | long *table = ARB_calloc<long>(256); |
---|
58 | for (int i = 0; i < max_aa; i++) { |
---|
59 | int j = index_2_spro[i]; |
---|
60 | if (j == '.') { |
---|
61 | table[i] = -1; |
---|
62 | continue; |
---|
63 | } |
---|
64 | j = s2str[j]->index; |
---|
65 | table[i] = 1<<j; |
---|
66 | } |
---|
67 | return table; |
---|
68 | } |
---|
69 | |
---|
70 | void AWT_translator::build_table(unsigned char pbase, const char *nuc) { |
---|
71 | struct arb_r2a_pro_2_nuc *str = s2str[pbase]; |
---|
72 | |
---|
73 | // search for existing protein, else generate new |
---|
74 | if (!str) { |
---|
75 | str = new arb_r2a_pro_2_nuc; |
---|
76 | s2str[pbase] = str; |
---|
77 | s2str[tolower(pbase)] = str; |
---|
78 | |
---|
79 | str->index = max_aa++; |
---|
80 | str->single_pro = pbase; |
---|
81 | |
---|
82 | index_2_spro[str->index] = pbase; |
---|
83 | } |
---|
84 | |
---|
85 | // fast hash table |
---|
86 | pn_assert(!GBS_read_hash(t2i_hash, nuc)); // used twice (cannot handle ambiguities, e.g. optional start-/stop-codons) |
---|
87 | GBS_write_hash(t2i_hash, nuc, pbase); |
---|
88 | |
---|
89 | int n0 = nuc_2_bitset[safeCharIndex(nuc[0])]; |
---|
90 | int n1 = nuc_2_bitset[safeCharIndex(nuc[1])]; |
---|
91 | int n2 = nuc_2_bitset[safeCharIndex(nuc[2])]; |
---|
92 | |
---|
93 | struct arb_r2a_pro_2_nucs *nucs; |
---|
94 | for (nucs = str->nucs; nucs; nucs = nucs->next) { |
---|
95 | if ((!(nucs->nucbits[0] & ~n0)) && // search superset |
---|
96 | (!(nucs->nucbits[1] & ~n1)) && |
---|
97 | (!(nucs->nucbits[2] & ~n2))) break; |
---|
98 | |
---|
99 | int c = 0; |
---|
100 | if (nucs->nucbits[0] != n0) c++; |
---|
101 | if (nucs->nucbits[1] != n1) c++; |
---|
102 | if (nucs->nucbits[2] != n2) c++; |
---|
103 | if (c <= 1) break; |
---|
104 | } |
---|
105 | if (!nucs) { |
---|
106 | nucs = new arb_r2a_pro_2_nucs; |
---|
107 | nucs->next = str->nucs; |
---|
108 | str->nucs = nucs; |
---|
109 | } |
---|
110 | nucs->nucbits[0] |= n0; |
---|
111 | nucs->nucbits[1] |= n1; |
---|
112 | nucs->nucbits[2] |= n2; |
---|
113 | } |
---|
114 | |
---|
115 | // ---------------------------- |
---|
116 | // arb_r2a_pro_2_nucs |
---|
117 | |
---|
118 | arb_r2a_pro_2_nucs::arb_r2a_pro_2_nucs() |
---|
119 | : next(NULp) |
---|
120 | { |
---|
121 | memset(nucbits, 0, sizeof(nucbits)); |
---|
122 | } |
---|
123 | arb_r2a_pro_2_nucs::~arb_r2a_pro_2_nucs() { |
---|
124 | delete next; |
---|
125 | } |
---|
126 | |
---|
127 | // --------------------------- |
---|
128 | // arb_r2a_pro_2_nuc |
---|
129 | |
---|
130 | arb_r2a_pro_2_nuc::arb_r2a_pro_2_nuc() : |
---|
131 | single_pro(0), |
---|
132 | index(0), |
---|
133 | nucs(NULp) |
---|
134 | { |
---|
135 | } |
---|
136 | arb_r2a_pro_2_nuc::~arb_r2a_pro_2_nuc() { |
---|
137 | delete nucs; |
---|
138 | } |
---|
139 | |
---|
140 | // ------------------------ |
---|
141 | // AWT_translator |
---|
142 | |
---|
143 | static int codon_defined_in(const char *codon, const char *codons) { |
---|
144 | for (int off=0; codons[off]; off+=3) { |
---|
145 | if (codon[0]==codons[off] && codon[1]==codons[off+1] && codon[2]==codons[off+2]) { |
---|
146 | return 1; |
---|
147 | } |
---|
148 | } |
---|
149 | return 0; |
---|
150 | } |
---|
151 | |
---|
152 | // order of tables generated with build_table() by AWT_translator ctor is important: |
---|
153 | // must be compatible with DIST/PH_protdist.cxx !! |
---|
154 | // except that this table has an 's' insertion !!! |
---|
155 | |
---|
156 | #define T2I_ENTRIES_MAX 196 // maximum number of generated translations (by code number 11) |
---|
157 | |
---|
158 | AWT_translator::AWT_translator(int arb_protein_code_nr) : |
---|
159 | distance_meter(NULp), |
---|
160 | code_nr(arb_protein_code_nr), |
---|
161 | pro_2_bitset(NULp), |
---|
162 | realmax_aa(0), |
---|
163 | max_aa(0) |
---|
164 | { |
---|
165 | memset(s2str, 0, sizeof(s2str)); |
---|
166 | memset(index_2_spro, 0, sizeof(index_2_spro)); |
---|
167 | |
---|
168 | nuc_2_bitset = AP_create_dna_to_ap_bases(); |
---|
169 | t2i_hash = GBS_create_hash(T2I_ENTRIES_MAX, GB_IGNORE_CASE); // case insensitive |
---|
170 | |
---|
171 | AP_initialize_codon_tables(); |
---|
172 | |
---|
173 | { |
---|
174 | char *D_codons = strdup(AP_get_codons('D', code_nr)); |
---|
175 | char *N_codons = strdup(AP_get_codons('N', code_nr)); |
---|
176 | char *E_codons = strdup(AP_get_codons('E', code_nr)); |
---|
177 | char *Q_codons = strdup(AP_get_codons('Q', code_nr)); |
---|
178 | char *I_codons = strdup(AP_get_codons('I', code_nr)); |
---|
179 | char *L_codons = strdup(AP_get_codons('L', code_nr)); |
---|
180 | |
---|
181 | char protein; |
---|
182 | for (protein='*'; protein<='Z'; protein = (protein=='*' ? 'A' : protein+1)) { |
---|
183 | if (protein!='O' && protein!='U') { // OU are no aminos |
---|
184 | const char *codons; |
---|
185 | if (protein=='D') codons = D_codons; |
---|
186 | else if (protein=='N') codons = N_codons; |
---|
187 | else if (protein=='E') codons = E_codons; |
---|
188 | else if (protein=='Q') codons = Q_codons; |
---|
189 | else if (protein=='I') codons = I_codons; |
---|
190 | else if (protein=='L') codons = L_codons; |
---|
191 | else codons = AP_get_codons(protein, code_nr); |
---|
192 | // codons now contains a 0-terminated-string containing all possible codons for protein |
---|
193 | |
---|
194 | for (int off=0; codons[off]; off+=3) { |
---|
195 | char codon[4]; |
---|
196 | memcpy(codon, codons+off, 3); |
---|
197 | codon[3] = 0; |
---|
198 | |
---|
199 | if (protein=='B') { |
---|
200 | if (!codon_defined_in(codon, D_codons) && !codon_defined_in(codon, N_codons)) { |
---|
201 | build_table(protein, codon); |
---|
202 | } |
---|
203 | } |
---|
204 | else if (protein=='J') { |
---|
205 | if (!codon_defined_in(codon, I_codons) && !codon_defined_in(codon, L_codons)) { |
---|
206 | build_table(protein, codon); |
---|
207 | } |
---|
208 | } |
---|
209 | else if (protein=='Z') { |
---|
210 | if (!codon_defined_in(codon, E_codons) && !codon_defined_in(codon, Q_codons)) { |
---|
211 | build_table(protein, codon); |
---|
212 | } |
---|
213 | } |
---|
214 | else { |
---|
215 | build_table(protein, codon); |
---|
216 | } |
---|
217 | } |
---|
218 | } |
---|
219 | } |
---|
220 | |
---|
221 | free(L_codons); |
---|
222 | free(I_codons); |
---|
223 | free(Q_codons); |
---|
224 | free(E_codons); |
---|
225 | free(N_codons); |
---|
226 | free(D_codons); |
---|
227 | } |
---|
228 | |
---|
229 | realmax_aa = max_aa; |
---|
230 | |
---|
231 | build_table('-', "---"); |
---|
232 | build_table('.', "..."); |
---|
233 | build_table('.', "???"); |
---|
234 | build_table('X', "NNN"); |
---|
235 | |
---|
236 | pn_assert(GBS_hash_elements(t2i_hash) <= T2I_ENTRIES_MAX); |
---|
237 | |
---|
238 | pro_2_bitset = create_pro_to_bits(); |
---|
239 | } |
---|
240 | |
---|
241 | AWT_translator::~AWT_translator() { |
---|
242 | free(pro_2_bitset); |
---|
243 | |
---|
244 | delete [] nuc_2_bitset; |
---|
245 | GBS_free_hash(t2i_hash); |
---|
246 | for (int i=0; i<256; i++) { |
---|
247 | if (i != tolower(i)) continue; // do not delete duplicated entries (a-z == A-Z!) |
---|
248 | delete s2str[i]; |
---|
249 | } |
---|
250 | |
---|
251 | delete distance_meter; |
---|
252 | } |
---|
253 | |
---|
254 | const AWT_distance_meter *AWT_translator::getDistanceMeter() const { |
---|
255 | if (!distance_meter) distance_meter = new AWT_distance_meter(this); |
---|
256 | return distance_meter; |
---|
257 | } |
---|
258 | |
---|
259 | |
---|
260 | // ---------------------------- |
---|
261 | // Distance functions |
---|
262 | |
---|
263 | static int nuc_dist(const AWT_translator *translator, unsigned char p1, unsigned char p2) { |
---|
264 | // calculate minimum necessary nucleotide-mutations for a given amino-acid-mutation |
---|
265 | |
---|
266 | const struct arb_r2a_pro_2_nuc *s1, *s2; |
---|
267 | s1 = translator->S2str(p1); |
---|
268 | s2 = translator->S2str(p2); |
---|
269 | if ((!s1) || (!s2)) return -1; |
---|
270 | struct arb_r2a_pro_2_nucs *n1, *n2; |
---|
271 | long mindist = 3; |
---|
272 | // Check all combinations, if any combination is valid -> zero distance |
---|
273 | for (n1 = s1->nucs; n1; n1=n1->next) { |
---|
274 | for (n2 = s2->nucs; n2; n2=n2->next) { |
---|
275 | int dist = 0; |
---|
276 | int i; |
---|
277 | for (i=0; i<3; i++) { |
---|
278 | if (n1->nucbits[i] & n2->nucbits[i]) continue; |
---|
279 | dist++; |
---|
280 | } |
---|
281 | if (dist< mindist) mindist = dist; |
---|
282 | } |
---|
283 | } |
---|
284 | return mindist; |
---|
285 | } |
---|
286 | |
---|
287 | #if defined(DEBUG) |
---|
288 | static void awt_pro_a_nucs_debug(const AWT_translator *translator, const AWT_distance_meter *distmeter) { |
---|
289 | int max_aa = translator->MaxAA(); |
---|
290 | int realmax_aa = translator->RealmaxAA(); |
---|
291 | |
---|
292 | for (int s = 0; s< max_aa; s++) { |
---|
293 | const AWT_PDP *dist = distmeter->getDistance(s); |
---|
294 | |
---|
295 | // check bits should not be present in distpad |
---|
296 | if (s<realmax_aa) { |
---|
297 | for (int i=0; i<2; i++) { |
---|
298 | // assertion: bit is set in patd[i] -> bit is clear in patd[i+1] |
---|
299 | assert_or_exit((dist->patd[i] & ~dist->patd[i+1]) == 0); |
---|
300 | } |
---|
301 | } |
---|
302 | printf("Base %c[%i]: Dist to ", translator->index2spro(s), s); |
---|
303 | for (int d = 0; d< max_aa; d++) { |
---|
304 | int i; |
---|
305 | for (i=0; i<3; i++) { |
---|
306 | if (dist->patd[i] & (1<<d)) break; |
---|
307 | } |
---|
308 | printf ("%c%i ", translator->index2spro(d), i); |
---|
309 | } |
---|
310 | printf ("\n"); |
---|
311 | } |
---|
312 | } |
---|
313 | #endif // DEBUG |
---|
314 | |
---|
315 | // ---------------------------- |
---|
316 | // AWT_distance_meter |
---|
317 | |
---|
318 | AWT_distance_meter::AWT_distance_meter(const AWT_translator *translator) { |
---|
319 | memset(dist_, 0, sizeof(dist_)); |
---|
320 | |
---|
321 | int s; |
---|
322 | int i; |
---|
323 | int max_aa = translator->MaxAA(); |
---|
324 | int realmax_aa = translator->RealmaxAA(); |
---|
325 | |
---|
326 | for (s = 0; s< max_aa; s++) { |
---|
327 | ARB_calloc(dist_[s], max_aa); |
---|
328 | |
---|
329 | const arb_r2a_pro_2_nuc *s2str = translator->S2str(translator->index2spro(s)); |
---|
330 | for (i=0; i<3; i++) { |
---|
331 | dist_[s]->nucbits[i] = s2str->nucs->nucbits[i]; |
---|
332 | } |
---|
333 | } |
---|
334 | |
---|
335 | for (s = 0; s< max_aa; s++) { |
---|
336 | for (int d = 0; d< realmax_aa; d++) { |
---|
337 | int dist = nuc_dist(translator, translator->index2spro(s), translator->index2spro(d)); |
---|
338 | |
---|
339 | if (dist==0) dist_[s]->patd[0] |= 1<<d; // distance == 0 |
---|
340 | if (dist<=1) dist_[s]->patd[1] |= 1<<d; // distance <= 1 |
---|
341 | } |
---|
342 | dist_[s]->patd[2] |= dist_[s]->patd[1]; // (distance <= 1) => (distance <= 2) |
---|
343 | dist_[s]->patd[0] |= 1<<s; // set 'distance to self' == 0 |
---|
344 | } |
---|
345 | |
---|
346 | for (s = 0; s< max_aa; s++) { |
---|
347 | long sum = 0; |
---|
348 | for (int d = 0; d< realmax_aa; d++) { |
---|
349 | if ((1 << d) & dist_[s]->patd[1]) { // if distance(s, d) <= 1 |
---|
350 | sum |= dist_[d]->patd[1]; // collect all proteins which have 'distance <= 1' to 'd' |
---|
351 | } |
---|
352 | } |
---|
353 | dist_[s]->patd[2] |= sum; // and store them in 'distance <= 2' |
---|
354 | } |
---|
355 | |
---|
356 | #ifdef DEBUG |
---|
357 | awt_pro_a_nucs_debug(translator, this); |
---|
358 | #endif |
---|
359 | } |
---|
360 | |
---|
361 | AWT_distance_meter::~AWT_distance_meter() { |
---|
362 | for (int i=0; i<64; i++) { |
---|
363 | free(dist_[i]); |
---|
364 | } |
---|
365 | |
---|
366 | } |
---|
367 | |
---|
368 | // -------------------------------------------------------------------------------- |
---|
369 | // Translator factory: |
---|
370 | |
---|
371 | static int current_user_code_nr = -1; // always contain same value as AWAR_PROTEIN_TYPE (after calling AWT_default_protein_type once) |
---|
372 | |
---|
373 | static void user_code_nr_changed_cb(GBDATA *gb_awar) { |
---|
374 | // this callback keeps 'current_user_code_nr' synced with AWAR_PROTEIN_TYPE |
---|
375 | GBDATA *gb_main = GB_get_root(gb_awar); |
---|
376 | GB_transaction ta(gb_main); |
---|
377 | current_user_code_nr = GB_read_int(gb_awar); |
---|
378 | } |
---|
379 | |
---|
380 | #define CACHED_TRANSLATORS 4 |
---|
381 | |
---|
382 | AWT_translator *AWT_get_translator(int code_nr) { |
---|
383 | static SmartPtr<AWT_translator> cached[CACHED_TRANSLATORS]; |
---|
384 | |
---|
385 | if (cached[0].isNull() || cached[0]->CodeNr() != code_nr) { // most recent != requested |
---|
386 | SmartPtr<AWT_translator> translator; |
---|
387 | |
---|
388 | int i; |
---|
389 | for (i = 1; i<CACHED_TRANSLATORS; i++) { |
---|
390 | if (cached[i].isSet() && cached[i]->CodeNr() == code_nr) { |
---|
391 | // found existing translator |
---|
392 | translator = cached[i]; |
---|
393 | cached[i].setNull(); |
---|
394 | break; |
---|
395 | } |
---|
396 | } |
---|
397 | |
---|
398 | if (translator.isNull()) { |
---|
399 | translator = new AWT_translator(code_nr); |
---|
400 | } |
---|
401 | |
---|
402 | // insert new or found translator at front and shift existing to higher indices: |
---|
403 | for (i = 0; i<CACHED_TRANSLATORS && translator.isSet(); i++) { |
---|
404 | std::swap(cached[i], translator); |
---|
405 | } |
---|
406 | |
---|
407 | // deletes oldest translator, if no empty array position was found: |
---|
408 | } |
---|
409 | |
---|
410 | pn_assert(cached[0]->CodeNr() == code_nr); |
---|
411 | return &*cached[0]; |
---|
412 | } |
---|
413 | |
---|
414 | int AWT_default_protein_type(GBDATA *gb_main) { |
---|
415 | // returns protein code selected in AWAR_PROTEIN_TYPE |
---|
416 | |
---|
417 | if (current_user_code_nr == -1) { // user protein code in AWAR_PROTEIN_TYPE not traced yet |
---|
418 | pn_assert(gb_main); // either pass gb_main here or call once with valid gb_main at program startup |
---|
419 | |
---|
420 | { |
---|
421 | GB_transaction ta(gb_main); |
---|
422 | GBDATA *awar = GB_search(gb_main, AWAR_PROTEIN_TYPE, GB_INT); |
---|
423 | GB_add_callback(awar, GB_CB_CHANGED, makeDatabaseCallback(user_code_nr_changed_cb)); // bind a callback that traces AWAR_PROTEIN_TYPE |
---|
424 | user_code_nr_changed_cb(awar); |
---|
425 | } |
---|
426 | |
---|
427 | pn_assert(current_user_code_nr != -1); |
---|
428 | } |
---|
429 | |
---|
430 | return current_user_code_nr; |
---|
431 | } |
---|
432 | |
---|
433 | AWT_translator *AWT_get_user_translator(GBDATA *gb_main) { |
---|
434 | return AWT_get_translator(AWT_default_protein_type(gb_main)); |
---|
435 | } |
---|
436 | |
---|
437 | // -------------------------------------------------------------------------------- |
---|
438 | |
---|
439 | #ifdef UNIT_TESTS |
---|
440 | #ifndef TEST_UNIT_H |
---|
441 | #include <test_unit.h> |
---|
442 | #endif |
---|
443 | |
---|
444 | static int test_code_nr = -1; |
---|
445 | #if defined(ENABLE_CRASH_TESTS) |
---|
446 | static void translator_instance() { |
---|
447 | AWT_translator instance(test_code_nr); |
---|
448 | } |
---|
449 | #endif |
---|
450 | |
---|
451 | void TEST_translator_instantiation() { |
---|
452 | for (int i = 0; i<AWT_CODON_TABLES; ++i) { |
---|
453 | TEST_ANNOTATE(GBS_global_string("i=%i", i)); |
---|
454 | test_code_nr = i; |
---|
455 | TEST_EXPECT_NO_SEGFAULT(translator_instance); |
---|
456 | } |
---|
457 | } |
---|
458 | |
---|
459 | #endif // UNIT_TESTS |
---|
460 | |
---|
461 | // -------------------------------------------------------------------------------- |
---|