1 | // =============================================================== // |
---|
2 | // // |
---|
3 | // File : AP_pro_a_nucs.hxx // |
---|
4 | // Purpose : // |
---|
5 | // // |
---|
6 | // Institute of Microbiology (Technical University Munich) // |
---|
7 | // http://www.arb-home.de/ // |
---|
8 | // // |
---|
9 | // =============================================================== // |
---|
10 | |
---|
11 | #ifndef AP_PRO_A_NUCS_HXX |
---|
12 | #define AP_PRO_A_NUCS_HXX |
---|
13 | |
---|
14 | #ifndef ARBDB_H |
---|
15 | #include <arbdb.h> |
---|
16 | #endif |
---|
17 | #ifndef ARBTOOLS_H |
---|
18 | #include <arbtools.h> |
---|
19 | #endif |
---|
20 | |
---|
21 | |
---|
// Bitset encoding of nucleotides: every base (and the known gap) owns one bit,
// so ambiguous positions can be expressed by OR-ing several of them together.
enum AP_BASES {
    AP_ILLEGAL = 0,

    // ---- single-bit values
    AP_A   = 1 << 0,
    AP_C   = 1 << 1,
    AP_G   = 1 << 2,
    AP_T   = 1 << 3,
    AP_GAP = 1 << 4, // known gap ('-')

    // ---- combinations of the bits above

    // @@@ define IUPAC here not in AP_pro_a_nucs.cxx@AP_create_dna_to_ap_bases

    // all bits set: maybe gap, maybe some base
    // (anything unknown, esp. '.', '?'; interpreted as dot)
    AP_DOT = AP_A | AP_C | AP_G | AP_T | AP_GAP,

    // amount of possible values (one past the largest bitset)
    AP_MAX = AP_DOT + 1
};
---|
39 | |
---|
40 | struct arb_r2a_pro_2_nucs : virtual Noncopyable { |
---|
41 | struct arb_r2a_pro_2_nucs *next; |
---|
42 | char nucbits[3]; // bitsets of nucs |
---|
43 | |
---|
44 | arb_r2a_pro_2_nucs(); |
---|
45 | ~arb_r2a_pro_2_nucs(); |
---|
46 | }; |
---|
47 | |
---|
48 | struct arb_r2a_pro_2_nuc : virtual Noncopyable { |
---|
49 | char single_pro; |
---|
50 | int index; // < 0x007fffff |
---|
51 | |
---|
52 | struct arb_r2a_pro_2_nucs *nucs; |
---|
53 | |
---|
54 | arb_r2a_pro_2_nuc(); |
---|
55 | ~arb_r2a_pro_2_nuc(); |
---|
56 | }; |
---|
57 | |
---|
// Distance definition for one protein.
struct AWT_PDP {
    // Proteins at distance: every bit in patd[x] represents one protein
    // (used bits: 0-23).
    //   bit set in patd[0]  =>  distance == 0
    //   bit set in patd[1]  =>  distance <= 1
    //   bit set in patd[2]  =>  distance <= 2
    long patd[3];

    // bitsets of nucs
    char nucbits[3];
};
---|
67 | |
---|
68 | class AWT_translator; |
---|
69 | |
---|
70 | class AWT_distance_meter : virtual Noncopyable { |
---|
71 | AWT_PDP *dist_[64]; // sets of proteins with special distance (64 > max_aa) |
---|
72 | |
---|
73 | public: |
---|
74 | AWT_distance_meter(const AWT_translator *translator); |
---|
75 | ~AWT_distance_meter(); |
---|
76 | |
---|
77 | const AWT_PDP *getDistance(int idx) const { return dist_[idx]; } |
---|
78 | AWT_PDP *getDistance(int idx) { return dist_[idx]; } |
---|
79 | }; |
---|
80 | |
---|
81 | |
---|
82 | class AWT_translator : virtual Noncopyable { |
---|
83 | private: |
---|
84 | mutable AWT_distance_meter *distance_meter; // (mutable to allow lazy-evaluation) |
---|
85 | |
---|
86 | int code_nr; // arb (not embl) |
---|
87 | GB_HASH *t2i_hash; // hash table trin >> singlepro |
---|
88 | arb_r2a_pro_2_nuc *s2str[256]; // singlecode protein >> dna ... |
---|
89 | |
---|
90 | long *pro_2_bitset; // aa-index(!) to bitset |
---|
91 | char *nuc_2_bitset; // dna-character to bitset |
---|
92 | |
---|
93 | unsigned char index_2_spro[64]; // 64 > max_aa |
---|
94 | |
---|
95 | int realmax_aa; // number of real AA + stop codon |
---|
96 | int max_aa; // plus ambiguous codes |
---|
97 | |
---|
98 | void build_table(unsigned char pbase, const char *nuc); |
---|
99 | long *create_pro_to_bits() const; |
---|
100 | |
---|
101 | public: |
---|
102 | |
---|
103 | AWT_translator(int arb_protein_code_nr); |
---|
104 | ~AWT_translator(); |
---|
105 | |
---|
106 | const AWT_distance_meter *getDistanceMeter() const; |
---|
107 | AWT_distance_meter *getDistanceMeter() { |
---|
108 | return const_cast<AWT_distance_meter*>(const_cast<const AWT_translator*>(this)->getDistanceMeter()); |
---|
109 | } |
---|
110 | |
---|
111 | int CodeNr() const { return code_nr; } |
---|
112 | const arb_r2a_pro_2_nuc *S2str(int index) const { return s2str[index]; } |
---|
113 | const arb_r2a_pro_2_nuc * const *S2strArray() const { return s2str; } |
---|
114 | |
---|
115 | long index2bitset(int index) const { return pro_2_bitset[index]; } |
---|
116 | unsigned char index2spro(int index) const { return index_2_spro[index]; } |
---|
117 | |
---|
118 | int RealmaxAA() const { return realmax_aa; } |
---|
119 | int MaxAA() const { return max_aa; } // incl. ambiguity codes |
---|
120 | |
---|
121 | char codon2aa(const char *codon) const { |
---|
122 | long spro = GBS_read_hash(t2i_hash, codon); |
---|
123 | return spro ? char(spro) : 'X'; |
---|
124 | } |
---|
125 | |
---|
126 | char isStartOrStopCodon(const char *codon) const; |
---|
127 | char isStartCodon(const char *codon) const { |
---|
128 | char start = isStartOrStopCodon(codon); |
---|
129 | return start == '*' ? 0 : start; // ignore stop |
---|
130 | } |
---|
131 | char isStopCodon(const char *codon) const { |
---|
132 | char stop = isStartOrStopCodon(codon); |
---|
133 | return stop == 'M' ? 0 : stop; // ignore start |
---|
134 | } |
---|
135 | }; |
---|
136 | |
---|
137 | #define AWAR_PROTEIN_TYPE "nt/protein_codon_type" |
---|
138 | |
---|
139 | char *AP_create_dna_to_ap_bases(); // create dna 2 nuc_bitset |
---|
140 | |
---|
141 | // ------------------------------ |
---|
142 | |
---|
143 | int AWT_default_protein_type(GBDATA *gb_main = NULp); // returns protein code selected in AWAR_PROTEIN_TYPE |
---|
144 | |
---|
145 | AWT_translator *AWT_get_translator(int code_nr); // use explicit protein code |
---|
146 | AWT_translator *AWT_get_user_translator(GBDATA *gb_main = NULp); // uses user setting for protein code from AWAR_PROTEIN_TYPE |
---|
147 | // AWAR_PROTEIN_TYPE has to exist; the first call of AWT_get_user_translator needs 'gb_main' != 0 |
---|
148 | |
---|
149 | #else |
---|
150 | #error AP_pro_a_nucs.hxx included twice |
---|
151 | #endif // AP_PRO_A_NUCS_HXX |
---|