// =============================================================== //
//                                                                 //
//   File      : AP_pro_a_nucs.hxx                                 //
//   Purpose   :                                                   //
//                                                                 //
//   Institute of Microbiology (Technical University Munich)       //
//   http://www.arb-home.de/                                       //
//                                                                 //
// =============================================================== //
| 10 | |
|---|
| 11 | #ifndef AP_PRO_A_NUCS_HXX |
|---|
| 12 | #define AP_PRO_A_NUCS_HXX |
|---|
| 13 | |
|---|
| 14 | #ifndef ARBDB_H |
|---|
| 15 | #include <arbdb.h> |
|---|
| 16 | #endif |
|---|
| 17 | #ifndef ARBTOOLS_H |
|---|
| 18 | #include <arbtools.h> |
|---|
| 19 | #endif |
|---|
| 20 | |
|---|
| 21 | |
|---|
// Nucleotide states encoded as a bitset: one bit per base plus one bit for a
// known gap. Values below the separator are single bits, values after it are
// combinations of those bits.
enum AP_BASES {
    AP_ILLEGAL = 0,     // no bit set

    AP_A   = 1,
    AP_C   = 2,
    AP_G   = 4,
    AP_T   = 8,
    AP_GAP = 16,        // known gap ('-')

    // -------------------- above are bit values, below combinations of them

    // @@@ define IUPAC here not in AP_pro_a_nucs.cxx@AP_create_dna_to_ap_bases

    // maybe gap, maybe some base (anything unknown, esp. '.', '?'; interpreted as dot)
    AP_DOT = AP_A | AP_C | AP_G | AP_T | AP_GAP,    // == 31 (all five bits set)

    // amount of possible values (0 .. AP_DOT), i.e. 32;
    // derived from AP_DOT so it cannot drift from the bit definitions
    AP_MAX = AP_DOT + 1
};
| 39 | |
|---|
| 40 | struct arb_r2a_pro_2_nucs : virtual Noncopyable { |
|---|
| 41 | struct arb_r2a_pro_2_nucs *next; |
|---|
| 42 | char nucbits[3]; // bitsets of nucs |
|---|
| 43 | |
|---|
| 44 | arb_r2a_pro_2_nucs(); |
|---|
| 45 | ~arb_r2a_pro_2_nucs(); |
|---|
| 46 | }; |
|---|
| 47 | |
|---|
| 48 | struct arb_r2a_pro_2_nuc : virtual Noncopyable { |
|---|
| 49 | char single_pro; |
|---|
| 50 | int index; // < 0x007fffff |
|---|
| 51 | |
|---|
| 52 | struct arb_r2a_pro_2_nucs *nucs; |
|---|
| 53 | |
|---|
| 54 | arb_r2a_pro_2_nuc(); |
|---|
| 55 | ~arb_r2a_pro_2_nuc(); |
|---|
| 56 | }; |
|---|
| 57 | |
|---|
// Distance definition for one protein.
struct AWT_PDP {
    long patd[3];       // proteins at dist
                        // every bit in patd[x] represents one protein (used bits: 0-23)
                        // bit in patd[0] is set => distance == 0
                        // bit in patd[1] is set => distance <= 1
                        // bit in patd[2] is set => distance <= 2

    char nucbits[3];    // bitsets of nucs
};
| 67 | |
|---|
| 68 | class AWT_translator; |
|---|
| 69 | |
|---|
| 70 | class AWT_distance_meter : virtual Noncopyable { |
|---|
| 71 | AWT_PDP *dist_[64]; // sets of proteins with special distance (64 > max_aa) |
|---|
| 72 | |
|---|
| 73 | public: |
|---|
| 74 | AWT_distance_meter(const AWT_translator *translator); |
|---|
| 75 | ~AWT_distance_meter(); |
|---|
| 76 | |
|---|
| 77 | const AWT_PDP *getDistance(int idx) const { return dist_[idx]; } |
|---|
| 78 | AWT_PDP *getDistance(int idx) { return dist_[idx]; } |
|---|
| 79 | }; |
|---|
| 80 | |
|---|
| 81 | |
|---|
| 82 | class AWT_translator : virtual Noncopyable { |
|---|
| 83 | private: |
|---|
| 84 | mutable AWT_distance_meter *distance_meter; // (mutable to allow lazy-evaluation) |
|---|
| 85 | |
|---|
| 86 | int code_nr; // arb (not embl) |
|---|
| 87 | GB_HASH *t2i_hash; // hash table trin >> singlepro |
|---|
| 88 | arb_r2a_pro_2_nuc *s2str[256]; // singlecode protein >> dna ... |
|---|
| 89 | |
|---|
| 90 | long *pro_2_bitset; // aa-index(!) to bitset |
|---|
| 91 | char *nuc_2_bitset; // dna-character to bitset |
|---|
| 92 | |
|---|
| 93 | unsigned char index_2_spro[64]; // 64 > max_aa |
|---|
| 94 | |
|---|
| 95 | int realmax_aa; // number of real AA + stop codon |
|---|
| 96 | int max_aa; // plus ambiguous codes |
|---|
| 97 | |
|---|
| 98 | void build_table(unsigned char pbase, const char *nuc); |
|---|
| 99 | long *create_pro_to_bits() const; |
|---|
| 100 | |
|---|
| 101 | public: |
|---|
| 102 | |
|---|
| 103 | AWT_translator(int arb_protein_code_nr); |
|---|
| 104 | ~AWT_translator(); |
|---|
| 105 | |
|---|
| 106 | const AWT_distance_meter *getDistanceMeter() const; |
|---|
| 107 | AWT_distance_meter *getDistanceMeter() { |
|---|
| 108 | return const_cast<AWT_distance_meter*>(const_cast<const AWT_translator*>(this)->getDistanceMeter()); |
|---|
| 109 | } |
|---|
| 110 | |
|---|
| 111 | int CodeNr() const { return code_nr; } |
|---|
| 112 | const arb_r2a_pro_2_nuc *S2str(int index) const { return s2str[index]; } |
|---|
| 113 | const arb_r2a_pro_2_nuc * const *S2strArray() const { return s2str; } |
|---|
| 114 | |
|---|
| 115 | long index2bitset(int index) const { return pro_2_bitset[index]; } |
|---|
| 116 | unsigned char index2spro(int index) const { return index_2_spro[index]; } |
|---|
| 117 | |
|---|
| 118 | int RealmaxAA() const { return realmax_aa; } |
|---|
| 119 | int MaxAA() const { return max_aa; } // incl. ambiguity codes |
|---|
| 120 | |
|---|
| 121 | char codon2aa(const char *codon) const { |
|---|
| 122 | long spro = GBS_read_hash(t2i_hash, codon); |
|---|
| 123 | return spro ? char(spro) : 'X'; |
|---|
| 124 | } |
|---|
| 125 | |
|---|
| 126 | char isStartOrStopCodon(const char *codon) const; |
|---|
| 127 | char isStartCodon(const char *codon) const { |
|---|
| 128 | char start = isStartOrStopCodon(codon); |
|---|
| 129 | return start == '*' ? 0 : start; // ignore stop |
|---|
| 130 | } |
|---|
| 131 | char isStopCodon(const char *codon) const { |
|---|
| 132 | char stop = isStartOrStopCodon(codon); |
|---|
| 133 | return stop == 'M' ? 0 : stop; // ignore start |
|---|
| 134 | } |
|---|
| 135 | }; |
|---|
| 136 | |
|---|
| 137 | #define AWAR_PROTEIN_TYPE "nt/protein_codon_type" |
|---|
| 138 | |
|---|
| 139 | char *AP_create_dna_to_ap_bases(); // create dna 2 nuc_bitset |
|---|
| 140 | |
|---|
| 141 | // ------------------------------ |
|---|
| 142 | |
|---|
| 143 | int AWT_default_protein_type(GBDATA *gb_main = NULp); // returns protein code selected in AWAR_PROTEIN_TYPE |
|---|
| 144 | |
|---|
| 145 | AWT_translator *AWT_get_translator(int code_nr); // use explicit protein code |
|---|
| 146 | AWT_translator *AWT_get_user_translator(GBDATA *gb_main = NULp); // uses user setting for protein code from AWAR_PROTEIN_TYPE |
|---|
| 147 | // AWAR_PROTEIN_TYPE has to exist; the first call of AWT_get_user_translator needs 'gb_main' != 0 |
|---|
| 148 | |
|---|
| 149 | #else |
|---|
| 150 | #error AP_pro_a_nucs.hxx included twice |
|---|
| 151 | #endif // AP_PRO_A_NUCS_HXX |
|---|