| 1 | // ================================================================= // |
|---|
| 2 | // // |
|---|
| 3 | // File : chartable.h // |
|---|
| 4 | // Purpose : count characters of multiple sequences // |
|---|
| 5 | // // |
|---|
| 6 | // Institute of Microbiology (Technical University Munich) // |
|---|
| 7 | // http://www.arb-home.de/ // |
|---|
| 8 | // // |
|---|
| 9 | // ================================================================= // |
|---|
| 10 | |
|---|
| 11 | #ifndef CHARTABLE_H |
|---|
| 12 | #define CHARTABLE_H |
|---|
| 13 | |
|---|
| 14 | #ifndef ARBTOOLS_H |
|---|
| 15 | #include <arbtools.h> |
|---|
| 16 | #endif |
|---|
| 17 | #ifndef POS_RANGE_H |
|---|
| 18 | #include <pos_range.h> |
|---|
| 19 | #endif |
|---|
| 20 | #ifndef ARBDB_BASE_H |
|---|
| 21 | #include <arbdb_base.h> |
|---|
| 22 | #endif |
|---|
| 23 | #ifndef _STDINT_H |
|---|
| 24 | #include <stdint.h> |
|---|
| 25 | #endif |
|---|
| 26 | |
|---|
| 27 | #define ct_assert(bed) arb_assert(bed) |
|---|
| 28 | |
|---|
| 29 | #ifdef DEBUG |
|---|
| 30 | // # define TEST_BASES_TABLE |
|---|
| 31 | #endif |
|---|
| 32 | |
|---|
| 33 | #if defined(DEBUG) && !defined(DEVEL_RELEASE) |
|---|
| 34 | # define TEST_CHAR_TABLE_INTEGRITY |
|---|
| 35 | #endif |
|---|
| 36 | |
|---|
| 37 | #define MAXCHARTABLE 256 |
|---|
| 38 | #define SHORT_TABLE_ELEM_SIZE 1 |
|---|
| 39 | #define SHORT_TABLE_MAX_VALUE 0xff |
|---|
| 40 | #define LONG_TABLE_ELEM_SIZE 4 |
|---|
| 41 | |
|---|
| 42 | #define MAX_INDEX_TABLES 128 // max number of different non-ambiguous indices (into BaseFrequencies::bases_table) |
|---|
| 43 | #define MAX_TARGET_INDICES 4 // max number of non-ambiguous indices affected by one ambiguous code (4 for 'N' in RNA/DNA, 2 for amino-acids) |
|---|
| 44 | #define MAX_AMBIGUITY_CODES (1+4+6) // for RNA/DNA (3 for amino-acids) |
|---|
| 45 | |
|---|
| 46 | #define MAX_USED_BASES_TABLES 256 |
|---|
| 47 | |
|---|
| 48 | // Note: short tables convert to long tables when adding 22 sequences (for nucleotides) |
|---|
| 49 | // * maybe remove short tables completely OR |
|---|
| 50 | // * increase values size in short tables to 16 bit |
|---|
| 51 | |
|---|
| 52 | namespace chartable { |
|---|
| 53 | |
|---|
| 54 | struct Ambiguity { |
|---|
| 55 | /*! define how a specific ambiguity code is counted |
|---|
| 56 | */ |
|---|
| 57 | |
|---|
| 58 | uint8_t indices; // number of entries in bases_table affected by this ambiguity |
|---|
| 59 | uint8_t increment; // how much each entry gets incremented |
|---|
| 60 | uint8_t index[MAX_TARGET_INDICES]; // into BaseFrequencies::bases_table |
|---|
| 61 | }; |
|---|
| 62 | |
|---|
| 63 | class SepBaseFreq : virtual Noncopyable { |
|---|
| 64 | /*! counts occurrences of one specific base (e.g. 'A') or a group of bases, |
|---|
| 65 | * separately for each alignment position of multiple input sequences. |
|---|
| 66 | */ |
|---|
| 67 | int table_entry_size; // how many bytes are used for each element of 'no_of_bases' (1 or 4 bytes) |
|---|
| 68 | union { |
|---|
| 69 | unsigned char *shortTable; |
|---|
| 70 | int *longTable; |
|---|
| 71 | } no_of_bases; // counts bases for each sequence position |
|---|
| 72 | int no_of_entries; // length of bases table |
|---|
| 73 | |
|---|
| 74 | int legal(int offset) const { return offset>=0 && offset<no_of_entries; } |
|---|
| 75 | |
|---|
| 76 | void set_elem_long(int offset, int value) { |
|---|
| 77 | #ifdef TEST_BASES_TABLE |
|---|
| 78 | ct_assert(legal(offset)); |
|---|
| 79 | ct_assert(table_entry_size==LONG_TABLE_ELEM_SIZE); |
|---|
| 80 | #endif |
|---|
| 81 | no_of_bases.longTable[offset] = value; |
|---|
| 82 | } |
|---|
| 83 | |
|---|
| 84 | void set_elem_short(int offset, int value) { |
|---|
| 85 | #ifdef TEST_BASES_TABLE |
|---|
| 86 | ct_assert(legal(offset)); |
|---|
| 87 | ct_assert(table_entry_size==SHORT_TABLE_ELEM_SIZE); |
|---|
| 88 | ct_assert(value>=0 && value<=SHORT_TABLE_MAX_VALUE); |
|---|
| 89 | #endif |
|---|
| 90 | no_of_bases.shortTable[offset] = value; |
|---|
| 91 | } |
|---|
| 92 | |
|---|
| 93 | int get_elem_long(int offset) const { |
|---|
| 94 | #ifdef TEST_BASES_TABLE |
|---|
| 95 | ct_assert(legal(offset)); |
|---|
| 96 | ct_assert(table_entry_size==LONG_TABLE_ELEM_SIZE); |
|---|
| 97 | #endif |
|---|
| 98 | return no_of_bases.longTable[offset]; |
|---|
| 99 | } |
|---|
| 100 | |
|---|
| 101 | int get_elem_short(int offset) const { |
|---|
| 102 | #ifdef TEST_BASES_TABLE |
|---|
| 103 | ct_assert(legal(offset)); |
|---|
| 104 | ct_assert(table_entry_size==SHORT_TABLE_ELEM_SIZE); |
|---|
| 105 | #endif |
|---|
| 106 | return no_of_bases.shortTable[offset]; |
|---|
| 107 | } |
|---|
| 108 | |
|---|
| 109 | public: |
|---|
| 110 | |
|---|
| 111 | SepBaseFreq(int maxseqlength); |
|---|
| 112 | ~SepBaseFreq(); |
|---|
| 113 | |
|---|
| 114 | void init(int length); |
|---|
| 115 | int size() const { return no_of_entries; } |
|---|
| 116 | |
|---|
| 117 | int get_table_entry_size() const { return table_entry_size; } |
|---|
| 118 | void expand_table_entry_size(); |
|---|
| 119 | bool bigger_table_entry_size_needed(int new_max_count) { return table_entry_size==SHORT_TABLE_ELEM_SIZE ? (new_max_count>SHORT_TABLE_MAX_VALUE) : 0; } |
|---|
| 120 | |
|---|
| 121 | int operator[](int offset) const { return table_entry_size==SHORT_TABLE_ELEM_SIZE ? get_elem_short(offset) : get_elem_long(offset); } |
|---|
| 122 | |
|---|
| 123 | void inc_short(int offset, int incr) { |
|---|
| 124 | int old = get_elem_short(offset); |
|---|
| 125 | ct_assert((old+incr)<=255); |
|---|
| 126 | set_elem_short(offset, old+incr); |
|---|
| 127 | } |
|---|
| 128 | void dec_short(int offset, int decr) { |
|---|
| 129 | int old = get_elem_short(offset); |
|---|
| 130 | ct_assert(old>=decr); |
|---|
| 131 | set_elem_short(offset, old-decr); |
|---|
| 132 | } |
|---|
| 133 | void inc_long(int offset, int incr) { |
|---|
| 134 | int old = get_elem_long(offset); |
|---|
| 135 | set_elem_long(offset, old+incr); |
|---|
| 136 | } |
|---|
| 137 | void dec_long(int offset, int decr) { |
|---|
| 138 | int old = get_elem_long(offset); |
|---|
| 139 | ct_assert(old>=decr); |
|---|
| 140 | set_elem_long(offset, old-decr); |
|---|
| 141 | } |
|---|
| 142 | |
|---|
| 143 | int firstDifference(const SepBaseFreq& other, int start, int end, int *firstDifferentPos) const; |
|---|
| 144 | int lastDifference(const SepBaseFreq& other, int start, int end, int *lastDifferentPos) const; |
|---|
| 145 | |
|---|
| 146 | void add(const SepBaseFreq& other, int start, int end); |
|---|
| 147 | void sub(const SepBaseFreq& other, int start, int end); |
|---|
| 148 | void sub_and_add(const SepBaseFreq& Sub, const SepBaseFreq& Add, PosRange range); |
|---|
| 149 | |
|---|
| 150 | void change_table_length(int new_length, int default_entry); |
|---|
| 151 | |
|---|
| 152 | |
|---|
| 153 | #if defined(TEST_CHAR_TABLE_INTEGRITY) || defined(ASSERTION_USED) |
|---|
| 154 | int empty() const; |
|---|
| 155 | #endif // TEST_CHAR_TABLE_INTEGRITY |
|---|
| 156 | }; |
|---|
| 157 | |
|---|
| 158 | }; |
|---|
| 159 | |
|---|
| 160 | class ConsensusBuildParams; |
|---|
| 161 | |
|---|
| 162 | class BaseFrequencies : virtual Noncopyable { |
|---|
| 163 | /*! counts occurrences of ALL bases of multiple sequences separately |
|---|
| 164 | * for all alignment positions. |
|---|
| 165 | * |
|---|
| 166 | * Bases are clustered in groups of bases (alignment type dependent). |
|---|
| 167 | */ |
|---|
| 168 | |
|---|
| 169 | typedef chartable::SepBaseFreq SepBaseFreq; |
|---|
| 170 | typedef chartable::Ambiguity Ambiguity; |
|---|
| 171 | typedef SepBaseFreq *SepBaseFreqPtr; |
|---|
| 172 | |
|---|
| 173 | SepBaseFreqPtr *bases_table; |
|---|
| 174 | |
|---|
| 175 | int sequenceUnits; // # of sequences added to the table (multiplied with 'units') |
|---|
| 176 | bool ignore; // this table will be ignored when calculating tables higher in hierarchy |
|---|
| 177 | // (used in EDIT4 to suppress SAIs in tables of ED4_root_group_manager) |
|---|
| 178 | |
|---|
| 179 | // @@@ move statics into own class: |
|---|
| 180 | static bool initialized; |
|---|
| 181 | static Ambiguity ambiguity_table[MAX_AMBIGUITY_CODES]; |
|---|
| 182 | static uint8_t unitsPerSequence; // multiplier per added sequence (to allow proper distribution of ambiguity codes) |
|---|
| 183 | static unsigned char char_to_index_tab[MAXCHARTABLE]; // <MAX_INDEX_TABLES = > real index into 'bases_table', else index+MAX_INDEX_TABLES into ambiguity_table |
|---|
| 184 | static bool is_gap[MAXCHARTABLE]; |
|---|
| 185 | static unsigned char upper_index_chars[MAX_USED_BASES_TABLES]; |
|---|
| 186 | static int used_bases_tables; // size of 'bases_table' |
|---|
| 187 | static GB_alignment_type ali_type; |
|---|
| 188 | |
|---|
| 189 | static inline void set_char_to_index(unsigned char c, int index); |
|---|
| 190 | |
|---|
| 191 | void add(const BaseFrequencies& other, int start, int end); |
|---|
| 192 | void sub(const BaseFrequencies& other, int start, int end); |
|---|
| 193 | |
|---|
| 194 | void expand_tables(); |
|---|
| 195 | int get_table_entry_size() const { |
|---|
| 196 | return linear_table(0).get_table_entry_size(); |
|---|
| 197 | } |
|---|
| 198 | void prepare_to_add_elements(int new_sequences) { |
|---|
| 199 | ct_assert(used_bases_tables); |
|---|
| 200 | if (linear_table(0).bigger_table_entry_size_needed(sequenceUnits+new_sequences*unitsPerSequence)) { |
|---|
| 201 | expand_tables(); |
|---|
| 202 | } |
|---|
| 203 | } |
|---|
| 204 | |
|---|
| 205 | // linear access to all tables |
|---|
| 206 | SepBaseFreq& linear_table(int c) { ct_assert(c<used_bases_tables); return *bases_table[c]; } |
|---|
| 207 | const SepBaseFreq& linear_table(int c) const { ct_assert(c<used_bases_tables); return *bases_table[c]; } |
|---|
| 208 | |
|---|
| 209 | // access via character |
|---|
| 210 | SepBaseFreq& table(int c) { ct_assert(c>0 && c<MAXCHARTABLE); return linear_table(char_to_index_tab[c]); } |
|---|
| 211 | const SepBaseFreq& table(int c) const { ct_assert(c>0 && c<MAXCHARTABLE); return linear_table(char_to_index_tab[c]); } |
|---|
| 212 | |
|---|
| 213 | static unsigned char index_to_upperChar(int index) { return upper_index_chars[index]; } |
|---|
| 214 | |
|---|
| 215 | void incr_short(unsigned char c, int offset); |
|---|
| 216 | void decr_short(unsigned char c, int offset); |
|---|
| 217 | void incr_long(unsigned char c, int offset); |
|---|
| 218 | void decr_long(unsigned char c, int offset); |
|---|
| 219 | |
|---|
| 220 | #if defined(TEST_CHAR_TABLE_INTEGRITY) |
|---|
| 221 | void test() const; // test if table is valid (dumps core if invalid) |
|---|
| 222 | #else |
|---|
| 223 | void test() const {} |
|---|
| 224 | #endif |
|---|
| 225 | |
|---|
| 226 | public: |
|---|
| 227 | |
|---|
| 228 | #if defined(TEST_CHAR_TABLE_INTEGRITY) || defined(ASSERTION_USED) |
|---|
| 229 | bool ok() const; |
|---|
| 230 | bool empty() const; |
|---|
| 231 | #endif |
|---|
| 232 | |
|---|
| 233 | BaseFrequencies(int maxseqlength=0); |
|---|
| 234 | ~BaseFrequencies(); |
|---|
| 235 | |
|---|
| 236 | static void setup(const char *gap_chars, GB_alignment_type ali_type_); |
|---|
| 237 | |
|---|
| 238 | void ignore_me() { ignore = 1; } |
|---|
| 239 | int is_ignored() const { return ignore; } |
|---|
| 240 | |
|---|
| 241 | void init(int maxseqlength); |
|---|
| 242 | int size() const { return bases_table[0]->size(); } |
|---|
| 243 | int added_sequences() const { return sequenceUnits/unitsPerSequence; } |
|---|
| 244 | |
|---|
| 245 | void bases_and_gaps_at(int column, int *bases, int *gaps) const; |
|---|
| 246 | double max_frequency_at(int column, bool ignore_gaps) const; |
|---|
| 247 | |
|---|
| 248 | static bool isGap(char c) { return is_gap[safeCharIndex(c)]; } |
|---|
| 249 | |
|---|
| 250 | const PosRange *changed_range(const BaseFrequencies& other) const; |
|---|
| 251 | static const PosRange *changed_range(const char *string1, const char *string2, int min_len); |
|---|
| 252 | |
|---|
| 253 | void add(const BaseFrequencies& other); |
|---|
| 254 | void sub(const BaseFrequencies& other); |
|---|
| 255 | void sub_and_add(const BaseFrequencies& Sub, const BaseFrequencies& Add, PosRange range); |
|---|
| 256 | |
|---|
| 257 | void add(const char *string, int len); |
|---|
| 258 | void sub(const char *string, int len); |
|---|
| 259 | void sub_and_add(const char *old_string, const char *new_string, PosRange range); |
|---|
| 260 | |
|---|
| 261 | void build_consensus_string_to(char *consensus_string, ExplicitRange range, const ConsensusBuildParams& BK) const; |
|---|
| 262 | char *build_consensus_string(PosRange r, const ConsensusBuildParams& cbp) const; |
|---|
| 263 | char *build_consensus_string(const ConsensusBuildParams& cbp) const { return build_consensus_string(PosRange::whole(), cbp); } |
|---|
| 264 | |
|---|
| 265 | void change_table_length(int new_length); |
|---|
| 266 | }; |
|---|
| 267 | |
|---|
| 268 | #else |
|---|
| 269 | #error chartable.h included twice |
|---|
| 270 | #endif // CHARTABLE_H |
|---|