1 | // ================================================================= // |
---|
2 | // // |
---|
3 | // File : chartable.h // |
---|
4 | // Purpose : count characters of multiple sequences // |
---|
5 | // // |
---|
6 | // Institute of Microbiology (Technical University Munich) // |
---|
7 | // http://www.arb-home.de/ // |
---|
8 | // // |
---|
9 | // ================================================================= // |
---|
10 | |
---|
11 | #ifndef CHARTABLE_H |
---|
12 | #define CHARTABLE_H |
---|
13 | |
---|
14 | #ifndef ARBTOOLS_H |
---|
15 | #include <arbtools.h> |
---|
16 | #endif |
---|
17 | #ifndef POS_RANGE_H |
---|
18 | #include <pos_range.h> |
---|
19 | #endif |
---|
20 | #ifndef ARBDB_BASE_H |
---|
21 | #include <arbdb_base.h> |
---|
22 | #endif |
---|
23 | #ifndef _STDINT_H |
---|
24 | #include <stdint.h> |
---|
25 | #endif |
---|
26 | |
---|
27 | #define ct_assert(bed) arb_assert(bed) |
---|
28 | |
---|
29 | #ifdef DEBUG |
---|
30 | // # define TEST_BASES_TABLE |
---|
31 | #endif |
---|
32 | |
---|
33 | #if defined(DEBUG) && !defined(DEVEL_RELEASE) |
---|
34 | # define TEST_CHAR_TABLE_INTEGRITY |
---|
35 | #endif |
---|
36 | |
---|
37 | #define MAXCHARTABLE 256 |
---|
38 | #define SHORT_TABLE_ELEM_SIZE 1 |
---|
39 | #define SHORT_TABLE_MAX_VALUE 0xff |
---|
40 | #define LONG_TABLE_ELEM_SIZE 4 |
---|
41 | |
---|
42 | #define MAX_INDEX_TABLES 128 // max number of different non-ambiguous indices (into BaseFrequencies::bases_table) |
---|
43 | #define MAX_TARGET_INDICES 4 // max number of non-ambiguous indices affected by one ambiguous code (4 for 'N' in RNA/DNA, 2 for amino-acids) |
---|
44 | #define MAX_AMBIGUITY_CODES (1+4+6) // for RNA/DNA (3 for amino-acids) |
---|
45 | |
---|
46 | #define MAX_USED_BASES_TABLES 256 |
---|
47 | |
---|
48 | // Note: short tables convert to long tables when adding 22 sequences (for nucleotides) |
---|
49 | // * maybe remove short tables completely OR |
---|
50 | // * increase values size in short tables to 16 bit |
---|
51 | |
---|
52 | namespace chartable { |
---|
53 | |
---|
54 | struct Ambiguity { |
---|
55 | /*! define how a specific ambiguity code is counted |
---|
56 | */ |
---|
57 | |
---|
58 | uint8_t indices; // number of entries in bases_table affected by this ambiguity |
---|
59 | uint8_t increment; // how much each entry gets incremented |
---|
60 | uint8_t index[MAX_TARGET_INDICES]; // into BaseFrequencies::bases_table |
---|
61 | }; |
---|
62 | |
---|
63 | class SepBaseFreq : virtual Noncopyable { |
---|
64 | /*! counts occurrences of one specific base (e.g. 'A') or a group of bases, |
---|
65 | * separately for each alignment position of multiple input sequences. |
---|
66 | */ |
---|
67 | int table_entry_size; // how many bytes are used for each element of 'no_of_bases' (1 or 4 bytes) |
---|
68 | union { |
---|
69 | unsigned char *shortTable; |
---|
70 | int *longTable; |
---|
71 | } no_of_bases; // counts bases for each sequence position |
---|
72 | int no_of_entries; // length of bases table |
---|
73 | |
---|
74 | int legal(int offset) const { return offset>=0 && offset<no_of_entries; } |
---|
75 | |
---|
76 | void set_elem_long(int offset, int value) { |
---|
77 | #ifdef TEST_BASES_TABLE |
---|
78 | ct_assert(legal(offset)); |
---|
79 | ct_assert(table_entry_size==LONG_TABLE_ELEM_SIZE); |
---|
80 | #endif |
---|
81 | no_of_bases.longTable[offset] = value; |
---|
82 | } |
---|
83 | |
---|
84 | void set_elem_short(int offset, int value) { |
---|
85 | #ifdef TEST_BASES_TABLE |
---|
86 | ct_assert(legal(offset)); |
---|
87 | ct_assert(table_entry_size==SHORT_TABLE_ELEM_SIZE); |
---|
88 | ct_assert(value>=0 && value<=SHORT_TABLE_MAX_VALUE); |
---|
89 | #endif |
---|
90 | no_of_bases.shortTable[offset] = value; |
---|
91 | } |
---|
92 | |
---|
93 | int get_elem_long(int offset) const { |
---|
94 | #ifdef TEST_BASES_TABLE |
---|
95 | ct_assert(legal(offset)); |
---|
96 | ct_assert(table_entry_size==LONG_TABLE_ELEM_SIZE); |
---|
97 | #endif |
---|
98 | return no_of_bases.longTable[offset]; |
---|
99 | } |
---|
100 | |
---|
101 | int get_elem_short(int offset) const { |
---|
102 | #ifdef TEST_BASES_TABLE |
---|
103 | ct_assert(legal(offset)); |
---|
104 | ct_assert(table_entry_size==SHORT_TABLE_ELEM_SIZE); |
---|
105 | #endif |
---|
106 | return no_of_bases.shortTable[offset]; |
---|
107 | } |
---|
108 | |
---|
109 | public: |
---|
110 | |
---|
111 | SepBaseFreq(int maxseqlength); |
---|
112 | ~SepBaseFreq(); |
---|
113 | |
---|
114 | void init(int length); |
---|
115 | int size() const { return no_of_entries; } |
---|
116 | |
---|
117 | int get_table_entry_size() const { return table_entry_size; } |
---|
118 | void expand_table_entry_size(); |
---|
119 | bool bigger_table_entry_size_needed(int new_max_count) { return table_entry_size==SHORT_TABLE_ELEM_SIZE ? (new_max_count>SHORT_TABLE_MAX_VALUE) : 0; } |
---|
120 | |
---|
121 | int operator[](int offset) const { return table_entry_size==SHORT_TABLE_ELEM_SIZE ? get_elem_short(offset) : get_elem_long(offset); } |
---|
122 | |
---|
123 | void inc_short(int offset, int incr) { |
---|
124 | int old = get_elem_short(offset); |
---|
125 | ct_assert((old+incr)<=255); |
---|
126 | set_elem_short(offset, old+incr); |
---|
127 | } |
---|
128 | void dec_short(int offset, int decr) { |
---|
129 | int old = get_elem_short(offset); |
---|
130 | ct_assert(old>=decr); |
---|
131 | set_elem_short(offset, old-decr); |
---|
132 | } |
---|
133 | void inc_long(int offset, int incr) { |
---|
134 | int old = get_elem_long(offset); |
---|
135 | set_elem_long(offset, old+incr); |
---|
136 | } |
---|
137 | void dec_long(int offset, int decr) { |
---|
138 | int old = get_elem_long(offset); |
---|
139 | ct_assert(old>=decr); |
---|
140 | set_elem_long(offset, old-decr); |
---|
141 | } |
---|
142 | |
---|
143 | int firstDifference(const SepBaseFreq& other, int start, int end, int *firstDifferentPos) const; |
---|
144 | int lastDifference(const SepBaseFreq& other, int start, int end, int *lastDifferentPos) const; |
---|
145 | |
---|
146 | void add(const SepBaseFreq& other, int start, int end); |
---|
147 | void sub(const SepBaseFreq& other, int start, int end); |
---|
148 | void sub_and_add(const SepBaseFreq& Sub, const SepBaseFreq& Add, PosRange range); |
---|
149 | |
---|
150 | void change_table_length(int new_length, int default_entry); |
---|
151 | |
---|
152 | |
---|
153 | #if defined(TEST_CHAR_TABLE_INTEGRITY) || defined(ASSERTION_USED) |
---|
154 | int empty() const; |
---|
155 | #endif // TEST_CHAR_TABLE_INTEGRITY |
---|
156 | }; |
---|
157 | |
---|
158 | }; |
---|
159 | |
---|
160 | class ConsensusBuildParams; |
---|
161 | |
---|
162 | class BaseFrequencies : virtual Noncopyable { |
---|
163 | /*! counts occurrences of ALL bases of multiple sequences separately |
---|
164 | * for all alignment positions. |
---|
165 | * |
---|
166 | * Bases are clustered in groups of bases (alignment type dependent). |
---|
167 | */ |
---|
168 | |
---|
169 | typedef chartable::SepBaseFreq SepBaseFreq; |
---|
170 | typedef chartable::Ambiguity Ambiguity; |
---|
171 | typedef SepBaseFreq *SepBaseFreqPtr; |
---|
172 | |
---|
173 | SepBaseFreqPtr *bases_table; |
---|
174 | |
---|
175 | int sequenceUnits; // # of sequences added to the table (multiplied with 'units') |
---|
176 | bool ignore; // this table will be ignored when calculating tables higher in hierarchy |
---|
177 | // (used in EDIT4 to suppress SAIs in tables of ED4_root_group_manager) |
---|
178 | |
---|
179 | // @@@ move statics into own class: |
---|
180 | static bool initialized; |
---|
181 | static Ambiguity ambiguity_table[MAX_AMBIGUITY_CODES]; |
---|
182 | static uint8_t unitsPerSequence; // multiplier per added sequence (to allow proper distribution of ambiguity codes) |
---|
183 | static unsigned char char_to_index_tab[MAXCHARTABLE]; // <MAX_INDEX_TABLES = > real index into 'bases_table', else index+MAX_INDEX_TABLES into ambiguity_table |
---|
184 | static bool is_gap[MAXCHARTABLE]; |
---|
185 | static unsigned char upper_index_chars[MAX_USED_BASES_TABLES]; |
---|
186 | static int used_bases_tables; // size of 'bases_table' |
---|
187 | static GB_alignment_type ali_type; |
---|
188 | |
---|
189 | static inline void set_char_to_index(unsigned char c, int index); |
---|
190 | |
---|
191 | void add(const BaseFrequencies& other, int start, int end); |
---|
192 | void sub(const BaseFrequencies& other, int start, int end); |
---|
193 | |
---|
194 | void expand_tables(); |
---|
195 | int get_table_entry_size() const { |
---|
196 | return linear_table(0).get_table_entry_size(); |
---|
197 | } |
---|
198 | void prepare_to_add_elements(int new_sequences) { |
---|
199 | ct_assert(used_bases_tables); |
---|
200 | if (linear_table(0).bigger_table_entry_size_needed(sequenceUnits+new_sequences*unitsPerSequence)) { |
---|
201 | expand_tables(); |
---|
202 | } |
---|
203 | } |
---|
204 | |
---|
205 | // linear access to all tables |
---|
206 | SepBaseFreq& linear_table(int c) { ct_assert(c<used_bases_tables); return *bases_table[c]; } |
---|
207 | const SepBaseFreq& linear_table(int c) const { ct_assert(c<used_bases_tables); return *bases_table[c]; } |
---|
208 | |
---|
209 | // access via character |
---|
210 | SepBaseFreq& table(int c) { ct_assert(c>0 && c<MAXCHARTABLE); return linear_table(char_to_index_tab[c]); } |
---|
211 | const SepBaseFreq& table(int c) const { ct_assert(c>0 && c<MAXCHARTABLE); return linear_table(char_to_index_tab[c]); } |
---|
212 | |
---|
213 | static unsigned char index_to_upperChar(int index) { return upper_index_chars[index]; } |
---|
214 | |
---|
215 | void incr_short(unsigned char c, int offset); |
---|
216 | void decr_short(unsigned char c, int offset); |
---|
217 | void incr_long(unsigned char c, int offset); |
---|
218 | void decr_long(unsigned char c, int offset); |
---|
219 | |
---|
220 | #if defined(TEST_CHAR_TABLE_INTEGRITY) |
---|
221 | void test() const; // test if table is valid (dumps core if invalid) |
---|
222 | #else |
---|
223 | void test() const {} |
---|
224 | #endif |
---|
225 | |
---|
226 | public: |
---|
227 | |
---|
228 | #if defined(TEST_CHAR_TABLE_INTEGRITY) || defined(ASSERTION_USED) |
---|
229 | bool ok() const; |
---|
230 | bool empty() const; |
---|
231 | #endif |
---|
232 | |
---|
233 | BaseFrequencies(int maxseqlength=0); |
---|
234 | ~BaseFrequencies(); |
---|
235 | |
---|
236 | static void setup(const char *gap_chars, GB_alignment_type ali_type_); |
---|
237 | |
---|
238 | void ignore_me() { ignore = 1; } |
---|
239 | int is_ignored() const { return ignore; } |
---|
240 | |
---|
241 | void init(int maxseqlength); |
---|
242 | int size() const { return bases_table[0]->size(); } |
---|
243 | int added_sequences() const { return sequenceUnits/unitsPerSequence; } |
---|
244 | |
---|
245 | void bases_and_gaps_at(int column, int *bases, int *gaps) const; |
---|
246 | double max_frequency_at(int column, bool ignore_gaps) const; |
---|
247 | |
---|
248 | static bool isGap(char c) { return is_gap[safeCharIndex(c)]; } |
---|
249 | |
---|
250 | const PosRange *changed_range(const BaseFrequencies& other) const; |
---|
251 | static const PosRange *changed_range(const char *string1, const char *string2, int min_len); |
---|
252 | |
---|
253 | void add(const BaseFrequencies& other); |
---|
254 | void sub(const BaseFrequencies& other); |
---|
255 | void sub_and_add(const BaseFrequencies& Sub, const BaseFrequencies& Add, PosRange range); |
---|
256 | |
---|
257 | void add(const char *string, int len); |
---|
258 | void sub(const char *string, int len); |
---|
259 | void sub_and_add(const char *old_string, const char *new_string, PosRange range); |
---|
260 | |
---|
261 | void build_consensus_string_to(char *consensus_string, ExplicitRange range, const ConsensusBuildParams& BK) const; |
---|
262 | char *build_consensus_string(PosRange r, const ConsensusBuildParams& cbp) const; |
---|
263 | char *build_consensus_string(const ConsensusBuildParams& cbp) const { return build_consensus_string(PosRange::whole(), cbp); } |
---|
264 | |
---|
265 | void change_table_length(int new_length); |
---|
266 | }; |
---|
267 | |
---|
268 | #else |
---|
269 | #error chartable.h included twice |
---|
270 | #endif // CHARTABLE_H |
---|