1 | // ============================================================ // |
---|
2 | // // |
---|
3 | // File : FilteredExport.h // |
---|
4 | // Purpose : encapsulate SAI-filtered fasta exporter // |
---|
5 | // // |
---|
6 | // Coded by Ralf Westram (coder@reallysoft.de) in June 2017 // |
---|
7 | // http://www.arb-home.de/ // |
---|
8 | // // |
---|
9 | // ============================================================ // |
---|
10 | |
---|
11 | #ifndef FILTEREDEXPORT_H |
---|
12 | #define FILTEREDEXPORT_H |
---|
13 | |
---|
14 | #ifndef AP_FILTER_HXX |
---|
15 | #include <AP_filter.hxx> |
---|
16 | #endif |
---|
17 | #ifndef _GLIBCXX_STRING |
---|
18 | #include <string> |
---|
19 | #endif |
---|
20 | #ifndef _STDINT_H |
---|
21 | #include <stdint.h> |
---|
22 | #endif |
---|
23 | |
---|
// Kind of a filter definition (see FilterDefinition::characters):
// BLOCK -> the listed characters block, PASS -> the listed characters are permeable.
enum FilterDefType {
    BLOCK,
    PASS
};
---|
25 | |
---|
class CharRangeTable {
    // Easy translation of character ranges into a lookup table (e.g. "a-zA-Z").
    //
    // Rules implemented by the constructor:
    // - a range "x-y" is expanded only if it is a forward range (x<=y) and
    //   every character from x to y is alphanumeric (as reported by isalnum).
    // - a range that is not expanded is inserted literally (x, '-' and y).
    // - '-' at start or end of the argument string gets accepted as plain char.
    // - a '-' directly following a range (e.g. the 2nd '-' in "a-c-e") does not
    //   start a new expanded range: it is inserted literally together with the
    //   character following it.

    bool table[256]; // table[c] == true -> character c is member of the set

    // Returns true if 'from'..'to' is a forward range consisting of alphanumeric characters only.
    // (expanding ranges containing non-alphanumeric characters is considered dangerous)
    static bool is_expandable_range(uint8_t from, uint8_t to) {
        if (from>to) return false; // only expand forward ranges
        for (unsigned b = from; b<=to; ++b) {
            if (!isalnum(b)) return false;
        }
        return true;
    }

public:
    // Builds the table from 'chars'.
    // 'chars' may be a null pointer; this results in an empty table (same as an empty string).
    CharRangeTable(const char *chars) :
        table() // value-initialize -> all entries false (independent of sizeof(bool))
    {
        if (!chars) return;

        uint8_t prevchar = 0; // character handled in previous iteration (0 = none yet)
        for (int i = 0; chars[i]; ++i) {
            const uint8_t c = chars[i];

            // a '-' only acts as range operator if something precedes AND follows it:
            const uint8_t toChar = (c == '-' && prevchar) ? chars[i+1] : 0;

            if (toChar) {
                ++i; // consume end-of-range character

                if (is_expandable_range(prevchar, toChar)) {
                    for (unsigned b = prevchar; b<=toChar; ++b) {
                        table[b] = true;
                    }
                }
                else { // do not expand -> insert literally
                    table[prevchar] = true;
                    table['-']      = true;
                    table[toChar]   = true;
                }
            }
            else {
                table[c] = true; // plain character (including '-' at start or end)
            }

            // Note: after a range has been handled, prevchar is '-' (not the range end).
            prevchar = c;
        }
    }

    // Returns true if character 'i' is member of the set.
    bool isSet(uint8_t i) const { return table[i]; }

    // Returns all member characters as 0-terminated string (ordered by ascending character code).
    // The result points to a function-local static buffer, i.e. it gets overwritten
    // by the next call of expandedRange() (on any CharRangeTable instance).
    const char *expandedRange() const {
        static char buf[256+1];
        int         used = 0;
        for (unsigned i = 0; i<256; ++i) {
            if (table[i]) {
                buf[used++] = char(i);
            }
        }
        buf[used] = 0;
        return buf;
    }
};
---|
87 | |
---|
88 | class FilterDefinition { |
---|
89 | FilterDefType type; |
---|
90 | |
---|
91 | std::string sai_name; |
---|
92 | std::string characters; // type == BLOCK -> blocking characters; type==PASS -> permeable characters |
---|
93 | |
---|
94 | bool inverse; // true -> do not use 'characters', use rest of ASCII set |
---|
95 | |
---|
96 | public: |
---|
97 | FilterDefinition(const char *sai_name_, FilterDefType type_, bool filter_chars, const char *characters_) : |
---|
98 | type(type_), |
---|
99 | sai_name(sai_name_), |
---|
100 | characters(characters_), |
---|
101 | inverse(!filter_chars) |
---|
102 | {} |
---|
103 | |
---|
104 | FilterDefType get_type() const { return type; } |
---|
105 | AP_filter *make_filter(GBDATA *gb_main, const char *aliName, size_t aliSize) const; |
---|
106 | }; |
---|
107 | |
---|
108 | |
---|
109 | class FilteredExport : virtual Noncopyable { |
---|
110 | GBDATA *gb_main; |
---|
111 | char *aliname; |
---|
112 | size_t alisize; |
---|
113 | |
---|
114 | bool accept_missing_data; |
---|
115 | |
---|
116 | char *header_ACI; |
---|
117 | char *sequence_ACI; |
---|
118 | |
---|
119 | // min requirements for export (which chars to count + min. counts required) |
---|
120 | CharRangeTable count_table; |
---|
121 | int minCount; |
---|
122 | |
---|
123 | |
---|
124 | AP_filter filter; |
---|
125 | bool filter_added; // add_SAI_filter called yet? |
---|
126 | |
---|
127 | char *get_filtered_sequence(GBDATA *gb_species, const char*& reason) const; |
---|
128 | char *get_fasta_header(GBDATA *gb_species) const; // w/o leading '>' |
---|
129 | |
---|
130 | #if defined(UNIT_TESTS) |
---|
131 | friend void TEST_FilteredExport(); // allow test inspection |
---|
132 | #endif |
---|
133 | |
---|
134 | int count_bases(const char *seq) const; |
---|
135 | |
---|
136 | public: |
---|
137 | FilteredExport(GBDATA *gb_main_, const char *aliname_, size_t alisize_); |
---|
138 | ~FilteredExport(); |
---|
139 | |
---|
140 | // configuration: |
---|
141 | void do_accept_missing_data() { accept_missing_data = true; } |
---|
142 | void set_required_baseCount(const char *basesToCount, int minCount_) { |
---|
143 | minCount = minCount_; |
---|
144 | count_table = CharRangeTable(basesToCount); |
---|
145 | arb_assert(implicated(minCount>0, basesToCount)); |
---|
146 | } |
---|
147 | void reset_required_baseCount() { set_required_baseCount(NULp, 0); } |
---|
148 | void set_header_ACI(const char *aci) { freedup(header_ACI, aci); } |
---|
149 | void set_sequence_ACI(const char *aci) { freedup(sequence_ACI, aci); } |
---|
150 | GB_ERROR add_SAI_filter(const FilterDefinition& filterDef) __ATTR__USERESULT; |
---|
151 | void clear_SAI_filters() { |
---|
152 | filter = AP_filter(alisize); |
---|
153 | filter_added = false; |
---|
154 | } |
---|
155 | |
---|
156 | // access: |
---|
157 | const char *get_aliname() const { |
---|
158 | return aliname; |
---|
159 | } |
---|
160 | |
---|
161 | // action: |
---|
162 | GB_ERROR write_fasta(FILE *out); |
---|
163 | }; |
---|
164 | |
---|
165 | |
---|
166 | #else |
---|
167 | #error FilteredExport.h included twice |
---|
168 | #endif // FILTEREDEXPORT_H |
---|