| 1 | // ============================================================ // |
|---|
| 2 | // // |
|---|
| 3 | // File : FilteredExport.h // |
|---|
| 4 | // Purpose : encapsulate SAI-filtered fasta exporter // |
|---|
| 5 | // // |
|---|
| 6 | // Coded by Ralf Westram (coder@reallysoft.de) in June 2017 // |
|---|
| 7 | // http://www.arb-home.de/ // |
|---|
| 8 | // // |
|---|
| 9 | // ============================================================ // |
|---|
| 10 | |
|---|
| 11 | #ifndef FILTEREDEXPORT_H |
|---|
| 12 | #define FILTEREDEXPORT_H |
|---|
| 13 | |
|---|
| 14 | #ifndef AP_FILTER_HXX |
|---|
| 15 | #include <AP_filter.hxx> |
|---|
| 16 | #endif |
|---|
| 17 | #ifndef _GLIBCXX_STRING |
|---|
| 18 | #include <string> |
|---|
| 19 | #endif |
|---|
| 20 | #ifndef _STDINT_H |
|---|
| 21 | #include <stdint.h> |
|---|
| 22 | #endif |
|---|
| 23 | |
|---|
// Selects how a FilterDefinition interprets its character set:
// BLOCK -> the characters are blocking, PASS -> the characters are permeable.
enum FilterDefType {
    BLOCK,
    PASS
};
|---|
| 25 | |
|---|
class CharRangeTable {
    // Easy translation of character ranges into a lookup table (e.g. "a-zA-Z").
    //
    // - '-' at start or end of the argument string gets accepted as plain char!
    // - Only forward ranges consisting exclusively of alphanumeric characters
    //   get expanded. Any other range "x-y" is inserted literally, i.e. as the
    //   three characters 'x', '-' and 'y'.
    // - isalnum() is used for the alphanumeric test, so for bytes above 127 the
    //   result depends on the active C locale.

    bool table[256]; // indexed by unsigned character value; true = character is member of set

    // Returns true if [from, to] is a forward range containing alphanumeric characters only.
    static bool isExpandableRange(uint8_t from, uint8_t to) {
        if (from > to) return false; // only expand forward ranges
        for (unsigned b = from; b <= to; ++b) {
            if (!isalnum(b)) return false; // expanding non-alphanumeric characters would be dangerous
        }
        return true;
    }

public:
    // Builds the table from 'chars' (a null pointer yields an empty table).
    // Intentionally not 'explicit': kept convertible as before.
    CharRangeTable(const char *chars) :
        table() // value-initialization clears all entries (independent of sizeof(bool))
    {
        if (!chars) return;

        uint8_t prevchar = 0; // previously seen character (0 = none yet)
        for (int i = 0; chars[i]; ++i) {
            const uint8_t c = chars[i];
            if (c == '-' && prevchar) {
                const uint8_t toChar = chars[i+1];
                if (toChar) {
                    ++i; // consume end-of-range character

                    if (isExpandableRange(prevchar, toChar)) {
                        for (unsigned b = prevchar; b <= toChar; ++b) {
                            table[b] = true;
                        }
                    }
                    else { // do not expand -> insert literally
                        table[prevchar] = true;
                        table['-']      = true;
                        table[toChar]   = true;
                    }
                }
                else {
                    table[c] = true; // '-' at end
                }
            }
            else {
                table[c] = true;
            }
            // note: after a range this is '-' (not the end-of-range character)
            prevchar = c;
        }
    }

    // Returns true if character 'i' is member of the set.
    bool isSet(uint8_t i) const { return table[i]; }

    // Returns all member characters in ascending order as a 0-terminated string.
    // The result lives in a static buffer shared by all instances:
    // it gets overwritten by the next call and must not be freed.
    const char *expandedRange() const {
        static char buf[256+1]; // room for every possible character plus terminator

        int used = 0;
        for (unsigned code = 0; code < 256; ++code) {
            if (table[code]) {
                buf[used++] = char(code);
            }
        }
        buf[used] = 0;
        return buf;
    }
};
|---|
| 87 | |
|---|
| 88 | class FilterDefinition { |
|---|
| 89 | FilterDefType type; |
|---|
| 90 | |
|---|
| 91 | std::string sai_name; |
|---|
| 92 | std::string characters; // type == BLOCK -> blocking characters; type==PASS -> permeable characters |
|---|
| 93 | |
|---|
| 94 | bool inverse; // true -> do not use 'characters', use rest of ASCII set |
|---|
| 95 | |
|---|
| 96 | public: |
|---|
| 97 | FilterDefinition(const char *sai_name_, FilterDefType type_, bool filter_chars, const char *characters_) : |
|---|
| 98 | type(type_), |
|---|
| 99 | sai_name(sai_name_), |
|---|
| 100 | characters(characters_), |
|---|
| 101 | inverse(!filter_chars) |
|---|
| 102 | {} |
|---|
| 103 | |
|---|
| 104 | FilterDefType get_type() const { return type; } |
|---|
| 105 | AP_filter *make_filter(GBDATA *gb_main, const char *aliName, size_t aliSize) const; |
|---|
| 106 | }; |
|---|
| 107 | |
|---|
| 108 | |
|---|
| 109 | class FilteredExport : virtual Noncopyable { |
|---|
| 110 | GBDATA *gb_main; |
|---|
| 111 | char *aliname; |
|---|
| 112 | size_t alisize; |
|---|
| 113 | |
|---|
| 114 | bool accept_missing_data; |
|---|
| 115 | |
|---|
| 116 | char *header_ACI; |
|---|
| 117 | char *sequence_ACI; |
|---|
| 118 | |
|---|
| 119 | // min requirements for export (which chars to count + min. counts required) |
|---|
| 120 | CharRangeTable count_table; |
|---|
| 121 | int minCount; |
|---|
| 122 | |
|---|
| 123 | |
|---|
| 124 | AP_filter filter; |
|---|
| 125 | bool filter_added; // add_SAI_filter called yet? |
|---|
| 126 | |
|---|
| 127 | char *get_filtered_sequence(GBDATA *gb_species, const char*& reason) const; |
|---|
| 128 | char *get_fasta_header(GBDATA *gb_species) const; // w/o leading '>' |
|---|
| 129 | |
|---|
| 130 | #if defined(UNIT_TESTS) |
|---|
| 131 | friend void TEST_FilteredExport(); // allow test inspection |
|---|
| 132 | #endif |
|---|
| 133 | |
|---|
| 134 | int count_bases(const char *seq) const; |
|---|
| 135 | |
|---|
| 136 | public: |
|---|
| 137 | FilteredExport(GBDATA *gb_main_, const char *aliname_, size_t alisize_); |
|---|
| 138 | ~FilteredExport(); |
|---|
| 139 | |
|---|
| 140 | // configuration: |
|---|
| 141 | void do_accept_missing_data() { accept_missing_data = true; } |
|---|
| 142 | void set_required_baseCount(const char *basesToCount, int minCount_) { |
|---|
| 143 | minCount = minCount_; |
|---|
| 144 | count_table = CharRangeTable(basesToCount); |
|---|
| 145 | arb_assert(implicated(minCount>0, basesToCount)); |
|---|
| 146 | } |
|---|
| 147 | void reset_required_baseCount() { set_required_baseCount(NULp, 0); } |
|---|
| 148 | void set_header_ACI(const char *aci) { freedup(header_ACI, aci); } |
|---|
| 149 | void set_sequence_ACI(const char *aci) { freedup(sequence_ACI, aci); } |
|---|
| 150 | GB_ERROR add_SAI_filter(const FilterDefinition& filterDef) __ATTR__USERESULT; |
|---|
| 151 | void clear_SAI_filters() { |
|---|
| 152 | filter = AP_filter(alisize); |
|---|
| 153 | filter_added = false; |
|---|
| 154 | } |
|---|
| 155 | |
|---|
| 156 | // access: |
|---|
| 157 | const char *get_aliname() const { |
|---|
| 158 | return aliname; |
|---|
| 159 | } |
|---|
| 160 | |
|---|
| 161 | // action: |
|---|
| 162 | GB_ERROR write_fasta(FILE *out); |
|---|
| 163 | }; |
|---|
| 164 | |
|---|
| 165 | |
|---|
| 166 | #else |
|---|
| 167 | #error FilteredExport.h included twice |
|---|
| 168 | #endif // FILTEREDEXPORT_H |
|---|