| 1 | // ============================================================ // |
|---|
| 2 | // // |
|---|
| 3 | // File : arb_export_seq_filtered.cxx // |
|---|
| 4 | // Purpose : export sequences filtered by SAI // |
|---|
| 5 | // // |
|---|
| 6 | // Coded by Ralf Westram (coder@reallysoft.de) in June 2017 // |
|---|
| 7 | // http://www.arb-home.de/ // |
|---|
| 8 | // // |
|---|
| 9 | // ============================================================ // |
|---|
| 10 | |
|---|
| 11 | #include <FilteredExport.h> |
|---|
| 12 | #include <arbdbt.h> |
|---|
| 13 | #include <arb_file.h> |
|---|
| 14 | #include <string> |
|---|
| 15 | #include <list> |
|---|
| 16 | #include <cstdlib> |
|---|
| 17 | |
|---|
| 18 | using namespace std; |
|---|
| 19 | |
|---|
| 20 | class CLI : virtual Noncopyable { |
|---|
| 21 | bool helpWanted; |
|---|
| 22 | GB_ERROR error; |
|---|
| 23 | |
|---|
| 24 | string database; // name of input database |
|---|
| 25 | string fasta; // name of FASTA output file |
|---|
| 26 | string aliname; // name of alignment to use |
|---|
| 27 | string head_aci; // ACI used to create content of fasta header |
|---|
| 28 | string seq_aci; // ACI used to post-process sequence data |
|---|
| 29 | |
|---|
| 30 | bool accept_missing_data; // true -> skip sequences with less bases in data |
|---|
| 31 | |
|---|
| 32 | int min_bases; // skip sequences with less than 'min_bases' base-characters in data |
|---|
| 33 | string bases; // defines what counts as base-character |
|---|
| 34 | |
|---|
| 35 | string filterby; // name of last SAI specified with "--filterby" |
|---|
| 36 | |
|---|
| 37 | typedef SmartPtr<FilterDefinition> FilterDefinitionPtr; |
|---|
| 38 | typedef std::list<FilterDefinitionPtr> FilterDefinitionList; |
|---|
| 39 | |
|---|
| 40 | FilterDefinitionList def_filters; |
|---|
| 41 | |
|---|
| 42 | static inline const char *getarg(int& argc, const char**& argv) { |
|---|
| 43 | return argc>0 ? (--argc,*argv++) : NULp; |
|---|
| 44 | } |
|---|
| 45 | inline const char *expect_arg(int& argc, const char**& argv) { |
|---|
| 46 | const char *arg = getarg(argc, argv); |
|---|
| 47 | if (!arg) { |
|---|
| 48 | error = "expected argument missing"; |
|---|
| 49 | arg = ""; |
|---|
| 50 | } |
|---|
| 51 | return arg; |
|---|
| 52 | } |
|---|
| 53 | void show_help() const { |
|---|
| 54 | fputs("\n" |
|---|
| 55 | "arb_export_seq_filtered -- export sequences filtered by SAI\n" |
|---|
| 56 | "Usage: arb_export_seq_filtered [switches]\n" |
|---|
| 57 | "\n" |
|---|
| 58 | "general:\n" |
|---|
| 59 | " <chars> defines a set of characters (for blocking/passthrough).\n" |
|---|
| 60 | " It may contain alphanumeric ranges, e.g. '0-5' which is equiv. to '012345'.\n" |
|---|
| 61 | " To specify a literal '-' put it at start or end of <chars>.\n" |
|---|
| 62 | "\n" |
|---|
| 63 | "switches:\n" |
|---|
| 64 | "--db <dbname> ARB database to export from\n" |
|---|
| 65 | "--fasta <outname> name of generated fasta file\n" |
|---|
| 66 | "\n" |
|---|
| 67 | "--id <aci> specify content for fasta header using ACI\n" |
|---|
| 68 | " (default: \"readdb(name)\"; see http://help.arb-home.de/aci.html)\n" |
|---|
| 69 | "\n" |
|---|
| 70 | "--ali <aliname> name of alignment to use (default: use default alignment)\n" |
|---|
| 71 | "--accept-missing-data silently skip species w/o data (default: abort with error)\n" |
|---|
| 72 | "--seqpp <aci> specify ACI to postprocess sequence data (default: none; example dot->hyphen: \":.=-\")\n" |
|---|
| 73 | "\n" |
|---|
| 74 | "--filterby <SAI> filter sequences using <SAI> (SAI has to be stored in database)\n" |
|---|
| 75 | "--block [allbut] <chars> exclude columns where <SAI> contains any of <chars>\n" |
|---|
| 76 | "--pass [allbut] <chars> include columns where <SAI> contains any of <chars>\n" |
|---|
| 77 | " (specifying 'allbut' will invert the set of <chars>)\n" |
|---|
| 78 | "\n" |
|---|
| 79 | "--min-bases <count> min. base count of exported data (after filtering).\n" |
|---|
| 80 | " Export of species is skipped if not reached and\n" |
|---|
| 81 | " a diagnostic message gets written to stderr.\n" |
|---|
| 82 | "--count-bases <chars> defines base-characters to count (default: \"a-zA-Z\")\n" |
|---|
| 83 | "\n" |
|---|
| 84 | ,stderr); |
|---|
| 85 | } |
|---|
| 86 | |
|---|
| 87 | const char *missing_passBlock() { return "--filterby has to be followed by --pass or --block"; } |
|---|
| 88 | const char *missing_filterby() { return "--pass and --block have to be preceeded by --filterby"; } |
|---|
| 89 | |
|---|
| 90 | void store_filterdef(FilterDefinition *def) { |
|---|
| 91 | if (def) def_filters.push_back(def); |
|---|
| 92 | } |
|---|
| 93 | FilterDefinition *expect_filterdef_args(FilterDefType type, int& argc, const char**& argv) { |
|---|
| 94 | FilterDefinition *result = NULp; |
|---|
| 95 | string arg = expect_arg(argc, argv); |
|---|
| 96 | if (!error) { |
|---|
| 97 | bool filter_chars = true; |
|---|
| 98 | |
|---|
| 99 | if (arg == "allbut") { |
|---|
| 100 | filter_chars = false; |
|---|
| 101 | arg = expect_arg(argc, argv); |
|---|
| 102 | } |
|---|
| 103 | |
|---|
| 104 | if (!error) { |
|---|
| 105 | if (filterby.empty()) { |
|---|
| 106 | error = missing_filterby(); |
|---|
| 107 | } |
|---|
| 108 | else { |
|---|
| 109 | result = new FilterDefinition(filterby.c_str(), type, filter_chars, arg.c_str()); |
|---|
| 110 | } |
|---|
| 111 | } |
|---|
| 112 | } |
|---|
| 113 | filterby = ""; // forget last filterby (even in error-case -> avoids some weird error-messages) |
|---|
| 114 | return result; |
|---|
| 115 | } |
|---|
| 116 | |
|---|
| 117 | void parse(int& argc, const char**& argv) { |
|---|
| 118 | const char *arg = getarg(argc, argv); |
|---|
| 119 | if (arg) { |
|---|
| 120 | if (strcmp(arg, "--db") == 0) database = expect_arg(argc, argv); |
|---|
| 121 | else if (strcmp(arg, "--fasta") == 0) fasta = expect_arg(argc, argv); |
|---|
| 122 | else if (strcmp(arg, "--id") == 0) head_aci = expect_arg(argc, argv); |
|---|
| 123 | else if (strcmp(arg, "--seqpp") == 0) seq_aci = expect_arg(argc, argv); |
|---|
| 124 | else if (strcmp(arg, "--ali") == 0) aliname = expect_arg(argc, argv); |
|---|
| 125 | else if (strcmp(arg, "--count-bases") == 0) bases = expect_arg(argc, argv); |
|---|
| 126 | |
|---|
| 127 | else if (strcmp(arg, "--filterby") == 0) { |
|---|
| 128 | if (!filterby.empty()) error = missing_passBlock(); |
|---|
| 129 | else filterby = expect_arg(argc, argv); |
|---|
| 130 | } |
|---|
| 131 | |
|---|
| 132 | else if (strcmp(arg, "--block") == 0) store_filterdef(expect_filterdef_args(BLOCK, argc, argv)); |
|---|
| 133 | else if (strcmp(arg, "--pass") == 0) store_filterdef(expect_filterdef_args(PASS, argc, argv)); |
|---|
| 134 | |
|---|
| 135 | else if (strcmp(arg, "--accept-missing-data") == 0) accept_missing_data = true; |
|---|
| 136 | |
|---|
| 137 | else if (strcmp(arg, "--min-bases") == 0) min_bases = atoi(expect_arg(argc, argv)); |
|---|
| 138 | |
|---|
| 139 | else if (strcmp(arg, "--help") == 0 || strcmp(arg, "-h") == 0) helpWanted = true; |
|---|
| 140 | |
|---|
| 141 | else { |
|---|
| 142 | error = GBS_global_string("unexpected argument '%s'", arg); |
|---|
| 143 | } |
|---|
| 144 | } |
|---|
| 145 | } |
|---|
| 146 | void check_required_arguments() { |
|---|
| 147 | if (database.empty()) error = "no input database specified"; |
|---|
| 148 | else if (fasta.empty()) error = "no output file specified"; |
|---|
| 149 | } |
|---|
| 150 | |
|---|
| 151 | public: |
|---|
| 152 | CLI(int argc, const char **argv) : |
|---|
| 153 | helpWanted(false), |
|---|
| 154 | error(NULp), |
|---|
| 155 | accept_missing_data(false), |
|---|
| 156 | min_bases(0), |
|---|
| 157 | bases("a-zA-Z") |
|---|
| 158 | { |
|---|
| 159 | --argc; ++argv; |
|---|
| 160 | while (!error && argc>0 && !helpWanted) { |
|---|
| 161 | parse(argc, argv); |
|---|
| 162 | } |
|---|
| 163 | |
|---|
| 164 | if (!helpWanted) { // do not check extended conditions, if '--help' seen |
|---|
| 165 | if (!error && !filterby.empty()) { |
|---|
| 166 | error = missing_passBlock(); |
|---|
| 167 | } |
|---|
| 168 | if (!error) { |
|---|
| 169 | check_required_arguments(); |
|---|
| 170 | if (error) helpWanted = true; |
|---|
| 171 | } |
|---|
| 172 | } |
|---|
| 173 | } |
|---|
| 174 | |
|---|
| 175 | bool help_wanted() const { return helpWanted; } |
|---|
| 176 | void show_help_if_useful() const { if (helpWanted) show_help(); } |
|---|
| 177 | GB_ERROR get_error() const { return error; } |
|---|
| 178 | |
|---|
| 179 | const char *get_database() const { return database.c_str(); } |
|---|
| 180 | const char *get_fasta() const { return fasta.c_str(); } |
|---|
| 181 | const char *get_aliname() const { return aliname.c_str(); } |
|---|
| 182 | const char *get_head_aci() const { return head_aci.c_str(); } |
|---|
| 183 | const char *get_seq_aci() const { return seq_aci.c_str(); } |
|---|
| 184 | const char *get_basechars() const { return bases.c_str(); } |
|---|
| 185 | |
|---|
| 186 | bool shall_accept_missing_data() const { return accept_missing_data; } |
|---|
| 187 | |
|---|
| 188 | int get_min_bases_count() const { return min_bases; } |
|---|
| 189 | |
|---|
| 190 | bool has_filters() const { return !def_filters.empty(); } |
|---|
| 191 | GB_ERROR add_filters_to(FilteredExport& exporter) const { |
|---|
| 192 | arb_assert(!error); |
|---|
| 193 | GB_ERROR err = NULp; |
|---|
| 194 | for (FilterDefinitionList::const_iterator d = def_filters.begin(); !err && d != def_filters.end(); ++d) { |
|---|
| 195 | err = exporter.add_SAI_filter(**d); |
|---|
| 196 | } |
|---|
| 197 | return err; |
|---|
| 198 | } |
|---|
| 199 | }; |
|---|
| 200 | |
|---|
| 201 | static GB_ERROR export_seq_data(const CLI& args) { |
|---|
| 202 | GB_ERROR error = NULp; |
|---|
| 203 | |
|---|
| 204 | const char *dbname = args.get_database(); |
|---|
| 205 | GB_shell shell; |
|---|
| 206 | GBDATA *gb_main = GB_open(dbname, "r"); |
|---|
| 207 | |
|---|
| 208 | if (!gb_main) { |
|---|
| 209 | error = GB_await_error(); |
|---|
| 210 | } |
|---|
| 211 | else { |
|---|
| 212 | { |
|---|
| 213 | GB_transaction ta(gb_main); |
|---|
| 214 | |
|---|
| 215 | char *aliname = strdup(args.get_aliname()); |
|---|
| 216 | if (aliname[0]) { |
|---|
| 217 | if (!GBT_get_alignment(gb_main, aliname)) { |
|---|
| 218 | error = GB_await_error(); |
|---|
| 219 | } |
|---|
| 220 | } |
|---|
| 221 | else { |
|---|
| 222 | freeset(aliname, GBT_get_default_alignment(gb_main)); |
|---|
| 223 | if (!aliname || !aliname[0]) { |
|---|
| 224 | GB_clear_error(); |
|---|
| 225 | error = "no default alignment defined (use --ali to specify alignment name)"; |
|---|
| 226 | } |
|---|
| 227 | else { |
|---|
| 228 | fprintf(stderr, "Using default alignment '%s'\n", aliname); |
|---|
| 229 | } |
|---|
| 230 | } |
|---|
| 231 | |
|---|
| 232 | if (!error) { |
|---|
| 233 | arb_assert(aliname && aliname[0]); |
|---|
| 234 | |
|---|
| 235 | long alisize = GBT_get_alignment_len(gb_main, aliname); |
|---|
| 236 | if (alisize<=0) { |
|---|
| 237 | error = GB_await_error(); |
|---|
| 238 | } |
|---|
| 239 | else { |
|---|
| 240 | const char *outname = args.get_fasta(); |
|---|
| 241 | FILE *out = fopen(outname, "wt"); |
|---|
| 242 | |
|---|
| 243 | if (!out) { |
|---|
| 244 | error = GB_IO_error("writing", outname); |
|---|
| 245 | } |
|---|
| 246 | else { |
|---|
| 247 | FilteredExport exporter(gb_main, aliname, alisize); |
|---|
| 248 | |
|---|
| 249 | if (args.get_head_aci()[0]) { |
|---|
| 250 | exporter.set_header_ACI(args.get_head_aci()); |
|---|
| 251 | } |
|---|
| 252 | if (args.shall_accept_missing_data()) { |
|---|
| 253 | exporter.do_accept_missing_data(); |
|---|
| 254 | } |
|---|
| 255 | if (args.get_min_bases_count()>0) { |
|---|
| 256 | exporter.set_required_baseCount(args.get_basechars(), args.get_min_bases_count()); |
|---|
| 257 | } |
|---|
| 258 | if (args.get_seq_aci()[0]) { |
|---|
| 259 | exporter.set_sequence_ACI(args.get_seq_aci()); |
|---|
| 260 | } |
|---|
| 261 | if (args.has_filters()) { |
|---|
| 262 | arb_assert(!error); |
|---|
| 263 | error = args.add_filters_to(exporter); |
|---|
| 264 | } |
|---|
| 265 | |
|---|
| 266 | if (!error) { |
|---|
| 267 | error = exporter.write_fasta(out); |
|---|
| 268 | } |
|---|
| 269 | fclose(out); |
|---|
| 270 | |
|---|
| 271 | if (error) GB_unlink_or_warn(outname, NULp); |
|---|
| 272 | } |
|---|
| 273 | } |
|---|
| 274 | } |
|---|
| 275 | free(aliname); |
|---|
| 276 | } |
|---|
| 277 | GB_close(gb_main); |
|---|
| 278 | } |
|---|
| 279 | |
|---|
| 280 | return error; |
|---|
| 281 | } |
|---|
| 282 | |
|---|
| 283 | int main(int argc, char **argv) { |
|---|
| 284 | GB_ERROR error = NULp; |
|---|
| 285 | CLI args(argc, const_cast<const char**>(argv)); |
|---|
| 286 | |
|---|
| 287 | if (!error) error = args.get_error(); |
|---|
| 288 | |
|---|
| 289 | if (!error && args.help_wanted()) { |
|---|
| 290 | args.show_help_if_useful(); |
|---|
| 291 | return EXIT_FAILURE; |
|---|
| 292 | } |
|---|
| 293 | |
|---|
| 294 | if (!error) error = export_seq_data(args); |
|---|
| 295 | |
|---|
| 296 | if (error) { |
|---|
| 297 | args.show_help_if_useful(); |
|---|
| 298 | fprintf(stderr, "Error in arb_export_seq_filtered: %s\n", error); |
|---|
| 299 | return EXIT_FAILURE; |
|---|
| 300 | } |
|---|
| 301 | return EXIT_SUCCESS; |
|---|
| 302 | } |
|---|
| 303 | |
|---|