1 | // ============================================================ // |
---|
2 | // // |
---|
3 | // File : arb_export_seq_filtered.cxx // |
---|
4 | // Purpose : export sequences filtered by SAI // |
---|
5 | // // |
---|
6 | // Coded by Ralf Westram (coder@reallysoft.de) in June 2017 // |
---|
7 | // http://www.arb-home.de/ // |
---|
8 | // // |
---|
9 | // ============================================================ // |
---|
10 | |
---|
11 | #include <FilteredExport.h> |
---|
12 | #include <arbdbt.h> |
---|
13 | #include <arb_file.h> |
---|
14 | #include <string> |
---|
15 | #include <list> |
---|
16 | #include <cstdlib> |
---|
17 | |
---|
18 | using namespace std; |
---|
19 | |
---|
20 | class CLI : virtual Noncopyable { |
---|
21 | bool helpWanted; |
---|
22 | GB_ERROR error; |
---|
23 | |
---|
24 | string database; // name of input database |
---|
25 | string fasta; // name of FASTA output file |
---|
26 | string aliname; // name of alignment to use |
---|
27 | string head_aci; // ACI used to create content of fasta header |
---|
28 | string seq_aci; // ACI used to post-process sequence data |
---|
29 | |
---|
30 | bool accept_missing_data; // true -> skip sequences with less bases in data |
---|
31 | |
---|
32 | int min_bases; // skip sequences with less than 'min_bases' base-characters in data |
---|
33 | string bases; // defines what counts as base-character |
---|
34 | |
---|
35 | string filterby; // name of last SAI specified with "--filterby" |
---|
36 | |
---|
37 | typedef SmartPtr<FilterDefinition> FilterDefinitionPtr; |
---|
38 | typedef std::list<FilterDefinitionPtr> FilterDefinitionList; |
---|
39 | |
---|
40 | FilterDefinitionList def_filters; |
---|
41 | |
---|
42 | static inline const char *getarg(int& argc, const char**& argv) { |
---|
43 | return argc>0 ? (--argc,*argv++) : NULp; |
---|
44 | } |
---|
45 | inline const char *expect_arg(int& argc, const char**& argv) { |
---|
46 | const char *arg = getarg(argc, argv); |
---|
47 | if (!arg) { |
---|
48 | error = "expected argument missing"; |
---|
49 | arg = ""; |
---|
50 | } |
---|
51 | return arg; |
---|
52 | } |
---|
53 | void show_help() const { |
---|
54 | fputs("\n" |
---|
55 | "arb_export_seq_filtered -- export sequences filtered by SAI\n" |
---|
56 | "Usage: arb_export_seq_filtered [switches]\n" |
---|
57 | "\n" |
---|
58 | "general:\n" |
---|
59 | " <chars> defines a set of characters (for blocking/passthrough).\n" |
---|
60 | " It may contain alphanumeric ranges, e.g. '0-5' which is equiv. to '012345'.\n" |
---|
61 | " To specify a literal '-' put it at start or end of <chars>.\n" |
---|
62 | "\n" |
---|
63 | "switches:\n" |
---|
64 | "--db <dbname> ARB database to export from\n" |
---|
65 | "--fasta <outname> name of generated fasta file\n" |
---|
66 | "\n" |
---|
67 | "--id <aci> specify content for fasta header using ACI\n" |
---|
68 | " (default: \"readdb(name)\"; see http://help.arb-home.de/aci.html)\n" |
---|
69 | "\n" |
---|
70 | "--ali <aliname> name of alignment to use (default: use default alignment)\n" |
---|
71 | "--accept-missing-data silently skip species w/o data (default: abort with error)\n" |
---|
72 | "--seqpp <aci> specify ACI to postprocess sequence data (default: none; example dot->hyphen: \":.=-\")\n" |
---|
73 | "\n" |
---|
74 | "--filterby <SAI> filter sequences using <SAI> (SAI has to be stored in database)\n" |
---|
75 | "--block [allbut] <chars> exclude columns where <SAI> contains any of <chars>\n" |
---|
76 | "--pass [allbut] <chars> include columns where <SAI> contains any of <chars>\n" |
---|
77 | " (specifying 'allbut' will invert the set of <chars>)\n" |
---|
78 | "\n" |
---|
79 | "--min-bases <count> min. base count of exported data (after filtering).\n" |
---|
80 | " Export of species is skipped if not reached and\n" |
---|
81 | " a diagnostic message gets written to stderr.\n" |
---|
82 | "--count-bases <chars> defines base-characters to count (default: \"a-zA-Z\")\n" |
---|
83 | "\n" |
---|
84 | ,stderr); |
---|
85 | } |
---|
86 | |
---|
87 | const char *missing_passBlock() { return "--filterby has to be followed by --pass or --block"; } |
---|
88 | const char *missing_filterby() { return "--pass and --block have to be preceeded by --filterby"; } |
---|
89 | |
---|
90 | void store_filterdef(FilterDefinition *def) { |
---|
91 | if (def) def_filters.push_back(def); |
---|
92 | } |
---|
93 | FilterDefinition *expect_filterdef_args(FilterDefType type, int& argc, const char**& argv) { |
---|
94 | FilterDefinition *result = NULp; |
---|
95 | string arg = expect_arg(argc, argv); |
---|
96 | if (!error) { |
---|
97 | bool filter_chars = true; |
---|
98 | |
---|
99 | if (arg == "allbut") { |
---|
100 | filter_chars = false; |
---|
101 | arg = expect_arg(argc, argv); |
---|
102 | } |
---|
103 | |
---|
104 | if (!error) { |
---|
105 | if (filterby.empty()) { |
---|
106 | error = missing_filterby(); |
---|
107 | } |
---|
108 | else { |
---|
109 | result = new FilterDefinition(filterby.c_str(), type, filter_chars, arg.c_str()); |
---|
110 | } |
---|
111 | } |
---|
112 | } |
---|
113 | filterby = ""; // forget last filterby (even in error-case -> avoids some weird error-messages) |
---|
114 | return result; |
---|
115 | } |
---|
116 | |
---|
117 | void parse(int& argc, const char**& argv) { |
---|
118 | const char *arg = getarg(argc, argv); |
---|
119 | if (arg) { |
---|
120 | if (strcmp(arg, "--db") == 0) database = expect_arg(argc, argv); |
---|
121 | else if (strcmp(arg, "--fasta") == 0) fasta = expect_arg(argc, argv); |
---|
122 | else if (strcmp(arg, "--id") == 0) head_aci = expect_arg(argc, argv); |
---|
123 | else if (strcmp(arg, "--seqpp") == 0) seq_aci = expect_arg(argc, argv); |
---|
124 | else if (strcmp(arg, "--ali") == 0) aliname = expect_arg(argc, argv); |
---|
125 | else if (strcmp(arg, "--count-bases") == 0) bases = expect_arg(argc, argv); |
---|
126 | |
---|
127 | else if (strcmp(arg, "--filterby") == 0) { |
---|
128 | if (!filterby.empty()) error = missing_passBlock(); |
---|
129 | else filterby = expect_arg(argc, argv); |
---|
130 | } |
---|
131 | |
---|
132 | else if (strcmp(arg, "--block") == 0) store_filterdef(expect_filterdef_args(BLOCK, argc, argv)); |
---|
133 | else if (strcmp(arg, "--pass") == 0) store_filterdef(expect_filterdef_args(PASS, argc, argv)); |
---|
134 | |
---|
135 | else if (strcmp(arg, "--accept-missing-data") == 0) accept_missing_data = true; |
---|
136 | |
---|
137 | else if (strcmp(arg, "--min-bases") == 0) min_bases = atoi(expect_arg(argc, argv)); |
---|
138 | |
---|
139 | else if (strcmp(arg, "--help") == 0 || strcmp(arg, "-h") == 0) helpWanted = true; |
---|
140 | |
---|
141 | else { |
---|
142 | error = GBS_global_string("unexpected argument '%s'", arg); |
---|
143 | } |
---|
144 | } |
---|
145 | } |
---|
146 | void check_required_arguments() { |
---|
147 | if (database.empty()) error = "no input database specified"; |
---|
148 | else if (fasta.empty()) error = "no output file specified"; |
---|
149 | } |
---|
150 | |
---|
151 | public: |
---|
152 | CLI(int argc, const char **argv) : |
---|
153 | helpWanted(false), |
---|
154 | error(NULp), |
---|
155 | accept_missing_data(false), |
---|
156 | min_bases(0), |
---|
157 | bases("a-zA-Z") |
---|
158 | { |
---|
159 | --argc; ++argv; |
---|
160 | while (!error && argc>0 && !helpWanted) { |
---|
161 | parse(argc, argv); |
---|
162 | } |
---|
163 | |
---|
164 | if (!helpWanted) { // do not check extended conditions, if '--help' seen |
---|
165 | if (!error && !filterby.empty()) { |
---|
166 | error = missing_passBlock(); |
---|
167 | } |
---|
168 | if (!error) { |
---|
169 | check_required_arguments(); |
---|
170 | if (error) helpWanted = true; |
---|
171 | } |
---|
172 | } |
---|
173 | } |
---|
174 | |
---|
175 | bool help_wanted() const { return helpWanted; } |
---|
176 | void show_help_if_useful() const { if (helpWanted) show_help(); } |
---|
177 | GB_ERROR get_error() const { return error; } |
---|
178 | |
---|
179 | const char *get_database() const { return database.c_str(); } |
---|
180 | const char *get_fasta() const { return fasta.c_str(); } |
---|
181 | const char *get_aliname() const { return aliname.c_str(); } |
---|
182 | const char *get_head_aci() const { return head_aci.c_str(); } |
---|
183 | const char *get_seq_aci() const { return seq_aci.c_str(); } |
---|
184 | const char *get_basechars() const { return bases.c_str(); } |
---|
185 | |
---|
186 | bool shall_accept_missing_data() const { return accept_missing_data; } |
---|
187 | |
---|
188 | int get_min_bases_count() const { return min_bases; } |
---|
189 | |
---|
190 | bool has_filters() const { return !def_filters.empty(); } |
---|
191 | GB_ERROR add_filters_to(FilteredExport& exporter) const { |
---|
192 | arb_assert(!error); |
---|
193 | GB_ERROR err = NULp; |
---|
194 | for (FilterDefinitionList::const_iterator d = def_filters.begin(); !err && d != def_filters.end(); ++d) { |
---|
195 | err = exporter.add_SAI_filter(**d); |
---|
196 | } |
---|
197 | return err; |
---|
198 | } |
---|
199 | }; |
---|
200 | |
---|
201 | static GB_ERROR export_seq_data(const CLI& args) { |
---|
202 | GB_ERROR error = NULp; |
---|
203 | |
---|
204 | const char *dbname = args.get_database(); |
---|
205 | GB_shell shell; |
---|
206 | GBDATA *gb_main = GB_open(dbname, "r"); |
---|
207 | |
---|
208 | if (!gb_main) { |
---|
209 | error = GB_await_error(); |
---|
210 | } |
---|
211 | else { |
---|
212 | { |
---|
213 | GB_transaction ta(gb_main); |
---|
214 | |
---|
215 | char *aliname = strdup(args.get_aliname()); |
---|
216 | if (aliname[0]) { |
---|
217 | if (!GBT_get_alignment(gb_main, aliname)) { |
---|
218 | error = GB_await_error(); |
---|
219 | } |
---|
220 | } |
---|
221 | else { |
---|
222 | freeset(aliname, GBT_get_default_alignment(gb_main)); |
---|
223 | if (!aliname || !aliname[0]) { |
---|
224 | GB_clear_error(); |
---|
225 | error = "no default alignment defined (use --ali to specify alignment name)"; |
---|
226 | } |
---|
227 | else { |
---|
228 | fprintf(stderr, "Using default alignment '%s'\n", aliname); |
---|
229 | } |
---|
230 | } |
---|
231 | |
---|
232 | if (!error) { |
---|
233 | arb_assert(aliname && aliname[0]); |
---|
234 | |
---|
235 | long alisize = GBT_get_alignment_len(gb_main, aliname); |
---|
236 | if (alisize<=0) { |
---|
237 | error = GB_await_error(); |
---|
238 | } |
---|
239 | else { |
---|
240 | const char *outname = args.get_fasta(); |
---|
241 | FILE *out = fopen(outname, "wt"); |
---|
242 | |
---|
243 | if (!out) { |
---|
244 | error = GB_IO_error("writing", outname); |
---|
245 | } |
---|
246 | else { |
---|
247 | FilteredExport exporter(gb_main, aliname, alisize); |
---|
248 | |
---|
249 | if (args.get_head_aci()[0]) { |
---|
250 | exporter.set_header_ACI(args.get_head_aci()); |
---|
251 | } |
---|
252 | if (args.shall_accept_missing_data()) { |
---|
253 | exporter.do_accept_missing_data(); |
---|
254 | } |
---|
255 | if (args.get_min_bases_count()>0) { |
---|
256 | exporter.set_required_baseCount(args.get_basechars(), args.get_min_bases_count()); |
---|
257 | } |
---|
258 | if (args.get_seq_aci()[0]) { |
---|
259 | exporter.set_sequence_ACI(args.get_seq_aci()); |
---|
260 | } |
---|
261 | if (args.has_filters()) { |
---|
262 | arb_assert(!error); |
---|
263 | error = args.add_filters_to(exporter); |
---|
264 | } |
---|
265 | |
---|
266 | if (!error) { |
---|
267 | error = exporter.write_fasta(out); |
---|
268 | } |
---|
269 | fclose(out); |
---|
270 | |
---|
271 | if (error) GB_unlink_or_warn(outname, NULp); |
---|
272 | } |
---|
273 | } |
---|
274 | } |
---|
275 | free(aliname); |
---|
276 | } |
---|
277 | GB_close(gb_main); |
---|
278 | } |
---|
279 | |
---|
280 | return error; |
---|
281 | } |
---|
282 | |
---|
283 | int main(int argc, char **argv) { |
---|
284 | GB_ERROR error = NULp; |
---|
285 | CLI args(argc, const_cast<const char**>(argv)); |
---|
286 | |
---|
287 | if (!error) error = args.get_error(); |
---|
288 | |
---|
289 | if (!error && args.help_wanted()) { |
---|
290 | args.show_help_if_useful(); |
---|
291 | return EXIT_FAILURE; |
---|
292 | } |
---|
293 | |
---|
294 | if (!error) error = export_seq_data(args); |
---|
295 | |
---|
296 | if (error) { |
---|
297 | args.show_help_if_useful(); |
---|
298 | fprintf(stderr, "Error in arb_export_seq_filtered: %s\n", error); |
---|
299 | return EXIT_FAILURE; |
---|
300 | } |
---|
301 | return EXIT_SUCCESS; |
---|
302 | } |
---|
303 | |
---|