1 | // ============================================================ // |
---|
2 | // // |
---|
3 | // File : arb_export_seq_filtered.cxx // |
---|
4 | // Purpose : export sequences filtered by SAI // |
---|
5 | // // |
---|
6 | // Coded by Ralf Westram (coder@reallysoft.de) in June 2017 // |
---|
7 | // http://www.arb-home.de/ // |
---|
8 | // // |
---|
9 | // ============================================================ // |
---|
10 | |
---|
11 | #include <FilteredExport.h> |
---|
12 | #include <arbdbt.h> |
---|
13 | #include <arb_file.h> |
---|
14 | #include <string> |
---|
15 | #include <list> |
---|
16 | #include <cstdlib> |
---|
17 | |
---|
18 | using namespace std; |
---|
19 | |
---|
20 | class CLI : virtual Noncopyable { |
---|
21 | bool helpWanted; |
---|
22 | GB_ERROR error; |
---|
23 | |
---|
24 | string database; // name of input database |
---|
25 | string fasta; // name of FASTA output file |
---|
26 | string aliname; // name of alignment to use |
---|
27 | string head_aci; // ACI used to create content of fasta header |
---|
28 | string seq_aci; // ACI used to post-process sequence data |
---|
29 | |
---|
30 | bool accept_missing_data; // true -> skip sequences with less bases in data |
---|
31 | |
---|
32 | int min_bases; // skip sequences with less than 'min_bases' base-characters in data |
---|
33 | string bases; // defines what counts as base-character |
---|
34 | |
---|
35 | string filterby; // name of last SAI specified with "--filterby" |
---|
36 | |
---|
37 | typedef SmartPtr<FilterDefinition> FilterDefinitionPtr; |
---|
38 | typedef std::list<FilterDefinitionPtr> FilterDefinitionList; |
---|
39 | |
---|
40 | FilterDefinitionList def_filters; |
---|
41 | |
---|
42 | static inline const char *getarg(int& argc, const char**& argv) { |
---|
43 | return argc>0 ? (--argc,*argv++) : NULp; |
---|
44 | } |
---|
45 | inline const char *expect_arg(int& argc, const char**& argv) { |
---|
46 | const char *arg = getarg(argc, argv); |
---|
47 | if (!arg) { |
---|
48 | error = "expected argument missing"; |
---|
49 | arg = ""; |
---|
50 | } |
---|
51 | return arg; |
---|
52 | } |
---|
53 | void show_help() const { |
---|
54 | fputs("\n" |
---|
55 | "arb_export_seq_filtered -- export sequences filtered by SAI\n" |
---|
56 | "Usage: arb_export_seq_filtered [switches]\n" |
---|
57 | "\n" |
---|
58 | "general:\n" |
---|
59 | " <chars> may contain ranges (e.g. '0-5' which is equiv. to '012345')\n" |
---|
60 | "\n" |
---|
61 | "switches:\n" |
---|
62 | "--db <dbname> ARB database to export from\n" |
---|
63 | "--fasta <outname> name of generated fasta file\n" |
---|
64 | "\n" |
---|
65 | "--id <aci> specify content for fasta header using ACI\n" |
---|
66 | " (default: \"readdb(name)\"; see http://help.arb-home.de/aci.html)\n" |
---|
67 | "\n" |
---|
68 | "--ali <aliname> name of alignment to use (default: use default alignment)\n" |
---|
69 | "--accept-missing-data silently skip species w/o data (default: abort with error)\n" |
---|
70 | "--seqpp <aci> specify ACI to postprocess sequence data (default: none; example dot->hyphen: \":.=-\")\n" |
---|
71 | "\n" |
---|
72 | "--filterby <SAI> filter sequences using <SAI> (SAI has to be stored in database)\n" |
---|
73 | "--block [allbut] <chars> exclude columns where <SAI> contains any of <chars>\n" |
---|
74 | "--pass [allbut] <chars> include columns where <SAI> contains any of <chars>\n" |
---|
75 | " (specifying 'allbut' will invert the set of <chars>)\n" |
---|
76 | "\n" |
---|
77 | "--min-bases <count> min. base count of exported data (after filtering).\n" |
---|
78 | " Export of species is skipped if not reached and\n" |
---|
79 | " a diagnostic message gets written to stderr.\n" |
---|
80 | "--count-bases <chars> defines base-characters to count (default: \"a-zA-Z\")\n" |
---|
81 | "\n" |
---|
82 | ,stderr); |
---|
83 | } |
---|
84 | |
---|
85 | const char *missing_passBlock() { return "--filterby has to be followed by --pass or --block"; } |
---|
86 | const char *missing_filterby() { return "--pass and --block have to be preceeded by --filterby"; } |
---|
87 | |
---|
88 | void store_filterdef(FilterDefinition *def) { |
---|
89 | if (def) def_filters.push_back(def); |
---|
90 | } |
---|
91 | FilterDefinition *expect_filterdef_args(FilterDefType type, int& argc, const char**& argv) { |
---|
92 | FilterDefinition *result = NULp; |
---|
93 | string arg = expect_arg(argc, argv); |
---|
94 | if (!error) { |
---|
95 | bool filter_chars = true; |
---|
96 | |
---|
97 | if (arg == "allbut") { |
---|
98 | filter_chars = false; |
---|
99 | arg = expect_arg(argc, argv); |
---|
100 | } |
---|
101 | |
---|
102 | if (!error) { |
---|
103 | if (filterby.empty()) { |
---|
104 | error = missing_filterby(); |
---|
105 | } |
---|
106 | else { |
---|
107 | result = new FilterDefinition(filterby.c_str(), type, filter_chars, arg.c_str()); |
---|
108 | } |
---|
109 | } |
---|
110 | } |
---|
111 | filterby = ""; // forget last filterby (even in error-case -> avoids some weird error-messages) |
---|
112 | return result; |
---|
113 | } |
---|
114 | |
---|
115 | void parse(int& argc, const char**& argv) { |
---|
116 | const char *arg = getarg(argc, argv); |
---|
117 | if (arg) { |
---|
118 | if (strcmp(arg, "--db") == 0) database = expect_arg(argc, argv); |
---|
119 | else if (strcmp(arg, "--fasta") == 0) fasta = expect_arg(argc, argv); |
---|
120 | else if (strcmp(arg, "--id") == 0) head_aci = expect_arg(argc, argv); |
---|
121 | else if (strcmp(arg, "--seqpp") == 0) seq_aci = expect_arg(argc, argv); |
---|
122 | else if (strcmp(arg, "--ali") == 0) aliname = expect_arg(argc, argv); |
---|
123 | else if (strcmp(arg, "--count-bases") == 0) bases = expect_arg(argc, argv); |
---|
124 | |
---|
125 | else if (strcmp(arg, "--filterby") == 0) { |
---|
126 | if (!filterby.empty()) error = missing_passBlock(); |
---|
127 | else filterby = expect_arg(argc, argv); |
---|
128 | } |
---|
129 | |
---|
130 | else if (strcmp(arg, "--block") == 0) store_filterdef(expect_filterdef_args(BLOCK, argc, argv)); |
---|
131 | else if (strcmp(arg, "--pass") == 0) store_filterdef(expect_filterdef_args(PASS, argc, argv)); |
---|
132 | |
---|
133 | else if (strcmp(arg, "--accept-missing-data") == 0) accept_missing_data = true; |
---|
134 | |
---|
135 | else if (strcmp(arg, "--min-bases") == 0) min_bases = atoi(expect_arg(argc, argv)); |
---|
136 | |
---|
137 | else if (strcmp(arg, "--help") == 0 || strcmp(arg, "-h") == 0) helpWanted = true; |
---|
138 | |
---|
139 | else { |
---|
140 | error = GBS_global_string("unexpected argument '%s'", arg); |
---|
141 | } |
---|
142 | } |
---|
143 | } |
---|
144 | void check_required_arguments() { |
---|
145 | if (database.empty()) error = "no input database specified"; |
---|
146 | else if (fasta.empty()) error = "no output file specified"; |
---|
147 | } |
---|
148 | |
---|
149 | public: |
---|
150 | CLI(int argc, const char **argv) : |
---|
151 | helpWanted(false), |
---|
152 | error(NULp), |
---|
153 | accept_missing_data(false), |
---|
154 | min_bases(0), |
---|
155 | bases("a-zA-Z") |
---|
156 | { |
---|
157 | --argc; ++argv; |
---|
158 | while (!error && argc>0 && !helpWanted) { |
---|
159 | parse(argc, argv); |
---|
160 | } |
---|
161 | |
---|
162 | if (!helpWanted) { // do not check extended conditions, if '--help' seen |
---|
163 | if (!error && !filterby.empty()) { |
---|
164 | error = missing_passBlock(); |
---|
165 | } |
---|
166 | if (!error) { |
---|
167 | check_required_arguments(); |
---|
168 | if (error) helpWanted = true; |
---|
169 | } |
---|
170 | } |
---|
171 | } |
---|
172 | |
---|
173 | bool help_wanted() const { return helpWanted; } |
---|
174 | void show_help_if_useful() const { if (helpWanted) show_help(); } |
---|
175 | GB_ERROR get_error() const { return error; } |
---|
176 | |
---|
177 | const char *get_database() const { return database.c_str(); } |
---|
178 | const char *get_fasta() const { return fasta.c_str(); } |
---|
179 | const char *get_aliname() const { return aliname.c_str(); } |
---|
180 | const char *get_head_aci() const { return head_aci.c_str(); } |
---|
181 | const char *get_seq_aci() const { return seq_aci.c_str(); } |
---|
182 | const char *get_basechars() const { return bases.c_str(); } |
---|
183 | |
---|
184 | bool shall_accept_missing_data() const { return accept_missing_data; } |
---|
185 | |
---|
186 | int get_min_bases_count() const { return min_bases; } |
---|
187 | |
---|
188 | bool has_filters() const { return !def_filters.empty(); } |
---|
189 | GB_ERROR add_filters_to(FilteredExport& exporter) const { |
---|
190 | arb_assert(!error); |
---|
191 | GB_ERROR err = NULp; |
---|
192 | for (FilterDefinitionList::const_iterator d = def_filters.begin(); !err && d != def_filters.end(); ++d) { |
---|
193 | err = exporter.add_SAI_filter(**d); |
---|
194 | } |
---|
195 | return err; |
---|
196 | } |
---|
197 | }; |
---|
198 | |
---|
199 | static GB_ERROR export_seq_data(const CLI& args) { |
---|
200 | GB_ERROR error = NULp; |
---|
201 | |
---|
202 | const char *dbname = args.get_database(); |
---|
203 | GB_shell shell; |
---|
204 | GBDATA *gb_main = GB_open(dbname, "r"); |
---|
205 | |
---|
206 | if (!gb_main) { |
---|
207 | error = GB_await_error(); |
---|
208 | } |
---|
209 | else { |
---|
210 | { |
---|
211 | GB_transaction ta(gb_main); |
---|
212 | |
---|
213 | char *aliname = strdup(args.get_aliname()); |
---|
214 | if (aliname[0]) { |
---|
215 | if (!GBT_get_alignment(gb_main, aliname)) { |
---|
216 | error = GB_await_error(); |
---|
217 | } |
---|
218 | } |
---|
219 | else { |
---|
220 | freeset(aliname, GBT_get_default_alignment(gb_main)); |
---|
221 | if (!aliname || !aliname[0]) { |
---|
222 | error = "no default alignment defined (use --ali to specify alignment name)"; |
---|
223 | } |
---|
224 | else { |
---|
225 | fprintf(stderr, "Using default alignment '%s'\n", aliname); |
---|
226 | } |
---|
227 | } |
---|
228 | |
---|
229 | if (!error) { |
---|
230 | arb_assert(aliname && aliname[0]); |
---|
231 | |
---|
232 | long alisize = GBT_get_alignment_len(gb_main, aliname); |
---|
233 | if (alisize<0) { |
---|
234 | error = GBS_global_string("failed to detect length of alignment '%s'", aliname); |
---|
235 | } |
---|
236 | else { |
---|
237 | const char *outname = args.get_fasta(); |
---|
238 | FILE *out = fopen(outname, "wt"); |
---|
239 | |
---|
240 | if (!out) { |
---|
241 | error = GB_IO_error("writing", outname); |
---|
242 | } |
---|
243 | else { |
---|
244 | FilteredExport exporter(gb_main, aliname, alisize); |
---|
245 | |
---|
246 | if (args.get_head_aci()[0]) { |
---|
247 | exporter.set_header_ACI(args.get_head_aci()); |
---|
248 | } |
---|
249 | if (args.shall_accept_missing_data()) { |
---|
250 | exporter.do_accept_missing_data(); |
---|
251 | } |
---|
252 | if (args.get_min_bases_count()>0) { |
---|
253 | exporter.set_required_baseCount(args.get_basechars(), args.get_min_bases_count()); |
---|
254 | } |
---|
255 | if (args.get_seq_aci()[0]) { |
---|
256 | exporter.set_sequence_ACI(args.get_seq_aci()); |
---|
257 | } |
---|
258 | if (args.has_filters()) { |
---|
259 | arb_assert(!error); |
---|
260 | error = args.add_filters_to(exporter); |
---|
261 | } |
---|
262 | |
---|
263 | if (!error) { |
---|
264 | error = exporter.write_fasta(out); |
---|
265 | } |
---|
266 | fclose(out); |
---|
267 | |
---|
268 | if (error) GB_unlink_or_warn(outname, NULp); |
---|
269 | } |
---|
270 | } |
---|
271 | } |
---|
272 | free(aliname); |
---|
273 | } |
---|
274 | GB_close(gb_main); |
---|
275 | } |
---|
276 | |
---|
277 | return error; |
---|
278 | } |
---|
279 | |
---|
280 | int main(int argc, char **argv) { |
---|
281 | GB_ERROR error = NULp; |
---|
282 | CLI args(argc, const_cast<const char**>(argv)); |
---|
283 | |
---|
284 | if (!error) error = args.get_error(); |
---|
285 | |
---|
286 | if (!error && args.help_wanted()) { |
---|
287 | args.show_help_if_useful(); |
---|
288 | return EXIT_FAILURE; |
---|
289 | } |
---|
290 | |
---|
291 | if (!error) error = export_seq_data(args); |
---|
292 | |
---|
293 | if (error) { |
---|
294 | args.show_help_if_useful(); |
---|
295 | fprintf(stderr, "Error in arb_export_seq_filtered: %s\n", error); |
---|
296 | return EXIT_FAILURE; |
---|
297 | } |
---|
298 | return EXIT_SUCCESS; |
---|
299 | } |
---|
300 | |
---|