1 | #include <stdio.h> // needed for arbdb.h |
---|
2 | #include <string.h> |
---|
3 | #include <unistd.h> |
---|
4 | |
---|
5 | #include <arbdb.h> |
---|
6 | #include <arbdbt.h> |
---|
7 | #include <arb_file.h> |
---|
8 | #include <AP_filter.hxx> |
---|
9 | #include <seqio.hxx> |
---|
10 | #include <insdel.h> |
---|
11 | |
---|
12 | #include <string> |
---|
13 | #include <vector> |
---|
14 | #include <fstream> |
---|
15 | #include <map> |
---|
16 | #include <iostream> |
---|
17 | #include <sstream> |
---|
18 | #include <set> |
---|
19 | #include <iterator> |
---|
20 | #include <algorithm> |
---|
21 | |
---|
22 | using std::cerr; |
---|
23 | using std::endl; |
---|
24 | using std::clog; |
---|
25 | using std::cout; |
---|
26 | using std::string; |
---|
27 | using std::map; |
---|
28 | using std::ifstream; |
---|
29 | using std::ofstream; |
---|
30 | using std::vector; |
---|
31 | using std::stringstream; |
---|
32 | using std::getline; |
---|
33 | using std::set; |
---|
34 | |
---|
35 | template<typename T> |
---|
36 | struct equals { |
---|
37 | const T& t; |
---|
38 | equals(const T& _t) : t(_t) {} |
---|
39 | bool operator()(const T& _t) { return _t == t; } |
---|
40 | }; |
---|
41 | |
---|
42 | /* fake aw_message/aw_status functions */ |
---|
43 | |
---|
44 | static void aw_status(double d) { |
---|
45 | cerr << "aw_status_fraction: " << d << endl; |
---|
46 | cerr.flush(); |
---|
47 | } |
---|
48 | |
---|
49 | /* generic writer class */ |
---|
50 | class Writer : virtual Noncopyable { |
---|
51 | string err_state; |
---|
52 | protected: |
---|
53 | virtual void set_error(const char *error) { |
---|
54 | arb_assert(error); |
---|
55 | if (!err_state.empty()) { |
---|
56 | cerr << "ERROR: " << err_state << endl; |
---|
57 | } |
---|
58 | err_state = error; |
---|
59 | } |
---|
60 | |
---|
61 | public: |
---|
62 | virtual ~Writer() {} |
---|
63 | |
---|
64 | virtual void addSequence(GBDATA*) = 0; |
---|
65 | virtual void finish() {} |
---|
66 | |
---|
67 | const char *get_error() const { |
---|
68 | return err_state.empty() ? NULp : err_state.c_str(); |
---|
69 | } |
---|
70 | bool ok() const { return !get_error(); } |
---|
71 | }; |
---|
72 | |
---|
73 | class MultiFastaWriter : public Writer { // derived from Noncopyable |
---|
74 | ofstream file; |
---|
75 | string default_ali; |
---|
76 | double count, count_max; |
---|
77 | public: |
---|
78 | MultiFastaWriter(string s, const char* a, int c) |
---|
79 | : file(s.c_str()), |
---|
80 | default_ali(a), |
---|
81 | count(0), count_max(c) |
---|
82 | {} |
---|
83 | |
---|
84 | string readString(GBDATA* spec, const char* key) { |
---|
85 | GBDATA *gbd = GB_find(spec, key, SEARCH_CHILD); |
---|
86 | if (!gbd) return string("<empty>"); |
---|
87 | char *str= GB_read_as_string(gbd); |
---|
88 | if (!str) return string("<empty>"); |
---|
89 | string rval = str; |
---|
90 | free(str); |
---|
91 | return rval; |
---|
92 | } |
---|
93 | |
---|
94 | string readData(GBDATA* spec) { |
---|
95 | GBDATA *gbd = GB_find(spec, default_ali.c_str(), SEARCH_CHILD); |
---|
96 | if (!gbd) return string("<empty>"); |
---|
97 | string result = readString(gbd, "data"); |
---|
98 | GB_release(gbd); |
---|
99 | return result; |
---|
100 | } |
---|
101 | |
---|
102 | void addSequence(GBDATA* spec) OVERRIDE { |
---|
103 | file << ">" |
---|
104 | << readString(spec, "acc") << "." |
---|
105 | << readString(spec, "start") << "." |
---|
106 | << readString(spec, "stop") << " | " |
---|
107 | << readString(spec, "full_name") << "\n"; |
---|
108 | |
---|
109 | string tmp = readData(spec); |
---|
110 | std::replace_if(tmp.begin(),tmp.end(),equals<char>('.'),'-'); |
---|
111 | |
---|
112 | unsigned int i; |
---|
113 | for (i=0; i+70<tmp.size(); i+=70) |
---|
114 | file << tmp.substr(i,70) << "\n"; |
---|
115 | file << tmp.substr(i) << "\n"; |
---|
116 | aw_status(++count/count_max); |
---|
117 | } |
---|
118 | }; |
---|
119 | |
---|
120 | class ArbWriter FINAL_TYPE : public Writer { // derived from Noncopyable |
---|
121 | string template_arb; |
---|
122 | string filename; |
---|
123 | double count, count_max; |
---|
124 | GBDATA *gbdst, *gbdst_spec; |
---|
125 | |
---|
126 | void close() { |
---|
127 | if (gbdst) { |
---|
128 | GB_close(gbdst); |
---|
129 | gbdst = NULp; |
---|
130 | } |
---|
131 | } |
---|
132 | |
---|
133 | void set_error(const char *error) OVERRIDE { |
---|
134 | Writer::set_error(error); |
---|
135 | close(); |
---|
136 | } |
---|
137 | |
---|
138 | public: |
---|
139 | ArbWriter(string ta, string f, int c) |
---|
140 | : template_arb(ta), |
---|
141 | filename(f), |
---|
142 | count(0), |
---|
143 | count_max(c) |
---|
144 | { |
---|
145 | gbdst = GB_open(template_arb.c_str(), "rw"); |
---|
146 | if (!gbdst) set_error("Unable to open template file"); |
---|
147 | else { |
---|
148 | if (GB_request_undo_type(gbdst, GB_UNDO_NONE)) { |
---|
149 | cerr << "WARN: Unable to disable undo buffer"; |
---|
150 | } |
---|
151 | |
---|
152 | GB_begin_transaction(gbdst); |
---|
153 | gbdst_spec = GB_search(gbdst, "species_data", GB_CREATE_CONTAINER); |
---|
154 | GB_commit_transaction(gbdst); |
---|
155 | |
---|
156 | GB_ERROR error = GB_save_as(gbdst, filename.c_str(), "b"); |
---|
157 | if (error) set_error(GBS_global_string("Unable to open output file (Reason: %s)", error)); |
---|
158 | } |
---|
159 | } |
---|
160 | ~ArbWriter() { |
---|
161 | close(); |
---|
162 | if (!ok()) { |
---|
163 | unlink(filename.c_str()); |
---|
164 | } |
---|
165 | } |
---|
166 | |
---|
167 | void addSequence(GBDATA* gbspec) OVERRIDE { |
---|
168 | if (gbdst) { |
---|
169 | GB_transaction ta(gbdst); |
---|
170 | GBDATA *gbnew = GB_create_container(gbdst_spec, "species"); |
---|
171 | GB_ERROR error = NULp; |
---|
172 | if (!gbnew) { |
---|
173 | error = GB_await_error(); |
---|
174 | } |
---|
175 | else { |
---|
176 | error = GB_copy_dropProtectMarksAndTempstate(gbnew, gbspec); |
---|
177 | if (!error) error = GB_release(gbnew); |
---|
178 | } |
---|
179 | if (error) set_error(error); |
---|
180 | } |
---|
181 | aw_status(++count/count_max); |
---|
182 | } |
---|
183 | |
---|
184 | void finish() OVERRIDE { |
---|
185 | GB_ERROR error = NULp; |
---|
186 | if (gbdst) { |
---|
187 | cerr << "Compressing..." << endl; |
---|
188 | GB_transaction trans(gbdst); |
---|
189 | char *ali = GBT_get_default_alignment(gbdst); |
---|
190 | if (!ali) { |
---|
191 | error = "Your template DB has no default alignment"; |
---|
192 | } |
---|
193 | else { |
---|
194 | error = ARB_format_alignment(gbdst, ali); |
---|
195 | if (error) error = GBS_global_string("Failed to format alignments (Reason: %s)", error); |
---|
196 | free(ali); |
---|
197 | } |
---|
198 | } |
---|
199 | |
---|
200 | if (!error && GB_save_as(gbdst, filename.c_str(), "b")) { |
---|
201 | error = "Unable to save to output file"; |
---|
202 | } |
---|
203 | |
---|
204 | if (error) set_error(error); |
---|
205 | } |
---|
206 | }; |
---|
207 | |
---|
208 | class AwtiExportWriter FINAL_TYPE : public Writer { // derived from Noncopyable |
---|
209 | GBDATA *gbmain; |
---|
210 | const char *dbname; |
---|
211 | const char *formname; // export form (.eft) |
---|
212 | const char *outname; |
---|
213 | const int compress; |
---|
214 | |
---|
215 | public: |
---|
216 | AwtiExportWriter(GBDATA *_gbmain, const char *_dbname, const char* eft, |
---|
217 | const char* out, int c) |
---|
218 | : gbmain(_gbmain), dbname(_dbname), formname(eft), outname(out), compress(c) |
---|
219 | { |
---|
220 | GBT_mark_all(gbmain, 0); |
---|
221 | } |
---|
222 | void addSequence(GBDATA *gbspec) OVERRIDE { |
---|
223 | GB_write_flag(gbspec, 1); |
---|
224 | } |
---|
225 | |
---|
226 | void finish() OVERRIDE { |
---|
227 | const int cut_stop_codon = 0; |
---|
228 | const int multiple = 0; |
---|
229 | |
---|
230 | char *aliname = GBT_get_default_alignment(gbmain); |
---|
231 | AP_filter filter(GBT_get_alignment_len(gbmain, aliname)); |
---|
232 | char *real_outname = NULp; |
---|
233 | |
---|
234 | GB_ERROR err = SEQIO::export_by_format(gbmain, SEQIO::EBF_MARKED, NULp, |
---|
235 | &filter, cut_stop_codon, compress, |
---|
236 | dbname, formname, NULp, // @@@ pass name of specified FTS here |
---|
237 | outname, multiple, &real_outname); |
---|
238 | if (err) set_error(err); |
---|
239 | |
---|
240 | free(real_outname); |
---|
241 | free(aliname); |
---|
242 | } |
---|
243 | }; |
---|
244 | |
---|
245 | |
---|
246 | // @@@ add new parameter allowing to pass FTS (e.g. --fts <FILE>)! |
---|
247 | |
---|
248 | static void help() { |
---|
249 | cerr << |
---|
250 | "Exports sequences from an ARB database." |
---|
251 | "\n" |
---|
252 | "Parameters:\n" |
---|
253 | " --source <FILE> source file\n" |
---|
254 | " --dest <FILE> destination file\n" |
---|
255 | " --format <ARG> 'ARB', 'FASTA' or path to ARB EFT file\n" |
---|
256 | " --accs <FILE> export only sequences matching these accessions\n" |
---|
257 | "\n" |
---|
258 | "only for EFT output format:\n" |
---|
259 | " --eft-compress-gaps <ARG> 'none', 'vertical' or 'all' (default none)\n" |
---|
260 | "\n" |
---|
261 | "required for ARB output format:\n" |
---|
262 | " --arb-template <FILE> template (empty) arb file\n" |
---|
263 | |
---|
264 | << endl;; |
---|
265 | } |
---|
266 | |
---|
267 | static set<string> read_accession_file(const char* file) { |
---|
268 | set<string> accessions; |
---|
269 | ifstream ifs(file); |
---|
270 | string linestring; |
---|
271 | |
---|
272 | do { |
---|
273 | getline(ifs,linestring); |
---|
274 | if (!linestring.empty()) { |
---|
275 | stringstream line(linestring); |
---|
276 | string item; |
---|
277 | do { |
---|
278 | getline(line,item,','); |
---|
279 | item.erase(item.find_last_not_of(" \n,\t")+1); |
---|
280 | item.erase(0,item.find_first_not_of(" \n,\t")); |
---|
281 | if (!item.empty()) |
---|
282 | accessions.insert(item); |
---|
283 | } while(line.good()); |
---|
284 | } |
---|
285 | } while(ifs.good()); |
---|
286 | |
---|
287 | return accessions; |
---|
288 | } |
---|
289 | |
---|
290 | int main(int argc, char** argv) { |
---|
291 | enum { |
---|
292 | FMT_ARB, |
---|
293 | FMT_FASTA, |
---|
294 | FMT_EFT, |
---|
295 | FMT_NONE |
---|
296 | } format = FMT_NONE; |
---|
297 | |
---|
298 | char *src = NULp; |
---|
299 | char *dest = NULp; |
---|
300 | char *tmpl = NULp; |
---|
301 | char *accs = NULp; |
---|
302 | char *eft = NULp; |
---|
303 | int compress = 0; |
---|
304 | GB_shell shell; |
---|
305 | |
---|
306 | if (argc <2) { |
---|
307 | help(); |
---|
308 | return 0; |
---|
309 | } |
---|
310 | |
---|
311 | for (int i = 1; i < argc; i++) { |
---|
312 | if (argc > i+1) { |
---|
313 | if (!strcmp(argv[i], "--source")) { |
---|
314 | src = argv[++i]; |
---|
315 | } else if (!strcmp(argv[i], "--dest")) { |
---|
316 | dest = argv[++i]; |
---|
317 | } else if (!strcmp(argv[i], "--arb-template")) { |
---|
318 | tmpl = argv[++i]; |
---|
319 | } else if (!strcmp(argv[i], "--accs")) { |
---|
320 | accs = argv[++i]; |
---|
321 | } else if (!strcmp(argv[i], "--format")) { |
---|
322 | if (!strcmp(argv[++i], "ARB")) { |
---|
323 | format = FMT_ARB; |
---|
324 | } else if (!strcmp(argv[i], "FASTA")) { |
---|
325 | format = FMT_FASTA; |
---|
326 | } else { |
---|
327 | format = FMT_EFT; |
---|
328 | eft = argv[i]; |
---|
329 | } |
---|
330 | } else if (!strcmp(argv[i], "--eft-compress-gaps")) { |
---|
331 | if (!strcmp(argv[++i], "none")) { |
---|
332 | compress = 0; |
---|
333 | } else if (!strcmp(argv[i], "vertical")) { |
---|
334 | compress = 1; |
---|
335 | } else if (!strcmp(argv[i], "all")) { |
---|
336 | compress = 2; |
---|
337 | } else { |
---|
338 | cerr << "missing argument to --eft-compress-gaps?"; |
---|
339 | i--; |
---|
340 | } |
---|
341 | } else { |
---|
342 | cerr << "ignoring malformed argument: '" << argv[i] << "'" << endl; |
---|
343 | } |
---|
344 | } else { |
---|
345 | cerr << "ignoring malformed argument: '" << argv[i] << "'" << endl; |
---|
346 | } |
---|
347 | } |
---|
348 | |
---|
349 | if (!src) { |
---|
350 | cerr << "need '--source' parameter" << endl; |
---|
351 | return 1; |
---|
352 | } |
---|
353 | if (!dest) { |
---|
354 | cerr << "need '--dest' parameter" << endl; |
---|
355 | return 1; |
---|
356 | } |
---|
357 | if (format == FMT_NONE) { |
---|
358 | cerr << "need '--format' parameter" << endl; |
---|
359 | return 1; |
---|
360 | } |
---|
361 | if (format == FMT_ARB && !tmpl) { |
---|
362 | cerr << "need '--arb-template' parameter for output type ARB'" << endl; |
---|
363 | return 1; |
---|
364 | } |
---|
365 | |
---|
366 | set<string> accessions; |
---|
367 | if (accs) { |
---|
368 | if (!GB_is_regularfile(accs)) { |
---|
369 | cerr << "unable to open accession number file '" << accs << '\'' << endl; |
---|
370 | return 1; |
---|
371 | } |
---|
372 | accessions = read_accession_file(accs); |
---|
373 | } |
---|
374 | |
---|
375 | GB_ERROR error = NULp; |
---|
376 | int exitcode = 0; |
---|
377 | |
---|
378 | GBDATA *gbsrc = GB_open(src, "r"); |
---|
379 | if (!gbsrc) { |
---|
380 | error = "unable to open source file"; |
---|
381 | exitcode = 1; |
---|
382 | } |
---|
383 | |
---|
384 | if (!error) { |
---|
385 | GB_transaction trans(gbsrc); |
---|
386 | |
---|
387 | int count_max = 0; |
---|
388 | if (accs) { |
---|
389 | count_max = accessions.size(); |
---|
390 | } else { |
---|
391 | for (GBDATA *gbspec = GBT_first_species(gbsrc); |
---|
392 | gbspec; gbspec = GBT_next_species(gbspec)) { |
---|
393 | count_max++; |
---|
394 | } |
---|
395 | } |
---|
396 | |
---|
397 | // create writer for target type |
---|
398 | Writer *writer = NULp; |
---|
399 | switch(format) { |
---|
400 | case FMT_ARB: |
---|
401 | writer = new ArbWriter(tmpl, dest, count_max); |
---|
402 | break; |
---|
403 | case FMT_FASTA: { |
---|
404 | char *default_ali = GBT_get_default_alignment(gbsrc); |
---|
405 | writer = new MultiFastaWriter(dest, default_ali, count_max); |
---|
406 | free(default_ali); |
---|
407 | break; |
---|
408 | } |
---|
409 | case FMT_EFT: |
---|
410 | writer = new AwtiExportWriter(gbsrc, src, eft, dest, compress); |
---|
411 | break; |
---|
412 | |
---|
413 | case FMT_NONE: |
---|
414 | error = "internal error"; |
---|
415 | exitcode = 2; |
---|
416 | break; |
---|
417 | } |
---|
418 | |
---|
419 | if (writer) { |
---|
420 | // generate file |
---|
421 | for (GBDATA *gbspec = GBT_first_species(gbsrc); |
---|
422 | gbspec && writer->ok(); |
---|
423 | gbspec = GBT_next_species(gbspec)) |
---|
424 | { |
---|
425 | GBDATA *gbname = GB_find(gbspec, "acc", SEARCH_CHILD); |
---|
426 | string name(GB_read_char_pntr(gbname)); |
---|
427 | |
---|
428 | if (!accs || accessions.count(name)) { |
---|
429 | writer->addSequence(gbspec); |
---|
430 | } |
---|
431 | GB_release(gbspec); |
---|
432 | } |
---|
433 | |
---|
434 | if (writer->ok()) writer->finish(); |
---|
435 | |
---|
436 | if (!writer->ok()) { |
---|
437 | error = GBS_static_string(writer->get_error()); |
---|
438 | exitcode = 5; |
---|
439 | } |
---|
440 | delete writer; |
---|
441 | } |
---|
442 | } |
---|
443 | if (gbsrc) GB_close(gbsrc); |
---|
444 | |
---|
445 | if (error) cerr << "ERROR: " << error << endl; |
---|
446 | return exitcode; |
---|
447 | } |
---|
448 | |
---|