| 1 | #include <stdio.h> // needed for arbdb.h |
|---|
| 2 | #include <string.h> |
|---|
| 3 | #include <unistd.h> |
|---|
| 4 | |
|---|
| 5 | #include <arbdb.h> |
|---|
| 6 | #include <arbdbt.h> |
|---|
| 7 | #include <arb_file.h> |
|---|
| 8 | #include <AP_filter.hxx> |
|---|
| 9 | #include <seqio.hxx> |
|---|
| 10 | #include <insdel.h> |
|---|
| 11 | |
|---|
| 12 | #include <string> |
|---|
| 13 | #include <vector> |
|---|
| 14 | #include <fstream> |
|---|
| 15 | #include <map> |
|---|
| 16 | #include <iostream> |
|---|
| 17 | #include <sstream> |
|---|
| 18 | #include <set> |
|---|
| 19 | #include <iterator> |
|---|
| 20 | #include <algorithm> |
|---|
| 21 | |
|---|
| 22 | using std::cerr; |
|---|
| 23 | using std::endl; |
|---|
| 24 | using std::clog; |
|---|
| 25 | using std::cout; |
|---|
| 26 | using std::string; |
|---|
| 27 | using std::map; |
|---|
| 28 | using std::ifstream; |
|---|
| 29 | using std::ofstream; |
|---|
| 30 | using std::vector; |
|---|
| 31 | using std::stringstream; |
|---|
| 32 | using std::getline; |
|---|
| 33 | using std::set; |
|---|
| 34 | |
|---|
| 35 | template<typename T> |
|---|
| 36 | struct equals { |
|---|
| 37 | const T& t; |
|---|
| 38 | equals(const T& _t) : t(_t) {} |
|---|
| 39 | bool operator()(const T& _t) { return _t == t; } |
|---|
| 40 | }; |
|---|
| 41 | |
|---|
| 42 | /* fake aw_message/aw_status functions */ |
|---|
| 43 | |
|---|
| 44 | static void aw_status(double d) { |
|---|
| 45 | cerr << "aw_status_fraction: " << d << endl; |
|---|
| 46 | cerr.flush(); |
|---|
| 47 | } |
|---|
| 48 | |
|---|
| 49 | /* generic writer class */ |
|---|
| 50 | class Writer : virtual Noncopyable { |
|---|
| 51 | string err_state; |
|---|
| 52 | protected: |
|---|
| 53 | virtual void set_error(const char *error) { |
|---|
| 54 | arb_assert(error); |
|---|
| 55 | if (!err_state.empty()) { |
|---|
| 56 | cerr << "ERROR: " << err_state << endl; |
|---|
| 57 | } |
|---|
| 58 | err_state = error; |
|---|
| 59 | } |
|---|
| 60 | |
|---|
| 61 | public: |
|---|
| 62 | virtual ~Writer() {} |
|---|
| 63 | |
|---|
| 64 | virtual void addSequence(GBDATA*) = 0; |
|---|
| 65 | virtual void finish() {} |
|---|
| 66 | |
|---|
| 67 | const char *get_error() const { |
|---|
| 68 | return err_state.empty() ? NULp : err_state.c_str(); |
|---|
| 69 | } |
|---|
| 70 | bool ok() const { return !get_error(); } |
|---|
| 71 | }; |
|---|
| 72 | |
|---|
| 73 | class MultiFastaWriter : public Writer { // derived from Noncopyable |
|---|
| 74 | ofstream file; |
|---|
| 75 | string default_ali; |
|---|
| 76 | double count, count_max; |
|---|
| 77 | public: |
|---|
| 78 | MultiFastaWriter(string s, const char* a, int c) |
|---|
| 79 | : file(s.c_str()), |
|---|
| 80 | default_ali(a), |
|---|
| 81 | count(0), count_max(c) |
|---|
| 82 | {} |
|---|
| 83 | |
|---|
| 84 | string readString(GBDATA* spec, const char* key) { |
|---|
| 85 | GBDATA *gbd = GB_find(spec, key, SEARCH_CHILD); |
|---|
| 86 | if (!gbd) return string("<empty>"); |
|---|
| 87 | char *str= GB_read_as_string(gbd); |
|---|
| 88 | if (!str) return string("<empty>"); |
|---|
| 89 | string rval = str; |
|---|
| 90 | free(str); |
|---|
| 91 | return rval; |
|---|
| 92 | } |
|---|
| 93 | |
|---|
| 94 | string readData(GBDATA* spec) { |
|---|
| 95 | GBDATA *gbd = GB_find(spec, default_ali.c_str(), SEARCH_CHILD); |
|---|
| 96 | if (!gbd) return string("<empty>"); |
|---|
| 97 | string result = readString(gbd, "data"); |
|---|
| 98 | GB_release(gbd); |
|---|
| 99 | return result; |
|---|
| 100 | } |
|---|
| 101 | |
|---|
| 102 | void addSequence(GBDATA* spec) OVERRIDE { |
|---|
| 103 | file << ">" |
|---|
| 104 | << readString(spec, "acc") << "." |
|---|
| 105 | << readString(spec, "start") << "." |
|---|
| 106 | << readString(spec, "stop") << " | " |
|---|
| 107 | << readString(spec, "full_name") << "\n"; |
|---|
| 108 | |
|---|
| 109 | string tmp = readData(spec); |
|---|
| 110 | std::replace_if(tmp.begin(),tmp.end(),equals<char>('.'),'-'); |
|---|
| 111 | |
|---|
| 112 | unsigned int i; |
|---|
| 113 | for (i=0; i+70<tmp.size(); i+=70) |
|---|
| 114 | file << tmp.substr(i,70) << "\n"; |
|---|
| 115 | file << tmp.substr(i) << "\n"; |
|---|
| 116 | aw_status(++count/count_max); |
|---|
| 117 | } |
|---|
| 118 | }; |
|---|
| 119 | |
|---|
| 120 | class ArbWriter FINAL_TYPE : public Writer { // derived from Noncopyable |
|---|
| 121 | string template_arb; |
|---|
| 122 | string filename; |
|---|
| 123 | double count, count_max; |
|---|
| 124 | GBDATA *gbdst, *gbdst_spec; |
|---|
| 125 | |
|---|
| 126 | void close() { |
|---|
| 127 | if (gbdst) { |
|---|
| 128 | GB_close(gbdst); |
|---|
| 129 | gbdst = NULp; |
|---|
| 130 | } |
|---|
| 131 | } |
|---|
| 132 | |
|---|
| 133 | void set_error(const char *error) OVERRIDE { |
|---|
| 134 | Writer::set_error(error); |
|---|
| 135 | close(); |
|---|
| 136 | } |
|---|
| 137 | |
|---|
| 138 | public: |
|---|
| 139 | ArbWriter(string ta, string f, int c) |
|---|
| 140 | : template_arb(ta), |
|---|
| 141 | filename(f), |
|---|
| 142 | count(0), |
|---|
| 143 | count_max(c) |
|---|
| 144 | { |
|---|
| 145 | gbdst = GB_open(template_arb.c_str(), "rw"); |
|---|
| 146 | if (!gbdst) set_error("Unable to open template file"); |
|---|
| 147 | else { |
|---|
| 148 | if (GB_request_undo_type(gbdst, GB_UNDO_NONE)) { |
|---|
| 149 | cerr << "WARN: Unable to disable undo buffer"; |
|---|
| 150 | } |
|---|
| 151 | |
|---|
| 152 | GB_begin_transaction(gbdst); |
|---|
| 153 | gbdst_spec = GB_search(gbdst, "species_data", GB_CREATE_CONTAINER); |
|---|
| 154 | GB_commit_transaction(gbdst); |
|---|
| 155 | |
|---|
| 156 | GB_ERROR error = GB_save_as(gbdst, filename.c_str(), "b"); |
|---|
| 157 | if (error) set_error(GBS_global_string("Unable to open output file (Reason: %s)", error)); |
|---|
| 158 | } |
|---|
| 159 | } |
|---|
| 160 | ~ArbWriter() { |
|---|
| 161 | close(); |
|---|
| 162 | if (!ok()) { |
|---|
| 163 | unlink(filename.c_str()); |
|---|
| 164 | } |
|---|
| 165 | } |
|---|
| 166 | |
|---|
| 167 | void addSequence(GBDATA* gbspec) OVERRIDE { |
|---|
| 168 | if (gbdst) { |
|---|
| 169 | GB_transaction ta(gbdst); |
|---|
| 170 | GBDATA *gbnew = GB_create_container(gbdst_spec, "species"); |
|---|
| 171 | GB_ERROR error = NULp; |
|---|
| 172 | if (!gbnew) { |
|---|
| 173 | error = GB_await_error(); |
|---|
| 174 | } |
|---|
| 175 | else { |
|---|
| 176 | error = GB_copy_dropProtectMarksAndTempstate(gbnew, gbspec); |
|---|
| 177 | if (!error) error = GB_release(gbnew); |
|---|
| 178 | } |
|---|
| 179 | if (error) set_error(error); |
|---|
| 180 | } |
|---|
| 181 | aw_status(++count/count_max); |
|---|
| 182 | } |
|---|
| 183 | |
|---|
| 184 | void finish() OVERRIDE { |
|---|
| 185 | GB_ERROR error = NULp; |
|---|
| 186 | if (gbdst) { |
|---|
| 187 | cerr << "Compressing..." << endl; |
|---|
| 188 | GB_transaction trans(gbdst); |
|---|
| 189 | char *ali = GBT_get_default_alignment(gbdst); |
|---|
| 190 | if (!ali) { |
|---|
| 191 | error = "Your template DB has no default alignment"; |
|---|
| 192 | } |
|---|
| 193 | else { |
|---|
| 194 | error = ARB_format_alignment(gbdst, ali); |
|---|
| 195 | if (error) error = GBS_global_string("Failed to format alignments (Reason: %s)", error); |
|---|
| 196 | free(ali); |
|---|
| 197 | } |
|---|
| 198 | } |
|---|
| 199 | |
|---|
| 200 | if (!error && GB_save_as(gbdst, filename.c_str(), "b")) { |
|---|
| 201 | error = "Unable to save to output file"; |
|---|
| 202 | } |
|---|
| 203 | |
|---|
| 204 | if (error) set_error(error); |
|---|
| 205 | } |
|---|
| 206 | }; |
|---|
| 207 | |
|---|
| 208 | class AwtiExportWriter FINAL_TYPE : public Writer { // derived from Noncopyable |
|---|
| 209 | GBDATA *gbmain; |
|---|
| 210 | const char *dbname; |
|---|
| 211 | const char *formname; // export form (.eft) |
|---|
| 212 | const char *outname; |
|---|
| 213 | const int compress; |
|---|
| 214 | |
|---|
| 215 | public: |
|---|
| 216 | AwtiExportWriter(GBDATA *_gbmain, const char *_dbname, const char* eft, |
|---|
| 217 | const char* out, int c) |
|---|
| 218 | : gbmain(_gbmain), dbname(_dbname), formname(eft), outname(out), compress(c) |
|---|
| 219 | { |
|---|
| 220 | GBT_mark_all(gbmain, 0); |
|---|
| 221 | } |
|---|
| 222 | void addSequence(GBDATA *gbspec) OVERRIDE { |
|---|
| 223 | GB_write_flag(gbspec, 1); |
|---|
| 224 | } |
|---|
| 225 | |
|---|
| 226 | void finish() OVERRIDE { |
|---|
| 227 | const int cut_stop_codon = 0; |
|---|
| 228 | const int multiple = 0; |
|---|
| 229 | |
|---|
| 230 | char *aliname = GBT_get_default_alignment(gbmain); |
|---|
| 231 | AP_filter filter(GBT_get_alignment_len(gbmain, aliname)); |
|---|
| 232 | char *real_outname = NULp; |
|---|
| 233 | |
|---|
| 234 | GB_ERROR err = SEQIO::export_by_format(gbmain, SEQIO::EBF_MARKED, NULp, |
|---|
| 235 | &filter, cut_stop_codon, compress, |
|---|
| 236 | dbname, formname, NULp, // @@@ pass name of specified FTS here |
|---|
| 237 | outname, multiple, &real_outname); |
|---|
| 238 | if (err) set_error(err); |
|---|
| 239 | |
|---|
| 240 | free(real_outname); |
|---|
| 241 | free(aliname); |
|---|
| 242 | } |
|---|
| 243 | }; |
|---|
| 244 | |
|---|
| 245 | |
|---|
| 246 | // @@@ add new parameter allowing to pass FTS (e.g. --fts <FILE>)! |
|---|
| 247 | |
|---|
| 248 | static void help() { |
|---|
| 249 | cerr << |
|---|
| 250 | "Exports sequences from an ARB database." |
|---|
| 251 | "\n" |
|---|
| 252 | "Parameters:\n" |
|---|
| 253 | " --source <FILE> source file\n" |
|---|
| 254 | " --dest <FILE> destination file\n" |
|---|
| 255 | " --format <ARG> 'ARB', 'FASTA' or path to ARB EFT file\n" |
|---|
| 256 | " --accs <FILE> export only sequences matching these accessions\n" |
|---|
| 257 | "\n" |
|---|
| 258 | "only for EFT output format:\n" |
|---|
| 259 | " --eft-compress-gaps <ARG> 'none', 'vertical' or 'all' (default none)\n" |
|---|
| 260 | "\n" |
|---|
| 261 | "required for ARB output format:\n" |
|---|
| 262 | " --arb-template <FILE> template (empty) arb file\n" |
|---|
| 263 | |
|---|
| 264 | << endl;; |
|---|
| 265 | } |
|---|
| 266 | |
|---|
| 267 | static set<string> read_accession_file(const char* file) { |
|---|
| 268 | set<string> accessions; |
|---|
| 269 | ifstream ifs(file); |
|---|
| 270 | string linestring; |
|---|
| 271 | |
|---|
| 272 | do { |
|---|
| 273 | getline(ifs,linestring); |
|---|
| 274 | if (!linestring.empty()) { |
|---|
| 275 | stringstream line(linestring); |
|---|
| 276 | string item; |
|---|
| 277 | do { |
|---|
| 278 | getline(line,item,','); |
|---|
| 279 | item.erase(item.find_last_not_of(" \n,\t")+1); |
|---|
| 280 | item.erase(0,item.find_first_not_of(" \n,\t")); |
|---|
| 281 | if (!item.empty()) |
|---|
| 282 | accessions.insert(item); |
|---|
| 283 | } while(line.good()); |
|---|
| 284 | } |
|---|
| 285 | } while(ifs.good()); |
|---|
| 286 | |
|---|
| 287 | return accessions; |
|---|
| 288 | } |
|---|
| 289 | |
|---|
| 290 | int main(int argc, char** argv) { |
|---|
| 291 | enum { |
|---|
| 292 | FMT_ARB, |
|---|
| 293 | FMT_FASTA, |
|---|
| 294 | FMT_EFT, |
|---|
| 295 | FMT_NONE |
|---|
| 296 | } format = FMT_NONE; |
|---|
| 297 | |
|---|
| 298 | char *src = NULp; |
|---|
| 299 | char *dest = NULp; |
|---|
| 300 | char *tmpl = NULp; |
|---|
| 301 | char *accs = NULp; |
|---|
| 302 | char *eft = NULp; |
|---|
| 303 | int compress = 0; |
|---|
| 304 | GB_shell shell; |
|---|
| 305 | |
|---|
| 306 | if (argc <2) { |
|---|
| 307 | help(); |
|---|
| 308 | return 0; |
|---|
| 309 | } |
|---|
| 310 | |
|---|
| 311 | for (int i = 1; i < argc; i++) { |
|---|
| 312 | if (argc > i+1) { |
|---|
| 313 | if (!strcmp(argv[i], "--source")) { |
|---|
| 314 | src = argv[++i]; |
|---|
| 315 | } else if (!strcmp(argv[i], "--dest")) { |
|---|
| 316 | dest = argv[++i]; |
|---|
| 317 | } else if (!strcmp(argv[i], "--arb-template")) { |
|---|
| 318 | tmpl = argv[++i]; |
|---|
| 319 | } else if (!strcmp(argv[i], "--accs")) { |
|---|
| 320 | accs = argv[++i]; |
|---|
| 321 | } else if (!strcmp(argv[i], "--format")) { |
|---|
| 322 | if (!strcmp(argv[++i], "ARB")) { |
|---|
| 323 | format = FMT_ARB; |
|---|
| 324 | } else if (!strcmp(argv[i], "FASTA")) { |
|---|
| 325 | format = FMT_FASTA; |
|---|
| 326 | } else { |
|---|
| 327 | format = FMT_EFT; |
|---|
| 328 | eft = argv[i]; |
|---|
| 329 | } |
|---|
| 330 | } else if (!strcmp(argv[i], "--eft-compress-gaps")) { |
|---|
| 331 | if (!strcmp(argv[++i], "none")) { |
|---|
| 332 | compress = 0; |
|---|
| 333 | } else if (!strcmp(argv[i], "vertical")) { |
|---|
| 334 | compress = 1; |
|---|
| 335 | } else if (!strcmp(argv[i], "all")) { |
|---|
| 336 | compress = 2; |
|---|
| 337 | } else { |
|---|
| 338 | cerr << "missing argument to --eft-compress-gaps?"; |
|---|
| 339 | i--; |
|---|
| 340 | } |
|---|
| 341 | } else { |
|---|
| 342 | cerr << "ignoring malformed argument: '" << argv[i] << "'" << endl; |
|---|
| 343 | } |
|---|
| 344 | } else { |
|---|
| 345 | cerr << "ignoring malformed argument: '" << argv[i] << "'" << endl; |
|---|
| 346 | } |
|---|
| 347 | } |
|---|
| 348 | |
|---|
| 349 | if (!src) { |
|---|
| 350 | cerr << "need '--source' parameter" << endl; |
|---|
| 351 | return 1; |
|---|
| 352 | } |
|---|
| 353 | if (!dest) { |
|---|
| 354 | cerr << "need '--dest' parameter" << endl; |
|---|
| 355 | return 1; |
|---|
| 356 | } |
|---|
| 357 | if (format == FMT_NONE) { |
|---|
| 358 | cerr << "need '--format' parameter" << endl; |
|---|
| 359 | return 1; |
|---|
| 360 | } |
|---|
| 361 | if (format == FMT_ARB && !tmpl) { |
|---|
| 362 | cerr << "need '--arb-template' parameter for output type ARB'" << endl; |
|---|
| 363 | return 1; |
|---|
| 364 | } |
|---|
| 365 | |
|---|
| 366 | set<string> accessions; |
|---|
| 367 | if (accs) { |
|---|
| 368 | if (!GB_is_regularfile(accs)) { |
|---|
| 369 | cerr << "unable to open accession number file '" << accs << '\'' << endl; |
|---|
| 370 | return 1; |
|---|
| 371 | } |
|---|
| 372 | accessions = read_accession_file(accs); |
|---|
| 373 | } |
|---|
| 374 | |
|---|
| 375 | GB_ERROR error = NULp; |
|---|
| 376 | int exitcode = 0; |
|---|
| 377 | |
|---|
| 378 | GBDATA *gbsrc = GB_open(src, "r"); |
|---|
| 379 | if (!gbsrc) { |
|---|
| 380 | error = "unable to open source file"; |
|---|
| 381 | exitcode = 1; |
|---|
| 382 | } |
|---|
| 383 | |
|---|
| 384 | if (!error) { |
|---|
| 385 | GB_transaction trans(gbsrc); |
|---|
| 386 | |
|---|
| 387 | int count_max = 0; |
|---|
| 388 | if (accs) { |
|---|
| 389 | count_max = accessions.size(); |
|---|
| 390 | } else { |
|---|
| 391 | for (GBDATA *gbspec = GBT_first_species(gbsrc); |
|---|
| 392 | gbspec; gbspec = GBT_next_species(gbspec)) { |
|---|
| 393 | count_max++; |
|---|
| 394 | } |
|---|
| 395 | } |
|---|
| 396 | |
|---|
| 397 | // create writer for target type |
|---|
| 398 | Writer *writer = NULp; |
|---|
| 399 | switch(format) { |
|---|
| 400 | case FMT_ARB: |
|---|
| 401 | writer = new ArbWriter(tmpl, dest, count_max); |
|---|
| 402 | break; |
|---|
| 403 | case FMT_FASTA: { |
|---|
| 404 | char *default_ali = GBT_get_default_alignment(gbsrc); |
|---|
| 405 | writer = new MultiFastaWriter(dest, default_ali, count_max); |
|---|
| 406 | free(default_ali); |
|---|
| 407 | break; |
|---|
| 408 | } |
|---|
| 409 | case FMT_EFT: |
|---|
| 410 | writer = new AwtiExportWriter(gbsrc, src, eft, dest, compress); |
|---|
| 411 | break; |
|---|
| 412 | |
|---|
| 413 | case FMT_NONE: |
|---|
| 414 | error = "internal error"; |
|---|
| 415 | exitcode = 2; |
|---|
| 416 | break; |
|---|
| 417 | } |
|---|
| 418 | |
|---|
| 419 | if (writer) { |
|---|
| 420 | // generate file |
|---|
| 421 | for (GBDATA *gbspec = GBT_first_species(gbsrc); |
|---|
| 422 | gbspec && writer->ok(); |
|---|
| 423 | gbspec = GBT_next_species(gbspec)) |
|---|
| 424 | { |
|---|
| 425 | GBDATA *gbname = GB_find(gbspec, "acc", SEARCH_CHILD); |
|---|
| 426 | string name(GB_read_char_pntr(gbname)); |
|---|
| 427 | |
|---|
| 428 | if (!accs || accessions.count(name)) { |
|---|
| 429 | writer->addSequence(gbspec); |
|---|
| 430 | } |
|---|
| 431 | GB_release(gbspec); |
|---|
| 432 | } |
|---|
| 433 | |
|---|
| 434 | if (writer->ok()) writer->finish(); |
|---|
| 435 | |
|---|
| 436 | if (!writer->ok()) { |
|---|
| 437 | error = GBS_static_string(writer->get_error()); |
|---|
| 438 | exitcode = 5; |
|---|
| 439 | } |
|---|
| 440 | delete writer; |
|---|
| 441 | } |
|---|
| 442 | } |
|---|
| 443 | if (gbsrc) GB_close(gbsrc); |
|---|
| 444 | |
|---|
| 445 | if (error) cerr << "ERROR: " << error << endl; |
|---|
| 446 | return exitcode; |
|---|
| 447 | } |
|---|
| 448 | |
|---|