1 | // ============================================================= // |
---|
2 | // // |
---|
3 | // File : seq_export.cxx // |
---|
4 | // Purpose : // |
---|
5 | // // |
---|
6 | // Institute of Microbiology (Technical University Munich) // |
---|
7 | // http://www.arb-home.de/ // |
---|
8 | // // |
---|
9 | // ============================================================= // |
---|
10 | |
---|
11 | #include "seqio.hxx" |
---|
12 | |
---|
13 | #include <AP_filter.hxx> |
---|
14 | #include <arbdbt.h> |
---|
15 | #include <arb_strarray.h> |
---|
16 | #include <arb_file.h> |
---|
17 | #include <arb_diff.h> |
---|
18 | #include <xml.hxx> |
---|
19 | #include <arb_progress.h> |
---|
20 | #include <unistd.h> |
---|
21 | |
---|
22 | #define sio_assert(cond) arb_assert(cond) |
---|
23 | |
---|
24 | using std::string; |
---|
25 | |
---|
26 | |
---|
27 | // --------------------------------- |
---|
28 | // internal export commands |
---|
29 | |
---|
// Export modes an export format definition can select.
// The enumerators listed before EXPORT_INVALID are the "real" internal formats;
// their numeric values are used as indices into the list of INTERNAL command names.
enum EXPORT_CMD {
    EXPORT_XML = 0,    // real format: write species as XML

    EXPORT_INVALID,    // no (or an unknown) internal command
    EXPORT_USING_FORM, // default mode (has to be last entry in enum)
};
---|
37 | |
---|
// Command names accepted behind the 'INTERNAL' keyword of an export format file.
// The entry at index i belongs to the EXPORT_CMD with value i; the list is NULL-terminated.
static const char *internal_export_commands[] = { "xml_write", NULL };
---|
42 | |
---|
43 | static EXPORT_CMD check_internal(const char *command) { |
---|
44 | EXPORT_CMD cmd = EXPORT_INVALID; |
---|
45 | for (int i = 0; internal_export_commands[i]; ++i) { |
---|
46 | if (strcmp(command, internal_export_commands[i]) == 0) { |
---|
47 | cmd = static_cast<EXPORT_CMD>(i); |
---|
48 | } |
---|
49 | } |
---|
50 | return cmd; |
---|
51 | } |
---|
52 | |
---|
53 | // ---------------------- |
---|
54 | // export_format |
---|
55 | |
---|
56 | struct export_format : virtual Noncopyable { |
---|
57 | char *system; |
---|
58 | char *new_format; |
---|
59 | char *suffix; |
---|
60 | char *form; // transformed export expression (part behind 'BEGIN') |
---|
61 | |
---|
62 | EXPORT_CMD export_mode; |
---|
63 | |
---|
64 | export_format() |
---|
65 | : system(NULL), |
---|
66 | new_format(NULL), |
---|
67 | suffix(NULL), |
---|
68 | form(NULL), |
---|
69 | export_mode(EXPORT_XML) |
---|
70 | {} |
---|
71 | ~export_format() { |
---|
72 | free(system); |
---|
73 | free(new_format); |
---|
74 | free(suffix); |
---|
75 | free(form); |
---|
76 | } |
---|
77 | }; |
---|
78 | |
---|
79 | static GB_ERROR read_export_format(export_format *efo, const char *file, bool load_complete_form) { |
---|
80 | GB_ERROR error = 0; |
---|
81 | |
---|
82 | if (!file || !file[0]) { |
---|
83 | error = "No export format selected"; |
---|
84 | } |
---|
85 | else { |
---|
86 | char *fullfile = 0; |
---|
87 | if (GB_is_regularfile(file)) { // prefer files that are completely specified (full/rel path) |
---|
88 | fullfile = strdup(GB_canonical_path(file)); |
---|
89 | } |
---|
90 | else { |
---|
91 | fullfile = nulldup(GB_path_in_ARBHOME(file)); // fallback to ARBHOME-relative specification |
---|
92 | } |
---|
93 | |
---|
94 | FILE *in = fopen(fullfile, "r"); |
---|
95 | |
---|
96 | if (!in) error = GB_IO_error("reading export form", fullfile); |
---|
97 | else { |
---|
98 | efo->export_mode = EXPORT_USING_FORM; // default mode |
---|
99 | { |
---|
100 | bool seen_BEGIN = false; |
---|
101 | char *s1, *s2; |
---|
102 | size_t linenumber = 0; |
---|
103 | |
---|
104 | while (!error && !seen_BEGIN && SEQIO_read_string_pair(in, s1, s2, linenumber)) { |
---|
105 | if (!strcmp(s1, "SYSTEM")) { reassign(efo->system, s2); } |
---|
106 | else if (!strcmp(s1, "PRE_FORMAT")) { reassign(efo->new_format, s2); } |
---|
107 | else if (!strcmp(s1, "SUFFIX")) { reassign(efo->suffix, s2); } |
---|
108 | else if (!strcmp(s1, "INTERNAL")) { |
---|
109 | efo->export_mode = check_internal(s2); |
---|
110 | if (efo->export_mode == EXPORT_INVALID) { |
---|
111 | error = GBS_global_string("Unknown INTERNAL command '%s'", s2); |
---|
112 | } |
---|
113 | } |
---|
114 | else if (!strcmp(s1, "BEGIN")) { |
---|
115 | if (efo->export_mode != EXPORT_USING_FORM) { |
---|
116 | error = "'BEGIN' not allowed when 'INTERNAL' is used"; |
---|
117 | } |
---|
118 | else { |
---|
119 | seen_BEGIN = true; |
---|
120 | } |
---|
121 | } |
---|
122 | else { |
---|
123 | error = GBS_global_string("Unknown command '%s'", s1); |
---|
124 | } |
---|
125 | |
---|
126 | // add error location |
---|
127 | if (error) error = GBS_global_string("%s in line #%zu", error, linenumber); |
---|
128 | |
---|
129 | free(s2); |
---|
130 | free(s1); |
---|
131 | } |
---|
132 | } |
---|
133 | |
---|
134 | if (!error && load_complete_form && efo->export_mode == EXPORT_USING_FORM) { |
---|
135 | // now 'in' points to line behind 'BEGIN' |
---|
136 | char *form = GB_read_fp(in); // read rest of file |
---|
137 | |
---|
138 | // Join lines that end with \ with next line. |
---|
139 | // Replace ' = ' and ':' by '\=' and '\:' |
---|
140 | efo->form = GBS_string_eval(form, "\\\\\n=:\\==\\\\\\=:*=\\*\\=*1:\\:=\\\\\\:", 0); |
---|
141 | if (!efo->form) error = GB_failedTo_error("evaluate part below 'BEGIN'", NULL, GB_await_error()); |
---|
142 | free(form); |
---|
143 | } |
---|
144 | |
---|
145 | // some checks for incompatible commands |
---|
146 | if (!error) { |
---|
147 | if (efo->system && !efo->new_format) error = "Missing 'PRE_FORMAT' (needed by 'SYSTEM')"; |
---|
148 | else if (efo->new_format && !efo->system) error = "Missing 'SYSTEM' (needed by 'PRE_FORMAT')"; |
---|
149 | else if (efo->export_mode != EXPORT_USING_FORM) { |
---|
150 | if (efo->system) error = "'SYSTEM' is not allowed together with 'INTERNAL'"; |
---|
151 | if (efo->new_format) error = "'PRE_FORMAT' is not allowed together with 'INTERNAL'"; |
---|
152 | } |
---|
153 | } |
---|
154 | |
---|
155 | error = GB_failedTo_error("read export format", fullfile, error); |
---|
156 | fclose(in); |
---|
157 | } |
---|
158 | free(fullfile); |
---|
159 | } |
---|
160 | |
---|
161 | return error; |
---|
162 | } |
---|
163 | |
---|
164 | char *SEQIO_exportFormat_get_outfile_default_suffix(const char *formname, GB_ERROR& error) { |
---|
165 | export_format efs; |
---|
166 | error = read_export_format(&efs, formname, false); |
---|
167 | return (!error && efs.suffix) ? strdup(efs.suffix) : NULL; |
---|
168 | } |
---|
169 | |
---|
170 | |
---|
171 | |
---|
172 | // ---------------------------------------- |
---|
173 | // export sequence helper class |
---|
174 | |
---|
175 | typedef GBDATA *(*FindSpeciesFunction)(GBDATA *); |
---|
176 | |
---|
177 | class export_sequence_data : virtual Noncopyable { |
---|
178 | GBDATA *last_species_read; |
---|
179 | char *seq; |
---|
180 | size_t len; |
---|
181 | char *error; |
---|
182 | |
---|
183 | GBDATA *gb_main; |
---|
184 | char *ali; |
---|
185 | FindSpeciesFunction find_first, find_next; |
---|
186 | size_t species_count; |
---|
187 | AP_filter *filter; |
---|
188 | bool cut_stop_codon; |
---|
189 | int compress; // 0 = no;1 = vertical gaps; 2 = all gaps; |
---|
190 | |
---|
191 | long max_ali_len; // length of alignment |
---|
192 | size_t *export_column; // list of exported seq data positions |
---|
193 | size_t columns; // how many columns get exported |
---|
194 | |
---|
195 | GBDATA *single_species; // if != NULL -> first/next only return that species (used to export to multiple files) |
---|
196 | |
---|
197 | public: |
---|
198 | |
---|
199 | export_sequence_data(GBDATA *Gb_Main, bool only_marked, AP_filter* Filter, bool CutStopCodon, int Compress) |
---|
200 | : last_species_read(0) |
---|
201 | , seq(0) |
---|
202 | , len(0), error(0) |
---|
203 | , gb_main(Gb_Main), species_count(size_t(-1)) |
---|
204 | , filter(Filter) |
---|
205 | , cut_stop_codon(CutStopCodon) |
---|
206 | , compress(Compress) |
---|
207 | , export_column(0) |
---|
208 | , columns(0) |
---|
209 | , single_species(0) |
---|
210 | { |
---|
211 | ali = GBT_get_default_alignment(gb_main); |
---|
212 | max_ali_len = GBT_get_alignment_len(gb_main, ali); |
---|
213 | |
---|
214 | if (cut_stop_codon) { |
---|
215 | GB_alignment_type ali_type = GBT_get_alignment_type(gb_main, ali); |
---|
216 | if (ali_type != GB_AT_AA) { |
---|
217 | GB_warning("Cutting stop codon makes no sense - ignored"); |
---|
218 | cut_stop_codon = false; |
---|
219 | } |
---|
220 | } |
---|
221 | sio_assert(filter); |
---|
222 | |
---|
223 | if (only_marked) { |
---|
224 | find_first = GBT_first_marked_species; |
---|
225 | find_next = GBT_next_marked_species; |
---|
226 | } |
---|
227 | else { |
---|
228 | find_first = GBT_first_species; |
---|
229 | find_next = GBT_next_species; |
---|
230 | } |
---|
231 | |
---|
232 | if (max_ali_len>=0 && filter->get_length() < size_t(max_ali_len)) { |
---|
233 | GB_warningf("Warning: Your filter is shorter than the alignment (%zu<%li)", |
---|
234 | filter->get_length(), max_ali_len); |
---|
235 | max_ali_len = filter->get_length(); |
---|
236 | } |
---|
237 | } |
---|
238 | |
---|
239 | ~export_sequence_data() { |
---|
240 | delete [] export_column; |
---|
241 | delete [] seq; |
---|
242 | free(error); |
---|
243 | free(ali); |
---|
244 | } |
---|
245 | |
---|
246 | const char *getAlignment() const { return ali; } |
---|
247 | long getAliLen() const { return max_ali_len; } |
---|
248 | |
---|
249 | void set_single_mode(GBDATA *gb_species) { single_species = gb_species; } |
---|
250 | bool in_single_mode() const { return single_species; } |
---|
251 | |
---|
252 | GBDATA *first_species() const { return single_species ? single_species : find_first(gb_main); } |
---|
253 | GBDATA *next_species(GBDATA *gb_prev) const { return single_species ? NULL : find_next(gb_prev); } |
---|
254 | |
---|
255 | const unsigned char *get_seq_data(GBDATA *gb_species, size_t& slen, GB_ERROR& error) const; |
---|
256 | static bool isGap(char c) { return c == '-' || c == '.'; } |
---|
257 | |
---|
258 | size_t count_species() { |
---|
259 | sio_assert(!in_single_mode()); |
---|
260 | if (species_count == size_t(-1)) { |
---|
261 | species_count = 0; |
---|
262 | for (GBDATA *gb_species = find_first(gb_main); gb_species; gb_species = find_next(gb_species)) { |
---|
263 | species_count++; |
---|
264 | } |
---|
265 | } |
---|
266 | return species_count; |
---|
267 | } |
---|
268 | |
---|
269 | GB_ERROR detectVerticalGaps(); |
---|
270 | const char *get_export_sequence(GBDATA *gb_species, size_t& seq_len, GB_ERROR& error); |
---|
271 | }; |
---|
272 | |
---|
273 | const unsigned char *export_sequence_data::get_seq_data(GBDATA *gb_species, size_t& slen, GB_ERROR& err) const { |
---|
274 | const char *data = 0; |
---|
275 | GBDATA *gb_seq = GBT_find_sequence(gb_species, ali); |
---|
276 | |
---|
277 | if (!gb_seq) { |
---|
278 | err = GBS_global_string_copy("No data in alignment '%s' of species '%s'", ali, GBT_read_name(gb_species)); |
---|
279 | slen = 0; |
---|
280 | } |
---|
281 | else { |
---|
282 | data = GB_read_char_pntr(gb_seq); |
---|
283 | slen = GB_read_count(gb_seq); |
---|
284 | err = 0; |
---|
285 | } |
---|
286 | return (const unsigned char *)data; |
---|
287 | } |
---|
288 | |
---|
289 | |
---|
290 | GB_ERROR export_sequence_data::detectVerticalGaps() { |
---|
291 | GB_ERROR err = 0; |
---|
292 | |
---|
293 | sio_assert(!in_single_mode()); |
---|
294 | |
---|
295 | if (compress == 1) { // compress vertical gaps! |
---|
296 | size_t gap_columns = filter->get_filtered_length(); |
---|
297 | size_t *gap_column = new size_t[gap_columns+1]; |
---|
298 | |
---|
299 | const size_t *filterpos_2_seqpos = filter->get_filterpos_2_seqpos(); |
---|
300 | memcpy(gap_column, filterpos_2_seqpos, gap_columns*sizeof(*gap_column)); |
---|
301 | gap_column[gap_columns] = max_ali_len; |
---|
302 | |
---|
303 | arb_progress progress("Calculating vertical gaps", count_species()); |
---|
304 | |
---|
305 | for (GBDATA *gb_species = first_species(); |
---|
306 | gb_species && !err; |
---|
307 | gb_species = next_species(gb_species)) |
---|
308 | { |
---|
309 | size_t slen; |
---|
310 | const unsigned char *sdata = get_seq_data(gb_species, slen, err); |
---|
311 | |
---|
312 | if (!err) { |
---|
313 | size_t j = 0; |
---|
314 | size_t i; |
---|
315 | for (i = 0; i<gap_columns; ++i) { |
---|
316 | if (isGap(sdata[gap_column[i]])) { |
---|
317 | gap_column[j++] = gap_column[i]; // keep gap column |
---|
318 | } |
---|
319 | // otherwise it's overwritten |
---|
320 | } |
---|
321 | |
---|
322 | sio_assert(i >= j); |
---|
323 | size_t skipped_columns = i-j; |
---|
324 | sio_assert(gap_columns >= skipped_columns); |
---|
325 | gap_columns -= skipped_columns; |
---|
326 | } |
---|
327 | progress.inc_and_check_user_abort(err); |
---|
328 | } |
---|
329 | |
---|
330 | if (!err) { |
---|
331 | columns = filter->get_filtered_length() - gap_columns; |
---|
332 | export_column = new size_t[columns]; |
---|
333 | |
---|
334 | size_t gpos = 0; // index into array of vertical gaps |
---|
335 | size_t epos = 0; // index into array of exported columns |
---|
336 | size_t flen = filter->get_filtered_length(); |
---|
337 | size_t a; |
---|
338 | for (a = 0; a<flen && gpos<gap_columns; ++a) { |
---|
339 | size_t fpos = filterpos_2_seqpos[a]; |
---|
340 | if (fpos == gap_column[gpos]) { // only gaps here -> skip column |
---|
341 | gpos++; |
---|
342 | } |
---|
343 | else { // not only gaps -> use column |
---|
344 | sio_assert(fpos<gap_column[gpos]); |
---|
345 | sio_assert(epos < columns); // got more columns than expected |
---|
346 | export_column[epos++] = fpos; |
---|
347 | } |
---|
348 | } |
---|
349 | for (; a<flen; ++a) { |
---|
350 | export_column[epos++] = filterpos_2_seqpos[a]; |
---|
351 | } |
---|
352 | |
---|
353 | sio_assert(epos == columns); |
---|
354 | } |
---|
355 | |
---|
356 | delete [] gap_column; |
---|
357 | } |
---|
358 | else { // compress all or none (simply use filter) |
---|
359 | const size_t *filterpos_2_seqpos = filter->get_filterpos_2_seqpos(); |
---|
360 | |
---|
361 | columns = filter->get_filtered_length(); |
---|
362 | export_column = new size_t[columns]; |
---|
363 | |
---|
364 | memcpy(export_column, filterpos_2_seqpos, columns*sizeof(*filterpos_2_seqpos)); |
---|
365 | } |
---|
366 | |
---|
367 | seq = new char[columns+1]; |
---|
368 | |
---|
369 | return err; |
---|
370 | } |
---|
371 | |
---|
372 | const char *export_sequence_data::get_export_sequence(GBDATA *gb_species, size_t& seq_len, GB_ERROR& err) { |
---|
373 | if (gb_species != last_species_read) { |
---|
374 | freenull(error); |
---|
375 | |
---|
376 | // read + filter a new species |
---|
377 | GB_ERROR curr_error; |
---|
378 | const unsigned char *data = get_seq_data(gb_species, len, curr_error); |
---|
379 | |
---|
380 | if (curr_error) { |
---|
381 | error = strdup(curr_error); |
---|
382 | } |
---|
383 | else { |
---|
384 | size_t i; |
---|
385 | const uchar *simplify = filter->get_simplify_table(); |
---|
386 | |
---|
387 | if (cut_stop_codon) { |
---|
388 | const unsigned char *stop_codon = (const unsigned char *)memchr(data, '*', len); |
---|
389 | if (stop_codon) { |
---|
390 | len = stop_codon-data; |
---|
391 | } |
---|
392 | } |
---|
393 | |
---|
394 | if (compress == 2) { // compress all gaps |
---|
395 | size_t j = 0; |
---|
396 | for (i = 0; i<columns; ++i) { |
---|
397 | size_t seq_pos = export_column[i]; |
---|
398 | if (seq_pos<len) { |
---|
399 | unsigned char c = data[seq_pos]; |
---|
400 | if (!isGap(c)) { |
---|
401 | seq[j++] = simplify[c]; |
---|
402 | } |
---|
403 | } |
---|
404 | } |
---|
405 | seq[j] = 0; |
---|
406 | len = j; |
---|
407 | } |
---|
408 | else { // compress vertical or compress none (simply use filter in both cases) |
---|
409 | for (i = 0; i<columns; ++i) { |
---|
410 | size_t seq_pos = export_column[i]; |
---|
411 | if (seq_pos<len) { |
---|
412 | seq[i] = simplify[data[seq_pos]]; |
---|
413 | } |
---|
414 | else { |
---|
415 | seq[i] = simplify['.']; |
---|
416 | } |
---|
417 | } |
---|
418 | seq[i] = 0; |
---|
419 | len = columns; |
---|
420 | } |
---|
421 | } |
---|
422 | } |
---|
423 | |
---|
424 | err = error; |
---|
425 | if (error) { |
---|
426 | seq_len = 0; |
---|
427 | return 0; |
---|
428 | } |
---|
429 | |
---|
430 | seq_len = len; |
---|
431 | return seq; |
---|
432 | } |
---|
433 | |
---|
434 | // ---------------------------------------- |
---|
// exported_sequence() is temporarily hooked into ACI (it provides the result of the command 'export_sequence'),
// which is the sequence filtered and compressed according to the settings in the export window
---|
437 | |
---|
438 | static export_sequence_data *esd = 0; |
---|
439 | |
---|
440 | static const char *exported_sequence(GBDATA *gb_species, size_t *seq_len, GB_ERROR *error) { |
---|
441 | sio_assert(esd); |
---|
442 | return esd->get_export_sequence(gb_species, *seq_len, *error); |
---|
443 | } |
---|
444 | |
---|
445 | static GB_ERROR XML_recursive(GBDATA *gbd) { |
---|
446 | GB_ERROR error = 0; |
---|
447 | const char *key_name = GB_read_key_pntr(gbd); |
---|
448 | XML_Tag *tag = 0; |
---|
449 | bool descend = true; |
---|
450 | |
---|
451 | if (strncmp(key_name, "ali_", 4) == 0) |
---|
452 | { |
---|
453 | sio_assert(esd); |
---|
454 | descend = false; // do not descend into alignments |
---|
455 | if (strcmp(esd->getAlignment(), key_name) == 0) { // the wanted alignment |
---|
456 | |
---|
457 | tag = new XML_Tag("ALIGNMENT"); |
---|
458 | tag->add_attribute("name", key_name+4); |
---|
459 | |
---|
460 | GBDATA *gb_species = GB_get_father(gbd); |
---|
461 | size_t len; |
---|
462 | const char *seq = exported_sequence(gb_species, &len, &error); |
---|
463 | |
---|
464 | if (seq) { |
---|
465 | XML_Tag dtag("data"); |
---|
466 | { XML_Text seqText(seq); } |
---|
467 | } |
---|
468 | } |
---|
469 | } |
---|
470 | else { |
---|
471 | tag = new XML_Tag(key_name); |
---|
472 | |
---|
473 | if (GB_is_container(gbd)) { |
---|
474 | const char *name = GBT_read_char_pntr(gbd, "name"); |
---|
475 | if (name) tag->add_attribute("name", name); |
---|
476 | } |
---|
477 | } |
---|
478 | |
---|
479 | if (descend) { |
---|
480 | if (GB_read_type(gbd) == GB_DB) { |
---|
481 | for (GBDATA *gb_child = GB_child(gbd); gb_child && !error; gb_child = GB_nextChild(gb_child)) { |
---|
482 | const char *sub_key_name = GB_read_key_pntr(gb_child); |
---|
483 | |
---|
484 | if (strcmp(sub_key_name, "name") != 0) { // do not recurse for "name" (is handled above) |
---|
485 | error = XML_recursive(gb_child); |
---|
486 | } |
---|
487 | } |
---|
488 | } |
---|
489 | else { |
---|
490 | char *content = GB_read_as_string(gbd); |
---|
491 | if (content) { |
---|
492 | XML_Text text(content); |
---|
493 | free(content); |
---|
494 | } |
---|
495 | else { |
---|
496 | tag->add_attribute("error", "unsavable"); |
---|
497 | } |
---|
498 | } |
---|
499 | } |
---|
500 | |
---|
501 | delete tag; |
---|
502 | return error; |
---|
503 | } |
---|
504 | |
---|
505 | static GB_ERROR export_species_using_form(FILE *out, GBDATA *gb_species, const char *form) { |
---|
506 | GB_ERROR error = NULL; |
---|
507 | char *pars = GBS_string_eval(" ", form, gb_species); |
---|
508 | if (!pars) error = GB_await_error(); |
---|
509 | else { |
---|
510 | char *p; |
---|
511 | char *o = pars; |
---|
512 | while ((p = GBS_find_string(o, "$$DELETE_LINE$$", 0))) { |
---|
513 | char *l, *r; |
---|
514 | for (l = p; l>o; l--) if (*l=='\n') break; |
---|
515 | r = strchr(p, '\n'); if (!r) r = p + strlen(p); |
---|
516 | fwrite(o, 1, l-o, out); |
---|
517 | o = r; |
---|
518 | } |
---|
519 | fputs(o, out); |
---|
520 | free(pars); |
---|
521 | } |
---|
522 | return error; |
---|
523 | } |
---|
524 | |
---|
525 | static GB_ERROR export_format_single(const char *db_name, const char *formname, const char *outname, char **resulting_outname) { |
---|
526 | // Exports sequences specified by 'esd' (module global variable) |
---|
527 | // to format specified by 'formname'. |
---|
528 | // |
---|
529 | // if 'outname' == NULL -> export species to temporary file, otherwise to 'outname'. |
---|
530 | // Full path of generated file is returned in 'resulting_outname' |
---|
531 | |
---|
532 | static int export_depth = 0; |
---|
533 | export_depth++; |
---|
534 | |
---|
535 | *resulting_outname = 0; |
---|
536 | |
---|
537 | export_format efo; |
---|
538 | GB_ERROR error = read_export_format(&efo, formname, true); |
---|
539 | |
---|
540 | if (!error) { |
---|
541 | if (!outname) { // if no 'outname' is given -> export to temporary file |
---|
542 | char *unique_outname = GB_unique_filename("exported", efo.suffix); |
---|
543 | *resulting_outname = GB_create_tempfile(unique_outname); |
---|
544 | free(unique_outname); |
---|
545 | |
---|
546 | if (!*resulting_outname) error = GB_await_error(); |
---|
547 | } |
---|
548 | else *resulting_outname = strdup(outname); |
---|
549 | } |
---|
550 | |
---|
551 | sio_assert(error || *resulting_outname); |
---|
552 | |
---|
553 | if (!error) { |
---|
554 | if (efo.new_format) { |
---|
555 | // Export data using format 'new_format'. |
---|
556 | // Afterwards convert to wanted format using 'system'. |
---|
557 | |
---|
558 | sio_assert(efo.system); |
---|
559 | |
---|
560 | char *intermediate_export; |
---|
561 | error = export_format_single(db_name, efo.new_format, NULL, &intermediate_export); |
---|
562 | if (!error) { |
---|
563 | sio_assert(GB_is_privatefile(intermediate_export, false)); |
---|
564 | |
---|
565 | GB_informationf("Converting to %s", efo.suffix); |
---|
566 | |
---|
567 | char *srt = GBS_global_string_copy("$<=%s:$>=%s", intermediate_export, *resulting_outname); |
---|
568 | char *sys = GBS_string_eval(efo.system, srt, 0); |
---|
569 | |
---|
570 | GB_informationf("exec '%s'", efo.system); |
---|
571 | error = GBK_system(sys); |
---|
572 | |
---|
573 | GB_unlink_or_warn(intermediate_export, &error); |
---|
574 | |
---|
575 | free(sys); |
---|
576 | free(srt); |
---|
577 | } |
---|
578 | free(intermediate_export); |
---|
579 | } |
---|
580 | else { |
---|
581 | FILE *out = fopen(*resulting_outname, "wt"); |
---|
582 | if (!out) error = GB_IO_error("writing", *resulting_outname); |
---|
583 | else { |
---|
584 | XML_Document *xml = 0; |
---|
585 | |
---|
586 | int allCount = 0; |
---|
587 | for (GBDATA *gb_species = esd->first_species(); |
---|
588 | gb_species && !error; |
---|
589 | gb_species = esd->next_species(gb_species)) |
---|
590 | { |
---|
591 | allCount++; |
---|
592 | } |
---|
593 | |
---|
594 | arb_progress progress(allCount); |
---|
595 | progress.auto_subtitles("Saving species"); |
---|
596 | |
---|
597 | if (efo.export_mode == EXPORT_XML) { |
---|
598 | xml = new XML_Document("ARB_SEQ_EXPORT", "arb_seq_export.dtd", out); |
---|
599 | { |
---|
600 | xml->add_attribute("database", db_name); |
---|
601 | } |
---|
602 | xml->add_attribute("export_date", ARB_date_string()); |
---|
603 | { |
---|
604 | XML_Comment rem("There is a basic version of ARB_seq_export.dtd in $ARBHOME/lib/dtd\n" |
---|
605 | "but you might need to expand it by yourself,\n" |
---|
606 | "because the ARB-database may contain any kind of fields."); |
---|
607 | } |
---|
608 | } |
---|
609 | |
---|
610 | for (GBDATA *gb_species = esd->first_species(); |
---|
611 | gb_species && !error; |
---|
612 | gb_species = esd->next_species(gb_species)) |
---|
613 | { |
---|
614 | switch (efo.export_mode) { |
---|
615 | case EXPORT_USING_FORM: |
---|
616 | error = export_species_using_form(out, gb_species, efo.form); |
---|
617 | break; |
---|
618 | |
---|
619 | case EXPORT_XML: |
---|
620 | error = XML_recursive(gb_species); |
---|
621 | break; |
---|
622 | |
---|
623 | case EXPORT_INVALID: |
---|
624 | sio_assert(0); |
---|
625 | break; |
---|
626 | } |
---|
627 | progress.inc_and_check_user_abort(error); |
---|
628 | } |
---|
629 | |
---|
630 | delete xml; |
---|
631 | fclose(out); |
---|
632 | } |
---|
633 | } |
---|
634 | } |
---|
635 | |
---|
636 | if (error) { |
---|
637 | if (*resulting_outname) { |
---|
638 | GB_unlink_or_warn(*resulting_outname, NULL); |
---|
639 | freenull(*resulting_outname); |
---|
640 | } |
---|
641 | } |
---|
642 | |
---|
643 | export_depth--; |
---|
644 | |
---|
645 | return error; |
---|
646 | } |
---|
647 | |
---|
648 | static GB_ERROR export_format_multiple(const char* dbname, const char *formname, const char *outname, bool multiple, char **resulting_outname) { |
---|
649 | GB_ERROR error = 0; |
---|
650 | |
---|
651 | if (multiple) { |
---|
652 | char *path, *name, *suffix; |
---|
653 | GB_split_full_path(outname, &path, NULL, &name, &suffix); |
---|
654 | *resulting_outname = NULL; |
---|
655 | |
---|
656 | arb_progress progress("Exporting data", esd->count_species()); |
---|
657 | |
---|
658 | for (GBDATA *gb_species = esd->first_species(); |
---|
659 | gb_species && !error; |
---|
660 | gb_species = esd->next_species(gb_species)) |
---|
661 | { |
---|
662 | const char *species_name = GBT_read_char_pntr(gb_species, "name"); |
---|
663 | if (!species_name) error = "Can't export unnamed species"; |
---|
664 | else { |
---|
665 | const char *fname = GB_append_suffix(GBS_global_string("%s_%s", name, species_name), suffix); |
---|
666 | progress.subtitle(fname); |
---|
667 | |
---|
668 | char *oname = strdup(GB_concat_path(path, fname)); |
---|
669 | char *res_oname; |
---|
670 | |
---|
671 | esd->set_single_mode(gb_species); // means: only export 'gb_species' |
---|
672 | error = export_format_single(dbname, formname, oname, &res_oname); |
---|
673 | esd->set_single_mode(NULL); |
---|
674 | |
---|
675 | if (!*resulting_outname || // not set yet |
---|
676 | (res_oname && strcmp(*resulting_outname, res_oname)>0)) // or smaller than set one |
---|
677 | { |
---|
678 | reassign(*resulting_outname, res_oname); |
---|
679 | } |
---|
680 | |
---|
681 | free(res_oname); |
---|
682 | free(oname); |
---|
683 | } |
---|
684 | |
---|
685 | progress.inc_and_check_user_abort(error); |
---|
686 | } |
---|
687 | |
---|
688 | free(suffix); |
---|
689 | free(name); |
---|
690 | free(path); |
---|
691 | } |
---|
692 | else { |
---|
693 | arb_progress progress("Exporting data"); |
---|
694 | error = export_format_single(dbname, formname, outname, resulting_outname); |
---|
695 | } |
---|
696 | |
---|
697 | return error; |
---|
698 | } |
---|
699 | |
---|
700 | GB_ERROR SEQIO_export_by_format(GBDATA *gb_main, int marked_only, AP_filter *filter, |
---|
701 | int cut_stop_codon, int compress, const char *dbname, |
---|
702 | const char *formname, const char *outname, int multiple, |
---|
703 | char **real_outname) |
---|
704 | { |
---|
705 | sio_assert(!GB_have_error()); |
---|
706 | |
---|
707 | GB_ERROR error = filter->is_invalid(); |
---|
708 | if (!error) { |
---|
709 | esd = new export_sequence_data(gb_main, marked_only, filter, cut_stop_codon, compress); |
---|
710 | sio_assert(esd->getAliLen()>0); |
---|
711 | |
---|
712 | GB_set_export_sequence_hook(exported_sequence); |
---|
713 | |
---|
714 | error = esd->detectVerticalGaps(); |
---|
715 | if (!error) { |
---|
716 | error = export_format_multiple(dbname, formname, outname, multiple, real_outname); |
---|
717 | } |
---|
718 | |
---|
719 | GB_set_export_sequence_hook(0); |
---|
720 | } |
---|
721 | delete esd; |
---|
722 | esd = 0; |
---|
723 | |
---|
724 | sio_assert(!GB_have_error()); |
---|
725 | return error; |
---|
726 | } |
---|
727 | |
---|
728 | // -------------------------------------------------------------------------------- |
---|
729 | |
---|
730 | #ifdef UNIT_TESTS |
---|
731 | #include <test_unit.h> |
---|
732 | |
---|
733 | // uncomment to auto-update exported files |
---|
734 | // (needed once after changing database or export formats) |
---|
735 | // #define TEST_AUTO_UPDATE |
---|
736 | #define TEST_AUTO_UPDATE_ONLY_MISSING // do auto-update only if file is missing |
---|
737 | |
---|
// expects that export format 'filename' can be read without error
#define TEST_EXPORT_FORMAT(filename,load_complete_form)                                 \
    do {                                                                                \
        export_format efo;                                                              \
        TEST_EXPECT_NO_ERROR(read_export_format((&efo), filename, load_complete_form)); \
    } while(0)

---|
746 | void TEST_sequence_export() { |
---|
747 | GB_shell shell; |
---|
748 | arb_suppress_progress silence; |
---|
749 | |
---|
750 | GBDATA *gb_main = GB_open("TEST_loadsave.arb", "r"); |
---|
751 | char *export_dir = nulldup(GB_path_in_ARBLIB("export")); |
---|
752 | StrArray eft; |
---|
753 | GBS_read_dir(eft, export_dir, "*.eft"); |
---|
754 | |
---|
755 | AP_filter *filter = NULL; |
---|
756 | { |
---|
757 | GB_transaction ta(gb_main); |
---|
758 | |
---|
759 | char *ali = GBT_get_default_alignment(gb_main); |
---|
760 | size_t alilen = GBT_get_alignment_len(gb_main, ali); |
---|
761 | filter = new AP_filter(alilen); |
---|
762 | |
---|
763 | GBT_mark_all(gb_main, 0); |
---|
764 | GBDATA *gb_species = GBT_find_species(gb_main, "MetMazei"); |
---|
765 | TEST_REJECT_NULL(gb_species); |
---|
766 | |
---|
767 | GB_write_flag(gb_species, 1); // mark |
---|
768 | free(ali); |
---|
769 | } |
---|
770 | for (int e = 0; eft[e]; ++e) { |
---|
771 | for (int complete = 0; complete <= 1; ++complete) { |
---|
772 | TEST_EXPORT_FORMAT(eft[e], complete); |
---|
773 | if (complete) { |
---|
774 | const char *outname = "impexp/exported"; |
---|
775 | char *used_outname = NULL; |
---|
776 | |
---|
777 | { |
---|
778 | GB_transaction ta(gb_main); |
---|
779 | TEST_EXPECT_NO_ERROR(SEQIO_export_by_format(gb_main, 1, filter, 0, 0, "DBname", eft[e], outname, 0, &used_outname)); |
---|
780 | } |
---|
781 | |
---|
782 | const char *name = strrchr(eft[e], '/'); |
---|
783 | TEST_REJECT_NULL(name); |
---|
784 | name++; |
---|
785 | |
---|
786 | char *expected = GBS_global_string_copy("impexp/%s.exported", name); |
---|
787 | |
---|
788 | #if defined(TEST_AUTO_UPDATE) |
---|
789 | #if defined(TEST_AUTO_UPDATE_ONLY_MISSING) |
---|
790 | if (GB_is_regularfile(expected)) { |
---|
791 | TEST_EXPECT_TEXTFILE_DIFFLINES_IGNORE_DATES(outname, expected, 0); |
---|
792 | } |
---|
793 | else |
---|
794 | #else |
---|
795 | { |
---|
796 | TEST_COPY_FILE(outname, expected); |
---|
797 | } |
---|
798 | #endif |
---|
799 | #else |
---|
800 | TEST_EXPECT_TEXTFILE_DIFFLINES_IGNORE_DATES(outname, expected, 0); |
---|
801 | // see ../../UNIT_TESTER/run/impexp |
---|
802 | #endif // TEST_AUTO_UPDATE |
---|
803 | TEST_EXPECT_ZERO_OR_SHOW_ERRNO(unlink(outname)); |
---|
804 | |
---|
805 | free(expected); |
---|
806 | free(used_outname); |
---|
807 | } |
---|
808 | } |
---|
809 | } |
---|
810 | |
---|
811 | delete filter; |
---|
812 | free(export_dir); |
---|
813 | GB_close(gb_main); |
---|
814 | } |
---|
815 | |
---|
816 | #endif // UNIT_TESTS |
---|