1 | // ============================================================= // |
---|
2 | // // |
---|
3 | // File : seq_export.cxx // |
---|
4 | // Purpose : // |
---|
5 | // // |
---|
6 | // Institute of Microbiology (Technical University Munich) // |
---|
7 | // http://www.arb-home.de/ // |
---|
8 | // // |
---|
9 | // ============================================================= // |
---|
10 | |
---|
11 | #include "seqio.hxx" |
---|
12 | |
---|
13 | #include <AP_filter.hxx> |
---|
14 | #include <xferset.h> |
---|
15 | |
---|
16 | #include <arbdbt.h> |
---|
17 | #include <gb_aci.h> |
---|
18 | |
---|
19 | #include <arb_strarray.h> |
---|
20 | #include <arb_file.h> |
---|
21 | #include <arb_diff.h> |
---|
22 | #include <arb_progress.h> |
---|
23 | #include <arb_global_defs.h> |
---|
24 | |
---|
25 | #include <xml.hxx> |
---|
26 | |
---|
27 | #include <unistd.h> |
---|
28 | |
---|
29 | #define sio_assert(cond) arb_assert(cond) |
---|
30 | |
---|
31 | using std::string; |
---|
32 | using namespace SEQIO; |
---|
33 | using namespace FieldTransfer; |
---|
34 | |
---|
35 | // --------------------------------- |
---|
36 | // internal export commands |
---|
37 | |
---|
38 | enum EXPORT_CMD { |
---|
39 | // real formats |
---|
40 | EXPORT_XML, |
---|
41 | |
---|
42 | EXPORT_INVALID, |
---|
43 | EXPORT_USING_FORM, // default mode (has to be last entry in enum) |
---|
44 | }; |
---|
45 | |
---|
46 | static const char *internal_export_commands[] = { |
---|
47 | "xml_write", |
---|
48 | NULp |
---|
49 | }; |
---|
50 | |
---|
51 | static EXPORT_CMD check_internal(const char *command) { |
---|
52 | EXPORT_CMD cmd = EXPORT_INVALID; |
---|
53 | for (int i = 0; internal_export_commands[i]; ++i) { |
---|
54 | if (strcmp(command, internal_export_commands[i]) == 0) { |
---|
55 | cmd = static_cast<EXPORT_CMD>(i); |
---|
56 | } |
---|
57 | } |
---|
58 | return cmd; |
---|
59 | } |
---|
60 | |
---|
61 | // ---------------------- |
---|
62 | // export_format |
---|
63 | |
---|
64 | struct export_format : virtual Noncopyable { |
---|
65 | char *system; |
---|
66 | char *pre_format; |
---|
67 | char *suffix; |
---|
68 | char *description; // (multiline) description of filter |
---|
69 | char *form; // transformed export expression (part behind 'BEGIN') |
---|
70 | |
---|
71 | EXPORT_CMD export_mode; |
---|
72 | |
---|
73 | export_format() |
---|
74 | : system(NULp), |
---|
75 | pre_format(NULp), |
---|
76 | suffix(NULp), |
---|
77 | description(NULp), |
---|
78 | form(NULp), |
---|
79 | export_mode(EXPORT_XML) |
---|
80 | {} |
---|
81 | ~export_format() { |
---|
82 | free(system); |
---|
83 | free(pre_format); |
---|
84 | free(suffix); |
---|
85 | free(description); |
---|
86 | free(form); |
---|
87 | } |
---|
88 | }; |
---|
89 | |
---|
90 | static GB_ERROR read_export_format(export_format *efo, const char *file, bool load_complete_form) { |
---|
91 | GB_ERROR error = NULp; |
---|
92 | |
---|
93 | if (!file || !file[0]) { |
---|
94 | error = "No export format selected"; |
---|
95 | } |
---|
96 | else { |
---|
97 | char *fullfile = NULp; |
---|
98 | if (GB_is_regularfile(file)) { // prefer files that are completely specified (full/rel path) |
---|
99 | fullfile = strdup(GB_canonical_path(file)); |
---|
100 | } |
---|
101 | else { |
---|
102 | fullfile = nulldup(GB_path_in_ARBHOME(file)); // fallback to ARBHOME-relative specification |
---|
103 | } |
---|
104 | |
---|
105 | FILE *in = fopen(fullfile, "r"); |
---|
106 | |
---|
107 | if (!in) error = GB_IO_error("reading export form", fullfile); |
---|
108 | else { |
---|
109 | efo->export_mode = EXPORT_USING_FORM; // default mode |
---|
110 | { |
---|
111 | bool seen_BEGIN = false; |
---|
112 | char *s1, *s2; |
---|
113 | size_t linenumber = 0; |
---|
114 | |
---|
115 | while (!error && !seen_BEGIN && read_string_pair(in, s1, s2, linenumber)) { |
---|
116 | if (!strcmp(s1, "SYSTEM")) { reassign(efo->system, s2); } |
---|
117 | else if (!strcmp(s1, "PRE_FORMAT")) { reassign(efo->pre_format, s2); } |
---|
118 | else if (!strcmp(s1, "SUFFIX")) { reassign(efo->suffix, s2); } |
---|
119 | else if (!strcmp(s1, "DESCRIPTION")) { appendTo(efo->description, '\n', s2); } |
---|
120 | else if (!strcmp(s1, "INTERNAL")) { |
---|
121 | efo->export_mode = check_internal(s2); |
---|
122 | if (efo->export_mode == EXPORT_INVALID) { |
---|
123 | error = GBS_global_string("Unknown INTERNAL command '%s'", s2); |
---|
124 | } |
---|
125 | } |
---|
126 | else if (!strcmp(s1, "BEGIN")) { |
---|
127 | if (efo->export_mode != EXPORT_USING_FORM) { |
---|
128 | error = "'BEGIN' not allowed when 'INTERNAL' is used"; |
---|
129 | } |
---|
130 | else { |
---|
131 | seen_BEGIN = true; |
---|
132 | } |
---|
133 | } |
---|
134 | else { |
---|
135 | error = GBS_global_string("Unknown command '%s'", s1); |
---|
136 | } |
---|
137 | |
---|
138 | // add error location |
---|
139 | if (error) error = GBS_global_string("%s in line #%zu", error, linenumber); |
---|
140 | |
---|
141 | free(s2); |
---|
142 | free(s1); |
---|
143 | } |
---|
144 | } |
---|
145 | |
---|
146 | if (!error && load_complete_form && efo->export_mode == EXPORT_USING_FORM) { |
---|
147 | // now 'in' points to line behind 'BEGIN' |
---|
148 | char *form = GB_read_fp(in); // read rest of file |
---|
149 | |
---|
150 | // Join lines that end with \ with next line. |
---|
151 | // Replace ' = ' and ':' by '\=' and '\:' |
---|
152 | efo->form = GBS_string_eval(form, "\\\\\n=:\\==\\\\\\=:*=\\*\\=*1:\\:=\\\\\\:"); |
---|
153 | if (!efo->form) error = GB_failedTo_error("evaluate part below 'BEGIN'", NULp, GB_await_error()); |
---|
154 | free(form); |
---|
155 | } |
---|
156 | |
---|
157 | // some checks for incompatible commands |
---|
158 | if (!error) { |
---|
159 | if (efo->system && !efo->pre_format) error = "Missing 'PRE_FORMAT' (needed by 'SYSTEM')"; |
---|
160 | else if (efo->pre_format && !efo->system) error = "Missing 'SYSTEM' (needed by 'PRE_FORMAT')"; |
---|
161 | else if (efo->export_mode != EXPORT_USING_FORM) { |
---|
162 | if (efo->system) error = "'SYSTEM' is not allowed together with 'INTERNAL'"; |
---|
163 | if (efo->pre_format) error = "'PRE_FORMAT' is not allowed together with 'INTERNAL'"; |
---|
164 | } |
---|
165 | } |
---|
166 | |
---|
167 | error = GB_failedTo_error("read export format", fullfile, error); |
---|
168 | fclose(in); |
---|
169 | } |
---|
170 | free(fullfile); |
---|
171 | } |
---|
172 | |
---|
173 | return error; |
---|
174 | } |
---|
175 | |
---|
176 | // ---------------------------------------- |
---|
177 | // export sequence helper class |
---|
178 | |
---|
179 | class SpeciesSelector : virtual Noncopyable { |
---|
180 | ExportWhich which; |
---|
181 | const char *one_species; |
---|
182 | |
---|
183 | public: |
---|
184 | SpeciesSelector(ExportWhich which_, const char *one_species_) : |
---|
185 | which(which_), |
---|
186 | one_species(one_species_) |
---|
187 | {} |
---|
188 | GBDATA *select_first(GBDATA *gb_main) const { |
---|
189 | GBDATA *gb_species = NULp; |
---|
190 | switch (which) { |
---|
191 | case EBF_ALL: gb_species = GBT_first_species(gb_main); break; |
---|
192 | case EBF_MARKED: gb_species = GBT_first_marked_species(gb_main); break; |
---|
193 | case EBF_ONE: gb_species = GBT_find_species(gb_main, one_species); break; |
---|
194 | } |
---|
195 | return gb_species; |
---|
196 | } |
---|
197 | GBDATA *select_next(GBDATA *gb_previous) const { |
---|
198 | GBDATA *gb_species = NULp; |
---|
199 | switch (which) { |
---|
200 | case EBF_ALL: gb_species = GBT_next_species(gb_previous); break; |
---|
201 | case EBF_MARKED: gb_species = GBT_next_marked_species(gb_previous); break; |
---|
202 | case EBF_ONE: break; |
---|
203 | } |
---|
204 | return gb_species; |
---|
205 | } |
---|
206 | }; |
---|
207 | |
---|
208 | class export_sequence_data : virtual Noncopyable { // @@@ simplify using FilteredExport? |
---|
209 | GBDATA *last_species_read; |
---|
210 | char *seq; |
---|
211 | size_t len; |
---|
212 | char *error; |
---|
213 | |
---|
214 | GBDATA *gb_main; |
---|
215 | char *ali; |
---|
216 | |
---|
217 | SpeciesSelector whichSpecies; |
---|
218 | |
---|
219 | size_t species_count; |
---|
220 | AP_filter *filter; |
---|
221 | bool cut_stop_codon; |
---|
222 | int compress; // 0 = no;1 = vertical gaps; 2 = all gaps; |
---|
223 | |
---|
224 | long max_ali_len; // length of alignment |
---|
225 | size_t *export_column; // list of exported seq data positions |
---|
226 | size_t columns; // how many columns get exported |
---|
227 | |
---|
228 | GBDATA *single_species; // if set to species -> first/next only return this species (used to export to multiple files) |
---|
229 | |
---|
230 | public: |
---|
231 | |
---|
232 | export_sequence_data(GBDATA *Gb_Main, ExportWhich which, const char *one_species, AP_filter* Filter, bool CutStopCodon, int Compress) : |
---|
233 | last_species_read(NULp), |
---|
234 | seq(NULp), |
---|
235 | len(0), |
---|
236 | error(NULp), |
---|
237 | gb_main(Gb_Main), |
---|
238 | whichSpecies(which, one_species), |
---|
239 | species_count(size_t(-1)), |
---|
240 | filter(Filter), |
---|
241 | cut_stop_codon(CutStopCodon), |
---|
242 | compress(Compress), |
---|
243 | export_column(NULp), |
---|
244 | columns(0), |
---|
245 | single_species(NULp) |
---|
246 | { |
---|
247 | ali = GBT_get_default_alignment(gb_main); |
---|
248 | max_ali_len = GBT_get_alignment_len(gb_main, ali); |
---|
249 | |
---|
250 | if (cut_stop_codon) { |
---|
251 | GB_alignment_type ali_type = GBT_get_alignment_type(gb_main, ali); |
---|
252 | if (ali_type != GB_AT_AA) { |
---|
253 | GB_warning("Cutting stop codon makes no sense - ignored"); |
---|
254 | cut_stop_codon = false; |
---|
255 | } |
---|
256 | } |
---|
257 | sio_assert(filter); |
---|
258 | |
---|
259 | if (max_ali_len>=0 && filter->get_length() < size_t(max_ali_len)) { |
---|
260 | GB_warningf("Warning: Your filter is shorter than the alignment (%zu<%li)", |
---|
261 | filter->get_length(), max_ali_len); |
---|
262 | max_ali_len = filter->get_length(); |
---|
263 | } |
---|
264 | } |
---|
265 | |
---|
266 | ~export_sequence_data() { |
---|
267 | delete [] export_column; |
---|
268 | delete [] seq; |
---|
269 | free(error); |
---|
270 | free(ali); |
---|
271 | } |
---|
272 | |
---|
273 | const char *getAlignment() const { return ali; } |
---|
274 | long getAliLen() const { return max_ali_len; } |
---|
275 | GBDATA *get_gb_main() const { sio_assert(gb_main); return gb_main; } |
---|
276 | |
---|
277 | void set_single_mode(GBDATA *gb_species) { single_species = gb_species; } |
---|
278 | bool in_single_mode() const { return single_species; } |
---|
279 | |
---|
280 | GBDATA *first_species() const { return single_species ? single_species : whichSpecies.select_first(gb_main); } |
---|
281 | GBDATA *next_species(GBDATA *gb_prev) const { return single_species ? NULp : whichSpecies.select_next(gb_prev); } |
---|
282 | |
---|
283 | const unsigned char *get_seq_data(GBDATA *gb_species, size_t& slen, GB_ERROR& error) const; |
---|
284 | static bool isGap(char c) { return GAP::is_std_gap(c); } |
---|
285 | |
---|
286 | size_t count_species() { |
---|
287 | sio_assert(!in_single_mode()); |
---|
288 | if (species_count == size_t(-1)) { |
---|
289 | species_count = 0; |
---|
290 | for (GBDATA *gb_species = whichSpecies.select_first(gb_main); |
---|
291 | gb_species; |
---|
292 | gb_species = whichSpecies.select_next(gb_species)) |
---|
293 | { |
---|
294 | species_count++; |
---|
295 | } |
---|
296 | } |
---|
297 | return species_count; |
---|
298 | } |
---|
299 | |
---|
300 | GB_ERROR detectVerticalGaps(); |
---|
301 | const char *get_export_sequence(GBDATA *gb_species, size_t& seq_len, GB_ERROR& error); |
---|
302 | }; |
---|
303 | |
---|
304 | const unsigned char *export_sequence_data::get_seq_data(GBDATA *gb_species, size_t& slen, GB_ERROR& err) const { |
---|
305 | const char *data = NULp; |
---|
306 | GBDATA *gb_seq = GBT_find_sequence(gb_species, ali); |
---|
307 | |
---|
308 | if (!gb_seq) { |
---|
309 | err = GBS_global_string_copy("No data in alignment '%s' of species '%s'", ali, GBT_get_name_or_description(gb_species)); |
---|
310 | slen = 0; |
---|
311 | } |
---|
312 | else { |
---|
313 | data = GB_read_char_pntr(gb_seq); |
---|
314 | slen = GB_read_count(gb_seq); |
---|
315 | err = NULp; |
---|
316 | } |
---|
317 | return (const unsigned char *)data; |
---|
318 | } |
---|
319 | |
---|
320 | |
---|
321 | GB_ERROR export_sequence_data::detectVerticalGaps() { |
---|
322 | GB_ERROR err = NULp; |
---|
323 | |
---|
324 | sio_assert(!in_single_mode()); |
---|
325 | |
---|
326 | if (compress == 1) { // compress vertical gaps! |
---|
327 | // @@@ detection of vertical gaps should better be done either by AP_filter directly or by FilteredExport |
---|
328 | |
---|
329 | size_t gap_columns = filter->get_filtered_length(); |
---|
330 | size_t *gap_column = new size_t[gap_columns+1]; |
---|
331 | |
---|
332 | const size_t *filterpos_2_seqpos = filter->get_filterpos_2_seqpos(); |
---|
333 | memcpy(gap_column, filterpos_2_seqpos, gap_columns*sizeof(*gap_column)); |
---|
334 | gap_column[gap_columns] = max_ali_len; |
---|
335 | |
---|
336 | arb_progress progress("Calculating vertical gaps", count_species()); |
---|
337 | |
---|
338 | for (GBDATA *gb_species = first_species(); |
---|
339 | gb_species && !err; |
---|
340 | gb_species = next_species(gb_species)) |
---|
341 | { |
---|
342 | size_t slen; |
---|
343 | const unsigned char *sdata = get_seq_data(gb_species, slen, err); |
---|
344 | |
---|
345 | if (!err) { |
---|
346 | size_t j = 0; |
---|
347 | size_t i; |
---|
348 | for (i = 0; i<gap_columns; ++i) { |
---|
349 | if (isGap(sdata[gap_column[i]])) { |
---|
350 | gap_column[j++] = gap_column[i]; // keep gap column |
---|
351 | } |
---|
352 | // otherwise it's overwritten |
---|
353 | } |
---|
354 | |
---|
355 | sio_assert(i >= j); |
---|
356 | size_t skipped_columns = i-j; |
---|
357 | sio_assert(gap_columns >= skipped_columns); |
---|
358 | gap_columns -= skipped_columns; |
---|
359 | } |
---|
360 | progress.inc_and_check_user_abort(err); |
---|
361 | } |
---|
362 | |
---|
363 | if (!err) { |
---|
364 | columns = filter->get_filtered_length() - gap_columns; |
---|
365 | export_column = new size_t[columns]; |
---|
366 | |
---|
367 | size_t gpos = 0; // index into array of vertical gaps |
---|
368 | size_t epos = 0; // index into array of exported columns |
---|
369 | size_t flen = filter->get_filtered_length(); |
---|
370 | size_t a; |
---|
371 | for (a = 0; a<flen && gpos<gap_columns; ++a) { |
---|
372 | size_t fpos = filterpos_2_seqpos[a]; |
---|
373 | if (fpos == gap_column[gpos]) { // only gaps here -> skip column |
---|
374 | gpos++; |
---|
375 | } |
---|
376 | else { // not only gaps -> use column |
---|
377 | sio_assert(fpos<gap_column[gpos]); |
---|
378 | sio_assert(epos < columns); // got more columns than expected |
---|
379 | export_column[epos++] = fpos; |
---|
380 | } |
---|
381 | } |
---|
382 | for (; a<flen; ++a) { // LOOP_VECTORIZED |
---|
383 | export_column[epos++] = filterpos_2_seqpos[a]; |
---|
384 | } |
---|
385 | |
---|
386 | sio_assert(epos == columns); |
---|
387 | } |
---|
388 | |
---|
389 | delete [] gap_column; |
---|
390 | } |
---|
391 | else { // compress all or none (simply use filter) |
---|
392 | const size_t *filterpos_2_seqpos = filter->get_filterpos_2_seqpos(); |
---|
393 | |
---|
394 | columns = filter->get_filtered_length(); |
---|
395 | export_column = new size_t[columns]; |
---|
396 | |
---|
397 | memcpy(export_column, filterpos_2_seqpos, columns*sizeof(*filterpos_2_seqpos)); |
---|
398 | } |
---|
399 | |
---|
400 | seq = new char[columns+1]; |
---|
401 | |
---|
402 | return err; |
---|
403 | } |
---|
404 | |
---|
405 | const char *export_sequence_data::get_export_sequence(GBDATA *gb_species, size_t& seq_len, GB_ERROR& err) { |
---|
406 | if (gb_species != last_species_read) { |
---|
407 | freenull(error); |
---|
408 | |
---|
409 | // read + filter a new species |
---|
410 | GB_ERROR curr_error; |
---|
411 | const unsigned char *data = get_seq_data(gb_species, len, curr_error); |
---|
412 | |
---|
413 | if (curr_error) { |
---|
414 | error = strdup(curr_error); |
---|
415 | } |
---|
416 | else { |
---|
417 | size_t i; |
---|
418 | const uchar *simplify = filter->get_simplify_table(); |
---|
419 | |
---|
420 | if (cut_stop_codon) { |
---|
421 | const unsigned char *stop_codon = (const unsigned char *)memchr(data, '*', len); |
---|
422 | if (stop_codon) { |
---|
423 | len = stop_codon-data; |
---|
424 | } |
---|
425 | } |
---|
426 | |
---|
427 | if (compress == 2) { // compress all gaps |
---|
428 | size_t j = 0; |
---|
429 | for (i = 0; i<columns; ++i) { |
---|
430 | size_t seq_pos = export_column[i]; |
---|
431 | if (seq_pos<len) { |
---|
432 | unsigned char c = data[seq_pos]; |
---|
433 | if (!isGap(c)) { |
---|
434 | seq[j++] = simplify[c]; |
---|
435 | } |
---|
436 | } |
---|
437 | } |
---|
438 | seq[j] = 0; |
---|
439 | len = j; |
---|
440 | } |
---|
441 | else { // compress vertical or compress none (simply use filter in both cases) |
---|
442 | for (i = 0; i<columns; ++i) { |
---|
443 | size_t seq_pos = export_column[i]; |
---|
444 | if (seq_pos<len) { |
---|
445 | seq[i] = simplify[data[seq_pos]]; |
---|
446 | } |
---|
447 | else { |
---|
448 | seq[i] = simplify['.']; |
---|
449 | } |
---|
450 | } |
---|
451 | seq[i] = 0; |
---|
452 | len = columns; |
---|
453 | } |
---|
454 | } |
---|
455 | } |
---|
456 | |
---|
457 | err = error; |
---|
458 | if (error) { |
---|
459 | seq_len = 0; |
---|
460 | return NULp; |
---|
461 | } |
---|
462 | |
---|
463 | seq_len = len; |
---|
464 | return seq; |
---|
465 | } |
---|
466 | |
---|
467 | // ---------------------------------------- |
---|
468 | // exported_sequence is hooked into ACI temporary (provides result of command 'export_sequence') |
---|
469 | // which is the sequence filtered and compressed according to settings in the export window |
---|
470 | |
---|
471 | static export_sequence_data *esd = NULp; |
---|
472 | |
---|
473 | static const char *exported_sequence(GBDATA *gb_species, size_t *seq_len, GB_ERROR *error) { |
---|
474 | sio_assert(esd); |
---|
475 | return esd->get_export_sequence(gb_species, *seq_len, *error); |
---|
476 | } |
---|
477 | |
---|
478 | static GB_ERROR XML_recursive(GBDATA *gbd, int depth) { |
---|
479 | GB_ERROR error = NULp; |
---|
480 | const char *key_name = GB_read_key_pntr(gbd); |
---|
481 | XML_Tag *tag = NULp; |
---|
482 | bool descend = true; |
---|
483 | |
---|
484 | if (depth == 1 && strncmp(key_name, "ali_", 4) == 0) { // hack needed if seq-quality information exists |
---|
485 | sio_assert(esd); |
---|
486 | descend = false; // do not descend into alignments |
---|
487 | if (strcmp(esd->getAlignment(), key_name) == 0) { // the wanted alignment |
---|
488 | |
---|
489 | tag = new XML_Tag("ALIGNMENT"); |
---|
490 | tag->add_attribute("name", key_name+4); |
---|
491 | |
---|
492 | GBDATA *gb_species = GB_get_father(gbd); |
---|
493 | size_t len; |
---|
494 | const char *seq = exported_sequence(gb_species, &len, &error); |
---|
495 | |
---|
496 | if (seq) { |
---|
497 | XML_Tag dtag("data"); |
---|
498 | { XML_Text seqText(seq); } |
---|
499 | } |
---|
500 | } |
---|
501 | } |
---|
502 | else { |
---|
503 | tag = new XML_Tag(key_name); |
---|
504 | |
---|
505 | if (GB_is_container(gbd)) { |
---|
506 | const char *name = GBT_read_char_pntr(gbd, "name"); |
---|
507 | if (name) tag->add_attribute("name", name); |
---|
508 | } |
---|
509 | } |
---|
510 | |
---|
511 | if (descend) { |
---|
512 | if (GB_read_type(gbd) == GB_DB) { |
---|
513 | for (GBDATA *gb_child = GB_child(gbd); gb_child && !error; gb_child = GB_nextChild(gb_child)) { |
---|
514 | const char *sub_key_name = GB_read_key_pntr(gb_child); |
---|
515 | |
---|
516 | if (strcmp(sub_key_name, "name") != 0) { // do not recurse for "name" (is handled above) |
---|
517 | error = XML_recursive(gb_child, depth+1); |
---|
518 | } |
---|
519 | } |
---|
520 | } |
---|
521 | else { |
---|
522 | char *content = GB_read_as_string(gbd); |
---|
523 | if (content) { |
---|
524 | XML_Text text(content); |
---|
525 | free(content); |
---|
526 | } |
---|
527 | else { |
---|
528 | tag->add_attribute("error", "unsavable"); |
---|
529 | } |
---|
530 | } |
---|
531 | } |
---|
532 | |
---|
533 | delete tag; |
---|
534 | return error; |
---|
535 | } |
---|
536 | |
---|
537 | static GB_ERROR export_species_using_form(FILE *out, const char *form, const GBL_call_env& callEnv) { // @@@ pass preparsed command (form) |
---|
538 | GB_ERROR error = NULp; |
---|
539 | char *pars = GBS_string_eval_in_env(" ", form, callEnv); |
---|
540 | if (!pars) error = GB_await_error(); |
---|
541 | else { |
---|
542 | char *p; |
---|
543 | char *o = pars; |
---|
544 | while ((p = GBS_find_string(o, "$$DELETE_LINE$$", 0))) { |
---|
545 | char *l, *r; |
---|
546 | for (l = p; l>o; l--) if (*l=='\n') break; |
---|
547 | r = strchr(p, '\n'); if (!r) r = p + strlen(p); |
---|
548 | fwrite(o, 1, l-o, out); |
---|
549 | o = r; |
---|
550 | } |
---|
551 | fputs(o, out); |
---|
552 | free(pars); |
---|
553 | } |
---|
554 | return error; |
---|
555 | } |
---|
556 | |
---|
557 | static GB_ERROR export_write_species(GBDATA *gb_species, FILE *out, const GBL_env& env, const export_format& efo) { |
---|
558 | GB_ERROR error = NULp; |
---|
559 | switch (efo.export_mode) { |
---|
560 | case EXPORT_USING_FORM: { |
---|
561 | GBL_call_env callEnv(gb_species, env); |
---|
562 | error = export_species_using_form(out, efo.form, callEnv); |
---|
563 | break; |
---|
564 | } |
---|
565 | |
---|
566 | case EXPORT_XML: |
---|
567 | error = XML_recursive(gb_species, 0); |
---|
568 | break; |
---|
569 | |
---|
570 | case EXPORT_INVALID: |
---|
571 | sio_assert(0); |
---|
572 | break; |
---|
573 | } |
---|
574 | return error; |
---|
575 | } |
---|
576 | |
---|
577 | static GB_ERROR export_format_single(const char *db_name, const char *formname, const char *outname, char **resulting_outname, RuleSetPtr ruleset) { |
---|
578 | // Exports sequences specified by 'esd' (module global variable) |
---|
579 | // to format specified by 'formname'. |
---|
580 | // |
---|
581 | // if 'outname' == NULp -> export species to temporary file, otherwise to 'outname'. |
---|
582 | // Full path of generated file is returned in 'resulting_outname' |
---|
583 | |
---|
584 | static int export_depth = 0; |
---|
585 | export_depth++; |
---|
586 | |
---|
587 | *resulting_outname = NULp; |
---|
588 | |
---|
589 | export_format efo; |
---|
590 | GB_ERROR error = read_export_format(&efo, formname, true); |
---|
591 | |
---|
592 | if (!error) { |
---|
593 | if (!outname) { // if no 'outname' is given -> export to temporary file |
---|
594 | char *unique_outname = GB_unique_filename("exported", efo.suffix); |
---|
595 | *resulting_outname = GB_create_tempfile(unique_outname); |
---|
596 | free(unique_outname); |
---|
597 | |
---|
598 | if (!*resulting_outname) error = GB_await_error(); |
---|
599 | } |
---|
600 | else *resulting_outname = strdup(outname); |
---|
601 | } |
---|
602 | |
---|
603 | sio_assert(error || *resulting_outname); |
---|
604 | |
---|
605 | if (!error) { |
---|
606 | if (efo.pre_format) { |
---|
607 | // Export data using format 'pre_format'. |
---|
608 | // Afterwards convert to wanted format using 'system'. |
---|
609 | |
---|
610 | sio_assert(efo.system); |
---|
611 | |
---|
612 | char *intermediate_export; |
---|
613 | error = export_format_single(db_name, efo.pre_format, NULp, &intermediate_export, ruleset); |
---|
614 | if (!error) { |
---|
615 | sio_assert(GB_is_privatefile(intermediate_export, false)); |
---|
616 | |
---|
617 | GB_informationf("Converting to %s", efo.suffix); |
---|
618 | |
---|
619 | char *srt = GBS_global_string_copy("$<=%s:$>=%s", intermediate_export, *resulting_outname); |
---|
620 | char *sys = GBS_string_eval(efo.system, srt); |
---|
621 | |
---|
622 | GB_informationf("exec '%s'", efo.system); |
---|
623 | error = GBK_system(sys); |
---|
624 | |
---|
625 | GB_unlink_or_warn(intermediate_export, &error); |
---|
626 | |
---|
627 | free(sys); |
---|
628 | free(srt); |
---|
629 | } |
---|
630 | free(intermediate_export); |
---|
631 | } |
---|
632 | else { |
---|
633 | FILE *out = fopen(*resulting_outname, "wt"); |
---|
634 | if (!out) error = GB_IO_error("writing", *resulting_outname); |
---|
635 | else { |
---|
636 | XML_Document *xml = NULp; |
---|
637 | |
---|
638 | long allCount = 0; |
---|
639 | for (GBDATA *gb_species = esd->first_species(); |
---|
640 | gb_species && !error; |
---|
641 | gb_species = esd->next_species(gb_species)) |
---|
642 | { |
---|
643 | allCount++; |
---|
644 | } |
---|
645 | |
---|
646 | arb_progress progress(allCount); |
---|
647 | progress.auto_subtitles("Saving species"); |
---|
648 | |
---|
649 | if (efo.export_mode == EXPORT_XML) { |
---|
650 | xml = new XML_Document("ARB_SEQ_EXPORT", "arb_seq_export.dtd", out); |
---|
651 | { |
---|
652 | xml->add_attribute("database", db_name); |
---|
653 | } |
---|
654 | xml->add_attribute("export_date", ARB_date_string()); |
---|
655 | { |
---|
656 | XML_Comment rem("There is a basic version of ARB_seq_export.dtd in $ARBHOME/lib/dtd\n" |
---|
657 | "but you might need to expand it by yourself,\n" |
---|
658 | "because the ARB-database may contain any kind of fields."); |
---|
659 | } |
---|
660 | } |
---|
661 | |
---|
662 | GBL_env env(esd->get_gb_main(), NULp); |
---|
663 | |
---|
664 | for (GBDATA *gb_species = esd->first_species(); |
---|
665 | gb_species && !error; |
---|
666 | gb_species = esd->next_species(gb_species)) |
---|
667 | { |
---|
668 | if (ruleset.isSet()) { |
---|
669 | GB_topSecurityLevel unsecured(env.get_gb_main()); // needed to clone species (overwrites name .. in temporary clone) |
---|
670 | ItemClonedByRuleSet clone(gb_species, CLONE_ITEM_SPECIES, ruleset, RENAME_ITEM_WHILE_TEMP_CLONE_EXISTS, NULp, NULp); |
---|
671 | if (clone.has_error()) { |
---|
672 | error = clone.get_error(); |
---|
673 | } |
---|
674 | else { |
---|
675 | GB_previousSecurityLevel user(unsecured); // run export itself with normal security |
---|
676 | error = export_write_species(clone.get_clone(), out, env, efo); |
---|
677 | } |
---|
678 | } |
---|
679 | else { |
---|
680 | error = export_write_species(gb_species, out, env, efo); |
---|
681 | } |
---|
682 | progress.inc_and_check_user_abort(error); |
---|
683 | } |
---|
684 | |
---|
685 | delete xml; |
---|
686 | fclose(out); |
---|
687 | } |
---|
688 | } |
---|
689 | } |
---|
690 | |
---|
691 | if (error) { |
---|
692 | if (*resulting_outname) { |
---|
693 | GB_unlink_or_warn(*resulting_outname, NULp); |
---|
694 | freenull(*resulting_outname); |
---|
695 | } |
---|
696 | } |
---|
697 | |
---|
698 | export_depth--; |
---|
699 | |
---|
700 | return error; |
---|
701 | } |
---|
702 | |
---|
703 | static GB_ERROR export_format_multiple(const char* dbname, const char *formname, const char *outname, bool multiple, char **resulting_outname, RuleSetPtr ruleset) { |
---|
704 | GB_ERROR error = NULp; |
---|
705 | |
---|
706 | if (multiple) { |
---|
707 | char *path, *name, *suffix; |
---|
708 | GB_split_full_path(outname, &path, NULp, &name, &suffix); |
---|
709 | *resulting_outname = NULp; |
---|
710 | |
---|
711 | arb_progress progress("Exporting data", esd->count_species()); |
---|
712 | |
---|
713 | for (GBDATA *gb_species = esd->first_species(); |
---|
714 | gb_species && !error; |
---|
715 | gb_species = esd->next_species(gb_species)) |
---|
716 | { |
---|
717 | const char *species_name = GBT_read_char_pntr(gb_species, "name"); |
---|
718 | if (!species_name) error = "Can't export unnamed species"; |
---|
719 | else { |
---|
720 | const char *fname = GB_append_suffix(GBS_global_string("%s_%s", name, species_name), suffix); |
---|
721 | progress.subtitle(fname); |
---|
722 | |
---|
723 | char *oname = strdup(GB_concat_path(path, fname)); |
---|
724 | char *res_oname; |
---|
725 | |
---|
726 | esd->set_single_mode(gb_species); // means: only export 'gb_species' |
---|
727 | error = export_format_single(dbname, formname, oname, &res_oname, ruleset); |
---|
728 | esd->set_single_mode(NULp); |
---|
729 | |
---|
730 | if (!*resulting_outname || // not set yet |
---|
731 | (res_oname && strcmp(*resulting_outname, res_oname)>0)) // or smaller than set one |
---|
732 | { |
---|
733 | reassign(*resulting_outname, res_oname); |
---|
734 | } |
---|
735 | |
---|
736 | free(res_oname); |
---|
737 | free(oname); |
---|
738 | } |
---|
739 | |
---|
740 | progress.inc_and_check_user_abort(error); |
---|
741 | } |
---|
742 | |
---|
743 | free(suffix); |
---|
744 | free(name); |
---|
745 | free(path); |
---|
746 | } |
---|
747 | else { |
---|
748 | arb_progress progress("Exporting data"); |
---|
749 | error = export_format_single(dbname, formname, outname, resulting_outname, ruleset); |
---|
750 | } |
---|
751 | |
---|
752 | return error; |
---|
753 | } |
---|
754 | |
---|
755 | namespace SEQIO { |
---|
756 | |
---|
757 | GB_ERROR export_by_format(GBDATA *gb_main, ExportWhich which, const char *one_species, |
---|
758 | AP_filter *filter, int cut_stop_codon, int compress, |
---|
759 | const char *dbname, const char *formname, const char *field_transfer_set, |
---|
760 | const char *outname, int multiple, char **real_outname) |
---|
761 | { |
---|
762 | sio_assert(!GB_have_error()); |
---|
763 | |
---|
764 | if (field_transfer_set && !field_transfer_set[0]) { // empty 'field_transfer_set' given |
---|
765 | field_transfer_set = NULp; // -> handle like NULp |
---|
766 | } |
---|
767 | |
---|
768 | GB_ERROR error = filter->is_invalid(); |
---|
769 | |
---|
770 | RuleSetPtr ruleset; |
---|
771 | if (!error) { |
---|
772 | if (field_transfer_set) { // if specified load ruleset: |
---|
773 | ErrorOrRuleSetPtr loaded = RuleSet::loadFrom(field_transfer_set); |
---|
774 | |
---|
775 | if (loaded.hasError()) { |
---|
776 | ARB_ERROR lerror = loaded.getError(); |
---|
777 | error = lerror.deliver(); |
---|
778 | } |
---|
779 | else { |
---|
780 | ruleset = loaded.getValue(); |
---|
781 | } |
---|
782 | } |
---|
783 | } |
---|
784 | |
---|
785 | if (!error) { |
---|
786 | esd = new export_sequence_data(gb_main, which, one_species, filter, cut_stop_codon, compress); |
---|
787 | sio_assert(esd->getAliLen()>0); |
---|
788 | |
---|
789 | GB_set_export_sequence_hook(exported_sequence); |
---|
790 | |
---|
791 | error = esd->detectVerticalGaps(); |
---|
792 | if (!error) { |
---|
793 | error = export_format_multiple(dbname, formname, outname, multiple, real_outname, ruleset); |
---|
794 | if (error) error = GBS_static_string(error); // error is member of export_sequence_data -> copy to static buffer |
---|
795 | } |
---|
796 | |
---|
797 | GB_set_export_sequence_hook(NULp); |
---|
798 | } |
---|
799 | delete esd; |
---|
800 | esd = NULp; |
---|
801 | |
---|
802 | sio_assert(!GB_have_error()); |
---|
803 | return error; |
---|
804 | } |
---|
805 | |
---|
806 | GB_ERROR get_exportFormat_information(const char *eft_formname, ExportFormatInfo& info) { |
---|
807 | export_format efs; |
---|
808 | GB_ERROR error = read_export_format(&efs, eft_formname, false); |
---|
809 | |
---|
810 | if (!error) { |
---|
811 | if (efs.suffix) { |
---|
812 | info.suffix = efs.suffix; |
---|
813 | efs.suffix = NULp; |
---|
814 | } |
---|
815 | if (efs.description) { |
---|
816 | info.description = efs.description; |
---|
817 | efs.description = NULp; |
---|
818 | } |
---|
819 | } |
---|
820 | |
---|
821 | return error; |
---|
822 | } |
---|
823 | |
---|
824 | char *get_exportFormat_evalForm(const char *eft_formname, GB_ERROR& error) { |
---|
825 | // load copy of form that gets evaluated during export. |
---|
826 | export_format efs; |
---|
827 | error = read_export_format(&efs, eft_formname, true); |
---|
828 | if (!error && efs.form) { |
---|
829 | if (efs.pre_format) { |
---|
830 | sio_assert(strcmp(efs.form, "*=") == 0); // caused by eval in read_export_format? |
---|
831 | return get_exportFormat_evalForm(efs.pre_format, error); |
---|
832 | } |
---|
833 | |
---|
834 | sio_assert(efs.pre_format == NULp); |
---|
835 | return ARB_strdup(efs.form); |
---|
836 | } |
---|
837 | // failed to load form |
---|
838 | |
---|
839 | sio_assert(efs.form == NULp); |
---|
840 | sio_assert(efs.pre_format == NULp); |
---|
841 | if (!error) { |
---|
842 | if (efs.export_mode != EXPORT_USING_FORM) { |
---|
843 | if (efs.export_mode == EXPORT_XML) { |
---|
844 | error = "exports all fields"; |
---|
845 | } |
---|
846 | else { |
---|
847 | error = "unsupported filter type"; |
---|
848 | } |
---|
849 | } |
---|
850 | else { |
---|
851 | error = "no form loaded"; |
---|
852 | } |
---|
853 | } |
---|
854 | |
---|
855 | sio_assert(error); |
---|
856 | if (error) { |
---|
857 | char *nameOnly = NULp; |
---|
858 | GB_split_full_path(eft_formname, NULp, &nameOnly, NULp, NULp); |
---|
859 | |
---|
860 | const char *shownName = nameOnly ? nameOnly : eft_formname; |
---|
861 | error = GBS_global_string("%s (%s)", error, shownName); |
---|
862 | |
---|
863 | free(nameOnly); |
---|
864 | } |
---|
865 | return NULp; |
---|
866 | } |
---|
867 | |
---|
868 | }; |
---|
869 | |
---|
870 | // -------------------------------------------------------------------------------- |
---|
871 | |
---|
872 | #ifdef UNIT_TESTS |
---|
873 | #include <test_unit.h> |
---|
874 | |
---|
875 | // uncomment to auto-update exported files |
---|
876 | // (needed once after changing database or export formats) |
---|
877 | // #define TEST_AUTO_UPDATE |
---|
878 | #define TEST_AUTO_UPDATE_ONLY_MISSING // do auto-update only if file is missing |
---|
879 | |
---|
880 | void TEST_sequence_export() { |
---|
881 | GB_shell shell; |
---|
882 | arb_suppress_progress silence; |
---|
883 | |
---|
884 | GBDATA *gb_main = GB_open("TEST_loadsave.arb", "r"); |
---|
885 | char *export_dir = nulldup(GB_path_in_ARBLIB("export")); |
---|
886 | StrArray eft; |
---|
887 | GBS_read_dir(eft, export_dir, "*.eft"); |
---|
888 | |
---|
889 | AP_filter *filter = NULp; |
---|
890 | { |
---|
891 | GB_transaction ta(gb_main); |
---|
892 | |
---|
893 | char *ali = GBT_get_default_alignment(gb_main); |
---|
894 | size_t alilen = GBT_get_alignment_len(gb_main, ali); |
---|
895 | filter = new AP_filter(alilen); |
---|
896 | |
---|
897 | GBT_mark_all(gb_main, 0); |
---|
898 | GBDATA *gb_species = GBT_find_species(gb_main, "MetMazei"); |
---|
899 | TEST_REJECT_NULL(gb_species); |
---|
900 | |
---|
901 | GB_write_flag(gb_species, 1); // mark |
---|
902 | free(ali); |
---|
903 | } |
---|
904 | for (int e = 0; eft[e]; ++e) { |
---|
905 | for (int complete = 0; complete <= 1; ++complete) { |
---|
906 | const char *name = strrchr(eft[e], '/'); |
---|
907 | TEST_REJECT_NULL(name); |
---|
908 | name++; |
---|
909 | |
---|
910 | TEST_ANNOTATE(name); |
---|
911 | |
---|
912 | { |
---|
913 | export_format efo; |
---|
914 | TEST_EXPECT_NO_ERROR(read_export_format(&efo, eft[e], complete)); |
---|
915 | if (strcmp(name, "fasta_wacc.eft") == 0) { // test description of one filter |
---|
916 | TEST_EXPECT_EQUAL(efo.description, |
---|
917 | "Exports sequences to fasta-format.\n" |
---|
918 | "Header exported as: >ID SEQLENGTH bp SEQTYPE ACC"); |
---|
919 | } |
---|
920 | } |
---|
921 | |
---|
922 | if (complete) { |
---|
923 | const char *outname = "impexp/exported"; |
---|
924 | char *used_outname = NULp; |
---|
925 | |
---|
926 | { |
---|
927 | GB_transaction ta(gb_main); |
---|
928 | TEST_EXPECT_NO_ERROR(export_by_format(gb_main, EBF_MARKED, NULp, |
---|
929 | filter, 0, 0, |
---|
930 | "DBname", eft[e], NULp, // @@@ currently only tests export w/o FTS (pass FTS for some formats? or separately) |
---|
931 | outname, 0, &used_outname)); |
---|
932 | } |
---|
933 | |
---|
934 | char *expected = GBS_global_string_copy("impexp/%s.exported", name); |
---|
935 | |
---|
936 | #if defined(TEST_AUTO_UPDATE) |
---|
937 | #if defined(TEST_AUTO_UPDATE_ONLY_MISSING) |
---|
938 | if (GB_is_regularfile(expected)) { |
---|
939 | TEST_EXPECT_TEXTFILE_DIFFLINES_IGNORE_DATES(outname, expected, 0); |
---|
940 | } |
---|
941 | else |
---|
942 | #else |
---|
943 | { |
---|
944 | TEST_COPY_FILE(outname, expected); |
---|
945 | } |
---|
946 | #endif |
---|
947 | #else |
---|
948 | TEST_EXPECT_TEXTFILE_DIFFLINES_IGNORE_DATES(outname, expected, 0); |
---|
949 | // see ../../UNIT_TESTER/run/impexp |
---|
950 | #endif // TEST_AUTO_UPDATE |
---|
951 | TEST_EXPECT_ZERO_OR_SHOW_ERRNO(unlink(outname)); |
---|
952 | |
---|
953 | free(expected); |
---|
954 | free(used_outname); |
---|
955 | } |
---|
956 | } |
---|
957 | } |
---|
958 | |
---|
959 | delete filter; |
---|
960 | free(export_dir); |
---|
961 | GB_close(gb_main); |
---|
962 | } |
---|
963 | |
---|
964 | #endif // UNIT_TESTS |
---|