root/trunk/CONVERTALN/fconv.cxx

Revision 8607, 9.9 KB (checked in by westram, 5 weeks ago)

merge from e4fix [8135] [8136] [8137] [8138] [8139] [8140] [8141] [8142] [8143] [8144] [8222]
this revives the reverted patches [8129] [8130] [8131] [8132]

  • fixes
    • some free/delete mismatches
    • wrong definition of ORF objects (Level was no bit value)
    • amino consensus (failed for columns only containing 'C')
  • rename
    • AA_sequence_term -> orf_term
    • ED4_sequence_terminal_basic -> ED4_abstract_sequence_terminal
  • cleaned up hierarchy dumps
  • tweaked is_terminal()/to_terminal()
  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
Line 
1// ------------- File format converting subroutine -------------
2
3#include "defs.h"
4#include "fun.h"
5#include "global.h"
6#include <static_assert.h>
7
8static const char *format2name(Format type) {
9    switch (type) {
10        case EMBL:      return "EMBL";
11        case GCG:       return "GCG";
12        case GENBANK:   return "GENBANK";
13        case MACKE:     return "MACKE";
14        case NEXUS:     return "NEXUS";
15        case PHYLIP:    return "PHYLIP";
16        case FASTDNAML: return "FASTDNAML";
17        case PRINTABLE: return "PRINTABLE";
18        case SWISSPROT: return "SWISSPROT";
19
20        case UNKNOWN: ca_assert(0);
21    }
22    return NULL;
23}
24
25void throw_conversion_not_supported(Format inType, Format ouType) { // __ATTR__NORETURN
26    throw_errorf(90, "Conversion from %s to %s is not supported",
27                 format2name(inType), format2name(ouType));
28}
29void throw_conversion_failure(Format inType, Format ouType) { // __ATTR__NORETURN
30    throw_errorf(91, "Conversion from %s to %s fails",
31                 format2name(inType), format2name(ouType));
32}
33void throw_conversion_not_implemented(Format inType, Format ouType) { // __ATTR__NORETURN
34    throw_errorf(92, "Conversion from %s to %s is not implemented (but is expected to be here)",
35                 format2name(inType), format2name(ouType));
36}
37void throw_unsupported_input_format(Format inType) {  // __ATTR__NORETURN
38    throw_errorf(93, "Unsupported input format %s", format2name(inType));
39}
40
41void throw_incomplete_entry() { // __ATTR__NORETURN
42    throw_error(84, "Reached EOF before complete entry has been read");
43}
44
// Bookkeeping for log_processed() (inspected by the unit tests below):
static int log_processed_counter = 0; // number of calls to log_processed()
static int log_seq_counter       = 0; // sum of all 'seqCount' values passed so far

// Records that 'seqCount' sequences have been processed.
// If CALOG is defined, the count is additionally reported on stderr.
void log_processed(int seqCount) {
#if defined(CALOG)
    fprintf(stderr, "Total %d sequences have been processed\n", seqCount);
#endif // CALOG

    log_processed_counter += 1;
    log_seq_counter        = log_seq_counter + seqCount;
}
56
57// --------------------------------------------------------------------------------
58
59#ifdef UNIT_TESTS
60#include <arbdbt.h> // before test_unit.h!
61#include <arb_file.h>
62#include <test_unit.h>
63
64
65#define TEST_THROW // comment out to temp. disable intentional throws
66
67struct FormatSpec {
68    Format      type;           // GENBANK, MACKE, ...
69    const char *name;
70    const char *testfile;       // existing testfile (or NULL)
71    int         sequence_count; // number of sequences in 'testfile'
72};
73
74#define FORMATSPEC_OUT_ONLY(tag)                { tag, #tag, NULL, 1 }
75#define FORMATSPEC_GOT______(tag,file)          { tag, #tag, "impexp/" file ".eft.exported", 1 }
76#define FORMATSPEC_GOT_PLAIN(tag,file,seqcount) { tag, #tag, "impexp/" file, seqcount }
77
78static FormatSpec format_spec[] = {
79    // input formats
80    // FORMATSPEC_GOT______(GENBANK, "genbank"),
81    FORMATSPEC_GOT_PLAIN(GENBANK, "genbank.input", 3),
82    FORMATSPEC_GOT_PLAIN(EMBL, "embl.input", 5),
83    FORMATSPEC_GOT_PLAIN(MACKE, "macke.input", 5),
84    FORMATSPEC_GOT_PLAIN(SWISSPROT, "swissprot.input", 1), // SWISSPROT
85
86    // output formats
87    FORMATSPEC_OUT_ONLY(GCG),
88    FORMATSPEC_OUT_ONLY(NEXUS),
89    FORMATSPEC_OUT_ONLY(PHYLIP),
90    FORMATSPEC_OUT_ONLY(PRINTABLE),
91};
92static const int fcount = ARRAY_ELEMS(format_spec);
93
// Indices into format_spec[] (entries are listed in the same order as there).
enum FormatNum {
    // formats with a test input file
    NUM_GENBANK = 0,
    NUM_EMBL,
    NUM_MACKE,
    NUM_SWISSPROT,

    // formats without a test input file
    NUM_GCG,
    NUM_NEXUS,
    NUM_PHYLIP,
    NUM_PRINTABLE,

    FORMATNUM_COUNT, // number of formats listed above
};
108
// Test expectations for one conversion (one cell of the from/to matrix).
struct Capabilities {
    bool supported;    // conversion is expected to succeed
    bool neverReturns; // conversion would never return (and therefore is not run)

    // By default a conversion is considered supported and returning.
    Capabilities() : supported(true), neverReturns(false) {}

    // Returns true if the conversion shall be run by the test.
    // Conversions that never return are always skipped; unsupported
    // conversions are skipped as well unless TEST_THROW is defined.
    bool shall_be_tested() {
        bool testable = !neverReturns;
#if !defined(TEST_THROW)
        testable = testable && supported;
#endif
        return testable;
    }
};
126
127static Capabilities cap[fcount][fcount];
128#define CAP(from,to) (cap[NUM_##from][NUM_##to])
129
130#define TYPE(f)  format_spec[f].type
131#define NAME(f)  format_spec[f].name
132#define INPUT(f) format_spec[f].testfile
133#define EXSEQ(f) format_spec[f].sequence_count
134
135// ----------------------------------
136//      update .expected files ?
137
138// #define TEST_AUTO_UPDATE // never does update if undefined
139// #define UPDATE_ONLY_IF_MISSING
140#define UPDATE_ONLY_IF_MORE_THAN_DATE_DIFFERS
141
142inline bool more_than_date_differs(const char *file, const char *expected) {
143    return !GB_test_textfile_difflines(file, expected, 0, 1);
144}
145
// Decides whether the file 'expected' shall be overwritten by 'file'.
// Without TEST_AUTO_UPDATE this never happens.
#if defined(TEST_AUTO_UPDATE)
inline bool want_auto_update(const char *file, const char *expected) {
    // mark both parameters as used (each may be unreferenced, depending on the UPDATE_ONLY_... settings)
    (void)file;
    (void)expected;

    bool update_wanted = true;

#if defined(UPDATE_ONLY_IF_MISSING)
    if (update_wanted) update_wanted = !GB_is_regularfile(expected);
#endif
#if defined(UPDATE_ONLY_IF_MORE_THAN_DATE_DIFFERS)
    if (update_wanted) update_wanted = more_than_date_differs(file, expected);
#endif
    return update_wanted;
}
#else // !TEST_AUTO_UPDATE
inline bool want_auto_update(const char * /* file */, const char * /* expected */) {
    return false;
}
#endif
166
167static void test_expected_conversion(const char *file, const char *flavor) {
168    char *expected;
169    if (flavor) expected = GBS_global_string_copy("%s.%s.expected", file, flavor);
170    else expected = GBS_global_string_copy("%s.expected", file);
171
172    bool shall_update = want_auto_update(file, expected);
173    if (shall_update) {
174        // TEST_ASSERT(0); // completely avoid real update
175        TEST_ASSERT_ZERO_OR_SHOW_ERRNO(system(GBS_global_string("cp %s %s", file, expected)));
176    }
177    else {
178        TEST_ASSERT(!more_than_date_differs(file, expected));
179    }
180    free(expected);
181}
182
183static const char *test_convert(const char *inf, const char *outf, Format inType, Format ouType) {
184    const char *error = NULL;
185    try {
186        convert(FormattedFile(inf ? inf : "infilename", inType),
187                FormattedFile(outf ? outf : "outfilename", ouType));
188    }
189    catch (Convaln_exception& exc) { error = GBS_global_string("%s (#%i)", exc.get_msg(), exc.get_code()); }
190    return error;
191}
192
193static void test_convert_by_format_num(int from, int to) {
194    char *toFile = GBS_global_string_copy("impexp/conv.%s_2_%s", NAME(from), NAME(to));
195    if (GB_is_regularfile(toFile)) TEST_ASSERT_ZERO_OR_SHOW_ERRNO(unlink(toFile));
196
197    int old_processed_counter = log_processed_counter;
198    int old_seq_counter       = log_seq_counter;
199
200    const char *error = test_convert(INPUT(from), toFile, TYPE(from), TYPE(to));
201
202    int converted_seqs = log_seq_counter-old_seq_counter;
203    int expected_seqs  = EXSEQ(from);
204    if (to == NUM_GCG) expected_seqs = 1; // we stop after first file (useless to generate numerous files)
205
206    Capabilities& me = cap[from][to];
207
208    if (me.supported) {
209        if (error) TEST_ERROR("convert() reports error: '%s' (for supported conversion)", error);
210        TEST_ASSERT(GB_is_regularfile(toFile));
211        TEST_ASSERT_EQUAL(converted_seqs, expected_seqs);
212        TEST_ASSERT_EQUAL(log_processed_counter, old_processed_counter+1);
213
214        TEST_ASSERT_LOWER_EQUAL(10, GB_size_of_file(toFile)); // less than 10 bytes
215        test_expected_conversion(toFile, NULL);
216        TEST_ASSERT_ZERO_OR_SHOW_ERRNO(unlink(toFile));
217    }
218    else {
219        if (!error) TEST_ERROR("No error for unsupported conversion '%s'", GBS_global_string("%s -> %s", NAME(from), NAME(to)));
220        TEST_ASSERT(strstr(error, "supported")); // wring error
221        TEST_ASSERT(!GB_is_regularfile(toFile)); // unsupported produced output
222    }
223    TEST_ASSERT(me.supported == !error);
224
225#if defined(TEST_THROW)
226    {
227        // test if conversion from empty and text file fails
228
229        const char *fromFile = "general/empty.input";
230
231        error = test_convert(fromFile, toFile, TYPE(from), TYPE(to));
232        TEST_ASSERT(error);
233
234        fromFile = "general/text.input";
235        error = test_convert(fromFile, toFile, TYPE(from), TYPE(to));
236        TEST_ASSERT(error);
237    }
238#endif
239
240    free(toFile);
241}
242
243inline bool isInputFormat(int num) { return is_input_format(TYPE(num)); }
244
245static void init_cap() {
246    for (int from = 0; from<fcount; from++) {
247        for (int to = 0; to<fcount; to++) {
248            Capabilities& me = cap[from][to];
249            if (!isInputFormat(from)) me.supported = false;
250        }
251    }
252}
253
// Marks the conversion t1 -> t2 as unsupported in the capability matrix.
// 't1' has to be an input format (asserted).
// The body is wrapped into do/while(0), so the macro always expands to exactly
// one statement (e.g. when used as the body of an unbraced 'if').
#define NOT_SUPPORTED(t1,t2) do {                       \
        TEST_ASSERT(isInputFormat(NUM_##t1));           \
        cap[NUM_##t1][NUM_##t2].supported = false;      \
    } while (0)
255
256static int will_convert(int from) {
257    int will = 0;
258    for (int to = 0; to<fcount; to++) {
259        Capabilities& me = cap[from][to];
260        if (me.supported && me.shall_be_tested()) {
261            will++;
262        }
263    }
264    return will;
265}
266
267void TEST_converter() {
268    COMPILE_ASSERT(FORMATNUM_COUNT == fcount);
269
270    init_cap();
271
272    NOT_SUPPORTED(GENBANK, SWISSPROT);
273    NOT_SUPPORTED(EMBL, SWISSPROT);
274    NOT_SUPPORTED(SWISSPROT, GENBANK);
275    NOT_SUPPORTED(SWISSPROT, EMBL);
276
277    int possible     = 0;
278    int tested       = 0;
279    int unsupported  = 0;
280    int neverReturns = 0;
281
282    for (int from = 0; from<fcount; from++) {
283        TEST_ANNOTATE_ASSERT(GBS_global_string("while converting from '%s'", NAME(from)));
284        if (isInputFormat(from)) {
285            if (will_convert(from)<1) {
286                TEST_ERROR("Conversion from %s seems unsupported", NAME(from));
287            }
288        }
289        for (int to = 0; to<fcount; to++) {
290            possible++;
291            Capabilities& me = cap[from][to];
292
293            if (me.shall_be_tested()) {
294                TEST_ANNOTATE_ASSERT(GBS_global_string("while converting %s -> %s", NAME(from), NAME(to)));
295                test_convert_by_format_num(from, to);
296                tested++;
297            }
298
299            unsupported  += !me.supported;
300            neverReturns += me.neverReturns;
301        }
302    }
303
304    fprintf(stderr,
305            "Conversion test summary:\n"
306            " - formats:      %3i\n"
307            " - conversions:  %3i (possible)\n"
308            " - unsupported:  %3i\n"
309            " - tested:       %3i\n"
310            " - neverReturns: %3i (would never return - not checked)\n"
311            " - converted:    %3i\n",
312            fcount,
313            possible,
314            unsupported,
315            tested,
316            neverReturns,
317            tested-unsupported);
318
319    int untested = possible - tested;
320    TEST_ASSERT_EQUAL(untested, neverReturns);
321}
322
323#endif // UNIT_TESTS
Note: See TracBrowser for help on using the browser.