| 1 | // ------------- File format converting subroutine ------------- |
|---|
| 2 | |
|---|
| 3 | #include "defs.h" |
|---|
| 4 | #include "fun.h" |
|---|
| 5 | #include "global.h" |
|---|
| 6 | #include <static_assert.h> |
|---|
| 7 | #include <unistd.h> |
|---|
| 8 | #include <arb_diff.h> |
|---|
| 9 | |
|---|
| 10 | static const char *format2name(Format type) { |
|---|
| 11 | switch (type) { |
|---|
| 12 | case EMBL: return "EMBL"; |
|---|
| 13 | case GCG: return "GCG"; |
|---|
| 14 | case GENBANK: return "GENBANK"; |
|---|
| 15 | case MACKE: return "MACKE"; |
|---|
| 16 | case NEXUS: return "NEXUS"; |
|---|
| 17 | case PHYLIP: return "PHYLIP"; |
|---|
| 18 | case FASTDNAML: return "FASTDNAML"; |
|---|
| 19 | case PRINTABLE: return "PRINTABLE"; |
|---|
| 20 | case SWISSPROT: return "SWISSPROT"; |
|---|
| 21 | |
|---|
| 22 | case UNKNOWN: ca_assert(0); |
|---|
| 23 | } |
|---|
| 24 | return NULp; |
|---|
| 25 | } |
|---|
| 26 | |
|---|
| 27 | void throw_conversion_not_supported(Format inType, Format ouType) { // __ATTR__NORETURN |
|---|
| 28 | throw_errorf(90, "Conversion from %s to %s is not supported", |
|---|
| 29 | format2name(inType), format2name(ouType)); |
|---|
| 30 | } |
|---|
| 31 | void throw_conversion_failure(Format inType, Format ouType) { // __ATTR__NORETURN |
|---|
| 32 | throw_errorf(91, "Conversion from %s to %s fails", |
|---|
| 33 | format2name(inType), format2name(ouType)); |
|---|
| 34 | } |
|---|
| 35 | void throw_conversion_not_implemented(Format inType, Format ouType) { // __ATTR__NORETURN |
|---|
| 36 | throw_errorf(92, "Conversion from %s to %s is not implemented (but is expected to be here)", |
|---|
| 37 | format2name(inType), format2name(ouType)); |
|---|
| 38 | } |
|---|
| 39 | void throw_unsupported_input_format(Format inType) { // __ATTR__NORETURN |
|---|
| 40 | throw_errorf(93, "Unsupported input format %s", format2name(inType)); |
|---|
| 41 | } |
|---|
| 42 | |
|---|
| 43 | void throw_incomplete_entry() { // __ATTR__NORETURN |
|---|
| 44 | throw_error(84, "Reached EOF before complete entry has been read"); |
|---|
| 45 | } |
|---|
| 46 | |
|---|
// Bookkeeping for log_processed(); the unit tests below compare these
// counters before and after a conversion.
static int log_processed_counter = 0; // number of log_processed() calls
static int log_seq_counter       = 0; // sum of all 'seqCount' values passed

/**
 * Record that a batch of sequences has been processed.
 *
 * @param seqCount  number of sequences in the finished batch; added to
 *                  log_seq_counter. Also printed to stderr if CALOG is defined.
 */
void log_processed(int seqCount) {
#ifdef CALOG
    fprintf(stderr, "Total %d sequences have been processed\n", seqCount);
#endif // CALOG

    log_seq_counter += seqCount;
    ++log_processed_counter;
}
| 58 | |
|---|
| 59 | // -------------------------------------------------------------------------------- |
|---|
| 60 | |
|---|
| 61 | #ifdef UNIT_TESTS |
|---|
| 62 | #include <arbdbt.h> // before test_unit.h! |
|---|
| 63 | #include <arb_file.h> |
|---|
| 64 | #include <test_unit.h> |
|---|
| 65 | |
|---|
| 66 | |
|---|
#define TEST_THROW // comment out to temporarily disable the tests that intentionally provoke throws
| 68 | |
|---|
| 69 | struct FormatSpec { |
|---|
| 70 | Format type; // GENBANK, MACKE, ... |
|---|
| 71 | const char *name; |
|---|
| 72 | const char *testfile; // existing testfile (or NULp) |
|---|
| 73 | int sequence_count; // number of sequences in 'testfile' |
|---|
| 74 | }; |
|---|
| 75 | |
|---|
| 76 | #define FORMATSPEC_OUT_ONLY(tag) { tag, #tag, NULp, 1 } |
|---|
| 77 | #define FORMATSPEC_GOT______(tag,file) { tag, #tag, "impexp/" file ".eft.exported", 1 } |
|---|
| 78 | #define FORMATSPEC_GOT_PLAIN(tag,file,seqcount) { tag, #tag, "impexp/" file, seqcount } |
|---|
| 79 | |
|---|
| 80 | static FormatSpec format_spec[] = { |
|---|
| 81 | // input formats |
|---|
| 82 | // FORMATSPEC_GOT______(GENBANK, "genbank"), |
|---|
| 83 | FORMATSPEC_GOT_PLAIN(GENBANK, "genbank.input", 3), |
|---|
| 84 | FORMATSPEC_GOT_PLAIN(EMBL, "embl.input", 5), |
|---|
| 85 | FORMATSPEC_GOT_PLAIN(MACKE, "macke.input", 5), |
|---|
| 86 | FORMATSPEC_GOT_PLAIN(SWISSPROT, "swissprot.input", 1), // SWISSPROT |
|---|
| 87 | |
|---|
| 88 | // output formats |
|---|
| 89 | FORMATSPEC_OUT_ONLY(GCG), |
|---|
| 90 | FORMATSPEC_OUT_ONLY(NEXUS), |
|---|
| 91 | FORMATSPEC_OUT_ONLY(PHYLIP), |
|---|
| 92 | FORMATSPEC_OUT_ONLY(PRINTABLE), |
|---|
| 93 | }; |
|---|
| 94 | static const int fcount = ARRAY_ELEMS(format_spec); |
|---|
| 95 | |
|---|
// Row indices into 'format_spec' (same order as the table rows).
enum FormatNum {
    // rows with a testfile
    NUM_GENBANK,
    NUM_EMBL,
    NUM_MACKE,
    NUM_SWISSPROT,

    // rows without a testfile
    NUM_GCG,
    NUM_NEXUS,
    NUM_PHYLIP,

    NUM_PRINTABLE,

    FORMATNUM_COUNT, // number of indices above
};
| 110 | |
|---|
// Expected behavior of one conversion (source format -> target format).
struct Capabilities {
    bool supported;    // conversion is expected to succeed
    bool neverReturns; // conversion is excluded from testing

    // By default a conversion counts as supported and testable.
    Capabilities()
        : supported(true),
          neverReturns(false)
    {}

    // Tells whether the test shall run this conversion.
    // With TEST_THROW defined, unsupported conversions are run as well.
    bool shall_be_tested() {
        if (neverReturns) return false;
#if defined(TEST_THROW)
        return true;
#else // !defined(TEST_THROW)
        return supported;
#endif
    }
};
| 128 | |
|---|
| 129 | static Capabilities cap[fcount][fcount]; |
|---|
| 130 | #define CAP(from,to) (cap[NUM_##from][NUM_##to]) |
|---|
| 131 | |
|---|
| 132 | #define TYPE(f) format_spec[f].type |
|---|
| 133 | #define NAME(f) format_spec[f].name |
|---|
| 134 | #define INPUT(f) format_spec[f].testfile |
|---|
| 135 | #define EXSEQ(f) format_spec[f].sequence_count |
|---|
| 136 | |
|---|
// ----------------------------------
//      update .expected files ?
//
// Switches evaluated by want_auto_update():

// #define TEST_AUTO_UPDATE // never does update if undefined
// #define UPDATE_ONLY_IF_MISSING
#define UPDATE_ONLY_IF_MORE_THAN_DATE_DIFFERS
| 143 | |
|---|
| 144 | inline bool more_than_date_differs(const char *file, const char *expected) { |
|---|
| 145 | return ARB_textfiles_have_difflines(file, expected, 0, TextDiffMode(TDM_NOT_DIFF_LINECOUNT|TDM_IGNORE_TIMESTAMPS)); |
|---|
| 146 | } |
|---|
| 147 | |
|---|
// Decide whether the file 'expected' shall be overwritten with 'file'.
// Without TEST_AUTO_UPDATE the answer is always 'false'.
#if defined(TEST_AUTO_UPDATE)
inline bool want_auto_update(const char *file, const char *expected) {
    // both parameters are unused if none of the UPDATE_ONLY_* switches is defined
    (void)file;
    (void)expected;

#if defined(UPDATE_ONLY_IF_MISSING)
    if (GB_is_regularfile(expected)) return false;
#endif
#if defined(UPDATE_ONLY_IF_MORE_THAN_DATE_DIFFERS)
    if (!more_than_date_differs(file, expected)) return false;
#endif
    return true;
}
#else // !TEST_AUTO_UPDATE
inline bool want_auto_update(const char * /* file */, const char * /* expected */) {
    return false;
}
#endif
| 168 | |
|---|
| 169 | static void test_expected_conversion(const char *file, const char *flavor) { |
|---|
| 170 | char *expected; |
|---|
| 171 | if (flavor) expected = GBS_global_string_copy("%s.%s.expected", file, flavor); |
|---|
| 172 | else expected = GBS_global_string_copy("%s.expected", file); |
|---|
| 173 | |
|---|
| 174 | bool shall_update = want_auto_update(file, expected); |
|---|
| 175 | if (shall_update) { |
|---|
| 176 | // TEST_EXPECT(0); // completely avoid real update |
|---|
| 177 | TEST_EXPECT_ZERO_OR_SHOW_ERRNO(system(GBS_global_string("cp %s %s", file, expected))); |
|---|
| 178 | } |
|---|
| 179 | else { |
|---|
| 180 | TEST_REJECT(more_than_date_differs(file, expected)); |
|---|
| 181 | } |
|---|
| 182 | free(expected); |
|---|
| 183 | } |
|---|
| 184 | |
|---|
| 185 | static const char *test_convert(const char *inf, const char *outf, Format inType, Format ouType) { |
|---|
| 186 | const char *error = NULp; |
|---|
| 187 | try { |
|---|
| 188 | convert(FormattedFile(inf ? inf : "infilename", inType), |
|---|
| 189 | FormattedFile(outf ? outf : "outfilename", ouType)); |
|---|
| 190 | } |
|---|
| 191 | catch (Convaln_exception& exc) { error = GBS_global_string("%s (#%i)", exc.get_msg(), exc.get_code()); } |
|---|
| 192 | return error; |
|---|
| 193 | } |
|---|
| 194 | |
|---|
| 195 | static void test_convert_by_format_num(int from, int to) { |
|---|
| 196 | char *toFile = GBS_global_string_copy("impexp/conv.%s_2_%s", NAME(from), NAME(to)); |
|---|
| 197 | if (GB_is_regularfile(toFile)) TEST_EXPECT_ZERO_OR_SHOW_ERRNO(unlink(toFile)); |
|---|
| 198 | |
|---|
| 199 | int old_processed_counter = log_processed_counter; |
|---|
| 200 | int old_seq_counter = log_seq_counter; |
|---|
| 201 | |
|---|
| 202 | const char *error = test_convert(INPUT(from), toFile, TYPE(from), TYPE(to)); |
|---|
| 203 | |
|---|
| 204 | int converted_seqs = log_seq_counter-old_seq_counter; |
|---|
| 205 | int expected_seqs = EXSEQ(from); |
|---|
| 206 | if (to == NUM_GCG) expected_seqs = 1; // we stop after first file (useless to generate numerous files) |
|---|
| 207 | |
|---|
| 208 | Capabilities& me = cap[from][to]; |
|---|
| 209 | |
|---|
| 210 | if (me.supported) { |
|---|
| 211 | if (error) TEST_ERROR("convert() reports error: '%s' (for supported conversion)", error); |
|---|
| 212 | TEST_EXPECT(GB_is_regularfile(toFile)); |
|---|
| 213 | TEST_EXPECT_EQUAL(converted_seqs, expected_seqs); |
|---|
| 214 | TEST_EXPECT_EQUAL(log_processed_counter, old_processed_counter+1); |
|---|
| 215 | |
|---|
| 216 | TEST_EXPECT_LESS_EQUAL(10, GB_size_of_file(toFile)); // less than 10 bytes |
|---|
| 217 | test_expected_conversion(toFile, NULp); |
|---|
| 218 | TEST_EXPECT_ZERO_OR_SHOW_ERRNO(unlink(toFile)); |
|---|
| 219 | } |
|---|
| 220 | else { |
|---|
| 221 | if (!error) TEST_ERROR("No error for unsupported conversion '%s'", GBS_global_string("%s -> %s", NAME(from), NAME(to))); |
|---|
| 222 | TEST_REJECT_NULL(strstr(error, "supported")); // wrong error |
|---|
| 223 | TEST_REJECT(GB_is_regularfile(toFile)); // unsupported produced output |
|---|
| 224 | } |
|---|
| 225 | TEST_EXPECT_EQUAL(me.supported, !error); |
|---|
| 226 | |
|---|
| 227 | #if defined(TEST_THROW) |
|---|
| 228 | { |
|---|
| 229 | // test if conversion from empty and text file fails |
|---|
| 230 | |
|---|
| 231 | const char *fromFile = "general/empty.input"; |
|---|
| 232 | |
|---|
| 233 | error = test_convert(fromFile, toFile, TYPE(from), TYPE(to)); |
|---|
| 234 | TEST_REJECT_NULL(error); |
|---|
| 235 | |
|---|
| 236 | fromFile = "general/text.input"; |
|---|
| 237 | error = test_convert(fromFile, toFile, TYPE(from), TYPE(to)); |
|---|
| 238 | TEST_REJECT_NULL(error); |
|---|
| 239 | } |
|---|
| 240 | #endif |
|---|
| 241 | |
|---|
| 242 | free(toFile); |
|---|
| 243 | } |
|---|
| 244 | |
|---|
| 245 | inline bool isInputFormat(int num) { return is_input_format(TYPE(num)); } |
|---|
| 246 | |
|---|
| 247 | static void init_cap() { |
|---|
| 248 | for (int from = 0; from<fcount; from++) { |
|---|
| 249 | for (int to = 0; to<fcount; to++) { |
|---|
| 250 | Capabilities& me = cap[from][to]; |
|---|
| 251 | if (!isInputFormat(from)) me.supported = false; |
|---|
| 252 | } |
|---|
| 253 | } |
|---|
| 254 | } |
|---|
| 255 | |
|---|
// Declare the conversion t1 -> t2 as unsupported (t1 has to be an input format).
// Wrapped in do { } while (0), so that both statements stay together when the
// macro is used as the body of an unbraced 'if' or loop.
// Use it as a statement: NOT_SUPPORTED(GENBANK, SWISSPROT);
#define NOT_SUPPORTED(t1,t2) do {                       \
        TEST_EXPECT(isInputFormat(NUM_##t1));           \
        cap[NUM_##t1][NUM_##t2].supported = false;      \
    } while (0)
| 257 | |
|---|
| 258 | static int will_convert(int from) { |
|---|
| 259 | int will = 0; |
|---|
| 260 | for (int to = 0; to<fcount; to++) { |
|---|
| 261 | Capabilities& me = cap[from][to]; |
|---|
| 262 | if (me.supported && me.shall_be_tested()) { |
|---|
| 263 | will++; |
|---|
| 264 | } |
|---|
| 265 | } |
|---|
| 266 | return will; |
|---|
| 267 | } |
|---|
| 268 | |
|---|
| 269 | void TEST_SLOW_converter() { |
|---|
| 270 | STATIC_ASSERT(FORMATNUM_COUNT == fcount); |
|---|
| 271 | |
|---|
| 272 | init_cap(); |
|---|
| 273 | |
|---|
| 274 | NOT_SUPPORTED(GENBANK, SWISSPROT); |
|---|
| 275 | NOT_SUPPORTED(EMBL, SWISSPROT); |
|---|
| 276 | NOT_SUPPORTED(SWISSPROT, GENBANK); |
|---|
| 277 | NOT_SUPPORTED(SWISSPROT, EMBL); |
|---|
| 278 | |
|---|
| 279 | int possible = 0; |
|---|
| 280 | int tested = 0; |
|---|
| 281 | int unsupported = 0; |
|---|
| 282 | int neverReturns = 0; |
|---|
| 283 | |
|---|
| 284 | for (int from = 0; from<fcount; from++) { |
|---|
| 285 | TEST_ANNOTATE(GBS_global_string("while converting from '%s'", NAME(from))); |
|---|
| 286 | if (isInputFormat(from)) { |
|---|
| 287 | if (will_convert(from)<1) { |
|---|
| 288 | TEST_ERROR("Conversion from %s seems unsupported", NAME(from)); |
|---|
| 289 | } |
|---|
| 290 | } |
|---|
| 291 | for (int to = 0; to<fcount; to++) { |
|---|
| 292 | possible++; |
|---|
| 293 | Capabilities& me = cap[from][to]; |
|---|
| 294 | |
|---|
| 295 | if (me.shall_be_tested()) { |
|---|
| 296 | TEST_ANNOTATE(GBS_global_string("while converting %s -> %s", NAME(from), NAME(to))); |
|---|
| 297 | test_convert_by_format_num(from, to); |
|---|
| 298 | tested++; |
|---|
| 299 | } |
|---|
| 300 | |
|---|
| 301 | unsupported += !me.supported; |
|---|
| 302 | neverReturns += me.neverReturns; |
|---|
| 303 | } |
|---|
| 304 | } |
|---|
| 305 | TEST_ANNOTATE(NULp); |
|---|
| 306 | |
|---|
| 307 | fprintf(stderr, |
|---|
| 308 | "Conversion test summary:\n" |
|---|
| 309 | " - formats: %3i\n" |
|---|
| 310 | " - conversions: %3i (possible)\n" |
|---|
| 311 | " - unsupported: %3i\n" |
|---|
| 312 | " - tested: %3i\n" |
|---|
| 313 | " - neverReturns: %3i (would never return - not checked)\n" |
|---|
| 314 | " - converted: %3i\n", |
|---|
| 315 | fcount, |
|---|
| 316 | possible, |
|---|
| 317 | unsupported, |
|---|
| 318 | tested, |
|---|
| 319 | neverReturns, |
|---|
| 320 | tested-unsupported); |
|---|
| 321 | |
|---|
| 322 | int untested = possible - tested; |
|---|
| 323 | TEST_EXPECT_EQUAL(untested, neverReturns); |
|---|
| 324 | } |
|---|
| 325 | |
|---|
| 326 | #endif // UNIT_TESTS |
|---|