| 1 | // =============================================================== // |
|---|
| 2 | // // |
|---|
| 3 | // File : PT_io.cxx // |
|---|
| 4 | // Purpose : // |
|---|
| 5 | // // |
|---|
| 6 | // Institute of Microbiology (Technical University Munich) // |
|---|
| 7 | // http://www.arb-home.de/ // |
|---|
| 8 | // // |
|---|
| 9 | // =============================================================== // |
|---|
| 10 | |
|---|
| 11 | |
|---|
| 12 | #include "probe.h" |
|---|
| 13 | #include "pt_prototypes.h" |
|---|
| 14 | #include "PT_compress.h" |
|---|
| 15 | |
|---|
| 16 | #include <arbdbt.h> |
|---|
| 17 | #include <BI_basepos.hxx> |
|---|
| 18 | #include <arb_progress.h> |
|---|
| 19 | #include <arb_file.h> |
|---|
| 20 | |
|---|
| 21 | int compress_data(char *probestring) { |
|---|
| 22 | //! change a sequence with normal bases the PT_? format and delete all other signs |
|---|
| 23 | char c; |
|---|
| 24 | char *src, |
|---|
| 25 | *dest; |
|---|
| 26 | dest = src = probestring; |
|---|
| 27 | |
|---|
| 28 | while ((c=*(src++))) { |
|---|
| 29 | switch (c) { |
|---|
| 30 | case 'A': |
|---|
| 31 | case 'a': *(dest++) = PT_A; break; |
|---|
| 32 | case 'C': |
|---|
| 33 | case 'c': *(dest++) = PT_C; break; |
|---|
| 34 | case 'G': |
|---|
| 35 | case 'g': *(dest++) = PT_G; break; |
|---|
| 36 | case 'U': |
|---|
| 37 | case 'u': |
|---|
| 38 | case 'T': |
|---|
| 39 | case 't': *(dest++) = PT_T; break; |
|---|
| 40 | case 'N': |
|---|
| 41 | case 'n': *(dest++) = PT_N; break; |
|---|
| 42 | default: break; |
|---|
| 43 | } |
|---|
| 44 | |
|---|
| 45 | } |
|---|
| 46 | *dest = PT_QU; |
|---|
| 47 | return 0; |
|---|
| 48 | } |
|---|
| 49 | |
|---|
| 50 | ARB_ERROR probe_read_data_base(const char *name, bool readOnly) { // goes to header: __ATTR__USERESULT |
|---|
| 51 | ARB_ERROR error; |
|---|
| 52 | GB_set_verbose(); |
|---|
| 53 | |
|---|
| 54 | psg.gb_shell = new GB_shell; |
|---|
| 55 | |
|---|
| 56 | if (!readOnly && !GB_is_writeablefile(name)) { |
|---|
| 57 | error = GBS_global_string("Database '%s' is write-protected - aborting", name); |
|---|
| 58 | } |
|---|
| 59 | if (!error) { |
|---|
| 60 | GBDATA *gb_main = GB_open(name, readOnly ? "r" : "rw"); |
|---|
| 61 | if (!gb_main) error = GB_await_error(); |
|---|
| 62 | else { |
|---|
| 63 | error = GB_begin_transaction(gb_main); |
|---|
| 64 | if (!error) psg.gb_main = gb_main; |
|---|
| 65 | error = GB_end_transaction(gb_main, error); |
|---|
| 66 | } |
|---|
| 67 | } |
|---|
| 68 | return error; |
|---|
| 69 | } |
|---|
| 70 | |
|---|
| 71 | uchar PT_compressed::translate[256]; |
|---|
| 72 | bool PT_compressed::translation_initialized = false; |
|---|
| 73 | |
|---|
| 74 | #if defined(COUNT_COMPRESSES_BASES) |
|---|
| 75 | BaseCounter PT_compressed::base_counter; |
|---|
| 76 | #endif |
|---|
| 77 | |
|---|
| 78 | size_t probe_compress_sequence(char *seq, size_t seqsize) { |
|---|
| 79 | // translates a readable sequence into PT_base |
|---|
| 80 | // (see also: probe_2_readable) |
|---|
| 81 | |
|---|
| 82 | PT_compressed compressed(seqsize); |
|---|
| 83 | |
|---|
| 84 | compressed.createFrom(reinterpret_cast<unsigned char*>(seq), seqsize); |
|---|
| 85 | pt_assert(compressed.get_size() <= (seqsize+1)); |
|---|
| 86 | |
|---|
| 87 | memcpy(seq, compressed.get_seq(), compressed.get_size()); |
|---|
| 88 | return compressed.get_size(); |
|---|
| 89 | } |
|---|
| 90 | |
|---|
| 91 | char *readable_probe(const char *compressed_probe, size_t len, char T_or_U) { |
|---|
| 92 | static SmartMallocPtr(uchar) smart_tab; |
|---|
| 93 | uchar *tab = NULp; |
|---|
| 94 | |
|---|
| 95 | if (smart_tab.isNull()) { |
|---|
| 96 | ARB_alloc(tab, 256); |
|---|
| 97 | memset(tab, '?', 256); |
|---|
| 98 | |
|---|
| 99 | tab[PT_A] = 'A'; |
|---|
| 100 | tab[PT_C] = 'C'; |
|---|
| 101 | tab[PT_G] = 'G'; |
|---|
| 102 | tab[PT_QU] = '.'; |
|---|
| 103 | tab[PT_N] = 'N'; |
|---|
| 104 | |
|---|
| 105 | tab[PT_B_UNDEF] = '!'; |
|---|
| 106 | |
|---|
| 107 | smart_tab = tab; |
|---|
| 108 | } |
|---|
| 109 | |
|---|
| 110 | tab = &*smart_tab; |
|---|
| 111 | tab[PT_T] = T_or_U; |
|---|
| 112 | |
|---|
| 113 | char *result = ARB_alloc<char>(len+1); |
|---|
| 114 | for (size_t i = 0; i<len; ++i) { |
|---|
| 115 | result[i] = tab[safeCharIndex(compressed_probe[i])]; |
|---|
| 116 | } |
|---|
| 117 | result[len] = 0; |
|---|
| 118 | return result; |
|---|
| 119 | } |
|---|
| 120 | |
|---|
| 121 | inline GBDATA *expect_entry(GBDATA *gb_species, const char *entry_name) { |
|---|
| 122 | GBDATA *gb_entry = GB_entry(gb_species, entry_name); |
|---|
| 123 | if (!gb_entry) { |
|---|
| 124 | GB_export_errorf("Expected entry '%s' is missing for species '%s'", |
|---|
| 125 | entry_name, GBT_get_name_or_description(gb_species)); |
|---|
| 126 | } |
|---|
| 127 | return gb_entry; |
|---|
| 128 | } |
|---|
| 129 | |
|---|
| 130 | cache::Cache<SmartCharPtr> probe_input_data::seq_cache(1); // resized later |
|---|
| 131 | cache::Cache<probe_input_data::SmartIntPtr> probe_input_data::rel2abs_cache(1); // resized later |
|---|
| 132 | |
|---|
| 133 | GB_ERROR probe_input_data::init(GBDATA *gb_species_) { |
|---|
| 134 | GBDATA *gb_cs = expect_entry(gb_species_, "cs"); |
|---|
| 135 | GBDATA *gb_compr = expect_entry(gb_species_, "compr"); |
|---|
| 136 | GBDATA *gb_baseoff = expect_entry(gb_species_, "baseoff"); |
|---|
| 137 | |
|---|
| 138 | GB_ERROR error = NULp; |
|---|
| 139 | if (!gb_cs || !gb_compr || !gb_baseoff) error = GB_await_error(); |
|---|
| 140 | else { |
|---|
| 141 | gb_species = gb_species_; |
|---|
| 142 | size = GB_read_count(gb_compr); |
|---|
| 143 | } |
|---|
| 144 | |
|---|
| 145 | return error; |
|---|
| 146 | } |
|---|
| 147 | |
|---|
| 148 | inline GB_ERROR PT_prepare_species_sequence(GBDATA *gb_species, const char *alignment_name, bool& data_missing, PT_compressed& compressed) { |
|---|
| 149 | GB_ERROR error = NULp; |
|---|
| 150 | GBDATA *gb_ali = GB_entry(gb_species, alignment_name); |
|---|
| 151 | GBDATA *gb_data = gb_ali ? GB_entry(gb_ali, "data") : NULp; |
|---|
| 152 | |
|---|
| 153 | data_missing = false; |
|---|
| 154 | |
|---|
| 155 | if (!gb_data) { |
|---|
| 156 | data_missing = true; |
|---|
| 157 | } |
|---|
| 158 | else { |
|---|
| 159 | const char *seq = GB_read_char_pntr(gb_data); |
|---|
| 160 | if (!seq) { |
|---|
| 161 | error = GBS_global_string("Could not read data in '%s' for species '%s'\n(Reason: %s)", |
|---|
| 162 | alignment_name, GBT_get_name_or_description(gb_species), GB_await_error()); |
|---|
| 163 | } |
|---|
| 164 | else { |
|---|
| 165 | size_t seqlen = GB_read_string_count(gb_data); |
|---|
| 166 | if (seqlen>compressed.get_allowed_size()) { |
|---|
| 167 | error = GBS_global_string("Sequence too long in '%s' of '%s'\n(Hint: format alignment to fix this problem)", |
|---|
| 168 | alignment_name, GBT_get_name_or_description(gb_species)); |
|---|
| 169 | } |
|---|
| 170 | |
|---|
| 171 | if (!error) { |
|---|
| 172 | compressed.createFrom(seq, seqlen); |
|---|
| 173 | { |
|---|
| 174 | uint32_t checksum = GB_checksum(seq, seqlen, 1, ".-"); |
|---|
| 175 | GBDATA *gb_cs = GB_create(gb_species, "cs", GB_INT); |
|---|
| 176 | error = gb_cs |
|---|
| 177 | ? GB_write_int(gb_cs, int32_t(checksum)) |
|---|
| 178 | : GB_await_error(); |
|---|
| 179 | } |
|---|
| 180 | } |
|---|
| 181 | |
|---|
| 182 | if (!error) { |
|---|
| 183 | GBDATA *gb_compr = GB_create(gb_species, "compr", GB_BYTES); |
|---|
| 184 | error = gb_compr |
|---|
| 185 | ? GB_write_bytes(gb_compr, compressed.get_seq(), compressed.get_size()) |
|---|
| 186 | : GB_await_error(); |
|---|
| 187 | } |
|---|
| 188 | |
|---|
| 189 | if (!error) { |
|---|
| 190 | GBDATA *gb_baseoff = GB_create(gb_species, "baseoff", GB_INTS); |
|---|
| 191 | error = gb_baseoff |
|---|
| 192 | ? GB_write_ints(gb_baseoff, compressed.get_offsets(), compressed.get_size()) |
|---|
| 193 | : GB_await_error(); |
|---|
| 194 | } |
|---|
| 195 | |
|---|
| 196 | if (!error) error = GB_delete(gb_ali); // delete original seq data |
|---|
| 197 | } |
|---|
| 198 | } |
|---|
| 199 | |
|---|
| 200 | return error; |
|---|
| 201 | } |
|---|
| 202 | |
|---|
| 203 | GB_ERROR PT_prepare_data(GBDATA *gb_main) { |
|---|
| 204 | GB_ERROR error = GB_begin_transaction(gb_main); |
|---|
| 205 | GBDATA *gb_species_data = GBT_get_species_data(gb_main); |
|---|
| 206 | |
|---|
| 207 | if (!gb_species_data) { |
|---|
| 208 | error = GB_await_error(); |
|---|
| 209 | } |
|---|
| 210 | else { |
|---|
| 211 | long icount = GB_number_of_subentries(gb_species_data); |
|---|
| 212 | int data_missing = 0; |
|---|
| 213 | |
|---|
| 214 | char *ali_name = GBT_get_default_alignment(gb_main); |
|---|
| 215 | if (!ali_name) { |
|---|
| 216 | error = GB_await_error(); |
|---|
| 217 | } |
|---|
| 218 | else { |
|---|
| 219 | long ali_len = GBT_get_alignment_len(gb_main, ali_name); |
|---|
| 220 | pt_assert(ali_len>0); |
|---|
| 221 | |
|---|
| 222 | PT_compressed compressBuffer(ali_len); |
|---|
| 223 | |
|---|
| 224 | printf("Database contains %li species\n", icount); |
|---|
| 225 | { |
|---|
| 226 | { |
|---|
| 227 | arb_progress progress("Preparing sequence data", icount); |
|---|
| 228 | for (GBDATA *gb_species = GBT_first_species_rel_species_data(gb_species_data); |
|---|
| 229 | gb_species && !error; |
|---|
| 230 | ) |
|---|
| 231 | { |
|---|
| 232 | GBDATA *gb_next = GBT_next_species(gb_species); |
|---|
| 233 | bool no_data; |
|---|
| 234 | |
|---|
| 235 | error = PT_prepare_species_sequence(gb_species, ali_name, no_data, compressBuffer); |
|---|
| 236 | if (no_data) { |
|---|
| 237 | pt_assert(!error); |
|---|
| 238 | data_missing++; |
|---|
| 239 | error = GB_delete(gb_species); |
|---|
| 240 | } |
|---|
| 241 | progress.inc(); |
|---|
| 242 | gb_species = gb_next; |
|---|
| 243 | } |
|---|
| 244 | if (error) progress.done(); |
|---|
| 245 | } |
|---|
| 246 | |
|---|
| 247 | if (!error) { |
|---|
| 248 | char *master_data_name = GBS_global_string_copy("%s/@master_data", GB_SYSTEM_FOLDER); |
|---|
| 249 | GBDATA *gb_master_data = GB_search(gb_main, master_data_name, GB_FIND); |
|---|
| 250 | if (gb_master_data) error = GB_delete(gb_master_data); |
|---|
| 251 | free(master_data_name); |
|---|
| 252 | } |
|---|
| 253 | } |
|---|
| 254 | if (data_missing) { |
|---|
| 255 | printf("\n%i species were ignored because of missing data.\n", data_missing); |
|---|
| 256 | } |
|---|
| 257 | else { |
|---|
| 258 | printf("\nAll species contain data in alignment '%s'.\n", ali_name); |
|---|
| 259 | } |
|---|
| 260 | fflush_all(); |
|---|
| 261 | free(ali_name); |
|---|
| 262 | } |
|---|
| 263 | } |
|---|
| 264 | |
|---|
| 265 | error = GB_end_transaction(gb_main, error); |
|---|
| 266 | return error; |
|---|
| 267 | } |
|---|
| 268 | |
|---|
| 269 | GB_ERROR PT_init_input_data() { |
|---|
| 270 | // reads sequence data into psg.data |
|---|
| 271 | |
|---|
| 272 | GB_begin_transaction(psg.gb_main); |
|---|
| 273 | |
|---|
| 274 | // read ref SAI (e.g. ecoli) |
|---|
| 275 | { |
|---|
| 276 | char *def_ref = GBT_get_default_ref(psg.gb_main); |
|---|
| 277 | GBDATA *gb_sai_data = GBT_get_SAI_data(psg.gb_main); |
|---|
| 278 | GBDATA *gb_ref = GBT_find_SAI_rel_SAI_data(gb_sai_data, def_ref); |
|---|
| 279 | |
|---|
| 280 | psg.ecoli = NULp; |
|---|
| 281 | if (gb_ref) { |
|---|
| 282 | GBDATA *gb_data = GBT_find_sequence(gb_ref, psg.alignment_name); |
|---|
| 283 | if (gb_data) { |
|---|
| 284 | psg.ecoli = GB_read_string(gb_data); // @@@ NOT_ALL_SAI_HAVE_DATA |
|---|
| 285 | } |
|---|
| 286 | } |
|---|
| 287 | free(def_ref); |
|---|
| 288 | } |
|---|
| 289 | |
|---|
| 290 | GBDATA *gb_species_data = GBT_get_species_data(psg.gb_main); |
|---|
| 291 | long icount = GB_number_of_subentries(gb_species_data); |
|---|
| 292 | |
|---|
| 293 | psg.data = new probe_input_data[icount]; |
|---|
| 294 | psg.data_count = 0; |
|---|
| 295 | |
|---|
| 296 | printf("Database contains %li species\n", icount); |
|---|
| 297 | |
|---|
| 298 | GB_ERROR error = NULp; |
|---|
| 299 | { |
|---|
| 300 | arb_progress progress("Checking data", icount); |
|---|
| 301 | int count = 0; |
|---|
| 302 | |
|---|
| 303 | for (GBDATA *gb_species = GBT_first_species_rel_species_data(gb_species_data); |
|---|
| 304 | gb_species; |
|---|
| 305 | gb_species = GBT_next_species(gb_species)) |
|---|
| 306 | { |
|---|
| 307 | probe_input_data& pid = psg.data[count]; |
|---|
| 308 | |
|---|
| 309 | error = pid.init(gb_species); |
|---|
| 310 | if (error) break; |
|---|
| 311 | count++; |
|---|
| 312 | progress.inc(); |
|---|
| 313 | } |
|---|
| 314 | |
|---|
| 315 | psg.data_count = count; |
|---|
| 316 | GB_commit_transaction(psg.gb_main); |
|---|
| 317 | |
|---|
| 318 | if (error) progress.done(); |
|---|
| 319 | } |
|---|
| 320 | |
|---|
| 321 | fflush_all(); |
|---|
| 322 | return error; |
|---|
| 323 | } |
|---|
| 324 | |
|---|
| 325 | void PT_build_species_hash() { |
|---|
| 326 | long i; |
|---|
| 327 | psg.namehash = GBS_create_hash(psg.data_count, GB_MIND_CASE); |
|---|
| 328 | for (i=0; i<psg.data_count; i++) { |
|---|
| 329 | GBS_write_hash(psg.namehash, psg.data[i].get_shortname(), i+1); |
|---|
| 330 | } |
|---|
| 331 | unsigned int max_size; |
|---|
| 332 | max_size = 0; |
|---|
| 333 | for (i = 0; i < psg.data_count; i++) { // get max sequence len |
|---|
| 334 | max_size = std::max(max_size, (unsigned)(psg.data[i].get_size())); |
|---|
| 335 | psg.char_count += psg.data[i].get_size(); |
|---|
| 336 | } |
|---|
| 337 | psg.max_size = max_size; |
|---|
| 338 | |
|---|
| 339 | if (psg.ecoli) { |
|---|
| 340 | BI_ecoli_ref *ref = new BI_ecoli_ref; |
|---|
| 341 | ref->init(psg.ecoli, strlen(psg.ecoli)); |
|---|
| 342 | psg.bi_ecoli = ref; |
|---|
| 343 | } |
|---|
| 344 | } |
|---|
| 345 | |
|---|
| 346 | |
|---|
| 347 | long PT_abs_2_ecoli_rel(long pos) { |
|---|
| 348 | if (!psg.ecoli) return pos; |
|---|
| 349 | return psg.bi_ecoli->abs_2_rel(pos); |
|---|
| 350 | } |
|---|
| 351 | |
|---|