1 | // =============================================================== // |
---|
2 | // // |
---|
3 | // File : PT_io.cxx // |
---|
4 | // Purpose : // |
---|
5 | // // |
---|
6 | // Institute of Microbiology (Technical University Munich) // |
---|
7 | // http://www.arb-home.de/ // |
---|
8 | // // |
---|
9 | // =============================================================== // |
---|
10 | |
---|
11 | |
---|
12 | #include "probe.h" |
---|
13 | #include "pt_prototypes.h" |
---|
14 | #include "PT_compress.h" |
---|
15 | |
---|
16 | #include <arbdbt.h> |
---|
17 | #include <BI_basepos.hxx> |
---|
18 | #include <arb_progress.h> |
---|
19 | #include <arb_file.h> |
---|
20 | |
---|
21 | int compress_data(char *probestring) { |
---|
22 | //! change a sequence with normal bases the PT_? format and delete all other signs |
---|
23 | char c; |
---|
24 | char *src, |
---|
25 | *dest; |
---|
26 | dest = src = probestring; |
---|
27 | |
---|
28 | while ((c=*(src++))) { |
---|
29 | switch (c) { |
---|
30 | case 'A': |
---|
31 | case 'a': *(dest++) = PT_A; break; |
---|
32 | case 'C': |
---|
33 | case 'c': *(dest++) = PT_C; break; |
---|
34 | case 'G': |
---|
35 | case 'g': *(dest++) = PT_G; break; |
---|
36 | case 'U': |
---|
37 | case 'u': |
---|
38 | case 'T': |
---|
39 | case 't': *(dest++) = PT_T; break; |
---|
40 | case 'N': |
---|
41 | case 'n': *(dest++) = PT_N; break; |
---|
42 | default: break; |
---|
43 | } |
---|
44 | |
---|
45 | } |
---|
46 | *dest = PT_QU; |
---|
47 | return 0; |
---|
48 | } |
---|
49 | |
---|
50 | ARB_ERROR probe_read_data_base(const char *name, bool readOnly) { // goes to header: __ATTR__USERESULT |
---|
51 | ARB_ERROR error; |
---|
52 | GB_set_verbose(); |
---|
53 | |
---|
54 | psg.gb_shell = new GB_shell; |
---|
55 | |
---|
56 | if (!readOnly && !GB_is_writeablefile(name)) { |
---|
57 | error = GBS_global_string("Database '%s' is write-protected - aborting", name); |
---|
58 | } |
---|
59 | if (!error) { |
---|
60 | GBDATA *gb_main = GB_open(name, readOnly ? "r" : "rw"); |
---|
61 | if (!gb_main) error = GB_await_error(); |
---|
62 | else { |
---|
63 | error = GB_begin_transaction(gb_main); |
---|
64 | if (!error) psg.gb_main = gb_main; |
---|
65 | error = GB_end_transaction(gb_main, error); |
---|
66 | } |
---|
67 | } |
---|
68 | return error; |
---|
69 | } |
---|
70 | |
---|
71 | uchar PT_compressed::translate[256]; |
---|
72 | bool PT_compressed::translation_initialized = false; |
---|
73 | |
---|
74 | #if defined(COUNT_COMPRESSES_BASES) |
---|
75 | BaseCounter PT_compressed::base_counter; |
---|
76 | #endif |
---|
77 | |
---|
78 | size_t probe_compress_sequence(char *seq, size_t seqsize) { |
---|
79 | // translates a readable sequence into PT_base |
---|
80 | // (see also: probe_2_readable) |
---|
81 | |
---|
82 | PT_compressed compressed(seqsize); |
---|
83 | |
---|
84 | compressed.createFrom(reinterpret_cast<unsigned char*>(seq), seqsize); |
---|
85 | pt_assert(compressed.get_size() <= (seqsize+1)); |
---|
86 | |
---|
87 | memcpy(seq, compressed.get_seq(), compressed.get_size()); |
---|
88 | return compressed.get_size(); |
---|
89 | } |
---|
90 | |
---|
91 | char *readable_probe(const char *compressed_probe, size_t len, char T_or_U) { |
---|
92 | static SmartMallocPtr(uchar) smart_tab; |
---|
93 | uchar *tab = NULp; |
---|
94 | |
---|
95 | if (smart_tab.isNull()) { |
---|
96 | ARB_alloc(tab, 256); |
---|
97 | memset(tab, '?', 256); |
---|
98 | |
---|
99 | tab[PT_A] = 'A'; |
---|
100 | tab[PT_C] = 'C'; |
---|
101 | tab[PT_G] = 'G'; |
---|
102 | tab[PT_QU] = '.'; |
---|
103 | tab[PT_N] = 'N'; |
---|
104 | |
---|
105 | tab[PT_B_UNDEF] = '!'; |
---|
106 | |
---|
107 | smart_tab = tab; |
---|
108 | } |
---|
109 | |
---|
110 | tab = &*smart_tab; |
---|
111 | tab[PT_T] = T_or_U; |
---|
112 | |
---|
113 | char *result = ARB_alloc<char>(len+1); |
---|
114 | for (size_t i = 0; i<len; ++i) { |
---|
115 | result[i] = tab[safeCharIndex(compressed_probe[i])]; |
---|
116 | } |
---|
117 | result[len] = 0; |
---|
118 | return result; |
---|
119 | } |
---|
120 | |
---|
121 | inline GBDATA *expect_entry(GBDATA *gb_species, const char *entry_name) { |
---|
122 | GBDATA *gb_entry = GB_entry(gb_species, entry_name); |
---|
123 | if (!gb_entry) { |
---|
124 | GB_export_errorf("Expected entry '%s' is missing for species '%s'", |
---|
125 | entry_name, GBT_get_name_or_description(gb_species)); |
---|
126 | } |
---|
127 | return gb_entry; |
---|
128 | } |
---|
129 | |
---|
130 | cache::Cache<SmartCharPtr> probe_input_data::seq_cache(1); // resized later |
---|
131 | cache::Cache<probe_input_data::SmartIntPtr> probe_input_data::rel2abs_cache(1); // resized later |
---|
132 | |
---|
133 | GB_ERROR probe_input_data::init(GBDATA *gb_species_) { |
---|
134 | GBDATA *gb_cs = expect_entry(gb_species_, "cs"); |
---|
135 | GBDATA *gb_compr = expect_entry(gb_species_, "compr"); |
---|
136 | GBDATA *gb_baseoff = expect_entry(gb_species_, "baseoff"); |
---|
137 | |
---|
138 | GB_ERROR error = NULp; |
---|
139 | if (!gb_cs || !gb_compr || !gb_baseoff) error = GB_await_error(); |
---|
140 | else { |
---|
141 | gb_species = gb_species_; |
---|
142 | size = GB_read_count(gb_compr); |
---|
143 | } |
---|
144 | |
---|
145 | return error; |
---|
146 | } |
---|
147 | |
---|
148 | inline GB_ERROR PT_prepare_species_sequence(GBDATA *gb_species, const char *alignment_name, bool& data_missing, PT_compressed& compressed) { |
---|
149 | GB_ERROR error = NULp; |
---|
150 | GBDATA *gb_ali = GB_entry(gb_species, alignment_name); |
---|
151 | GBDATA *gb_data = gb_ali ? GB_entry(gb_ali, "data") : NULp; |
---|
152 | |
---|
153 | data_missing = false; |
---|
154 | |
---|
155 | if (!gb_data) { |
---|
156 | data_missing = true; |
---|
157 | } |
---|
158 | else { |
---|
159 | const char *seq = GB_read_char_pntr(gb_data); |
---|
160 | if (!seq) { |
---|
161 | error = GBS_global_string("Could not read data in '%s' for species '%s'\n(Reason: %s)", |
---|
162 | alignment_name, GBT_get_name_or_description(gb_species), GB_await_error()); |
---|
163 | } |
---|
164 | else { |
---|
165 | size_t seqlen = GB_read_string_count(gb_data); |
---|
166 | if (seqlen>compressed.get_allowed_size()) { |
---|
167 | error = GBS_global_string("Sequence too long in '%s' of '%s'\n(Hint: format alignment to fix this problem)", |
---|
168 | alignment_name, GBT_get_name_or_description(gb_species)); |
---|
169 | } |
---|
170 | |
---|
171 | if (!error) { |
---|
172 | compressed.createFrom(seq, seqlen); |
---|
173 | { |
---|
174 | uint32_t checksum = GB_checksum(seq, seqlen, 1, ".-"); |
---|
175 | GBDATA *gb_cs = GB_create(gb_species, "cs", GB_INT); |
---|
176 | error = gb_cs |
---|
177 | ? GB_write_int(gb_cs, int32_t(checksum)) |
---|
178 | : GB_await_error(); |
---|
179 | } |
---|
180 | } |
---|
181 | |
---|
182 | if (!error) { |
---|
183 | GBDATA *gb_compr = GB_create(gb_species, "compr", GB_BYTES); |
---|
184 | error = gb_compr |
---|
185 | ? GB_write_bytes(gb_compr, compressed.get_seq(), compressed.get_size()) |
---|
186 | : GB_await_error(); |
---|
187 | } |
---|
188 | |
---|
189 | if (!error) { |
---|
190 | GBDATA *gb_baseoff = GB_create(gb_species, "baseoff", GB_INTS); |
---|
191 | error = gb_baseoff |
---|
192 | ? GB_write_ints(gb_baseoff, compressed.get_offsets(), compressed.get_size()) |
---|
193 | : GB_await_error(); |
---|
194 | } |
---|
195 | |
---|
196 | if (!error) error = GB_delete(gb_ali); // delete original seq data |
---|
197 | } |
---|
198 | } |
---|
199 | |
---|
200 | return error; |
---|
201 | } |
---|
202 | |
---|
203 | GB_ERROR PT_prepare_data(GBDATA *gb_main) { |
---|
204 | GB_ERROR error = GB_begin_transaction(gb_main); |
---|
205 | GBDATA *gb_species_data = GBT_get_species_data(gb_main); |
---|
206 | |
---|
207 | if (!gb_species_data) { |
---|
208 | error = GB_await_error(); |
---|
209 | } |
---|
210 | else { |
---|
211 | long icount = GB_number_of_subentries(gb_species_data); |
---|
212 | int data_missing = 0; |
---|
213 | |
---|
214 | char *ali_name = GBT_get_default_alignment(gb_main); |
---|
215 | if (!ali_name) { |
---|
216 | error = GB_await_error(); |
---|
217 | } |
---|
218 | else { |
---|
219 | long ali_len = GBT_get_alignment_len(gb_main, ali_name); |
---|
220 | pt_assert(ali_len>0); |
---|
221 | |
---|
222 | PT_compressed compressBuffer(ali_len); |
---|
223 | |
---|
224 | printf("Database contains %li species\n", icount); |
---|
225 | { |
---|
226 | { |
---|
227 | arb_progress progress("Preparing sequence data", icount); |
---|
228 | for (GBDATA *gb_species = GBT_first_species_rel_species_data(gb_species_data); |
---|
229 | gb_species && !error; |
---|
230 | ) |
---|
231 | { |
---|
232 | GBDATA *gb_next = GBT_next_species(gb_species); |
---|
233 | bool no_data; |
---|
234 | |
---|
235 | error = PT_prepare_species_sequence(gb_species, ali_name, no_data, compressBuffer); |
---|
236 | if (no_data) { |
---|
237 | pt_assert(!error); |
---|
238 | data_missing++; |
---|
239 | error = GB_delete(gb_species); |
---|
240 | } |
---|
241 | progress.inc(); |
---|
242 | gb_species = gb_next; |
---|
243 | } |
---|
244 | if (error) progress.done(); |
---|
245 | } |
---|
246 | |
---|
247 | if (!error) { |
---|
248 | char *master_data_name = GBS_global_string_copy("%s/@master_data", GB_SYSTEM_FOLDER); |
---|
249 | GBDATA *gb_master_data = GB_search(gb_main, master_data_name, GB_FIND); |
---|
250 | if (gb_master_data) error = GB_delete(gb_master_data); |
---|
251 | free(master_data_name); |
---|
252 | } |
---|
253 | } |
---|
254 | if (data_missing) { |
---|
255 | printf("\n%i species were ignored because of missing data.\n", data_missing); |
---|
256 | } |
---|
257 | else { |
---|
258 | printf("\nAll species contain data in alignment '%s'.\n", ali_name); |
---|
259 | } |
---|
260 | fflush_all(); |
---|
261 | free(ali_name); |
---|
262 | } |
---|
263 | } |
---|
264 | |
---|
265 | error = GB_end_transaction(gb_main, error); |
---|
266 | return error; |
---|
267 | } |
---|
268 | |
---|
269 | GB_ERROR PT_init_input_data() { |
---|
270 | // reads sequence data into psg.data |
---|
271 | |
---|
272 | GB_begin_transaction(psg.gb_main); |
---|
273 | |
---|
274 | // read ref SAI (e.g. ecoli) |
---|
275 | { |
---|
276 | char *def_ref = GBT_get_default_ref(psg.gb_main); |
---|
277 | GBDATA *gb_sai_data = GBT_get_SAI_data(psg.gb_main); |
---|
278 | GBDATA *gb_ref = GBT_find_SAI_rel_SAI_data(gb_sai_data, def_ref); |
---|
279 | |
---|
280 | psg.ecoli = NULp; |
---|
281 | if (gb_ref) { |
---|
282 | GBDATA *gb_data = GBT_find_sequence(gb_ref, psg.alignment_name); |
---|
283 | if (gb_data) { |
---|
284 | psg.ecoli = GB_read_string(gb_data); // @@@ NOT_ALL_SAI_HAVE_DATA |
---|
285 | } |
---|
286 | } |
---|
287 | free(def_ref); |
---|
288 | } |
---|
289 | |
---|
290 | GBDATA *gb_species_data = GBT_get_species_data(psg.gb_main); |
---|
291 | long icount = GB_number_of_subentries(gb_species_data); |
---|
292 | |
---|
293 | psg.data = new probe_input_data[icount]; |
---|
294 | psg.data_count = 0; |
---|
295 | |
---|
296 | printf("Database contains %li species\n", icount); |
---|
297 | |
---|
298 | GB_ERROR error = NULp; |
---|
299 | { |
---|
300 | arb_progress progress("Checking data", icount); |
---|
301 | int count = 0; |
---|
302 | |
---|
303 | for (GBDATA *gb_species = GBT_first_species_rel_species_data(gb_species_data); |
---|
304 | gb_species; |
---|
305 | gb_species = GBT_next_species(gb_species)) |
---|
306 | { |
---|
307 | probe_input_data& pid = psg.data[count]; |
---|
308 | |
---|
309 | error = pid.init(gb_species); |
---|
310 | if (error) break; |
---|
311 | count++; |
---|
312 | progress.inc(); |
---|
313 | } |
---|
314 | |
---|
315 | psg.data_count = count; |
---|
316 | GB_commit_transaction(psg.gb_main); |
---|
317 | |
---|
318 | if (error) progress.done(); |
---|
319 | } |
---|
320 | |
---|
321 | fflush_all(); |
---|
322 | return error; |
---|
323 | } |
---|
324 | |
---|
325 | void PT_build_species_hash() { |
---|
326 | long i; |
---|
327 | psg.namehash = GBS_create_hash(psg.data_count, GB_MIND_CASE); |
---|
328 | for (i=0; i<psg.data_count; i++) { |
---|
329 | GBS_write_hash(psg.namehash, psg.data[i].get_shortname(), i+1); |
---|
330 | } |
---|
331 | unsigned int max_size; |
---|
332 | max_size = 0; |
---|
333 | for (i = 0; i < psg.data_count; i++) { // get max sequence len |
---|
334 | max_size = std::max(max_size, (unsigned)(psg.data[i].get_size())); |
---|
335 | psg.char_count += psg.data[i].get_size(); |
---|
336 | } |
---|
337 | psg.max_size = max_size; |
---|
338 | |
---|
339 | if (psg.ecoli) { |
---|
340 | BI_ecoli_ref *ref = new BI_ecoli_ref; |
---|
341 | ref->init(psg.ecoli, strlen(psg.ecoli)); |
---|
342 | psg.bi_ecoli = ref; |
---|
343 | } |
---|
344 | } |
---|
345 | |
---|
346 | |
---|
347 | long PT_abs_2_ecoli_rel(long pos) { |
---|
348 | if (!psg.ecoli) return pos; |
---|
349 | return psg.bi_ecoli->abs_2_rel(pos); |
---|
350 | } |
---|
351 | |
---|