1 | #include <stdio.h> |
---|
2 | #include <stdlib.h> |
---|
3 | // #include <malloc.h> |
---|
4 | #include <memory.h> |
---|
5 | #include <string.h> |
---|
6 | #include <stdint.h> |
---|
7 | |
---|
8 | #include <PT_server.h> |
---|
9 | #include "probe.h" |
---|
10 | #include <arbdbt.h> |
---|
11 | #include <BI_helix.hxx> |
---|
12 | |
---|
13 | #include <inline.h> |
---|
14 | |
---|
15 | extern "C" { char *gbs_malloc_copy(char *,long); } |
---|
16 | |
---|
17 | |
---|
18 | /* change a sequence with normal bases the PT_? format and delete all other signs */ |
---|
19 | int compress_data(char *probestring) |
---|
20 | { |
---|
21 | char c; |
---|
22 | char *src, |
---|
23 | *dest; |
---|
24 | dest = src = probestring; |
---|
25 | |
---|
26 | while((c=*(src++))) |
---|
27 | { |
---|
28 | switch (c) { |
---|
29 | case 'A': |
---|
30 | case 'a': *(dest++) = PT_A;break; |
---|
31 | case 'C': |
---|
32 | case 'c': *(dest++) = PT_C;break; |
---|
33 | case 'G': |
---|
34 | case 'g': *(dest++) = PT_G;break; |
---|
35 | case 'U': |
---|
36 | case 'u': |
---|
37 | case 'T': |
---|
38 | case 't': *(dest++) = PT_T;break; |
---|
39 | case 'N': |
---|
40 | case 'n': *(dest++) = PT_N;break; |
---|
41 | default: break; |
---|
42 | } |
---|
43 | |
---|
44 | } |
---|
45 | *dest = PT_QU; |
---|
46 | return 0; |
---|
47 | } |
---|
48 | |
---|
49 | /* get a string with readable bases from a string with PT_? */ |
---|
50 | void PT_base_2_string(char *id_string, long len) |
---|
51 | { |
---|
52 | char c; |
---|
53 | char *src, |
---|
54 | *dest; |
---|
55 | if (!len) len = strlen(id_string); |
---|
56 | dest = src = id_string; |
---|
57 | |
---|
58 | while((len--)>0){ |
---|
59 | c=*(src++); |
---|
60 | switch (c) { |
---|
61 | case PT_A: *(dest++) = 'A';break; |
---|
62 | case PT_C: *(dest++) = 'C';break; |
---|
63 | case PT_G: *(dest++) = 'G';break; |
---|
64 | case PT_T: *(dest++) = 'U';break; |
---|
65 | case PT_N: *(dest++) = 'N';break; |
---|
66 | case 0: *(dest++) = '0'; break; |
---|
67 | default: *(dest++) = c;break; |
---|
68 | } |
---|
69 | |
---|
70 | } |
---|
71 | *dest = '\0'; |
---|
72 | } |
---|
73 | |
---|
74 | void probe_read_data_base(char *name) |
---|
75 | { |
---|
76 | GBDATA *gb_main; |
---|
77 | GBDATA *gb_species_data; |
---|
78 | |
---|
79 | GB_set_verbose(); |
---|
80 | #if defined(DEVEL_RALF) |
---|
81 | #warning gb_main should be closed |
---|
82 | #endif // DEVEL_RALF |
---|
83 | gb_main = GB_open(name,"r"); |
---|
84 | if (!gb_main) { |
---|
85 | printf("Error reading file %s\n",name); |
---|
86 | exit(EXIT_FAILURE); |
---|
87 | } |
---|
88 | GB_begin_transaction(gb_main); |
---|
89 | gb_species_data = GB_entry(gb_main,"species_data"); |
---|
90 | if (!gb_species_data){ |
---|
91 | printf("Database %s is empty\n",name); |
---|
92 | exit(EXIT_FAILURE); |
---|
93 | } |
---|
94 | psg.gb_main = gb_main; |
---|
95 | psg.gb_species_data = gb_species_data; |
---|
96 | psg.gb_sai_data = GBT_get_SAI_data(gb_main); |
---|
97 | |
---|
98 | GB_commit_transaction(gb_main); |
---|
99 | } |
---|
100 | |
---|
101 | inline size_t count_uint_32(uint32_t *seq, size_t seqsize, uint32_t cmp) { |
---|
102 | size_t count = 0; |
---|
103 | while (count<seqsize && seq[count] == cmp) count++; |
---|
104 | return count*4; |
---|
105 | } |
---|
106 | |
---|
107 | inline size_t count_char(const char *seq, size_t seqsize, char c, uint32_t c4) { |
---|
108 | if (seq[0] == c) { |
---|
109 | size_t count = 1+count_uint_32((uint32_t*)(seq+1), (seqsize-1)/4, c4); |
---|
110 | for (; count<seqsize && seq[count] == c; ++count) ; |
---|
111 | return count; |
---|
112 | } |
---|
113 | return 0; |
---|
114 | } |
---|
115 | |
---|
116 | inline size_t count_dots(const char *seq, int seqsize) { return count_char(seq, seqsize, '.', 0x2E2E2E2E); } |
---|
117 | inline size_t count_gaps(const char *seq, int seqsize) { return count_char(seq, seqsize, '-', 0x2D2D2D2D); } |
---|
118 | |
---|
119 | inline size_t count_gaps_and_dots(const char *seq, int seqsize) { |
---|
120 | size_t count = 0; |
---|
121 | size_t count2; |
---|
122 | size_t count3; |
---|
123 | |
---|
124 | do { |
---|
125 | count2 = count_dots(seq+count, seqsize-count); |
---|
126 | count += count2; |
---|
127 | count3 = count_gaps(seq+count, seqsize-count); |
---|
128 | count += count3; |
---|
129 | } |
---|
130 | while (count2 || count3); |
---|
131 | return count; |
---|
132 | } |
---|
133 | |
---|
134 | int probe_compress_sequence(char *seq, int seqsize) |
---|
135 | { |
---|
136 | static uchar *tab = 0; |
---|
137 | if (!tab) { |
---|
138 | tab = (uchar *)malloc(256); |
---|
139 | memset(tab,PT_N,256); |
---|
140 | |
---|
141 | tab['A'] = tab['a'] = PT_A; |
---|
142 | tab['C'] = tab['c'] = PT_C; |
---|
143 | tab['G'] = tab['g'] = PT_G; |
---|
144 | tab['T'] = tab['t'] = PT_T; |
---|
145 | tab['U'] = tab['u'] = PT_T; |
---|
146 | tab['.'] = PT_QU; |
---|
147 | tab[0] = PT_B_UNDEF; |
---|
148 | } |
---|
149 | |
---|
150 | char *dest = seq; |
---|
151 | size_t offset = 0; |
---|
152 | |
---|
153 | while (seq[offset]) { |
---|
154 | offset += count_gaps(seq+offset, seqsize-offset); // skip over gaps |
---|
155 | |
---|
156 | uchar c = tab[safeCharIndex(seq[offset++])]; |
---|
157 | if (c == PT_B_UNDEF) break; // already seen terminal zerobyte |
---|
158 | |
---|
159 | *dest++ = c; |
---|
160 | if (c == PT_QU) { // TODO: *seq = '.' ??? |
---|
161 | offset += count_gaps_and_dots(seq+offset, seqsize-offset); // skip over gaps and dots |
---|
162 | // dest[-1] = PT_N; // @@@ uncomment this to handle '.' like 'N' (experimental!!!) |
---|
163 | } |
---|
164 | } |
---|
165 | |
---|
166 | if (dest[-1] != PT_QU) { |
---|
167 | *dest++ = PT_QU; |
---|
168 | } |
---|
169 | |
---|
170 | #ifdef ARB_64 |
---|
171 | arb_assert(!((dest - seq) & 0xffffffff00000000)); // must fit into 32 bit |
---|
172 | #endif |
---|
173 | |
---|
174 | return dest-seq; |
---|
175 | } |
---|
176 | |
---|
177 | static char *probe_read_string_append_point(GBDATA *gb_data, int *psize) { |
---|
178 | long len = GB_read_string_count(gb_data); |
---|
179 | char *data = GB_read_string(gb_data); |
---|
180 | |
---|
181 | if (data) { |
---|
182 | if (data[len - 1] != '.') { |
---|
183 | char *buffer = (char *) malloc(len + 2); |
---|
184 | strcpy(buffer, data); |
---|
185 | buffer[len++] = '.'; |
---|
186 | buffer[len] = 0; |
---|
187 | freeset(data, buffer); |
---|
188 | } |
---|
189 | *psize = len; |
---|
190 | } |
---|
191 | else { |
---|
192 | *psize = 0; |
---|
193 | } |
---|
194 | return data; |
---|
195 | } |
---|
196 | |
---|
197 | char *probe_read_alignment(int j, int *psize) { |
---|
198 | char *buffer = 0; |
---|
199 | GBDATA *gb_ali = GB_entry(psg.data[j].gbd, psg.alignment_name); |
---|
200 | |
---|
201 | if (gb_ali) { |
---|
202 | GBDATA *gb_data = GB_entry(gb_ali, "data"); |
---|
203 | if (gb_data) buffer = probe_read_string_append_point(gb_data, psize); |
---|
204 | } |
---|
205 | return buffer; |
---|
206 | } |
---|
207 | |
---|
208 | void probe_read_alignments() { |
---|
209 | // reads sequence data into psg.data |
---|
210 | |
---|
211 | GB_begin_transaction(psg.gb_main); |
---|
212 | |
---|
213 | // read ref SAI (e.g. ecoli) |
---|
214 | { |
---|
215 | char *def_ref = GBT_get_default_ref(psg.gb_main); |
---|
216 | GBDATA *gb_ref = GBT_find_SAI_rel_SAI_data(psg.gb_sai_data, def_ref); |
---|
217 | |
---|
218 | psg.ecoli = 0; |
---|
219 | if (gb_ref) { |
---|
220 | GBDATA *gb_data = GBT_read_sequence(gb_ref, psg.alignment_name); |
---|
221 | if (gb_data) { |
---|
222 | psg.ecoli = GB_read_string(gb_data); |
---|
223 | } |
---|
224 | } |
---|
225 | free(def_ref); |
---|
226 | } |
---|
227 | |
---|
228 | int count = GB_number_of_subentries(psg.gb_species_data); |
---|
229 | |
---|
230 | psg.data = (probe_input_data *)calloc(sizeof(probe_input_data), count); |
---|
231 | psg.data_count = 0; |
---|
232 | |
---|
233 | printf("Database contains %i species.\nPreparing sequence data:\n", count); |
---|
234 | |
---|
235 | int data_missing = 0; |
---|
236 | int species_count = count; |
---|
237 | |
---|
238 | count = 0; |
---|
239 | |
---|
240 | for (GBDATA *gb_species = GBT_first_species_rel_species_data(psg.gb_species_data); |
---|
241 | gb_species; |
---|
242 | gb_species = GBT_next_species(gb_species) ) |
---|
243 | { |
---|
244 | probe_input_data& pid = psg.data[count]; |
---|
245 | |
---|
246 | pid.name = strdup(GBT_read_name(gb_species)); |
---|
247 | pid.fullname = GBT_read_string(gb_species, "full_name"); |
---|
248 | |
---|
249 | if (!pid.fullname) pid.fullname = strdup(""); |
---|
250 | |
---|
251 | pid.is_group = 1; |
---|
252 | pid.gbd = gb_species; |
---|
253 | |
---|
254 | GBDATA *gb_ali = GB_entry(gb_species, psg.alignment_name); |
---|
255 | if (gb_ali) { |
---|
256 | GBDATA *gb_data = GB_entry(gb_ali,"data"); |
---|
257 | if (!gb_data) { |
---|
258 | fprintf(stderr,"Species '%s' has no data in '%s'\n", pid.name, psg.alignment_name); |
---|
259 | data_missing++; |
---|
260 | } |
---|
261 | else { |
---|
262 | int hsize; |
---|
263 | char *data = probe_read_string_append_point(gb_data, &hsize); |
---|
264 | |
---|
265 | if (!data) { |
---|
266 | GB_ERROR error = GB_await_error(); |
---|
267 | fprintf(stderr, "Could not read data in '%s' for species '%s'\n(Reason: %s)\n", |
---|
268 | psg.alignment_name, pid.name, error); |
---|
269 | data_missing++; |
---|
270 | } |
---|
271 | else { |
---|
272 | pid.checksum = GB_checksum(data, hsize, 1, ".-"); |
---|
273 | int size = probe_compress_sequence(data, hsize); |
---|
274 | |
---|
275 | pid.data = (char *)gbs_malloc_copy(data, size); |
---|
276 | pid.size = size; |
---|
277 | |
---|
278 | count ++; |
---|
279 | if (count%10 == 0) { |
---|
280 | if (count%500) fputc('.', stdout); |
---|
281 | else printf(".%6i (%5.1f%%)\n", count, ((double)count/species_count)*100); |
---|
282 | fflush(stdout); |
---|
283 | } |
---|
284 | } |
---|
285 | free(data); |
---|
286 | } |
---|
287 | } |
---|
288 | } |
---|
289 | |
---|
290 | psg.data_count = count; |
---|
291 | GB_commit_transaction(psg.gb_main); |
---|
292 | |
---|
293 | if (data_missing) { |
---|
294 | printf("\n%i species were ignored because of missing data.\n", data_missing); |
---|
295 | } |
---|
296 | else { |
---|
297 | printf("\nAll species contain data in alignment '%s'.\n", psg.alignment_name); |
---|
298 | } |
---|
299 | fflush(stdout); |
---|
300 | } |
---|
301 | |
---|
302 | void PT_build_species_hash(void){ |
---|
303 | long i; |
---|
304 | psg.namehash = GBS_create_hash(PT_NAME_HASH_SIZE, GB_MIND_CASE); |
---|
305 | for (i=0;i<psg.data_count;i++){ |
---|
306 | GBS_write_hash(psg.namehash, psg.data[i].name,i+1); |
---|
307 | } |
---|
308 | unsigned int max_size; |
---|
309 | max_size = 0; |
---|
310 | for (i = 0; i < psg.data_count; i++){ /* get max sequence len */ |
---|
311 | max_size = max(max_size, (unsigned)(psg.data[i].size)); |
---|
312 | psg.char_count += psg.data[i].size; |
---|
313 | } |
---|
314 | psg.max_size = max_size; |
---|
315 | |
---|
316 | if ( psg.ecoli){ |
---|
317 | BI_ecoli_ref *ref = new BI_ecoli_ref; |
---|
318 | const char *error = ref->init(psg.ecoli,strlen(psg.ecoli)); |
---|
319 | if (error) { |
---|
320 | delete psg.ecoli; |
---|
321 | psg.ecoli = 0; |
---|
322 | }else{ |
---|
323 | psg.bi_ecoli = ref; |
---|
324 | } |
---|
325 | } |
---|
326 | } |
---|
327 | |
---|
328 | |
---|
329 | long PT_abs_2_rel(long pos) { |
---|
330 | if (!psg.ecoli) return pos; |
---|
331 | return psg.bi_ecoli->abs_2_rel(pos); |
---|
332 | } |
---|
333 | |
---|
334 | long PT_rel_2_abs(long pos) { |
---|
335 | if (!psg.ecoli) return pos; |
---|
336 | return psg.bi_ecoli->rel_2_abs(pos); |
---|
337 | } |
---|