| 1 | #include <stdio.h> |
|---|
| 2 | #include <stdlib.h> |
|---|
| 3 | // #include <malloc.h> |
|---|
| 4 | #include <memory.h> |
|---|
| 5 | #include <string.h> |
|---|
| 6 | #include <stdint.h> |
|---|
| 7 | |
|---|
| 8 | #include <PT_server.h> |
|---|
| 9 | #include "probe.h" |
|---|
| 10 | #include <arbdbt.h> |
|---|
| 11 | #include <BI_helix.hxx> |
|---|
| 12 | |
|---|
| 13 | #include <inline.h> |
|---|
| 14 | |
|---|
| 15 | extern "C" { char *gbs_malloc_copy(char *,long); } |
|---|
| 16 | |
|---|
| 17 | |
|---|
| 18 | /* change a sequence with normal bases the PT_? format and delete all other signs */ |
|---|
| 19 | int compress_data(char *probestring) |
|---|
| 20 | { |
|---|
| 21 | char c; |
|---|
| 22 | char *src, |
|---|
| 23 | *dest; |
|---|
| 24 | dest = src = probestring; |
|---|
| 25 | |
|---|
| 26 | while((c=*(src++))) |
|---|
| 27 | { |
|---|
| 28 | switch (c) { |
|---|
| 29 | case 'A': |
|---|
| 30 | case 'a': *(dest++) = PT_A;break; |
|---|
| 31 | case 'C': |
|---|
| 32 | case 'c': *(dest++) = PT_C;break; |
|---|
| 33 | case 'G': |
|---|
| 34 | case 'g': *(dest++) = PT_G;break; |
|---|
| 35 | case 'U': |
|---|
| 36 | case 'u': |
|---|
| 37 | case 'T': |
|---|
| 38 | case 't': *(dest++) = PT_T;break; |
|---|
| 39 | case 'N': |
|---|
| 40 | case 'n': *(dest++) = PT_N;break; |
|---|
| 41 | default: break; |
|---|
| 42 | } |
|---|
| 43 | |
|---|
| 44 | } |
|---|
| 45 | *dest = PT_QU; |
|---|
| 46 | return 0; |
|---|
| 47 | } |
|---|
| 48 | |
|---|
| 49 | /* get a string with readable bases from a string with PT_? */ |
|---|
| 50 | void PT_base_2_string(char *id_string, long len) |
|---|
| 51 | { |
|---|
| 52 | char c; |
|---|
| 53 | char *src, |
|---|
| 54 | *dest; |
|---|
| 55 | if (!len) len = strlen(id_string); |
|---|
| 56 | dest = src = id_string; |
|---|
| 57 | |
|---|
| 58 | while((len--)>0){ |
|---|
| 59 | c=*(src++); |
|---|
| 60 | switch (c) { |
|---|
| 61 | case PT_A: *(dest++) = 'A';break; |
|---|
| 62 | case PT_C: *(dest++) = 'C';break; |
|---|
| 63 | case PT_G: *(dest++) = 'G';break; |
|---|
| 64 | case PT_T: *(dest++) = 'U';break; |
|---|
| 65 | case PT_N: *(dest++) = 'N';break; |
|---|
| 66 | case 0: *(dest++) = '0'; break; |
|---|
| 67 | default: *(dest++) = c;break; |
|---|
| 68 | } |
|---|
| 69 | |
|---|
| 70 | } |
|---|
| 71 | *dest = '\0'; |
|---|
| 72 | } |
|---|
| 73 | |
|---|
| 74 | void probe_read_data_base(char *name) |
|---|
| 75 | { |
|---|
| 76 | GBDATA *gb_main; |
|---|
| 77 | GBDATA *gb_species_data; |
|---|
| 78 | |
|---|
| 79 | GB_set_verbose(); |
|---|
| 80 | #if defined(DEVEL_RALF) |
|---|
| 81 | #warning gb_main should be closed |
|---|
| 82 | #endif // DEVEL_RALF |
|---|
| 83 | gb_main = GB_open(name,"r"); |
|---|
| 84 | if (!gb_main) { |
|---|
| 85 | printf("Error reading file %s\n",name); |
|---|
| 86 | exit(EXIT_FAILURE); |
|---|
| 87 | } |
|---|
| 88 | GB_begin_transaction(gb_main); |
|---|
| 89 | gb_species_data = GB_entry(gb_main,"species_data"); |
|---|
| 90 | if (!gb_species_data){ |
|---|
| 91 | printf("Database %s is empty\n",name); |
|---|
| 92 | exit(EXIT_FAILURE); |
|---|
| 93 | } |
|---|
| 94 | psg.gb_main = gb_main; |
|---|
| 95 | psg.gb_species_data = gb_species_data; |
|---|
| 96 | psg.gb_sai_data = GBT_get_SAI_data(gb_main); |
|---|
| 97 | |
|---|
| 98 | GB_commit_transaction(gb_main); |
|---|
| 99 | } |
|---|
| 100 | |
|---|
| 101 | inline size_t count_uint_32(uint32_t *seq, size_t seqsize, uint32_t cmp) { |
|---|
| 102 | size_t count = 0; |
|---|
| 103 | while (count<seqsize && seq[count] == cmp) count++; |
|---|
| 104 | return count*4; |
|---|
| 105 | } |
|---|
| 106 | |
|---|
| 107 | inline size_t count_char(const char *seq, size_t seqsize, char c, uint32_t c4) { |
|---|
| 108 | if (seq[0] == c) { |
|---|
| 109 | size_t count = 1+count_uint_32((uint32_t*)(seq+1), (seqsize-1)/4, c4); |
|---|
| 110 | for (; count<seqsize && seq[count] == c; ++count) ; |
|---|
| 111 | return count; |
|---|
| 112 | } |
|---|
| 113 | return 0; |
|---|
| 114 | } |
|---|
| 115 | |
|---|
| 116 | inline size_t count_dots(const char *seq, int seqsize) { return count_char(seq, seqsize, '.', 0x2E2E2E2E); } |
|---|
| 117 | inline size_t count_gaps(const char *seq, int seqsize) { return count_char(seq, seqsize, '-', 0x2D2D2D2D); } |
|---|
| 118 | |
|---|
| 119 | inline size_t count_gaps_and_dots(const char *seq, int seqsize) { |
|---|
| 120 | size_t count = 0; |
|---|
| 121 | size_t count2; |
|---|
| 122 | size_t count3; |
|---|
| 123 | |
|---|
| 124 | do { |
|---|
| 125 | count2 = count_dots(seq+count, seqsize-count); |
|---|
| 126 | count += count2; |
|---|
| 127 | count3 = count_gaps(seq+count, seqsize-count); |
|---|
| 128 | count += count3; |
|---|
| 129 | } |
|---|
| 130 | while (count2 || count3); |
|---|
| 131 | return count; |
|---|
| 132 | } |
|---|
| 133 | |
|---|
| 134 | int probe_compress_sequence(char *seq, int seqsize) |
|---|
| 135 | { |
|---|
| 136 | static uchar *tab = 0; |
|---|
| 137 | if (!tab) { |
|---|
| 138 | tab = (uchar *)malloc(256); |
|---|
| 139 | memset(tab,PT_N,256); |
|---|
| 140 | |
|---|
| 141 | tab['A'] = tab['a'] = PT_A; |
|---|
| 142 | tab['C'] = tab['c'] = PT_C; |
|---|
| 143 | tab['G'] = tab['g'] = PT_G; |
|---|
| 144 | tab['T'] = tab['t'] = PT_T; |
|---|
| 145 | tab['U'] = tab['u'] = PT_T; |
|---|
| 146 | tab['.'] = PT_QU; |
|---|
| 147 | tab[0] = PT_B_UNDEF; |
|---|
| 148 | } |
|---|
| 149 | |
|---|
| 150 | char *dest = seq; |
|---|
| 151 | size_t offset = 0; |
|---|
| 152 | |
|---|
| 153 | while (seq[offset]) { |
|---|
| 154 | offset += count_gaps(seq+offset, seqsize-offset); // skip over gaps |
|---|
| 155 | |
|---|
| 156 | uchar c = tab[safeCharIndex(seq[offset++])]; |
|---|
| 157 | if (c == PT_B_UNDEF) break; // already seen terminal zerobyte |
|---|
| 158 | |
|---|
| 159 | *dest++ = c; |
|---|
| 160 | if (c == PT_QU) { // TODO: *seq = '.' ??? |
|---|
| 161 | offset += count_gaps_and_dots(seq+offset, seqsize-offset); // skip over gaps and dots |
|---|
| 162 | // dest[-1] = PT_N; // @@@ uncomment this to handle '.' like 'N' (experimental!!!) |
|---|
| 163 | } |
|---|
| 164 | } |
|---|
| 165 | |
|---|
| 166 | if (dest[-1] != PT_QU) { |
|---|
| 167 | *dest++ = PT_QU; |
|---|
| 168 | } |
|---|
| 169 | |
|---|
| 170 | #ifdef ARB_64 |
|---|
| 171 | arb_assert(!((dest - seq) & 0xffffffff00000000)); // must fit into 32 bit |
|---|
| 172 | #endif |
|---|
| 173 | |
|---|
| 174 | return dest-seq; |
|---|
| 175 | } |
|---|
| 176 | |
|---|
| 177 | static char *probe_read_string_append_point(GBDATA *gb_data, int *psize) { |
|---|
| 178 | long len = GB_read_string_count(gb_data); |
|---|
| 179 | char *data = GB_read_string(gb_data); |
|---|
| 180 | |
|---|
| 181 | if (data) { |
|---|
| 182 | if (data[len - 1] != '.') { |
|---|
| 183 | char *buffer = (char *) malloc(len + 2); |
|---|
| 184 | strcpy(buffer, data); |
|---|
| 185 | buffer[len++] = '.'; |
|---|
| 186 | buffer[len] = 0; |
|---|
| 187 | freeset(data, buffer); |
|---|
| 188 | } |
|---|
| 189 | *psize = len; |
|---|
| 190 | } |
|---|
| 191 | else { |
|---|
| 192 | *psize = 0; |
|---|
| 193 | } |
|---|
| 194 | return data; |
|---|
| 195 | } |
|---|
| 196 | |
|---|
| 197 | char *probe_read_alignment(int j, int *psize) { |
|---|
| 198 | char *buffer = 0; |
|---|
| 199 | GBDATA *gb_ali = GB_entry(psg.data[j].gbd, psg.alignment_name); |
|---|
| 200 | |
|---|
| 201 | if (gb_ali) { |
|---|
| 202 | GBDATA *gb_data = GB_entry(gb_ali, "data"); |
|---|
| 203 | if (gb_data) buffer = probe_read_string_append_point(gb_data, psize); |
|---|
| 204 | } |
|---|
| 205 | return buffer; |
|---|
| 206 | } |
|---|
| 207 | |
|---|
| 208 | void probe_read_alignments() { |
|---|
| 209 | // reads sequence data into psg.data |
|---|
| 210 | |
|---|
| 211 | GB_begin_transaction(psg.gb_main); |
|---|
| 212 | |
|---|
| 213 | // read ref SAI (e.g. ecoli) |
|---|
| 214 | { |
|---|
| 215 | char *def_ref = GBT_get_default_ref(psg.gb_main); |
|---|
| 216 | GBDATA *gb_ref = GBT_find_SAI_rel_SAI_data(psg.gb_sai_data, def_ref); |
|---|
| 217 | |
|---|
| 218 | psg.ecoli = 0; |
|---|
| 219 | if (gb_ref) { |
|---|
| 220 | GBDATA *gb_data = GBT_read_sequence(gb_ref, psg.alignment_name); |
|---|
| 221 | if (gb_data) { |
|---|
| 222 | psg.ecoli = GB_read_string(gb_data); |
|---|
| 223 | } |
|---|
| 224 | } |
|---|
| 225 | free(def_ref); |
|---|
| 226 | } |
|---|
| 227 | |
|---|
| 228 | int count = GB_number_of_subentries(psg.gb_species_data); |
|---|
| 229 | |
|---|
| 230 | psg.data = (probe_input_data *)calloc(sizeof(probe_input_data), count); |
|---|
| 231 | psg.data_count = 0; |
|---|
| 232 | |
|---|
| 233 | printf("Database contains %i species.\nPreparing sequence data:\n", count); |
|---|
| 234 | |
|---|
| 235 | int data_missing = 0; |
|---|
| 236 | int species_count = count; |
|---|
| 237 | |
|---|
| 238 | count = 0; |
|---|
| 239 | |
|---|
| 240 | for (GBDATA *gb_species = GBT_first_species_rel_species_data(psg.gb_species_data); |
|---|
| 241 | gb_species; |
|---|
| 242 | gb_species = GBT_next_species(gb_species) ) |
|---|
| 243 | { |
|---|
| 244 | probe_input_data& pid = psg.data[count]; |
|---|
| 245 | |
|---|
| 246 | pid.name = strdup(GBT_read_name(gb_species)); |
|---|
| 247 | pid.fullname = GBT_read_string(gb_species, "full_name"); |
|---|
| 248 | |
|---|
| 249 | if (!pid.fullname) pid.fullname = strdup(""); |
|---|
| 250 | |
|---|
| 251 | pid.is_group = 1; |
|---|
| 252 | pid.gbd = gb_species; |
|---|
| 253 | |
|---|
| 254 | GBDATA *gb_ali = GB_entry(gb_species, psg.alignment_name); |
|---|
| 255 | if (gb_ali) { |
|---|
| 256 | GBDATA *gb_data = GB_entry(gb_ali,"data"); |
|---|
| 257 | if (!gb_data) { |
|---|
| 258 | fprintf(stderr,"Species '%s' has no data in '%s'\n", pid.name, psg.alignment_name); |
|---|
| 259 | data_missing++; |
|---|
| 260 | } |
|---|
| 261 | else { |
|---|
| 262 | int hsize; |
|---|
| 263 | char *data = probe_read_string_append_point(gb_data, &hsize); |
|---|
| 264 | |
|---|
| 265 | if (!data) { |
|---|
| 266 | GB_ERROR error = GB_await_error(); |
|---|
| 267 | fprintf(stderr, "Could not read data in '%s' for species '%s'\n(Reason: %s)\n", |
|---|
| 268 | psg.alignment_name, pid.name, error); |
|---|
| 269 | data_missing++; |
|---|
| 270 | } |
|---|
| 271 | else { |
|---|
| 272 | pid.checksum = GB_checksum(data, hsize, 1, ".-"); |
|---|
| 273 | int size = probe_compress_sequence(data, hsize); |
|---|
| 274 | |
|---|
| 275 | pid.data = (char *)gbs_malloc_copy(data, size); |
|---|
| 276 | pid.size = size; |
|---|
| 277 | |
|---|
| 278 | count ++; |
|---|
| 279 | if (count%10 == 0) { |
|---|
| 280 | if (count%500) fputc('.', stdout); |
|---|
| 281 | else printf(".%6i (%5.1f%%)\n", count, ((double)count/species_count)*100); |
|---|
| 282 | fflush(stdout); |
|---|
| 283 | } |
|---|
| 284 | } |
|---|
| 285 | free(data); |
|---|
| 286 | } |
|---|
| 287 | } |
|---|
| 288 | } |
|---|
| 289 | |
|---|
| 290 | psg.data_count = count; |
|---|
| 291 | GB_commit_transaction(psg.gb_main); |
|---|
| 292 | |
|---|
| 293 | if (data_missing) { |
|---|
| 294 | printf("\n%i species were ignored because of missing data.\n", data_missing); |
|---|
| 295 | } |
|---|
| 296 | else { |
|---|
| 297 | printf("\nAll species contain data in alignment '%s'.\n", psg.alignment_name); |
|---|
| 298 | } |
|---|
| 299 | fflush(stdout); |
|---|
| 300 | } |
|---|
| 301 | |
|---|
| 302 | void PT_build_species_hash(void){ |
|---|
| 303 | long i; |
|---|
| 304 | psg.namehash = GBS_create_hash(PT_NAME_HASH_SIZE, GB_MIND_CASE); |
|---|
| 305 | for (i=0;i<psg.data_count;i++){ |
|---|
| 306 | GBS_write_hash(psg.namehash, psg.data[i].name,i+1); |
|---|
| 307 | } |
|---|
| 308 | unsigned int max_size; |
|---|
| 309 | max_size = 0; |
|---|
| 310 | for (i = 0; i < psg.data_count; i++){ /* get max sequence len */ |
|---|
| 311 | max_size = max(max_size, (unsigned)(psg.data[i].size)); |
|---|
| 312 | psg.char_count += psg.data[i].size; |
|---|
| 313 | } |
|---|
| 314 | psg.max_size = max_size; |
|---|
| 315 | |
|---|
| 316 | if ( psg.ecoli){ |
|---|
| 317 | BI_ecoli_ref *ref = new BI_ecoli_ref; |
|---|
| 318 | const char *error = ref->init(psg.ecoli,strlen(psg.ecoli)); |
|---|
| 319 | if (error) { |
|---|
| 320 | delete psg.ecoli; |
|---|
| 321 | psg.ecoli = 0; |
|---|
| 322 | }else{ |
|---|
| 323 | psg.bi_ecoli = ref; |
|---|
| 324 | } |
|---|
| 325 | } |
|---|
| 326 | } |
|---|
| 327 | |
|---|
| 328 | |
|---|
| 329 | long PT_abs_2_rel(long pos) { |
|---|
| 330 | if (!psg.ecoli) return pos; |
|---|
| 331 | return psg.bi_ecoli->abs_2_rel(pos); |
|---|
| 332 | } |
|---|
| 333 | |
|---|
| 334 | long PT_rel_2_abs(long pos) { |
|---|
| 335 | if (!psg.ecoli) return pos; |
|---|
| 336 | return psg.bi_ecoli->rel_2_abs(pos); |
|---|
| 337 | } |
|---|