| 1 | #include <stdio.h> |
|---|
| 2 | #include <stdlib.h> |
|---|
| 3 | #include <malloc.h> |
|---|
| 4 | #include <memory.h> |
|---|
| 5 | #include <string.h> |
|---|
| 6 | #include <PT_server.h> |
|---|
| 7 | #include "probe.h" |
|---|
| 8 | #include <arbdbt.h> |
|---|
| 9 | #include <BI_helix.hxx> |
|---|
| 10 | extern "C" { char *gbs_malloc_copy(char *,long); } |
|---|
| 11 | |
|---|
| 12 | |
|---|
| 13 | /* change a sequence with normal bases the PT_? format and delete all other signs */ |
|---|
| 14 | int compress_data(char *probestring) |
|---|
| 15 | { |
|---|
| 16 | char c; |
|---|
| 17 | char *src, |
|---|
| 18 | *dest; |
|---|
| 19 | dest = src = probestring; |
|---|
| 20 | |
|---|
| 21 | while((c=*(src++))) |
|---|
| 22 | { |
|---|
| 23 | switch (c) { |
|---|
| 24 | case 'A': |
|---|
| 25 | case 'a': *(dest++) = PT_A;break; |
|---|
| 26 | case 'C': |
|---|
| 27 | case 'c': *(dest++) = PT_C;break; |
|---|
| 28 | case 'G': |
|---|
| 29 | case 'g': *(dest++) = PT_G;break; |
|---|
| 30 | case 'U': |
|---|
| 31 | case 'u': |
|---|
| 32 | case 'T': |
|---|
| 33 | case 't': *(dest++) = PT_T;break; |
|---|
| 34 | case 'N': |
|---|
| 35 | case 'n': *(dest++) = PT_N;break; |
|---|
| 36 | default: break; |
|---|
| 37 | } |
|---|
| 38 | |
|---|
| 39 | } |
|---|
| 40 | *dest = PT_QU; |
|---|
| 41 | return 0; |
|---|
| 42 | } |
|---|
| 43 | |
|---|
| 44 | /* get a string with readable bases from a string with PT_? */ |
|---|
| 45 | int PT_base_2_string(char *id_string, long len) |
|---|
| 46 | { |
|---|
| 47 | char c; |
|---|
| 48 | char *src, |
|---|
| 49 | *dest; |
|---|
| 50 | if (!len) len = strlen(id_string); |
|---|
| 51 | dest = src = id_string; |
|---|
| 52 | |
|---|
| 53 | while((len--)>0){ |
|---|
| 54 | c=*(src++); |
|---|
| 55 | switch (c) { |
|---|
| 56 | case PT_A: *(dest++) = 'A';break; |
|---|
| 57 | case PT_C: *(dest++) = 'C';break; |
|---|
| 58 | case PT_G: *(dest++) = 'G';break; |
|---|
| 59 | case PT_T: *(dest++) = 'U';break; |
|---|
| 60 | case PT_N: *(dest++) = 'N';break; |
|---|
| 61 | case 0: *(dest++) = '0'; break; |
|---|
| 62 | default: *(dest++) = c;break; |
|---|
| 63 | } |
|---|
| 64 | |
|---|
| 65 | } |
|---|
| 66 | *dest = '\0'; |
|---|
| 67 | return 0; |
|---|
| 68 | } |
|---|
| 69 | |
|---|
| 70 | void probe_read_data_base(char *name) |
|---|
| 71 | { |
|---|
| 72 | GBDATA *gb_main; |
|---|
| 73 | GBDATA *gb_species_data; |
|---|
| 74 | gb_main = GB_open(name,"r"); |
|---|
| 75 | if (!gb_main) { |
|---|
| 76 | printf("Error reading file %s\n",name); |
|---|
| 77 | exit (-1); |
|---|
| 78 | } |
|---|
| 79 | GB_begin_transaction(gb_main); |
|---|
| 80 | gb_species_data = GB_find(gb_main,"species_data",0,down_level); |
|---|
| 81 | if (!gb_species_data){ |
|---|
| 82 | printf("Database %s is empty\n",name); |
|---|
| 83 | exit (-1); |
|---|
| 84 | } |
|---|
| 85 | psg.gb_main = gb_main; |
|---|
| 86 | psg.gb_species_data = gb_species_data; |
|---|
| 87 | psg.gb_extended_data = GB_search(gb_main,"extended_data",GB_CREATE_CONTAINER); |
|---|
| 88 | GB_commit_transaction(gb_main); |
|---|
| 89 | } |
|---|
| 90 | |
|---|
| 91 | int probe_compress_sequence(char *seq) |
|---|
| 92 | { |
|---|
| 93 | static uchar *tab = 0; |
|---|
| 94 | if (!tab){ |
|---|
| 95 | tab = (uchar *)malloc(256); |
|---|
| 96 | memset(tab,PT_N,256); |
|---|
| 97 | tab['A'] = tab['a'] = PT_A; |
|---|
| 98 | tab['C'] = tab['c'] = PT_C; |
|---|
| 99 | tab['G'] = tab['g'] = PT_G; |
|---|
| 100 | tab['T'] = tab['t'] = PT_T; |
|---|
| 101 | tab['U'] = tab['u'] = PT_T; |
|---|
| 102 | tab['.'] = PT_QU; |
|---|
| 103 | } |
|---|
| 104 | |
|---|
| 105 | char *dest; |
|---|
| 106 | uchar c; |
|---|
| 107 | char *source; |
|---|
| 108 | int last_is_q = 0; |
|---|
| 109 | dest = source = seq; |
|---|
| 110 | while((c=(uchar)*(seq++))) |
|---|
| 111 | { |
|---|
| 112 | if (c == '-') continue; |
|---|
| 113 | c = tab[c]; |
|---|
| 114 | if (c == PT_QU){ |
|---|
| 115 | if (last_is_q) continue; |
|---|
| 116 | last_is_q = 1; |
|---|
| 117 | }else{ |
|---|
| 118 | last_is_q = 0; |
|---|
| 119 | } |
|---|
| 120 | *(dest)++ = c; |
|---|
| 121 | } |
|---|
| 122 | |
|---|
| 123 | if (dest[-1] != PT_QU){ |
|---|
| 124 | *(dest++) = PT_QU; |
|---|
| 125 | printf("error\n"); |
|---|
| 126 | } |
|---|
| 127 | return dest-source; |
|---|
| 128 | } |
|---|
| 129 | char *probe_read_string_append_point(GBDATA *gb_data,int *psize) |
|---|
| 130 | { |
|---|
| 131 | char *data; |
|---|
| 132 | char *buffer; |
|---|
| 133 | int len; |
|---|
| 134 | len = (int)GB_read_string_count(gb_data); |
|---|
| 135 | data = GB_read_string(gb_data); |
|---|
| 136 | if (data[len-1] != '.') { |
|---|
| 137 | buffer = (char *)calloc(sizeof(char),len+2); |
|---|
| 138 | strcpy(buffer,data); |
|---|
| 139 | buffer[len++] = '.'; |
|---|
| 140 | free(data); |
|---|
| 141 | data = buffer; |
|---|
| 142 | } |
|---|
| 143 | *psize = len; |
|---|
| 144 | return data; |
|---|
| 145 | } |
|---|
| 146 | char * |
|---|
| 147 | probe_read_alignment(int j, int *psize) |
|---|
| 148 | { |
|---|
| 149 | static char *buffer = 0; |
|---|
| 150 | GBDATA *gb_ali,*gb_data; |
|---|
| 151 | buffer = 0; |
|---|
| 152 | gb_ali = GB_find(psg.data[j].gbd, psg.alignment_name, 0, down_level); |
|---|
| 153 | if (!gb_ali) |
|---|
| 154 | return 0; |
|---|
| 155 | /* this alignment dont exists */ |
|---|
| 156 | gb_data = GB_find(gb_ali, "data", 0, down_level); |
|---|
| 157 | if (!gb_data) { |
|---|
| 158 | return 0; |
|---|
| 159 | } |
|---|
| 160 | buffer = probe_read_string_append_point(gb_data,psize); |
|---|
| 161 | return buffer; |
|---|
| 162 | } |
|---|
| 163 | void probe_read_alignments() |
|---|
| 164 | /* liest die Alignments in psg.data */ |
|---|
| 165 | { |
|---|
| 166 | GBDATA *gb_species; |
|---|
| 167 | GBDATA *gb_name; |
|---|
| 168 | GBDATA *gb_ali; |
|---|
| 169 | GBDATA *gb_data; |
|---|
| 170 | char *data; |
|---|
| 171 | int count; |
|---|
| 172 | static int size,hsize; |
|---|
| 173 | count = 0; |
|---|
| 174 | GB_begin_transaction(psg.gb_main); |
|---|
| 175 | /* count all data */ |
|---|
| 176 | char *def_ref = GBT_get_default_ref(psg.gb_main); |
|---|
| 177 | gb_species = GBT_find_SAI_rel_exdata(psg.gb_extended_data, def_ref); |
|---|
| 178 | delete def_ref; |
|---|
| 179 | psg.ecoli = 0; |
|---|
| 180 | if (gb_species) { |
|---|
| 181 | gb_data = GBT_read_sequence(gb_species,psg.alignment_name); |
|---|
| 182 | if (gb_data) { |
|---|
| 183 | psg.ecoli = GB_read_string(gb_data); |
|---|
| 184 | } |
|---|
| 185 | } |
|---|
| 186 | for ( gb_species = GBT_first_species_rel_species_data(psg.gb_species_data); |
|---|
| 187 | gb_species; |
|---|
| 188 | gb_species = GBT_next_species(gb_species) ){ |
|---|
| 189 | count ++; |
|---|
| 190 | } |
|---|
| 191 | psg.data = (struct probe_input_data *) |
|---|
| 192 | calloc(sizeof(struct probe_input_data),count); |
|---|
| 193 | psg.data_count = 0; |
|---|
| 194 | count = 0; |
|---|
| 195 | for ( gb_species = GBT_first_species_rel_species_data(psg.gb_species_data); |
|---|
| 196 | gb_species; |
|---|
| 197 | gb_species = GBT_next_species(gb_species) ){ |
|---|
| 198 | gb_name = GB_find(gb_species,"name",0,down_level); |
|---|
| 199 | if (!gb_name) continue; |
|---|
| 200 | psg.data[count].name = GB_read_string(gb_name); |
|---|
| 201 | gb_name = GB_find(gb_species,"full_name",0,down_level); |
|---|
| 202 | if (gb_name) { |
|---|
| 203 | psg.data[count].fullname = GB_read_string(gb_name); |
|---|
| 204 | }else{ |
|---|
| 205 | static char empty[] = ""; |
|---|
| 206 | psg.data[count].fullname = empty; |
|---|
| 207 | } |
|---|
| 208 | psg.data[count].is_group = 1; |
|---|
| 209 | psg.data[count].gbd = gb_species; |
|---|
| 210 | /* get alignments */ |
|---|
| 211 | gb_ali = GB_find(gb_species,psg.alignment_name,0,down_level); |
|---|
| 212 | if (!gb_ali) continue; |
|---|
| 213 | /* this alignment doesnt exist */ |
|---|
| 214 | gb_data = GB_find(gb_ali,"data",0,down_level); |
|---|
| 215 | if (!gb_data) { |
|---|
| 216 | fprintf(stderr,"Bact. %s has no ali_xxx/data\n", |
|---|
| 217 | psg.data[count].name); |
|---|
| 218 | continue; |
|---|
| 219 | } |
|---|
| 220 | psg.data[count].checksum = GBS_checksum(GB_read_char_pntr(gb_data),1,".-"); |
|---|
| 221 | data = probe_read_string_append_point(gb_data,&hsize); |
|---|
| 222 | size = probe_compress_sequence(data); |
|---|
| 223 | psg.data[count].data = (char *)gbs_malloc_copy(data,size); |
|---|
| 224 | psg.data[count].size = size; |
|---|
| 225 | free(data); |
|---|
| 226 | count ++; |
|---|
| 227 | } |
|---|
| 228 | psg.data_count = count; |
|---|
| 229 | GB_commit_transaction(psg.gb_main); |
|---|
| 230 | } |
|---|
| 231 | |
|---|
| 232 | void PT_build_species_hash(void){ |
|---|
| 233 | long i; |
|---|
| 234 | psg.namehash = GBS_create_hash(PT_NAME_HASH_SIZE,0); |
|---|
| 235 | for (i=0;i<psg.data_count;i++){ |
|---|
| 236 | GBS_write_hash(psg.namehash, psg.data[i].name,i+1); |
|---|
| 237 | } |
|---|
| 238 | unsigned int max_size; |
|---|
| 239 | max_size = 0; |
|---|
| 240 | for (i = 0; i < psg.data_count; i++){ /* get max sequence len */ |
|---|
| 241 | max_size = max(max_size, (unsigned)(psg.data[i].size)); |
|---|
| 242 | psg.char_count += psg.data[i].size; |
|---|
| 243 | } |
|---|
| 244 | psg.max_size = max_size; |
|---|
| 245 | |
|---|
| 246 | if ( psg.ecoli){ |
|---|
| 247 | BI_ecoli_ref *ref = new BI_ecoli_ref; |
|---|
| 248 | const char *error = ref->init(psg.ecoli,strlen(psg.ecoli)); |
|---|
| 249 | if (error) { |
|---|
| 250 | delete psg.ecoli; |
|---|
| 251 | psg.ecoli = 0; |
|---|
| 252 | }else{ |
|---|
| 253 | psg.bi_ecoli = (void *)ref; |
|---|
| 254 | } |
|---|
| 255 | } |
|---|
| 256 | } |
|---|
| 257 | |
|---|
| 258 | |
|---|
| 259 | long PT_abs_2_rel(long pos) |
|---|
| 260 | { |
|---|
| 261 | if (!psg.ecoli) return pos; |
|---|
| 262 | long res,dummy; |
|---|
| 263 | ((BI_ecoli_ref *)psg.bi_ecoli)->abs_2_rel(pos,res,dummy); |
|---|
| 264 | return res; |
|---|
| 265 | } |
|---|
| 266 | |
|---|
| 267 | long PT_rel_2_abs(long pos) |
|---|
| 268 | { |
|---|
| 269 | if (!psg.ecoli) return pos; |
|---|
| 270 | long res; |
|---|
| 271 | ((BI_ecoli_ref *)psg.bi_ecoli)->rel_2_abs(pos,0,res); |
|---|
| 272 | return res; |
|---|
| 273 | } |
|---|