1 | #include <stdio.h> |
---|
2 | #include <stdlib.h> |
---|
3 | #include <malloc.h> |
---|
4 | #include <memory.h> |
---|
5 | #include <string.h> |
---|
6 | #include <PT_server.h> |
---|
7 | #include "probe.h" |
---|
8 | #include <arbdbt.h> |
---|
9 | #include <BI_helix.hxx> |
---|
10 | extern "C" { char *gbs_malloc_copy(char *,long); } |
---|
11 | |
---|
12 | |
---|
13 | /* change a sequence with normal bases the PT_? format and delete all other signs */ |
---|
14 | int compress_data(char *probestring) |
---|
15 | { |
---|
16 | char c; |
---|
17 | char *src, |
---|
18 | *dest; |
---|
19 | dest = src = probestring; |
---|
20 | |
---|
21 | while((c=*(src++))) |
---|
22 | { |
---|
23 | switch (c) { |
---|
24 | case 'A': |
---|
25 | case 'a': *(dest++) = PT_A;break; |
---|
26 | case 'C': |
---|
27 | case 'c': *(dest++) = PT_C;break; |
---|
28 | case 'G': |
---|
29 | case 'g': *(dest++) = PT_G;break; |
---|
30 | case 'U': |
---|
31 | case 'u': |
---|
32 | case 'T': |
---|
33 | case 't': *(dest++) = PT_T;break; |
---|
34 | case 'N': |
---|
35 | case 'n': *(dest++) = PT_N;break; |
---|
36 | default: break; |
---|
37 | } |
---|
38 | |
---|
39 | } |
---|
40 | *dest = PT_QU; |
---|
41 | return 0; |
---|
42 | } |
---|
43 | |
---|
44 | /* get a string with readable bases from a string with PT_? */ |
---|
45 | int PT_base_2_string(char *id_string, long len) |
---|
46 | { |
---|
47 | char c; |
---|
48 | char *src, |
---|
49 | *dest; |
---|
50 | if (!len) len = strlen(id_string); |
---|
51 | dest = src = id_string; |
---|
52 | |
---|
53 | while((len--)>0){ |
---|
54 | c=*(src++); |
---|
55 | switch (c) { |
---|
56 | case PT_A: *(dest++) = 'A';break; |
---|
57 | case PT_C: *(dest++) = 'C';break; |
---|
58 | case PT_G: *(dest++) = 'G';break; |
---|
59 | case PT_T: *(dest++) = 'U';break; |
---|
60 | case PT_N: *(dest++) = 'N';break; |
---|
61 | case 0: *(dest++) = '0'; break; |
---|
62 | default: *(dest++) = c;break; |
---|
63 | } |
---|
64 | |
---|
65 | } |
---|
66 | *dest = '\0'; |
---|
67 | return 0; |
---|
68 | } |
---|
69 | |
---|
70 | void probe_read_data_base(char *name) |
---|
71 | { |
---|
72 | GBDATA *gb_main; |
---|
73 | GBDATA *gb_species_data; |
---|
74 | gb_main = GB_open(name,"r"); |
---|
75 | if (!gb_main) { |
---|
76 | printf("Error reading file %s\n",name); |
---|
77 | exit (-1); |
---|
78 | } |
---|
79 | GB_begin_transaction(gb_main); |
---|
80 | gb_species_data = GB_find(gb_main,"species_data",0,down_level); |
---|
81 | if (!gb_species_data){ |
---|
82 | printf("Database %s is empty\n",name); |
---|
83 | exit (-1); |
---|
84 | } |
---|
85 | psg.gb_main = gb_main; |
---|
86 | psg.gb_species_data = gb_species_data; |
---|
87 | psg.gb_extended_data = GB_search(gb_main,"extended_data",GB_CREATE_CONTAINER); |
---|
88 | GB_commit_transaction(gb_main); |
---|
89 | } |
---|
90 | |
---|
91 | int probe_compress_sequence(char *seq) |
---|
92 | { |
---|
93 | static uchar *tab = 0; |
---|
94 | if (!tab){ |
---|
95 | tab = (uchar *)malloc(256); |
---|
96 | memset(tab,PT_N,256); |
---|
97 | tab['A'] = tab['a'] = PT_A; |
---|
98 | tab['C'] = tab['c'] = PT_C; |
---|
99 | tab['G'] = tab['g'] = PT_G; |
---|
100 | tab['T'] = tab['t'] = PT_T; |
---|
101 | tab['U'] = tab['u'] = PT_T; |
---|
102 | tab['.'] = PT_QU; |
---|
103 | } |
---|
104 | |
---|
105 | char *dest; |
---|
106 | uchar c; |
---|
107 | char *source; |
---|
108 | int last_is_q = 0; |
---|
109 | dest = source = seq; |
---|
110 | while((c=(uchar)*(seq++))) |
---|
111 | { |
---|
112 | if (c == '-') continue; |
---|
113 | c = tab[c]; |
---|
114 | if (c == PT_QU){ |
---|
115 | if (last_is_q) continue; |
---|
116 | last_is_q = 1; |
---|
117 | }else{ |
---|
118 | last_is_q = 0; |
---|
119 | } |
---|
120 | *(dest)++ = c; |
---|
121 | } |
---|
122 | |
---|
123 | if (dest[-1] != PT_QU){ |
---|
124 | *(dest++) = PT_QU; |
---|
125 | printf("error\n"); |
---|
126 | } |
---|
127 | return dest-source; |
---|
128 | } |
---|
129 | char *probe_read_string_append_point(GBDATA *gb_data,int *psize) |
---|
130 | { |
---|
131 | char *data; |
---|
132 | char *buffer; |
---|
133 | int len; |
---|
134 | len = (int)GB_read_string_count(gb_data); |
---|
135 | data = GB_read_string(gb_data); |
---|
136 | if (data[len-1] != '.') { |
---|
137 | buffer = (char *)calloc(sizeof(char),len+2); |
---|
138 | strcpy(buffer,data); |
---|
139 | buffer[len++] = '.'; |
---|
140 | free(data); |
---|
141 | data = buffer; |
---|
142 | } |
---|
143 | *psize = len; |
---|
144 | return data; |
---|
145 | } |
---|
146 | char * |
---|
147 | probe_read_alignment(int j, int *psize) |
---|
148 | { |
---|
149 | static char *buffer = 0; |
---|
150 | GBDATA *gb_ali,*gb_data; |
---|
151 | buffer = 0; |
---|
152 | gb_ali = GB_find(psg.data[j].gbd, psg.alignment_name, 0, down_level); |
---|
153 | if (!gb_ali) |
---|
154 | return 0; |
---|
155 | /* this alignment dont exists */ |
---|
156 | gb_data = GB_find(gb_ali, "data", 0, down_level); |
---|
157 | if (!gb_data) { |
---|
158 | return 0; |
---|
159 | } |
---|
160 | buffer = probe_read_string_append_point(gb_data,psize); |
---|
161 | return buffer; |
---|
162 | } |
---|
163 | void probe_read_alignments() |
---|
164 | /* liest die Alignments in psg.data */ |
---|
165 | { |
---|
166 | GBDATA *gb_species; |
---|
167 | GBDATA *gb_name; |
---|
168 | GBDATA *gb_ali; |
---|
169 | GBDATA *gb_data; |
---|
170 | char *data; |
---|
171 | int count; |
---|
172 | static int size,hsize; |
---|
173 | count = 0; |
---|
174 | GB_begin_transaction(psg.gb_main); |
---|
175 | /* count all data */ |
---|
176 | char *def_ref = GBT_get_default_ref(psg.gb_main); |
---|
177 | gb_species = GBT_find_SAI_rel_exdata(psg.gb_extended_data, def_ref); |
---|
178 | delete def_ref; |
---|
179 | psg.ecoli = 0; |
---|
180 | if (gb_species) { |
---|
181 | gb_data = GBT_read_sequence(gb_species,psg.alignment_name); |
---|
182 | if (gb_data) { |
---|
183 | psg.ecoli = GB_read_string(gb_data); |
---|
184 | } |
---|
185 | } |
---|
186 | for ( gb_species = GBT_first_species_rel_species_data(psg.gb_species_data); |
---|
187 | gb_species; |
---|
188 | gb_species = GBT_next_species(gb_species) ){ |
---|
189 | count ++; |
---|
190 | } |
---|
191 | psg.data = (struct probe_input_data *) |
---|
192 | calloc(sizeof(struct probe_input_data),count); |
---|
193 | psg.data_count = 0; |
---|
194 | count = 0; |
---|
195 | for ( gb_species = GBT_first_species_rel_species_data(psg.gb_species_data); |
---|
196 | gb_species; |
---|
197 | gb_species = GBT_next_species(gb_species) ){ |
---|
198 | gb_name = GB_find(gb_species,"name",0,down_level); |
---|
199 | if (!gb_name) continue; |
---|
200 | psg.data[count].name = GB_read_string(gb_name); |
---|
201 | gb_name = GB_find(gb_species,"full_name",0,down_level); |
---|
202 | if (gb_name) { |
---|
203 | psg.data[count].fullname = GB_read_string(gb_name); |
---|
204 | }else{ |
---|
205 | static char empty[] = ""; |
---|
206 | psg.data[count].fullname = empty; |
---|
207 | } |
---|
208 | psg.data[count].is_group = 1; |
---|
209 | psg.data[count].gbd = gb_species; |
---|
210 | /* get alignments */ |
---|
211 | gb_ali = GB_find(gb_species,psg.alignment_name,0,down_level); |
---|
212 | if (!gb_ali) continue; |
---|
213 | /* this alignment doesnt exist */ |
---|
214 | gb_data = GB_find(gb_ali,"data",0,down_level); |
---|
215 | if (!gb_data) { |
---|
216 | fprintf(stderr,"Bact. %s has no ali_xxx/data\n", |
---|
217 | psg.data[count].name); |
---|
218 | continue; |
---|
219 | } |
---|
220 | psg.data[count].checksum = GBS_checksum(GB_read_char_pntr(gb_data),1,".-"); |
---|
221 | data = probe_read_string_append_point(gb_data,&hsize); |
---|
222 | size = probe_compress_sequence(data); |
---|
223 | psg.data[count].data = (char *)gbs_malloc_copy(data,size); |
---|
224 | psg.data[count].size = size; |
---|
225 | free(data); |
---|
226 | count ++; |
---|
227 | } |
---|
228 | psg.data_count = count; |
---|
229 | GB_commit_transaction(psg.gb_main); |
---|
230 | } |
---|
231 | |
---|
232 | void PT_build_species_hash(void){ |
---|
233 | long i; |
---|
234 | psg.namehash = GBS_create_hash(PT_NAME_HASH_SIZE,0); |
---|
235 | for (i=0;i<psg.data_count;i++){ |
---|
236 | GBS_write_hash(psg.namehash, psg.data[i].name,i+1); |
---|
237 | } |
---|
238 | unsigned int max_size; |
---|
239 | max_size = 0; |
---|
240 | for (i = 0; i < psg.data_count; i++){ /* get max sequence len */ |
---|
241 | max_size = max(max_size, (unsigned)(psg.data[i].size)); |
---|
242 | psg.char_count += psg.data[i].size; |
---|
243 | } |
---|
244 | psg.max_size = max_size; |
---|
245 | |
---|
246 | if ( psg.ecoli){ |
---|
247 | BI_ecoli_ref *ref = new BI_ecoli_ref; |
---|
248 | const char *error = ref->init(psg.ecoli,strlen(psg.ecoli)); |
---|
249 | if (error) { |
---|
250 | delete psg.ecoli; |
---|
251 | psg.ecoli = 0; |
---|
252 | }else{ |
---|
253 | psg.bi_ecoli = (void *)ref; |
---|
254 | } |
---|
255 | } |
---|
256 | } |
---|
257 | |
---|
258 | |
---|
259 | long PT_abs_2_rel(long pos) |
---|
260 | { |
---|
261 | if (!psg.ecoli) return pos; |
---|
262 | long res,dummy; |
---|
263 | ((BI_ecoli_ref *)psg.bi_ecoli)->abs_2_rel(pos,res,dummy); |
---|
264 | return res; |
---|
265 | } |
---|
266 | |
---|
267 | long PT_rel_2_abs(long pos) |
---|
268 | { |
---|
269 | if (!psg.ecoli) return pos; |
---|
270 | long res; |
---|
271 | ((BI_ecoli_ref *)psg.bi_ecoli)->rel_2_abs(pos,0,res); |
---|
272 | return res; |
---|
273 | } |
---|