1 | #include <stdio.h> |
---|
2 | #include <string.h> |
---|
3 | #include <memory.h> |
---|
4 | // #include <malloc.h> |
---|
5 | #include <arbdb.h> |
---|
6 | #include <arbdbt.h> |
---|
7 | #include "tc.hxx" |
---|
8 | #include <BI_helix.hxx> |
---|
9 | |
---|
10 | void pt_align(FILE *out, int &pos, int mod) { |
---|
11 | int m = pos % mod; |
---|
12 | if (!m) return; |
---|
13 | while (m<mod) { |
---|
14 | if (out) putc(0,out); |
---|
15 | pos++; |
---|
16 | m++; |
---|
17 | } |
---|
18 | } |
---|
19 | |
---|
20 | char *pt_getstring(FILE *in){ |
---|
21 | char *p,*buffer; |
---|
22 | int c; |
---|
23 | void *strstruct = GBS_stropen(1024); |
---|
24 | GBS_strcat(strstruct,0); |
---|
25 | for (c = 1; c; ) { |
---|
26 | c = getc(in); |
---|
27 | if (c== EOF) return 0; |
---|
28 | GBS_chrcat(strstruct,c); |
---|
29 | } |
---|
30 | p = GBS_strclose(strstruct); |
---|
31 | |
---|
32 | if (!p[0]){ |
---|
33 | delete p; |
---|
34 | return 0; |
---|
35 | } |
---|
36 | if (p[0] == 1) { |
---|
37 | buffer = strdup(p+1); |
---|
38 | }else{ |
---|
39 | buffer = strdup(p); |
---|
40 | } |
---|
41 | delete p; |
---|
42 | return buffer; |
---|
43 | } |
---|
44 | |
---|
45 | int pt_putstring(char *s,FILE *out){ |
---|
46 | if (!s) { |
---|
47 | putc(0,out); |
---|
48 | return 0; |
---|
49 | } |
---|
50 | int len = strlen(s)+1; |
---|
51 | putc(1,out); |
---|
52 | len = fwrite(s,len,1,out); |
---|
53 | if (1 != len){ |
---|
54 | return -1; |
---|
55 | } |
---|
56 | return 0; |
---|
57 | } |
---|
58 | |
---|
59 | |
---|
60 | master_gap_compress::master_gap_compress(void) { |
---|
61 | memset((char *)this,0,sizeof(master_gap_compress)); |
---|
62 | } |
---|
63 | client_gap_compress::client_gap_compress(master_gap_compress *mgc) { |
---|
64 | memset((char *)this,0,sizeof(client_gap_compress)); |
---|
65 | master = mgc; |
---|
66 | memsize = MEM_ALLOC_INIT_CLIENT; |
---|
67 | l = (client_gap_compress_data *)calloc(sizeof(client_gap_compress_data),memsize); |
---|
68 | } |
---|
69 | |
---|
70 | master_gap_compress::~master_gap_compress(void) { |
---|
71 | delete rel_2_abs; |
---|
72 | delete rel_2_abss; |
---|
73 | } |
---|
74 | |
---|
75 | client_gap_compress::~client_gap_compress(void) { |
---|
76 | delete l; |
---|
77 | } |
---|
78 | |
---|
79 | void client_gap_compress::clean(master_gap_compress *mgc){ |
---|
80 | if (memsize < MEM_ALLOC_INIT_CLIENT) { |
---|
81 | delete l; |
---|
82 | memset((char *)this,0,sizeof(client_gap_compress)); |
---|
83 | memsize = MEM_ALLOC_INIT_CLIENT; |
---|
84 | l = (client_gap_compress_data *)calloc(sizeof(client_gap_compress_data),memsize); |
---|
85 | }else{ |
---|
86 | client_gap_compress_data *oldl = l; |
---|
87 | int old_memsize = memsize; |
---|
88 | memset((char *)this,0,sizeof(client_gap_compress)); |
---|
89 | memsize = old_memsize; |
---|
90 | l = oldl; |
---|
91 | } |
---|
92 | master = mgc; |
---|
93 | } |
---|
94 | |
---|
95 | void master_gap_compress::optimize(void) { |
---|
96 | if (!rel_2_abs) return; |
---|
97 | if (rel_2_abs[len-1] > 0x7fff) { |
---|
98 | rel_2_abs = (long *)realloc((char *)rel_2_abs,len*sizeof(long)); |
---|
99 | }else{ |
---|
100 | rel_2_abss = (short *)malloc(sizeof(short) * len); |
---|
101 | int i; |
---|
102 | for (i=0;i<len;i++) { |
---|
103 | rel_2_abss[i] = (short) rel_2_abs[i]; |
---|
104 | } |
---|
105 | delete rel_2_abs; rel_2_abs = 0; |
---|
106 | } |
---|
107 | |
---|
108 | memsize = len; |
---|
109 | } |
---|
110 | |
---|
111 | void master_gap_compress::write(long rel, long abs){ |
---|
112 | if (rel_2_abss) GB_CORE; // no write after optimize |
---|
113 | |
---|
114 | if (!rel_2_abs) { |
---|
115 | memsize = MEM_ALLOC_INIT_MASTER; |
---|
116 | rel_2_abs = (long *)calloc(sizeof(long),memsize); |
---|
117 | } |
---|
118 | |
---|
119 | if (rel >= memsize) { |
---|
120 | memsize = memsize * MEM_ALLOC_FACTOR; |
---|
121 | rel_2_abs = (long *)realloc((char *)rel_2_abs,sizeof(long)*memsize); |
---|
122 | } |
---|
123 | for( ; len<rel; len++) rel_2_abs[len] = 0; |
---|
124 | |
---|
125 | rel_2_abs[rel] = abs; |
---|
126 | if (len <= rel) len = (int)rel+1; |
---|
127 | } |
---|
128 | |
---|
129 | long master_gap_compress::read(long rel, long def_abs){ |
---|
130 | if (rel>=len) { |
---|
131 | this->write(rel,def_abs); |
---|
132 | return def_abs; |
---|
133 | } |
---|
134 | if (rel_2_abss) return rel_2_abss[rel]; |
---|
135 | return rel_2_abs[rel]; |
---|
136 | } |
---|
137 | |
---|
138 | void client_gap_compress::basic_write(long rel, long x, long y){ |
---|
139 | |
---|
140 | if (rel >= memsize) { |
---|
141 | memsize = (int)rel * MEM_ALLOC_FACTOR; |
---|
142 | l = (client_gap_compress_data *)realloc( |
---|
143 | (char *)l,sizeof(client_gap_compress_data)*memsize); |
---|
144 | } |
---|
145 | l[rel].rel = rel; |
---|
146 | l[rel].x = (short)x; |
---|
147 | l[rel].y = y; |
---|
148 | if (len <= rel) len = (int)rel+1; |
---|
149 | } |
---|
150 | |
---|
151 | void client_gap_compress::write(long rel, long abs) { |
---|
152 | if (rel >= master->len) { |
---|
153 | master->write(rel,abs); |
---|
154 | this->basic_write(rel,0,0); |
---|
155 | return; |
---|
156 | } |
---|
157 | long m,r; |
---|
158 | long *ml = master->rel_2_abs; |
---|
159 | old_rel++; |
---|
160 | old_m++; |
---|
161 | // search ?: rel_2_abs[?] <= abs && rel_2_abs[?+1] > abs |
---|
162 | |
---|
163 | if ( rel == old_rel && old_m < master->len && (r=ml[old_m] ) >= abs ) { |
---|
164 | if (r == abs) { |
---|
165 | m = old_m; |
---|
166 | }else{ |
---|
167 | if (ml[ old_m -1 ] > abs) goto cgc_search_by_divide; |
---|
168 | old_m--; |
---|
169 | m = old_m; |
---|
170 | } |
---|
171 | }else{ |
---|
172 | cgc_search_by_divide: |
---|
173 | long l; |
---|
174 | l = 0; r = master->len-1; |
---|
175 | while (l<r-1) { // search the masters rel_2_abs[?] == abs |
---|
176 | m = (l+r)>>1; |
---|
177 | if ( ml[m] <= abs ) { |
---|
178 | l = m; |
---|
179 | continue; |
---|
180 | } |
---|
181 | r = m; |
---|
182 | } |
---|
183 | if ( ml[r] <= abs) m = r; else m = l; |
---|
184 | old_rel = rel; |
---|
185 | old_m = m; |
---|
186 | } |
---|
187 | this->basic_write(rel,m - rel,abs - ml[m] ); |
---|
188 | } |
---|
189 | |
---|
190 | // remove (eval. ) duplicated entries in client compress |
---|
191 | // eg. (4,20,5) (5,21,5) -> delete second |
---|
192 | |
---|
193 | void client_gap_compress::optimize(int realloc_flag) { |
---|
194 | if (!len) return; |
---|
195 | int i; |
---|
196 | int newlen = 0; |
---|
197 | long old_x = -9999; |
---|
198 | long old_y = -9999; |
---|
199 | long old_dx = 0; |
---|
200 | |
---|
201 | for (i=0; i<len; i++) { |
---|
202 | if ( l[i].x != old_x || l[i].y != old_y ) { |
---|
203 | if (i<len+1) { |
---|
204 | old_dx = l[i+1].x - l[i].x; |
---|
205 | if (old_dx < -1 ) old_dx = 0; |
---|
206 | if (old_dx > 0 ) old_dx = 0; |
---|
207 | } |
---|
208 | old_x = l[i].x; |
---|
209 | old_y = l[i].y; |
---|
210 | l[i].dx = (short)old_dx; |
---|
211 | l[newlen++] = l[i]; |
---|
212 | } |
---|
213 | old_x += old_dx; |
---|
214 | old_y -= old_dx; |
---|
215 | } |
---|
216 | if (realloc_flag) { |
---|
217 | l = (struct client_gap_compress_data *)realloc((char *)l, |
---|
218 | sizeof(struct client_gap_compress_data) * newlen); |
---|
219 | } |
---|
220 | len = memsize = newlen; |
---|
221 | } |
---|
222 | |
---|
223 | |
---|
224 | long client_gap_compress::rel_2_abs(long rel){ |
---|
225 | int left,right; |
---|
226 | int m; |
---|
227 | long lrel; |
---|
228 | right = len-1; |
---|
229 | left = 0; |
---|
230 | if (left>=right) { // client is master |
---|
231 | return master->read(rel,0); |
---|
232 | } |
---|
233 | while (left<right-1) { // search the clients l[??].rel <= rel |
---|
234 | // && l[??+1] > rel |
---|
235 | m = (left+right)>>1; |
---|
236 | lrel = l[m].rel; |
---|
237 | if ( lrel <= rel ) { |
---|
238 | left = m; |
---|
239 | continue; |
---|
240 | } |
---|
241 | right = m; |
---|
242 | } |
---|
243 | if ( l[right].rel <= rel) m = right; else m = left; |
---|
244 | long master_pos = l[m].rel; |
---|
245 | long pos; |
---|
246 | if (!l[m].dx) { // substitution or deletion |
---|
247 | master_pos += l[m].x; // masterpos offset |
---|
248 | master_pos += (rel - l[m].rel); // compressed data offset |
---|
249 | pos = master->read(master_pos ,0) // read master at rel+offset |
---|
250 | + l[m].y; |
---|
251 | }else{ // insertion |
---|
252 | master_pos += l[m].x; |
---|
253 | pos = master->read(master_pos ,0) // read master at rel+offset |
---|
254 | + (rel - l[m].rel) // + offset (missing data in compressed ) |
---|
255 | + l[m].y; |
---|
256 | } |
---|
257 | return pos; |
---|
258 | } |
---|
259 | |
---|
260 | gap_compress::gap_compress(void) { |
---|
261 | memset((char *)this,0,sizeof(gap_compress)); |
---|
262 | memclients = MEM_ALLOC_INIT_CLIENTS; |
---|
263 | clients = (client_gap_compress **)calloc(sizeof(void *),memclients); |
---|
264 | memmasters = MEM_ALLOC_INIT_MASTERS; |
---|
265 | masters = (master_gap_compress **)calloc(sizeof(void *),memmasters); |
---|
266 | } |
---|
267 | |
---|
268 | gap_compress::~gap_compress(void) { |
---|
269 | int i; |
---|
270 | for (i=0;i<nmasters;i++) delete masters[i]; |
---|
271 | delete masters; |
---|
272 | for (i=0;i<nclients;i++) delete clients[i]; |
---|
273 | delete clients; |
---|
274 | } |
---|
275 | |
---|
276 | void client_gap_compress::basic_write_sequence(char *sequence, long seq_len, int gap1,int gap2){ |
---|
277 | long rel = 0; |
---|
278 | long abs = 0; |
---|
279 | rel = 0; |
---|
280 | for (abs = 0; abs < seq_len; abs++) { |
---|
281 | if ( sequence[abs] == gap1 ) continue; |
---|
282 | if ( sequence[abs] == gap2 ) continue; |
---|
283 | this->write(rel,abs); |
---|
284 | rel++; |
---|
285 | } |
---|
286 | this->abs_seq_len = seq_len; |
---|
287 | this->rel_seq_len = rel; |
---|
288 | } |
---|
289 | // create a new master and client sequence |
---|
290 | void client_gap_compress::basic_write_master_sequence(char *sequence, long seq_len, int gap1,int gap2){ |
---|
291 | long rel = 0; |
---|
292 | long abs = 0; |
---|
293 | |
---|
294 | for (abs = 0; abs < seq_len; abs++) { // write sequence |
---|
295 | if ( sequence[abs] == gap1 ) continue; |
---|
296 | if ( sequence[abs] == gap2 ) continue; |
---|
297 | master->write(rel,abs); |
---|
298 | rel++; |
---|
299 | } |
---|
300 | this->basic_write(0,0,0); |
---|
301 | |
---|
302 | this->abs_seq_len = seq_len; |
---|
303 | this->rel_seq_len = rel; |
---|
304 | } |
---|
305 | |
---|
306 | master_gap_compress *gap_compress::search_best_master(client_gap_compress *client, |
---|
307 | master_gap_compress *exclude,long max_diff, |
---|
308 | char *sequence, long seq_len, int gap1,int gap2, int &best_alt) { |
---|
309 | |
---|
310 | long best = max_diff; |
---|
311 | best_alt = -1; |
---|
312 | master_gap_compress *best_master = 0; |
---|
313 | int best_try; |
---|
314 | master_gap_compress *master; |
---|
315 | int _try = 0; |
---|
316 | |
---|
317 | if (!nmasters) best_alt = (int)max_diff; |
---|
318 | |
---|
319 | for (_try = 0;_try<nmasters;_try++) { |
---|
320 | master = masters[_try]; |
---|
321 | if (master == exclude) continue; |
---|
322 | client->clean(master); |
---|
323 | client->basic_write_sequence(sequence,seq_len,gap1,gap2); |
---|
324 | client->optimize(0); |
---|
325 | if (best_alt < 0 || client->len < best_alt) best_alt = client->len; |
---|
326 | if (best <0 || client->len < best) { |
---|
327 | best = client->len; |
---|
328 | best_master = master; |
---|
329 | best_try = _try; |
---|
330 | if (best <=3 ) break; // very good choice, stop search |
---|
331 | } |
---|
332 | } |
---|
333 | if (!best_master) return 0; |
---|
334 | |
---|
335 | int i; |
---|
336 | for (i=best_try; i>0; i--) masters[i] = masters[i-1]; |
---|
337 | masters[0] = best_master; // bring found client to top of list |
---|
338 | |
---|
339 | client->clean(best_master); |
---|
340 | client->basic_write_sequence(sequence,seq_len,gap1,gap2); |
---|
341 | client->optimize(1); |
---|
342 | return best_master; |
---|
343 | } |
---|
344 | |
---|
345 | long sort_masters(void *m1, void *m2, char *cd ){ |
---|
346 | cd = cd; |
---|
347 | master_gap_compress *master1 = (master_gap_compress *)m1; |
---|
348 | master_gap_compress *master2 = (master_gap_compress *)m2; |
---|
349 | if( master1->ref_cnt * master1->alt_master > |
---|
350 | master2->ref_cnt * master2->alt_master ) |
---|
351 | return -1; |
---|
352 | return 0; |
---|
353 | } |
---|
354 | |
---|
355 | gap_index gap_compress::write_sequence(char *sequence, long seq_len, int gap1,int gap2){ |
---|
356 | client_gap_compress *client = 0; |
---|
357 | client = new client_gap_compress(0); |
---|
358 | master_gap_compress *master; |
---|
359 | int i; |
---|
360 | long max_diff = REL_DIFF_CLIENT_MASTER(seq_len); |
---|
361 | int alt; // alternate master !!!! |
---|
362 | |
---|
363 | |
---|
364 | master = search_best_master(client,0,max_diff, sequence, seq_len, gap1,gap2, alt); |
---|
365 | |
---|
366 | if (!master){ // create a new master !! |
---|
367 | master = new master_gap_compress(); |
---|
368 | master->alt_master = alt; |
---|
369 | client->clean(master); |
---|
370 | master->main_child = client; |
---|
371 | client->basic_write_sequence(sequence,seq_len,gap1,gap2); |
---|
372 | client->optimize(1); |
---|
373 | |
---|
374 | if (nmasters >= memmasters) { |
---|
375 | memmasters = memmasters * MEM_ALLOC_FACTOR; |
---|
376 | masters = (master_gap_compress **)realloc( |
---|
377 | (char *)masters,sizeof(void *)*memmasters); |
---|
378 | } |
---|
379 | for (i=nmasters; i>0; i--) masters[i] = masters[i-1]; |
---|
380 | nmasters++; |
---|
381 | masters[0] = master; |
---|
382 | } |
---|
383 | master->ref_cnt++; |
---|
384 | |
---|
385 | if (nmasters >= MAX_MASTERS) { // maximum masters exceeded delete |
---|
386 | int j; // search single refed masters and search new master |
---|
387 | // for the only single child |
---|
388 | for (j=0;j<MAX_MASTERS_REMOVE_N;j++){ |
---|
389 | GB_mergesort((void **)masters,0,nmasters, sort_masters,0); |
---|
390 | i = nmasters-1; |
---|
391 | int cli; |
---|
392 | for (cli = 0; cli < nclients; cli++) { |
---|
393 | client_gap_compress *cl2 = clients[cli]; |
---|
394 | if ( cl2->master != masters[i] ) continue; |
---|
395 | char *seq2 = this->read_sequence(cli); |
---|
396 | long seq_len2 = cl2->abs_seq_len; |
---|
397 | master = search_best_master(clients[cli], |
---|
398 | masters[i],-1, |
---|
399 | seq2, seq_len2, '-','-',alt); |
---|
400 | delete seq2; |
---|
401 | master->ref_cnt++; |
---|
402 | } |
---|
403 | nmasters--; |
---|
404 | delete masters[i]; |
---|
405 | } |
---|
406 | } |
---|
407 | |
---|
408 | if (nclients >= memclients) { |
---|
409 | memclients = memclients * MEM_ALLOC_FACTOR; |
---|
410 | clients = (client_gap_compress **)realloc( |
---|
411 | (char *)clients,sizeof(void *)*memclients); |
---|
412 | } |
---|
413 | clients[nclients++] = client; |
---|
414 | return nclients-1; |
---|
415 | } |
---|
416 | |
---|
417 | void gap_compress::optimize(void) { |
---|
418 | int i; |
---|
419 | for (i=0;i<nmasters;i++) masters[i]->optimize(); |
---|
420 | masters = (master_gap_compress **)realloc( |
---|
421 | (char *)masters,sizeof(master_gap_compress) * nmasters); |
---|
422 | memmasters = nmasters; |
---|
423 | clients = (client_gap_compress **)realloc( |
---|
424 | (char *)clients,sizeof(client_gap_compress) * nclients); |
---|
425 | memclients = nclients; |
---|
426 | } |
---|
427 | |
---|
428 | long gap_compress::rel_2_abs(gap_index index, long rel){ |
---|
429 | if (index >= nclients) GB_CORE; |
---|
430 | return clients[index]->rel_2_abs(rel); |
---|
431 | } |
---|
432 | |
---|
433 | char *gap_compress::read_sequence(gap_index index){ |
---|
434 | if (index >= nclients) GB_CORE; |
---|
435 | client_gap_compress *client = clients[index]; |
---|
436 | int i; |
---|
437 | long abs; |
---|
438 | char *res = (char *)malloc((int)client->abs_seq_len+1); |
---|
439 | memset(res,'-',(int)client->abs_seq_len); |
---|
440 | res[client->abs_seq_len] = 0; |
---|
441 | for (i=0;i<client->rel_seq_len; i++) { |
---|
442 | abs = client->rel_2_abs(i); |
---|
443 | res[abs] = '+'; |
---|
444 | } |
---|
445 | return res; |
---|
446 | } |
---|
447 | |
---|
448 | void gap_compress::statistic(int full){ |
---|
449 | |
---|
450 | int i; |
---|
451 | printf("nmasters %i\n",nmasters); |
---|
452 | printf("nclients %i\n",nclients); |
---|
453 | if (full) { |
---|
454 | long sum = 0; |
---|
455 | for (i=0;i<nmasters;i++) { |
---|
456 | printf(" %i: ref %i alt %i\n", |
---|
457 | i,masters[i]->ref_cnt,masters[i]->alt_master); |
---|
458 | } |
---|
459 | for (i=0;i<nclients;i++) { |
---|
460 | printf("%i ",clients[i]->len); |
---|
461 | sum += clients[i]->len; |
---|
462 | } |
---|
463 | printf("\nsum len clients %i avg %f\n",sum,(double)sum/nclients); |
---|
464 | #if 0 |
---|
465 | client_gap_compress *client = clients[nclients-1]; |
---|
466 | for (i=0 ; i < client->len; i++){ |
---|
467 | printf(" %i rel %i x %i dx %i y %i\n", |
---|
468 | i,client->l[i].rel,client->l[i].x,client->l[i].dx,client->l[i].y); |
---|
469 | } |
---|
470 | #endif |
---|
471 | } |
---|
472 | } |
---|
473 | |
---|
474 | int master_gap_compress::save(FILE *out, int index_write, int &pos){ |
---|
475 | if (index_write) { |
---|
476 | putw(len,out); |
---|
477 | if (rel_2_abs) { |
---|
478 | pt_align(0,pos,sizeof(long)); |
---|
479 | putc(1,out); |
---|
480 | }else{ |
---|
481 | pt_align(0,pos,sizeof(short)); |
---|
482 | putc(0,out); |
---|
483 | } |
---|
484 | putw(pos,out); |
---|
485 | }else{ |
---|
486 | if (rel_2_abs) { |
---|
487 | pt_align(out,pos,sizeof(long)); |
---|
488 | if (len != fwrite((char *)rel_2_abs,sizeof(long), len, out)){ |
---|
489 | return -1; |
---|
490 | } |
---|
491 | }else{ |
---|
492 | pt_align(out,pos,sizeof(short)); |
---|
493 | if (len != fwrite((char *)rel_2_abss,sizeof(short), len, out)){ |
---|
494 | return -1; |
---|
495 | } |
---|
496 | } |
---|
497 | } |
---|
498 | if (rel_2_abs) { |
---|
499 | pos += len * sizeof(long); |
---|
500 | }else{ |
---|
501 | pos += len * sizeof(short); |
---|
502 | } |
---|
503 | return 0; |
---|
504 | } |
---|
505 | |
---|
506 | int master_gap_compress::load(FILE *in, char *baseaddr){ |
---|
507 | len = getw(in); |
---|
508 | if (getc(in) != 0) { |
---|
509 | rel_2_abs = (long *)(getw(in) + baseaddr); |
---|
510 | }else{ |
---|
511 | rel_2_abss = (short *)(getw(in) + baseaddr); |
---|
512 | } |
---|
513 | return 0; |
---|
514 | } |
---|
515 | |
---|
516 | int client_gap_compress::save(FILE *out, int index_write, int &pos){ |
---|
517 | if (index_write) { |
---|
518 | putw(master->index,out); |
---|
519 | putw(len,out); |
---|
520 | pt_align(0,pos,sizeof(long)); |
---|
521 | putw(pos,out); |
---|
522 | putw((int)abs_seq_len,out); |
---|
523 | putw((int)rel_seq_len,out); |
---|
524 | }else{ |
---|
525 | pt_align(out,pos,sizeof(long)); |
---|
526 | if (len != fwrite((char *)l,sizeof(client_gap_compress_data), len, out)){ |
---|
527 | return -1; |
---|
528 | } |
---|
529 | } |
---|
530 | pos += sizeof(client_gap_compress_data) * len; |
---|
531 | return 0; |
---|
532 | } |
---|
533 | |
---|
534 | int client_gap_compress::load(FILE *in, char *baseaddr,master_gap_compress **masters){ |
---|
535 | int master_index = getw(in); |
---|
536 | master = masters[master_index]; |
---|
537 | len = getw(in); |
---|
538 | l = (client_gap_compress_data *)(getw(in) + baseaddr); |
---|
539 | abs_seq_len = getw(in); |
---|
540 | rel_seq_len = getw(in); |
---|
541 | return 0; |
---|
542 | } |
---|
543 | |
---|
544 | int gap_compress::save(FILE *out, int index_write, int &pos){ |
---|
545 | if (index_write) { |
---|
546 | putw(nmasters,out); |
---|
547 | putw(nclients,out); |
---|
548 | } |
---|
549 | long i; |
---|
550 | for (i=0;i<nmasters;i++){ |
---|
551 | if(masters[i]->save(out,index_write,pos)){ |
---|
552 | return -1; |
---|
553 | } |
---|
554 | masters[i]->index = (int)i; |
---|
555 | } |
---|
556 | for (i=0;i<nclients;i++){ |
---|
557 | if (clients[i]->save(out,index_write,pos)){ |
---|
558 | return -1; |
---|
559 | } |
---|
560 | } |
---|
561 | return 0; |
---|
562 | } |
---|
563 | |
---|
564 | int gap_compress::load(FILE *in, char *baseaddr){ |
---|
565 | delete masters; |
---|
566 | delete clients; |
---|
567 | int i; |
---|
568 | memmasters = nmasters = getw(in); |
---|
569 | memclients = nclients = getw(in); |
---|
570 | clients = (client_gap_compress **)calloc(sizeof(void *),memclients); |
---|
571 | masters = (master_gap_compress **)calloc(sizeof(void *),memmasters); |
---|
572 | |
---|
573 | for (i=0;i<nmasters;i++){ |
---|
574 | masters[i] = (master_gap_compress *)calloc(sizeof(master_gap_compress),1); |
---|
575 | masters[i]->load(in,baseaddr); |
---|
576 | } |
---|
577 | for (i=0;i<nclients;i++){ |
---|
578 | clients[i] = (client_gap_compress *)calloc(sizeof(client_gap_compress),1); |
---|
579 | clients[i]->load(in,baseaddr,masters); |
---|
580 | } |
---|
581 | return 0; |
---|
582 | } |
---|
583 | |
---|
584 | // pt_species_class::set destroys the sequence !!!!!! |
---|
585 | // name and fullname must be a new copy |
---|
586 | void pt_species_class::set(char *sequence, long seq_len, char *name, char *fullname){ |
---|
587 | delete this->fullname; |
---|
588 | this->fullname = fullname; |
---|
589 | delete this->name; |
---|
590 | this->name = name; |
---|
591 | |
---|
592 | delete this->sequence; |
---|
593 | delete this->abs_sequence; |
---|
594 | |
---|
595 | this->abs_sequence = sequence; |
---|
596 | this->sequence = (char *)calloc(sizeof(char), (int)seq_len+1); |
---|
597 | |
---|
598 | |
---|
599 | char c; |
---|
600 | char *src, |
---|
601 | *dest, |
---|
602 | *abs; |
---|
603 | src = sequence; |
---|
604 | dest = this->sequence; |
---|
605 | abs = this->abs_sequence; |
---|
606 | char *first_n = dest; // maximum number of consequtive n's |
---|
607 | char *first_abs_n = abs; |
---|
608 | |
---|
609 | #define SET_NN first_n = dest; first_abs_n = abs |
---|
610 | |
---|
611 | while(c=*(src++)){ |
---|
612 | switch (c) { |
---|
613 | case 'A': |
---|
614 | case 'a': *(dest++) = PT_A;*(abs++) = '+';SET_NN; break; |
---|
615 | case 'C': |
---|
616 | case 'c': *(dest++) = PT_C;*(abs++) = '+';SET_NN; break; |
---|
617 | case 'G': |
---|
618 | case 'g': *(dest++) = PT_G;*(abs++) = '+';SET_NN; break; |
---|
619 | case 'U': |
---|
620 | case 'u': |
---|
621 | case 'T': |
---|
622 | case 't': *(dest++) = PT_T;*(abs++) = '+';SET_NN; break; |
---|
623 | case '.': *(dest)++ = PT_QU;*(abs++) = '+'; |
---|
624 | while (*src =='.') { src++; *(abs++) = '-'; } |
---|
625 | SET_NN; |
---|
626 | break; |
---|
627 | case '-': *(abs++) = '-'; break; |
---|
628 | default: |
---|
629 | if (dest-first_n < PT_MAX_NN) { |
---|
630 | *(dest++) = PT_N;*(abs++) = '+';break; |
---|
631 | }else{ |
---|
632 | while (first_abs_n < abs) *(first_abs_n++) = '-'; |
---|
633 | dest = first_n; |
---|
634 | *(dest++) = PT_QU; |
---|
635 | *(abs++) = '+'; |
---|
636 | } |
---|
637 | break; |
---|
638 | } |
---|
639 | |
---|
640 | } |
---|
641 | *dest = PT_QU; |
---|
642 | *abs = 0; |
---|
643 | if ( abs - this->abs_sequence != seq_len) GB_CORE; |
---|
644 | this->abs_seq_len = seq_len; |
---|
645 | this->seq_len = dest-this->sequence; |
---|
646 | } |
---|
647 | pt_species_class::pt_species_class(void){ |
---|
648 | memset((char *)this,0,sizeof(pt_species_class)); |
---|
649 | } |
---|
650 | pt_species_class::~pt_species_class(void){ |
---|
651 | delete sequence; |
---|
652 | delete abs_sequence; |
---|
653 | delete name; |
---|
654 | delete fullname; |
---|
655 | } |
---|
656 | |
---|
657 | void pt_species_class::calc_gap_index(gap_compress *gc){ |
---|
658 | index = gc->write_sequence(abs_sequence, abs_seq_len, '-', '-'); |
---|
659 | } |
---|
660 | |
---|
661 | int pt_species_class::save(FILE *out, int index_write, int &pos){ |
---|
662 | if (index_write) { |
---|
663 | putw(pos,out); |
---|
664 | putw((int)abs_seq_len,out); |
---|
665 | putw((int)seq_len,out); |
---|
666 | if (pt_putstring(name,out) ){ |
---|
667 | return -1; |
---|
668 | } |
---|
669 | if (pt_putstring(fullname,out) ){ |
---|
670 | return -1; |
---|
671 | } |
---|
672 | putw((int)index,out); |
---|
673 | }else{ |
---|
674 | if (seq_len+1 != fwrite(sequence,sizeof(char), (int)seq_len+1, out)) |
---|
675 | return -1; |
---|
676 | } |
---|
677 | pos += (int)seq_len+1; |
---|
678 | return 0; |
---|
679 | } |
---|
680 | |
---|
681 | |
---|
682 | int pt_species_class::load(FILE *in, char *baseaddr){ |
---|
683 | sequence = getw(in) + baseaddr; |
---|
684 | abs_seq_len = getw(in); |
---|
685 | seq_len = getw(in); |
---|
686 | name = pt_getstring(in); |
---|
687 | fullname = pt_getstring(in); |
---|
688 | index = getw(in); |
---|
689 | return 0; |
---|
690 | } |
---|
691 | |
---|
692 | long pt_species_class::rel_2_abs(long rel){ |
---|
693 | return cgc->rel_2_abs(rel); |
---|
694 | } |
---|
695 | |
---|
696 | void pt_species_class::test_print(void){ |
---|
697 | char *buffer = (char *)calloc(sizeof(char), (int)abs_seq_len + 1); |
---|
698 | memset(buffer,'-',(int)abs_seq_len); |
---|
699 | int i; |
---|
700 | long abs; |
---|
701 | for (i=0; i <seq_len; i++) { |
---|
702 | abs = rel_2_abs(i); |
---|
703 | buffer[abs] = PT_BASE_2_ASCII[this->get(i)]; |
---|
704 | } |
---|
705 | for (i=0;i< abs_seq_len; i+= 80) { |
---|
706 | printf("%i ",i); |
---|
707 | fwrite(buffer+i,1,80,stdout); |
---|
708 | printf("\n"); |
---|
709 | } |
---|
710 | delete buffer; |
---|
711 | } |
---|
712 | |
---|
713 | |
---|
714 | pt_main_class::pt_main_class(void) { |
---|
715 | memset((char *)this,0,sizeof(pt_main_class)); |
---|
716 | } |
---|
717 | |
---|
718 | pt_main_class::~pt_main_class(void) { |
---|
719 | GB_CORE; |
---|
720 | } |
---|
721 | |
---|
722 | void pt_main_class::calc_name_hash(void){ |
---|
723 | long i; |
---|
724 | namehash = GBS_create_hash(PT_NAME_HASH_SIZE); |
---|
725 | for (i=0;i<data_count;i++){ |
---|
726 | GBS_write_hash(namehash, species[i].name,i+1); |
---|
727 | } |
---|
728 | |
---|
729 | } |
---|
730 | |
---|
731 | void pt_main_class::calc_max_size_char_count(void){ |
---|
732 | max_size = 0; |
---|
733 | long i; |
---|
734 | for (i = 0; i < data_count; i++){ /* get max sequence len */ |
---|
735 | max_size = (int)MAX(max_size, species[i].seq_len); |
---|
736 | char_count += species[i].seq_len; |
---|
737 | } |
---|
738 | } |
---|
739 | |
---|
740 | void pt_main_class::calc_bi_ecoli(void){ |
---|
741 | if ( ecoli && strlen(ecoli) ){ |
---|
742 | BI_ecoli_ref *ref = new BI_ecoli_ref; |
---|
743 | char *error = ref->init(ecoli,strlen(ecoli)); |
---|
744 | if (error) { |
---|
745 | delete ecoli; |
---|
746 | ecoli = 0; |
---|
747 | }else{ |
---|
748 | bi_ecoli = (void *)ref; |
---|
749 | } |
---|
750 | } |
---|
751 | } |
---|
752 | void pt_main_class::init(GBDATA *gb_main){ |
---|
753 | GB_transaction dummy(gb_main); |
---|
754 | GBDATA *gb_extended_data = GB_search(gb_main, "extended_data", GB_CREATE_CONTAINER); |
---|
755 | GBDATA *gb_species_data = GB_search(gb_main, "species_data", GB_CREATE_CONTAINER); |
---|
756 | GBDATA *gb_extended; |
---|
757 | GBDATA *gb_species; |
---|
758 | GBDATA *gb_data; |
---|
759 | GBDATA *gb_name; |
---|
760 | alignment_name = GBT_get_default_alignment(gb_main); |
---|
761 | |
---|
762 | gb_extended = GBT_find_extended(gb_extended_data, GBT_get_default_ref(gb_main)); |
---|
763 | delete ecoli; |
---|
764 | ecoli = 0; |
---|
765 | if (gb_extended) { |
---|
766 | gb_data = GBT_read_sequence(gb_extended,alignment_name); |
---|
767 | if (gb_data) { |
---|
768 | ecoli = GB_read_string(gb_data); |
---|
769 | } |
---|
770 | } |
---|
771 | data_count = 0; |
---|
772 | |
---|
773 | for ( gb_species = GBT_first_species_rel_species_data(gb_species_data); |
---|
774 | gb_species; |
---|
775 | gb_species = GBT_next_species(gb_species) ){ |
---|
776 | gb_data = GBT_read_sequence(gb_extended,alignment_name); |
---|
777 | if (!gb_data) continue; |
---|
778 | gb_name = GB_search(gb_species,"name",GB_FIND); |
---|
779 | if (!gb_name) continue; |
---|
780 | data_count ++; |
---|
781 | } |
---|
782 | } |
---|
783 | // pt_main_class::save does more than save: all the conversation is done |
---|
784 | // by this procedure |
---|
785 | |
---|
786 | int pt_main_class::save(GBDATA *gb_main, FILE *out, int index_write, int &pos){ |
---|
787 | |
---|
788 | GB_transaction dummy(gb_main); |
---|
789 | GBDATA *gb_species_data = GB_search(gb_main, "species_data", GB_CREATE_CONTAINER); |
---|
790 | GBDATA *gb_species; |
---|
791 | GBDATA *gb_name; |
---|
792 | GBDATA *gb_fullname; |
---|
793 | GBDATA *gb_data; |
---|
794 | char *name; |
---|
795 | char *fullname; |
---|
796 | pt_species_class *species = new pt_species_class; |
---|
797 | if (index_write){ |
---|
798 | this->init(gb_main); // calculate the number of species |
---|
799 | putw(data_count,out); |
---|
800 | pt_putstring(ecoli,out); |
---|
801 | gc = new gap_compress; |
---|
802 | } |
---|
803 | int count = 0; |
---|
804 | for ( gb_species = GBT_first_species_rel_species_data(gb_species_data); |
---|
805 | gb_species; |
---|
806 | gb_species = GBT_next_species(gb_species) ){ |
---|
807 | gb_data = GBT_read_sequence(gb_species,alignment_name); |
---|
808 | if (!gb_data) continue; |
---|
809 | gb_name = GB_search(gb_species,"name",GB_FIND); |
---|
810 | if (!gb_name) continue; |
---|
811 | name = GB_read_string(gb_name); |
---|
812 | gb_fullname = GB_search(gb_species,"full_name",GB_FIND); |
---|
813 | if (gb_fullname) fullname = GB_read_string(gb_fullname); |
---|
814 | else fullname = strdup(""); |
---|
815 | char *seq = GB_read_string(gb_data);; |
---|
816 | long seq_len = GB_read_string_count(gb_data); |
---|
817 | species->set(seq,seq_len, name,fullname); |
---|
818 | if (index_write) { |
---|
819 | species->calc_gap_index(gc); |
---|
820 | if (count != species->index) GB_CORE; |
---|
821 | } |
---|
822 | if (species->save(out,index_write, pos)){ |
---|
823 | return -1; |
---|
824 | } |
---|
825 | |
---|
826 | count ++; |
---|
827 | if (count % 100 == 0) { |
---|
828 | printf("%i ...\n",count); |
---|
829 | } |
---|
830 | } |
---|
831 | delete species; |
---|
832 | if (index_write) { |
---|
833 | gc->optimize(); |
---|
834 | gc->statistic(); |
---|
835 | } |
---|
836 | if (gc->save(out,index_write,pos)){ |
---|
837 | return -1; |
---|
838 | } |
---|
839 | return 0; |
---|
840 | } |
---|
841 | |
---|
842 | int pt_main_class::load(FILE *in, char *baseaddr){ |
---|
843 | data_count = getw(in); |
---|
844 | ecoli = pt_getstring(in); |
---|
845 | species = (pt_species_class *)calloc(sizeof(pt_species_class),data_count); |
---|
846 | int i; |
---|
847 | for (i=0;i<data_count;i++) { |
---|
848 | species[i].load(in,baseaddr); |
---|
849 | } |
---|
850 | |
---|
851 | gc = new gap_compress; |
---|
852 | if (gc->load(in,baseaddr)){ |
---|
853 | return -1; |
---|
854 | } |
---|
855 | |
---|
856 | for (i=0;i<data_count;i++) { |
---|
857 | species[i].cgc = gc->clients[i]; |
---|
858 | } |
---|
859 | |
---|
860 | calc_name_hash(); |
---|
861 | calc_max_size_char_count(); |
---|
862 | calc_bi_ecoli(); |
---|
863 | |
---|
864 | return 0; |
---|
865 | } |
---|
866 | |
---|
867 | long pt_main_class::abs_2_ecoli(long pos) |
---|
868 | { |
---|
869 | if (!bi_ecoli) return pos; |
---|
870 | long res,dummy; |
---|
871 | ((BI_ecoli_ref *)bi_ecoli)->abs_2_rel(pos,res,dummy); |
---|
872 | return res; |
---|
873 | } |
---|
874 | |
---|
875 | int pt_main_class::main_convert(GBDATA *gb_main, char *basename){ |
---|
876 | FILE *index_file; |
---|
877 | FILE *sequence_file; |
---|
878 | char *index_name = GBS_string_eval(basename,"*=*1.ind",0); |
---|
879 | char *sequence_name = GBS_string_eval(basename,"*=*1.seq",0); |
---|
880 | |
---|
881 | index_file = fopen(index_name,"w"); |
---|
882 | if (!index_file) { |
---|
883 | GB_ERROR("Error Cannot create and write to file %s\n",index_name); |
---|
884 | return -1; |
---|
885 | } |
---|
886 | sequence_file = fopen(sequence_name,"w"); |
---|
887 | if (!sequence_file) { |
---|
888 | GB_ERROR("Error Cannot create and write to file %s\n",sequence_name); |
---|
889 | return -1; |
---|
890 | } |
---|
891 | GB_transaction dummy(gb_main); |
---|
892 | |
---|
893 | int pos = 0; |
---|
894 | if (this->save(gb_main, index_file,1,pos)) { |
---|
895 | GB_ERROR("Index Write Error\n"); return(-1); |
---|
896 | }; |
---|
897 | fclose(index_file); |
---|
898 | pos = 0; |
---|
899 | if (this->save(gb_main, sequence_file,0,pos)) { |
---|
900 | GB_ERROR("Index Write Error\n"); return (-1); |
---|
901 | }; |
---|
902 | fclose(sequence_file); |
---|
903 | printf("Sequence File length: %i\n",pos); |
---|
904 | return 0; |
---|
905 | } |
---|
906 | |
---|
907 | int pt_main_class::import_all(char *basename){ |
---|
908 | FILE *index_file; |
---|
909 | char *index_name = GBS_string_eval(basename,"*=*1.ind",0); |
---|
910 | char *sequence_name = GBS_string_eval(basename,"*=*1.seq",0); |
---|
911 | index_file = fopen(index_name,"r"); |
---|
912 | if (!index_file) { |
---|
913 | fprintf(stderr,"Error Cannot create and write to file %s\n",index_name); |
---|
914 | return -1; |
---|
915 | } |
---|
916 | char *baseaddr = GB_map_file(sequence_name); |
---|
917 | if (!baseaddr) { |
---|
918 | GB_print_error(); |
---|
919 | return -1; |
---|
920 | } |
---|
921 | if (load(index_file, baseaddr) ) { |
---|
922 | return -1; |
---|
923 | } |
---|
924 | return 0; |
---|
925 | } |
---|
926 | |
---|
927 | int |
---|
928 | main(int argc, char **argv) |
---|
929 | { |
---|
930 | char *path; |
---|
931 | GBDATA *gb_main; |
---|
932 | pt_main_class *mc = new pt_main_class; |
---|
933 | char *filename = "out.arb"; |
---|
934 | if (argc < 2){ |
---|
935 | if (mc->import_all(filename)) { |
---|
936 | fprintf(stderr,"Load Error\n"); |
---|
937 | return -1; |
---|
938 | } |
---|
939 | int i; |
---|
940 | for (i=0; i < mc->data_count ; i++ ) { |
---|
941 | mc->species[i].test_print(); |
---|
942 | } |
---|
943 | return 0; |
---|
944 | }else{ |
---|
945 | path = argv[1]; |
---|
946 | } |
---|
947 | |
---|
948 | if (!(gb_main = GB_open(path, "r"))) |
---|
949 | return (-1); |
---|
950 | if (mc->main_convert(gb_main,filename)) { |
---|
951 | fprintf(stderr,"Error\n"); |
---|
952 | return -1; |
---|
953 | } |
---|
954 | GB_close(gb_main); |
---|
955 | return 0; |
---|
956 | } |
---|
957 | |
---|
958 | #if 0 |
---|
959 | char *cmp; |
---|
960 | int i; |
---|
961 | cmp = gc.read_sequence(index); |
---|
962 | for (i=0;i<seq_len;i++) { |
---|
963 | if ( cmp[i] == '-' && ( |
---|
964 | seq[i] == '-' || seq[i] == '.')) continue; |
---|
965 | if ( cmp[i] != '-' && ( |
---|
966 | seq[i] != '-' && seq[i] != '.')) continue; |
---|
967 | break; |
---|
968 | } |
---|
969 | if (i<seq_len) { |
---|
970 | printf("sequences differ %i\n%s\n%s\n",count, seq,cmp); |
---|
971 | } |
---|
972 | |
---|
973 | delete cmp; |
---|
974 | #endif |
---|