| 1 | #include <cstdio> |
|---|
| 2 | #include <cstdlib> |
|---|
| 3 | #include <cstring> |
|---|
| 4 | #include <climits> |
|---|
| 5 | |
|---|
| 6 | #include <PT_server.h> |
|---|
| 7 | #include <PT_server_prototypes.h> |
|---|
| 8 | #include <struct_man.h> |
|---|
| 9 | #include "probe.h" |
|---|
| 10 | #include "probe_tree.hxx" |
|---|
| 11 | #include <arbdbt.h> |
|---|
| 12 | #include <inline.h> |
|---|
| 13 | |
|---|
| 14 | #ifdef P_ |
|---|
| 15 | #error P_ already defined |
|---|
| 16 | #endif |
|---|
| 17 | |
|---|
| 18 | #include "pt_prototypes.h" |
|---|
| 19 | |
|---|
| 20 | // overloaded functions to avoid problems with type-punning: |
|---|
| 21 | inline void aisc_link(dll_public *dll, PT_tprobes *tprobe) { aisc_link(reinterpret_cast<dllpublic_ext*>(dll), reinterpret_cast<dllheader_ext*>(tprobe)); } |
|---|
| 22 | inline void aisc_link(dll_public *dll, PT_probeparts *parts) { aisc_link(reinterpret_cast<dllpublic_ext*>(dll), reinterpret_cast<dllheader_ext*>(parts)); } |
|---|
| 23 | inline void aisc_link(dll_public *dll, PT_probematch *match) { aisc_link(reinterpret_cast<dllpublic_ext*>(dll), reinterpret_cast<dllheader_ext*>(match)); } |
|---|
| 24 | |
|---|
| 25 | extern "C" { |
|---|
| 26 | int pt_init_bond_matrix(PT_pdc *THIS) |
|---|
| 27 | { |
|---|
| 28 | THIS->bond[0].val = 0.0; |
|---|
| 29 | THIS->bond[1].val = 0.0; |
|---|
| 30 | THIS->bond[2].val = 0.5; |
|---|
| 31 | THIS->bond[3].val = 1.1; |
|---|
| 32 | THIS->bond[4].val = 0.0; |
|---|
| 33 | THIS->bond[5].val = 0.0; |
|---|
| 34 | THIS->bond[6].val = 1.5; |
|---|
| 35 | THIS->bond[7].val = 0.0; |
|---|
| 36 | THIS->bond[8].val = 0.5; |
|---|
| 37 | THIS->bond[9].val = 1.5; |
|---|
| 38 | THIS->bond[10].val = 0.4; |
|---|
| 39 | THIS->bond[11].val = 0.9; |
|---|
| 40 | THIS->bond[12].val = 1.1; |
|---|
| 41 | THIS->bond[13].val = 0.0; |
|---|
| 42 | THIS->bond[14].val = 0.9; |
|---|
| 43 | THIS->bond[15].val = 0.0; |
|---|
| 44 | |
|---|
| 45 | return 0; |
|---|
| 46 | } |
|---|
| 47 | } |
|---|
| 48 | struct ptnd_loop_com { |
|---|
| 49 | PT_pdc *pdc; |
|---|
| 50 | PT_local *locs; |
|---|
| 51 | PT_probeparts *parts; |
|---|
| 52 | int mishits; |
|---|
| 53 | int new_match; /* match or design the probe: 1 match 0 design */ |
|---|
| 54 | double sum_bonds; /* sum of bond of longest non mismatch string */ |
|---|
| 55 | double dt; /* sum of mismatches */ |
|---|
| 56 | } ptnd; |
|---|
| 57 | |
|---|
| 58 | |
|---|
| 59 | static int ptnd_compare_quality(const void *PT_tprobes_ptr1, const void *PT_tprobes_ptr2, void *) { |
|---|
| 60 | const PT_tprobes *tprobe1 = (const PT_tprobes*)PT_tprobes_ptr1; |
|---|
| 61 | const PT_tprobes *tprobe2 = (const PT_tprobes*)PT_tprobes_ptr2; |
|---|
| 62 | |
|---|
| 63 | // sort best quality first |
|---|
| 64 | if (tprobe1->quality < tprobe2->quality) return 1; |
|---|
| 65 | if (tprobe1->quality > tprobe2->quality) return -1; |
|---|
| 66 | return 0; |
|---|
| 67 | } |
|---|
| 68 | |
|---|
| 69 | static int ptnd_compare_sequence(const void *PT_tprobes_ptr1, const void *PT_tprobes_ptr2, void*) { |
|---|
| 70 | const PT_tprobes *tprobe1 = (const PT_tprobes*)PT_tprobes_ptr1; |
|---|
| 71 | const PT_tprobes *tprobe2 = (const PT_tprobes*)PT_tprobes_ptr2; |
|---|
| 72 | |
|---|
| 73 | return strcmp(tprobe1->sequence,tprobe2->sequence); |
|---|
| 74 | } |
|---|
| 75 | |
|---|
| 76 | static void ptnd_sort_probes_by(PT_pdc *pdc, int mode) /* mode 0 quality, mode 1 sequence */ |
|---|
| 77 | { |
|---|
| 78 | PT_tprobes **my_list; |
|---|
| 79 | int list_len; |
|---|
| 80 | PT_tprobes *tprobe; |
|---|
| 81 | int i; |
|---|
| 82 | |
|---|
| 83 | if (!pdc->tprobes) return; |
|---|
| 84 | list_len = get_TPROBE_CNT(pdc->tprobes); |
|---|
| 85 | if (list_len <= 1) return; |
|---|
| 86 | my_list = (PT_tprobes **)calloc(sizeof(void *),list_len); |
|---|
| 87 | for (i=0, tprobe = pdc->tprobes; |
|---|
| 88 | tprobe; |
|---|
| 89 | i++,tprobe=tprobe->next) |
|---|
| 90 | { |
|---|
| 91 | my_list[i] = tprobe; |
|---|
| 92 | } |
|---|
| 93 | switch (mode) { |
|---|
| 94 | case 0: |
|---|
| 95 | GB_sort((void **)my_list, 0, list_len, ptnd_compare_quality, 0); |
|---|
| 96 | break; |
|---|
| 97 | case 1: |
|---|
| 98 | GB_sort((void **)my_list, 0, list_len, ptnd_compare_sequence, 0); |
|---|
| 99 | break; |
|---|
| 100 | } |
|---|
| 101 | |
|---|
| 102 | for (i=0;i<list_len;i++) { |
|---|
| 103 | aisc_unlink(reinterpret_cast<dllheader_ext*>(my_list[i])); |
|---|
| 104 | aisc_link(&pdc->ptprobes, my_list[i]); |
|---|
| 105 | } |
|---|
| 106 | free((char *)my_list); |
|---|
| 107 | } |
|---|
| 108 | static void ptnd_probe_delete_all_but(PT_pdc *pdc, int count) |
|---|
| 109 | { |
|---|
| 110 | PT_tprobes *tprobe; |
|---|
| 111 | int i; |
|---|
| 112 | |
|---|
| 113 | for (i=0,tprobe = pdc->tprobes; |
|---|
| 114 | tprobe && i< count; |
|---|
| 115 | i++,tprobe = tprobe->next ) ; |
|---|
| 116 | |
|---|
| 117 | if (tprobe) { |
|---|
| 118 | while(tprobe->next) { |
|---|
| 119 | destroy_PT_tprobes( tprobe->next ); |
|---|
| 120 | } |
|---|
| 121 | } |
|---|
| 122 | } |
|---|
| 123 | static int pt_get_gc_content(char *probe) |
|---|
| 124 | { |
|---|
| 125 | int gc = 0; |
|---|
| 126 | while (*probe){ |
|---|
| 127 | if ( *probe == PT_G || |
|---|
| 128 | *probe == PT_C) { |
|---|
| 129 | gc++; |
|---|
| 130 | } |
|---|
| 131 | probe ++; |
|---|
| 132 | } |
|---|
| 133 | return gc; |
|---|
| 134 | } |
|---|
| 135 | static double pt_get_temperature(const char *probe) |
|---|
| 136 | { |
|---|
| 137 | int i; |
|---|
| 138 | double t = 0; |
|---|
| 139 | while (( i=*(probe++) )) { |
|---|
| 140 | if (i == PT_G || i == PT_C) t+=4.0;else t+= 2.0; |
|---|
| 141 | } |
|---|
| 142 | return t; |
|---|
| 143 | } |
|---|
| 144 | |
|---|
| 145 | #if 0 |
|---|
| 146 | int ptnd_check_pure(char *probe) |
|---|
| 147 | { |
|---|
| 148 | int i; |
|---|
| 149 | while (( i=*(probe++) )) { |
|---|
| 150 | if ( i < PT_A || i > PT_T) return 1; |
|---|
| 151 | } |
|---|
| 152 | return 0; |
|---|
| 153 | } |
|---|
| 154 | #endif |
|---|
| 155 | |
|---|
| 156 | static void ptnd_calc_quality(PT_pdc *pdc) { |
|---|
| 157 | PT_tprobes *tprobe; |
|---|
| 158 | int i; |
|---|
| 159 | |
|---|
| 160 | for (tprobe = pdc->tprobes; |
|---|
| 161 | tprobe; |
|---|
| 162 | tprobe = tprobe->next) |
|---|
| 163 | { |
|---|
| 164 | for (i=0; i< PERC_SIZE-1; i++) { |
|---|
| 165 | if (tprobe->perc[i] > tprobe->mishit) break; |
|---|
| 166 | } |
|---|
| 167 | tprobe->quality = ((double)tprobe->groupsize * i) + 1000.0/(1000.0 + tprobe->perc[i]) ; |
|---|
| 168 | } |
|---|
| 169 | } |
|---|
| 170 | /*********************************************************************** |
|---|
| 171 | check the bond val for a probe |
|---|
| 172 | ***********************************************************************/ |
|---|
| 173 | static double ptnd_check_max_bond( PT_pdc *pdc, char base) |
|---|
| 174 | { |
|---|
| 175 | int complement = PT_complement(base); |
|---|
| 176 | return pdc->bond[ (complement-(int)PT_A)*4 + base-(int)PT_A].val; |
|---|
| 177 | } |
|---|
| 178 | |
|---|
| 179 | double ptnd_check_split(PT_pdc *pdc, char *probe, int pos, char ref) { |
|---|
| 180 | int base = probe[pos]; |
|---|
| 181 | int complement = PT_complement(base); |
|---|
| 182 | double max_bind = pdc->bond[ (complement-(int)PT_A)*4 + base-(int)PT_A].val; |
|---|
| 183 | double new_bind = pdc->bond[ (complement-(int)PT_A)*4 + ref-(int)PT_A].val; |
|---|
| 184 | |
|---|
| 185 | if ( new_bind < pdc->split) |
|---|
| 186 | return new_bind-max_bind; /* negative values indicate split */ |
|---|
| 187 | /* this mismatch splits the probe in several domains */ |
|---|
| 188 | return (max_bind - new_bind); |
|---|
| 189 | } |
|---|
| 190 | |
|---|
| 191 | |
|---|
| 192 | /*********************************************************************** |
|---|
| 193 | primary search for probes |
|---|
| 194 | ***********************************************************************/ |
|---|
| 195 | /* count all mishits for a probe */ |
|---|
| 196 | |
|---|
| 197 | struct ptnd_chain_count_mishits { |
|---|
| 198 | int operator()(int name, int apos, int rpos) { |
|---|
| 199 | char *probe = psg.probe; |
|---|
| 200 | psg.abs_pos.announce(apos); |
|---|
| 201 | if (psg.data[name].is_group) return 0; /* dont count group or neverminds */ |
|---|
| 202 | if (probe) { |
|---|
| 203 | rpos+=psg.height; |
|---|
| 204 | while (*probe && psg.data[name].data[rpos]) { |
|---|
| 205 | if (psg.data[name].data[rpos] != *(probe)) |
|---|
| 206 | return 0; |
|---|
| 207 | probe++; rpos++; |
|---|
| 208 | } |
|---|
| 209 | } |
|---|
| 210 | ptnd.mishits++; |
|---|
| 211 | return 0; |
|---|
| 212 | } |
|---|
| 213 | }; |
|---|
| 214 | |
|---|
| 215 | /* go down the tree to chains and leafs; count the species that are in the non member group */ |
|---|
| 216 | static int ptnd_count_mishits2(POS_TREE *pt) |
|---|
| 217 | { |
|---|
| 218 | int base; |
|---|
| 219 | int name; |
|---|
| 220 | int mishits = 0; |
|---|
| 221 | |
|---|
| 222 | if (pt == NULL) |
|---|
| 223 | return 0; |
|---|
| 224 | if (PT_read_type(pt) == PT_NT_LEAF) { |
|---|
| 225 | name = PT_read_name(psg.ptmain,pt); |
|---|
| 226 | int apos = PT_read_apos(psg.ptmain,pt); |
|---|
| 227 | psg.abs_pos.announce(apos); |
|---|
| 228 | if (!psg.data[name].is_group) return 1; |
|---|
| 229 | return 0; |
|---|
| 230 | }else if (PT_read_type(pt) == PT_NT_CHAIN) { |
|---|
| 231 | psg.probe = 0; |
|---|
| 232 | ptnd.mishits = 0; |
|---|
| 233 | PT_read_chain(psg.ptmain,pt, ptnd_chain_count_mishits()); |
|---|
| 234 | return ptnd.mishits; |
|---|
| 235 | }else{ |
|---|
| 236 | for (base = PT_QU; base< PT_B_MAX; base++) { |
|---|
| 237 | mishits += ptnd_count_mishits2(PT_read_son(psg.ptmain,pt,(PT_BASES)base)); |
|---|
| 238 | } |
|---|
| 239 | return mishits; |
|---|
| 240 | } |
|---|
| 241 | } |
|---|
| 242 | extern "C" char *get_design_info(PT_tprobes *tprobe) |
|---|
| 243 | { |
|---|
| 244 | char *buffer = (char *)GB_give_buffer(2000); |
|---|
| 245 | char *probe = (char *)GB_give_buffer2(tprobe->seq_len + 10); |
|---|
| 246 | PT_pdc *pdc = (PT_pdc *)tprobe->mh.parent->parent; |
|---|
| 247 | char *p; |
|---|
| 248 | int i; |
|---|
| 249 | int sum; |
|---|
| 250 | |
|---|
| 251 | p = buffer; |
|---|
| 252 | |
|---|
| 253 | // target |
|---|
| 254 | strcpy(probe,tprobe->sequence); |
|---|
| 255 | PT_base_2_string(probe,0); /* convert probe to real ASCII */ |
|---|
| 256 | sprintf(p, "%-*s", pdc->probelen+1, probe); |
|---|
| 257 | p += strlen(p); |
|---|
| 258 | |
|---|
| 259 | int apos = tprobe->apos; |
|---|
| 260 | int c = 0; |
|---|
| 261 | int cs = '='; |
|---|
| 262 | { // search nearest hit |
|---|
| 263 | for (c=0;c<ALPHA_SIZE;c++) { |
|---|
| 264 | if (!pdc->pos_groups[c]) { // new group |
|---|
| 265 | pdc->pos_groups[c] = tprobe->apos; |
|---|
| 266 | break; |
|---|
| 267 | } |
|---|
| 268 | int dist = tprobe->apos - pdc->pos_groups[c]; |
|---|
| 269 | if (dist<0) dist = -dist; |
|---|
| 270 | if ( dist < pdc->probelen) { |
|---|
| 271 | apos = tprobe->apos - pdc->pos_groups[c]; |
|---|
| 272 | if (apos > 0) { |
|---|
| 273 | cs = '+'; |
|---|
| 274 | }else{ |
|---|
| 275 | apos = -apos; cs = '-'; |
|---|
| 276 | } |
|---|
| 277 | break; |
|---|
| 278 | } |
|---|
| 279 | } |
|---|
| 280 | } |
|---|
| 281 | |
|---|
| 282 | // le apos ecol |
|---|
| 283 | sprintf(p,"%2i %c%c%6i %4li ", tprobe->seq_len, c+'A', cs, apos, PT_abs_2_rel(tprobe->apos)); |
|---|
| 284 | p += strlen(p); |
|---|
| 285 | |
|---|
| 286 | // grps |
|---|
| 287 | sprintf(p,"%4i ",tprobe->groupsize); |
|---|
| 288 | p += strlen(p); |
|---|
| 289 | |
|---|
| 290 | // G+C |
|---|
| 291 | sprintf(p,"%-4.1f ",((double)pt_get_gc_content(tprobe->sequence))/tprobe->seq_len*100.0); |
|---|
| 292 | p += strlen(p); |
|---|
| 293 | |
|---|
| 294 | // 4GC+2AT |
|---|
| 295 | sprintf(p,"%-7.1f ", pt_get_temperature(tprobe->sequence)); |
|---|
| 296 | p += strlen(p); |
|---|
| 297 | |
|---|
| 298 | // probe string |
|---|
| 299 | probe = reverse_probe(tprobe->sequence,0); |
|---|
| 300 | complement_probe(probe,0); |
|---|
| 301 | PT_base_2_string(probe,0); /* convert probe to real ASCII */ |
|---|
| 302 | sprintf(p,"%-*s |", pdc->probelen, probe); |
|---|
| 303 | free(probe); |
|---|
| 304 | p += strlen(p); |
|---|
| 305 | |
|---|
| 306 | // non-group hits by temp. decrease |
|---|
| 307 | for (sum=i=0;i<PERC_SIZE;i++) { |
|---|
| 308 | sum+= tprobe->perc[i]; |
|---|
| 309 | sprintf(p,"%3i,",sum); |
|---|
| 310 | p += strlen(p); |
|---|
| 311 | } |
|---|
| 312 | |
|---|
| 313 | return buffer; |
|---|
| 314 | } |
|---|
| 315 | |
|---|
| 316 | |
|---|
| 317 | extern "C" char *get_design_hinfo(PT_tprobes *tprobe) { |
|---|
| 318 | char *buffer = (char *)GB_give_buffer(2000); |
|---|
| 319 | char *s = buffer; |
|---|
| 320 | PT_pdc *pdc; |
|---|
| 321 | |
|---|
| 322 | if (!tprobe) { |
|---|
| 323 | return (char*)"Sorry, there are no probes for your selection !!!"; |
|---|
| 324 | } |
|---|
| 325 | pdc = (PT_pdc *)tprobe->mh.parent->parent; |
|---|
| 326 | sprintf(buffer, |
|---|
| 327 | "Probe design Parameters:\n" |
|---|
| 328 | "Length of probe %4i\n" |
|---|
| 329 | "Temperature [%4.1f -%4.1f]\n" |
|---|
| 330 | "GC-Content [%4.1f -%4.1f]\n" |
|---|
| 331 | "E.Coli Position [%4i -%4i]\n" |
|---|
| 332 | "Max Non Group Hits %4i\n" |
|---|
| 333 | "Min Group Hits %4.0f%%\n", |
|---|
| 334 | pdc->probelen, |
|---|
| 335 | pdc->mintemp,pdc->maxtemp, |
|---|
| 336 | pdc->min_gc*100.0,pdc->max_gc*100.0, |
|---|
| 337 | pdc->minpos, pdc->maxpos, |
|---|
| 338 | pdc->mishit, pdc->mintarget*100.0); |
|---|
| 339 | |
|---|
| 340 | s += strlen(s); |
|---|
| 341 | |
|---|
| 342 | sprintf(s,"%-*s", pdc->probelen+1, "Target"); |
|---|
| 343 | s += strlen(s); |
|---|
| 344 | |
|---|
| 345 | sprintf(s,"%2s %8s %4s ","le","apos","ecol"); |
|---|
| 346 | s += strlen(s); |
|---|
| 347 | |
|---|
| 348 | sprintf(s,"%4s ","grps"); // groupsize |
|---|
| 349 | s += strlen(s); |
|---|
| 350 | |
|---|
| 351 | sprintf(s,"%-4s %-7s %-*s | "," G+C","4GC+2AT", pdc->probelen, "Probe sequence"); |
|---|
| 352 | s += strlen(s); |
|---|
| 353 | |
|---|
| 354 | sprintf(s, "Decrease T by n*.3C -> probe matches n non group species"); |
|---|
| 355 | // s += strlen(s); |
|---|
| 356 | |
|---|
| 357 | return buffer; |
|---|
| 358 | } |
|---|
| 359 | |
|---|
| 360 | /* search down the tree to find matching species for the given probe */ |
|---|
| 361 | static int ptnd_count_mishits(char *probe, POS_TREE *pt,int height) |
|---|
| 362 | { |
|---|
| 363 | int name; |
|---|
| 364 | int i; |
|---|
| 365 | POS_TREE *pthelp; |
|---|
| 366 | int pos; |
|---|
| 367 | int mishits; |
|---|
| 368 | |
|---|
| 369 | if (!pt) return 0; |
|---|
| 370 | mishits = 0; |
|---|
| 371 | if (PT_read_type(pt) == PT_NT_NODE && *probe) { |
|---|
| 372 | for (i=PT_A; i<PT_B_MAX; i++) { |
|---|
| 373 | if (i!= *probe) continue; |
|---|
| 374 | if (( pthelp = PT_read_son(psg.ptmain,pt, (PT_BASES)i) )) |
|---|
| 375 | mishits += ptnd_count_mishits(probe+1,pthelp, height+1); |
|---|
| 376 | } |
|---|
| 377 | return mishits; |
|---|
| 378 | } |
|---|
| 379 | if (*probe) { |
|---|
| 380 | if (PT_read_type(pt) == PT_NT_LEAF) { |
|---|
| 381 | pos = PT_read_rpos(psg.ptmain,pt) + height; |
|---|
| 382 | name = PT_read_name(psg.ptmain,pt); |
|---|
| 383 | if (pos + (int)(strlen(probe)) >= psg.data[name].size) /* after end */ |
|---|
| 384 | return 0; |
|---|
| 385 | |
|---|
| 386 | while (*probe) { |
|---|
| 387 | if (psg.data[name].data[pos++] != *(probe++)) |
|---|
| 388 | return 0; |
|---|
| 389 | } |
|---|
| 390 | } else { /* chain */ |
|---|
| 391 | psg.probe = probe; |
|---|
| 392 | psg.height = height; |
|---|
| 393 | ptnd.mishits = 0; |
|---|
| 394 | PT_read_chain(psg.ptmain,pt, ptnd_chain_count_mishits()); |
|---|
| 395 | return ptnd.mishits; |
|---|
| 396 | } |
|---|
| 397 | } |
|---|
| 398 | return ptnd_count_mishits2(pt); |
|---|
| 399 | } |
|---|
| 400 | |
|---|
| 401 | static void ptnd_first_check(PT_pdc *pdc) /* checks the direkt mishits */ |
|---|
| 402 | { |
|---|
| 403 | PT_tprobes *tprobe, *tprobe_next; |
|---|
| 404 | for ( tprobe = pdc->tprobes; |
|---|
| 405 | tprobe; |
|---|
| 406 | tprobe = tprobe_next ) { |
|---|
| 407 | tprobe_next = tprobe->next; |
|---|
| 408 | psg.abs_pos.clear(); |
|---|
| 409 | tprobe->mishit = ptnd_count_mishits(tprobe->sequence,psg.pt,0); |
|---|
| 410 | tprobe->apos = psg.abs_pos.get_most_used(); |
|---|
| 411 | if (tprobe->mishit > pdc->mishit) { |
|---|
| 412 | destroy_PT_tprobes(tprobe); |
|---|
| 413 | } |
|---|
| 414 | } |
|---|
| 415 | psg.abs_pos.clear(); |
|---|
| 416 | } |
|---|
| 417 | /*********************************************************************** |
|---|
| 418 | Check the probes Position |
|---|
| 419 | ***********************************************************************/ |
|---|
| 420 | |
|---|
| 421 | static void ptnd_check_position(PT_pdc *pdc) /* checks the direkt mishits */ |
|---|
| 422 | { |
|---|
| 423 | PT_tprobes *tprobe, *tprobe_next; |
|---|
| 424 | if ( pdc->minpos == pdc->maxpos) return; |
|---|
| 425 | |
|---|
| 426 | for ( tprobe = pdc->tprobes; |
|---|
| 427 | tprobe; |
|---|
| 428 | tprobe = tprobe_next ) { |
|---|
| 429 | tprobe_next = tprobe->next; |
|---|
| 430 | long relpos = PT_abs_2_rel(tprobe->apos); |
|---|
| 431 | if (relpos < pdc->minpos || relpos > pdc->maxpos) { |
|---|
| 432 | destroy_PT_tprobes(tprobe); |
|---|
| 433 | } |
|---|
| 434 | } |
|---|
| 435 | } |
|---|
| 436 | /*********************************************************************** |
|---|
| 437 | check the average bond size |
|---|
| 438 | ***********************************************************************/ |
|---|
| 439 | static void ptnd_check_bonds(PT_pdc *pdc, int match) /* checks probe hairpin bonds */ |
|---|
| 440 | { |
|---|
| 441 | PT_tprobes *tprobe, *tprobe_next; |
|---|
| 442 | int i; |
|---|
| 443 | double sbond; |
|---|
| 444 | for ( tprobe = pdc->tprobes; |
|---|
| 445 | tprobe; |
|---|
| 446 | tprobe = tprobe_next ) { |
|---|
| 447 | tprobe_next = tprobe->next; |
|---|
| 448 | tprobe->seq_len = strlen(tprobe->sequence); |
|---|
| 449 | sbond = 0.0; |
|---|
| 450 | for (i=0;i<tprobe->seq_len;i++) { |
|---|
| 451 | sbond += ptnd_check_max_bond(pdc,tprobe->sequence[i]); |
|---|
| 452 | } |
|---|
| 453 | tprobe->sum_bonds = sbond; |
|---|
| 454 | } |
|---|
| 455 | match = match; |
|---|
| 456 | } |
|---|
| 457 | /*********************************************************************** |
|---|
| 458 | split the probes in probeparts |
|---|
| 459 | ***********************************************************************/ |
|---|
| 460 | static void ptnd_cp_tprobe_2_probepart(PT_pdc *pdc) |
|---|
| 461 | { |
|---|
| 462 | PT_tprobes *tprobe; |
|---|
| 463 | PT_probeparts *parts; |
|---|
| 464 | int pos; |
|---|
| 465 | int probelen; |
|---|
| 466 | |
|---|
| 467 | while (pdc->parts) destroy_PT_probeparts(pdc->parts); |
|---|
| 468 | for ( tprobe = pdc->tprobes; |
|---|
| 469 | tprobe; |
|---|
| 470 | tprobe = tprobe->next ) { |
|---|
| 471 | probelen = strlen(tprobe->sequence); |
|---|
| 472 | probelen -= DOMAIN_MIN_LENGTH; |
|---|
| 473 | for (pos = 0; pos < probelen; pos ++ ) { |
|---|
| 474 | parts = create_PT_probeparts(); |
|---|
| 475 | parts->sequence = strdup(tprobe->sequence+pos); |
|---|
| 476 | parts->source = tprobe; |
|---|
| 477 | parts->start = pos; |
|---|
| 478 | aisc_link(&pdc->pparts, parts); |
|---|
| 479 | } |
|---|
| 480 | } |
|---|
| 481 | } |
|---|
| 482 | static void ptnd_duplikate_probepart_rek(PT_pdc *pdc, char *insequence, int deep, double dt,double sum_bonds, PT_probeparts *parts) |
|---|
| 483 | { |
|---|
| 484 | PT_probeparts *newparts; |
|---|
| 485 | int base; |
|---|
| 486 | int i; |
|---|
| 487 | double max_bind; |
|---|
| 488 | double split; |
|---|
| 489 | double ndt,nsum_bonds; |
|---|
| 490 | char *sequence; |
|---|
| 491 | |
|---|
| 492 | sequence = strdup(insequence); |
|---|
| 493 | if (deep >= PT_PART_DEEP) { /* now do it */ |
|---|
| 494 | newparts = create_PT_probeparts(); |
|---|
| 495 | newparts->sequence = sequence; |
|---|
| 496 | newparts->source = parts->source; |
|---|
| 497 | newparts->dt = dt; |
|---|
| 498 | newparts->start = parts->start; |
|---|
| 499 | newparts->sum_bonds = sum_bonds; |
|---|
| 500 | aisc_link(&pdc->pdparts, newparts); |
|---|
| 501 | return; |
|---|
| 502 | } |
|---|
| 503 | base = sequence[deep]; |
|---|
| 504 | max_bind = ptnd_check_max_bond(pdc, base); |
|---|
| 505 | for (i = PT_A; i <=PT_T; i++) { |
|---|
| 506 | sequence[deep] = i; |
|---|
| 507 | if ( (split = ptnd_check_split(pdc, insequence, deep, i)) < 0.0) continue; |
|---|
| 508 | /* this mismatch splits the probe in several domains */ |
|---|
| 509 | ndt = split + dt; |
|---|
| 510 | nsum_bonds = sum_bonds+max_bind-split; |
|---|
| 511 | ptnd_duplikate_probepart_rek(pdc,sequence,deep+1,ndt,nsum_bonds,parts); |
|---|
| 512 | } |
|---|
| 513 | free(sequence); |
|---|
| 514 | } |
|---|
| 515 | static void ptnd_duplikate_probepart(PT_pdc *pdc) |
|---|
| 516 | { |
|---|
| 517 | PT_probeparts *parts; |
|---|
| 518 | for (parts = pdc->parts; parts; parts = parts->next) |
|---|
| 519 | ptnd_duplikate_probepart_rek(pdc,parts->sequence,0,parts->dt,0.0, parts); |
|---|
| 520 | |
|---|
| 521 | while (( parts = pdc->parts )) |
|---|
| 522 | destroy_PT_probeparts(parts); /* delete the source */ |
|---|
| 523 | |
|---|
| 524 | while (( parts = pdc->dparts )) |
|---|
| 525 | { |
|---|
| 526 | aisc_unlink((struct_dllheader_ext*)parts); |
|---|
| 527 | aisc_link(&pdc->pparts, parts); |
|---|
| 528 | } |
|---|
| 529 | } |
|---|
| 530 | /*********************************************************************** |
|---|
| 531 | sort the parts and check for duplicated parts |
|---|
| 532 | ***********************************************************************/ |
|---|
| 533 | |
|---|
| 534 | static int ptnd_compare_parts(const void *PT_probeparts_ptr1, const void *PT_probeparts_ptr2, void*) { |
|---|
| 535 | const PT_probeparts *tprobe1 = (const PT_probeparts*)PT_probeparts_ptr1; |
|---|
| 536 | const PT_probeparts *tprobe2 = (const PT_probeparts*)PT_probeparts_ptr2; |
|---|
| 537 | |
|---|
| 538 | return strcmp(tprobe1->sequence,tprobe2->sequence); |
|---|
| 539 | } |
|---|
| 540 | |
|---|
| 541 | static void ptnd_sort_parts(PT_pdc *pdc) |
|---|
| 542 | { |
|---|
| 543 | PT_probeparts **my_list; |
|---|
| 544 | int list_len; |
|---|
| 545 | PT_probeparts *tprobe; |
|---|
| 546 | int i; |
|---|
| 547 | |
|---|
| 548 | if (!pdc->parts) return; |
|---|
| 549 | list_len = get_TPROBEPARTS_CNT(pdc->parts); |
|---|
| 550 | if (list_len <= 1) return; |
|---|
| 551 | my_list = (PT_probeparts **)calloc(sizeof(void *),list_len); |
|---|
| 552 | for ( i=0, tprobe = pdc->parts; |
|---|
| 553 | tprobe; |
|---|
| 554 | i++,tprobe=tprobe->next ) { |
|---|
| 555 | my_list[i] = tprobe; |
|---|
| 556 | } |
|---|
| 557 | GB_sort((void **)my_list, 0, list_len, ptnd_compare_parts, 0); |
|---|
| 558 | |
|---|
| 559 | for (i=0;i<list_len;i++) { |
|---|
| 560 | aisc_unlink((struct_dllheader_ext*)my_list[i]); |
|---|
| 561 | aisc_link(&pdc->pparts, my_list[i]); |
|---|
| 562 | } |
|---|
| 563 | free((char *)my_list); |
|---|
| 564 | } |
|---|
| 565 | static void ptnd_remove_duplikated_probepart(PT_pdc *pdc) |
|---|
| 566 | { |
|---|
| 567 | PT_probeparts *parts, *parts_next; |
|---|
| 568 | for ( parts = pdc->parts; |
|---|
| 569 | parts; |
|---|
| 570 | parts = parts_next) { |
|---|
| 571 | parts_next = parts->next; |
|---|
| 572 | if (parts_next) { |
|---|
| 573 | if ( (parts->source== parts_next->source) && !strcmp(parts->sequence, parts_next->sequence)) { /* equal sequence */ |
|---|
| 574 | if (parts->dt < parts_next->dt) { /* delete higher dt */ |
|---|
| 575 | destroy_PT_probeparts(parts_next); |
|---|
| 576 | parts_next = parts; |
|---|
| 577 | }else{ |
|---|
| 578 | destroy_PT_probeparts(parts); |
|---|
| 579 | } |
|---|
| 580 | } |
|---|
| 581 | } |
|---|
| 582 | } |
|---|
| 583 | } |
|---|
| 584 | /*********************************************************************** |
|---|
| 585 | test the probe parts, search the longest non mismatch string |
|---|
| 586 | ***********************************************************************/ |
|---|
| 587 | static void ptnd_check_part_inc_dt(PT_pdc *pdc, PT_probeparts *parts, |
|---|
| 588 | int name, int apos, int rpos, |
|---|
| 589 | double dt,double sum_bonds) |
|---|
| 590 | { |
|---|
| 591 | PT_tprobes *tprobe = parts->source; |
|---|
| 592 | double ndt; |
|---|
| 593 | int pos; |
|---|
| 594 | int start; |
|---|
| 595 | double h; |
|---|
| 596 | char *probe; |
|---|
| 597 | int split; |
|---|
| 598 | |
|---|
| 599 | if (( start = parts->start )) { /* look backwards */ |
|---|
| 600 | probe = parts->source->sequence; |
|---|
| 601 | pos = rpos-1; |
|---|
| 602 | start--; /* test the base left of start */ |
|---|
| 603 | split = 0; |
|---|
| 604 | while (start>=0) { |
|---|
| 605 | if (pos<0) break; /* out of sight */ |
|---|
| 606 | h = ptnd_check_split(ptnd.pdc, probe, start,psg.data[name].data[pos]); |
|---|
| 607 | if (h>0.0 && !split) return; /* there is a longer part matching this */ |
|---|
| 608 | dt -= h; |
|---|
| 609 | start --; |
|---|
| 610 | pos --; |
|---|
| 611 | split = 1; |
|---|
| 612 | } |
|---|
| 613 | } |
|---|
| 614 | ndt = (dt * pdc->dt + (tprobe->sum_bonds - sum_bonds)*pdc->dte ) / tprobe->seq_len; |
|---|
| 615 | pos = (int)ndt; |
|---|
| 616 | |
|---|
| 617 | pt_assert(pos >= 0); |
|---|
| 618 | |
|---|
| 619 | if (pos >= PERC_SIZE) return; /* out of observation */ |
|---|
| 620 | tprobe->perc[pos] ++; |
|---|
| 621 | if (ptnd.new_match) { /* save the result in probematch */ |
|---|
| 622 | PT_probematch *match; |
|---|
| 623 | if (psg.data[name].match) { |
|---|
| 624 | if (psg.data[name].match->dt < ndt) return; |
|---|
| 625 | /* there is a better hit for that sequence */ |
|---|
| 626 | match = psg.data[name].match; |
|---|
| 627 | }else{ |
|---|
| 628 | match = create_PT_probematch(); |
|---|
| 629 | aisc_link(&ptnd.locs->ppm, match); |
|---|
| 630 | psg.data[name].match = match; |
|---|
| 631 | } |
|---|
| 632 | match->name = name; |
|---|
| 633 | match->b_pos = apos - parts->start; /* thats not correct !!! */ |
|---|
| 634 | match->rpos = rpos-parts->start; |
|---|
| 635 | match->N_mismatches = -1; /* there are no mismatches in this mode */ |
|---|
| 636 | match->mismatches = -1; |
|---|
| 637 | match->wmismatches = dt; /* only weighted mismatches (maybe) */ |
|---|
| 638 | match->dt = ndt; |
|---|
| 639 | match->sequence = psg.main_probe; |
|---|
| 640 | match->reversed = psg.reversed; |
|---|
| 641 | } |
|---|
| 642 | } |
|---|
| 643 | static int ptnd_check_inc_mode(PT_pdc *pdc ,PT_probeparts *parts,double dt,double sum_bonds) |
|---|
| 644 | { |
|---|
| 645 | PT_tprobes *tprobe = parts->source; |
|---|
| 646 | double ndt = (dt * pdc->dt + (tprobe->sum_bonds - sum_bonds)*pdc->dte ) / tprobe->seq_len; |
|---|
| 647 | int pos = (int)ndt; |
|---|
| 648 | |
|---|
| 649 | pt_assert(pos >= 0); |
|---|
| 650 | |
|---|
| 651 | if (pos >= PERC_SIZE) return 1; /* out of observation */ |
|---|
| 652 | return 0; |
|---|
| 653 | } |
|---|
| 654 | |
|---|
| 655 | struct ptnd_chain_check_part { |
|---|
| 656 | ptnd_chain_check_part(int s) : split(s) {} |
|---|
| 657 | int split; |
|---|
| 658 | int operator() (int name, int apos, int rpos) |
|---|
| 659 | { |
|---|
| 660 | char *probe = psg.probe; |
|---|
| 661 | int height = psg.height; |
|---|
| 662 | double sbond = ptnd.sum_bonds; |
|---|
| 663 | double dt = ptnd.dt; |
|---|
| 664 | double h = 1.0; |
|---|
| 665 | int pos; |
|---|
| 666 | int base; |
|---|
| 667 | |
|---|
| 668 | if (!ptnd.new_match && psg.data[name].is_group) return 0; /* dont count group or neverminds */ |
|---|
| 669 | if (probe) { |
|---|
| 670 | pos = rpos+psg.height; |
|---|
| 671 | while (probe[height] && (base = psg.data[name].data[pos])) { |
|---|
| 672 | if (!split && (h = ptnd_check_split(ptnd.pdc, probe, height, |
|---|
| 673 | base) < 0.0) ) { |
|---|
| 674 | dt -= h; |
|---|
| 675 | split = 1; |
|---|
| 676 | }else{ |
|---|
| 677 | dt += h; |
|---|
| 678 | sbond += ptnd_check_max_bond(ptnd.pdc,probe[height]) - h; |
|---|
| 679 | } |
|---|
| 680 | height++; pos++; |
|---|
| 681 | } |
|---|
| 682 | } |
|---|
| 683 | ptnd_check_part_inc_dt(ptnd.pdc,ptnd.parts,name,apos,rpos,dt,sbond); |
|---|
| 684 | return 0; |
|---|
| 685 | } |
|---|
| 686 | }; |
|---|
| 687 | |
|---|
| 688 | /* go down the tree to chains and leafs; check all */ |
|---|
| 689 | static void ptnd_check_part_all(POS_TREE *pt, double dt, double sum_bonds) |
|---|
| 690 | { |
|---|
| 691 | int base; |
|---|
| 692 | int name, apos, rpos; |
|---|
| 693 | |
|---|
| 694 | if (pt == NULL) |
|---|
| 695 | return; |
|---|
| 696 | if (PT_read_type(pt) == PT_NT_LEAF) { |
|---|
| 697 | name = PT_read_name(psg.ptmain,pt); |
|---|
| 698 | if (!ptnd.new_match && psg.data[name].is_group) return; |
|---|
| 699 | rpos = PT_read_rpos(psg.ptmain,pt); |
|---|
| 700 | apos = PT_read_apos(psg.ptmain,pt); |
|---|
| 701 | ptnd_check_part_inc_dt( ptnd.pdc, ptnd.parts, |
|---|
| 702 | name,apos,rpos, dt, sum_bonds); |
|---|
| 703 | }else if (PT_read_type(pt) == PT_NT_CHAIN) { |
|---|
| 704 | psg.probe = 0; |
|---|
| 705 | ptnd.dt = dt; |
|---|
| 706 | ptnd.sum_bonds = sum_bonds; |
|---|
| 707 | PT_read_chain(psg.ptmain,pt, ptnd_chain_check_part(0)); |
|---|
| 708 | }else{ |
|---|
| 709 | for (base = PT_QU; base< PT_B_MAX; base++) { |
|---|
| 710 | ptnd_check_part_all(PT_read_son(psg.ptmain,pt,(PT_BASES)base),dt,sum_bonds); |
|---|
| 711 | } |
|---|
| 712 | } |
|---|
| 713 | } |
|---|
| 714 | /* search down the tree to find matching species for the given probe */ |
|---|
| 715 | static void ptnd_check_part(char *probe, POS_TREE *pt,int height, double dt, double sum_bonds, int split) |
|---|
| 716 | { |
|---|
| 717 | int name; |
|---|
| 718 | int i; |
|---|
| 719 | POS_TREE *pthelp; |
|---|
| 720 | int rpos,apos,pos; |
|---|
| 721 | double ndt,nsum_bonds,h; |
|---|
| 722 | int nsplit; |
|---|
| 723 | int ref; |
|---|
| 724 | |
|---|
| 725 | if (!pt) return; |
|---|
| 726 | if (dt/ptnd.parts->source->seq_len > PERC_SIZE) return; /* out of scope */ |
|---|
| 727 | if (PT_read_type(pt) == PT_NT_NODE && probe[height]) { |
|---|
| 728 | if (split && ptnd_check_inc_mode(ptnd.pdc ,ptnd.parts, dt, sum_bonds)) return; |
|---|
| 729 | for (i=PT_A; i<PT_B_MAX; i++) { |
|---|
| 730 | if (( pthelp = PT_read_son(psg.ptmain,pt, (PT_BASES)i) )) |
|---|
| 731 | { |
|---|
| 732 | nsplit = split; |
|---|
| 733 | nsum_bonds = sum_bonds; |
|---|
| 734 | if (height < PT_PART_DEEP) { |
|---|
| 735 | if ( i != probe[height] ) continue; |
|---|
| 736 | ndt = dt; |
|---|
| 737 | }else{ |
|---|
| 738 | if (split) { |
|---|
| 739 | h = ptnd_check_split(ptnd.pdc, probe, height,i); |
|---|
| 740 | if (h>0.0) ndt = dt+h; else ndt = dt-h; |
|---|
| 741 | }else if ((h = ptnd_check_split(ptnd.pdc, probe, height,i)) < 0.0 ) { |
|---|
| 742 | ndt = dt - h; |
|---|
| 743 | nsplit = 1; |
|---|
| 744 | }else{ |
|---|
| 745 | ndt = dt + h; |
|---|
| 746 | nsum_bonds += |
|---|
| 747 | ptnd_check_max_bond(ptnd.pdc,probe[height]) - h; |
|---|
| 748 | } |
|---|
| 749 | } |
|---|
| 750 | if (nsplit && height <= DOMAIN_MIN_LENGTH) continue; |
|---|
| 751 | ptnd_check_part(probe,pthelp,height+1,ndt,nsum_bonds,nsplit); |
|---|
| 752 | } |
|---|
| 753 | } |
|---|
| 754 | return; |
|---|
| 755 | } |
|---|
| 756 | if (probe[height]) { |
|---|
| 757 | if (PT_read_type(pt) == PT_NT_LEAF) { |
|---|
| 758 | name = PT_read_name(psg.ptmain,pt); |
|---|
| 759 | if (!ptnd.new_match && psg.data[name].is_group) return; |
|---|
| 760 | rpos = PT_read_rpos(psg.ptmain,pt); |
|---|
| 761 | apos = PT_read_apos(psg.ptmain,pt); |
|---|
| 762 | pos = rpos + height; |
|---|
| 763 | if (pos + (int)(strlen(probe+height)) >= psg.data[name].size) /* after end */ |
|---|
| 764 | return; |
|---|
| 765 | while (probe[height] && (ref = psg.data[name].data[pos])) { |
|---|
| 766 | if (split) { |
|---|
| 767 | h = ptnd_check_split(ptnd.pdc, probe, height,ref); |
|---|
| 768 | if (h<0.0) dt -= h; else dt += h; |
|---|
| 769 | }else if ( (h = ptnd_check_split(ptnd.pdc, probe, height, |
|---|
| 770 | ref)) < 0.0 ) { |
|---|
| 771 | dt -= h; |
|---|
| 772 | split = 1; |
|---|
| 773 | }else{ |
|---|
| 774 | dt += h; |
|---|
| 775 | sum_bonds += ptnd_check_max_bond(ptnd.pdc,probe[height]) - h; |
|---|
| 776 | } |
|---|
| 777 | height++; pos++; |
|---|
| 778 | } |
|---|
| 779 | ptnd_check_part_inc_dt( ptnd.pdc,ptnd.parts, |
|---|
| 780 | name, apos, rpos, dt, sum_bonds); |
|---|
| 781 | return; |
|---|
| 782 | } else { /* chain */ |
|---|
| 783 | psg.probe = probe; |
|---|
| 784 | psg.height = height; |
|---|
| 785 | ptnd.dt = dt; |
|---|
| 786 | ptnd.sum_bonds = sum_bonds; |
|---|
| 787 | PT_read_chain(psg.ptmain,pt, ptnd_chain_check_part(split)); |
|---|
| 788 | return; |
|---|
| 789 | } |
|---|
| 790 | } |
|---|
| 791 | ptnd_check_part_all(pt,dt,sum_bonds); |
|---|
| 792 | } |
|---|
| 793 | static void ptnd_check_probepart(PT_pdc *pdc) |
|---|
| 794 | { |
|---|
| 795 | PT_probeparts *parts; |
|---|
| 796 | ptnd.pdc = pdc; |
|---|
| 797 | for ( parts = pdc->parts; |
|---|
| 798 | parts; |
|---|
| 799 | parts = parts->next) { |
|---|
| 800 | ptnd.parts = parts; |
|---|
| 801 | ptnd_check_part(parts->sequence, psg.pt, 0, parts->dt, parts->sum_bonds,0); |
|---|
| 802 | } |
|---|
| 803 | } |
|---|
| 804 | /*********************************************************************** |
|---|
| 805 | search for possible probes |
|---|
| 806 | ***********************************************************************/ |
|---|
| 807 | inline int ptnd_check_tprobe(PT_pdc *pdc, const char *probe, int len) |
|---|
| 808 | { |
|---|
| 809 | int occ[PT_B_MAX] = { 0, 0, 0, 0, 0, 0 }; |
|---|
| 810 | |
|---|
| 811 | int count = 0; |
|---|
| 812 | while (count<len) { |
|---|
| 813 | occ[safeCharIndex(probe[count++])]++; |
|---|
| 814 | } |
|---|
| 815 | int gc = occ[PT_G]+occ[PT_C]; |
|---|
| 816 | int at = occ[PT_A]+occ[PT_T]; |
|---|
| 817 | |
|---|
| 818 | int all = gc+at; |
|---|
| 819 | |
|---|
| 820 | if (all < len) return 1; // there were some unwanted characters ('N' or '.') |
|---|
| 821 | if (all < pdc->probelen) return 1; |
|---|
| 822 | |
|---|
| 823 | if (gc < pdc->min_gc*len) return 1; |
|---|
| 824 | if (gc > pdc->max_gc*len) return 1; |
|---|
| 825 | |
|---|
| 826 | double temp = at*2 + gc*4; |
|---|
| 827 | if (temp< pdc->mintemp) return 1; |
|---|
| 828 | if (temp>pdc->maxtemp) return 1; |
|---|
| 829 | |
|---|
| 830 | return 0; |
|---|
| 831 | } |
|---|
| 832 | |
|---|
| 833 | static long ptnd_build_probes_collect(const char *probe, long count, void*) { |
|---|
| 834 | PT_tprobes *tprobe; |
|---|
| 835 | if (count >= ptnd.locs->group_count*ptnd.pdc->mintarget) { |
|---|
| 836 | tprobe = create_PT_tprobes(); |
|---|
| 837 | tprobe->sequence = strdup(probe); |
|---|
| 838 | tprobe->temp = pt_get_temperature(probe); |
|---|
| 839 | tprobe->groupsize = (int)count; |
|---|
| 840 | aisc_link(&ptnd.pdc->ptprobes, tprobe); |
|---|
| 841 | } |
|---|
| 842 | return count; |
|---|
| 843 | } |
|---|
| 844 | |
|---|
| 845 | inline void PT_incr_hash(GB_HASH *hash, char *sequence, int len) { |
|---|
| 846 | char c = sequence[len]; |
|---|
| 847 | sequence[len] = 0; |
|---|
| 848 | |
|---|
| 849 | pt_assert(strlen(sequence) == (size_t)len); |
|---|
| 850 | |
|---|
| 851 | GBS_incr_hash(hash, sequence); |
|---|
| 852 | |
|---|
| 853 | sequence[len] = c; |
|---|
| 854 | } |
|---|
| 855 | |
|---|
| 856 | static void ptnd_add_sequence_to_hash(PT_pdc *pdc, GB_HASH *hash, char *sequence, int seq_len, int probe_len, char *prefix, int prefix_len) { |
|---|
| 857 | int pos; |
|---|
| 858 | if (*prefix) { // partition search, else very large hash tables (>60 mbytes) |
|---|
| 859 | for (pos = seq_len-probe_len; pos >= 0; pos--, sequence++) { |
|---|
| 860 | if ((*prefix!= *sequence || strncmp(sequence,prefix,prefix_len)) ) continue; |
|---|
| 861 | if (!ptnd_check_tprobe(pdc,sequence, probe_len)) { |
|---|
| 862 | PT_incr_hash(hash, sequence, probe_len); |
|---|
| 863 | } |
|---|
| 864 | } |
|---|
| 865 | } |
|---|
| 866 | else { |
|---|
| 867 | for (pos = seq_len-probe_len; pos >= 0; pos--, sequence++) { |
|---|
| 868 | if (!ptnd_check_tprobe(pdc,sequence, probe_len)) { |
|---|
| 869 | PT_incr_hash(hash, sequence, probe_len); |
|---|
| 870 | } |
|---|
| 871 | } |
|---|
| 872 | } |
|---|
| 873 | } |
|---|
| 874 | |
|---|
| 875 | static long ptnd_collect_hash(const char *key, long val, void *gb_hash) { |
|---|
| 876 | pt_assert(val); |
|---|
| 877 | GBS_incr_hash((GB_HASH*)gb_hash, key); |
|---|
| 878 | return val; |
|---|
| 879 | } |
|---|
| 880 | |
|---|
| 881 | #if defined(DEVEL_RALF) |
|---|
| 882 | static long avoid_hash_warning(const char *, long , void*) { return 0; } |
|---|
| 883 | #endif // DEVEL_RALF |
|---|
| 884 | |
|---|
| 885 | static void ptnd_build_tprobes(PT_pdc *pdc, int group_count) { |
|---|
| 886 | int *group_idx = new int[group_count]; |
|---|
| 887 | unsigned long datasize = 0; // of marked species/genes |
|---|
| 888 | unsigned long maxseqlength = 0; // of marked species/genes |
|---|
| 889 | |
|---|
| 890 | // get group indices and count datasize of marked species |
|---|
| 891 | { |
|---|
| 892 | int used_idx = 0; |
|---|
| 893 | for (int name = 0; name < psg.data_count; name++) { |
|---|
| 894 | if(psg.data[name].is_group == 1) { |
|---|
| 895 | group_idx[used_idx++] = name; // store marked group indices |
|---|
| 896 | unsigned long size = psg.data[name].size; |
|---|
| 897 | datasize += size; |
|---|
| 898 | if (datasize<size) datasize = ULONG_MAX; // avoid overflow! |
|---|
| 899 | if (size > maxseqlength) maxseqlength = size; |
|---|
| 900 | } |
|---|
| 901 | } |
|---|
| 902 | pt_assert(used_idx == group_count); |
|---|
| 903 | } |
|---|
| 904 | |
|---|
| 905 | unsigned long outer_hash_size; |
|---|
| 906 | int partsize = 0; // no partitions |
|---|
| 907 | { |
|---|
| 908 | const unsigned long max_allowed_hash_size = 1000000; // approx. |
|---|
| 909 | const unsigned long max_allowed_tprobes = (unsigned long)(max_allowed_hash_size*0.66); // results in 66% fill rate for hash (which is much!) |
|---|
| 910 | |
|---|
| 911 | // tests found about 5-8% tprobes (in relation to datasize) -> we use 10% here |
|---|
| 912 | unsigned long estimated_no_of_tprobes = (unsigned long)((datasize-pdc->probelen+1)*0.10+0.5); |
|---|
| 913 | |
|---|
| 914 | while (estimated_no_of_tprobes > max_allowed_tprobes) { |
|---|
| 915 | partsize++; |
|---|
| 916 | estimated_no_of_tprobes /= 4; // 4 different bases |
|---|
| 917 | } |
|---|
| 918 | |
|---|
| 919 | outer_hash_size = (unsigned long)(estimated_no_of_tprobes*1.5); |
|---|
| 920 | |
|---|
| 921 | #if defined(DEBUG) |
|---|
| 922 | printf("marked=%i datasize=%lu partsize=%i estimated_no_of_tprobes=%lu outer_hash_size=%lu\n", |
|---|
| 923 | group_count, datasize, partsize, estimated_no_of_tprobes, outer_hash_size); |
|---|
| 924 | #endif // DEBUG |
|---|
| 925 | } |
|---|
| 926 | |
|---|
| 927 | #if defined(DEBUG) |
|---|
| 928 | GBS_clear_hash_statistic_summary("outer"); |
|---|
| 929 | #endif // DEBUG |
|---|
| 930 | |
|---|
| 931 | const int hash_multiply = 4; // hash fill ratio is 1/hash_multiply |
|---|
| 932 | |
|---|
| 933 | char partstring[256]; |
|---|
| 934 | PT_init_base_string_counter(partstring,PT_A,partsize); |
|---|
| 935 | while (!PT_base_string_counter_eof(partstring)) { |
|---|
| 936 | #if defined(DEBUG) |
|---|
| 937 | fputs("partition='", stdout); |
|---|
| 938 | for (int i = 0; i<partsize; ++i) { |
|---|
| 939 | putchar("ACGT"[partstring[i]-PT_A]); |
|---|
| 940 | } |
|---|
| 941 | fputs("'\n", stdout); |
|---|
| 942 | #endif // DEBUG |
|---|
| 943 | |
|---|
| 944 | GB_HASH *hash_outer = GBS_create_hash(outer_hash_size, GB_MIND_CASE); // count in how many groups/sequences the tprobe occurs (key = tprobe, value = counter) |
|---|
| 945 | |
|---|
| 946 | #if defined(DEBUG) |
|---|
| 947 | GBS_clear_hash_statistic_summary("inner"); |
|---|
| 948 | #endif // DEBUG |
|---|
| 949 | |
|---|
| 950 | // for (name = 0; name < psg.data_count; name++) { |
|---|
| 951 | // if(psg.data[name].is_group != 1) continue; |
|---|
| 952 | for (int g = 0; g<group_count; ++g) { |
|---|
| 953 | int name = group_idx[g]; |
|---|
| 954 | long possible_tprobes = psg.data[name].size-pdc->probelen+1; |
|---|
| 955 | GB_HASH *hash_one = GBS_create_hash(possible_tprobes*hash_multiply, GB_MIND_CASE); // count tprobe occurrences for one group/sequence |
|---|
| 956 | ptnd_add_sequence_to_hash(pdc, hash_one, psg.data[name].data, psg.data[name].size, pdc->probelen, partstring, partsize); |
|---|
| 957 | GBS_hash_do_loop(hash_one, ptnd_collect_hash, hash_outer); // merge hash_one into hash |
|---|
| 958 | #if defined(DEBUG) |
|---|
| 959 | GBS_calc_hash_statistic(hash_one, "inner", 0); |
|---|
| 960 | #endif // DEBUG |
|---|
| 961 | GBS_free_hash(hash_one); |
|---|
| 962 | } |
|---|
| 963 | PT_sequence *seq; |
|---|
| 964 | for ( seq = pdc->sequences; seq; seq = seq->next) { |
|---|
| 965 | long possible_tprobes = seq->seq.size-pdc->probelen+1; |
|---|
| 966 | GB_HASH *hash_one = GBS_create_hash(possible_tprobes*hash_multiply, GB_MIND_CASE); // count tprobe occurrences for one group/sequence |
|---|
| 967 | ptnd_add_sequence_to_hash(pdc, hash_one, seq->seq.data, seq->seq.size, pdc->probelen, partstring, partsize); |
|---|
| 968 | GBS_hash_do_loop(hash_one, ptnd_collect_hash, hash_outer); // merge hash_one into hash |
|---|
| 969 | #if defined(DEBUG) |
|---|
| 970 | GBS_calc_hash_statistic(hash_one, "inner", 0); |
|---|
| 971 | #endif // DEBUG |
|---|
| 972 | GBS_free_hash(hash_one); |
|---|
| 973 | } |
|---|
| 974 | |
|---|
| 975 | #if defined(DEBUG) |
|---|
| 976 | GBS_print_hash_statistic_summary("inner"); |
|---|
| 977 | #endif // DEBUG |
|---|
| 978 | |
|---|
| 979 | GBS_hash_do_loop(hash_outer, ptnd_build_probes_collect, NULL); |
|---|
| 980 | |
|---|
| 981 | #if defined(DEBUG) |
|---|
| 982 | GBS_calc_hash_statistic(hash_outer, "outer", 1); |
|---|
| 983 | #if defined(DEVEL_RALF) |
|---|
| 984 | GBS_hash_do_loop(hash_outer, avoid_hash_warning, NULL); // hash is known to be overfilled - avoid assertion |
|---|
| 985 | #endif // DEVEL_RALF |
|---|
| 986 | #endif // DEBUG |
|---|
| 987 | |
|---|
| 988 | GBS_free_hash(hash_outer); |
|---|
| 989 | PT_inc_base_string_count(partstring,PT_A,PT_B_MAX,partsize); |
|---|
| 990 | } |
|---|
| 991 | #if defined(DEBUG) |
|---|
| 992 | GBS_print_hash_statistic_summary("outer"); |
|---|
| 993 | #endif // DEBUG |
|---|
| 994 | } |
|---|
| 995 | |
|---|
| 996 | #if 0 |
|---|
| 997 | static void ptnd_print_probes(PT_pdc *pdc) { |
|---|
| 998 | PT_tprobes *tprobe; |
|---|
| 999 | char buffer[1024]; |
|---|
| 1000 | int i; |
|---|
| 1001 | |
|---|
| 1002 | for ( tprobe = pdc->tprobes; |
|---|
| 1003 | tprobe; |
|---|
| 1004 | tprobe = tprobe->next ) { |
|---|
| 1005 | strncpy(buffer,tprobe->sequence,250); |
|---|
| 1006 | buffer[250] = 0; |
|---|
| 1007 | PT_base_2_string(buffer,0); |
|---|
| 1008 | printf("seq: %s group %i mishits: %i ",buffer,tprobe->groupsize,tprobe->mishit); |
|---|
| 1009 | for (i=0;i<PERC_SIZE;i++) { |
|---|
| 1010 | printf("%3i,",tprobe->perc[i]); |
|---|
| 1011 | } |
|---|
| 1012 | printf("\n"); |
|---|
| 1013 | } |
|---|
| 1014 | } |
|---|
| 1015 | #endif |
|---|
| 1016 | |
|---|
| 1017 | extern "C" int PT_start_design(PT_pdc *pdc, int /*dummy*/) { |
|---|
| 1018 | |
|---|
| 1019 | // IDP probe design |
|---|
| 1020 | |
|---|
| 1021 | PT_local *locs = (PT_local*)pdc->mh.parent->parent; |
|---|
| 1022 | ptnd.new_match = 0; |
|---|
| 1023 | ptnd.locs = locs; |
|---|
| 1024 | ptnd.pdc = pdc; |
|---|
| 1025 | |
|---|
| 1026 | const char *error; |
|---|
| 1027 | { |
|---|
| 1028 | char *unknown_names = ptpd_read_names(locs, pdc->names.data, pdc->checksums.data, error); /* read the names */ |
|---|
| 1029 | if (unknown_names) free(unknown_names); // unused here (handled by PT_unknown_names) |
|---|
| 1030 | } |
|---|
| 1031 | |
|---|
| 1032 | if (error) { |
|---|
| 1033 | pt_export_error(locs, error); |
|---|
| 1034 | return 0; |
|---|
| 1035 | } |
|---|
| 1036 | |
|---|
| 1037 | PT_sequence *seq; |
|---|
| 1038 | for ( seq = pdc->sequences; seq; seq = seq->next) { // Convert all external sequence to internal format |
|---|
| 1039 | seq->seq.size = probe_compress_sequence(seq->seq.data, seq->seq.size); |
|---|
| 1040 | locs->group_count++; |
|---|
| 1041 | } |
|---|
| 1042 | |
|---|
| 1043 | if (locs->group_count <=0) { |
|---|
| 1044 | pt_export_error(locs, GBS_global_string("No %s marked - no probes designed", |
|---|
| 1045 | gene_flag ? "genes" : "species")); |
|---|
| 1046 | return 0; |
|---|
| 1047 | } |
|---|
| 1048 | |
|---|
| 1049 | while (pdc->tprobes) destroy_PT_tprobes(pdc->tprobes); |
|---|
| 1050 | ptnd_build_tprobes(pdc, locs->group_count); |
|---|
| 1051 | while (pdc->sequences) destroy_PT_sequence(pdc->sequences); |
|---|
| 1052 | ptnd_sort_probes_by(pdc,1); |
|---|
| 1053 | ptnd_first_check(pdc); |
|---|
| 1054 | ptnd_check_position(pdc); |
|---|
| 1055 | ptnd_check_bonds(pdc,ptnd.new_match); |
|---|
| 1056 | ptnd_cp_tprobe_2_probepart(pdc); |
|---|
| 1057 | ptnd_duplikate_probepart(pdc); |
|---|
| 1058 | ptnd_sort_parts(pdc); |
|---|
| 1059 | ptnd_remove_duplikated_probepart(pdc); |
|---|
| 1060 | ptnd_check_probepart(pdc); |
|---|
| 1061 | while (pdc->parts) destroy_PT_probeparts(pdc->parts); |
|---|
| 1062 | ptnd_calc_quality(pdc); |
|---|
| 1063 | ptnd_sort_probes_by(pdc,0); |
|---|
| 1064 | ptnd_probe_delete_all_but(pdc,pdc->clipresult); |
|---|
| 1065 | /* ptnd_print_probes(pdc); */ |
|---|
| 1066 | |
|---|
| 1067 | return 0; |
|---|
| 1068 | } |
|---|
| 1069 | |
|---|
| 1070 | void ptnd_new_match(PT_local * locs, char *probestring) |
|---|
| 1071 | { |
|---|
| 1072 | PT_pdc *pdc = locs->pdc; |
|---|
| 1073 | PT_tprobes *tprobe; |
|---|
| 1074 | |
|---|
| 1075 | ptnd.locs = locs; |
|---|
| 1076 | ptnd.pdc = pdc; |
|---|
| 1077 | ptnd.new_match = 1; |
|---|
| 1078 | |
|---|
| 1079 | if (!pdc) return; /* no config */ |
|---|
| 1080 | |
|---|
| 1081 | tprobe = create_PT_tprobes(); |
|---|
| 1082 | tprobe->sequence = strdup(probestring); |
|---|
| 1083 | |
|---|
| 1084 | aisc_link(&pdc->ptprobes, tprobe); |
|---|
| 1085 | ptnd_check_bonds(pdc,ptnd.new_match); |
|---|
| 1086 | ptnd_cp_tprobe_2_probepart(pdc); |
|---|
| 1087 | ptnd_duplikate_probepart(pdc); |
|---|
| 1088 | ptnd_sort_parts(pdc); |
|---|
| 1089 | ptnd_remove_duplikated_probepart(pdc); |
|---|
| 1090 | ptnd_check_probepart(pdc); |
|---|
| 1091 | |
|---|
| 1092 | while (pdc->parts) destroy_PT_probeparts(pdc->parts); |
|---|
| 1093 | while (( tprobe = pdc->tprobes )) destroy_PT_tprobes(tprobe); |
|---|
| 1094 | } |
|---|