1 | // =============================================================== // |
---|
2 | // // |
---|
3 | // File : PT_match.cxx // |
---|
4 | // Purpose : // |
---|
5 | // // |
---|
6 | // Institute of Microbiology (Technical University Munich) // |
---|
7 | // http://www.arb-home.de/ // |
---|
8 | // // |
---|
9 | // =============================================================== // |
---|
10 | |
---|
11 | #include "probe.h" |
---|
12 | #include <PT_server_prototypes.h> |
---|
13 | #include <struct_man.h> |
---|
14 | |
---|
15 | #include "pt_split.h" |
---|
16 | #include "probe_tree.h" |
---|
17 | |
---|
18 | #include <arb_strbuf.h> |
---|
19 | #include <arb_defs.h> |
---|
20 | #include <arb_sort.h> |
---|
21 | #include <cctype> |
---|
22 | #include <map> |
---|
23 | |
---|
24 | // overloaded functions to avoid problems with type-punning: |
---|
25 | inline void aisc_link(dll_public *dll, PT_probematch *match) { aisc_link(reinterpret_cast<dllpublic_ext*>(dll), reinterpret_cast<dllheader_ext*>(match)); } |
---|
26 | |
---|
27 | class MismatchWeights { |
---|
28 | const PT_bond *bonds; |
---|
29 | double weight[PT_BASES][PT_BASES]; |
---|
30 | |
---|
31 | double get_simple_wmismatch(char probe, char seq) { |
---|
32 | pt_assert(is_std_base(probe)); |
---|
33 | pt_assert(is_std_base(seq)); |
---|
34 | |
---|
35 | int complement = get_complement(probe); |
---|
36 | |
---|
37 | int rowIdx = (complement-int(PT_A))*4; |
---|
38 | int maxIdx = rowIdx + probe-(int)PT_A; |
---|
39 | int newIdx = rowIdx + seq-(int)PT_A; |
---|
40 | |
---|
41 | pt_assert(maxIdx >= 0 && maxIdx < 16); |
---|
42 | pt_assert(newIdx >= 0 && newIdx < 16); |
---|
43 | |
---|
44 | double max_bind = bonds[maxIdx].val; |
---|
45 | double new_bind = bonds[newIdx].val; |
---|
46 | |
---|
47 | return (max_bind - new_bind); |
---|
48 | } |
---|
49 | |
---|
50 | void init() { |
---|
51 | for (int probe = PT_A; probe < PT_BASES; ++probe) { |
---|
52 | double sum = 0.0; |
---|
53 | for (int seq = PT_A; seq < PT_BASES; ++seq) { |
---|
54 | sum += weight[probe][seq] = get_simple_wmismatch(probe, seq); |
---|
55 | } |
---|
56 | weight[probe][PT_N] = sum/4.0; |
---|
57 | } |
---|
58 | for (int seq = PT_N; seq < PT_BASES; ++seq) { |
---|
59 | double sum = 0.0; |
---|
60 | for (int probe = PT_A; probe < PT_BASES; ++probe) { |
---|
61 | sum += weight[probe][seq]; |
---|
62 | } |
---|
63 | weight[PT_N][seq] = sum/4.0; |
---|
64 | } |
---|
65 | |
---|
66 | for (int i = PT_N; i<PT_BASES; ++i) { |
---|
67 | weight[PT_QU][i] = weight[PT_N][i]; |
---|
68 | weight[i][PT_QU] = weight[i][PT_N]; |
---|
69 | } |
---|
70 | weight[PT_QU][PT_QU] = weight[PT_N][PT_N]; |
---|
71 | } |
---|
72 | |
---|
73 | public: |
---|
74 | MismatchWeights(const PT_bond *bonds_) : bonds(bonds_) { init(); } |
---|
75 | double get(int probe, int seq) const { return weight[probe][seq]; } |
---|
76 | }; |
---|
77 | |
---|
78 | |
---|
79 | class MatchRequest; |
---|
80 | class Mismatches { |
---|
81 | MatchRequest& req; |
---|
82 | |
---|
83 | int plain; // plain mismatch between 2 standard bases |
---|
84 | int ambig; // mismatch with N or '.' involved |
---|
85 | |
---|
86 | double weighted; // weighted mismatches |
---|
87 | |
---|
88 | public: |
---|
89 | |
---|
90 | Mismatches(MatchRequest& req_) : req(req_), plain(0), ambig(0), weighted(0.0) {} |
---|
91 | Mismatches(const Mismatches& other) : req(other.req), plain(other.plain), ambig(other.ambig), weighted(other.weighted) {} |
---|
92 | DECLARE_ASSIGNMENT_OPERATOR(Mismatches); |
---|
93 | |
---|
94 | inline void count_weighted(char probe, char seq, int height); |
---|
95 | void count_versus(const ReadableDataLoc& loc, const char *probe, int height); |
---|
96 | |
---|
97 | inline bool accepted() const; |
---|
98 | |
---|
99 | int get_plain() const { return plain; } |
---|
100 | int get_ambig() const { return ambig; } |
---|
101 | |
---|
102 | double get_weighted() const { return weighted; } |
---|
103 | |
---|
104 | inline PT_local& get_PT_local() const; |
---|
105 | }; |
---|
106 | |
---|
107 | class MatchRequest : virtual Noncopyable { |
---|
108 | PT_local& pt_local; |
---|
109 | |
---|
110 | int max_ambig; // max. possible ambiguous hits (i.e. max value in Mismatches::ambig) |
---|
111 | int *accepted_N_mismatches; |
---|
112 | |
---|
113 | MismatchWeights weights; |
---|
114 | |
---|
115 | void init_accepted_N_mismatches(int ignored_Nmismatches, int when_less_than_Nmismatches); |
---|
116 | |
---|
117 | public: |
---|
118 | explicit MatchRequest(PT_local& locs, int probe_length) |
---|
119 | : pt_local(locs), |
---|
120 | max_ambig(probe_length), |
---|
121 | accepted_N_mismatches(new int[max_ambig+1]), |
---|
122 | weights(locs.bond) |
---|
123 | { |
---|
124 | init_accepted_N_mismatches(pt_local.pm_nmatches_ignored, pt_local.pm_nmatches_limit); |
---|
125 | } |
---|
126 | ~MatchRequest() { |
---|
127 | delete [] accepted_N_mismatches; |
---|
128 | } |
---|
129 | |
---|
130 | PT_local& get_PT_local() const { return pt_local; } |
---|
131 | |
---|
132 | bool hit_limit_reached() const { |
---|
133 | bool reached = pt_local.pm_max_hits>0 && pt_local.ppm.cnt >= pt_local.pm_max_hits; |
---|
134 | if (reached) pt_local.matches_truncated = 1; |
---|
135 | return reached; |
---|
136 | } |
---|
137 | |
---|
138 | int accept_N_mismatches(int ambig) const { |
---|
139 | pt_assert(ambig<=max_ambig); |
---|
140 | return accepted_N_mismatches[ambig]; |
---|
141 | } |
---|
142 | |
---|
143 | bool add_hit(const DataLoc& at, const Mismatches& mismatch); |
---|
144 | bool add_hits_for_children(POS_TREE2 *pt, const Mismatches& mismatch); |
---|
145 | bool collect_hits_for(const char *probe, POS_TREE2 *pt, Mismatches& mismatch, int height); |
---|
146 | |
---|
147 | int allowed_mismatches() const { return pt_local.pm_max; } |
---|
148 | double get_mismatch_weight(char probe, char seq) const { return weights.get(probe, seq); } |
---|
149 | }; |
---|
150 | |
---|
151 | |
---|
152 | |
---|
153 | void MatchRequest::init_accepted_N_mismatches(int ignored_Nmismatches, int when_less_than_Nmismatches) { |
---|
154 | // calculate table for PT_N mismatches |
---|
155 | // |
---|
156 | // 'ignored_Nmismatches' specifies, how many N-mismatches will be accepted as |
---|
157 | // matches, when overall number of N-mismatches is below 'when_less_than_Nmismatches'. |
---|
158 | // |
---|
159 | // above that limit, every N-mismatch counts as mismatch |
---|
160 | |
---|
161 | when_less_than_Nmismatches = std::min(when_less_than_Nmismatches, max_ambig+1); |
---|
162 | ignored_Nmismatches = std::min(ignored_Nmismatches, when_less_than_Nmismatches-1); |
---|
163 | |
---|
164 | accepted_N_mismatches[0] = 0; |
---|
165 | int mm; |
---|
166 | for (mm = 1; mm<when_less_than_Nmismatches; ++mm) { |
---|
167 | accepted_N_mismatches[mm] = mm>ignored_Nmismatches ? mm-ignored_Nmismatches : 0; |
---|
168 | } |
---|
169 | pt_assert(mm <= (max_ambig+1)); |
---|
170 | for (; mm <= max_ambig; ++mm) { |
---|
171 | accepted_N_mismatches[mm] = mm; |
---|
172 | } |
---|
173 | } |
---|
174 | |
---|
175 | inline void Mismatches::count_weighted(char probe, char seq, int height) { |
---|
176 | bool is_ambig = is_ambig_base(probe) || is_ambig_base(seq); |
---|
177 | if (is_ambig || probe != seq) { |
---|
178 | if (is_ambig) ambig++; else plain++; |
---|
179 | weighted += req.get_mismatch_weight(probe, seq) * psg.pos_to_weight[height]; |
---|
180 | } |
---|
181 | } |
---|
182 | |
---|
183 | inline bool Mismatches::accepted() const { |
---|
184 | if (get_PT_local().sort_by == PT_MATCH_TYPE_INTEGER) { |
---|
185 | return (req.accept_N_mismatches(ambig)+plain) <= req.allowed_mismatches(); |
---|
186 | } |
---|
187 | return weighted <= (req.allowed_mismatches()+0.5); |
---|
188 | } |
---|
189 | |
---|
190 | inline PT_local& Mismatches::get_PT_local() const { |
---|
191 | return req.get_PT_local(); |
---|
192 | } |
---|
193 | |
---|
194 | bool MatchRequest::add_hit(const DataLoc& at, const Mismatches& mismatch) { |
---|
195 | PT_probematch *ml = create_PT_probematch(); |
---|
196 | |
---|
197 | ml->name = at.get_name(); |
---|
198 | ml->b_pos = at.get_abs_pos(); |
---|
199 | ml->g_pos = -1; |
---|
200 | ml->rpos = at.get_rel_pos(); |
---|
201 | |
---|
202 | ml->mismatches = mismatch.get_plain() + accept_N_mismatches(mismatch.get_ambig()); |
---|
203 | ml->wmismatches = mismatch.get_weighted(); |
---|
204 | ml->N_mismatches = mismatch.get_ambig(); |
---|
205 | |
---|
206 | ml->sequence = psg.main_probe; |
---|
207 | ml->reversed = psg.reversed ? 1 : 0; |
---|
208 | |
---|
209 | aisc_link(&get_PT_local().ppm, ml); |
---|
210 | |
---|
211 | return hit_limit_reached(); |
---|
212 | } |
---|
213 | |
---|
214 | bool MatchRequest::add_hits_for_children(POS_TREE2 *pt, const Mismatches& mismatch) { |
---|
215 | //! go down the tree to chains and leafs; copy names, positions and mismatches in locs structure |
---|
216 | |
---|
217 | pt_assert(pt && mismatch.accepted()); // invalid or superfluous call |
---|
218 | pt_assert(!hit_limit_reached()); |
---|
219 | |
---|
220 | bool enough = false; |
---|
221 | switch (pt->get_type()) { |
---|
222 | case PT2_LEAF: |
---|
223 | enough = add_hit(DataLoc(pt), mismatch); |
---|
224 | break; |
---|
225 | |
---|
226 | case PT2_CHAIN: { |
---|
227 | ChainIteratorStage2 entry(pt); |
---|
228 | while (entry && !enough) { |
---|
229 | enough = add_hit(DataLoc(entry.at()), mismatch); |
---|
230 | ++entry; |
---|
231 | } |
---|
232 | break; |
---|
233 | } |
---|
234 | case PT2_NODE: |
---|
235 | for (int base = PT_QU; base < PT_BASES && !enough; base++) { |
---|
236 | POS_TREE2 *son = PT_read_son(pt, (PT_base)base); |
---|
237 | if (son) enough = add_hits_for_children(son, mismatch); |
---|
238 | } |
---|
239 | break; |
---|
240 | } |
---|
241 | return enough; |
---|
242 | } |
---|
243 | |
---|
244 | void Mismatches::count_versus(const ReadableDataLoc& loc, const char *probe, int height) { |
---|
245 | int base; |
---|
246 | while ((base = probe[height])) { |
---|
247 | int ref = loc[height]; |
---|
248 | if (ref == PT_QU) break; |
---|
249 | |
---|
250 | count_weighted(base, ref, height); |
---|
251 | height++; |
---|
252 | } |
---|
253 | |
---|
254 | if (base != PT_QU) { // not end of probe |
---|
255 | pt_assert(loc[height] == PT_QU); // at EOS |
---|
256 | do { |
---|
257 | count_weighted(base, PT_QU, height); |
---|
258 | height++; |
---|
259 | } |
---|
260 | while ((base = probe[height])); |
---|
261 | } |
---|
262 | } |
---|
263 | |
---|
264 | bool MatchRequest::collect_hits_for(const char *probe, POS_TREE2 *pt, Mismatches& mismatches, const int height) { |
---|
265 | //! search down the tree to find matching species for the given probe |
---|
266 | |
---|
267 | pt_assert(pt && mismatches.accepted()); // invalid or superfluous call |
---|
268 | pt_assert(!hit_limit_reached()); |
---|
269 | |
---|
270 | bool enough = false; |
---|
271 | if (probe[height] == PT_QU) { |
---|
272 | enough = add_hits_for_children(pt, mismatches); |
---|
273 | } |
---|
274 | else { |
---|
275 | switch (pt->get_type()) { |
---|
276 | case PT2_LEAF: { |
---|
277 | ReadableDataLoc loc(pt); |
---|
278 | mismatches.count_versus(loc, probe, height); |
---|
279 | if (mismatches.accepted()) { |
---|
280 | enough = add_hit(loc, mismatches); |
---|
281 | } |
---|
282 | break; |
---|
283 | } |
---|
284 | case PT2_CHAIN: { |
---|
285 | pt_assert(probe); |
---|
286 | |
---|
287 | ChainIteratorStage2 entry(pt); |
---|
288 | while (entry && !enough) { |
---|
289 | Mismatches entry_mismatches(mismatches); |
---|
290 | DataLoc dloc(entry.at()); // @@@ EXPENSIVE_CONVERSION |
---|
291 | entry_mismatches.count_versus(ReadableDataLoc(dloc), probe, height); // @@@ EXPENSIVE_CONVERSION |
---|
292 | if (entry_mismatches.accepted()) { |
---|
293 | enough = add_hit(dloc, entry_mismatches); |
---|
294 | } |
---|
295 | ++entry; |
---|
296 | } |
---|
297 | break; |
---|
298 | } |
---|
299 | case PT2_NODE: |
---|
300 | for (int i=PT_QU; i<PT_BASES && !enough; i++) { |
---|
301 | POS_TREE2 *son = PT_read_son(pt, (PT_base)i); |
---|
302 | if (son) { |
---|
303 | Mismatches son_mismatches(mismatches); |
---|
304 | son_mismatches.count_weighted(probe[height], i, height); |
---|
305 | if (son_mismatches.accepted()) { |
---|
306 | if (i == PT_QU) { |
---|
307 | // @@@ calculation here is constant for a fixed probe (cache results) |
---|
308 | pt_assert(probe[height] != PT_QU); |
---|
309 | |
---|
310 | int son_height = height+1; |
---|
311 | while (1) { |
---|
312 | int base = probe[son_height]; |
---|
313 | if (base == PT_QU) { |
---|
314 | if (son_mismatches.accepted()) { |
---|
315 | enough = add_hits_for_children(son, son_mismatches); |
---|
316 | } |
---|
317 | break; |
---|
318 | } |
---|
319 | |
---|
320 | son_mismatches.count_weighted(base, PT_QU, son_height); |
---|
321 | if (!son_mismatches.accepted()) break; |
---|
322 | |
---|
323 | ++son_height; |
---|
324 | } |
---|
325 | } |
---|
326 | else { |
---|
327 | enough = collect_hits_for(probe, son, son_mismatches, height+1); |
---|
328 | } |
---|
329 | } |
---|
330 | } |
---|
331 | } |
---|
332 | break; |
---|
333 | } |
---|
334 | } |
---|
335 | |
---|
336 | return enough; |
---|
337 | } |
---|
338 | |
---|
339 | static int pt_sort_compare_match(const void *PT_probematch_ptr1, const void *PT_probematch_ptr2, void *) { |
---|
340 | const PT_probematch *mach1 = (const PT_probematch*)PT_probematch_ptr1; |
---|
341 | const PT_probematch *mach2 = (const PT_probematch*)PT_probematch_ptr2; |
---|
342 | |
---|
343 | if (psg.sort_by != PT_MATCH_TYPE_INTEGER) { |
---|
344 | if (mach1->wmismatches > mach2->wmismatches) return 1; |
---|
345 | if (mach1->wmismatches < mach2->wmismatches) return -1; |
---|
346 | } |
---|
347 | |
---|
348 | int cmp = mach1->mismatches - mach2->mismatches; |
---|
349 | if (!cmp) { |
---|
350 | cmp = mach1->N_mismatches - mach2->N_mismatches; |
---|
351 | if (!cmp) { |
---|
352 | if (mach1->wmismatches < mach2->wmismatches) cmp = -1; |
---|
353 | else if (mach1->wmismatches > mach2->wmismatches) cmp = 1; |
---|
354 | else { |
---|
355 | cmp = mach1->b_pos - mach2->b_pos; |
---|
356 | if (!cmp) { |
---|
357 | cmp = mach1->name - mach2->name; |
---|
358 | } |
---|
359 | } |
---|
360 | } |
---|
361 | } |
---|
362 | |
---|
363 | return cmp; |
---|
364 | } |
---|
365 | |
---|
366 | static void pt_sort_match_list(PT_local * locs) { |
---|
367 | if (locs->pm) { |
---|
368 | psg.sort_by = locs->sort_by; |
---|
369 | |
---|
370 | int list_len = locs->pm->get_count(); |
---|
371 | if (list_len > 1) { |
---|
372 | PT_probematch **my_list = (PT_probematch **)calloc(sizeof(void *), list_len); |
---|
373 | { |
---|
374 | PT_probematch *match = locs->pm; |
---|
375 | for (int i=0; match; i++) { |
---|
376 | my_list[i] = match; |
---|
377 | match = match->next; |
---|
378 | } |
---|
379 | } |
---|
380 | GB_sort((void **)my_list, 0, list_len, pt_sort_compare_match, 0); |
---|
381 | for (int i=0; i<list_len; i++) { |
---|
382 | aisc_unlink((dllheader_ext*)my_list[i]); |
---|
383 | aisc_link(&locs->ppm, my_list[i]); |
---|
384 | } |
---|
385 | free(my_list); |
---|
386 | } |
---|
387 | } |
---|
388 | } |
---|
389 | char *create_reversed_probe(char *probe, int len) { |
---|
390 | //! reverse order of bases in a probe |
---|
391 | char *rev_probe = GB_strduplen(probe, len); |
---|
392 | reverse_probe(rev_probe, len); |
---|
393 | return rev_probe; |
---|
394 | } |
---|
395 | |
---|
396 | static double calc_position_wmis(int pos, int seq_len, double y1, double y2) { |
---|
397 | return (double)(((double)(pos * (seq_len - 1 - pos)) / (double)((seq_len - 1) * (seq_len - 1)))* (double)(y2*4.0) + y1); |
---|
398 | } |
---|
399 | |
---|
400 | static void pt_build_pos_to_weight(PT_MATCH_TYPE type, const char *sequence) { |
---|
401 | delete [] psg.pos_to_weight; |
---|
402 | int slen = strlen(sequence); |
---|
403 | psg.pos_to_weight = new double[slen+1]; |
---|
404 | int p; |
---|
405 | for (p=0; p<slen; p++) { |
---|
406 | if (type == PT_MATCH_TYPE_WEIGHTED_PLUS_POS) { |
---|
407 | psg.pos_to_weight[p] = calc_position_wmis(p, slen, 0.3, 1.0); |
---|
408 | } |
---|
409 | else { |
---|
410 | psg.pos_to_weight[p] = 1.0; |
---|
411 | } |
---|
412 | } |
---|
413 | psg.pos_to_weight[slen] = 0; |
---|
414 | } |
---|
415 | |
---|
416 | static std::map<PT_local*,Splits> splits_for_match_overlay; // initialized by probe-match, used by match-retrieval (one entry for each match-request); @@@ leaks.. 1 entry for each request |
---|
417 | |
---|
418 | int probe_match(PT_local *locs, aisc_string probestring) { |
---|
419 | //! find out where a given probe matches |
---|
420 | |
---|
421 | freedup(locs->pm_sequence, probestring); |
---|
422 | psg.main_probe = locs->pm_sequence; |
---|
423 | |
---|
424 | compress_data(probestring); |
---|
425 | while (PT_probematch *ml = locs->pm) destroy_PT_probematch(ml); |
---|
426 | locs->matches_truncated = 0; |
---|
427 | |
---|
428 | #if defined(DEBUG) && 0 |
---|
429 | printf("Current bond values:\n"); |
---|
430 | for (int y = 0; y<4; y++) { |
---|
431 | for (int x = 0; x<4; x++) { |
---|
432 | printf("%5.2f", locs->bond[y*4+x].val); |
---|
433 | } |
---|
434 | printf("\n"); |
---|
435 | } |
---|
436 | #endif // DEBUG |
---|
437 | |
---|
438 | int probe_len = strlen(probestring); |
---|
439 | if (probe_len<MIN_PROBE_LENGTH) { |
---|
440 | pt_export_error(locs, GBS_global_string("Min. probe length is %i", MIN_PROBE_LENGTH)); |
---|
441 | return 0; |
---|
442 | } |
---|
443 | |
---|
444 | { |
---|
445 | int max_poss_mismatches = probe_len/2; |
---|
446 | pt_assert(max_poss_mismatches>0); |
---|
447 | if (locs->pm_max > max_poss_mismatches) { |
---|
448 | pt_export_error(locs, GBS_global_string("Max. %i mismatch%s are allowed for probes of length %i", |
---|
449 | max_poss_mismatches, |
---|
450 | max_poss_mismatches == 1 ? "" : "es", |
---|
451 | probe_len)); |
---|
452 | return 0; |
---|
453 | } |
---|
454 | } |
---|
455 | |
---|
456 | if (locs->pm_complement) { |
---|
457 | complement_probe(probestring, probe_len); |
---|
458 | } |
---|
459 | psg.reversed = 0; |
---|
460 | |
---|
461 | freedup(locs->pm_sequence, probestring); |
---|
462 | psg.main_probe = locs->pm_sequence; |
---|
463 | |
---|
464 | pt_build_pos_to_weight((PT_MATCH_TYPE)locs->sort_by, probestring); |
---|
465 | |
---|
466 | MatchRequest req(*locs, probe_len); |
---|
467 | |
---|
468 | pt_assert(req.allowed_mismatches() >= 0); // till [8011] value<0 was used to trigger "new match" (feature unused) |
---|
469 | Mismatches mismatch(req); |
---|
470 | req.collect_hits_for(probestring, psg.TREE_ROOT2(), mismatch, 0); |
---|
471 | |
---|
472 | if (locs->pm_reversed) { |
---|
473 | psg.reversed = 1; |
---|
474 | char *rev_pro = create_reversed_probe(probestring, probe_len); |
---|
475 | complement_probe(rev_pro, probe_len); |
---|
476 | freeset(locs->pm_csequence, psg.main_probe = strdup(rev_pro)); |
---|
477 | |
---|
478 | Mismatches rev_mismatch(req); |
---|
479 | req.collect_hits_for(rev_pro, psg.TREE_ROOT2(), rev_mismatch, 0); |
---|
480 | free(rev_pro); |
---|
481 | } |
---|
482 | pt_sort_match_list(locs); |
---|
483 | splits_for_match_overlay[locs] = Splits(locs); |
---|
484 | free(probestring); |
---|
485 | |
---|
486 | return 0; |
---|
487 | } |
---|
488 | |
---|
489 | struct format_props { |
---|
490 | bool show_mismatches; // whether to show 'mis' and 'N_mis' |
---|
491 | bool show_ecoli; // whether to show 'ecoli' column |
---|
492 | bool show_gpos; // whether to show 'gpos' column |
---|
493 | |
---|
494 | int name_width; // width of 'name' column |
---|
495 | int gene_or_full_width; // width of 'genename' or 'fullname' column |
---|
496 | int pos_width; // max. width of pos column |
---|
497 | int gpos_width; // max. width of gpos column |
---|
498 | int ecoli_width; // max. width of ecoli column |
---|
499 | |
---|
500 | int rev_width() const { return 3; } |
---|
501 | int mis_width() const { return 3; } |
---|
502 | int N_mis_width() const { return 5; } |
---|
503 | int wmis_width() const { return 4; } |
---|
504 | }; |
---|
505 | |
---|
506 | inline void set_max(const char *str, int &curr_max) { |
---|
507 | if (str) { |
---|
508 | int len = strlen(str); |
---|
509 | if (len>curr_max) curr_max = len; |
---|
510 | } |
---|
511 | } |
---|
512 | |
---|
513 | static format_props detect_format_props(const PT_local *locs, bool show_gpos) { |
---|
514 | PT_probematch *ml = locs->pm; // probe matches |
---|
515 | format_props format; |
---|
516 | |
---|
517 | format.show_mismatches = (ml->N_mismatches >= 0); |
---|
518 | format.show_ecoli = psg.ecoli; // display only if there is ecoli |
---|
519 | format.show_gpos = show_gpos; // display only for gene probe matches |
---|
520 | |
---|
521 | // minimum values (caused by header widths) : |
---|
522 | format.name_width = gene_flag ? 8 : 4; // 'organism' or 'name' |
---|
523 | format.gene_or_full_width = 8; // 'genename' or 'fullname' |
---|
524 | format.pos_width = 3; // 'pos' |
---|
525 | format.gpos_width = 4; // 'gpos' |
---|
526 | format.ecoli_width = 5; // 'ecoli' |
---|
527 | |
---|
528 | for (; ml; ml = ml->next) { |
---|
529 | set_max(virt_name(ml), format.name_width); |
---|
530 | set_max(virt_fullname(ml), format.gene_or_full_width); |
---|
531 | set_max(GBS_global_string("%i", info2bio(ml->b_pos)), format.pos_width); |
---|
532 | if (show_gpos) set_max(GBS_global_string("%i", info2bio(ml->g_pos)), format.gpos_width); |
---|
533 | if (format.show_ecoli) set_max(GBS_global_string("%li", PT_abs_2_ecoli_rel(ml->b_pos+1)), format.ecoli_width); |
---|
534 | } |
---|
535 | |
---|
536 | return format; |
---|
537 | } |
---|
538 | |
---|
539 | inline void cat_internal(GBS_strstruct *memfile, int len, const char *text, int width, char spacer, bool align_left) { |
---|
540 | if (len == width) { |
---|
541 | GBS_strcat(memfile, text); // text has exact len |
---|
542 | } |
---|
543 | else if (len > width) { // text to long |
---|
544 | char buf[width+1]; |
---|
545 | memcpy(buf, text, width); |
---|
546 | buf[width] = 0; |
---|
547 | GBS_strcat(memfile, buf); |
---|
548 | } |
---|
549 | else { // text is too short -> insert spaces |
---|
550 | int spaces = width-len; |
---|
551 | pt_assert(spaces>0); |
---|
552 | char sp[spaces+1]; |
---|
553 | memset(sp, spacer, spaces); |
---|
554 | sp[spaces] = 0; |
---|
555 | |
---|
556 | if (align_left) { |
---|
557 | GBS_strcat(memfile, text); |
---|
558 | GBS_strcat(memfile, sp); |
---|
559 | } |
---|
560 | else { |
---|
561 | GBS_strcat(memfile, sp); |
---|
562 | GBS_strcat(memfile, text); |
---|
563 | } |
---|
564 | } |
---|
565 | GBS_chrcat(memfile, ' '); // one space behind each column |
---|
566 | } |
---|
567 | inline void cat_spaced_left (GBS_strstruct *memfile, const char *text, int width) { cat_internal(memfile, strlen(text), text, width, ' ', true); } |
---|
568 | inline void cat_spaced_right(GBS_strstruct *memfile, const char *text, int width) { cat_internal(memfile, strlen(text), text, width, ' ', false); } |
---|
569 | inline void cat_dashed_left (GBS_strstruct *memfile, const char *text, int width) { cat_internal(memfile, strlen(text), text, width, '-', true); } |
---|
570 | inline void cat_dashed_right(GBS_strstruct *memfile, const char *text, int width) { cat_internal(memfile, strlen(text), text, width, '-', false); } |
---|
571 | |
---|
572 | const char *get_match_overlay(const PT_probematch *ml) { |
---|
573 | int pr_len = strlen(ml->sequence); |
---|
574 | PT_local *locs = (PT_local *)ml->mh.parent->parent; |
---|
575 | |
---|
576 | const int CONTEXT_SIZE = 9; |
---|
577 | |
---|
578 | char *ref = (char *)calloc(sizeof(char), CONTEXT_SIZE+1+pr_len+1+CONTEXT_SIZE+1); |
---|
579 | memset(ref, '.', CONTEXT_SIZE+1); |
---|
580 | |
---|
581 | SmartCharPtr seqPtr = psg.data[ml->name].get_dataPtr(); |
---|
582 | const char *seq = &*seqPtr; |
---|
583 | |
---|
584 | const Splits& splits = splits_for_match_overlay[locs]; |
---|
585 | |
---|
586 | for (int pr_pos = CONTEXT_SIZE-1, al_pos = ml->rpos-1; |
---|
587 | pr_pos >= 0 && al_pos >= 0; |
---|
588 | pr_pos--, al_pos--) |
---|
589 | { |
---|
590 | if (!seq[al_pos]) break; |
---|
591 | ref[pr_pos] = base_2_readable(seq[al_pos]); |
---|
592 | } |
---|
593 | ref[CONTEXT_SIZE] = '-'; |
---|
594 | |
---|
595 | pt_build_pos_to_weight((PT_MATCH_TYPE)locs->sort_by, ml->sequence); |
---|
596 | |
---|
597 | bool display_right_context = true; |
---|
598 | { |
---|
599 | char *pref = ref+CONTEXT_SIZE+1; |
---|
600 | |
---|
601 | for (int pr_pos = 0, al_pos = ml->rpos; |
---|
602 | pr_pos < pr_len && al_pos < psg.data[ml->name].get_size(); |
---|
603 | pr_pos++, al_pos++) |
---|
604 | { |
---|
605 | int ps = ml->sequence[pr_pos]; // probe sequence |
---|
606 | int ts = seq[al_pos]; // target sequence (hit) |
---|
607 | if (ps == ts) { |
---|
608 | pref[pr_pos] = '='; |
---|
609 | } |
---|
610 | else { |
---|
611 | if (ts) { |
---|
612 | int r = base_2_readable(ts); |
---|
613 | if (is_std_base(ps) && is_std_base(ts)) { |
---|
614 | double h = splits.check(ml->sequence[pr_pos], ts); |
---|
615 | if (h>=0.0) r = tolower(r); |
---|
616 | } |
---|
617 | pref[pr_pos] = r; |
---|
618 | } |
---|
619 | else { |
---|
620 | // end of sequence or missing data (dots inside sequence) reached |
---|
621 | // (rest of probe was accepted by N-matches) |
---|
622 | display_right_context = false; |
---|
623 | for (; pr_pos < pr_len; pr_pos++) { |
---|
624 | pref[pr_pos] = '.'; |
---|
625 | } |
---|
626 | } |
---|
627 | } |
---|
628 | |
---|
629 | } |
---|
630 | } |
---|
631 | |
---|
632 | { |
---|
633 | char *cref = ref+CONTEXT_SIZE+1+pr_len+1; |
---|
634 | cref[-1] = '-'; |
---|
635 | |
---|
636 | int al_size = psg.data[ml->name].get_size(); |
---|
637 | int al_pos = ml->rpos+pr_len; |
---|
638 | |
---|
639 | if (display_right_context) { |
---|
640 | for (int pr_pos = 0; |
---|
641 | pr_pos < CONTEXT_SIZE && al_pos < al_size; |
---|
642 | pr_pos++, al_pos++) |
---|
643 | { |
---|
644 | cref[pr_pos] = base_2_readable(seq[al_pos]); |
---|
645 | } |
---|
646 | } |
---|
647 | else { |
---|
648 | if (al_pos < al_size) strcpy(cref, "<more>"); |
---|
649 | } |
---|
650 | } |
---|
651 | |
---|
652 | static char *result = 0; |
---|
653 | freeset(result, ref); |
---|
654 | return result; |
---|
655 | } |
---|
656 | |
---|
657 | const char* get_match_acc(const PT_probematch *ml) { |
---|
658 | return psg.data[ml->name].get_acc(); |
---|
659 | } |
---|
660 | int get_match_start(const PT_probematch *ml) { |
---|
661 | return psg.data[ml->name].get_start(); |
---|
662 | } |
---|
663 | int get_match_stop(const PT_probematch *ml) { |
---|
664 | return psg.data[ml->name].get_stop(); |
---|
665 | } |
---|
666 | |
---|
667 | static const char *get_match_info_formatted(PT_probematch *ml, const format_props& format) { |
---|
668 | GBS_strstruct *memfile = GBS_stropen(256); |
---|
669 | GBS_strcat(memfile, " "); |
---|
670 | |
---|
671 | cat_spaced_left(memfile, virt_name(ml), format.name_width); |
---|
672 | cat_spaced_left(memfile, virt_fullname(ml), format.gene_or_full_width); |
---|
673 | |
---|
674 | if (format.show_mismatches) { |
---|
675 | cat_spaced_right(memfile, GBS_global_string("%i", ml->mismatches), format.mis_width()); |
---|
676 | cat_spaced_right(memfile, GBS_global_string("%i", ml->N_mismatches), format.N_mis_width()); |
---|
677 | } |
---|
678 | cat_spaced_right(memfile, GBS_global_string("%.1f", ml->wmismatches), format.wmis_width()); |
---|
679 | cat_spaced_right(memfile, GBS_global_string("%i", info2bio(ml->b_pos)), format.pos_width); |
---|
680 | if (format.show_gpos) { |
---|
681 | cat_spaced_right(memfile, GBS_global_string("%i", info2bio(ml->g_pos)), format.gpos_width); |
---|
682 | } |
---|
683 | if (format.show_ecoli) { |
---|
684 | cat_spaced_right(memfile, GBS_global_string("%li", PT_abs_2_ecoli_rel(ml->b_pos+1)), format.ecoli_width); |
---|
685 | } |
---|
686 | cat_spaced_left(memfile, GBS_global_string("%i", ml->reversed), format.rev_width()); |
---|
687 | |
---|
688 | GBS_strcat(memfile, get_match_overlay(ml)); |
---|
689 | |
---|
690 | static char *result = 0; |
---|
691 | freeset(result, GBS_strclose(memfile)); |
---|
692 | return result; |
---|
693 | } |
---|
694 | |
---|
695 | static const char *get_match_hinfo_formatted(PT_probematch *ml, const format_props& format) { |
---|
696 | if (ml) { |
---|
697 | GBS_strstruct *memfile = GBS_stropen(500); |
---|
698 | GBS_strcat(memfile, " "); // one space more than in get_match_info_formatted() |
---|
699 | |
---|
700 | cat_dashed_left(memfile, gene_flag ? "organism" : "name", format.name_width); |
---|
701 | cat_dashed_left(memfile, gene_flag ? "genename" : "fullname", format.gene_or_full_width); |
---|
702 | |
---|
703 | if (format.show_mismatches) { |
---|
704 | cat_dashed_right(memfile, "mis", format.mis_width()); |
---|
705 | cat_dashed_right(memfile, "N_mis", format.N_mis_width()); |
---|
706 | } |
---|
707 | cat_dashed_right(memfile, "wmis", format.wmis_width()); |
---|
708 | cat_dashed_right(memfile, "pos", format.pos_width); |
---|
709 | if (format.show_gpos) { |
---|
710 | cat_dashed_right(memfile, "gpos", format.gpos_width); |
---|
711 | } |
---|
712 | if (format.show_ecoli) { |
---|
713 | cat_dashed_right(memfile, "ecoli", format.ecoli_width); |
---|
714 | } |
---|
715 | cat_dashed_left(memfile, "rev", format.rev_width()); |
---|
716 | |
---|
717 | if (ml->N_mismatches >= 0) { // |
---|
718 | char *seq = strdup(ml->sequence); |
---|
719 | probe_2_readable(seq, strlen(ml->sequence)); // @@@ maybe wrong if match contains PT_QU (see [9070]) |
---|
720 | |
---|
721 | GBS_strcat(memfile, " '"); |
---|
722 | GBS_strcat(memfile, seq); |
---|
723 | GBS_strcat(memfile, "'"); |
---|
724 | |
---|
725 | free(seq); |
---|
726 | } |
---|
727 | |
---|
728 | static char *result = 0; |
---|
729 | freeset(result, GBS_strclose(memfile)); |
---|
730 | |
---|
731 | return result; |
---|
732 | } |
---|
733 | // Else set header of result |
---|
734 | return "There are no targets"; |
---|
735 | } |
---|
736 | |
---|
737 | static void gene_rel_2_abs(PT_probematch *ml) { |
---|
738 | /*! after gene probe match all positions are gene-relative. |
---|
739 | * gene_rel_2_abs() makes them genome-absolute. |
---|
740 | */ |
---|
741 | |
---|
742 | GB_transaction ta(psg.gb_main); |
---|
743 | |
---|
744 | for (; ml; ml = ml->next) { |
---|
745 | long gene_pos = psg.data[ml->name].get_geneabspos(); |
---|
746 | if (gene_pos >= 0) { |
---|
747 | ml->g_pos = ml->b_pos; |
---|
748 | ml->b_pos += gene_pos; |
---|
749 | } |
---|
750 | else { |
---|
751 | fprintf(stderr, "Error in gene-pt-server: gene w/o position info\n"); |
---|
752 | pt_assert(0); |
---|
753 | } |
---|
754 | } |
---|
755 | } |
---|
756 | |
---|
757 | bytestring *match_string(const PT_local *locs) { |
---|
758 | /*! Create list of species where probe matches. |
---|
759 | * |
---|
760 | * header^1name^1info^1name^1info....^0 |
---|
761 | * (where ^0 and ^1 are ASCII 0 and 1) |
---|
762 | * |
---|
763 | * Implements server function 'MATCH_STRING' |
---|
764 | */ |
---|
765 | static bytestring bs = { 0, 0 }; |
---|
766 | GBS_strstruct *memfile; |
---|
767 | PT_probematch *ml; |
---|
768 | |
---|
769 | free(bs.data); |
---|
770 | |
---|
771 | char empty[] = ""; |
---|
772 | bs.data = empty; |
---|
773 | memfile = GBS_stropen(50000); |
---|
774 | |
---|
775 | if (locs->pm) { |
---|
776 | if (gene_flag) gene_rel_2_abs(locs->pm); |
---|
777 | |
---|
778 | format_props format = detect_format_props(locs, gene_flag); |
---|
779 | |
---|
780 | GBS_strcat(memfile, get_match_hinfo_formatted(locs->pm, format)); |
---|
781 | GBS_chrcat(memfile, (char)1); |
---|
782 | |
---|
783 | for (ml = locs->pm; ml; ml = ml->next) { |
---|
784 | GBS_strcat(memfile, virt_name(ml)); |
---|
785 | GBS_chrcat(memfile, (char)1); |
---|
786 | GBS_strcat(memfile, get_match_info_formatted(ml, format)); |
---|
787 | GBS_chrcat(memfile, (char)1); |
---|
788 | } |
---|
789 | } |
---|
790 | |
---|
791 | bs.data = GBS_strclose(memfile); |
---|
792 | bs.size = strlen(bs.data)+1; |
---|
793 | return &bs; |
---|
794 | } |
---|
795 | |
---|
796 | |
---|
797 | |
---|
798 | |
---|
799 | bytestring *MP_match_string(const PT_local *locs) { |
---|
800 | /*! Create list of species where probe matches and append number of mismatches and weighted mismatches (used by multiprobe) |
---|
801 | * |
---|
802 | * Format: "header^1name^1#mismatches^1#wmismatches^1name^1#mismatches^1#wmismatches....^0" |
---|
803 | * (where ^0 and ^1 are ASCII 0 and 1) |
---|
804 | * |
---|
805 | * Implements server function 'MP_MATCH_STRING' |
---|
806 | */ |
---|
807 | static bytestring bs = { 0, 0 }; |
---|
808 | |
---|
809 | char buffer[50]; |
---|
810 | char buffer1[50]; |
---|
811 | GBS_strstruct *memfile; |
---|
812 | PT_probematch *ml; |
---|
813 | |
---|
814 | delete bs.data; |
---|
815 | bs.data = 0; |
---|
816 | memfile = GBS_stropen(50000); |
---|
817 | |
---|
818 | for (ml = locs->pm; ml; ml = ml->next) |
---|
819 | { |
---|
820 | GBS_strcat(memfile, virt_name(ml)); |
---|
821 | GBS_chrcat(memfile, (char)1); |
---|
822 | sprintf(buffer, "%2i", ml->mismatches); |
---|
823 | GBS_strcat(memfile, buffer); |
---|
824 | GBS_chrcat(memfile, (char)1); |
---|
825 | sprintf(buffer1, "%1.1f", ml->wmismatches); |
---|
826 | GBS_strcat(memfile, buffer1); |
---|
827 | GBS_chrcat(memfile, (char)1); |
---|
828 | } |
---|
829 | bs.data = GBS_strclose(memfile); |
---|
830 | bs.size = strlen(bs.data)+1; |
---|
831 | return &bs; |
---|
832 | } |
---|
833 | |
---|
834 | |
---|
835 | bytestring *MP_all_species_string(const PT_local *) { |
---|
836 | /*! Create list of all species known to PT server |
---|
837 | * |
---|
838 | * Format: ^1name^1name....^0 |
---|
839 | * (where ^0 and ^1 are ASCII 0 and 1) |
---|
840 | * |
---|
841 | * Implements server function 'MP_ALL_SPECIES_STRING' |
---|
842 | */ |
---|
843 | static bytestring bs = { 0, 0 }; |
---|
844 | |
---|
845 | GBS_strstruct *memfile; |
---|
846 | int i; |
---|
847 | |
---|
848 | delete bs.data; |
---|
849 | bs.data = 0; |
---|
850 | memfile = GBS_stropen(1000); |
---|
851 | |
---|
852 | for (i = 0; i < psg.data_count; i++) |
---|
853 | { |
---|
854 | GBS_strcat(memfile, psg.data[i].get_shortname()); |
---|
855 | GBS_chrcat(memfile, (char)1); |
---|
856 | } |
---|
857 | |
---|
858 | bs.data = GBS_strclose(memfile); |
---|
859 | bs.size = strlen(bs.data)+1; |
---|
860 | return &bs; |
---|
861 | } |
---|
862 | |
---|
863 | int MP_count_all_species(const PT_local *) { |
---|
864 | return psg.data_count; |
---|
865 | } |
---|
866 | |
---|
867 | // -------------------------------------------------------------------------------- |
---|
868 | |
---|
869 | #ifdef UNIT_TESTS |
---|
870 | #ifndef TEST_UNIT_H |
---|
871 | #include <test_unit.h> |
---|
872 | #endif |
---|
873 | |
---|
874 | struct EnterStage2 { |
---|
875 | EnterStage2() { |
---|
876 | PT_init_psg(); |
---|
877 | psg.enter_stage(STAGE2); |
---|
878 | } |
---|
879 | ~EnterStage2() { |
---|
880 | PT_exit_psg(); |
---|
881 | } |
---|
882 | }; |
---|
883 | |
---|
884 | #define TEST_WEIGHTED_MISMATCH(probe,seq,expected) TEST_EXPECT_SIMILAR(weights.get(probe,seq), expected, EPS) |
---|
885 | |
---|
886 | void TEST_weighted_mismatches() { |
---|
887 | EnterStage2 stage2; |
---|
888 | PT_bond bonds[16] = { |
---|
889 | { 0.0 }, { 0.0 }, { 0.5 }, { 1.1 }, |
---|
890 | { 0.0 }, { 0.0 }, { 1.5 }, { 0.0 }, |
---|
891 | { 0.5 }, { 1.5 }, { 0.4 }, { 0.9 }, |
---|
892 | { 1.1 }, { 0.0 }, { 0.9 }, { 0.0 }, |
---|
893 | }; |
---|
894 | |
---|
895 | MismatchWeights weights(bonds); |
---|
896 | |
---|
897 | double EPS = 0.0001; |
---|
898 | |
---|
899 | TEST_WEIGHTED_MISMATCH(PT_A, PT_A, 0.0); |
---|
900 | TEST_WEIGHTED_MISMATCH(PT_A, PT_C, 1.1); // (T~A = 1.1) - (T~C = 0) |
---|
901 | TEST_WEIGHTED_MISMATCH(PT_A, PT_G, 0.2); // (T~A = 1.1) - (T~G = 0.9) |
---|
902 | TEST_WEIGHTED_MISMATCH(PT_A, PT_T, 1.1); |
---|
903 | |
---|
904 | TEST_WEIGHTED_MISMATCH(PT_C, PT_A, 1.0); |
---|
905 | TEST_WEIGHTED_MISMATCH(PT_C, PT_C, 0.0); |
---|
906 | TEST_WEIGHTED_MISMATCH(PT_C, PT_G, 1.1); |
---|
907 | TEST_WEIGHTED_MISMATCH(PT_C, PT_T, 0.6); // (G~C = 1.5) - (G~T = 0.9) |
---|
908 | |
---|
909 | TEST_WEIGHTED_MISMATCH(PT_G, PT_A, 1.5); |
---|
910 | TEST_WEIGHTED_MISMATCH(PT_G, PT_C, 1.5); |
---|
911 | TEST_WEIGHTED_MISMATCH(PT_G, PT_G, 0.0); |
---|
912 | TEST_WEIGHTED_MISMATCH(PT_G, PT_T, 1.5); |
---|
913 | |
---|
914 | TEST_WEIGHTED_MISMATCH(PT_T, PT_A, 1.1); |
---|
915 | TEST_WEIGHTED_MISMATCH(PT_T, PT_C, 1.1); |
---|
916 | TEST_WEIGHTED_MISMATCH(PT_T, PT_G, 0.6); |
---|
917 | TEST_WEIGHTED_MISMATCH(PT_T, PT_T, 0.0); |
---|
918 | |
---|
919 | |
---|
920 | TEST_WEIGHTED_MISMATCH(PT_N, PT_A, 0.9); |
---|
921 | TEST_WEIGHTED_MISMATCH(PT_N, PT_C, 0.925); |
---|
922 | TEST_WEIGHTED_MISMATCH(PT_N, PT_G, 0.475); |
---|
923 | TEST_WEIGHTED_MISMATCH(PT_N, PT_T, 0.8); |
---|
924 | |
---|
925 | TEST_WEIGHTED_MISMATCH(PT_A, PT_N, 0.6); |
---|
926 | TEST_WEIGHTED_MISMATCH(PT_C, PT_N, 0.675); |
---|
927 | TEST_WEIGHTED_MISMATCH(PT_G, PT_N, 1.125); |
---|
928 | TEST_WEIGHTED_MISMATCH(PT_T, PT_N, 0.7); |
---|
929 | |
---|
930 | TEST_WEIGHTED_MISMATCH(PT_N, PT_N, 0.775); |
---|
931 | TEST_WEIGHTED_MISMATCH(PT_QU, PT_QU, 0.775); |
---|
932 | TEST_WEIGHTED_MISMATCH(PT_QU, PT_N, 0.775); |
---|
933 | } |
---|
934 | |
---|
935 | #endif // UNIT_TESTS |
---|
936 | |
---|
937 | // -------------------------------------------------------------------------------- |
---|
938 | |
---|
939 | |
---|
940 | |
---|