1 | /* |
---|
2 | Copyright (c) 2006-2018 Elmar Pruesse <elmar.pruesse@ucdenver.edu> |
---|
3 | |
---|
4 | This file is part of SINA. |
---|
5 | SINA is free software: you can redistribute it and/or modify it under |
---|
6 | the terms of the GNU General Public License as published by the Free |
---|
7 | Software Foundation, either version 3 of the License, or (at your |
---|
8 | option) any later version. |
---|
9 | |
---|
10 | SINA is distributed in the hope that it will be useful, but WITHOUT ANY |
---|
11 | WARRANTY; without even the implied warranty of MERCHANTABILITY or |
---|
12 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
---|
13 | for more details. |
---|
14 | |
---|
15 | You should have received a copy of the GNU General Public License |
---|
16 | along with SINA. If not, see <http://www.gnu.org/licenses/>. |
---|
17 | |
---|
18 | Additional permission under GNU GPL version 3 section 7 |
---|
19 | |
---|
20 | If you modify SINA, or any covered work, by linking or combining it |
---|
21 | with components of ARB (or a modified version of that software), |
---|
22 | containing parts covered by the terms of the |
---|
23 | ARB-public-library-license, the licensors of SINA grant you additional |
---|
24 | permission to convey the resulting work. Corresponding Source for a |
---|
25 | non-source form of such a combination shall include the source code |
---|
26 | for the parts of ARB used as well as that of the covered work. |
---|
27 | */ |
---|
28 | |
---|
29 | #include "cseq_comparator.h" |
---|
30 | |
---|
31 | #include <boost/algorithm/string/predicate.hpp> |
---|
32 | using boost::algorithm::istarts_with; |
---|
33 | using boost::algorithm::iequals; |
---|
34 | |
---|
35 | #include <boost/foreach.hpp> |
---|
36 | |
---|
37 | #include <cmath> |
---|
38 | |
---|
39 | namespace po = boost::program_options; |
---|
40 | |
---|
41 | namespace sina { |
---|
42 | |
---|
43 | static float jukes_cantor(float in) { |
---|
44 | return -3.0/4 * log( 1.0 - 4.0/3*in); |
---|
45 | } |
---|
46 | |
---|
47 | |
---|
48 | cseq_comparator::cseq_comparator(CMP_IUPAC_TYPE iupac, CMP_DIST_TYPE dist, |
---|
49 | CMP_COVER_TYPE cover, bool filter_lc) |
---|
50 | : iupac_rule(iupac), dist_rule(dist), cover_rule(cover), |
---|
51 | filter_lc_rule(filter_lc) |
---|
52 | { |
---|
53 | } |
---|
54 | |
---|
55 | cseq_comparator::cseq_comparator() = default; |
---|
56 | |
---|
57 | template<typename FUNC> |
---|
58 | void |
---|
59 | traverse(const cseq_base& A, const cseq_base& B, FUNC F) { |
---|
60 | auto a = A.bases.begin(); |
---|
61 | auto a_end = A.bases.end(); |
---|
62 | auto b = B.bases.begin(); |
---|
63 | auto b_end = B.bases.end(); |
---|
64 | |
---|
65 | // skip filtered bases at beginning |
---|
66 | while (a != a_end && F->filtered(*a)) { |
---|
67 | ++a; |
---|
68 | } |
---|
69 | while (b != b_end && F->filtered(*b)) { |
---|
70 | ++b; |
---|
71 | } |
---|
72 | |
---|
73 | // skip filtered bases at end |
---|
74 | while (a != a_end && F->filtered(*(a_end-1)) ) { |
---|
75 | --a_end; |
---|
76 | } |
---|
77 | while (b != b_end && F->filtered(*(b_end-1)) ) { |
---|
78 | --b_end; |
---|
79 | } |
---|
80 | |
---|
81 | // count left overhang |
---|
82 | if (a->getPosition() < b->getPosition()) { |
---|
83 | // overhang in sequence A |
---|
84 | while (a != a_end && |
---|
85 | a->getPosition() < b->getPosition()) { |
---|
86 | F->overhangA(*a); |
---|
87 | ++a; |
---|
88 | } |
---|
89 | } else { |
---|
90 | // overhang in sequence B or no overhang |
---|
91 | while (b != b_end && |
---|
92 | a->getPosition() > b->getPosition()) { |
---|
93 | F->overhangB(*b); |
---|
94 | ++b; |
---|
95 | } |
---|
96 | } |
---|
97 | |
---|
98 | // count overlaping zone |
---|
99 | while (a != a_end && b != b_end) { |
---|
100 | int diff = a->getPosition() - b->getPosition(); |
---|
101 | if (diff > 0) { // a>b |
---|
102 | F->onlyB(*b); |
---|
103 | ++b; |
---|
104 | } else if (diff < 0) { // a<b |
---|
105 | F->onlyA(*a); |
---|
106 | ++a; |
---|
107 | } else { // a==b |
---|
108 | F->both(*a,*b); |
---|
109 | ++a; |
---|
110 | ++b; |
---|
111 | } |
---|
112 | } |
---|
113 | |
---|
114 | // count right overhang |
---|
115 | while (a != a_end) { F->overhangA(*a); ++a; } |
---|
116 | while (b != b_end) { F->overhangB(*b); ++b; } |
---|
117 | } |
---|
118 | |
---|
119 | struct base_comp_optimistic { |
---|
120 | bool operator()(const base_iupac& a, const base_iupac& b) const { |
---|
121 | return a.comp(b); |
---|
122 | } |
---|
123 | }; |
---|
124 | |
---|
125 | struct base_comp_pessimistic { |
---|
126 | bool operator()(const base_iupac& a, const base_iupac& b) const { |
---|
127 | return a.comp_pessimistic(b); |
---|
128 | } |
---|
129 | }; |
---|
130 | |
---|
131 | struct base_comp_exact { |
---|
132 | bool operator()(const base_iupac& a, const base_iupac& b) const { |
---|
133 | return a.comp_exact(b); |
---|
134 | } |
---|
135 | }; |
---|
136 | |
---|
137 | struct filter_none { |
---|
138 | bool filtered(const aligned_base& /*unused*/) const { |
---|
139 | return false; |
---|
140 | } |
---|
141 | }; |
---|
142 | |
---|
143 | struct filter_lowercase { |
---|
144 | bool filtered(const aligned_base& b) const { |
---|
145 | return b.isLowerCase(); |
---|
146 | } |
---|
147 | }; |
---|
148 | |
---|
149 | |
---|
150 | |
---|
151 | struct match_counter { |
---|
152 | int only_a_overhang{0}; |
---|
153 | int only_b_overhang{0}; |
---|
154 | int only_a{0}; |
---|
155 | int only_b{0}; |
---|
156 | int match{0}; |
---|
157 | int mismatch{0}; |
---|
158 | |
---|
159 | template<typename BCOMP, typename FILTER> |
---|
160 | struct counter; |
---|
161 | |
---|
162 | template<typename BCOMP, typename FILTER> |
---|
163 | counter<BCOMP, FILTER>* func() { |
---|
164 | return static_cast<counter<BCOMP, FILTER>* >(this); |
---|
165 | } |
---|
166 | }; |
---|
167 | |
---|
168 | template<typename BCOMP, typename FILTER> |
---|
169 | struct match_counter::counter : public match_counter, public FILTER { |
---|
170 | counter(const counter&); |
---|
171 | void overhangA(const aligned_base& b) { |
---|
172 | if (!FILTER::filtered(b)) { |
---|
173 | only_a_overhang++; |
---|
174 | } |
---|
175 | } |
---|
176 | void overhangB(const aligned_base& b) { |
---|
177 | if (!FILTER::filtered(b)) { |
---|
178 | only_b_overhang++; |
---|
179 | } |
---|
180 | } |
---|
181 | void onlyA(const aligned_base& b) { |
---|
182 | if (!FILTER::filtered(b)) { |
---|
183 | only_a++; |
---|
184 | } |
---|
185 | } |
---|
186 | void onlyB(const aligned_base& b) { |
---|
187 | if (!FILTER::filtered(b)) { |
---|
188 | only_b++; |
---|
189 | } |
---|
190 | } |
---|
191 | void both(const aligned_base& b1, |
---|
192 | const aligned_base& b2) { |
---|
193 | if (!FILTER::filtered(b1) && !FILTER::filtered(b2)) { |
---|
194 | if (cmp(b1,b2)) { |
---|
195 | match++; |
---|
196 | } else { |
---|
197 | mismatch++; |
---|
198 | } |
---|
199 | } else if (!FILTER::filtered(b1)) { |
---|
200 | only_a++; |
---|
201 | } else if (!FILTER::filtered(b2)) { |
---|
202 | only_b++; |
---|
203 | } |
---|
204 | } |
---|
205 | BCOMP cmp; |
---|
206 | }; |
---|
207 | |
---|
208 | |
---|
209 | |
---|
210 | float |
---|
211 | cseq_comparator::operator()(const cseq& query, const cseq& target) { |
---|
212 | match_counter m; |
---|
213 | int base; |
---|
214 | |
---|
215 | switch(iupac_rule) { |
---|
216 | case CMP_IUPAC_OPTIMISTIC: |
---|
217 | if (filter_lc_rule) { |
---|
218 | traverse(query, target, m.func<base_comp_optimistic, filter_lowercase>()); |
---|
219 | } else { |
---|
220 | traverse(query, target, m.func<base_comp_optimistic, filter_none>()); |
---|
221 | } |
---|
222 | break; |
---|
223 | case CMP_IUPAC_PESSIMISTIC: |
---|
224 | if (filter_lc_rule) { |
---|
225 | traverse(query, target, m.func<base_comp_pessimistic, filter_lowercase>()); |
---|
226 | } else { |
---|
227 | traverse(query, target, m.func<base_comp_pessimistic, filter_none>()); |
---|
228 | } |
---|
229 | break; |
---|
230 | case CMP_IUPAC_EXACT: |
---|
231 | if (filter_lc_rule) { |
---|
232 | traverse(query, target, m.func<base_comp_exact, filter_lowercase>()); |
---|
233 | } else { |
---|
234 | traverse(query, target, m.func<base_comp_exact, filter_none>()); |
---|
235 | } |
---|
236 | break; |
---|
237 | default: |
---|
238 | throw std::logic_error("unknown iupac rule"); |
---|
239 | } |
---|
240 | |
---|
241 | switch(cover_rule) { |
---|
242 | case CMP_COVER_ABS: |
---|
243 | base = 1; |
---|
244 | break; |
---|
245 | case CMP_COVER_QUERY: |
---|
246 | base = m.match + m.mismatch + m.only_a + m.only_a_overhang; |
---|
247 | break; |
---|
248 | case CMP_COVER_TARGET: |
---|
249 | base = m.match + m.mismatch + m.only_b + m.only_b_overhang; |
---|
250 | break; |
---|
251 | case CMP_COVER_OVERLAP: |
---|
252 | base = m.match + m.mismatch + m.only_a + m.only_b; |
---|
253 | break; |
---|
254 | case CMP_COVER_ALL: |
---|
255 | base = m.match + m.mismatch + m.only_a + m.only_b |
---|
256 | + m.only_a_overhang + m.only_b_overhang; |
---|
257 | break; |
---|
258 | case CMP_COVER_AVERAGE: |
---|
259 | base = m.match + m.mismatch + |
---|
260 | (m.only_a + m.only_b |
---|
261 | + m.only_a_overhang + m.only_b_overhang)/2; |
---|
262 | break; |
---|
263 | case CMP_COVER_MIN: |
---|
264 | base = m.match + m.mismatch |
---|
265 | + std::min(m.only_a + m.only_a_overhang, |
---|
266 | m.only_b + m.only_b_overhang); |
---|
267 | break; |
---|
268 | case CMP_COVER_MAX: |
---|
269 | base = m.match + m.mismatch |
---|
270 | + std::max(m.only_a + m.only_a_overhang, |
---|
271 | m.only_b + m.only_b_overhang); |
---|
272 | break; |
---|
273 | case CMP_COVER_NOGAP: |
---|
274 | base = m.match + m.mismatch; |
---|
275 | break; |
---|
276 | default: |
---|
277 | throw std::logic_error("unknown cover rule"); |
---|
278 | } |
---|
279 | |
---|
280 | float dist = (float)m.match / base; |
---|
281 | |
---|
282 | switch(dist_rule) { |
---|
283 | case CMP_DIST_NONE: |
---|
284 | break; |
---|
285 | case CMP_DIST_JC: |
---|
286 | dist = jukes_cantor(dist); |
---|
287 | break; |
---|
288 | default: |
---|
289 | throw std::logic_error("unknown dist rule"); |
---|
290 | } |
---|
291 | |
---|
292 | return dist; |
---|
293 | } |
---|
294 | |
---|
295 | |
---|
296 | ////////////////////////// OPTION PARSING ////////////////////////// |
---|
297 | |
---|
298 | void |
---|
299 | validate(boost::any& v, |
---|
300 | const std::vector<std::string>& values, |
---|
301 | CMP_IUPAC_TYPE* /*unused*/, int /*unused*/) { |
---|
302 | po::validators::check_first_occurrence(v); |
---|
303 | const std::string& s = po::validators::get_single_string(values); |
---|
304 | if (istarts_with("optimistic", s)) { |
---|
305 | v = CMP_IUPAC_OPTIMISTIC; |
---|
306 | } else if (istarts_with("pessimistic", s)) { |
---|
307 | v = CMP_IUPAC_PESSIMISTIC; |
---|
308 | } else if (istarts_with("exact", s)) { |
---|
309 | v = CMP_IUPAC_EXACT; |
---|
310 | } else { |
---|
311 | throw po::invalid_option_value |
---|
312 | ("iupac matching must be either optimistic or pessimistic"); |
---|
313 | } |
---|
314 | } |
---|
315 | |
---|
316 | std::ostream& |
---|
317 | operator<<(std::ostream& out, const CMP_IUPAC_TYPE& t) { |
---|
318 | switch(t) { |
---|
319 | case CMP_IUPAC_OPTIMISTIC: |
---|
320 | out << "optimistic"; |
---|
321 | break; |
---|
322 | case CMP_IUPAC_PESSIMISTIC: |
---|
323 | out << "pessimistic"; |
---|
324 | break; |
---|
325 | case CMP_IUPAC_EXACT: |
---|
326 | out << "exact"; |
---|
327 | break; |
---|
328 | default: |
---|
329 | out << "UNDEFINED!"; |
---|
330 | } |
---|
331 | return out; |
---|
332 | } |
---|
333 | |
---|
334 | void |
---|
335 | validate(boost::any& v, |
---|
336 | const std::vector<std::string>& values, |
---|
337 | CMP_DIST_TYPE* /*unused*/, int /*unused*/) { |
---|
338 | po::validators::check_first_occurrence(v); |
---|
339 | const std::string& s = po::validators::get_single_string(values); |
---|
340 | if (iequals(s, "none")) { |
---|
341 | v = CMP_DIST_NONE; |
---|
342 | } else if (iequals(s, "jc")) { |
---|
343 | v = CMP_DIST_JC; |
---|
344 | } else { |
---|
345 | throw po::invalid_option_value |
---|
346 | ("distance correction must be either none or jc"); |
---|
347 | } |
---|
348 | } |
---|
349 | |
---|
350 | std::ostream& |
---|
351 | operator<<(std::ostream& out, const CMP_DIST_TYPE& t) { |
---|
352 | switch(t) { |
---|
353 | case CMP_DIST_NONE: |
---|
354 | out << "none"; |
---|
355 | break; |
---|
356 | case CMP_DIST_JC: |
---|
357 | out << "jc"; |
---|
358 | break; |
---|
359 | default: |
---|
360 | out << "UNDEFINED!"; |
---|
361 | } |
---|
362 | return out; |
---|
363 | } |
---|
364 | |
---|
365 | void |
---|
366 | validate(boost::any& v, |
---|
367 | const std::vector<std::string>& values, |
---|
368 | CMP_COVER_TYPE* /*unused*/, int /*unused*/) { |
---|
369 | po::validators::check_first_occurrence(v); |
---|
370 | const std::string& s = po::validators::get_single_string(values); |
---|
371 | if (iequals(s, "abs")) { |
---|
372 | v = CMP_COVER_ABS; |
---|
373 | } else if (iequals(s, "query")) { |
---|
374 | v = CMP_COVER_QUERY; |
---|
375 | } else if (iequals(s, "target")) { |
---|
376 | v = CMP_COVER_TARGET; |
---|
377 | } else if (iequals(s, "overlap")) { |
---|
378 | v = CMP_COVER_OVERLAP; |
---|
379 | } else if (iequals(s, "all")) { |
---|
380 | v = CMP_COVER_ALL; |
---|
381 | } else if (iequals(s, "average")) { |
---|
382 | v = CMP_COVER_AVERAGE; |
---|
383 | } else if (iequals(s, "min")) { |
---|
384 | v = CMP_COVER_MIN; |
---|
385 | } else if (iequals(s, "max")) { |
---|
386 | v = CMP_COVER_MAX; |
---|
387 | } else if (iequals(s, "nogap")) { |
---|
388 | v = CMP_COVER_NOGAP; |
---|
389 | } else { |
---|
390 | throw po::invalid_option_value |
---|
391 | ("coverage type must be one of abs, query, target, overlap," |
---|
392 | "average, nogap, min or max"); |
---|
393 | } |
---|
394 | } |
---|
395 | |
---|
396 | std::ostream& |
---|
397 | operator<<(std::ostream& out, const CMP_COVER_TYPE& t) { |
---|
398 | switch(t) { |
---|
399 | case CMP_COVER_ABS: |
---|
400 | out << "abs"; |
---|
401 | break; |
---|
402 | case CMP_COVER_QUERY: |
---|
403 | out << "query"; |
---|
404 | break; |
---|
405 | case CMP_COVER_TARGET: |
---|
406 | out << "target"; |
---|
407 | break; |
---|
408 | case CMP_COVER_OVERLAP: |
---|
409 | out << "overlap"; |
---|
410 | break; |
---|
411 | case CMP_COVER_ALL: |
---|
412 | out << "all"; |
---|
413 | break; |
---|
414 | case CMP_COVER_AVERAGE: |
---|
415 | out << "average"; |
---|
416 | break; |
---|
417 | case CMP_COVER_MIN: |
---|
418 | out << "min"; |
---|
419 | break; |
---|
420 | case CMP_COVER_MAX: |
---|
421 | out << "max"; |
---|
422 | break; |
---|
423 | case CMP_COVER_NOGAP: |
---|
424 | out << "nogap"; |
---|
425 | break; |
---|
426 | default: |
---|
427 | out << "UNDEFINED!"; |
---|
428 | } |
---|
429 | return out; |
---|
430 | } |
---|
431 | |
---|
432 | |
---|
433 | po::options_description |
---|
434 | cseq_comparator::get_options_description(const char* prefix) { |
---|
435 | po::options_description od; |
---|
436 | std::string p(prefix); |
---|
437 | od.add_options() |
---|
438 | ((p + "iupac").c_str(), |
---|
439 | po::value<CMP_IUPAC_TYPE>()->default_value(CMP_IUPAC_OPTIMISTIC, ""), |
---|
440 | "strategy for comparing ambiguous bases [pessimistic|*optimistic*|exact]") |
---|
441 | |
---|
442 | ((p + "correction").c_str(), |
---|
443 | po::value<CMP_DIST_TYPE>()->default_value(CMP_DIST_NONE,""), |
---|
444 | "apply distance correction. [*none*|jc]") |
---|
445 | |
---|
446 | ((p + "cover").c_str(), |
---|
447 | po::value<CMP_COVER_TYPE>()->default_value(CMP_COVER_QUERY,""), |
---|
448 | "compute comparative measure relative to\n" |
---|
449 | " abs: 1\n" |
---|
450 | " *query: query sequence length\n" |
---|
451 | " target: target sequence length\n" |
---|
452 | " min: length of shorter sequence\n" |
---|
453 | " max: length of longer sequence\n" |
---|
454 | " avg: average length\n" |
---|
455 | " overlap: length of overlap\n" |
---|
456 | " all: columns with bases in either\n" |
---|
457 | " nogap: columns with bases in both\n") |
---|
458 | |
---|
459 | ((p + "filter-lowercase").c_str(), |
---|
460 | po::bool_switch(), |
---|
461 | "ignore bases in lowercase when comparing sequences") |
---|
462 | ; |
---|
463 | return od; |
---|
464 | } |
---|
465 | |
---|
466 | cseq_comparator |
---|
467 | cseq_comparator::make_from_variables_map(po::variables_map& vm, |
---|
468 | const char* prefix){ |
---|
469 | std::string p(prefix); |
---|
470 | |
---|
471 | CMP_IUPAC_TYPE iupac = vm[p + "iupac"].as<CMP_IUPAC_TYPE>(); |
---|
472 | CMP_DIST_TYPE dist = vm[p + "correction"].as<CMP_DIST_TYPE>(); |
---|
473 | CMP_COVER_TYPE cover = vm[p + "cover"].as<CMP_COVER_TYPE>(); |
---|
474 | bool filter_lc = vm[p + "filter-lowercase"].as<bool>(); |
---|
475 | |
---|
476 | if (CMP_COVER_ABS == cover && CMP_DIST_NONE != dist) { |
---|
477 | throw std::logic_error |
---|
478 | ("only fractional identity can be distance corrected"); |
---|
479 | } |
---|
480 | |
---|
481 | return {iupac, dist, cover, filter_lc}; |
---|
482 | } |
---|
483 | |
---|
484 | } // namespace sina |
---|
485 | |
---|
486 | /* |
---|
487 | Local Variables: |
---|
488 | mode:c++ |
---|
489 | c-file-style:"stroustrup" |
---|
490 | c-file-offsets:((innamespace . 0)(inline-open . 0)(case-label . 0)) |
---|
491 | indent-tabs-mode:nil |
---|
492 | fill-column:99 |
---|
493 | End: |
---|
494 | */ |
---|
495 | // vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:encoding=utf-8:textwidth=99 : |
---|