source: branches/properties/GDE/SINA/builddir/src/famfinder.cpp

Last change on this file was 19177, checked in by westram, 3 years ago
  • let sina support CLI interface ARB7.1. reapply [18995].
  • fix CLI help: sina default is to use internal engine. reapply [19004].
File size: 21.5 KB
Line 
1/*
2Copyright (c) 2006-2018 Elmar Pruesse <elmar.pruesse@ucdenver.edu>
3
4This file is part of SINA.
5SINA is free software: you can redistribute it and/or modify it under
6the terms of the GNU General Public License as published by the Free
7Software Foundation, either version 3 of the License, or (at your
8option) any later version.
9
10SINA is distributed in the hope that it will be useful, but WITHOUT ANY
11WARRANTY; without even the implied warranty of MERCHANTABILITY or
12FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
13for more details.
14
15You should have received a copy of the GNU General Public License
16along with SINA.  If not, see <http://www.gnu.org/licenses/>.
17
18Additional permission under GNU GPL version 3 section 7
19
20If you modify SINA, or any covered work, by linking or combining it
21with components of ARB (or a modified version of that software),
22containing parts covered by the terms of the
23ARB-public-library-license, the licensors of SINA grant you additional
24permission to convey the resulting work. Corresponding Source for a
25non-source form of such a combination shall include the source code
26for the parts of ARB used as well as that of the covered work.
27*/
28
29#include "famfinder.h"
30#include "config.h"
31#include "query_pt.h"
32#include "query_arb.h"
33#include "kmer_search.h"
34#include "log.h"
35#include "cseq_comparator.h"
36
37#include <iostream>
38using std::ostream;
39
40#include <sstream>
41using std::stringstream;
42
43#include <iomanip>
44using std::setprecision;
45
46#include <string>
47using std::string;
48
49#include <vector>
50using std::vector;
51
52using std::pair;
53
54#include <exception>
55using std::logic_error;
56
57
58#include <boost/program_options.hpp>
59namespace po = boost::program_options;
60
61#include <boost/filesystem.hpp>
62namespace fs = boost::filesystem;
63
64#include <boost/algorithm/string/predicate.hpp>
65using boost::algorithm::istarts_with;
66using boost::algorithm::iequals;
67
68
69
70namespace sina {
71
72static auto logger = Log::create_logger("famfinder");
73
74
75
76struct options {
77    TURN_TYPE turn_which;
78    ENGINE_TYPE engine;
79
80    int gene_start;
81    int gene_end;
82
83    string posvar_filter;
84    string posvar_autofilter_field;
85    float  posvar_autofilter_thres;
86
87    unsigned int   fs_min;
88    unsigned int   fs_max;
89    float          fs_msc;
90    float          fs_msc_max;
91    bool           fs_leave_query_out;
92    unsigned int   fs_req;
93    unsigned int   fs_req_full;
94    unsigned int   fs_full_len;
95    unsigned int   fs_req_gaps;
96    bool           fs_no_fast;
97    unsigned int   fs_kmer_len;
98    unsigned int   fs_kmer_mm;
99    bool           fs_kmer_norel;
100    unsigned int   fs_min_len;
101    unsigned int   fs_cover_gene;
102
103    fs::path database;
104    string   pt_port;
105
106    bool oldmatch;
107    bool dont_expect_start;
108};
109
110static options opts;
111
112
113
114void validate(boost::any& v,
115              const std::vector<std::string>& values,
116              TURN_TYPE* /*tt*/, int /*unused*/) {
117    using namespace boost::program_options;
118    validators::check_first_occurrence(v);
119    const std::string& s = validators::get_single_string(values);
120    if (iequals(s, "none")) {
121        v = TURN_NONE;
122    } else if (iequals(s, "revcomp")) {
123        v = TURN_REVCOMP;
124    } else if (iequals(s, "all")) {
125        v = TURN_ALL;
126    } else {
127        throw po::invalid_option_value(s);
128    }
129}
130
131std::ostream& operator<<(std::ostream& out, const TURN_TYPE& t) {
132    switch(t) {
133    case TURN_NONE: out << "none"; break;
134    case TURN_REVCOMP: out << "revcomp"; break;
135    case TURN_ALL: out << "all"; break;
136    default: out << "[UNKNOWN!]";
137    }
138    return out;
139}
140
141void
142famfinder::get_options_description(po::options_description& main,
143                                   po::options_description& adv) {
144
145    main.add_options()
146        ("db,r", po::value<fs::path>(&opts.database), "reference database")
147        ("turn,t", po::value<TURN_TYPE>(&opts.turn_which)
148         ->default_value(TURN_NONE, "")
149         ->implicit_value(TURN_REVCOMP, ""),
150         "check other strand as well\n"
151         "'all' checks all four frames")
152        ;
153
154    po::options_description mid("Reference Selection");
155    mid.add_options()
156        ("fs-engine", po::value<ENGINE_TYPE>(&opts.engine)->default_value(ENGINE_SINA_KMER, ""),
157         "search engine to use for reference selection "
158         "[pt-server|*internal*]")
159        ("fs-kmer-len", po::value<unsigned int>(&opts.fs_kmer_len)->default_value(10u,""),
160         "length of k-mers (10)")
161        ("fs-req", po::value<unsigned int>(&opts.fs_req)->default_value(1u,""),
162         "required number of reference sequences (1)\n"
163         "queries with less matches will be dropped")
164        ("fs-min", po::value<unsigned int>(&opts.fs_min)->default_value(40u,""),
165         "number of references used regardless of shared fraction (40)")
166        ("fs-max", po::value<unsigned int>(&opts.fs_max)->default_value(40u,""),
167         "number of references used at most (40)")
168        ("fs-msc", po::value<float>(&opts.fs_msc)->default_value(.7, ""),
169         "required fractional identity of references (0.7)")
170        ("fs-req-full", po::value<unsigned int>(&opts.fs_req_full)->default_value(1u, ""),
171         "required number of full length references (1)")
172        ("fs-full-len", po::value<unsigned int>(&opts.fs_full_len)->default_value(1400u, ""),
173         "minimum length of full length reference (1400)")
174        ("fs-req-gaps", po::value<unsigned int>(&opts.fs_req_gaps)->default_value(10u, ""),
175         "ignore references with less internal gaps (10)")
176        ("fs-min-len", po::value<unsigned int>(&opts.fs_min_len)->default_value(150u, ""),
177         "minimal reference length (150)")
178        ;
179    main.add(mid);
180
181    po::options_description od("Advanced Reference Selection");
182    od.add_options()
183        ("ptdb", po::value<fs::path>(&opts.database),
184         "PT server database (old name)")
185        ("ptport", po::value<string>(&opts.pt_port)
186         ->default_value(fmt::format(":/tmp/sina_pt_{}", getpid()), ""),
187         "PT server port (:/tmp/sina_pt_<pid>)")
188        ("fs-kmer-no-fast", po::bool_switch(&opts.fs_no_fast),
189         "don't use fast family search")
190        ("fs-kmer-mm", po::value<unsigned int>(&opts.fs_kmer_mm)->default_value(0,""),
191         "allowed mismatches per k-mer (0)")
192        ("fs-kmer-norel", po::bool_switch(&opts.fs_kmer_norel),
193         "don't score k-mer distance relative to target length")
194        ("fs-msc-max", po::value<float>(&opts.fs_msc_max)->default_value(2, ""),
195         "max identity of used references (for evaluation)")
196        ("fs-leave-query-out", po::bool_switch(&opts.fs_leave_query_out),
197         "ignore candidate if found in reference (for evaluation)")
198        ("gene-start", po::value<int>(&opts.gene_start)->default_value(0,""),
199         "alignment position of first base of gene (0)")
200        ("gene-end", po::value<int>(&opts.gene_end)->default_value(0,""),
201         "alignment position of last base of gene (0)")
202        ("fs-cover-gene", po::value<unsigned int>(&opts.fs_cover_gene)->default_value(0,""),
203         "required number of references covering each gene end (0)")
204        ("filter", po::value<string>(&opts.posvar_filter)->default_value(""),
205         "select posvar filter")
206        ("auto-filter-field", po::value<string>(&opts.posvar_autofilter_field)
207         ->default_value(""), "select field for auto filter selection")
208        ("auto-filter-threshold",  po::value<float>(&opts.posvar_autofilter_thres)
209         ->default_value(0.8, ""), "quorum for auto filter selection (0.8)")
210        ("fs-oldmatch", po::bool_switch(&opts.oldmatch),
211         "use legacy family composition algorithm")
212        ("dont-expect-start",
213         po::bool_switch(&opts.dont_expect_start),
214         "do not warn about missing 'start' field")
215        ;
216    adv.add(od);
217}
218
219void famfinder::validate_vm(po::variables_map& vm,
220                            po::options_description&  /*desc*/) {
221    if (vm["db"].empty() && vm["ptdb"].empty()) {
222        throw logic_error("Family Finder: Must have reference database (--db/-r)");
223    }
224    if (not vm["ptdb"].empty()) {
225        logger->warn("Option --ptdb deprecated; please use --db/-r instead");
226    }
227    if (not vm["ptdb"].empty() && not vm["db"].empty()) {
228        throw logic_error("Family Finder: please use only new --db/-r option");
229    }
230    if (not fs::exists(opts.database)) {
231        if (opts.database.compare(":") == 0) {
232            logger->warn("Loading reference database from running ARB DB server");
233        } else {
234            throw logic_error(fmt::format("Reference database file {} does not exist",
235                                          opts.database));
236        }
237    }
238    if (vm["fs-req"].as<unsigned int>() < 1) {
239        throw logic_error("Family Finder: fs-req must be >= 1");
240    }
241    if (vm["fs-oldmatch"].as<bool>() && vm["fs-engine"].as<ENGINE_TYPE>() != ENGINE_ARB_PT) {
242        throw logic_error("Legacy family composition only available for pt-server engine");
243    }
244}
245
246ENGINE_TYPE famfinder::get_engine() {
247    return opts.engine;
248}
249
250class famfinder::impl {
251public:
252    search *index{nullptr};
253    query_arb *arb{nullptr};
254    vector<alignment_stats> vastats;
255
256    void do_turn_check(cseq& /*c*/);
257    int turn_check(const cseq& /*query*/, bool /*all*/);
258    void select_astats(tray &t);
259    void match(search::result_vector& results, const cseq& query);
260
261    impl();
262    impl(const impl&);
263    ~impl();
264    tray operator()(tray /*t*/);
265};
266
267// pimpl wrappers
268famfinder::famfinder() : pimpl(new impl()) {}
269famfinder::famfinder(const famfinder& o) = default;
270famfinder& famfinder::operator=(const famfinder& o) = default;
271famfinder::~famfinder() = default;
272tray famfinder::operator()(const tray& t) { return (*pimpl)(t); }
273
274int famfinder::turn_check(const cseq& query, bool all) {
275    return pimpl->turn_check(query, all);
276}
277
278famfinder::impl::impl()
279    : arb(query_arb::getARBDB(opts.database))
280{
281    switch(opts.engine) {
282    case ENGINE_ARB_PT:
283        index = query_pt_pool::get_pool(
284            opts.database,
285            opts.fs_kmer_len,
286            not opts.fs_no_fast,
287            opts.fs_kmer_norel,
288            opts.fs_kmer_mm,
289            opts.pt_port);
290        logger->warn("Using ARB PT server for reference search");
291        break;
292    case ENGINE_SINA_KMER:
293        index = kmer_search::get_kmer_search(
294            opts.database,
295            opts.fs_kmer_len,
296            opts.fs_no_fast);
297        logger->warn("Using internal engine for reference search");
298        break;
299    default:
300        throw std::runtime_error("Unknown sequence search engine type");
301    }
302    vastats = arb->getAlignmentStats();
303    //index->set_range(opts.gene_start, opts.gene_end);
304
305    //posvar_filter
306    //readonly
307}
308
309
310famfinder::impl::~impl() {
311    delete index;
312}
313
314
315void
316famfinder::impl::do_turn_check(cseq &c) {
317    // Turning sequence.
318    // Strictly, this could be considered a "modification" of the input
319    // sequence. However, we're really only correcting its representation.
320    // The purpose of keeping the original is so that we can compare
321    // changed made to the alignment at the end. This is way easier if we
322    // don't have to worry about sequence orientation.
323    if (opts.turn_which != TURN_NONE) {
324        switch(turn_check(c, opts.turn_which==TURN_ALL)) {
325        case 0:
326            c.set_attr(query_arb::fn_turn, "none");
327            break;
328        case 1:
329            c.set_attr(query_arb::fn_turn, "reversed");
330            c.reverse();
331            break;
332        case 2:
333            c.set_attr(query_arb::fn_turn, "complemented");
334            c.complement();
335            break;
336        case 3:
337            c.set_attr(query_arb::fn_turn, "reversed and complemented");
338            c.reverse();
339            c.complement();
340            break;
341        }
342    } else {
343        c.set_attr(query_arb::fn_turn, "turn-check disabled");
344    }
345}
346
347
348int
349famfinder::impl::turn_check(const cseq& query, bool all) {
350    search::result_vector matches;
351    double score[4];
352    index->find(query, matches, 1);
353    score[0] = matches.empty() ? 0 : matches[0].score;
354
355    cseq turn(query);
356    turn.reverse();
357    if (all) {
358        index->find(turn, matches, 1);
359        score[1] = matches.empty() ? 0 : matches[0].score;
360
361        cseq comp(query);
362        comp.complement();
363
364        index->find(comp, matches, 1);
365        score[2] = matches.empty() ? 0 : matches[0].score;
366    } else {
367        score[1] = score[2] = 0;
368    }
369
370    turn.complement();
371    index->find(turn, matches, 1);
372    score[3] = matches.empty() ? 0 : matches[0].score;
373
374    double max = 0;
375    int best = 0;
376    for (int i = 0; i < 4; i++) {
377        if (max < score[i]) {
378            max = score[i], best = i;
379        }
380    }
381    return best;
382}
383
384
385void
386famfinder::impl::select_astats(tray& t) {
387    alignment_stats *astats = nullptr;
388
389    // load default as per --filter
390    if (!opts.posvar_filter.empty()) {
391        for (alignment_stats &as: vastats) {
392            if (as.getName() == opts.posvar_filter
393                || as.getName() == opts.posvar_filter + ":ALL"
394                || as.getName() == opts.posvar_filter + ":all"
395                ) {
396                astats = &as;
397            }
398        }
399    }
400
401    // do autoselection if --auto-filter-field given
402    // this uses a quorum of the alignment reference:
403    // at least the fraction given by posvar_autofilter_thres
404    // must agree on the filter chosen
405    // fixme: prefers higher level filters, should prefer most specific
406    // filter, i.e. bacteria will always beat bacteria;proteobacteria
407    if (opts.posvar_autofilter_field.length() > 0) {
408        auto &vc = *t.alignment_reference;
409        int best_count = 0;
410        using vastats_t = pair<string, alignment_stats>;
411        alignment_stats *best = nullptr;
412        for (alignment_stats& p: vastats) {
413            string filter_name = p.getName();
414            int n = 0;
415            for (auto &r: vc) {
416                string f = opts.posvar_filter + ":" + r.sequence->get_attr<string>(opts.posvar_autofilter_field);
417                if (boost::algorithm::istarts_with(f, filter_name)) {
418                    ++n;
419                }
420            }
421
422            if (n > best_count) {
423                best_count = n;
424                best = &p;
425            }
426        }
427        if (best_count > vc.size() * opts.posvar_autofilter_thres) {
428            t.log << "autofilter: " << best->getName() << ";";
429            astats = best;
430        } else {
431            t.log << "autofilter: no match;";
432        }
433    }
434
435    if (astats == nullptr) {
436        astats = new alignment_stats();
437    }
438
439    t.astats = astats;
440}
441
442
443tray
444famfinder::impl::operator()(tray t) {
445    t.alignment_reference = new search::result_vector();
446    auto &vc = *t.alignment_reference;
447    cseq &c = *t.input_sequence;
448
449    do_turn_check(c);
450
451    // FIXME: int noid = opts.realign
452    bool noid = false;
453    if (opts.oldmatch) {
454        index->match(vc, c, opts.fs_min, opts.fs_max, opts.fs_msc, opts.fs_msc_max,
455                     arb, noid, opts.fs_min_len, opts.fs_req_full,
456                     opts.fs_full_len, opts.fs_cover_gene, opts.fs_leave_query_out);
457    } else {
458        match(vc, c);
459    }
460
461    // prepare log string for alignment reference
462    fmt::memory_buffer tmp;
463    for (auto &r: vc) {
464        if (opts.posvar_autofilter_field.length() > 0) {
465            arb->loadKey(*r.sequence, opts.posvar_autofilter_field);
466        }
467        arb->loadKey(*r.sequence, query_arb::fn_acc);
468        arb->loadKey(*r.sequence, query_arb::fn_start, false, !opts.dont_expect_start); // suppress warnings about missing 'start' field on CLI-request
469        fmt::format_to(tmp, "{}.{}:{:.2f} ",
470                       r.sequence->get_attr<string>(query_arb::fn_acc),
471                       r.sequence->get_attr<string>(query_arb::fn_start, "0"),
472                       r.score);
473    }
474    c.set_attr(query_arb::fn_family, string(tmp.data(), tmp.size()));
475
476    // remove sequences having too few gaps
477    // FIXME: this should be done in match()
478    if (opts.fs_req_gaps != 0) {
479        auto too_few_gaps = [&](search::result_item& i) {
480            return 0 == i.sequence->size()
481            || i.sequence->rbegin()->getPosition() - i.sequence->size() + 1 < opts.fs_req_gaps;
482        };
483        vc.erase(std::remove_if(vc.begin(), vc.end(), too_few_gaps), vc.end());
484    }
485
486    // load apropriate alignment statistics into tray
487    select_astats(t);
488   
489    // no reference => no alignment
490    if (vc.size() < opts.fs_req) {
491        t.log << "unable to align: too few relatives (" << vc.size() << ");";
492        delete t.alignment_reference;
493        t.alignment_reference = nullptr;
494        return t;
495    } 
496
497    return t;
498}
499
500
501void
502famfinder::impl::match(search::result_vector& results, const cseq& query) {
503    unsigned int min_match = opts.fs_min;
504    unsigned int max_match = opts.fs_max;
505    float min_score = opts.fs_msc;
506    float max_score = opts.fs_msc_max;
507    bool noid = false;
508    unsigned int min_len = opts.fs_min_len;
509    unsigned int num_full = opts.fs_req_full;
510    unsigned int full_min_len = opts.fs_full_len;
511    unsigned int range_cover = opts.fs_cover_gene;
512    bool leave_query_out = opts.fs_leave_query_out;
513
514    using item_t = search::result_item;
515
516    size_t range_begin = 0, range_end = 0;
517    auto is_full = [full_min_len](const item_t& result) {
518        return result.sequence->size() >= full_min_len;
519    };
520    auto is_range_left = [range_begin](const item_t& result) {
521        return result.sequence->begin()->getPosition() <= range_begin;
522    };
523    auto is_range_right = [range_end](const item_t& result) {
524        return result.sequence->getById(result.sequence->size()-1).getPosition() >= range_end;
525    };
526
527    size_t have = 0, have_full = 0, have_cover_left = 0, have_cover_right = 0;
528    auto count_good = [&](const item_t& result) {
529        ++have;
530        if (num_full && is_full(result)) {
531            ++have_full;
532        }
533        if (range_cover && is_range_right(result)) {
534            ++have_cover_right;
535        }
536        if (range_cover && is_range_left(result)) {
537            ++have_cover_left;
538        }
539        return false;
540    };
541
542    // matches results shorter than min_len
543    auto remove_short = [min_len](const item_t& result) {
544        return result.sequence->size() < min_len;
545    };
546
547    // matches results sharing name with query
548    auto remove_query = [&, leave_query_out](const item_t& result) {
549        return leave_query_out && query.getName() == result.sequence->getName();
550    };
551
552    // matches results containing query
553    auto remove_superstring = [&, noid](const item_t& result) {
554        return noid && boost::algorithm::icontains(result.sequence->getBases(), query.getBases());
555    };
556
557    // matches results too similar to query
558    cseq_comparator cmp(CMP_IUPAC_OPTIMISTIC, CMP_DIST_NONE, CMP_COVER_QUERY, false);
559    auto remove_similar = [&, max_score](const item_t& result) {
560        return max_score <= 2 && cmp(query, *result.sequence) > max_score;
561    };
562
563    auto min_reached = [&](const item_t&) {
564        return have >= min_match;
565    };
566    auto max_reached = [&](const item_t&) {
567        return have >= max_match;
568    };
569    auto score_good = [&](const item_t& result) {
570        return result.score < min_score;
571    };
572    auto adds_to_full = [&](const item_t& result) {
573        return num_full && have_full < num_full && is_full(result);
574    };
575    auto adds_to_range = [&](const item_t& result) {
576        return
577        (range_cover && have_cover_right < range_cover && is_range_right(result))
578        || (range_cover && have_cover_left < range_cover && is_range_left(result))
579        ;
580    };
581
582    auto remove = [&](const item_t& result) {
583        return
584        remove_short(result) ||
585        remove_query(result) ||
586        remove_superstring(result) ||
587        remove_similar(result) || (
588            min_reached(result) &&
589            (max_reached(result) || !score_good(result)) &&
590            !adds_to_full(result) &&
591            !adds_to_range(result) ) ||
592        count_good(result);
593    };
594
595    size_t max_results = max_match + 1;
596    search::result_vector::iterator from;
597    while (have < max_match || have_full < num_full ||
598           have_cover_left < range_cover || have_cover_right < range_cover) {
599
600        results.clear();
601        index->find(query, results, max_results);
602        if (results.empty()) {
603            return;
604        }
605
606        have = 0, have_full = 0, have_cover_left = 0, have_cover_right = 0;
607        from = std::remove_if(results.begin(), results.end(), remove);
608        if (max_results >= index->size()) {
609            break;
610        }
611        max_results *= 10;
612    }
613
614    results.erase(from, results.end());
615    return;
616}
617
618
619
620} // namespace sina
621
622
623#if 0
624void fixme() {
625    int termini_begin = -1, termini_end = -1;
626    string termini = arb->getFilter("termini");
627    if (!termini.empty()) {
628        termini_begin = termini.find_first_of('x')+1 ;
629        termini_end = termini.find_last_of('x')+1;
630        logger->info("Found TERMINI filter: {} - {}",
631                     termini_begin, termini_end);
632    }
633
634    // FIXME: find a good way to do this with program_options
635    if (opts.gene_start < 1) {
636        if (termini_begin == -1) {
637            opts.gene_start = 0;
638        } else {
639            opts.gene_start = termini_begin;
640        }
641    }
642    if (opts.gene_end < 1 || opts.gene_end > arb->getAlignmentWidth()) {
643        if (termini_end == -1) {
644            opts.gene_end = arb->getAlignmentWidth();
645        } else {
646            opts.gene_end = termini_end;
647        }
648    }
649    log->info("Range of gene within alignment: {} - {}",
650              opts.gene_start, opts.gene_end);
651    // decrement range ... we start at 0 around here
652    --opts.gene_start;
653    --opts.gene_end;
654}
655#endif
656
657
658/*
659  Local Variables:
660  mode:c++
661  c-file-style:"stroustrup"
662  c-file-offsets:((innamespace . 0)(inline-open . 0)(case-label . 0))
663  indent-tabs-mode:nil
664  fill-column:99
665  End:
666*/
667// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:encoding=utf-8:textwidth=99 :
Note: See TracBrowser for help on using the repository browser.