source: trunk/GDE/SINA/builddir/src/align.cpp

Last change on this file was 19170, checked in by westram, 2 years ago
  • sina source
    • unpack + remove tarball
    • no longer ignore sina builddir.
File size: 17.3 KB
Line 
1/*
2Copyright (c) 2006-2018 Elmar Pruesse <elmar.pruesse@ucdenver.edu>
3
4This file is part of SINA.
5SINA is free software: you can redistribute it and/or modify it under
6the terms of the GNU General Public License as published by the Free
7Software Foundation, either version 3 of the License, or (at your
8option) any later version.
9
10SINA is distributed in the hope that it will be useful, but WITHOUT ANY
11WARRANTY; without even the implied warranty of MERCHANTABILITY or
12FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
13for more details.
14
15You should have received a copy of the GNU General Public License
16along with SINA.  If not, see <http://www.gnu.org/licenses/>.
17
18Additional permission under GNU GPL version 3 section 7
19
20If you modify SINA, or any covered work, by linking or combining it
21with components of ARB (or a modified version of that software),
22containing parts covered by the terms of the
23ARB-public-library-license, the licensors of SINA grant you additional
24permission to convey the resulting work. Corresponding Source for a
25non-source form of such a combination shall include the source code
26for the parts of ARB used as well as that of the covered work.
27*/
28
29
30#include "align.h"
31#include "config.h"
32
33#include <string>
34using std::string;
35
36#include <utility>
37#include <vector>
38using std::vector;
39
40#include <list>
41using std::list;
42using std::pair;
43
44#include <iostream>
45using std::endl;
46using std::ostream;
47
48#include <fstream>
49using std::ofstream;
50
51#include <iterator>
52using std::ostream_iterator;
53
54#include <sstream>
55using std::stringstream;
56
57#include <exception>
58using std::exception;
59
60#include <algorithm>
61using std::find_if;
62
63#ifdef HAVE_TBB_MALLOC
64#  include "tbb/tbb_allocator.h"
65#endif
66
67#include <boost/shared_ptr.hpp>
68using boost::shared_ptr;
69
70#include <boost/algorithm/string/predicate.hpp>
71using boost::algorithm::iequals;
72
73#include <boost/assert.hpp>
74#include <boost/algorithm/string/find.hpp>
75
76#include <boost/program_options.hpp>
77namespace po = boost::program_options;
78
79#include <sys/types.h>
80#include <unistd.h> //for getpid()
81
82#include "query_pt.h"
83#include "pseq.h"
84#include "cseq_comparator.h"
85#include "query_arb.h"
86
87#include "log.h"
88auto logger = sina::Log::create_logger("align");
89
90#include "mesh.h"
91#include "mesh_debug.h"
92#include "mseq.h"
93
94namespace sina {
95
96template<typename SCORING_SCHEME, typename MASTER>
97void choose_transition(cseq& c, const cseq& orig, MASTER& m, SCORING_SCHEME& s, ostream& log);
98
99template<typename transition, typename MASTER>
100void do_align(cseq& c, const cseq& orig, MASTER& m, transition& tr, ostream& log);
101
102struct aligner::options {
103    bool realign;
104    OVERHANG_TYPE overhang;
105    LOWERCASE_TYPE lowercase;
106    INSERTION_TYPE insertion;
107    bool calc_idty;
108
109    bool fs_no_graph;
110    float fs_weight;
111
112    float match_score;
113    float mismatch_score;
114    float gap_penalty;
115    float gap_ext_penalty;
116
117    bool debug_graph;
118    bool write_used_rels;
119
120    bool use_subst_matrix;
121};
122struct aligner::options *aligner::opts;
123
124
125void validate(boost::any& v,
126              const std::vector<std::string>& values,
127              OVERHANG_TYPE* /*tt*/, int /*unused*/) {
128  using namespace boost::program_options;
129  validators::check_first_occurrence(v);
130  const std::string& s = validators::get_single_string(values);
131  if (iequals(s, "attach")) {
132      v = OVERHANG_ATTACH;
133  } else if (iequals(s, "remove")) {
134      v = OVERHANG_REMOVE;
135  } else if (iequals(s, "edge")) {
136      v = OVERHANG_EDGE;
137  } else {
138      throw po::invalid_option_value("must be one of 'attach', 'remove' or 'edge'");
139  }
140}
141std::ostream& operator<<(std::ostream& out, const OVERHANG_TYPE& t) {
142  switch(t) {
143  case OVERHANG_ATTACH:
144    out << "attach";
145    break;
146  case OVERHANG_REMOVE:
147    out << "remove";
148    break;
149  case OVERHANG_EDGE:
150    out << "edge";
151    break;
152  default:
153    out << "[UNKNOWN!]";
154  }
155  return out;
156}
157
158void validate(boost::any& v,
159              const std::vector<std::string>& values,
160              LOWERCASE_TYPE* /*tt*/, int /*unused*/) {
161  using namespace boost::program_options;
162  validators::check_first_occurrence(v);
163  const std::string& s = validators::get_single_string(values);
164  if (iequals(s, "none")) {
165      v = LOWERCASE_NONE;
166  } else if (iequals(s, "original")) {
167      v = LOWERCASE_ORIGINAL;
168  } else if (iequals(s, "unaligned")) {
169      v = LOWERCASE_UNALIGNED;
170  } else {
171      throw po::invalid_option_value("must be one of 'none', 'original' or 'unaligned'");
172  }
173}
174std::ostream& operator<<(std::ostream& out, const LOWERCASE_TYPE& t) {
175  switch(t) {
176  case LOWERCASE_NONE:
177    out << "none";
178    break;
179  case LOWERCASE_ORIGINAL:
180    out << "original";
181    break;
182  case LOWERCASE_UNALIGNED:
183    out << "unaligned";
184    break;
185  default:
186    out << "[UNKNOWN!]";
187  }
188  return out;
189}
190
191void validate(boost::any& v,
192              const std::vector<std::string>& values,
193              INSERTION_TYPE* /*tt*/, int /*unused*/) {
194  using namespace boost::program_options;
195  validators::check_first_occurrence(v);
196  const std::string& s = validators::get_single_string(values);
197  if (iequals(s, "shift")) {
198      v = INSERTION_SHIFT;
199  } else if (iequals(s, "forbid")) {
200      v = INSERTION_FORBID;
201  } else if (iequals(s, "remove")) {
202      v = INSERTION_REMOVE;
203  } else {
204      throw po::invalid_option_value("must be one of 'shift', 'forbid' or 'remove'");
205  }
206}
207std::ostream& operator<<(std::ostream& out, const INSERTION_TYPE& t) {
208  switch(t) {
209  case INSERTION_SHIFT:
210    out << "shift";
211    break;
212  case INSERTION_FORBID:
213    out << "forbid";
214    break;
215  case INSERTION_REMOVE:
216    out << "remove";
217    break;
218  default:
219    out << "[UNKNOWN!]";
220  }
221  return out;
222}
223
224
225void
226aligner::get_options_description(po::options_description&  /*main*/,
227                                 po::options_description& adv) {
228    opts = new struct aligner::options();
229
230    po::options_description od("Aligner");
231    od.add_options()
232        ("realign",
233         po::bool_switch(&opts->realign),
234         "do not copy alignment from reference")
235        ("overhang",
236         po::value<OVERHANG_TYPE>(&opts->overhang)->default_value(OVERHANG_ATTACH,""),
237         "select type of overhang placement [*attach*|remove|edge] ")
238        ("lowercase",
239         po::value<LOWERCASE_TYPE>(&opts->lowercase)->default_value(LOWERCASE_NONE,""),
240         "select which bases to put in lower case [*none*|original|unaligned] ")
241        ("insertion",
242         po::value<INSERTION_TYPE>(&opts->insertion)->default_value(INSERTION_SHIFT,""),
243         "handling of insertions not accomodatable by reference alignment [*shift*|forbid|remove]")
244        ("fs-no-graph",
245         po::bool_switch(&opts->fs_no_graph),
246         "use profile vector instead of DAG to as template")
247        ("fs-weight",
248         po::value<float>(&opts->fs_weight)->default_value(1,""),
249         "scales weight derived from fs base freq (1)")
250        ("match-score", 
251         po::value<float>(&opts->match_score)->default_value(2,""),
252         "score awarded for a match (2)")
253        ("mismatch-score",
254         po::value<float>(&opts->mismatch_score)->default_value(-1,""),
255         "score awarded for a mismatch (-1)")
256        ("pen-gap",
257         po::value<float>(&opts->gap_penalty)->default_value(5.0,""),
258         "gap open penalty (5)")
259        ("pen-gapext",
260         po::value<float>(&opts->gap_ext_penalty)->default_value(2.0, ""),
261         "gap extend penalty (2)")
262        ("debug-graph",
263         po::bool_switch(&opts->debug_graph),
264         "dump reference graphs to disk")
265        ("use-subst-matrix",
266         po::bool_switch(&opts->use_subst_matrix),
267         "use experimental scoring system (slow)")
268        ("write-used-rels",
269         po::bool_switch(&opts->write_used_rels),
270         "write used reference sequences to field 'used_rels'")
271        ("calc-idty",
272         po::bool_switch(&opts->calc_idty),
273         "calculate highest identity of aligned sequence with any reference")
274        ;
275
276    adv.add(od);
277}
278
279void aligner::validate_vm(boost::program_options::variables_map& /*vm*/,
280                          po::options_description& /*desc*/) {
281}
282
283} // namespace sina
284
285using namespace sina;
286
287static string
288make_datetime() {
289        time_t  t;
290        tm      tm;
291        char   buf[50];
292
293        time(&t);
294        gmtime_r(&t, &tm);
295        strftime(buf, 50, "%F %T", &tm);
296
297        return string(buf);
298}
299
300
301aligner::aligner() = default;
302aligner::~aligner() = default;
303aligner::aligner(const aligner&) = default;
304aligner& aligner::operator=(const aligner&  /*a*/) = default;
305
306
307tray
308aligner::operator()(tray t) {
309    // skip if requirements missing
310    if ((t.input_sequence == nullptr) ||
311        (t.alignment_reference == nullptr) ||
312        (t.astats == nullptr) ) {
313        logger->error("Internal error - incomplete data for alignment ({}-{}-{})",
314                      t.input_sequence == nullptr,
315                      t.alignment_reference == nullptr,
316                      t.astats == nullptr);
317        return t;
318    }
319    // prepare variables
320    cseq &c = *(new cseq(*t.input_sequence)); // working copy
321    search::result_vector &vc = *t.alignment_reference;
322    const string bases = c.getBases(); // unaligned sequence
323
324    if (opts->lowercase != LOWERCASE_ORIGINAL) {
325        c.upperCaseAll();
326    }
327
328    // sort reference sequences containing candidate to end of family
329    auto not_contains_query = [&](search::result_item& item) {
330        // FIXME: we can do this w/o converting to string
331        return !boost::algorithm::icontains(item.sequence->getBases(), bases);
332    };
333    auto begin_containing = partition(vc.begin(), vc.end(), not_contains_query);
334
335    // if there are such sequences...
336    if (begin_containing != vc.end()) {
337        if (opts->realign) { // realign means ignore those sequences
338            // FIXME: this should be done in famfinder to re-fill family
339            t.log << "sequences ";
340            for (auto it = begin_containing; it != vc.end(); ++it) {
341                t.log << it->sequence->get_attr<string>(query_arb::fn_acc) << " ";
342            }
343            t.log << "containing exact candidate removed from family;";
344            vc.erase(begin_containing, vc.end());
345            if (vc.empty()) {
346                t.log << "that's ALL of them. skipping sequence;";
347                return t;
348            }
349        } else { // otherwise, we steal the alignment
350            auto same_as_query = [&](search::result_item& item) {
351                return iequals(bases, item.sequence->getBases());
352            };
353            auto exact_match = find_if(begin_containing, vc.end(), same_as_query);
354            if (exact_match != vc.end()) {
355                c.setAlignedBases(exact_match->sequence->getAlignedBases());
356                t.log << "copied alignment from identical template sequence "
357                      << exact_match->sequence->get_attr<string>(query_arb::fn_acc) << ":"
358                      << exact_match->sequence->get_attr<string>(query_arb::fn_start, "0")
359                      << "; ";
360            } else {
361                vector<aligned_base> subalignment;
362                const vector<aligned_base>& refalignment = begin_containing->sequence->getAlignedBases();
363                string refsequence = begin_containing->sequence->getBases();
364                boost::iterator_range<string::iterator> substr = boost::ifind_first(refsequence, bases);
365                subalignment.reserve(substr.size());
366                std::copy(refalignment.begin() + std::distance(refsequence.begin(), substr.begin()),
367                          refalignment.begin() + std::distance(refsequence.begin(), substr.end()),
368                          std::back_inserter(subalignment));
369                c.setAlignedBases(subalignment);
370                t.log << "copied alignment from (longer) template sequence "
371                      << begin_containing->sequence->get_attr<string>(query_arb::fn_acc) << ":"
372                      << begin_containing->sequence->get_attr<string>(query_arb::fn_start, "0")
373                      << "; ";
374                BOOST_ASSERT(bases == c.getBases());
375            }
376
377            c.setWidth(begin_containing->sequence->getWidth());
378            c.set_attr(query_arb::fn_date, make_datetime());
379            c.set_attr(query_arb::fn_qual, 100);
380            if (opts->calc_idty) {
381                c.set_attr(query_arb::fn_idty, 100.f);
382            }
383            c.set_attr(query_arb::fn_head, 0);
384            c.set_attr(query_arb::fn_tail, 0);
385            c.set_attr(query_arb::fn_filter, "");
386            t.aligned_sequence = &c;
387            return t;
388        }
389    }
390
391    std::vector<const cseq*> vcp;
392    vcp.reserve(vc.size());
393    for (auto& r : vc) {
394        vcp.push_back(r.sequence);
395    }
396
397    if (!opts->fs_no_graph) {
398        // prepare reference
399        mseq m(vcp.begin(), vcp.end(), opts->fs_weight);
400        // (remove duplicate edges:)
401        m.sort();
402        m.reduce_edges();
403
404        if (not opts->use_subst_matrix) {
405            if (t.astats->getWidth() == 0) {
406                scoring_scheme_simple s(-opts->match_score, -opts->mismatch_score,
407                                        opts->gap_penalty, opts->gap_ext_penalty);
408                choose_transition(c, *t.input_sequence, m, s, t.log);
409            } else {
410                vector<float> weights = t.astats->getWeights();
411                scoring_scheme_weighted s(-opts->match_score, -opts->mismatch_score,
412                                          opts->gap_penalty, opts->gap_ext_penalty,
413                                          weights);
414                choose_transition(c, *t.input_sequence, m, s, t.log);
415            }
416        } else {
417            vector<float> weights(vc.begin()->sequence->getWidth(), 1.f);
418            if (t.astats->getWidth() == 0) { // FIXME: this looks broken
419                weights = t.astats->getWeights();
420            }
421            float dist = vc.begin()->score;
422            t.log << "using dist: " << dist << endl;
423            scoring_scheme_matrix<aligned_base::matrix_type>
424                s(opts->gap_penalty, opts->gap_ext_penalty, weights,
425                  t.astats->getSubstMatrix(dist));
426            choose_transition(c, *t.input_sequence, m, s, t.log);
427        }
428    } else {
429        pseq p(vcp.begin(), vcp.end());
430        scoring_scheme_profile s(-opts->match_score, -opts->mismatch_score,
431                                 opts->gap_penalty, opts->gap_ext_penalty);
432        choose_transition(c, *t.input_sequence, p, s, t.log);
433    }
434
435    if (opts->write_used_rels) {
436        stringstream tmp;
437        for (auto &s: vc) {
438            tmp << s.sequence->getName() << " ";
439        }
440        c.set_attr(query_arb::fn_used_rels, tmp.str());
441    }
442
443    if (opts->calc_idty) {
444        cseq_comparator calc_id(CMP_IUPAC_OPTIMISTIC,
445                                CMP_DIST_NONE,
446                                CMP_COVER_OVERLAP,
447                                false);
448        float idty = 0;
449        for (auto &s: vc) {
450            idty = std::max(idty, calc_id(c, *s.sequence));
451        }
452        c.set_attr(query_arb::fn_idty, 100.f*idty);
453    }
454
455    c.set_attr(query_arb::fn_date, make_datetime());
456    c.set_attr(query_arb::fn_filter, t.astats->getName());
457    t.aligned_sequence = &c;
458
459    return t;
460}
461
462template<typename SCORING_SCHEME, typename MASTER>
463void
464sina::choose_transition(cseq& c, const cseq& orig, MASTER& m,
465                  SCORING_SCHEME& s, ostream& log) {
466    if (aligner::opts->insertion == INSERTION_FORBID) {
467        transition_aspace_aware<SCORING_SCHEME, MASTER, cseq> tr(s);
468        do_align(c, orig, m, tr, log);
469    } else {
470        transition_simple<SCORING_SCHEME, MASTER, cseq> tr(s);
471        do_align(c, orig, m, tr, log);
472    }
473}
474
475template<typename transition, typename MASTER>
476void
477sina::do_align(cseq& c, const cseq& orig, MASTER &m,
478               transition &tr, ostream& log) {
479    using cnsts_type = compute_node_simple<transition>;
480    using data_type = typename cnsts_type::data_type;
481    cnsts_type cns(tr);
482
483    // create the alignment "mesh" (not quite a matrix)
484#ifdef HAVE_TBB_MALLOC
485    using mesh_t = mesh<MASTER, cseq, data_type, tbb::tbb_allocator<data_type>>;
486#else
487    using mesh_t = mesh<MASTER, cseq, data_type>;
488#endif
489    logger->debug("Allocating {}MB for alignment matrix", mesh_t::guessMem(m, c)/1024/1024);
490    mesh_t A(m, c);
491
492    int oh_head, oh_tail;
493
494    // compute values of mesh nodes
495    compute(A, cns);
496
497    // remove bases from sequence container
498    c.clearSequence();
499
500    // run backtracking on the mesh
501    float score = backtrack(A, c, tr,
502                            aligner::opts->overhang,
503                            aligner::opts->lowercase,
504                            aligner::opts->insertion,
505                            oh_head, oh_tail, log);
506    // alignment done :-)
507    c.set_attr(query_arb::fn_head, oh_head);
508    c.set_attr(query_arb::fn_tail, oh_tail);
509    c.set_attr(query_arb::fn_qual, (int)std::min(100.f, std::max(0.f, 100.f * score)));
510
511    if (aligner::opts->debug_graph) {
512        ofstream out(fmt::format("mseq_{}.dot", c.getName()));
513        m.print_graphviz(out, "reference");
514
515        for (auto part : orig.find_differing_parts(c)) {
516            mesh_to_svg(A, part.first, part.second,
517                        fmt::format("mesh_{}_{}_{}.dot",
518                                    c.getName(), part.first, part.second).c_str());
519        }
520    }
521}
522
523
524/*
525  Local Variables:
526  mode:c++
527  c-file-style:"stroustrup"
528  c-file-offsets:((innamespace . 0)(inline-open . 0)(case-label . 0))
529  indent-tabs-mode:nil
530  fill-column:99
531  End:
532*/
533// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:encoding=utf-8:textwidth=99 :
Note: See TracBrowser for help on using the repository browser.