source: trunk/GDE/SINA/builddir/src/rw_arb.cpp

Last change on this file was 19180, checked in by westram, 2 years ago
  • add new CLI option to pass name of alignment to use
File size: 11.2 KB
Line 
1/*
2Copyright (c) 2006-2018 Elmar Pruesse <elmar.pruesse@ucdenver.edu>
3
4This file is part of SINA.
5SINA is free software: you can redistribute it and/or modify it under
6the terms of the GNU General Public License as published by the Free
7Software Foundation, either version 3 of the License, or (at your
8option) any later version.
9
10SINA is distributed in the hope that it will be useful, but WITHOUT ANY
11WARRANTY; without even the implied warranty of MERCHANTABILITY or
12FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
13for more details.
14
15You should have received a copy of the GNU General Public License
16along with SINA.  If not, see <http://www.gnu.org/licenses/>.
17
18Additional permission under GNU GPL version 3 section 7
19
20If you modify SINA, or any covered work, by linking or combining it
21with components of ARB (or a modified version of that software),
22containing parts covered by the terms of the
23ARB-public-library-license, the licensors of SINA grant you additional
24permission to convey the resulting work. Corresponding Source for a
25non-source form of such a combination shall include the source code
26for the parts of ARB used as well as that of the covered work.
27*/
28
29#include "rw_arb.h"
30
31#include <iostream>
32#include <iomanip>
33
34#include <fstream>
35using std::istream;
36using std::ifstream;
37
38#include <string>
39using std::string;
40
41#include <sstream>
42using std::stringstream;
43
44#include <vector>
45using std::vector;
46
47#include <map>
48using std::map;
49
50#include <unordered_set>
51
52#include <boost/algorithm/string.hpp>
53using boost::split;
54using boost::is_any_of;
55
56#include "query_arb.h"
57#include "log.h"
58#include "progress.h"
59
60using namespace sina;
61namespace po = boost::program_options;
62namespace fs = boost::filesystem;
63
64static const char* module_name = "ARB I/O";
65static auto logger = Log::create_logger(module_name);
66
67
68/** Section:  Configuration
69 */
70
71struct rw_arb::options {
72    bool markcopied;
73    bool markaligned;
74    int prot_lvl;
75    string select_file;
76    int select_step;
77    int select_skip;
78    string pt_database;
79    string default_alignment;
80};
81struct rw_arb::options *rw_arb::opts;
82
83
84void
85rw_arb::get_options_description(po::options_description& /*main*/,
86                                po::options_description& adv) {
87    opts = new options();
88
89    po::options_description od(module_name);
90    od.add_options()
91        // show
92        ("arb-list-fields",
93         po::value<string>(),
94         "list available ARB fields and exit")
95
96        // alignment selection
97        ("default-alignment",
98         po::value<string>(&opts->default_alignment)->default_value(""),
99         "override name of handled alignment (default is to read default from ARB database)")
100
101        // write
102        ("markcopied",
103         po::bool_switch(&opts->markcopied),
104         "mark copied references")
105        ("markaligned",
106         po::bool_switch(&opts->markaligned),
107         "mark aligned sequences")
108        ("prot-level",
109         po::value<int>(&opts->prot_lvl)->default_value(4, ""),
110         "arb export protection level (4)")
111
112        // read
113        ("select-file",
114         po::value<string>(&opts->select_file)->default_value(""),
115         "file containting arb names to be used ('-' for STDIN)")
116        ("select-step",
117         po::value<int>(&opts->select_step)->default_value(1, ""),
118         "use every n-th sequence (1)" )
119        ("select-skip",
120         po::value<int>(&opts->select_skip)->default_value(0,""),
121         "skip the first n sequences (0)")
122        ;
123
124    adv.add(od);
125}
126
127void
128rw_arb::validate_vm(po::variables_map& vm,
129                    po::options_description& /*desc*/) {
130    if (vm.count("arb-list-fields") > 0u) {
131        auto fname = vm["arb-list-fields"].as<string>();
132        auto arb = query_arb::getARBDB(fname);
133        auto keys = arb->listKeys();
134        std::cout << "Listing ARB database keys for '" << fname << "':" << std::endl;
135        size_t width_name = 0, width_type_str = 0;
136        for (auto& key: keys) {
137            auto name = std::get<0>(key);
138            auto type_str = std::get<1>(key);
139            width_name = std::max(name.size(), width_name);
140            width_type_str = std::max(type_str.size(), width_type_str);
141        }
142        std::cout
143            << std::left
144            << std::setw(width_name) << "name" << " | "
145            << std::setw(width_type_str) << "data type" << " | "
146            << "type id"
147            << std::endl
148            << std::string(width_name + 3 + width_type_str + 3 + 7, '-')
149            << std::endl;
150        for (auto& key: keys) {
151            auto name = std::get<0>(key);
152            auto type_str = std::get<1>(key);
153            auto type = std::get<2>(key);
154            std::cout
155                << std::left
156                << std::setw(width_name) << name << " | "
157                << std::setw(width_type_str) << type_str << " | "
158                << type
159                << std::endl;
160        }
161        exit(0);
162    }
163}
164
165string
166rw_arb::get_cli_alignment() {
167    return opts->default_alignment;
168}
169
170/** Section: Reader
171 */
172
173struct rw_arb::reader::priv_data {
174    query_arb* arb{nullptr};
175    istream *in{nullptr};
176    int seqno{0};
177    int total_expected_sequences{0};
178    vector<string>& v_fields;
179    logger_progress *p{nullptr};
180    explicit priv_data(vector<string>& fields)
181        : v_fields(fields)
182    {
183    }
184
185    ~priv_data() {
186        logger->info("read {} sequences", seqno);
187    }
188};
189
190rw_arb::reader::reader() = default;
191rw_arb::reader::reader(const reader& o) = default;
192rw_arb::reader::~reader() = default;
193rw_arb::reader& rw_arb::reader::operator=(const reader& o) = default;
194
195rw_arb::reader::reader(fs::path infile,
196                       vector<string>& fields)
197    : data(new priv_data(fields))
198{
199    data->arb = query_arb::getARBDB(infile);
200
201    int n_seqs_db = data->arb->getSeqCount();
202    int n_seqs_sel = 0;
203
204    if (opts->select_file ==  "-") {
205        data->in = &std::cin;
206    } else if (!opts->select_file.empty()) {
207        data->in = new ifstream(opts->select_file.c_str());
208    } else {
209        auto *tmp = new stringstream();
210        vector<string> cache = data->arb->getSequenceNames();
211        for (auto & it : cache) {
212            *tmp << it << std::endl;
213        }
214        data->in = tmp;
215        n_seqs_sel = n_seqs_db;
216    }
217
218    // ignore first <select_skip> names
219    if (opts->select_skip) {
220        logger->info("Skipping first {} sequences", opts->select_skip);
221        string tmp;
222        for (int i = 0; i < opts->select_skip; i++) {
223            if (data->in->bad()) {
224                logger->error("After skipping {} sequences, none where left", i);
225                break;
226            }
227            (*data->in) >> tmp;
228        }
229        n_seqs_sel -= opts->select_skip;
230    }
231
232    if (opts->select_step > 1) {
233        logger->info("Processing only every {}th sequence", opts->select_step);
234        n_seqs_sel = 1 + (n_seqs_sel - 1) / opts->select_step;
235    }
236
237    if (n_seqs_sel > 0 && n_seqs_sel < n_seqs_db) {
238        logger->info("Processing {} sequences out of {} in the input database",
239                     n_seqs_sel, n_seqs_db);
240    }   
241    data->total_expected_sequences = (n_seqs_sel > 0) ? n_seqs_sel : 0;
242}
243
244void
245rw_arb::reader::set_progress(logger_progress &p) {
246    data->p = &p;
247    data->p->set_total(data->total_expected_sequences);
248}
249
250
251bool
252rw_arb::reader::operator()(tray& t) {
253    string name;
254    t.seqno = ++data->seqno;
255
256    t.input_sequence = nullptr; // we may get initialized tray
257
258    while (t.input_sequence == nullptr) {
259        if (data->in->bad()) {
260            data->total_expected_sequences = data->seqno - 1;
261            data->p->set_total(data->total_expected_sequences);
262            return false;
263        }
264
265        for (int i = 2; i <= opts->select_step; i++) {
266            (*data->in) >> name;
267        }
268        (*data->in) >> name;
269   
270        if (name.empty()) {
271            data->total_expected_sequences = data->seqno - 1;
272            data->p->set_total(data->total_expected_sequences);
273            return false;
274        }
275
276        try {
277            t.input_sequence = new cseq(data->arb->getCseq(name));
278        } catch (base_iupac::bad_character_exception& e) {
279            --data->total_expected_sequences;
280            data->p->set_total(data->total_expected_sequences);
281            logger->error("Bad character {} in sequence {}",
282                          e.character, name);
283        } catch(query_arb_exception& e) {
284            --data->total_expected_sequences;
285            data->p->set_total(data->total_expected_sequences);
286            logger->error("Read error: {}", e.what());
287        }
288    }
289    for (const auto& f: data->v_fields) {
290        data->arb->loadKey(*t.input_sequence, f);
291    }
292
293    logger->debug("loaded sequence {}", t.input_sequence->getName());
294    return true;
295}
296
297
298/** Section: Writer
299 */
300struct rw_arb::writer::priv_data {
301    query_arb *arb;
302    fs::path arb_fname;
303    int count;
304    int excluded;
305    std::unordered_set<string> relatives_written;
306    unsigned long copy_relatives;
307    vector<string>& v_fields;
308    priv_data(fs::path& arb_fname_,
309              unsigned long copy_relatives_,
310              vector<string>& fields)
311        : arb(nullptr),
312          arb_fname(arb_fname_),
313          count(0),
314          excluded(0),
315          copy_relatives(copy_relatives_),
316          v_fields(fields)
317    {
318    }
319    ~priv_data() {
320        if (arb == nullptr) { // might never have been initialized
321            return;
322        }
323        logger->info("wrote {} sequences ({} excluded, {} relatives)",
324                     count, excluded, relatives_written.size());
325        if (arb_fname.native() != ":") {
326            arb->save();
327        }
328    }   
329};
330
331rw_arb::writer::writer() = default;
332rw_arb::writer::writer(const writer& o) = default;
333rw_arb::writer::~writer() = default;
334rw_arb::writer& rw_arb::writer::operator=(const writer& o) = default;
335
336rw_arb::writer::writer(fs::path outfile, unsigned int copy_relatives,
337                       vector<string>& fields)
338    :  data(new priv_data(outfile, copy_relatives, fields))
339{
340    data->arb = query_arb::getARBDB(outfile); // might throw
341    data->arb->setProtectionLevel(opts->prot_lvl);
342}
343
344
345tray
346rw_arb::writer::operator()(tray t) {
347    if (t.aligned_sequence == nullptr) {
348        logger->info("Not writing sequence {} (>{}): not aligned",
349                     t.seqno, t.input_sequence->getName());
350        ++data->excluded;
351        return t;
352    }
353    cseq &c = *t.aligned_sequence;
354
355    data->arb->putCseq(c);
356
357    if (data->copy_relatives != 0u) {
358        // FIXME: we should copy if reference is an arb database
359        auto* relatives = t.search_result != nullptr ? t.search_result : t.alignment_reference;
360        if (relatives != nullptr) {
361            int i = data->copy_relatives;
362            for (auto& item : *relatives) {
363                if (data->relatives_written.insert( item.sequence->getName() ).second) {
364                    data->arb->putCseq(*item.sequence);
365                    data->count++;
366                }
367                if (--i == 0) {
368                    break;
369                }
370            }
371        }
372    }
373    data->count++;
374    return t;
375}
376
377/*
378  Local Variables:
379  mode:c++
380  c-file-style:"stroustrup"
381  c-file-offsets:((innamespace . 0)(inline-open . 0)(case-label . 0))
382  indent-tabs-mode:nil
383  fill-column:99
384  End:
385*/
386// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:encoding=utf-8:textwidth=99 :
Note: See TracBrowser for help on using the repository browser.