source: trunk/GDE/SINA/builddir/src/rw_csv.cpp

Last change on this file was 19170, checked in by westram, 2 years ago
  • sina source
    • unpack + remove tarball
    • no longer ignore sina builddir.
File size: 6.8 KB
Line 
1/*
2Copyright (c) 2006-2018 Elmar Pruesse <elmar.pruesse@ucdenver.edu>
3
4This file is part of SINA.
5SINA is free software: you can redistribute it and/or modify it under
6the terms of the GNU General Public License as published by the Free
7Software Foundation, either version 3 of the License, or (at your
8option) any later version.
9
10SINA is distributed in the hope that it will be useful, but WITHOUT ANY
11WARRANTY; without even the implied warranty of MERCHANTABILITY or
12FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
13for more details.
14
15You should have received a copy of the GNU General Public License
16along with SINA.  If not, see <http://www.gnu.org/licenses/>.
17
18Additional permission under GNU GPL version 3 section 7
19
20If you modify SINA, or any covered work, by linking or combining it
21with components of ARB (or a modified version of that software),
22containing parts covered by the terms of the
23ARB-public-library-license, the licensors of SINA grant you additional
24permission to convey the resulting work. Corresponding Source for a
25non-source form of such a combination shall include the source code
26for the parts of ARB used as well as that of the covered work.
27*/
28
29#include "rw_csv.h"
30#include "log.h"
31#include "query_arb.h"
32
33#include <boost/program_options.hpp>
34namespace po = boost::program_options;
35#include <boost/filesystem.hpp>
36namespace fs = boost::filesystem;
37#include <boost/iostreams/filtering_stream.hpp>
38#include <boost/iostreams/filter/gzip.hpp>
39#include <boost/iostreams/device/file_descriptor.hpp>
40namespace bi = boost::iostreams;
41#include <boost/algorithm/string.hpp>
42using boost::algorithm::equals;
43
44namespace sina {
45namespace rw_csv {
46
47static const char* module_name = "CSV I/O";
48static auto logger = Log::create_logger(module_name);
49
50struct options {
51    bool crlf;
52    std::string sep;
53    std::string id;
54};
55static options opts;
56
57void get_options_description(po::options_description &main,
58                             po::options_description &adv) {
59    po::options_description od(module_name);
60    od.add_options()
61        ("csv-crlf", po::bool_switch(&opts.crlf),
62         "Write CSV using CRLF line ends (as RFC4180 demands)");
63    od.add_options()
64        ("csv-sep", po::value(&opts.sep)->default_value(""),
65         "Overrides field separator. Default is ',', or TAB if the "
66         " output file ends in '.tsv'");
67    od.add_options()
68        ("csv-id", po::value(&opts.id)->default_value("name"),
69         "Override column header for sequence ID");
70    adv.add(od);
71}
72
73void validate_vm(po::variables_map &, po::options_description &) {
74}
75
76struct writer::priv_data {
77    bi::file_descriptor_sink file;
78    bi::filtering_ostream out;
79    unsigned long copy_relatives;
80    std::vector<std::string> v_fields;
81    std::vector<std::string> headers;
82    bool header_printed{false};
83    const char *line_end;
84    size_t line_end_len;
85    const char *sep;
86    size_t sep_len;
87    std::string escape_chars;
88
89    void add_newline(fmt::memory_buffer& buf) {
90        buf.append(line_end, line_end + line_end_len);
91    }
92    void add_sep(fmt::memory_buffer& buf) {
93        buf.append(sep, sep + sep_len);
94    }
95    void add_str(fmt::memory_buffer& buf, const std::string& str) {
96        if (str.find_first_of(escape_chars) == std::string::npos) {
97            buf.append(str.data(), str.data() + str.size());
98        } else {
99            const char quote[] = "\"";
100            buf.append(quote, quote + sizeof(quote) - 1);
101            size_t j = 0;
102            for (auto i = str.find('"'); i != std::string::npos; i = str.find('"', i+1)) {
103                buf.append(str.data() + j, str.data() + i);
104                buf.append(quote, quote + sizeof(quote) - 1);
105                j = i;
106            }
107            buf.append(str.data() + j, str.data() + str.size());
108            buf.append(quote, quote + sizeof(quote) - 1);
109        }
110    }
111};
112
113writer::writer(const fs::path& outfile,
114               unsigned int copy_relatives,
115               std::vector<std::string>& fields)
116    : data(new priv_data())
117{
118    data->copy_relatives = copy_relatives;
119    data->v_fields = fields;
120
121    try {
122        if (outfile == "-") {
123            data->file.open(STDOUT_FILENO, bi::never_close_handle);
124        } else {
125            data->file.open(outfile.c_str(), std::ios_base::binary);
126        }
127    } catch(std::runtime_error &e) {
128        auto msg = "Unable to open file {} for writing ('{}')";
129        throw std::runtime_error(fmt::format(msg, outfile, e.what()));
130    }
131
132    if (outfile.extension() == ".gz") {
133        data->out.push(bi::gzip_compressor());
134    }
135    data->out.push(data->file);
136
137    if (opts.sep != "") {
138        data->sep = opts.sep.c_str();
139    } else if (outfile.extension() == ".tsv"
140        || (outfile.extension() == ".gz" &&
141            outfile.extension().extension() == ".tsv")
142        ) {
143            data->sep = "\t";
144    } else {
145        data->sep = ",";
146    }
147    data->sep_len = strlen(data->sep);
148
149    if (opts.crlf) {
150        data->line_end = "\r\n";
151        data->line_end_len = 2;
152    } else {
153        data->line_end = "\n";
154        data->line_end_len = 1;
155    }
156
157    data->escape_chars = std::string("\"") + data->line_end + data->sep;
158}
159
160writer::writer(const writer&) = default;
161writer& writer::operator=(const writer&) = default;
162writer::~writer() = default;
163
164tray writer::operator()(tray t) {
165    fmt::memory_buffer buf;
166
167    if (t.aligned_sequence == nullptr) {
168        return t;
169    }
170    if (!data->header_printed) {
171        data->add_str(buf, opts.id);
172
173        if (data->v_fields.empty()
174            ||
175            (data->v_fields.size() == 1 && equals(data->v_fields[0],
176                                                  query_arb::fn_fullname))
177            ) {
178            auto attrs = t.aligned_sequence->get_attrs();
179            data->headers.reserve(attrs.size());
180            for (auto& ap : attrs) {
181                data->headers.push_back(ap.first);
182            }
183        } else {
184            data->headers.reserve(data->v_fields.size());
185            for (const auto& f: data->v_fields) {
186                data->headers.push_back(f);
187            }
188        }
189        for (const auto& header : data->headers) {
190            data->add_sep(buf);
191            data->add_str(buf, header);
192        }
193
194        data->add_newline(buf);
195        data->header_printed = true;
196    }
197
198    data->add_str(buf, t.aligned_sequence->getName());
199    for (const auto& key : data->headers) {
200        data->add_sep(buf);
201        data->add_str(buf, t.aligned_sequence->get_attr<std::string>(key));
202    }
203    data->add_newline(buf);
204
205    fmt::internal::write(data->out, buf);
206
207    return t;
208}
209
210} // namespace rw_csv
211} // namespace sina
212
213/*
214  Local Variables:
215  mode:c++
216  c-file-style:"stroustrup"
217  c-file-offsets:((innamespace . 0)(inline-open . 0)(case-label . 0))
218  indent-tabs-mode:nil
219  fill-column:99
220  End:
221*/
222// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:encoding=utf-8:textwidth=99 :
Note: See TracBrowser for help on using the repository browser.