source: trunk/GDE/SINA/builddir/src/kmer.h

Last change on this file was 19170, checked in by westram, 2 years ago
  • sina source
    • unpack + remove tarball
    • no longer ignore sina builddir.
File size: 7.7 KB
Line 
1/*
2  Copyright (c) 2006-2018 Elmar Pruesse <elmar.pruesse@ucdenver.edu>
3
4  This file is part of SINA.
5  SINA is free software: you can redistribute it and/or modify it under
6  the terms of the GNU General Public License as published by the Free
7  Software Foundation, either version 3 of the License, or (at your
8  option) any later version.
9
10  SINA is distributed in the hope that it will be useful, but WITHOUT ANY
11  WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
13  for more details.
14
15  You should have received a copy of the GNU General Public License
16  along with SINA.  If not, see <http://www.gnu.org/licenses/>.
17
18  Additional permission under GNU GPL version 3 section 7
19
20  If you modify SINA, or any covered work, by linking or combining it
21  with components of ARB (or a modified version of that software),
22  containing parts covered by the terms of the
23  ARB-public-library-license, the licensors of SINA grant you additional
24  permission to convey the resulting work. Corresponding Source for a
25  non-source form of such a combination shall include the source code
26  for the parts of ARB used as well as that of the covered work.
27*/
28
29#ifndef _KMER_H_
30#define _KMER_H_
31
32#include <stdexcept> // for runtime_error
33#include <unordered_set>
34#include <vector>
35#include <iostream>
36#include "aligned_base.h"
37#include "helpers.h" // for unlikely()
38
39
40namespace sina {
41
42namespace kmer {
43
44/** converts base_iupac sequence to kmer
45 */
46class generator {
47protected:
48    const unsigned int _k;
49    const unsigned int _mask;
50    unsigned int _val;
51    unsigned int _good_count;
52
53public:
54    generator(unsigned int k, unsigned int val=0)
55        :  _k(k), _mask((1UL<<(2*k))-1), _val(val), _good_count(0)
56    {
57        if (_k < 1) {
58            throw std::runtime_error("K must be at least 1");
59        }
60        if (sizeof(_val)*8 < 2UL*k) {
61            throw std::runtime_error("K too large!");
62        }
63    }
64
65    /** shift kmer by adding a base to the right
66     *
67     * use good() to check if the new window is a valid kmer
68     */
69    void push(const base_iupac& b) {
70        if (unlikely(b.is_ambig())) {
71            _good_count = 0;
72        } else {
73            _good_count++;
74            _val <<= 2;
75            _val &= _mask;
76            _val += b.getBaseType();
77        }
78    }
79
80    /** checks if the current sequence window is a valid kmer */
81    bool good() const {
82        return _good_count >= _k;
83    }
84
85    /** implicit conversion to int representation of kmer */
86    operator unsigned int() const {
87        return _val;
88    }
89
90    /** explicit converto to int representation of kmer */
91    unsigned int val() const {
92        return _val;
93    }
94
95    /** access good count **/
96    unsigned int get_good_count() const {
97        return _good_count;
98    };
99
100    /** pretty print self **/
101    std::ostream& print_to(std::ostream& out) const {
102        for (int i = _k-1; i >= 0; --i) {
103            out << base_iupac( base_types( (_val >> (i*2)) & 3 ));
104        }
105        return out;
106    }
107};
108
109/** filter kmers by prefix */
110template<class GENERATOR>
111class prefix_filter : public GENERATOR {
112private:
113    const unsigned int _p_mask;
114    const unsigned int _p_val;
115public:
116    template<typename... ARGS>
117    prefix_filter(unsigned int k, unsigned int p_len, unsigned int p_val, ARGS&&... data)
118        : GENERATOR(k, std::forward<ARGS>(data)...),
119          _p_mask(((1<<p_len*2)-1)<<((k-p_len)*2)),
120          _p_val(p_val<<((k-p_len)*2)) {
121    }
122    bool good() const {
123        return GENERATOR::good() && (GENERATOR::val() & _p_mask) == _p_val;
124    }
125};
126
127/** filter all but first occurrence */
128template<class GENERATOR>
129class unique_filter : public GENERATOR {
130private:
131    std::unordered_set<unsigned int>& _seen;
132    bool is_good;
133public:
134    template<typename... ARGS>
135    unique_filter(std::unordered_set<unsigned int>& seen, ARGS&&... data)
136        : GENERATOR(std::forward<ARGS>(data)...),
137          _seen(seen),
138          is_good(false) {
139        seen.clear();
140        // seen.reserve(?)
141    }
142
143    void push(const base_iupac& b) {
144        GENERATOR::push(b);
145        is_good = GENERATOR::good() && _seen.insert(GENERATOR::val()).second;
146    }
147
148    bool good() const {
149        return is_good;
150    }
151};
152
153/** provide container-like access for for-loops */
154template<typename GENERATOR, typename bases>
155class iterable : GENERATOR {
156private:
157    using bases_iterator = typename bases::const_iterator;
158    bases_iterator _begin, _end;
159public:
160    template<typename... ARGS>
161    iterable(bases &v, ARGS&&... data)
162        : GENERATOR(std::forward<ARGS>(data)...),
163          _begin(v.begin()), _end(v.end())
164    {}
165
166    class iterator;
167    iterator begin() {
168        return iterator(*this, _begin, _end);
169    }
170    iterator end() {
171        return iterator(*this, _end, _end);
172    }
173
174    class iterator {
175    private:
176        iterable _generator;
177        bases_iterator _begin, _end;
178    public:
179        iterator(iterable& gen,
180                 bases_iterator& begin, bases_iterator& end)
181            : _generator(gen), _begin(begin), _end(end)
182        {
183            if (begin != end) {
184                ++*this;
185            }
186        }
187
188        iterator& operator++() {
189            do {
190                _generator.push(*_begin++);
191            } while (not _generator.good() &&_begin != _end);
192            return *this;
193        }
194
195        unsigned int operator*() const {
196            return _generator;
197        }
198
199        bool operator!=(const iterator& rhs) const {
200            return _begin != rhs._begin;
201        }
202    };
203};
204
205
206} // namespace kmer
207
208using kmer_generator = kmer::generator;
209using unique_kmer_generator = kmer::unique_filter<kmer_generator>;
210using prefix_kmer_generator = kmer::prefix_filter<kmer_generator>;
211using unique_prefix_kmer_generator = kmer::unique_filter<prefix_kmer_generator>;
212
213/**
214 * iterate over all kmers in container v
215 *
216 * params:
217 *  v   container to iterate over
218 *  k   size of k
219 *  val supply initial value (optional)
220 */
221template<typename bases, typename... ARGS>
222kmer::iterable<kmer_generator, bases>
223all_kmers(bases &v, ARGS&&... data) {
224    return kmer::iterable<kmer_generator, bases>(v, std::forward<ARGS>(data)...);
225}
226
227/**
228 * iterate over unique kmers in container v
229 *
230 * params:
231 *  v   container to iterate over
232 *  k   size of k
233 *  val supply initial value (optional)
234 */
235template<typename bases, typename... ARGS>
236kmer::iterable<unique_kmer_generator, bases>
237unique_kmers(bases &v, ARGS&&... data) {
238    return kmer::iterable<unique_kmer_generator, bases>(v, std::forward<ARGS>(data)...);
239}
240
241/**
242 * iterate over kmers in container v with specific prefix
243 *
244 * params:
245 *  v     container to iterate over
246 *  k     size of k
247 *  p_len prefix length
248 *  p_val prefix value
249 *  val   supply initial value (optional)
250 */
251template<typename bases, typename... ARGS>
252kmer::iterable<prefix_kmer_generator, bases>
253prefix_kmers(bases &v, ARGS&&... data) {
254    return kmer::iterable<prefix_kmer_generator, bases>(v, std::forward<ARGS>(data)...);
255}
256
257
258/**
259 * iterate over unique kmers in container v with specific prefix
260 *
261 * params:
262 *  v     container to iterate over
263 *  k     size of k
264 *  p_len prefix length
265 *  p_val prefix value
266 *  val   supply initial value (optional)
267 */
268template<typename bases, typename... ARGS>
269kmer::iterable<unique_prefix_kmer_generator, bases>
270unique_prefix_kmers(bases &v, ARGS&&... data) {
271    return kmer::iterable<unique_prefix_kmer_generator, bases>(v, std::forward<ARGS>(data)...);
272}
273
274
275
276} // namespace sina
277#endif // _KMER_H_
278
279/*
280  Local Variables:
281  mode:c++
282  c-file-style:"stroustrup"
283  c-file-offsets:((innamespace . 0)(inline-open . 0)(case-label . +))
284  indent-tabs-mode:nil
285  fill-column:99
286  End:
287*/
288// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:encoding=utf-8:textwidth=99 :
Note: See TracBrowser for help on using the repository browser.