1 | /* |
---|
2 | Copyright (c) 2006-2018 Elmar Pruesse <elmar.pruesse@ucdenver.edu> |
---|
3 | |
---|
4 | This file is part of SINA. |
---|
5 | SINA is free software: you can redistribute it and/or modify it under |
---|
6 | the terms of the GNU General Public License as published by the Free |
---|
7 | Software Foundation, either version 3 of the License, or (at your |
---|
8 | option) any later version. |
---|
9 | |
---|
10 | SINA is distributed in the hope that it will be useful, but WITHOUT ANY |
---|
11 | WARRANTY; without even the implied warranty of MERCHANTABILITY or |
---|
12 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
---|
13 | for more details. |
---|
14 | |
---|
15 | You should have received a copy of the GNU General Public License |
---|
16 | along with SINA. If not, see <http://www.gnu.org/licenses/>. |
---|
17 | |
---|
18 | Additional permission under GNU GPL version 3 section 7 |
---|
19 | |
---|
20 | If you modify SINA, or any covered work, by linking or combining it |
---|
21 | with components of ARB (or a modified version of that software), |
---|
22 | containing parts covered by the terms of the |
---|
23 | ARB-public-library-license, the licensors of SINA grant you additional |
---|
24 | permission to convey the resulting work. Corresponding Source for a |
---|
25 | non-source form of such a combination shall include the source code |
---|
26 | for the parts of ARB used as well as that of the covered work. |
---|
27 | */ |
---|
28 | |
---|
29 | // template classes for alignment matrix and alignment functions |
---|
30 | #ifndef _MESH_H_ |
---|
31 | #define _MESH_H_ |
---|
32 | |
---|
33 | #include <vector> |
---|
34 | #include <iomanip> |
---|
35 | #include <iostream> |
---|
36 | #include <algorithm> |
---|
37 | #include <sstream> |
---|
38 | #include <cmath> |
---|
39 | #include <set> |
---|
40 | #include <memory> |
---|
41 | #include <string> |
---|
42 | |
---|
43 | |
---|
44 | #include "align.h" |
---|
45 | #include "graph.h" |
---|
46 | #include "scoring_schemes.h" |
---|
47 | #include "buffer.h" |
---|
48 | |
---|
49 | namespace sina { |
---|
50 | |
---|
51 | template<typename SEQ_MASTER, // mseq |
---|
52 | typename SEQ_SLAVE, // cseq |
---|
53 | typename DATA, // data_type |
---|
54 | typename ALLOCATOR = std::allocator<DATA> > |
---|
55 | class mesh |
---|
56 | { |
---|
57 | public: |
---|
58 | using master_type = SEQ_MASTER; |
---|
59 | using master_iterator_type = typename master_type::iterator; |
---|
60 | using master_idx_type = typename master_type::idx_type; |
---|
61 | |
---|
62 | using slave_type = SEQ_SLAVE; |
---|
63 | using slave_iterator_type = typename slave_type::iterator; |
---|
64 | using slave_idx_type = typename slave_type::idx_type; |
---|
65 | |
---|
66 | using value_type = DATA; |
---|
67 | using vector_type = aligned_buffer<value_type>; |
---|
68 | using size_type = typename vector_type::size_type; |
---|
69 | |
---|
70 | class iterator; |
---|
71 | |
---|
72 | mesh() = delete; |
---|
73 | mesh(SEQ_MASTER &m, SEQ_SLAVE &s) |
---|
74 | : _master(m), |
---|
75 | _slave(s), |
---|
76 | _data(m.size() * s.size()), |
---|
77 | _master_begin(_master.begin()), |
---|
78 | _slave_begin(_slave.begin()) |
---|
79 | {} |
---|
80 | |
---|
81 | |
---|
82 | template<typename IT_M, typename IT_S> |
---|
83 | value_type& |
---|
84 | operator()(const IT_M& itm, const IT_S& its) { |
---|
85 | return (*this)(get_node_id(_master,itm), |
---|
86 | get_node_id(_slave,its)); |
---|
87 | } |
---|
88 | |
---|
89 | value_type& |
---|
90 | operator()(const master_idx_type& mi, const slave_idx_type& si) { |
---|
91 | return _data[mi * _slave.size() + si]; |
---|
92 | } |
---|
93 | |
---|
94 | value_type& |
---|
95 | operator()(size_type s) { |
---|
96 | return _data[s]; |
---|
97 | } |
---|
98 | |
---|
99 | iterator |
---|
100 | begin() { |
---|
101 | return iterator(*this, _master.begin(), _slave.begin()); |
---|
102 | } |
---|
103 | |
---|
104 | iterator |
---|
105 | end() { |
---|
106 | return iterator(*this, _master.end(), _slave.end()); |
---|
107 | } |
---|
108 | |
---|
109 | // private: |
---|
110 | SEQ_MASTER _master; |
---|
111 | SEQ_SLAVE _slave; |
---|
112 | vector_type _data; |
---|
113 | |
---|
114 | master_iterator_type _master_begin; |
---|
115 | slave_iterator_type _slave_begin; |
---|
116 | |
---|
117 | template<typename T> friend void compute(T&); |
---|
118 | |
---|
119 | static size_type guessMem(SEQ_MASTER &m, SEQ_SLAVE &s) { |
---|
120 | return sizeof(value_type) * m.size() * s.size(); |
---|
121 | } |
---|
122 | |
---|
123 | #if 0 |
---|
124 | // unused code |
---|
125 | |
---|
126 | /* == missing in mseq |
---|
127 | bool |
---|
128 | operator==(const mesh& rhs) { |
---|
129 | return ( (_data == rhs._data) && (_master == rhs._master) && (_slave == rhs._master) ); |
---|
130 | } |
---|
131 | */ |
---|
132 | |
---|
133 | |
---|
134 | void |
---|
135 | reuse(SEQ_MASTER &m, SEQ_SLAVE &s) { |
---|
136 | _master=m; |
---|
137 | _slave=s; |
---|
138 | unsigned int needed = m.size() * s.size(); |
---|
139 | if (_data.size() < needed) _data.resize(needed); |
---|
140 | _master_begin=m.begin(); |
---|
141 | _slave_begin=s.begin(); |
---|
142 | } |
---|
143 | #endif |
---|
144 | }; |
---|
145 | |
---|
146 | template<typename SEQ_MASTER, // mseq, pseq |
---|
147 | typename SEQ_SLAVE, // cseq |
---|
148 | typename DATA, |
---|
149 | typename ALLOCATOR> |
---|
150 | class mesh<SEQ_MASTER, |
---|
151 | SEQ_SLAVE, |
---|
152 | DATA, |
---|
153 | ALLOCATOR>::iterator { |
---|
154 | public: |
---|
155 | using mesh_type = mesh<SEQ_MASTER, SEQ_SLAVE, DATA, ALLOCATOR>; |
---|
156 | using master_type = SEQ_MASTER; |
---|
157 | using miterator_type = typename master_type::iterator; |
---|
158 | using midx_type = typename master_type::idx_type; |
---|
159 | using slave_type = SEQ_SLAVE; |
---|
160 | using siterator_type = typename slave_type::iterator; |
---|
161 | using sidx_type = typename slave_type::idx_type; |
---|
162 | using value_type = DATA; |
---|
163 | using size_type = typename mesh_type::size_type; |
---|
164 | |
---|
165 | iterator(mesh_type& _mesh, |
---|
166 | const miterator_type& _mit, |
---|
167 | const siterator_type& _sit) |
---|
168 | : mymesh(_mesh), mit(_mit), sit(_sit) |
---|
169 | { |
---|
170 | sidx = get_node_id(mymesh._slave, sit); |
---|
171 | midx = get_node_id(mymesh._master, mit); |
---|
172 | } |
---|
173 | |
---|
174 | iterator& |
---|
175 | operator++() { |
---|
176 | ++sit; |
---|
177 | sidx = get_node_id(mymesh._slave, sit); |
---|
178 | |
---|
179 | if (sit != mymesh._slave.end()) { |
---|
180 | return *this; |
---|
181 | } |
---|
182 | |
---|
183 | ++mit; |
---|
184 | midx = get_node_id(mymesh._master, mit); |
---|
185 | |
---|
186 | if (mit == mymesh._master.end()) { |
---|
187 | return *this; |
---|
188 | } |
---|
189 | |
---|
190 | sit = mymesh._slave.begin(); |
---|
191 | sidx = get_node_id(mymesh._slave,sit); |
---|
192 | return *this; |
---|
193 | } |
---|
194 | |
---|
195 | bool |
---|
196 | operator==(const iterator& rhs) const { |
---|
197 | return ( |
---|
198 | // == missing in mseq and therefor in mesh |
---|
199 | // (mymesh == rhs.mymesh) && == |
---|
200 | (mit == rhs.mit) && |
---|
201 | (sit == rhs.sit) |
---|
202 | ); |
---|
203 | } |
---|
204 | |
---|
205 | bool operator!=(const iterator& rhs) const { return !(*this == rhs); } |
---|
206 | |
---|
207 | value_type& |
---|
208 | operator*() { |
---|
209 | return mymesh(*this); |
---|
210 | } |
---|
211 | |
---|
212 | operator size_type() { |
---|
213 | return (midx * mymesh._slave.size() + sidx); |
---|
214 | } |
---|
215 | |
---|
216 | midx_type getMidx() const { return midx; } |
---|
217 | sidx_type getSidx() const { return sidx; } |
---|
218 | |
---|
219 | mesh_type &mymesh; |
---|
220 | miterator_type mit; |
---|
221 | midx_type midx; |
---|
222 | siterator_type sit; |
---|
223 | sidx_type sidx; |
---|
224 | }; |
---|
225 | |
---|
226 | |
---|
227 | template<typename SEQ_MASTER, typename SEQ_SLAVE, typename DATA> |
---|
228 | std::ostream& |
---|
229 | operator<<(std::ostream& out, mesh<SEQ_MASTER,SEQ_SLAVE,DATA> m) { |
---|
230 | typename SEQ_MASTER::iterator mit = m._master.begin(); |
---|
231 | typename SEQ_MASTER::iterator mit_end = m._master.end(); |
---|
232 | typename SEQ_SLAVE ::iterator sit_end = m._slave.end(); |
---|
233 | |
---|
234 | out << "BEGIN MESH DATA" << std::endl; |
---|
235 | out << "SEQ_MASTER: " << m._master << std::endl; |
---|
236 | out << "SEQ_SLAVE: " << m._slave << std::endl; |
---|
237 | |
---|
238 | { |
---|
239 | typename SEQ_SLAVE::iterator sit = m._slave.begin(); |
---|
240 | out << " "; |
---|
241 | for (; sit!=sit_end; ++sit) { |
---|
242 | out << std::setw(5) << *sit << " "; |
---|
243 | } |
---|
244 | out << std::endl; |
---|
245 | } |
---|
246 | |
---|
247 | for (; mit!=mit_end; ++mit) { |
---|
248 | out << std::setw(2) << get_node_id(m._master,mit) << ":" |
---|
249 | << std::setw(5) << *mit << " "; |
---|
250 | typename SEQ_SLAVE::iterator sit=m._slave.begin(); |
---|
251 | for (; sit!=sit_end; ++sit) { |
---|
252 | out << std::setw(5) << m(mit,sit).value << " "; |
---|
253 | } |
---|
254 | out << std::endl; |
---|
255 | } |
---|
256 | out << "END MESH DATA" << std::endl; |
---|
257 | return out; |
---|
258 | } |
---|
259 | |
---|
260 | template<typename SCORING_SCHEME, |
---|
261 | typename SEQ_MASTER, // mseq, pseq |
---|
262 | typename SEQ_SLAVE> // cseq |
---|
263 | class transition_simple { |
---|
264 | public: |
---|
265 | using master_type = SEQ_MASTER; |
---|
266 | using slave_type = SEQ_SLAVE; |
---|
267 | using value_type = typename SCORING_SCHEME::value_type; |
---|
268 | struct data_type; |
---|
269 | using mesh_type = mesh<master_type, slave_type, data_type>; |
---|
270 | using mesh_iterator_type = typename mesh_type::iterator; |
---|
271 | |
---|
272 | //private: |
---|
273 | const SCORING_SCHEME& s; |
---|
274 | |
---|
275 | public: |
---|
276 | |
---|
277 | explicit transition_simple(const SCORING_SCHEME& _s) |
---|
278 | : s(_s) |
---|
279 | { |
---|
280 | } |
---|
281 | |
---|
282 | struct data_type { |
---|
283 | typename master_type::idx_type value_midx; |
---|
284 | typename slave_type::idx_type value_sidx; |
---|
285 | typename master_type::idx_type gapm_idx; |
---|
286 | typename slave_type::idx_type gaps_idx; |
---|
287 | |
---|
288 | value_type value; |
---|
289 | value_type gapm_val; |
---|
290 | value_type gaps_val; |
---|
291 | |
---|
292 | bool operator<(const data_type& o) const { return value<o.value; } |
---|
293 | |
---|
294 | void init_edge() { |
---|
295 | value = gapm_val = gaps_val = 1, |
---|
296 | value_midx = value_sidx = gapm_idx = gaps_idx = 0; |
---|
297 | } |
---|
298 | void init() { |
---|
299 | value = gapm_val = gaps_val = 1000000, |
---|
300 | value_midx = value_sidx = gapm_idx = gaps_idx = 0; |
---|
301 | } |
---|
302 | }; |
---|
303 | |
---|
304 | |
---|
305 | template<typename BASE_TYPE_A, typename BASE_TYPE_B> |
---|
306 | void |
---|
307 | deletion(const data_type &src, data_type &dest, |
---|
308 | const BASE_TYPE_A& b1, const BASE_TYPE_B& b2, |
---|
309 | typename SEQ_MASTER::idx_type midx, |
---|
310 | typename SEQ_MASTER::idx_type sidx) { |
---|
311 | |
---|
312 | value_type value = s.deletion(src.value, b1, b2); |
---|
313 | value_type gap_val = s.deletion_ext(src.gapm_val, b1, b2, 0); |
---|
314 | |
---|
315 | if (value < gap_val) { |
---|
316 | dest.gapm_val = value; |
---|
317 | dest.gapm_idx = midx; |
---|
318 | } else { |
---|
319 | dest.gapm_val = gap_val; |
---|
320 | dest.gapm_idx = src.gapm_idx; |
---|
321 | value = gap_val; |
---|
322 | midx = src.gapm_idx; |
---|
323 | } |
---|
324 | |
---|
325 | if ( value < dest.value ) { |
---|
326 | dest.value = value; |
---|
327 | dest.value_midx = midx; |
---|
328 | dest.value_sidx = sidx; |
---|
329 | } |
---|
330 | } |
---|
331 | |
---|
332 | template<typename BASE_TYPE_A, typename BASE_TYPE_B> |
---|
333 | void |
---|
334 | insertion(const data_type &src, data_type &dest, |
---|
335 | const BASE_TYPE_A& b1, const BASE_TYPE_B& b2, |
---|
336 | typename SEQ_MASTER::idx_type midx, |
---|
337 | typename SEQ_SLAVE::idx_type sidx, |
---|
338 | typename SEQ_SLAVE::idx_type /*smax*/) { |
---|
339 | |
---|
340 | if (src.gaps_val != src.value ) { |
---|
341 | // opening gap |
---|
342 | dest.gaps_val = s.insertion(src.value, b1, b2); |
---|
343 | dest.gaps_idx = sidx; |
---|
344 | } else { |
---|
345 | // extending gap |
---|
346 | dest.gaps_val = s.insertion_ext(src.gaps_val, b1, b2, |
---|
347 | sidx-src.gaps_idx); |
---|
348 | dest.gaps_idx = src.gaps_idx; |
---|
349 | } |
---|
350 | |
---|
351 | if ( dest.gaps_val <= dest.value ) { |
---|
352 | // if gap open/extend is currently the best |
---|
353 | // option, remember this |
---|
354 | dest.value = dest.gaps_val; |
---|
355 | dest.value_sidx = dest.gaps_idx; |
---|
356 | dest.value_midx = midx; |
---|
357 | } |
---|
358 | } |
---|
359 | |
---|
360 | template<typename BASE_TYPE_A, typename BASE_TYPE_B> |
---|
361 | void |
---|
362 | match(const data_type &src, data_type &dest, |
---|
363 | const BASE_TYPE_A& b1, const BASE_TYPE_B& b2, |
---|
364 | typename SEQ_MASTER::idx_type midx, |
---|
365 | typename SEQ_SLAVE::idx_type sidx) { |
---|
366 | |
---|
367 | value_type value = s.match(src.value, b1, b2) ; |
---|
368 | |
---|
369 | if (value < dest.value) { |
---|
370 | dest.value = value; |
---|
371 | dest.value_midx = midx; |
---|
372 | dest.value_sidx = sidx; |
---|
373 | } |
---|
374 | } |
---|
375 | }; |
---|
376 | |
---|
377 | template<typename SCORING_SCHEME, |
---|
378 | typename SEQ_MASTER_, |
---|
379 | typename SEQ_SLAVE_> |
---|
380 | class transition_aspace_aware |
---|
381 | : public transition_simple<SCORING_SCHEME, SEQ_MASTER_, SEQ_SLAVE_> { |
---|
382 | public: |
---|
383 | typedef transition_simple<SCORING_SCHEME, |
---|
384 | SEQ_MASTER_, SEQ_SLAVE_> transition; |
---|
385 | explicit transition_aspace_aware(const SCORING_SCHEME& _s) |
---|
386 | : transition(_s) |
---|
387 | { |
---|
388 | } |
---|
389 | |
---|
390 | struct data_type |
---|
391 | : public transition::data_type { |
---|
392 | typename transition::slave_type::idx_type gaps_max; |
---|
393 | void init_edge() { |
---|
394 | transition::data_type::init_edge(); |
---|
395 | gaps_max = 0; |
---|
396 | } |
---|
397 | void init() { |
---|
398 | transition::data_type::init(); |
---|
399 | gaps_max = 0; |
---|
400 | } |
---|
401 | }; |
---|
402 | |
---|
403 | template<typename BASE_TYPE_A, typename BASE_TYPE_B> |
---|
404 | void |
---|
405 | insertion(const data_type &src, data_type &dest, |
---|
406 | const BASE_TYPE_A& b1, const BASE_TYPE_B& b2, |
---|
407 | typename transition::master_type::idx_type midx, |
---|
408 | typename transition::slave_type::idx_type sidx, |
---|
409 | typename transition::slave_type::idx_type smax) { |
---|
410 | |
---|
411 | if (smax < 1) { |
---|
412 | return; // can't insert |
---|
413 | } |
---|
414 | |
---|
415 | if (src.gaps_val != src.value ) { |
---|
416 | // opening gap |
---|
417 | dest.gaps_val = transition::s.insertion(src.value, b1, b2); |
---|
418 | dest.gaps_idx = sidx; |
---|
419 | dest.gaps_max = smax-1; |
---|
420 | } else if (src.gaps_max > 0) { |
---|
421 | // extending gap |
---|
422 | dest.gaps_val = transition::s.insertion_ext(src.gaps_val, b1, b2, |
---|
423 | sidx-src.gaps_idx); |
---|
424 | dest.gaps_idx = src.gaps_idx; |
---|
425 | dest.gaps_max = src.gaps_max-1; |
---|
426 | } else { |
---|
427 | return; |
---|
428 | } |
---|
429 | |
---|
430 | if ( dest.gaps_val <= dest.value ) { |
---|
431 | // if gap open/extend is currently the best |
---|
432 | // option, remember this |
---|
433 | dest.value = dest.gaps_val; |
---|
434 | dest.value_sidx = dest.gaps_idx; |
---|
435 | dest.value_midx = midx; |
---|
436 | } |
---|
437 | } |
---|
438 | }; |
---|
439 | |
---|
440 | /* computes all possible transitions to this node and choses the best */ |
---|
441 | template<typename TRANSITION> |
---|
442 | struct compute_node_simple { |
---|
443 | using master_type = typename TRANSITION::master_type; |
---|
444 | using slave_type = typename TRANSITION::slave_type; |
---|
445 | using data_type = typename TRANSITION::data_type; |
---|
446 | using midx_type = typename master_type::idx_type; |
---|
447 | using sidx_type = typename slave_type::idx_type; |
---|
448 | using mpit_type = typename master_type::pn_iterator; |
---|
449 | using spit_type = typename slave_type::pn_iterator; |
---|
450 | |
---|
451 | explicit compute_node_simple(TRANSITION _t) : t(_t) {} |
---|
452 | |
---|
453 | template<typename T> |
---|
454 | void |
---|
455 | calc(T& it) |
---|
456 | { |
---|
457 | using mesh_type = typename T::mesh_type; |
---|
458 | mesh_type &mesh = it.mymesh; |
---|
459 | |
---|
460 | typename master_type::iterator m = it.mit; |
---|
461 | typename slave_type::iterator s = it.sit; |
---|
462 | |
---|
463 | mpit_type mpit_end = prev_end(mesh._master, m); |
---|
464 | spit_type spit_end = prev_end(mesh._slave, s); |
---|
465 | midx_type midx = it.getMidx(); |
---|
466 | sidx_type sidx = it.getSidx(); |
---|
467 | |
---|
468 | data_type d; |
---|
469 | if (prev_begin(mesh._master,m) == mpit_end || s == mesh._slave.begin()) { |
---|
470 | d.init_edge(); |
---|
471 | } else { |
---|
472 | d.init(); |
---|
473 | } |
---|
474 | |
---|
475 | for (mpit_type mpit = prev_begin(mesh._master,m); mpit != mpit_end; ++mpit) { |
---|
476 | midx_type mi = get_node_id(mesh._master,mpit); |
---|
477 | t.deletion(mesh(mi,sidx),d,*m,*s,mi,sidx); |
---|
478 | } |
---|
479 | |
---|
480 | unsigned int min_mpos = 1000000; |
---|
481 | for (mpit_type mpit = next_begin(mesh._master,m); mpit != next_end(mesh._master,m); ++mpit) { |
---|
482 | min_mpos = std::min(min_mpos, mpit->getPosition()); |
---|
483 | } |
---|
484 | int max_insert = min_mpos - m->getPosition() - 1; |
---|
485 | |
---|
486 | for (spit_type spit = prev_begin(mesh._slave,s); |
---|
487 | spit != spit_end; ++spit) { |
---|
488 | sidx_type si = get_node_id(mesh._slave,spit); |
---|
489 | t.insertion(mesh(midx,si),d,*m,*s,midx,si,max_insert); |
---|
490 | } |
---|
491 | |
---|
492 | for (mpit_type mpit = prev_begin(mesh._master,m); mpit != mpit_end; ++mpit) { |
---|
493 | midx_type mi = get_node_id(mesh._master,mpit); |
---|
494 | for (spit_type spit = prev_begin(mesh._slave,s); |
---|
495 | spit != spit_end; ++spit) { |
---|
496 | sidx_type si = get_node_id(mesh._slave,spit); |
---|
497 | |
---|
498 | t.match(mesh(mi,si),d,*m,*s,mi,si); |
---|
499 | } |
---|
500 | } |
---|
501 | *it = d; |
---|
502 | } |
---|
503 | TRANSITION t; |
---|
504 | }; |
---|
505 | |
---|
506 | |
---|
507 | /* computes values in a mesh by executing NODE_COMPUTOR for every |
---|
508 | position */ |
---|
509 | template<typename MESH_TYPE, |
---|
510 | typename NODE_COMPUTOR> |
---|
511 | void |
---|
512 | compute(MESH_TYPE& mesh, NODE_COMPUTOR& nc) { |
---|
513 | using master_iterator_type = typename MESH_TYPE::master_iterator_type; |
---|
514 | using slave_iterator_type = typename MESH_TYPE::slave_iterator_type; |
---|
515 | |
---|
516 | mesh._master.sort(); |
---|
517 | mesh._slave.sort(); |
---|
518 | |
---|
519 | master_iterator_type m = mesh._master.begin(); |
---|
520 | master_iterator_type mend = mesh._master.end(); |
---|
521 | slave_iterator_type send = mesh._slave.end(); |
---|
522 | |
---|
523 | typename MESH_TYPE::iterator it = mesh.begin(), end = mesh.end(); |
---|
524 | |
---|
525 | for (;it != end ; ++it) { |
---|
526 | nc.calc(it); |
---|
527 | } |
---|
528 | } |
---|
529 | |
---|
530 | |
---|
531 | /** backtrack creates the cseq result sequence from a mesh that has |
---|
532 | * been computed. returns float score. |
---|
533 | */ |
---|
534 | template<typename MESH_TYPE, typename TRANSITION> |
---|
535 | float |
---|
536 | backtrack(MESH_TYPE& mesh, cseq& out, TRANSITION &tr, |
---|
537 | OVERHANG_TYPE overhang_pos, LOWERCASE_TYPE lowercase, |
---|
538 | INSERTION_TYPE insertion, |
---|
539 | int& cutoff_head, int& cutoff_tail, |
---|
540 | std::ostream& log) { |
---|
541 | using master_type = typename MESH_TYPE::master_type; |
---|
542 | using slave_type = typename MESH_TYPE::slave_type; |
---|
543 | using master_idx_type = typename MESH_TYPE::master_idx_type; |
---|
544 | using slave_idx_type = typename MESH_TYPE::slave_idx_type; |
---|
545 | using m_pnit_type = typename master_type::pn_iterator; |
---|
546 | using s_pnit_type = typename slave_type::pn_iterator; |
---|
547 | using master_base_type = typename master_type::value_type; |
---|
548 | using slave_base_type = typename slave_type::value_type; |
---|
549 | |
---|
550 | master_idx_type alig_width = mesh._master.getWidth(); |
---|
551 | |
---|
552 | // find outer nodes in slave sequence (hack... :/ ) |
---|
553 | slave_idx_type sbegin = get_node_id(mesh._slave, mesh._slave.begin()); |
---|
554 | slave_idx_type send = get_node_id(mesh._slave, --mesh._slave.end()); |
---|
555 | |
---|
556 | // find outer nodes in reference graph |
---|
557 | std::set<master_idx_type> set_mbegin, set_mend; |
---|
558 | for (m_pnit_type pit = mesh._master.pn_first_begin(); |
---|
559 | pit != mesh._master.pn_first_end(); ++pit) { |
---|
560 | set_mbegin.insert(get_node_id(mesh._master, pit)); |
---|
561 | } |
---|
562 | for (m_pnit_type pit = mesh._master.pn_last_begin(); |
---|
563 | pit != mesh._master.pn_last_end(); ++pit) { |
---|
564 | set_mend.insert(get_node_id(mesh._master, pit)); |
---|
565 | } |
---|
566 | |
---|
567 | // find starting point for alignment |
---|
568 | // check right side |
---|
569 | master_idx_type m = get_node_id(mesh._master, |
---|
570 | mesh._master.pn_last_begin()); |
---|
571 | // match last slave base with each graph node |
---|
572 | for (typename master_type::iterator pit = mesh._master.begin(); |
---|
573 | pit != mesh._master.end(); ++pit) { |
---|
574 | master_idx_type tmp = get_node_id(mesh._master, pit); |
---|
575 | if (mesh(tmp,send)<mesh(m,send)) { |
---|
576 | m=tmp; |
---|
577 | } |
---|
578 | } |
---|
579 | slave_idx_type s = send; |
---|
580 | // for each graph end-node |
---|
581 | for (m_pnit_type pit = mesh._master.pn_last_begin(); |
---|
582 | pit != mesh._master.pn_last_end(); ++pit) { |
---|
583 | master_idx_type mtmp = get_node_id(mesh._master, pit); |
---|
584 | // for each |
---|
585 | for (s_pnit_type sit = mesh._slave.begin(); |
---|
586 | sit != mesh._slave.end(); ++sit) { |
---|
587 | slave_idx_type stmp = get_node_id(mesh._slave, sit); |
---|
588 | if (mesh(mtmp, stmp)<mesh(m,s)) { |
---|
589 | m=mtmp, s=stmp; |
---|
590 | } |
---|
591 | } |
---|
592 | } |
---|
593 | |
---|
594 | // handle right hand overhang |
---|
595 | cutoff_tail = send - s; |
---|
596 | if (cutoff_tail && overhang_pos != OVERHANG_REMOVE) { |
---|
597 | int pos; |
---|
598 | if (overhang_pos == OVERHANG_ATTACH) { |
---|
599 | pos = alig_width -1 - mesh._master.getById(m).getPosition() - cutoff_tail; |
---|
600 | } else { // OVERHANG_EDGE |
---|
601 | pos = 0; |
---|
602 | } |
---|
603 | auto it = mesh._slave.rbegin(); |
---|
604 | auto end = it + cutoff_tail; |
---|
605 | |
---|
606 | if (lowercase == LOWERCASE_UNALIGNED) { |
---|
607 | for (; it != end; ++it) { |
---|
608 | out.append(aligned_base(std::max(0, pos++), it->getBase().setLowerCase())); |
---|
609 | } |
---|
610 | } else { |
---|
611 | for (; it != end; ++it) { |
---|
612 | out.append(aligned_base(std::max(0, pos++), it->getBase())); |
---|
613 | } |
---|
614 | } |
---|
615 | } |
---|
616 | |
---|
617 | // calculate score |
---|
618 | auto rval = (float)mesh(m,s).value; |
---|
619 | |
---|
620 | unsigned int pos=alig_width -1 -mesh._master.getById(m).getPosition(); |
---|
621 | float sum_weight=0; |
---|
622 | int aligned_bases=0; |
---|
623 | |
---|
624 | // add pair slave base with position from reference base and |
---|
625 | // append the newly aligned base to result vector. |
---|
626 | slave_base_type ab1(pos,mesh._slave.getById(s).getBase()); |
---|
627 | out.append(ab1); |
---|
628 | aligned_bases++; |
---|
629 | |
---|
630 | // count used weights |
---|
631 | { |
---|
632 | // create a copy of the master base object |
---|
633 | master_base_type ab2(mesh._master.getById(m)); |
---|
634 | // set the copy to be a match to the slave |
---|
635 | ab2.setBase(mesh._slave.getById(s).getBase()); |
---|
636 | // count score "had there been a match" |
---|
637 | sum_weight = tr.s.match(sum_weight,ab2,ab1); |
---|
638 | } |
---|
639 | |
---|
640 | // while we have neither reached the leftmost base of the slave |
---|
641 | // nor one of the leftmost bases of the reference |
---|
642 | while (s!=sbegin && set_mbegin.find(m) == set_mbegin.end()) { |
---|
643 | // get the slave/reference base pair that has |
---|
644 | // yielded the best score to get here |
---|
645 | slave_idx_type snew = mesh(m,s).value_sidx; |
---|
646 | m = mesh(m,s).value_midx; |
---|
647 | |
---|
648 | // if we had a deletion in the slave, the next cell |
---|
649 | // will point to the same slave idx. we want the second |
---|
650 | // cell, as that is the one that was reached via a "match" |
---|
651 | // => if moving purely along master (and not at end of slave) |
---|
652 | // skip first cell (the one indicating the deletion) |
---|
653 | if (snew == mesh(m,snew).value_sidx && snew != 0) { |
---|
654 | m = mesh(m,snew).value_midx; |
---|
655 | } |
---|
656 | |
---|
657 | // get the position in the reference alignment |
---|
658 | pos = alig_width - 1 - mesh._master.getById(m).getPosition(); |
---|
659 | |
---|
660 | // step left in the slave sequence until we reach the base |
---|
661 | // the score was based on (only once for match, multiple |
---|
662 | // iterations of the loop only if slave sequence has had |
---|
663 | // insertions relative to reference) |
---|
664 | while (s != snew) { |
---|
665 | --s; |
---|
666 | |
---|
667 | #ifdef DEBUG |
---|
668 | if (s != snew) { |
---|
669 | log << "multi insertion: " << alig_width - pos << "; "; |
---|
670 | } |
---|
671 | #endif |
---|
672 | |
---|
673 | // create aligned base from position/base pairing |
---|
674 | slave_base_type ab1(pos, mesh._slave.getById(s).getBase()); |
---|
675 | |
---|
676 | // remember |
---|
677 | out.append(ab1); |
---|
678 | aligned_bases++; |
---|
679 | |
---|
680 | // count used weights (see above for explanation) |
---|
681 | master_base_type ab2(mesh._master.getById(m)); |
---|
682 | ab2.setBase(mesh._slave.getById(s).getBase()); |
---|
683 | sum_weight = tr.s.match(sum_weight,ab2,ab1); |
---|
684 | } |
---|
685 | } |
---|
686 | |
---|
687 | // if and while we have not yet reached the leftmost base of the slave |
---|
688 | // sequence, that is if we have overhang of the slave sequence on the |
---|
689 | // left side |
---|
690 | if (s != sbegin) { |
---|
691 | cutoff_head = s - sbegin; |
---|
692 | |
---|
693 | switch(overhang_pos) { |
---|
694 | case OVERHANG_ATTACH: |
---|
695 | while (s-- != sbegin) { |
---|
696 | char b = mesh._slave.getById(s).getBase(); |
---|
697 | aligned_base ab(std::min(alig_width-1,++pos), b); |
---|
698 | if (lowercase == LOWERCASE_UNALIGNED) { |
---|
699 | ab.setLowerCase(); |
---|
700 | } |
---|
701 | out.append(ab); |
---|
702 | } |
---|
703 | break; |
---|
704 | case OVERHANG_REMOVE: |
---|
705 | // do nothing |
---|
706 | break; |
---|
707 | case OVERHANG_EDGE: |
---|
708 | int n = s-sbegin; |
---|
709 | while (n--) { |
---|
710 | char b = mesh._slave.getById(n).getBase(); |
---|
711 | aligned_base ab(alig_width - n - 1,b); |
---|
712 | if (lowercase == LOWERCASE_UNALIGNED) { |
---|
713 | ab.setLowerCase(); |
---|
714 | } |
---|
715 | out.append(ab); |
---|
716 | } |
---|
717 | break; |
---|
718 | } |
---|
719 | } else { |
---|
720 | cutoff_head = 0; |
---|
721 | } |
---|
722 | |
---|
723 | out.setWidth(alig_width); |
---|
724 | out.reverse(); |
---|
725 | out.fix_duplicate_positions(log, lowercase==LOWERCASE_UNALIGNED, |
---|
726 | insertion==INSERTION_REMOVE); |
---|
727 | |
---|
728 | if (out.getWidth() > alig_width) { |
---|
729 | const char* wrn = "warning: result sequence too wide!"; |
---|
730 | log << wrn; |
---|
731 | } |
---|
732 | |
---|
733 | log << "scoring: raw=" << rval << ", weight=" << sum_weight |
---|
734 | << ", query-len=" << mesh._slave.size() |
---|
735 | << ", aligned-bases=" << aligned_bases |
---|
736 | << ", score=" << rval/sum_weight <<"; "; |
---|
737 | |
---|
738 | return rval/sum_weight; |
---|
739 | } |
---|
740 | |
---|
741 | } // namespace sina |
---|
742 | |
---|
743 | #endif // _MESH_H_ |
---|
744 | |
---|
745 | /* |
---|
746 | Local Variables: |
---|
747 | mode:c++ |
---|
748 | c-file-style:"stroustrup" |
---|
749 | c-file-offsets:((innamespace . 0)(inline-open . 0)(case-label . 0)) |
---|
750 | indent-tabs-mode:nil |
---|
751 | fill-column:99 |
---|
752 | End: |
---|
753 | */ |
---|
754 | // vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:encoding=utf-8:textwidth=99 : |
---|
755 | |
---|