source: trunk/GDE/SINA/builddir/src/alignment_stats.cpp

Last change on this file was 19170, checked in by westram, 2 years ago
  • sina source
    • unpack + remove tarball
    • no longer ignore sina builddir.
File size: 6.6 KB
Line 
1/*
2Copyright (c) 2006-2018 Elmar Pruesse <elmar.pruesse@ucdenver.edu>
3
4This file is part of SINA.
5SINA is free software: you can redistribute it and/or modify it under
6the terms of the GNU General Public License as published by the Free
7Software Foundation, either version 3 of the License, or (at your
8option) any later version.
9
10SINA is distributed in the hope that it will be useful, but WITHOUT ANY
11WARRANTY; without even the implied warranty of MERCHANTABILITY or
12FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
13for more details.
14
15You should have received a copy of the GNU General Public License
16along with SINA.  If not, see <http://www.gnu.org/licenses/>.
17
18Additional permission under GNU GPL version 3 section 7
19
20If you modify SINA, or any covered work, by linking or combining it
21with components of ARB (or a modified version of that software),
22containing parts covered by the terms of the
23ARB-public-library-license, the licensors of SINA grant you additional
24permission to convey the resulting work. Corresponding Source for a
25non-source form of such a combination shall include the source code
26for the parts of ARB used as well as that of the covered work.
27*/
28
29#include "alignment_stats.h"
30#include "log.h"
31
32#include <cmath>
33#include <algorithm>
34#include <iostream>
35#include <utility>
36using std::string;
37using std::vector;
38namespace sina {
39
40float jukes_cantor(float in) {
41    return -3.0/4 * log( 1.0 - 4.0/3*in);
42}
43
44alignment_stats::alignment_stats() {
45
46    global_freqs.num_a=1000;
47    global_freqs.num_g=1000;
48    global_freqs.num_c=1000;
49    global_freqs.num_u=1000;
50    global_freqs.num_mutations=20;
51    global_freqs.num_transversions=10;
52}
53
54alignment_stats::alignment_stats(
55    std::string  name_,
56    unsigned int ntaxa, unsigned int alen,
57    const unsigned int *na, const unsigned int *nc, const unsigned int *ng,
58    const unsigned int *nu, const unsigned int *nM, const unsigned int *nT,
59    std::vector<int>  pairs_
60    ) 
61    : name(std::move(name_)),
62      num_taxa(ntaxa), width(alen),
63      pairs(std::move(pairs_)),
64      minweight(9999999)
65{
66    auto console = Log::create_logger("alignment_stats");
67
68    column_freqs.resize(width);
69    weights.resize(width);
70    unsigned int first_weighted = width, last_weighted=0;
71    for (unsigned int i = 0; i < width; i++) {
72        freqs &f = column_freqs[i];
73        f.num_a = na[i];
74        f.num_c = nc[i];
75        f.num_g = ng[i];
76        f.num_u = nu[i];
77        f.num_mutations = nM[i];
78        f.num_transversions = nT[i];
79
80        global_freqs.num_a += f.num_a;
81        global_freqs.num_c += f.num_c;
82        global_freqs.num_g += f.num_g;
83        global_freqs.num_u += f.num_u;
84        global_freqs.num_mutations += f.num_mutations;
85        global_freqs.num_transversions += f.num_transversions;
86
87        int sum = f.num_a + f.num_c + f.num_g + f.num_u;
88        if (sum > ntaxa * 0.2) {
89            double rate = std::min((double)f.num_mutations / sum, .95 * .75);
90            rate = std::min(jukes_cantor(rate), 1.f);
91            double weight = .5 - log(rate);
92            if (weight > 20 ) {
93                console->info("extreme weight {} for column {} clamped to 20", weight, i);
94                weight = 20;
95            }
96            weights[i] = weight;
97            sumweight += weight;
98            maxweight = std::max(maxweight, (float)weight);
99            minweight = std::min(minweight, (float)weight);
100            weighted_columns++;
101            if (i < first_weighted) {
102                first_weighted = i;
103            }
104            if (i > last_weighted) {
105                last_weighted = i;
106            }
107        } else {
108            weights[i] = 1;
109        }
110    }
111
112    int total_bases = global_freqs.num_a + global_freqs.num_c + 
113        global_freqs.num_g + global_freqs.num_u;
114
115    console->info("alignment stats for subset {}", name);
116    console->info("weighted/unweighted columns = {}/{}",
117                  weighted_columns, width - weighted_columns);
118    console->info("average weight = {}", sumweight / weighted_columns);
119    console->info("minimum weight = {}", minweight);
120    console->info("maximum weight = {}", maxweight);
121    console->info("ntaxa = {}", ntaxa);
122    console->info("base frequencies: na={} nu={} nc={} ng={}",
123                  (double)global_freqs.num_a/total_bases,
124                  (double)global_freqs.num_c/total_bases,
125                  (double)global_freqs.num_g/total_bases,
126                  (double)global_freqs.num_u/total_bases);
127    console->info("mutation frequencies: any={} transversions={}",
128                  (double)global_freqs.num_mutations/total_bases,
129                  (double)global_freqs.num_transversions/total_bases);
130    console->info("first/last weighted column={}/{}",
131                  first_weighted, last_weighted);
132}
133
134const aligned_base::matrix_type
135alignment_stats::getSubstMatrix(double  /*identity*/) const {
136    aligned_base::matrix_type m;
137#if 0
138    int total_bases = global_freqs.num_a + global_freqs.num_c +
139        global_freqs.num_g + global_freqs.num_u;
140    double f[BASE_MAX];
141    f[BASE_A] = (double)global_freqs.num_a/total_bases;
142    f[BASE_C] = (double)global_freqs.num_c/total_bases;
143    f[BASE_G] = (double)global_freqs.num_g/total_bases;
144    f[BASE_TU] = (double)global_freqs.num_u/total_bases;
145
146    double avgmm=0;
147    for (int i=0; i < BASE_MAX; i++) {
148        for (int j=0; j < BASE_MAX; j++) {
149            double p;
150            if (i==j) {
151                p = identity / 4;
152            } else {
153                p = (1-identity) / 12;
154            }
155            avgmm += m.v[i*BASE_MAX+j] = -log( p / (f[i] * f[j]) );
156        }
157    }
158    avgmm /= 12;
159# endif
160
161    return m;
162}
163
164
165/*
166const aligned_base::matrix_type
167alignment_stats::getHky85Matrix(double identity) {
168    aligned_base::matrix_type m;
169
170    int total_bases = global_freqs.num_a + global_freqs.num_c +
171        global_freqs.num_g + global_freqs.num_u;
172    double f[aligned_base::BASE_MAX], fa, fc, fg, fu;
173    fa = f[aligned_base::BASE_A] = (double)global_freqs.num_a/total_bases;
174    fc = f[aligned_base::BASE_C] = (double)global_freqs.num_c/total_bases;
175    fg = f[aligned_base::BASE_G] = (double)global_freqs.num_g/total_bases;
176    fu = f[aligned_base::BASE_TU] = (double)global_freqs.num_u/total_bases;
177
178    double k = global_freqs.num_transversions /
179        (global_freqs.num_mutations-global_freqs.num_transversions);
180    double beta = 1 /
181        (2 * (fa+fg)*(fc+fu) + 2*k*(fa*fg + fc*ft));
182
183}
184
185*/
186
187} // namespace sina
188
189/*
190  Local Variables:
191  mode:c++
192  c-file-style:"stroustrup"
193  c-file-offsets:((innamespace . 0)(inline-open . 0)(case-label . +))
194  indent-tabs-mode:nil
195  fill-column:99
196  End:
197*/
198// vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:encoding=utf-8:textwidth=99 :
Note: See TracBrowser for help on using the repository browser.