1 | // =============================================================== // |
---|
2 | // // |
---|
3 | // File : st_quality.hxx // |
---|
4 | // Purpose : // |
---|
5 | // // |
---|
6 | // Institute of Microbiology (Technical University Munich) // |
---|
7 | // http://www.arb-home.de/ // |
---|
8 | // // |
---|
9 | // =============================================================== // |
---|
10 | |
---|
11 | #ifndef ST_QUALITY_HXX |
---|
12 | #define ST_QUALITY_HXX |
---|
13 | |
---|
14 | #ifndef ST_WINDOW_HXX |
---|
15 | #include "st_window.hxx" |
---|
16 | #endif |
---|
17 | |
---|
18 | #define st_assert(cond) arb_assert(cond) |
---|
19 | |
---|
20 | class Sampler { |
---|
21 | double sum; //!< sum of added values |
---|
22 | size_t count; //!< number of values added |
---|
23 | public: |
---|
24 | Sampler() : sum(0.0) , count(0) {} |
---|
25 | |
---|
26 | void add(double val) { |
---|
27 | sum += val; |
---|
28 | count++; |
---|
29 | } |
---|
30 | void add(const Sampler& other) { |
---|
31 | sum += other.sum; |
---|
32 | count += other.count; |
---|
33 | } |
---|
34 | |
---|
35 | size_t get_count() const { return count; } |
---|
36 | double get_sum() const { return sum; } |
---|
37 | double get_median() const { return sum / count; } |
---|
38 | }; |
---|
39 | |
---|
40 | class VarianceSampler : public Sampler { |
---|
41 | /*! stores values such that variance can quickly be calculated |
---|
42 | * for algorithm see |
---|
43 | * http://de.wikipedia.org/wiki/Standardabweichung#Berechnung_f.C3.BCr_auflaufende_Messwerte or |
---|
44 | * http://en.wikipedia.org/wiki/Standard_deviation#Rapid_calculation_methods |
---|
45 | */ |
---|
46 | |
---|
47 | double square_sum; //!< sum of squares of added values |
---|
48 | |
---|
49 | public: |
---|
50 | VarianceSampler() : square_sum(0.0) {} |
---|
51 | |
---|
52 | void add(double val) { |
---|
53 | Sampler::add(val); |
---|
54 | square_sum += val*val; |
---|
55 | } |
---|
56 | |
---|
57 | void add(const VarianceSampler& other) { |
---|
58 | Sampler::add(other); |
---|
59 | square_sum += other.square_sum; |
---|
60 | } |
---|
61 | |
---|
62 | double get_variance(double median) const { return square_sum/get_count() - median*median; } |
---|
63 | double get_variance() const { return get_variance(get_median()); } |
---|
64 | }; |
---|
65 | |
---|
66 | class LikelihoodRanges : virtual Noncopyable { |
---|
67 | /*! The alignment is split into multiple, similar-sized column ranges. |
---|
68 | * For each range the likelihoods get summarized |
---|
69 | */ |
---|
70 | size_t ranges; |
---|
71 | VarianceSampler *likelihood; |
---|
72 | |
---|
73 | public: |
---|
74 | |
---|
75 | LikelihoodRanges(size_t ranges); |
---|
76 | ~LikelihoodRanges() { delete [] likelihood; } |
---|
77 | |
---|
78 | size_t get_range_count() const { return ranges; } |
---|
79 | |
---|
80 | void add(size_t range, double probability) { |
---|
81 | st_assert(range < ranges); |
---|
82 | likelihood[range].add(probability); |
---|
83 | } |
---|
84 | void add_relative(double seq_rel_pos, double probability) { |
---|
85 | st_assert(seq_rel_pos>=0.0 && seq_rel_pos<1.0); |
---|
86 | add(seq_rel_pos*ranges, probability); |
---|
87 | } |
---|
88 | |
---|
89 | VarianceSampler summarize_all_ranges(); |
---|
90 | char *generate_string(Sampler& t_values); |
---|
91 | |
---|
92 | // int is_bad(); // 0 == ok, 1 strange, 2 bad, 3 very bad |
---|
93 | }; |
---|
94 | |
---|
95 | struct ColumnQualityInfo { |
---|
96 | LikelihoodRanges stat_half; // two ranges (one for each half) |
---|
97 | LikelihoodRanges stat_five; // five ranges |
---|
98 | LikelihoodRanges stat_user; // user defined range size |
---|
99 | LikelihoodRanges stat_cons; // two ranges (conserved + variable positions) -- @@@ unused |
---|
100 | |
---|
101 | ColumnQualityInfo(int seq_len, int bucket_size); |
---|
102 | |
---|
103 | size_t overall_range_count() { |
---|
104 | return |
---|
105 | stat_half.get_range_count() + |
---|
106 | stat_five.get_range_count() + |
---|
107 | stat_user.get_range_count() + |
---|
108 | stat_cons.get_range_count(); |
---|
109 | } |
---|
110 | }; |
---|
111 | |
---|
112 | class WeightedFilter; |
---|
113 | class ColumnStat; |
---|
114 | |
---|
115 | GB_ERROR st_ml_check_sequence_quality(GBDATA *gb_main, const char *tree_name, |
---|
116 | const char *alignment_name, ColumnStat *colstat, const WeightedFilter *weighted_filter, int bucket_size, |
---|
117 | int marked_only, st_report_enum report, const char *dest_field); |
---|
118 | |
---|
119 | #else |
---|
120 | #error st_quality.hxx included twice |
---|
121 | #endif // ST_QUALITY_HXX |
---|