| 1 | // =============================================================== // |
|---|
| 2 | // // |
|---|
| 3 | // File : st_quality.hxx // |
|---|
| 4 | // Purpose : // |
|---|
| 5 | // // |
|---|
| 6 | // Institute of Microbiology (Technical University Munich) // |
|---|
| 7 | // http://www.arb-home.de/ // |
|---|
| 8 | // // |
|---|
| 9 | // =============================================================== // |
|---|
| 10 | |
|---|
| 11 | #ifndef ST_QUALITY_HXX |
|---|
| 12 | #define ST_QUALITY_HXX |
|---|
| 13 | |
|---|
| 14 | #ifndef ST_WINDOW_HXX |
|---|
| 15 | #include "st_window.hxx" |
|---|
| 16 | #endif |
|---|
| 17 | |
|---|
| 18 | #define st_assert(cond) arb_assert(cond) |
|---|
| 19 | |
|---|
| 20 | class Sampler { |
|---|
| 21 | double sum; //!< sum of added values |
|---|
| 22 | size_t count; //!< number of values added |
|---|
| 23 | public: |
|---|
| 24 | Sampler() : sum(0.0) , count(0) {} |
|---|
| 25 | |
|---|
| 26 | void add(double val) { |
|---|
| 27 | sum += val; |
|---|
| 28 | count++; |
|---|
| 29 | } |
|---|
| 30 | void add(const Sampler& other) { |
|---|
| 31 | sum += other.sum; |
|---|
| 32 | count += other.count; |
|---|
| 33 | } |
|---|
| 34 | |
|---|
| 35 | size_t get_count() const { return count; } |
|---|
| 36 | double get_sum() const { return sum; } |
|---|
| 37 | double get_median() const { return sum / count; } |
|---|
| 38 | }; |
|---|
| 39 | |
|---|
| 40 | class VarianceSampler : public Sampler { |
|---|
| 41 | /*! stores values such that variance can quickly be calculated |
|---|
| 42 | * for algorithm see |
|---|
| 43 | * http://de.wikipedia.org/wiki/Standardabweichung#Berechnung_f.C3.BCr_auflaufende_Messwerte or |
|---|
| 44 | * http://en.wikipedia.org/wiki/Standard_deviation#Rapid_calculation_methods |
|---|
| 45 | */ |
|---|
| 46 | |
|---|
| 47 | double square_sum; //!< sum of squares of added values |
|---|
| 48 | |
|---|
| 49 | public: |
|---|
| 50 | VarianceSampler() : square_sum(0.0) {} |
|---|
| 51 | |
|---|
| 52 | void add(double val) { |
|---|
| 53 | Sampler::add(val); |
|---|
| 54 | square_sum += val*val; |
|---|
| 55 | } |
|---|
| 56 | |
|---|
| 57 | void add(const VarianceSampler& other) { |
|---|
| 58 | Sampler::add(other); |
|---|
| 59 | square_sum += other.square_sum; |
|---|
| 60 | } |
|---|
| 61 | |
|---|
| 62 | double get_variance(double median) const { return square_sum/get_count() - median*median; } |
|---|
| 63 | double get_variance() const { return get_variance(get_median()); } |
|---|
| 64 | }; |
|---|
| 65 | |
|---|
| 66 | class LikelihoodRanges : virtual Noncopyable { |
|---|
| 67 | /*! The alignment is split into multiple, similar-sized column ranges. |
|---|
| 68 | * For each range the likelihoods get summarized |
|---|
| 69 | */ |
|---|
| 70 | size_t ranges; |
|---|
| 71 | VarianceSampler *likelihood; |
|---|
| 72 | |
|---|
| 73 | public: |
|---|
| 74 | |
|---|
| 75 | LikelihoodRanges(size_t ranges); |
|---|
| 76 | ~LikelihoodRanges() { delete [] likelihood; } |
|---|
| 77 | |
|---|
| 78 | size_t get_range_count() const { return ranges; } |
|---|
| 79 | |
|---|
| 80 | void add(size_t range, double probability) { |
|---|
| 81 | st_assert(range < ranges); |
|---|
| 82 | likelihood[range].add(probability); |
|---|
| 83 | } |
|---|
| 84 | void add_relative(double seq_rel_pos, double probability) { |
|---|
| 85 | st_assert(seq_rel_pos>=0.0 && seq_rel_pos<1.0); |
|---|
| 86 | add(seq_rel_pos*ranges, probability); |
|---|
| 87 | } |
|---|
| 88 | |
|---|
| 89 | VarianceSampler summarize_all_ranges(); |
|---|
| 90 | char *generate_string(Sampler& t_values); |
|---|
| 91 | |
|---|
| 92 | // int is_bad(); // 0 == ok, 1 strange, 2 bad, 3 very bad |
|---|
| 93 | }; |
|---|
| 94 | |
|---|
| 95 | struct ColumnQualityInfo { |
|---|
| 96 | LikelihoodRanges stat_half; // two ranges (one for each half) |
|---|
| 97 | LikelihoodRanges stat_five; // five ranges |
|---|
| 98 | LikelihoodRanges stat_user; // user defined range size |
|---|
| 99 | LikelihoodRanges stat_cons; // two ranges (conserved + variable positions) -- @@@ unused |
|---|
| 100 | |
|---|
| 101 | ColumnQualityInfo(int seq_len, int bucket_size); |
|---|
| 102 | |
|---|
| 103 | size_t overall_range_count() { |
|---|
| 104 | return |
|---|
| 105 | stat_half.get_range_count() + |
|---|
| 106 | stat_five.get_range_count() + |
|---|
| 107 | stat_user.get_range_count() + |
|---|
| 108 | stat_cons.get_range_count(); |
|---|
| 109 | } |
|---|
| 110 | }; |
|---|
| 111 | |
|---|
| 112 | class WeightedFilter; |
|---|
| 113 | class ColumnStat; |
|---|
| 114 | |
|---|
| 115 | GB_ERROR st_ml_check_sequence_quality(GBDATA *gb_main, const char *tree_name, |
|---|
| 116 | const char *alignment_name, ColumnStat *colstat, const WeightedFilter *weighted_filter, int bucket_size, |
|---|
| 117 | int marked_only, st_report_enum report, const char *dest_field); |
|---|
| 118 | |
|---|
| 119 | #else |
|---|
| 120 | #error st_quality.hxx included twice |
|---|
| 121 | #endif // ST_QUALITY_HXX |
|---|