Context Navigation

source: tags/arb-6.0-rc3/STAT/ST_quality.cxx

Visit:

Last change on this file was 10074, checked in by westram, 13 years ago
no longer create multiple instances of column-stat-config-window (which only differed by what was called when 'GO' button was pressed) allow to reconfigure column-statistic when column stat get dis- and then enabled again, it now again asks for parameters fixed worst leaks caused by recalculation. the whole STAT module still leaks as hell - cant fix that atm fix some valgrind errors
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 11.5 KB

Line
1	// =============================================================== //
2	// //
3	// File : ST_quality.cxx //
4	// Purpose : //
5	// //
6	// Institute of Microbiology (Technical University Munich) //
7	// http://www.arb-home.de/ //
8	// //
9	// =============================================================== //
10
11	#include "st_quality.hxx"
12	#include "st_ml.hxx"
13	#include "MostLikelySeq.hxx"
14
15	#include <ColumnStat.hxx>
16	#include <BI_helix.hxx>
17	#include <AP_filter.hxx>
18	#include <arb_progress.h>
19	#include <arbdbt.h>
20	#include <arb_strbuf.h>
21
22	#include <cctype>
23	#include <cmath>
24
25	// --------------------------
26	// LikelihoodRanges
27
28	LikelihoodRanges::LikelihoodRanges(size_t no_of_ranges) {
29	ranges = no_of_ranges;
30	likelihood = new VarianceSampler[ranges];
31	}
32
33	VarianceSampler LikelihoodRanges::summarize_all_ranges() {
34	VarianceSampler allRange;
35	for (size_t range = 0; range<ranges; ++range) {
36	allRange.add(likelihood[range]);
37	}
38	return allRange;
39	}
40
41	char *LikelihoodRanges::generate_string(Sampler& t_values) {
42	VarianceSampler allRange = summarize_all_ranges();
43	char *report = new char[ranges+1];
44
45	if (allRange.get_count()>1) {
46	double all_median = allRange.get_median();
47	double all_variance = allRange.get_variance(all_median);
48	double all_std_deviation = sqrt(all_variance);
49
50	for (size_t range = 0; range<ranges; ++range) {
51	size_t count = likelihood[range].get_count();
52	if (count>1) {
53	// perform students-t-test
54
55	double standard_error = all_std_deviation / sqrt(count); // standard error of mean
56	double median_diff = likelihood[range].get_median() - all_median;
57	double t_value = median_diff / standard_error; // predictand of t_value is 0.0
58
59	t_values.add(fabs(t_value)); // sample t_values
60
61	double val = 0.7 * t_value;
62
63	// val outside [-0.5 .. +0.5] -> "t-test fails"
64	//
65	// 0.7 * t_value = approx. t_value / 1.428;
66	//
67	// 1.428 scheint ein fest-kodiertes quantil zu sein
68	// (bzw. dessen Haelfte, da ja nur der Range [-0.5 .. +0.5] getestet wird)
69	// Das eigentliche Quantil ist also ca. 2.856
70	//
71	// Eigentlich haengt das Quantil von der Stichprobengroesse (hier "count",
72	// d.h. "Anzahl der Basen im aktuellen Range") ab.
73	// (vgl. http://de.wikipedia.org/wiki/T-Verteilung#Ausgew.C3.A4hlte_Quantile_der_t-Verteilung )
74
75	int ival = int (val + .5) + 5;
76
77	if (ival > 9) ival = 9;
78	if (ival < 0) ival = 0;
79
80	report[range] = (ival == 5) ? '-' : '0'+ival;
81	}
82	else {
83	report[range] = '.';
84	}
85	}
86	}
87	else {
88	for (size_t range = 0; range<ranges; ++range) {
89	report[range] = '.';
90	}
91	}
92
93	report[ranges] = 0;
94
95	return report;
96	}
97
98	// --------------------------
99	// ColumnQualityInfo
100
101	ColumnQualityInfo::ColumnQualityInfo(int seq_len, int bucket_size)
102	: stat_half(2)
103	, stat_five(5)
104	, stat_user(seq_len / bucket_size + 1)
105	, stat_cons(2)
106	{}
107
108	#if defined(WARN_TODO)
109	#warning stat_cons unused - implement
110	#endif
111
112	static void st_ml_add_sequence_part_to_stat(ST_ML st_ml, ColumnStat /* awt_csp */,
113	const char *species_name, int seq_len, int bucket_size,
114	GB_HASH *species_to_info_hash, int start, int end)
115	{
116	AP_tree *node = STAT_find_node_by_name(st_ml, species_name);
117	if (node) {
118	MostLikelySeq *sml = st_ml->get_ml_vectors(0, node, start, end);
119	if (sml) {
120	ColumnQualityInfo *info;
121	if (start > 0) {
122	info = (ColumnQualityInfo *) GBS_read_hash(species_to_info_hash, species_name);
123	}
124	else {
125	info = new ColumnQualityInfo(seq_len, bucket_size);
126	GBS_write_hash(species_to_info_hash, species_name, long (info));
127	}
128
129	int pos;
130	const char *source_sequence = 0;
131	int source_sequence_len = 0;
132
133	GBDATA *gb_data = sml->get_bound_species_data();
134	if (gb_data) {
135	source_sequence_len = GB_read_string_count(gb_data);
136	source_sequence = GB_read_char_pntr(gb_data);
137	}
138	if (end > source_sequence_len) end = source_sequence_len;
139
140	ST_base_vector *vec = sml->tmp_out + start;
141	for (pos = start; pos < end; vec++, pos++) {
142	DNA_Base base = dna_table.char_to_enum(source_sequence[pos]);
143	if (base != ST_UNKNOWN && base != ST_GAP) { // don't count gaps
144	ST_FLOAT max = vec->max_frequency();
145
146	double val = max / (0.0001 + vec->b[base]);
147	double log_val = log(val);
148	double seq_rel_pos = double(pos)/seq_len;
149
150	info->stat_half.add_relative(seq_rel_pos, log_val);
151	info->stat_five.add_relative(seq_rel_pos, log_val);
152	info->stat_user.add_relative(seq_rel_pos, log_val);
153	}
154	}
155	}
156	}
157	}
158
159	static GB_ERROR st_ml_add_quality_string_to_species(GBDATA gb_main, const AP_filter filter,
160	const char alignment_name, const char species_name,
161	size_t bucket_size, GB_HASH *species_to_info_hash, st_report_enum report,
162	const char *dest_field)
163	{
164	GB_ERROR error = 0;
165	GBDATA *gb_species = GBT_find_species(gb_main, species_name);
166	if (!gb_species) error = GBS_global_string("Unknown species '%s'", species_name);
167	else {
168	ColumnQualityInfo info = (ColumnQualityInfo ) GBS_read_hash(species_to_info_hash, species_name);
169	if (!info) error = GBS_global_string("Statistic missing for species '%s'", species_name);
170	else {
171	GBDATA *gb_dest = GB_search(gb_species, dest_field, GB_STRING);
172	if (!gb_dest) error = GB_await_error();
173	else {
174	Sampler t_values; // sample t-values here
175	char *user_str = info->stat_user.generate_string(t_values);
176	{
177	char *half_str = info->stat_half.generate_string(t_values);
178	char *five_str = info->stat_five.generate_string(t_values);
179
180	GBS_strstruct buffer = GBS_stropen(29 + 3*2 + (2+5+info->overall_range_count()));
181
182	#if 1
183	GBS_strcat(buffer, GBS_global_string("%8.3f ", t_values.get_median()));
184	GBS_strcat(buffer, GBS_global_string("%8.3f ", t_values.get_sum()));
185	#else
186	#warning t-value summary disabled for test-purposes
187	#endif
188
189	GBS_chrcat(buffer, 'a'); GBS_strcat(buffer, half_str); GBS_chrcat(buffer, ' ');
190	GBS_chrcat(buffer, 'b'); GBS_strcat(buffer, five_str); GBS_chrcat(buffer, ' ');
191	GBS_chrcat(buffer, 'c'); GBS_strcat(buffer, user_str);
192
193	error = GB_write_string(gb_dest, GBS_mempntr(buffer));
194	GBS_strforget(buffer);
195
196	delete [] half_str;
197	delete [] five_str;
198	}
199
200	if (!error && report) {
201	// name "quality" handled specially by ../EDIT4/EDB_root_bact.cxx@chimera_check_quality_string
202	GBDATA *gb_report = GBT_add_data(gb_species, alignment_name, "quality", GB_STRING);
203
204	if (!gb_report) error = GB_await_error();
205	else {
206	char *report_str;
207	{
208	size_t filtered_len = filter->get_filtered_length();
209	report_str = new char[filtered_len + 1];
210
211	for (size_t i = 0; i < filtered_len; i++) {
212	report_str[i] = user_str[i / bucket_size];
213	}
214	report_str[filtered_len] = 0;
215	}
216
217	{
218	char *blownUp_report = filter->blowup_string(report_str, ' ');
219	error = GB_write_string(gb_report, blownUp_report);
220	free(blownUp_report);
221	}
222	if (report == ST_QUALITY_REPORT_TEMP) error = GB_set_temporary(gb_report);
223
224	delete [] report_str;
225	}
226	}
227	delete [] user_str;
228	}
229	}
230	}
231
232	return error;
233	}
234
235	static void destroy_ColumnQualityInfo(long cl_info) {
236	ColumnQualityInfo info = (ColumnQualityInfo)cl_info;
237	delete info;
238	}
239
240	GB_ERROR st_ml_check_sequence_quality(GBDATA gb_main, const char tree_name,
241	const char alignment_name, ColumnStat colstat, const WeightedFilter *weighted_filter, int bucket_size,
242	int marked_only, st_report_enum report, const char *dest_field)
243	{
244	#if defined(WARN_TODO)
245	#warning parameter 'alignment_name' can be retrieved from WeightedFilter (as soon as automatic mapping works for filters)
246	#endif
247	ST_ML st_ml(gb_main);
248	GB_ERROR error = st_ml.calc_st_ml(tree_name, alignment_name, 0, marked_only, colstat, weighted_filter);
249	arb_progress glob_progress("Sequence Quality Check");
250
251	if (!error) {
252	GB_HASH *species2info = GBS_create_dynaval_hash(GBT_get_species_count(gb_main), GB_IGNORE_CASE, destroy_ColumnQualityInfo);
253	size_t species_count;
254	GB_CSTR *snames = GBT_get_names_of_species_in_tree(st_ml.get_gbt_tree(), &species_count);
255	int seq_len = st_ml.get_filtered_length();
256	size_t parts = (seq_len-1)/ST_MAX_SEQ_PART+1;
257
258	{
259	arb_progress add_progress("Calculating stat", parts*species_count);
260
261	for (int pos = 0; pos < seq_len && !error; pos += ST_MAX_SEQ_PART) {
262	int end = pos + ST_MAX_SEQ_PART - 1;
263	if (end > seq_len) end = seq_len;
264
265	for (GB_CSTR pspecies_name = snames; pspecies_name && !error; pspecies_name++) {
266	st_ml_add_sequence_part_to_stat(&st_ml, colstat, *pspecies_name, seq_len, bucket_size, species2info, pos, end);
267	add_progress.inc_and_check_user_abort(error);
268	}
269	}
270	}
271
272	if (!error) {
273	arb_progress res_progress("Calculating result", species_count);
274	const AP_filter *filter = st_ml.get_filter();
275
276	for (GB_CSTR pspecies_name = snames; pspecies_name && !error; pspecies_name++) {
277	error = st_ml_add_quality_string_to_species(gb_main, filter, alignment_name, *pspecies_name, bucket_size, species2info, report, dest_field);
278	res_progress.inc_and_check_user_abort(error);
279	}
280	}
281
282	free(snames);
283	GBS_free_hash(species2info);
284	}
285
286	return error;
287	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: