// =============================================================== //
//                                                                 //
//   File      : ST_quality.cxx                                    //
//   Purpose   :                                                   //
//                                                                 //
//   Institute of Microbiology (Technical University Munich)       //
//   http://www.arb-home.de/                                       //
//                                                                 //
// =============================================================== //
10 | |
---|
11 | #include "st_quality.hxx" |
---|
12 | #include "st_ml.hxx" |
---|
13 | #include "MostLikelySeq.hxx" |
---|
14 | |
---|
15 | #include <ColumnStat.hxx> |
---|
16 | #include <BI_helix.hxx> |
---|
17 | #include <AP_filter.hxx> |
---|
18 | #include <arb_progress.h> |
---|
19 | #include <arbdbt.h> |
---|
20 | #include <arb_strbuf.h> |
---|
21 | |
---|
22 | #include <cctype> |
---|
23 | #include <cmath> |
---|
24 | #include <items.h> |
---|
25 | |
---|
26 | // -------------------------- |
---|
27 | // LikelihoodRanges |
---|
28 | |
---|
29 | LikelihoodRanges::LikelihoodRanges(size_t no_of_ranges) { |
---|
30 | ranges = no_of_ranges; |
---|
31 | likelihood = new VarianceSampler[ranges]; |
---|
32 | } |
---|
33 | |
---|
34 | VarianceSampler LikelihoodRanges::summarize_all_ranges() { |
---|
35 | VarianceSampler allRange; |
---|
36 | for (size_t range = 0; range<ranges; ++range) { |
---|
37 | allRange.add(likelihood[range]); |
---|
38 | } |
---|
39 | return allRange; |
---|
40 | } |
---|
41 | |
---|
42 | char *LikelihoodRanges::generate_string(Sampler& t_values) { |
---|
43 | VarianceSampler allRange = summarize_all_ranges(); |
---|
44 | char *report = new char[ranges+1]; |
---|
45 | |
---|
46 | if (allRange.get_count()>1) { |
---|
47 | double all_median = allRange.get_median(); |
---|
48 | double all_variance = allRange.get_variance(all_median); |
---|
49 | double all_std_deviation = sqrt(all_variance); |
---|
50 | |
---|
51 | for (size_t range = 0; range<ranges; ++range) { |
---|
52 | size_t count = likelihood[range].get_count(); |
---|
53 | if (count>1) { |
---|
54 | // perform students-t-test |
---|
55 | |
---|
56 | double standard_error = all_std_deviation / sqrt(count); // standard error of mean |
---|
57 | double median_diff = likelihood[range].get_median() - all_median; |
---|
58 | double t_value = median_diff / standard_error; // predictand of t_value is 0.0 |
---|
59 | |
---|
60 | t_values.add(fabs(t_value)); // sample t_values |
---|
61 | |
---|
62 | double val = 0.7 * t_value; |
---|
63 | |
---|
64 | // val outside [-0.5 .. +0.5] -> "t-test fails" |
---|
65 | // |
---|
66 | // 0.7 * t_value = approx. t_value / 1.428; |
---|
67 | // |
---|
68 | // 1.428 scheint ein fest-kodiertes quantil zu sein |
---|
69 | // (bzw. dessen Haelfte, da ja nur der Range [-0.5 .. +0.5] getestet wird) |
---|
70 | // Das eigentliche Quantil ist also ca. 2.856 |
---|
71 | // |
---|
72 | // Eigentlich haengt das Quantil von der Stichprobengroesse (hier "count", |
---|
73 | // d.h. "Anzahl der Basen im aktuellen Range") ab. |
---|
74 | // (vgl. http://de.wikipedia.org/wiki/T-Verteilung#Ausgew.C3.A4hlte_Quantile_der_t-Verteilung ) |
---|
75 | |
---|
76 | int ival = int (val + .5) + 5; |
---|
77 | |
---|
78 | if (ival > 9) ival = 9; |
---|
79 | if (ival < 0) ival = 0; |
---|
80 | |
---|
81 | report[range] = (ival == 5) ? '-' : '0'+ival; |
---|
82 | } |
---|
83 | else { |
---|
84 | report[range] = '.'; |
---|
85 | } |
---|
86 | } |
---|
87 | } |
---|
88 | else { |
---|
89 | for (size_t range = 0; range<ranges; ++range) { |
---|
90 | report[range] = '.'; |
---|
91 | } |
---|
92 | } |
---|
93 | |
---|
94 | report[ranges] = 0; |
---|
95 | |
---|
96 | return report; |
---|
97 | } |
---|
98 | |
---|
99 | // -------------------------- |
---|
100 | // ColumnQualityInfo |
---|
101 | |
---|
102 | ColumnQualityInfo::ColumnQualityInfo(int seq_len, int bucket_size) : |
---|
103 | stat_half(2), |
---|
104 | stat_five(5), |
---|
105 | stat_user(seq_len / bucket_size + 1), |
---|
106 | stat_cons(2) // @@@ stat_cons is unused - implement |
---|
107 | {} |
---|
108 | |
---|
109 | |
---|
110 | static void st_ml_add_sequence_part_to_stat(ST_ML *st_ml, ColumnStat */* awt_csp */, |
---|
111 | const char *species_name, int seq_len, int bucket_size, |
---|
112 | GB_HASH *species_to_info_hash, int start, int end) |
---|
113 | { |
---|
114 | AP_tree *node = STAT_find_node_by_name(st_ml, species_name); |
---|
115 | if (node) { |
---|
116 | MostLikelySeq *sml = st_ml->get_ml_vectors(NULp, node, start, end); |
---|
117 | if (sml) { |
---|
118 | ColumnQualityInfo *info; |
---|
119 | if (start > 0) { |
---|
120 | info = (ColumnQualityInfo *) GBS_read_hash(species_to_info_hash, species_name); |
---|
121 | } |
---|
122 | else { |
---|
123 | info = new ColumnQualityInfo(seq_len, bucket_size); |
---|
124 | GBS_write_hash(species_to_info_hash, species_name, long (info)); |
---|
125 | } |
---|
126 | |
---|
127 | int pos; |
---|
128 | const char *source_sequence = NULp; |
---|
129 | int source_sequence_len = 0; |
---|
130 | |
---|
131 | GBDATA *gb_data = sml->get_bound_species_data(); |
---|
132 | if (gb_data) { |
---|
133 | source_sequence_len = GB_read_string_count(gb_data); |
---|
134 | source_sequence = GB_read_char_pntr(gb_data); |
---|
135 | } |
---|
136 | if (end > source_sequence_len) end = source_sequence_len; |
---|
137 | |
---|
138 | ST_base_vector *vec = sml->tmp_out + start; |
---|
139 | for (pos = start; pos < end; vec++, pos++) { |
---|
140 | DNA_Base base = dna_table.char_to_enum(source_sequence[pos]); |
---|
141 | if (base != ST_UNKNOWN && base != ST_GAP) { // don't count gaps |
---|
142 | ST_FLOAT max = vec->max_frequency(); |
---|
143 | |
---|
144 | double val = max / (0.0001 + vec->b[base]); |
---|
145 | double log_val = log(val); |
---|
146 | double seq_rel_pos = double(pos)/seq_len; |
---|
147 | |
---|
148 | info->stat_half.add_relative(seq_rel_pos, log_val); |
---|
149 | info->stat_five.add_relative(seq_rel_pos, log_val); |
---|
150 | info->stat_user.add_relative(seq_rel_pos, log_val); |
---|
151 | } |
---|
152 | } |
---|
153 | } |
---|
154 | } |
---|
155 | } |
---|
156 | |
---|
/*! Write the quality statistic of one species into the database.
 *
 *  Reads the ColumnQualityInfo collected by st_ml_add_sequence_part_to_stat
 *  from 'species_to_info_hash', renders its three range statistics plus a
 *  t-value summary into 'dest_field' of the species. If 'report' is set,
 *  additionally writes a column-aligned report string into the "quality"
 *  entry of the species' alignment data (temporary if ST_QUALITY_REPORT_TEMP).
 *
 *  @return NULp on success, error message otherwise
 */
static GB_ERROR st_ml_add_quality_string_to_species(GBDATA *gb_main, const AP_filter *filter,
                                                    const char *alignment_name, const char *species_name,
                                                    size_t bucket_size, GB_HASH *species_to_info_hash, st_report_enum report,
                                                    const char *dest_field)
{
    GB_ERROR error = NULp;
    GBDATA *gb_species = GBT_find_species(gb_main, species_name);
    if (!gb_species) error = GBS_global_string("Unknown species '%s'", species_name);
    else {
        ColumnQualityInfo *info = (ColumnQualityInfo *) GBS_read_hash(species_to_info_hash, species_name);
        if (!info) error = GBS_global_string("Statistic missing for species '%s'", species_name);
        else {
            GBDATA *gb_dest = GBT_searchOrCreate_itemfield_according_to_changekey(gb_species, dest_field, SPECIES_get_selector().change_key_path);
            if (!gb_dest) error = GB_await_error();
            else {
                Sampler t_values; // sample t-values here
                char *user_str = info->stat_user.generate_string(t_values);
                {
                    char *half_str = info->stat_half.generate_string(t_values);
                    char *five_str = info->stat_five.generate_string(t_values);

                    // initial size estimate only — presumably the buffer grows on
                    // demand (confirm GBS_stropen semantics)
                    GBS_strstruct *buffer = GBS_stropen(2*9 + 3*2 + (2+5+info->overall_range_count()));

#if 1
                    // t-value summary (median and sum of all sampled |t| values)
                    GBS_strcat(buffer, GBS_global_string("%8.3f ", t_values.get_median()));
                    GBS_strcat(buffer, GBS_global_string("%8.3f ", t_values.get_sum()));
#else
#warning t-value summary disabled for test-purposes
#endif

                    // three labelled report sections: a=halves, b=fifths, c=user-defined ranges
                    GBS_chrcat(buffer, 'a'); GBS_strcat(buffer, half_str); GBS_chrcat(buffer, ' ');
                    GBS_chrcat(buffer, 'b'); GBS_strcat(buffer, five_str); GBS_chrcat(buffer, ' ');
                    GBS_chrcat(buffer, 'c'); GBS_strcat(buffer, user_str);

                    error = GB_write_string(gb_dest, GBS_mempntr(buffer));
                    GBS_strforget(buffer);

                    delete [] half_str;
                    delete [] five_str;
                }

                if (!error && report) {
                    // name "quality" handled specially by ../EDIT4/EDB_root_bact.cxx@chimera_check_quality_string
                    GBDATA *gb_report = GBT_add_data(gb_species, alignment_name, "quality", GB_STRING);

                    if (!gb_report) error = GB_await_error();
                    else {
                        char *report_str;
                        {
                            // expand the per-range user string to one character per
                            // filtered alignment column
                            size_t filtered_len = filter->get_filtered_length();
                            report_str = new char[filtered_len + 1];

                            for (size_t i = 0; i < filtered_len; i++) {
                                report_str[i] = user_str[i / bucket_size];
                            }
                            report_str[filtered_len] = 0;
                        }

                        {
                            // map filtered columns back onto full alignment positions
                            // (filtered-out columns are filled with ' ')
                            char *blownUp_report = filter->blowup_string(report_str, ' ');
                            error = GB_write_string(gb_report, blownUp_report);
                            free(blownUp_report);
                        }
                        if (report == ST_QUALITY_REPORT_TEMP) error = GB_set_temporary(gb_report);

                        delete [] report_str;
                    }
                }
                delete [] user_str;
            }
        }
    }

    return error;
}
---|
232 | |
---|
233 | static void destroy_ColumnQualityInfo(long cl_info) { |
---|
234 | ColumnQualityInfo *info = (ColumnQualityInfo*)cl_info; |
---|
235 | delete info; |
---|
236 | } |
---|
237 | |
---|
/*! Calculate sequence quality reports for all species in a tree.
 *
 *  Runs the ST_ML likelihood calculation, collects per-column statistics for
 *  every species (part by part, ST_MAX_SEQ_PART columns at a time) and then
 *  writes a quality string into 'dest_field' of each species (plus an optional
 *  column-aligned report, see st_ml_add_quality_string_to_species).
 *
 *  @param tree_name        tree defining the species set
 *  @param alignment_name   alignment to process
 *  @param colstat          column statistic used by the likelihood calculation
 *  @param weighted_filter  column filter/weights
 *  @param bucket_size      columns per user-defined report range
 *  @param marked_only      restrict calculation to marked species
 *  @param report           whether/how to write the per-column report
 *  @param dest_field       species field receiving the quality string
 *  @return NULp on success, error message otherwise (incl. user abort)
 */
GB_ERROR st_ml_check_sequence_quality(GBDATA *gb_main, const char *tree_name,
                                      const char *alignment_name, ColumnStat *colstat, const WeightedFilter *weighted_filter, int bucket_size,
                                      int marked_only, st_report_enum report, const char *dest_field)
{
    // @@@ parameter 'alignment_name' can be retrieved from WeightedFilter (as soon as automatic mapping works for filters)

    ST_ML st_ml(gb_main);
    GB_ERROR error = st_ml.calc_st_ml(tree_name, alignment_name, NULp, marked_only, colstat, weighted_filter);

    if (!error) {
        // dynaval hash owns the ColumnQualityInfo values (freed via destroy_ColumnQualityInfo)
        GB_HASH *species2info = GBS_create_dynaval_hash(GBT_get_species_count(gb_main), GB_IGNORE_CASE, destroy_ColumnQualityInfo);
        size_t species_count;

        GB_CSTR *snames = GBT_get_names_of_species_in_tree(st_ml.get_gbt_tree(), &species_count);
        int seq_len = st_ml.get_filtered_length();
        size_t parts = (seq_len-1)/ST_MAX_SEQ_PART+1; // number of sequence parts (ceiling division)

        {
            arb_progress add_progress("Calculating stat", parts*species_count);

            for (int pos = 0; pos < seq_len && !error; pos += ST_MAX_SEQ_PART) {
                // NOTE(review): 'end' is used as an EXCLUSIVE bound by
                // st_ml_add_sequence_part_to_stat (loop runs 'pos < end'), so the
                // '-1' here looks like it skips one column per part. Confirm the
                // start/end convention of ST_ML::get_ml_vectors before changing.
                int end = pos + ST_MAX_SEQ_PART - 1;
                if (end > seq_len) end = seq_len;

                for (GB_CSTR *pspecies_name = snames; *pspecies_name && !error; pspecies_name++) {
                    st_ml_add_sequence_part_to_stat(&st_ml, colstat, *pspecies_name, seq_len, bucket_size, species2info, pos, end);
                    add_progress.inc_and_check_user_abort(error);
                }
            }
        }

        if (!error) {
            arb_progress res_progress("Calculating reports", species_count);
            const AP_filter *filter = st_ml.get_filter();

            for (GB_CSTR *pspecies_name = snames; *pspecies_name && !error; pspecies_name++) {
                error = st_ml_add_quality_string_to_species(gb_main, filter, alignment_name, *pspecies_name, bucket_size, species2info, report, dest_field);
                res_progress.inc_and_check_user_abort(error);
            }
        }

        // frees the name array only -- presumably the names themselves are owned
        // by the tree (confirm against GBT_get_names_of_species_in_tree)
        free(snames);
        GBS_free_hash(species2info);
    }

    return error;
}
---|