Context Navigation

source: branches/profile/SL/SEQUENCE/AP_seq_protein.cxx

Visit:

Last change on this file was 12490, checked in by westram, 10 years ago
treat dots as 'X' when combining ancestor sequences in ARB_PARSIMONY (protein sequences) previously dots where treated as gaps dna/rna combine always treated dots as 'N' thx to Yan Shi for tracking down that misbehavior fixes #144
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 13.5 KB

Line
1	#include "AP_seq_protein.hxx"
2	#include "AP_parsimony_defaults.hxx"
3	#include <AP_pro_a_nucs.hxx>
4
5	#include <AP_filter.hxx>
6	#include <ARB_Tree.hxx>
7
8	#include <arb_str.h>
9	#include <climits>
10
11
12	// #define ap_assert(bed) arb_assert(bed)
13
14	AP_sequence_protein::AP_sequence_protein(const AliView *aliview)
15	: AP_sequence(aliview)
16	, seq_prot(NULL)
17	{
18	}
19
20	AP_sequence_protein::~AP_sequence_protein() {
21	delete [] seq_prot;
22	}
23
24	AP_sequence *AP_sequence_protein::dup() const {
25	return new AP_sequence_protein(get_aliview());
26	}
27
28	static AP_PROTEINS prot2AP_PROTEIN[26] = {
29	APP_A,
30	APP_B,
31	APP_C,
32	APP_D,
33	APP_E,
34	APP_F,
35	APP_G,
36	APP_H,
37	APP_I,
38	APP_ILLEGAL,
39	APP_K,
40	APP_L,
41	APP_M,
42	APP_N,
43	APP_ILLEGAL,
44	APP_P,
45	APP_Q,
46	APP_R,
47	APP_S,
48	APP_T,
49	APP_ILLEGAL,
50	APP_V,
51	APP_W,
52	APP_X,
53	APP_Y,
54	APP_Z
55	};
56
57	#define PROTEINS_TO_TEST 22
58
59	static AP_PROTEINS prot2test[PROTEINS_TO_TEST] = { // uses same indexing as prot_idx
60	APP_STAR,
61	APP_A,
62	APP_C,
63	APP_D,
64	APP_E,
65	APP_F,
66	APP_G,
67	APP_H,
68	APP_I,
69	APP_K,
70	APP_L,
71	APP_M,
72	APP_N,
73	APP_P,
74	APP_Q,
75	APP_R,
76	APP_S,
77	APP_T,
78	APP_V,
79	APP_W,
80	APP_Y,
81	APP_GAP
82	};
83
84	static int prot_idx[PROTEINS_TO_TEST] = { // uses same indexing as prot2test
85	// contains indexes for 'AWT_distance_meter->dist'
86	0, // *
87	1, // A
88	3, // C
89	4, // D
90	5, // E
91	6, // F
92	7, // G
93	8, // H
94	9, // I
95	10, // K
96	11, // L
97	12, // M
98	13, // N
99	14, // P
100	15, // Q
101	16, // R
102	17, // S
103	18, // T
104	19, // V
105	20, // W
106	21, // Y
107	23 // gap
108	};
109
110	static char prot_mindist[PROTEINS_TO_TEST][PROTEINS_TO_TEST];
111	static int min_mutations_initialized_for_codenr = -1;
112
113	static void update_min_mutations(int code_nr, const AWT_distance_meter *distance_meter) {
114	if (min_mutations_initialized_for_codenr != code_nr) {
115	for (int d = 0; d<PROTEINS_TO_TEST; ++d) {
116	int D = prot_idx[d];
117	int D_bit = 1<<D;
118
119	for (int s = 0; s<PROTEINS_TO_TEST; ++s) {
120	const AWT_PDP *dist = distance_meter->getDistance(prot_idx[s]);
121
122	int i;
123	for (i = 0; i<3; ++i) {
124	if (dist->patd[i] & D_bit) break;
125	}
126
127	prot_mindist[s][d] = char(i);
128	}
129	}
130
131	min_mutations_initialized_for_codenr = code_nr;
132	}
133	}
134
135
136	#if defined(DEBUG)
137	// #define SHOW_SEQ
138	#endif // DEBUG
139
140	void AP_sequence_protein::set(const char *isequence) {
141	AWT_translator *translator = AWT_get_user_translator(get_aliview()->get_gb_main());
142	update_min_mutations(translator->CodeNr(), translator->getDistanceMeter());
143
144	size_t sequence_len = get_sequence_length();
145	seq_prot = new AP_PROTEINS[sequence_len+1];
146
147	ap_assert(!get_filter()->does_bootstrap()); // bootstrapping not implemented for protein parsimony
148
149	const AP_filter *filt = get_filter();
150	const uchar *simplify = filt->get_simplify_table();
151	int left_bases = sequence_len;
152	long filter_len = filt->get_length();
153
154	ap_assert(filt);
155
156	size_t oidx = 0; // index for output sequence
157
158	for (int idx = 0; idx<filter_len && left_bases; ++idx) {
159	if (filt->use_position(idx)) {
160	char c = toupper(simplify[safeCharIndex(isequence[idx])]);
161	AP_PROTEINS p = APP_ILLEGAL;
162
163	#if defined(SHOW_SEQ)
164	fputc(c, stdout);
165	#endif // SHOW_SEQ
166
167	if (c >= 'A' && c <= 'Z') p = prot2AP_PROTEIN[c-'A'];
168	else if (c == '-') p = APP_GAP;
169	else if (c == '.') p = APP_X;
170	else if (c == '*') p = APP_STAR;
171
172	if (p == APP_ILLEGAL) {
173	GB_warning(GBS_global_string("Illegal sequence character '%c' replaced by gap", c));
174	p = APP_GAP;
175	}
176
177	seq_prot[oidx++] = p;
178	--left_bases;
179	}
180	}
181
182	ap_assert(oidx == sequence_len);
183	seq_prot[sequence_len] = APP_ILLEGAL;
184
185	#if defined(SHOW_SEQ)
186	fputc('\n', stdout);
187	#endif // SHOW_SEQ
188
189	mark_sequence_set(true);
190	}
191
192	void AP_sequence_protein::unset() {
193	delete [] seq_prot;
194	seq_prot = 0;
195	mark_sequence_set(false);
196	}
197
198	#if !defined(AP_PARSIMONY_DEFAULTS_HXX)
199	#error AP_parsimony_defaults.hxx not included
200	#endif // AP_PARSIMONY_DEFAULTS_HXX
201
202
203	AP_FLOAT AP_sequence_protein::combine(const AP_sequence lefts, const AP_sequence rights, char *mutation_per_site) {
204	// this works similar to AP_sequence_parsimony::combine
205
206	const AP_sequence_protein left = (const AP_sequence_protein )lefts;
207	const AP_sequence_protein right = (const AP_sequence_protein )rights;
208
209	size_t sequence_len = get_sequence_length();
210	if (!seq_prot) seq_prot = new AP_PROTEINS[sequence_len + 1];
211
212	const AP_PROTEINS *p1 = left->get_sequence();
213	const AP_PROTEINS *p2 = right->get_sequence();
214	AP_PROTEINS *p = seq_prot;
215	const AP_weights *weights = get_weights();
216	char *mutpsite = mutation_per_site;
217
218	long result = 0;
219	// check if initialized for correct instance of translator:
220	ap_assert(min_mutations_initialized_for_codenr == AWT_get_user_translator()->CodeNr());
221
222	for (size_t idx = 0; idx<sequence_len; ++idx) {
223	AP_PROTEINS c1 = p1[idx];
224	AP_PROTEINS c2 = p2[idx];
225
226	ap_assert(c1 != APP_ILLEGAL);
227	ap_assert(c2 != APP_ILLEGAL);
228
229	if ((c1&c2) == 0) { // proteins are distinct
230	p[idx] = AP_PROTEINS(c1\|c2); // mix distinct proteins
231
232	int mutations = INT_MAX;
233
234	if (p[idx]&APP_GAP) { // contains a gap
235	mutations = 1; // count first gap as mutation
236	// @@@ FIXME: rethink the line above. maybe it should be 3 mutations ?
237
238	#if !defined(MULTIPLE_GAPS_ARE_MULTIPLE_MUTATIONS)
239	// count multiple mutations as 1 mutation
240	// this was an experiment (don't use it, it does not work!)
241	if (idx>0 && (p[idx-1]&APP_GAP)) { // last position also contained gap..
242	if (((c1&APP_GAP) && (p1[idx-1]&APP_GAP)) \|\| // ..in same sequence
243	((c2&APP_GAP) && (p2[idx-1]&APP_GAP)))
244	{
245	if (!(p1[idx-1]&APP_GAP) \|\| !(p2[idx-1]&APP_GAP)) { // if one of the sequences had no gap at previous position
246	mutations = 0; // skip multiple gaps
247	}
248	}
249	}
250	#endif // MULTIPLE_GAPS_ARE_MULTIPLE_MUTATIONS
251	}
252	else {
253	for (int t1 = 0; t1<PROTEINS_TO_TEST && mutations>1; ++t1) { // with all proteins to test
254	if (c1&prot2test[t1]) { // if protein is contained in subtree
255	for (int t2 = 0; t2<PROTEINS_TO_TEST; ++t2) {
256	if (c2&prot2test[t2]) {
257	int mut = prot_mindist[t1][t2];
258	if (mut<mutations) {
259	mutations = mut;
260	if (mutations < 2) break; // minimum reached -- abort
261	}
262	}
263	}
264	}
265	}
266	}
267
268	ap_assert(mutations >= 0 && mutations <= 3);
269
270	if (mutpsite) mutpsite[idx] += mutations; // count mutations per site (unweighted)
271	result += mutations * weights->weight(idx); // count weighted or simple
272
273	}
274	else {
275	p[idx] = AP_PROTEINS(c1&c2); // store common proteins for both subtrees
276	}
277
278	#if !defined(PROPAGATE_GAPS_UPWARDS)
279	// do not propagate mixed gaps upwards (they cause neg. branches)
280	if (p[idx] & APP_GAP) { // contains gap
281	if (p[idx] != APP_GAP) { // is not real gap
282	p[idx] = AP_PROTEINS(p[idx]^APP_GAP); // remove the gap
283	}
284	}
285	#endif // PROPAGATE_GAPS_UPWARDS
286	}
287
288	#if defined(DEBUG) && 0
289	#define P1 75
290	#define P2 90
291	printf("Seq1: ");
292	for (long idx = P1; idx <= P2; ++idx) printf("%3i ", p1[idx]);
293	printf("\nSeq2: ");
294	for (long idx = P1; idx <= P2; ++idx) printf("%3i ", p2[idx]);
295	printf("\nCombine value: %f\n", float(result));
296	#undef P1
297	#undef P2
298	#endif // DEBUG
299
300	#if defined(DEBUG) && 0
301	printf("\nCombine value: %f\n", float(result));
302	#endif // DEBUG
303
304	inc_combine_count();
305	mark_sequence_set(true);
306
307	ap_assert(result >= 0.0);
308	return (AP_FLOAT)result;
309	}
310
311	void AP_sequence_protein::partial_match(const AP_sequence* part_, long overlapPtr, long penaltyPtr) const {
312	// matches the partial sequences 'part_' against 'this'
313	// '*penaltyPtr' is set to the number of mismatches (possibly weighted)
314	// '*overlapPtr' is set to the number of base positions both sequences overlap
315	// example:
316	// fullseq 'XXX---XXX' 'XXX---XXX'
317	// partialseq '-XX---XX-' '---XXX---'
318	// overlap 7 3
319	//
320	// algorithm is similar to AP_sequence_protein::combine()
321
322	const AP_sequence_protein part = (const AP_sequence_protein )part_;
323
324	const AP_PROTEINS *pf = get_sequence();
325	const AP_PROTEINS *pp = part->get_sequence();
326	const AP_weights *weights = get_weights();
327
328	long min_end; // minimum of both last non-gap positions
329	for (min_end = get_sequence_length()-1; min_end >= 0; --min_end) {
330	AP_PROTEINS both = AP_PROTEINS(pf[min_end]\|pp[min_end]);
331	if (both != APP_GAP) { // last non-gap found
332	if (pf[min_end] != APP_GAP) { // occurred in full sequence
333	for (; min_end >= 0; --min_end) { // search same in partial sequence
334	if (pp[min_end] != APP_GAP) break;
335	}
336	}
337	else {
338	ap_assert(pp[min_end] != APP_GAP); // occurred in partial sequence
339	for (; min_end >= 0; --min_end) { // search same in full sequence
340	if (pf[min_end] != APP_GAP) break;
341	}
342	}
343	break;
344	}
345	}
346
347	long penalty = 0;
348	long overlap = 0;
349
350	if (min_end >= 0) {
351	long max_start; // maximum of both first non-gap positions
352	for (max_start = 0; max_start <= min_end; ++max_start) {
353	AP_PROTEINS both = AP_PROTEINS(pf[max_start]\|pp[max_start]);
354	if (both != APP_GAP) { // first non-gap found
355	if (pf[max_start] != APP_GAP) { // occurred in full sequence
356	for (; max_start <= min_end; ++max_start) { // search same in partial
357	if (pp[max_start] != APP_GAP) break;
358	}
359	}
360	else {
361	ap_assert(pp[max_start] != APP_GAP); // occurred in partial sequence
362	for (; max_start <= min_end; ++max_start) { // search same in full
363	if (pf[max_start] != APP_GAP) break;
364	}
365	}
366	break;
367	}
368	}
369
370	if (max_start <= min_end) { // if sequences overlap
371	for (long idx = max_start; idx <= min_end; ++idx) {
372	if ((pf[idx]&pp[idx]) == 0) { // bases are distinct (aka mismatch)
373	int mutations;
374	if ((pf[idx]\|pp[idx])&APP_GAP) { // one is a gap
375	mutations = 3;
376	}
377	else {
378	mutations = INT_MAX;
379	for (int t1 = 0; t1<PROTEINS_TO_TEST && mutations>1; ++t1) { // with all proteins to test
380	if (pf[idx] & prot2test[t1]) { // if protein is contained in subtree
381	for (int t2 = 0; t2<PROTEINS_TO_TEST; ++t2) {
382	if (pp[idx] & prot2test[t2]) {
383	int mut = prot_mindist[t1][t2];
384	if (mut<mutations) {
385	mutations = mut;
386	if (mutations < 2) break; // minimum reached -- abort
387	}
388	}
389	}
390	}
391	}
392	}
393	penalty += weights->weight(idx)*mutations;
394	}
395	}
396	overlap = (min_end-max_start+1)*3;
397	}
398	}
399
400	*overlapPtr = overlap;
401	*penaltyPtr = penalty;
402	}
403
404	AP_FLOAT AP_sequence_protein::count_weighted_bases() const {
405	AP_FLOAT wcount;
406	const AP_PROTEINS *sequence = get_sequence();
407
408	if (!sequence) wcount = -1.0;
409	else {
410	long sum = 0;
411	size_t sequence_len = get_sequence_length();
412
413	const AP_weights *weights = get_weights();
414
415	for (size_t idx = 0; idx<sequence_len; ++idx) {
416	if (sequence[idx] != APP_GAP) {
417	sum += weights->weight(idx);
418	}
419	}
420	wcount = sum;
421	}
422	return wcount;
423	}
424
425
426
427
428

Note: See TracBrowser for help on using the repository browser.

Download in other formats: