Context Navigation

source: tags/ms_r16q3/SL/SEQUENCE/AP_seq_protein.cxx

Visit:

Last change on this file was 13625, checked in by westram, 10 years ago
reintegrates 'pars' into 'trunk' fixes #528, #609, #620, #631, #633, #640, #641 implements #619, #627 adds: log:branches/addtest@13123:13260,13523:13551,13570 log:branches/check@13365:13375,13450:13457 log:branches/check2@13458:13465 log:branches/fix@13522:13588 log:branches/fixres@13270:13277,13279:13293,13295:13326 log:branches/pars@12938:13280,13282:13325,13327:13387,13389:13527,13530:13624
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 21.6 KB

Line
1	#include "AP_seq_protein.hxx"
2	#include <AP_pro_a_nucs.hxx>
3
4	#include <AP_filter.hxx>
5	#include <ARB_Tree.hxx>
6
7	#include <arb_str.h>
8	#include <climits>
9
10	inline bool hasGap(AP_PROTEINS c) { return c & APP_GAP; }
11	inline bool isGap(AP_PROTEINS c) { return c == APP_GAP; }
12
13	inline bool notHasGap(AP_PROTEINS c) { return !hasGap(c); }
14	inline bool notIsGap(AP_PROTEINS c) { return !isGap(c); }
15
16	// #define ap_assert(bed) arb_assert(bed)
17
18	AP_sequence_protein::AP_sequence_protein(const AliView *aliview)
19	: AP_sequence(aliview),
20	seq_prot(NULL),
21	mut1(NULL),
22	mut2(NULL)
23	{
24	}
25
26	AP_sequence_protein::~AP_sequence_protein() {
27	delete [] mut2;
28	delete [] mut1;
29	delete [] seq_prot;
30	}
31
32	AP_sequence *AP_sequence_protein::dup() const {
33	return new AP_sequence_protein(get_aliview());
34	}
35
36	static AP_PROTEINS prot2AP_PROTEIN[26] = {
37	APP_A,
38	APP_B,
39	APP_C,
40	APP_D,
41	APP_E,
42	APP_F,
43	APP_G,
44	APP_H,
45	APP_I,
46	APP_ILLEGAL, // J
47	APP_K,
48	APP_L,
49	APP_M,
50	APP_N,
51	APP_ILLEGAL, // O
52	APP_P,
53	APP_Q,
54	APP_R,
55	APP_S,
56	APP_T,
57	APP_ILLEGAL, // U
58	APP_V,
59	APP_W,
60	APP_X,
61	APP_Y,
62	APP_Z
63	};
64
65	#define PROTEINS_TO_TEST 22 // 26 plus gap and star, minus 3 illegal, 'X', 'B' and 'Z'
66
67	static AP_PROTEINS prot2test[PROTEINS_TO_TEST] = { // uses same indexing as prot_idx
68	APP_STAR,
69	APP_A,
70	APP_C,
71	APP_D,
72	APP_E,
73	APP_F,
74	APP_G,
75	APP_H,
76	APP_I,
77	APP_K,
78	APP_L,
79	APP_M,
80	APP_N,
81	APP_P,
82	APP_Q,
83	APP_R,
84	APP_S,
85	APP_T,
86	APP_V,
87	APP_W,
88	APP_Y,
89	APP_GAP
90	};
91
92	static int prot_idx[PROTEINS_TO_TEST] = { // uses same indexing as prot2test
93	// contains indexes for 'AWT_distance_meter->dist'
94	0, // *
95	1, // A
96	3, // C
97	4, // D
98	5, // E
99	6, // F
100	7, // G
101	8, // H
102	9, // I
103	10, // K
104	11, // L
105	12, // M
106	13, // N
107	14, // P
108	15, // Q
109	16, // R
110	17, // S
111	18, // T
112	19, // V
113	20, // W
114	21, // Y
115	23 // gap
116	};
117
118	inline const char *readable_combined_protein(AP_PROTEINS p) {
119	if (p == APP_X) { return "X"; }
120	if (p == APP_DOT) { return "."; }
121
122	static char buffer[PROTEINS_TO_TEST+1];
123	memset(buffer, 0, PROTEINS_TO_TEST+1);
124	char *bp = buffer;
125	const char readable = "ACDEFGHIKLMNPQRSTVWY-"; // same index as prot2test
126
127	for (int b = 0; b<PROTEINS_TO_TEST; ++b) {
128	if (p&prot2test[b]) {
129	*bp++ = readable[b];
130	}
131	}
132	return buffer;
133	}
134
135	static char prot_mindist[PROTEINS_TO_TEST][PROTEINS_TO_TEST];
136	static int min_mutations_initialized_for_codenr = -1;
137
138	// OMA = one mutation away
139	// (speedup for huge table is approx. 4-7%)
140	#define OMA_SLOW_LOWMEM
141
142	#if defined(ASSERTION_USED) && 0
143	#define OMA_DOUBLE_CHECK
144	#endif
145
146	#if defined(OMA_DOUBLE_CHECK)
147	# define IMPL_OMA_SLOW_LOWMEM
148	# define IMPL_OMA_FAST_BIGMEM
149	#else
150	# if defined(OMA_SLOW_LOWMEM)
151	# define IMPL_OMA_SLOW_LOWMEM
152	# else
153	# define IMPL_OMA_FAST_BIGMEM
154	# endif
155	#endif
156
157	STATIC_ASSERT(APP_MAX == 4194303);
158	STATIC_ASSERT(sizeof(AP_PROTEINS) == 4);
159
160	#if defined(IMPL_OMA_FAST_BIGMEM)
161
162	static AP_PROTEINS one_mutation_away[APP_MAX+1]; // contains all proteins that are <= 1 nuc-mutations away from protein-combination used as index
163	STATIC_ASSERT(sizeof(one_mutation_away) == 16777216); // ~ 16Mb
164
165	#endif
166
167	#if defined(IMPL_OMA_SLOW_LOWMEM)
168
169	static AP_PROTEINS one_mutation_away_0_7[256];
170	static AP_PROTEINS one_mutation_away_8_15[256];
171	static AP_PROTEINS one_mutation_away_16_23[256];
172
173	inline AP_PROTEINS calcOneMutationAway(AP_PROTEINS p) {
174	return AP_PROTEINS(one_mutation_away_0_7 [ p & 0xff] \|
175	one_mutation_away_8_15 [(p>>8) & 0xff] \|
176	one_mutation_away_16_23[(p>>16) & 0xff]);
177	}
178
179	#endif
180
181
182	inline AP_PROTEINS oneMutationAway(AP_PROTEINS p) {
183	#if defined(OMA_SLOW_LOWMEM)
184	return calcOneMutationAway(p);
185	#else
186	return one_mutation_away[p];
187	#endif
188	}
189
190	static void update_min_mutations(int code_nr, const AWT_distance_meter *distance_meter) {
191	if (min_mutations_initialized_for_codenr != code_nr) {
192	for (int d = 0; d<PROTEINS_TO_TEST; ++d) {
193	int D = prot_idx[d];
194	int D_bit = 1<<D;
195
196	for (int s = 0; s<PROTEINS_TO_TEST; ++s) {
197	const AWT_PDP *dist = distance_meter->getDistance(prot_idx[s]);
198
199	int i;
200	for (i = 0; i<3; ++i) {
201	if (dist->patd[i] & D_bit) break;
202	}
203
204	prot_mindist[s][d] = char(i);
205	}
206	}
207
208
209	#if defined(IMPL_OMA_FAST_BIGMEM)
210	memset(one_mutation_away, 0, sizeof(one_mutation_away));
211	#endif
212	#if defined(IMPL_OMA_SLOW_LOWMEM)
213	memset(one_mutation_away_0_7, 0, sizeof(one_mutation_away_0_7));
214	memset(one_mutation_away_8_15, 0, sizeof(one_mutation_away_8_15));
215	memset(one_mutation_away_16_23, 0, sizeof(one_mutation_away_16_23));
216	#endif
217
218	for (int s = 0; s<PROTEINS_TO_TEST; ++s) {
219	AP_PROTEINS oma = APP_ILLEGAL;
220	for (int d = 0; d<PROTEINS_TO_TEST; ++d) {
221	if (prot_mindist[s][d] == 1) {
222	oma = AP_PROTEINS(oma\|prot2test[d]);
223	}
224	}
225
226	AP_PROTEINS source = prot2test[s];
227	oma = AP_PROTEINS(oma\|source);
228
229	#if defined(IMPL_OMA_FAST_BIGMEM)
230	one_mutation_away[source] = oma;
231	#endif
232	#if defined(IMPL_OMA_SLOW_LOWMEM)
233	uint32_t idx = source & 0xff; if (idx) one_mutation_away_0_7 [idx] = oma;
234	idx = (source>>8) & 0xff; if (idx) one_mutation_away_8_15 [idx] = oma;
235	idx = (source>>16) & 0xff; if (idx) one_mutation_away_16_23[idx] = oma;
236	#endif
237	}
238
239	#if defined(IMPL_OMA_FAST_BIGMEM)
240	for (size_t i = 0; i<=APP_MAX; ++i) {
241	if (one_mutation_away[i] == APP_ILLEGAL) {
242	size_t j = i;
243	size_t b = 1;
244	AP_PROTEINS oma = APP_ILLEGAL;
245
246	while (j) {
247	if (j&1) oma = AP_PROTEINS(oma\|one_mutation_away[b]);
248	j >>= 1;
249	b <<= 1;
250	}
251
252	one_mutation_away[i] = oma;
253	}
254	}
255	#endif
256	#if defined(IMPL_OMA_SLOW_LOWMEM)
257	for (int s = 0; s<8; s++) {
258	int b = 1<<s;
259	for (int i=b+1; i<256; i++) {
260	if (i & b) {
261	one_mutation_away_0_7[i] = AP_PROTEINS(one_mutation_away_0_7[i] \| one_mutation_away_0_7[b]);
262	one_mutation_away_8_15[i] = AP_PROTEINS(one_mutation_away_8_15[i] \| one_mutation_away_8_15[b]);
263	one_mutation_away_16_23[i] = AP_PROTEINS(one_mutation_away_16_23[i] \| one_mutation_away_16_23[b]);
264	}
265	}
266	}
267	#endif
268
269	#if defined(IMPL_OMA_FAST_BIGMEM) && defined(DEBUG)
270	for (size_t i = 0; i<=APP_MAX; ++i) {
271	if (one_mutation_away[i] == 0) {
272	fprintf(stderr, "oma[%s] is zero\n", readable_combined_protein(AP_PROTEINS(i)));
273	}
274	}
275	for (size_t i = 0; i<=APP_MAX; ++i) {
276	AP_PROTEINS two_mutations_away = one_mutation_away[one_mutation_away[i]];
277	bool gap = hasGap(AP_PROTEINS(i));
278	if ((!gap && two_mutations_away != APP_X) \|\| (gap && two_mutations_away != APP_DOT)) {
279	// reached for a few amino-acid-combinations: C, F, C\|F, K, M, K\|M
280	// and for APP_ILLEGAL and APP_GAP as below for 3 mutations
281	fprintf(stderr, "tma[%s]", readable_combined_protein(AP_PROTEINS(i)));
282	fprintf(stderr, "=%s\n", readable_combined_protein(two_mutations_away));
283	}
284	}
285	for (size_t i = 0; i<=APP_MAX; ++i) {
286	AP_PROTEINS three_mutations_away = one_mutation_away[one_mutation_away[one_mutation_away[i]]];
287	bool gap = hasGap(AP_PROTEINS(i));
288	if ((!gap && three_mutations_away != APP_X) \|\| (gap && three_mutations_away != APP_DOT)) {
289	// only reached for i==APP_ILLEGAL and i==APP_GAP (result is wrong for latter)
290	fprintf(stderr, "3ma[%s]", readable_combined_protein(AP_PROTEINS(i)));
291	fprintf(stderr, "=%s\n", readable_combined_protein(three_mutations_away));
292	}
293	}
294	#endif
295
296	#if defined(OMA_DOUBLE_CHECK)
297	for (size_t i = 0; i<=APP_MAX; ++i) {
298	AP_PROTEINS p = AP_PROTEINS(i);
299	ap_assert(calcOneMutationAway(p) == one_mutation_away[p]);
300	}
301	#endif
302
303	min_mutations_initialized_for_codenr = code_nr;
304	}
305	}
306
307
308	#if defined(DEBUG)
309	// #define SHOW_SEQ
310	#endif // DEBUG
311
312	void AP_sequence_protein::set(const char *isequence) {
313	AWT_translator *translator = AWT_get_user_translator(get_aliview()->get_gb_main());
314	update_min_mutations(translator->CodeNr(), translator->getDistanceMeter());
315
316	size_t sequence_len = get_sequence_length();
317
318	seq_prot = new AP_PROTEINS[sequence_len+1];
319	mut1 = new AP_PROTEINS[sequence_len+1];
320	mut2 = new AP_PROTEINS[sequence_len+1];
321
322	ap_assert(!get_filter()->does_bootstrap()); // bootstrapping not implemented for protein parsimony
323
324	const AP_filter *filt = get_filter();
325	const uchar *simplify = filt->get_simplify_table();
326	int left_bases = sequence_len;
327	long filter_len = filt->get_length();
328
329	ap_assert(filt);
330
331	size_t oidx = 0; // index for output sequence
332
333	// check if initialized for correct instance of translator:
334	ap_assert(min_mutations_initialized_for_codenr == AWT_get_user_translator()->CodeNr());
335
336	for (int idx = 0; idx<filter_len && left_bases; ++idx) {
337	if (filt->use_position(idx)) {
338	char c = toupper(simplify[safeCharIndex(isequence[idx])]);
339	AP_PROTEINS p = APP_ILLEGAL;
340
341	#if defined(SHOW_SEQ)
342	fputc(c, stdout);
343	#endif // SHOW_SEQ
344
345	if (c >= 'A' && c <= 'Z') p = prot2AP_PROTEIN[c-'A'];
346	else if (c == '-') p = APP_GAP;
347	else if (c == '.') p = APP_DOT;
348	else if (c == '*') p = APP_STAR;
349
350	if (p == APP_ILLEGAL) {
351	GB_warning(GBS_global_string("Invalid sequence character '%c' replaced by dot", c));
352	p = APP_DOT;
353	}
354
355	seq_prot[oidx] = p;
356	mut1[oidx] = oneMutationAway(p);
357	mut2[oidx] = oneMutationAway(mut1[oidx]);
358
359	++oidx;
360	--left_bases;
361	}
362	}
363
364	ap_assert(oidx == sequence_len);
365	seq_prot[sequence_len] = APP_ILLEGAL;
366
367	#if defined(SHOW_SEQ)
368	fputc('\n', stdout);
369	#endif // SHOW_SEQ
370
371	mark_sequence_set(true);
372	}
373
374	void AP_sequence_protein::unset() {
375	delete [] mut2; mut2 = 0;
376	delete [] mut1; mut1 = 0;
377	delete [] seq_prot; seq_prot = 0;
378	mark_sequence_set(false);
379	}
380
381	AP_FLOAT AP_sequence_protein::combine(const AP_sequence lefts, const AP_sequence rights, char *mutation_per_site) {
382	// Note: changes done here should also be be applied to AP_seq_dna.cxx@combine_impl
383
384	// now uses same algorithm as done till [877]
385
386	const AP_sequence_protein left = DOWNCAST(const AP_sequence_protein , lefts);
387	const AP_sequence_protein right = DOWNCAST(const AP_sequence_protein , rights);
388
389	size_t sequence_len = get_sequence_length();
390	if (!seq_prot) {
391	seq_prot = new AP_PROTEINS[sequence_len + 1];
392	mut1 = new AP_PROTEINS[sequence_len + 1];
393	mut2 = new AP_PROTEINS[sequence_len + 1];
394	}
395
396	const AP_PROTEINS *p1 = left->get_sequence();
397	const AP_PROTEINS *p2 = right->get_sequence();
398
399	const AP_PROTEINS *mut11 = left->get_mut1();
400	const AP_PROTEINS *mut21 = left->get_mut2();
401	const AP_PROTEINS *mut12 = right->get_mut1();
402	const AP_PROTEINS *mut22 = right->get_mut2();
403
404	AP_PROTEINS *p = seq_prot;
405	const AP_weights *weights = get_weights();
406	char *mutpsite = mutation_per_site;
407
408	long result = 0;
409	// check if initialized for correct instance of translator:
410	ap_assert(min_mutations_initialized_for_codenr == AWT_get_user_translator()->CodeNr());
411
412	for (size_t idx = 0; idx<sequence_len; ++idx) {
413	AP_PROTEINS c1 = p1[idx];
414	AP_PROTEINS c2 = p2[idx];
415
416	AP_PROTEINS onemut1 = mut11[idx];
417	AP_PROTEINS onemut2 = mut12[idx];
418	AP_PROTEINS twomut1 = mut21[idx];
419	AP_PROTEINS twomut2 = mut22[idx];
420
421	ap_assert(c1 != APP_ILLEGAL);
422	ap_assert(c2 != APP_ILLEGAL);
423
424	AP_PROTEINS contained_in_both = AP_PROTEINS(c1 & c2);
425	AP_PROTEINS contained_in_any = AP_PROTEINS(c1 \| c2);
426
427	AP_PROTEINS reachable_from_both_with_1_mut = AP_PROTEINS(onemut1 & onemut2);
428	AP_PROTEINS reachable_from_both_with_2_mut = AP_PROTEINS(twomut1 & twomut2);
429
430	AP_PROTEINS max_cost_1 = AP_PROTEINS(contained_in_any & reachable_from_both_with_1_mut);
431	AP_PROTEINS max_cost_2 = AP_PROTEINS((contained_in_any & reachable_from_both_with_2_mut) \| reachable_from_both_with_1_mut);
432
433	if (contained_in_both) { // there are common proteins
434	p[idx] = contained_in_both; // store common proteins for both subtrees
435	mut1[idx] = max_cost_1;
436	mut2[idx] = max_cost_2;
437	}
438	else { // proteins are distinct
439	int mutations = INT_MAX;
440
441	AP_PROTEINS reachable_from_both_with_3_mut = AP_PROTEINS((onemut1 & twomut2) \| (onemut2 & twomut1)); // one with 1 mutation, other with 2 mutations
442
443	AP_PROTEINS max_cost_3 = AP_PROTEINS(contained_in_any // = one w/o mutations, other with 3 mutations (=anything, i.e. & APP_DOT, skipped)
444	\| reachable_from_both_with_3_mut);
445
446	if (max_cost_1) {
447	// some proteins can be reached from both subtrees with 1 mutation
448	mutations = 1;
449	p[idx] = max_cost_1;
450	mut1[idx] = max_cost_2;
451	mut2[idx] = max_cost_3;
452	}
453	else {
454	AP_PROTEINS reachable_from_any_with_1_mut = AP_PROTEINS(onemut1 \| onemut2);
455
456	AP_PROTEINS max_cost_4 = AP_PROTEINS(reachable_from_any_with_1_mut // one with 1 mutation, other with 3 mutations (=anything, i.e. & APP_DOT, skipped)
457	\| reachable_from_both_with_2_mut);
458	if (max_cost_2) {
459	// some proteins can be reached from both subtrees with 2 mutations
460	mutations = 2;
461	p[idx] = max_cost_2;
462	mut1[idx] = max_cost_3;
463	mut2[idx] = max_cost_4;
464	}
465	else {
466	ap_assert(max_cost_3);
467	AP_PROTEINS reachable_from_any_with_2_mut = AP_PROTEINS(twomut1 \| twomut2);
468
469	mutations = 3;
470	p[idx] = max_cost_3;
471	mut1[idx] = max_cost_4;
472	mut2[idx] = reachable_from_any_with_2_mut; // one with 2 mutations, other with 3 mutations
473	}
474	}
475	ap_assert(mutations >= 1 && mutations <= 3);
476
477	if (mutpsite) mutpsite[idx] += mutations; // count mutations per site (unweighted)
478	result += mutations * weights->weight(idx); // count weighted or simple
479
480	}
481
482	AP_PROTEINS calc_mut1 = oneMutationAway(p[idx]);
483	mut1[idx] = AP_PROTEINS(mut1[idx] \| calc_mut1);
484	AP_PROTEINS calc_mut2 = oneMutationAway(mut1[idx]);
485	mut2[idx] = AP_PROTEINS(mut2[idx] \| calc_mut2);
486	}
487
488	#if defined(DEBUG) && 0
489	#define P1 75
490	#define P2 90
491	printf("Seq1: ");
492	for (long idx = P1; idx <= P2; ++idx) printf("%3i ", p1[idx]);
493	printf("\nSeq2: ");
494	for (long idx = P1; idx <= P2; ++idx) printf("%3i ", p2[idx]);
495	printf("\nCombine value: %f\n", float(result));
496	#undef P1
497	#undef P2
498	#endif // DEBUG
499
500	#if defined(DEBUG) && 0
501	printf("\nCombine value: %f\n", float(result));
502	#endif // DEBUG
503
504	inc_combine_count();
505	mark_sequence_set(true);
506
507	ap_assert(result >= 0.0);
508	return (AP_FLOAT)result;
509	}
510
511	void AP_sequence_protein::partial_match(const AP_sequence* part_, long overlapPtr, long penaltyPtr) const {
512	// matches the partial sequences 'part_' against 'this'
513	// '*penaltyPtr' is set to the number of mismatches (possibly weighted)
514	// '*overlapPtr' is set to the number of base positions both sequences overlap
515	// example:
516	// fullseq 'XXX---XXX' 'XXX---XXX'
517	// partialseq '-XX---XX-' '---XXX---'
518	// overlap 7 3
519	//
520	// algorithm is similar to AP_sequence_protein::combine()
521	// Note: changes done here should also be be applied to AP_seq_dna.cxx@partial_match_impl
522
523	const AP_sequence_protein part = (const AP_sequence_protein )part_;
524
525	const AP_PROTEINS *pf = get_sequence();
526	const AP_PROTEINS *pp = part->get_sequence();
527
528	const AP_weights *weights = get_weights();
529
530	long min_end; // minimum of both last non-gap positions
531	for (min_end = get_sequence_length()-1; min_end >= 0; --min_end) {
532	AP_PROTEINS both = AP_PROTEINS(pf[min_end]\|pp[min_end]);
533	if (notHasGap(both)) { // last non-gap found
534	if (notHasGap(pf[min_end])) { // occurred in full sequence
535	for (; min_end >= 0; --min_end) { // search same in partial sequence
536	if (notHasGap(pp[min_end])) break;
537	}
538	}
539	else {
540	ap_assert(notHasGap(pp[min_end])); // occurred in partial sequence
541	for (; min_end >= 0; --min_end) { // search same in full sequence
542	if (notHasGap(pf[min_end])) break;
543	}
544	}
545	break;
546	}
547	}
548
549	long penalty = 0;
550	long overlap = 0;
551
552	if (min_end >= 0) {
553	long max_start; // maximum of both first non-gap positions
554	for (max_start = 0; max_start <= min_end; ++max_start) {
555	AP_PROTEINS both = AP_PROTEINS(pf[max_start]\|pp[max_start]);
556	if (notHasGap(both)) { // first non-gap found
557	if (notHasGap(pf[max_start])) { // occurred in full sequence
558	for (; max_start <= min_end; ++max_start) { // search same in partial
559	if (notHasGap(pp[max_start])) break;
560	}
561	}
562	else {
563	ap_assert(notHasGap(pp[max_start])); // occurred in partial sequence
564	for (; max_start <= min_end; ++max_start) { // search same in full
565	if (notHasGap(pf[max_start])) break;
566	}
567	}
568	break;
569	}
570	}
571
572	if (max_start <= min_end) { // if sequences overlap
573	for (long idx = max_start; idx <= min_end; ++idx) {
574	if ((pf[idx]&pp[idx]) == 0) { // bases are distinct (aka mismatch)
575	int mutations;
576	if (hasGap(AP_PROTEINS(pf[idx]\|pp[idx]))) { // one is a gap
577	mutations = 3;
578	}
579	else {
580	mutations = INT_MAX;
581	for (int t1 = 0; t1<PROTEINS_TO_TEST && mutations>1; ++t1) { // with all proteins to test
582	if (pf[idx] & prot2test[t1]) { // if protein is contained in subtree
583	for (int t2 = 0; t2<PROTEINS_TO_TEST; ++t2) {
584	if (pp[idx] & prot2test[t2]) {
585	int mut = prot_mindist[t1][t2];
586	if (mut<mutations) {
587	mutations = mut;
588	if (mutations < 2) break; // minimum reached -- abort
589	}
590	}
591	}
592	}
593	}
594	}
595	penalty += weights->weight(idx)*mutations;
596	}
597	}
598	overlap = (min_end-max_start+1)*3;
599	}
600	}
601
602	*overlapPtr = overlap;
603	*penaltyPtr = penalty;
604	}
605
606	AP_FLOAT AP_sequence_protein::count_weighted_bases() const {
607	AP_FLOAT wcount;
608	const AP_PROTEINS *sequence = get_sequence();
609
610	if (!sequence) wcount = -1.0;
611	else {
612	long sum = 0;
613	size_t sequence_len = get_sequence_length();
614
615	const AP_weights *weights = get_weights();
616
617	for (size_t idx = 0; idx<sequence_len; ++idx) {
618	if (notHasGap(sequence[idx])) {
619	sum += weights->weight(idx) * 2.0;
620	}
621	else if /has gap but / (notIsGap(sequence[idx])) {
622	sum += weights->weight(idx);
623	}
624	}
625	wcount = sum * 0.5;
626	}
627	return wcount;
628	}
629
630	uint32_t AP_sequence_protein::checksum() const {
631	const AP_PROTEINS *seq = get_sequence();
632	return GB_checksum(reinterpret_cast<const char >(seq), sizeof(seq)*get_sequence_length(), 0, NULL);
633	}
634
635	bool AP_sequence_protein::equals(const AP_sequence_protein *other) const {
636	const AP_PROTEINS *seq = get_sequence();
637	const AP_PROTEINS *oseq = other->get_sequence();
638
639	size_t len = get_sequence_length();
640	for (size_t p = 0; p<len; ++p) {
641	if (seq[p] != oseq[p]) return false;
642	}
643	return true;
644	}
645
646
647
648
649

Note: See TracBrowser for help on using the repository browser.

Download in other formats: