1 | // =============================================================== // |
---|
2 | // // |
---|
3 | // File : ClustalV.cxx // |
---|
4 | // Purpose : // |
---|
5 | // // |
---|
6 | // Coded based on ClustalV by Ralf Westram (coder@reallysoft.de) // |
---|
7 | // Institute of Microbiology (Technical University Munich) // |
---|
8 | // http://www.arb-home.de/ // |
---|
9 | // // |
---|
10 | // =============================================================== // |
---|
11 | |
---|
12 | #include "ClustalV.hxx" |
---|
13 | #include "seq_search.hxx" |
---|
14 | |
---|
15 | #include <cctype> |
---|
16 | #include <climits> |
---|
17 | |
---|
18 | |
---|
19 | // ---------------------------------------------------------------- |
---|
20 | |
---|
21 | #define MASTER_GAP_OPEN 50 |
---|
22 | #define CHEAP_GAP_OPEN 20 // penalty for creating a gap (if we have gaps in master) |
---|
23 | |
---|
24 | #define DEFAULT_GAP_OPEN 30 // penalty for creating a gap |
---|
25 | |
---|
26 | #define MASTER_GAP_EXTEND 18 |
---|
27 | #define CHEAP_GAP_EXTEND 5 // penalty for extending a gap (if we have gaps in master) |
---|
28 | |
---|
29 | #define DEFAULT_GAP_EXTEND 10 // penalty for extending a gap |
---|
30 | |
---|
31 | #define DEFAULT_IMPROBABLY_MUTATION 10 // penalty for mutations |
---|
32 | #define DEFAULT_PROBABLY_MUTATION 4 // penalty for special mutations (A<->G,C<->U/T) (only if 'weighted') |
---|
33 | |
---|
34 | #define DYNAMIC_PENALTIES // otherwise you get FIXED PENALTIES (=no cheap penalties) |
---|
35 | |
---|
36 | // ---------------------------------------------------------------- |
---|
37 | |
---|
38 | #define MAX_GAP_OPEN_DISCOUNT (DEFAULT_GAP_OPEN-CHEAP_GAP_OPEN) // maximum subtracted from DEFAULT_GAP_OPEN |
---|
39 | #define MAX_GAP_EXTEND_DISCOUNT (DEFAULT_GAP_EXTEND-CHEAP_GAP_EXTEND) // maximum subtracted from DEFAULT_GAP_EXTEND |
---|
40 | |
---|
41 | #define MAXN 2 // Maximum number of sequences (both groups) |
---|
42 | |
---|
43 | static bool module_initialized = false; |
---|
44 | |
---|
45 | typedef int Boolean; |
---|
46 | |
---|
47 | static Boolean dnaflag; |
---|
48 | static Boolean is_weight; |
---|
49 | |
---|
50 | #define MAX_BASETYPES 21 |
---|
51 | |
---|
52 | static int xover; |
---|
53 | static int little_pam; |
---|
54 | static int big_pam; |
---|
55 | static int pamo[(MAX_BASETYPES-1)*MAX_BASETYPES/2]; |
---|
56 | static int pam[MAX_BASETYPES][MAX_BASETYPES]; |
---|
57 | |
---|
58 | static int pos1; |
---|
59 | static int pos2; |
---|
60 | static int **naa1; // naa1[basetype][position] counts bases for each position of all sequences in group1 |
---|
61 | static int **naa2; // naa2[basetype][position] same for group2 |
---|
62 | static int **naas; // |
---|
63 | static int seqlen_array[MAXN+1]; // length of all sequences |
---|
64 | static unsigned char *seq_array[MAXN+1]; // the sequences |
---|
65 | static int group[MAXN+1]; // group of sequence |
---|
66 | static int alist[MAXN+1]; // indices of sequences to be aligned |
---|
67 | static int fst_list[MAXN+1]; |
---|
68 | static int snd_list[MAXN+1]; |
---|
69 | |
---|
70 | static int nseqs; // # of sequences |
---|
71 | static int weights[MAX_BASETYPES][MAX_BASETYPES]; // weights[b1][b2] : penalty for mutation from base 'b1' to base 'b2' |
---|
72 | |
---|
73 | #if defined(ASSERTION_USED) |
---|
74 | static size_t displ_size = 0; |
---|
75 | #endif // ASSERTION_USED |
---|
76 | |
---|
77 | static int *displ; // displ == 0 -> base in both , displ<0 -> displ gaps in slave, displ>0 -> displ gaps in master |
---|
78 | static int *zza; // column (left->right) of align matrix (minimum of all paths to this matrix element) |
---|
79 | static int *zzb; // -------------- " ------------------- (minimum of all paths, where gap inserted into slave) |
---|
80 | static int *zzc; // column (left<-right) of align matrix (minimum of all paths to this matrix element) |
---|
81 | static int *zzd; // -------------- " ------------------- (minimum of all paths, where gap inserted into slave) |
---|
82 | static int print_ptr; |
---|
83 | static int last_print; |
---|
84 | |
---|
85 | static const int *gaps_before_position; |
---|
86 | |
---|
87 | #if defined(DEBUG) |
---|
88 | // #define MATRIX_DUMP |
---|
89 | // #define DISPLAY_DIFF |
---|
90 | #endif // DEBUG |
---|
91 | |
---|
92 | #ifdef MATRIX_DUMP |
---|
93 | # define IF_MATRIX_DUMP(xxx) xxx |
---|
94 | # define DISPLAY_MATRIX_SIZE 3000 |
---|
95 | |
---|
96 | static int vertical [DISPLAY_MATRIX_SIZE+2][DISPLAY_MATRIX_SIZE+2]; |
---|
97 | static int verticalOpen [DISPLAY_MATRIX_SIZE+2][DISPLAY_MATRIX_SIZE+2]; |
---|
98 | static int diagonal [DISPLAY_MATRIX_SIZE+2][DISPLAY_MATRIX_SIZE+2]; |
---|
99 | static int horizontal [DISPLAY_MATRIX_SIZE+2][DISPLAY_MATRIX_SIZE+2]; |
---|
100 | static int horizontalOpen [DISPLAY_MATRIX_SIZE+2][DISPLAY_MATRIX_SIZE+2]; |
---|
101 | |
---|
102 | #else |
---|
103 | # define IF_MATRIX_DUMP(xxx) |
---|
104 | #endif |
---|
105 | |
---|
106 | inline int master_gap_open(int beforePosition) { |
---|
107 | #ifdef DYNAMIC_PENALTIES |
---|
108 | long gaps = gaps_before_position[beforePosition-1]; |
---|
109 | return gaps ? MASTER_GAP_OPEN - MAX_GAP_OPEN_DISCOUNT : MASTER_GAP_OPEN; |
---|
110 | |
---|
111 | /* return |
---|
112 | gaps >= MAX_GAP_OPEN_DISCOUNT |
---|
113 | ? DEFAULT_GAP_OPEN-MAX_GAP_OPEN_DISCOUNT |
---|
114 | : DEFAULT_GAP_OPEN-gaps; */ |
---|
115 | #else |
---|
116 | return DEFAULT_GAP_OPEN; |
---|
117 | #endif |
---|
118 | } |
---|
119 | inline int master_gap_extend(int beforePosition) { |
---|
120 | #ifdef DYNAMIC_PENALTIES |
---|
121 | long gaps = gaps_before_position[beforePosition-1]; |
---|
122 | return gaps ? MASTER_GAP_EXTEND - MAX_GAP_EXTEND_DISCOUNT : MASTER_GAP_EXTEND; |
---|
123 | /* return |
---|
124 | gaps >= MAX_GAP_EXTEND_DISCOUNT |
---|
125 | ? DEFAULT_GAP_EXTEND-MAX_GAP_EXTEND_DISCOUNT |
---|
126 | : DEFAULT_GAP_EXTEND-gaps; */ |
---|
127 | #else |
---|
128 | return DEFAULT_GAP_EXTEND; |
---|
129 | #endif |
---|
130 | } |
---|
131 | |
---|
132 | inline int master_gapAtWithOpenPenalty(int atPosition, int length, int penalty) { |
---|
133 | if (length<=0) return 0; |
---|
134 | |
---|
135 | int beforePosition = atPosition; |
---|
136 | int afterPosition = atPosition-1; |
---|
137 | |
---|
138 | while (length--) { |
---|
139 | int p1 = master_gap_extend(beforePosition); |
---|
140 | int p2 = master_gap_extend(afterPosition+1); |
---|
141 | |
---|
142 | if (p1<p2 && beforePosition>1) { |
---|
143 | penalty += p1; |
---|
144 | beforePosition--; |
---|
145 | } |
---|
146 | else { |
---|
147 | penalty += p2; |
---|
148 | afterPosition++; |
---|
149 | } |
---|
150 | } |
---|
151 | |
---|
152 | return penalty; |
---|
153 | } |
---|
154 | |
---|
155 | inline int master_gapAt(int atPosition, int length) { |
---|
156 | return master_gapAtWithOpenPenalty(atPosition, length, master_gap_open(atPosition)); |
---|
157 | } |
---|
158 | |
---|
159 | inline int slave_gap_open(int /* beforePosition */) { |
---|
160 | return DEFAULT_GAP_OPEN; |
---|
161 | } |
---|
162 | |
---|
163 | inline int slave_gap_extend(int /* beforePosition */) { |
---|
164 | return DEFAULT_GAP_EXTEND; |
---|
165 | } |
---|
166 | |
---|
167 | inline int slave_gapAtWithOpenPenalty(int atPosition, int length, int penalty) { |
---|
168 | return length<=0 ? 0 : penalty + length*slave_gap_extend(atPosition); |
---|
169 | } |
---|
170 | |
---|
171 | inline int slave_gapAt(int atPosition, int length) { |
---|
172 | return slave_gapAtWithOpenPenalty(atPosition, length, slave_gap_open(atPosition)); |
---|
173 | } |
---|
174 | |
---|
175 | #define UNKNOWN_ACID 255 |
---|
176 | static const char *amino_acid_order = "XCSTPAGNDEQHRKMILVFYW"; |
---|
177 | |
---|
178 | #define NUCLEIDS 16 |
---|
179 | static const char *nucleic_acid_order = "-ACGTUMRWSYKVHDBN"; |
---|
180 | static const char *nucleic_maybe_A = "-A----AAA---AAA-A"; |
---|
181 | static const char *nucleic_maybe_C = "--C---C--CC-CC-CC"; |
---|
182 | static const char *nucleic_maybe_G = "---G---G-G-GG-GGG"; |
---|
183 | static const char *nucleic_maybe_T = "----T---T-TT-TTTT"; |
---|
184 | static const char *nucleic_maybe_U = "-----U--U-UU-UUUU"; |
---|
185 | static const char *nucleic_maybe[6] = { NULp, nucleic_maybe_A, nucleic_maybe_C, nucleic_maybe_G, nucleic_maybe_T, nucleic_maybe_U }; |
---|
186 | |
---|
187 | /* |
---|
188 | * M = A or C S = G or C V = A or G or C N = A or C or G or T |
---|
189 | * R = A or G Y = C or T H = A or C or T |
---|
190 | * W = A or T K = G or T D = A or G or T |
---|
191 | */ |
---|
192 | |
---|
193 | #define cheap_if(cond) ((cond) ? 1 : 2) |
---|
194 | static int baseCmp(unsigned char c1, unsigned char c2) { |
---|
195 | // c1,c2 == 1=A,2=C (==index of character in nucleic_acid_order[]) |
---|
196 | // returns 0 for equal |
---|
197 | // 1 for probably mutations |
---|
198 | // 2 for improbably mutations |
---|
199 | #define COMPARABLE_BASES 5 |
---|
200 | |
---|
201 | if (c1==c2) return 0; |
---|
202 | |
---|
203 | if (c2<c1) { // swap |
---|
204 | int c3 = c1; |
---|
205 | |
---|
206 | c1 = c2; |
---|
207 | c2 = c3; |
---|
208 | } |
---|
209 | |
---|
210 | if (c2<=COMPARABLE_BASES) { |
---|
211 | fa_assert(c1<=COMPARABLE_BASES); |
---|
212 | |
---|
213 | switch (c1) { |
---|
214 | case 0: return 2; |
---|
215 | case 1: return cheap_if(c2==3); // A->G |
---|
216 | case 2: return cheap_if(c2==4 || c2==5); // C->T/U |
---|
217 | case 3: return cheap_if(c2==1); // G->A |
---|
218 | case 4: if (c2==5) return 0; // T->U |
---|
219 | FALLTHROUGH; |
---|
220 | case 5: if (c2==4) return 0; // U->T |
---|
221 | return cheap_if(c2==2); // T/U->C |
---|
222 | default: fa_assert(0); |
---|
223 | } |
---|
224 | } |
---|
225 | |
---|
226 | int bestMatch = 3; |
---|
227 | if (c1<=COMPARABLE_BASES) { |
---|
228 | for (int i=1; i<=COMPARABLE_BASES; i++) { |
---|
229 | if (isalpha(nucleic_maybe[i][c2])) { // 'c2' maybe a 'i' |
---|
230 | int match = baseCmp(c1, i); |
---|
231 | if (match<bestMatch) bestMatch = match; |
---|
232 | } |
---|
233 | } |
---|
234 | } |
---|
235 | else { |
---|
236 | for (int i=1; i<=COMPARABLE_BASES; i++) { |
---|
237 | if (isalpha(nucleic_maybe[i][c1])) { // 'c1' maybe a 'i' |
---|
238 | int match = baseCmp(i, c2); |
---|
239 | if (match<bestMatch) bestMatch = match; |
---|
240 | } |
---|
241 | } |
---|
242 | } |
---|
243 | |
---|
244 | fa_assert(bestMatch>=0 && bestMatch<=2); |
---|
245 | return bestMatch; |
---|
246 | } |
---|
247 | #undef cheap_if |
---|
248 | |
---|
249 | int baseMatch(char c1, char c2) { |
---|
250 | // c1,c2 == ACGUTRS... |
---|
251 | // returns 0 for equal |
---|
252 | // 1 for probably mutations |
---|
253 | // 2 for improbably mutations |
---|
254 | // -1 if one char is illegal |
---|
255 | |
---|
256 | const char *p1 = strchr(nucleic_acid_order, c1); |
---|
257 | const char *p2 = strchr(nucleic_acid_order, c2); |
---|
258 | |
---|
259 | fa_assert(c1); |
---|
260 | fa_assert(c2); |
---|
261 | |
---|
262 | if (p1 && p2) return baseCmp(p1-nucleic_acid_order, p2-nucleic_acid_order); |
---|
263 | return -1; |
---|
264 | } |
---|
265 | |
---|
266 | static int getPenalty(char c1, char c2) { |
---|
267 | // c1,c2 = A=0,C=1,... s.o. |
---|
268 | switch (baseCmp(c1, c2)) { |
---|
269 | case 1: return DEFAULT_PROBABLY_MUTATION; |
---|
270 | case 2: return DEFAULT_IMPROBABLY_MUTATION; |
---|
271 | default: break; |
---|
272 | } |
---|
273 | |
---|
274 | return 0; |
---|
275 | } |
---|
276 | |
---|
277 | static char *result[3]; // result buffers |
---|
278 | |
---|
279 | static char pam250mt[] = { |
---|
280 | 12, |
---|
281 | 0, 2, |
---|
282 | -2, 1, 3, |
---|
283 | -3, 1, 0, 6, |
---|
284 | -2, 1, 1, 1, 2, |
---|
285 | -3, 1, 0, -1, 1, 5, |
---|
286 | -4, 1, 0, -1, 0, 0, 2, |
---|
287 | -5, 0, 0, -1, 0, 1, 2, 4, |
---|
288 | -5, 0, 0, -1, 0, 0, 1, 3, 4, |
---|
289 | -5, -1, -1, 0, 0, -1, 1, 2, 2, 4, |
---|
290 | -3, -1, -1, 0, -1, -2, 2, 1, 1, 3, 6, |
---|
291 | -4, 0, -1, 0, -2, -3, 0, -1, -1, 1, 2, 6, |
---|
292 | -5, 0, 0, -1, -1, -2, 1, 0, 0, 1, 0, 3, 5, |
---|
293 | -5, -2, -1, -2, -1, -3, -2, -3, -2, -1, -2, 0, 0, 6, |
---|
294 | -2, -1, 0, -2, -1, -3, -2, -2, -2, -2, -2, -2, -2, 2, 5, |
---|
295 | -6, -3, -2, -3, -2, -4, -3, -4, -3, -2, -2, -3, -3, 4, 2, 6, |
---|
296 | -2, -1, 0, -1, 0, -1, -2, -2, -2, -2, -2, -2, -2, 2, 4, 2, 4, |
---|
297 | -4, -3, -3, -5, -4, -5, -4, -6, -5, -5, -2, -4, -5, 0, 1, 2, -1, 9, |
---|
298 | 0, -3, -3, -5, -3, -5, -2, -4, -4, -4, 0, -4, -4, -2, -1, -1, -2, 7, 10, |
---|
299 | -8, -2, -5, -6, -6, -7, -4, -7, -7, -5, -3, 2, -3, -4, -5, -2, -6, 0, 0, 17 |
---|
300 | }; |
---|
301 | |
---|
302 | static char *matptr = pam250mt; |
---|
303 | |
---|
304 | #if (defined(DISPLAY_DIFF) || defined(MATRIX_DUMP)) |
---|
305 | static void p_decode(const unsigned char *naseq, unsigned char *seq, int l) { |
---|
306 | int len = strlen(amino_acid_order); |
---|
307 | |
---|
308 | for (int i=1; i<=l && naseq[i]; i++) { |
---|
309 | fa_assert(naseq[i]<len); |
---|
310 | seq[i] = amino_acid_order[naseq[i]]; |
---|
311 | } |
---|
312 | } |
---|
313 | |
---|
314 | static void n_decode(const unsigned char *naseq, unsigned char *seq, int l) { |
---|
315 | int len = strlen(nucleic_acid_order); |
---|
316 | |
---|
317 | for (int i=1; i<=l && naseq[i]; i++) { |
---|
318 | fa_assert(naseq[i]<len); |
---|
319 | seq[i] = nucleic_acid_order[naseq[i]]; |
---|
320 | } |
---|
321 | } |
---|
322 | #endif |
---|
323 | |
---|
324 | inline ARB_ERROR MAXLENtooSmall() { |
---|
325 | return "ClustalV-aligner: MAXLEN is dimensioned to small for this sequence"; |
---|
326 | } |
---|
327 | |
---|
328 | inline void *ckalloc(size_t bytes, ARB_ERROR& error) { |
---|
329 | if (error) return NULp; |
---|
330 | |
---|
331 | void *ret = malloc(bytes); |
---|
332 | |
---|
333 | if (!ret) error = "out of memory"; |
---|
334 | return ret; |
---|
335 | } |
---|
336 | |
---|
337 | static ARB_ERROR init_myers(long max_seq_length) { |
---|
338 | ARB_ERROR error; |
---|
339 | |
---|
340 | naa1 = (int **)ckalloc(MAX_BASETYPES * sizeof (int *), error); |
---|
341 | naa2 = (int **)ckalloc(MAX_BASETYPES * sizeof (int *), error); |
---|
342 | naas = (int **)ckalloc(MAX_BASETYPES * sizeof (int *), error); |
---|
343 | |
---|
344 | for (int i=0; i<MAX_BASETYPES && !error; i++) { |
---|
345 | naa1[i] = (int *)ckalloc((max_seq_length+1)*sizeof(int), error); |
---|
346 | naa2[i] = (int *)ckalloc((max_seq_length+1)*sizeof(int), error); |
---|
347 | naas[i] = (int *)ckalloc((max_seq_length+1)*sizeof(int), error); |
---|
348 | } |
---|
349 | return error; |
---|
350 | } |
---|
351 | |
---|
352 | static void make_pamo(int nv) { |
---|
353 | little_pam=big_pam=matptr[0]; |
---|
354 | for (int i=0; i<210; ++i) { // LOOP_VECTORIZED |
---|
355 | int c=matptr[i]; |
---|
356 | little_pam=(little_pam<c) ? little_pam : c; |
---|
357 | big_pam=(big_pam>c) ? big_pam : c; |
---|
358 | } |
---|
359 | for (int i=0; i<210; ++i) { // LOOP_VECTORIZED |
---|
360 | pamo[i] = matptr[i]-little_pam; |
---|
361 | } |
---|
362 | nv -= little_pam; |
---|
363 | big_pam -= little_pam; |
---|
364 | xover = big_pam - nv; |
---|
365 | /* |
---|
366 | fprintf(stdout,"\n\nxover= %d, big_pam = %d, little_pam=%d, nv = %d\n\n" |
---|
367 | ,xover,big_pam,little_pam,nv); |
---|
368 | */ |
---|
369 | } |
---|
370 | |
---|
371 | static void fill_pam() { |
---|
372 | int pos = 0; |
---|
373 | |
---|
374 | for (int i=0; i<20; ++i) { |
---|
375 | for (int j=0; j<=i; ++j) { |
---|
376 | pam[i][j]=pamo[pos++]; |
---|
377 | } |
---|
378 | } |
---|
379 | |
---|
380 | for (int i=0; i<20; ++i) { |
---|
381 | for (int j=0; j<=i; ++j) { // LOOP_VECTORIZED[!<8.1] |
---|
382 | pam[j][i]=pam[i][j]; |
---|
383 | } |
---|
384 | } |
---|
385 | |
---|
386 | if (dnaflag) { |
---|
387 | xover=4; |
---|
388 | big_pam=8; |
---|
389 | for (int i=0; i<=NUCLEIDS; ++i) { |
---|
390 | for (int j=0; j<=NUCLEIDS; ++j) { |
---|
391 | weights[i][j] = getPenalty(i, j); |
---|
392 | } |
---|
393 | } |
---|
394 | } |
---|
395 | else { |
---|
396 | for (int i=1; i<MAX_BASETYPES; ++i) { |
---|
397 | for (int j=1; j<MAX_BASETYPES; ++j) { // LOOP_VECTORIZED |
---|
398 | weights[i][j] = big_pam - pam[i-1][j-1]; |
---|
399 | } |
---|
400 | } |
---|
401 | for (int i=0; i<MAX_BASETYPES; ++i) { |
---|
402 | weights[0][i] = xover; |
---|
403 | weights[i][0] = xover; |
---|
404 | } |
---|
405 | } |
---|
406 | } |
---|
407 | |
---|
408 | static void exit_myers() { |
---|
409 | for (int i=0; i<MAX_BASETYPES; i++) { |
---|
410 | free(naas[i]); |
---|
411 | free(naa2[i]); |
---|
412 | free(naa1[i]); |
---|
413 | } |
---|
414 | free(naas); |
---|
415 | free(naa2); |
---|
416 | free(naa1); |
---|
417 | } |
---|
418 | |
---|
419 | static ARB_ERROR init_show_pair(long max_seq_length) { |
---|
420 | ARB_ERROR error; |
---|
421 | |
---|
422 | #if defined(ASSERTION_USED) |
---|
423 | displ_size = (2*max_seq_length + 1); |
---|
424 | #endif // ASSERTION_USED |
---|
425 | |
---|
426 | displ = (int *) ckalloc((2*max_seq_length + 1) * sizeof (int), error); |
---|
427 | last_print = 0; |
---|
428 | |
---|
429 | zza = (int *)ckalloc((max_seq_length+1) * sizeof (int), error); |
---|
430 | zzb = (int *)ckalloc((max_seq_length+1) * sizeof (int), error); |
---|
431 | |
---|
432 | zzc = (int *)ckalloc((max_seq_length+1) * sizeof (int), error); |
---|
433 | zzd = (int *)ckalloc((max_seq_length+1) * sizeof (int), error); |
---|
434 | |
---|
435 | return error; |
---|
436 | } |
---|
437 | |
---|
438 | static void exit_show_pair() { |
---|
439 | freenull(zzd); |
---|
440 | freenull(zzc); |
---|
441 | freenull(zzb); |
---|
442 | freenull(zza); |
---|
443 | freenull(displ); |
---|
444 | #if defined(ASSERTION_USED) |
---|
445 | displ_size = 0; |
---|
446 | #endif // ASSERTION_USED |
---|
447 | } |
---|
448 | |
---|
449 | inline int set_displ(int offset, int value) { |
---|
450 | fa_assert(offset >= 0 && offset<int(displ_size)); |
---|
451 | displ[offset] = value; |
---|
452 | return displ[offset]; |
---|
453 | } |
---|
454 | inline int decr_displ(int offset, int value) { |
---|
455 | fa_assert(offset >= 0 && offset<int(displ_size)); |
---|
456 | displ[offset] -= value; |
---|
457 | return displ[offset]; |
---|
458 | } |
---|
459 | |
---|
460 | |
---|
461 | inline void add(int v) { |
---|
462 | // insert 'v' gaps into master ??? |
---|
463 | if (last_print<0 && print_ptr>0) { |
---|
464 | set_displ(print_ptr-1, v); |
---|
465 | set_displ(print_ptr++, last_print); |
---|
466 | } |
---|
467 | else { |
---|
468 | last_print = set_displ(print_ptr++, v); |
---|
469 | } |
---|
470 | } |
---|
471 | |
---|
472 | inline int calc_weight(int iat, int jat, int v1, int v2) { |
---|
473 | fa_assert(pos1==1 && pos2==1); |
---|
474 | unsigned char j = seq_array[alist[1]][v2+jat-1]; |
---|
475 | return j<128 ? naas[j][v1+iat-1] : 0; |
---|
476 | } |
---|
477 | |
---|
478 | #ifdef MATRIX_DUMP |
---|
479 | |
---|
480 | inline const unsigned char *lstr(const unsigned char *s, int len) { |
---|
481 | static unsigned char *lstr_ss = 0; |
---|
482 | |
---|
483 | freeset(lstr_ss, (unsigned char*)strndup((const char*)s, len)); |
---|
484 | return lstr_ss; |
---|
485 | } |
---|
486 | |
---|
487 | inline const unsigned char *nstr(unsigned char *cp, int length) { |
---|
488 | const unsigned char *s = lstr(cp, length); |
---|
489 | (dnaflag ? n_decode : p_decode)(s-1, const_cast<unsigned char *>(s-1), length); |
---|
490 | return s; |
---|
491 | } |
---|
492 | |
---|
493 | inline void dumpMatrix(int x0, int y0, int breite, int hoehe, int mitte_vert) { |
---|
494 | char *sl = ARB_alloc<char>(hoehe+3); |
---|
495 | char *ma = ARB_alloc<char>(breite+3); |
---|
496 | |
---|
497 | sprintf(ma, "-%s-", nstr(seq_array[1]+x0, breite)); |
---|
498 | sprintf(sl, "-%s-", nstr(seq_array[2]+y0, hoehe)); |
---|
499 | |
---|
500 | printf(" "); |
---|
501 | |
---|
502 | { |
---|
503 | int b; |
---|
504 | for (b=0; b<=mitte_vert; b++) { |
---|
505 | printf("%5c", ma[b]); |
---|
506 | } |
---|
507 | printf(" MID"); |
---|
508 | for (b++; b<=breite+1+1; b++) { |
---|
509 | printf("%5c", ma[b-1]); |
---|
510 | } |
---|
511 | } |
---|
512 | |
---|
513 | for (int h=0; h<=hoehe+1; h++) { |
---|
514 | printf("\n%c vertical: ", sl[h]); |
---|
515 | for (int b=0; b<=breite+1+1; b++) printf("%5i", vertical[b][h]); |
---|
516 | printf("\n verticalOpen: "); |
---|
517 | for (int b=0; b<=breite+1+1; b++) printf("%5i", verticalOpen[b][h]); |
---|
518 | printf("\n diagonal: "); |
---|
519 | for (int b=0; b<=breite+1+1; b++) printf("%5i", diagonal[b][h]); |
---|
520 | printf("\n horizontal: "); |
---|
521 | for (int b=0; b<=breite+1+1; b++) printf("%5i", horizontal[b][h]); |
---|
522 | printf("\n horizontalOpen:"); |
---|
523 | for (int b=0; b<=breite+1+1; b++) printf("%5i", horizontalOpen[b][h]); |
---|
524 | printf("\n"); |
---|
525 | } |
---|
526 | |
---|
527 | printf("--------------------\n"); |
---|
528 | |
---|
529 | free(ma); |
---|
530 | free(sl); |
---|
531 | } |
---|
532 | #endif |
---|
533 | |
---|
534 | static int diff(int v1, int v2, int v3, int v4, int st, int en) { |
---|
535 | /* v1,v3 master sequence (v1=offset, v3=length) |
---|
536 | * v2,v4 slave sequence (v2=offset, v4=length) |
---|
537 | * st,en gap_open-penalties for start and end of the sequence |
---|
538 | * |
---|
539 | * returns costs for inserted gaps |
---|
540 | */ |
---|
541 | #ifdef DEBUG |
---|
542 | # ifdef MATRIX_DUMP |
---|
543 | int display_matrix = 0; |
---|
544 | |
---|
545 | fa_assert(v3<=(DISPLAY_MATRIX_SIZE+2)); // width |
---|
546 | fa_assert(v4<=(DISPLAY_MATRIX_SIZE)); // height |
---|
547 | |
---|
548 | # define DISPLAY_MATRIX_ELEMENTS ((DISPLAY_MATRIX_SIZE+2)*(DISPLAY_MATRIX_SIZE+2)) |
---|
549 | |
---|
550 | memset(vertical, -1, DISPLAY_MATRIX_ELEMENTS*sizeof(int)); |
---|
551 | memset(verticalOpen, -1, DISPLAY_MATRIX_ELEMENTS*sizeof(int)); |
---|
552 | memset(diagonal, -1, DISPLAY_MATRIX_ELEMENTS*sizeof(int)); |
---|
553 | memset(horizontal, -1, DISPLAY_MATRIX_ELEMENTS*sizeof(int)); |
---|
554 | memset(horizontalOpen, -1, DISPLAY_MATRIX_ELEMENTS*sizeof(int)); |
---|
555 | |
---|
556 | # endif |
---|
557 | #endif |
---|
558 | static int deep; |
---|
559 | deep++; |
---|
560 | |
---|
561 | #if (defined (DEBUG) && 0) |
---|
562 | { |
---|
563 | char *d = lstr(seq_array[1]+v1, v3); |
---|
564 | |
---|
565 | (dnaflag ? n_decode : p_decode)(d-1, d-1, v3); |
---|
566 | |
---|
567 | for (int cnt=0; cnt<deep; cnt++) putchar(' '); |
---|
568 | printf("master = '%s' (penalties left=%i right=%i)\n", d, st, en); |
---|
569 | |
---|
570 | d = lstr(seq_array[2]+v2, v4); |
---|
571 | (dnaflag ? n_decode : p_decode)(d-1, d-1, v4); |
---|
572 | |
---|
573 | for (cnt=0; cnt<deep; cnt++) putchar(' '); |
---|
574 | printf("slave = '%s'\n", d); |
---|
575 | } |
---|
576 | #endif |
---|
577 | |
---|
578 | if (v4<=0) { // if slave sequence is empty |
---|
579 | if (v3>0) { |
---|
580 | if (last_print<0 && print_ptr>0) { |
---|
581 | last_print = decr_displ(print_ptr-1, v3); // add .. |
---|
582 | } |
---|
583 | else { |
---|
584 | last_print = set_displ(print_ptr++, -(v3)); // .. or insert gap of length 'v3' into slave |
---|
585 | } |
---|
586 | } |
---|
587 | |
---|
588 | deep--; |
---|
589 | return master_gapAt(v1, v3); |
---|
590 | } |
---|
591 | |
---|
592 | int ctrj = 0; |
---|
593 | if (v3<=1) { |
---|
594 | if (v3<=0) { // if master sequence is empty |
---|
595 | add(v4); // ??? insert gap length 'v4' into master ??? |
---|
596 | deep--; |
---|
597 | return slave_gapAt(v2, v4); |
---|
598 | } |
---|
599 | |
---|
600 | fa_assert(v3==1); |
---|
601 | // if master length == 1 |
---|
602 | |
---|
603 | if (v4==1) { |
---|
604 | if (st>en) st=en; |
---|
605 | |
---|
606 | //!*************if(!v4)*********BUG******************************* |
---|
607 | |
---|
608 | int ctrc = slave_gapAtWithOpenPenalty(v2, v4, st); |
---|
609 | ctrj = 0; |
---|
610 | |
---|
611 | for (int j=1; j<=v4; ++j) { |
---|
612 | int k = slave_gapAt(v2, j-1) + calc_weight(1, j, v1, v2) + slave_gapAt(v2+j, v4-j); |
---|
613 | if (k<ctrc) { ctrc = k; ctrj = j; } |
---|
614 | } |
---|
615 | |
---|
616 | if (!ctrj) { |
---|
617 | add(v4); |
---|
618 | if (last_print<0 && print_ptr>0) { |
---|
619 | last_print = decr_displ(print_ptr-1, 1); |
---|
620 | } |
---|
621 | else { |
---|
622 | last_print = set_displ(print_ptr++, -1); |
---|
623 | } |
---|
624 | } |
---|
625 | else { |
---|
626 | if (ctrj>1) add(ctrj-1); |
---|
627 | set_displ(print_ptr++, last_print = 0); |
---|
628 | if (ctrj<v4) add(v4-ctrj); |
---|
629 | } |
---|
630 | |
---|
631 | deep--; |
---|
632 | return ctrc; |
---|
633 | } |
---|
634 | } |
---|
635 | |
---|
636 | fa_assert(v3>=1 && v4>=1); |
---|
637 | |
---|
638 | // slave length >= 1 |
---|
639 | // master length >= 1 |
---|
640 | |
---|
641 | # define AF 0 |
---|
642 | |
---|
643 | // first column of matrix (slave side): |
---|
644 | IF_MATRIX_DUMP(vertical[0][0]=) |
---|
645 | zza[0] = 0; |
---|
646 | int p = master_gap_open(v1); |
---|
647 | for (int j=1; j<=v4; j++) { |
---|
648 | p += master_gap_extend(v1); |
---|
649 | IF_MATRIX_DUMP(vertical[0][j]=) |
---|
650 | zza[j] = p; |
---|
651 | IF_MATRIX_DUMP(verticalOpen[0][j]=) |
---|
652 | zzb[j] = p + master_gap_open(v1); |
---|
653 | } |
---|
654 | |
---|
655 | // left half of the matrix |
---|
656 | p = st; |
---|
657 | int ctri = v3 / 2; |
---|
658 | for (int i=1; i<=ctri; i++) { |
---|
659 | int n = zza[0]; |
---|
660 | p += master_gap_extend(v1+i+AF); |
---|
661 | int k = p; |
---|
662 | IF_MATRIX_DUMP(vertical[i][0]=) |
---|
663 | zza[0] = k; |
---|
664 | int l = p + master_gap_open(v1+i+AF); |
---|
665 | |
---|
666 | for (int j=1; j<=v4; j++) { |
---|
667 | // from above (gap in master (behind position i)) |
---|
668 | IF_MATRIX_DUMP(verticalOpen[i][j]=) k += master_gap_open(v1+i+AF)+master_gap_extend(v1+i+AF); // (1) |
---|
669 | IF_MATRIX_DUMP(vertical[i][j]=) l += master_gap_extend(v1+i+AF); // (2) |
---|
670 | if (k<l) l = k; // l=min((1),(2)) |
---|
671 | |
---|
672 | // from left (gap in slave (behind position j)) |
---|
673 | IF_MATRIX_DUMP(horizontalOpen[i][j]=) k = zza[j] + slave_gap_open(v2+j+AF)+slave_gap_extend(v2+j+AF); // (3) |
---|
674 | int m; |
---|
675 | IF_MATRIX_DUMP(horizontal[i][j]=) m = zzb[j] + slave_gap_extend(v2+j+AF); // (4) |
---|
676 | if (k<m) m = k; // m=min((3),(4)) |
---|
677 | |
---|
678 | // diagonal (no gaps) |
---|
679 | IF_MATRIX_DUMP(diagonal[i][j]=) k = n + calc_weight(i, j, v1, v2); // (5) |
---|
680 | if (l<k) k = l; |
---|
681 | if (m<k) k = m; // k = minimum of all paths |
---|
682 | |
---|
683 | n = zza[j]; // minimum of same row; one column to the left |
---|
684 | zza[j] = k; // minimum of all paths to this matrix position |
---|
685 | zzb[j] = m; // minimum of those two paths, where gap was inserted into slave |
---|
686 | } |
---|
687 | } |
---|
688 | |
---|
689 | # define MHO 1 // X-Offset for second half of matrix-arrays (only used to insert MID-column into dumpMatrix()) |
---|
690 | # define BO 1 // Offset for second half of matrix (cause now the indices i and j are smaller as above) |
---|
691 | |
---|
692 | // last column of matrix: |
---|
693 | |
---|
694 | zzb[0]=zza[0]; |
---|
695 | IF_MATRIX_DUMP(vertical[v3+1+MHO][v4+1]=) |
---|
696 | zzc[v4]=0; |
---|
697 | p = master_gap_open(v1+v3); |
---|
698 | for (int j=v4-1; j>-1; j--) { |
---|
699 | p += master_gap_extend(v1+v3); |
---|
700 | IF_MATRIX_DUMP(vertical[v3+1+MHO][j+BO]=) |
---|
701 | zzc[j] = p; |
---|
702 | IF_MATRIX_DUMP(verticalOpen[v3+1+MHO][j+BO]=) |
---|
703 | zzd[j] = p+master_gap_open(v1+v3); |
---|
704 | } |
---|
705 | |
---|
706 | // right half of matrix (backwards): |
---|
707 | p = en; |
---|
708 | for (int i=v3-1; i>=ctri; i--) { |
---|
709 | int n = zzc[v4]; |
---|
710 | p += master_gap_extend(v1+i); |
---|
711 | int k = p; |
---|
712 | IF_MATRIX_DUMP(vertical[i+BO+MHO][v4+1]=) |
---|
713 | zzc[v4] = k; |
---|
714 | int l = p+master_gap_open(v1+i); |
---|
715 | |
---|
716 | for (int j=v4-1; j>=0; j--) { |
---|
717 | // from below (gap in master (in front of position (i+BO))) |
---|
718 | IF_MATRIX_DUMP(verticalOpen[i+BO+MHO][j+BO]=) k += master_gap_open(v1+i)+master_gap_extend(v1+i); // (1) |
---|
719 | IF_MATRIX_DUMP(vertical[i+BO+MHO][j+BO]=) l += master_gap_extend(v1+i); // (2) |
---|
720 | if (k<l) l = k; // l=min((1),(2)) |
---|
721 | |
---|
722 | // from right (gap in slave (in front of position (j+BO))) |
---|
723 | IF_MATRIX_DUMP(horizontalOpen[i+BO+MHO][j+BO]=) k = zzc[j] + slave_gap_open(v2+j) + slave_gap_extend(v2+j); // (3) |
---|
724 | int m; |
---|
725 | IF_MATRIX_DUMP(horizontal[i+BO+MHO][j+BO]=) m = zzd[j] + slave_gap_extend(v2+j); // (4) |
---|
726 | if (k<m) m = k; // m=min((3),(4)) |
---|
727 | |
---|
728 | // diagonal (no gaps) |
---|
729 | IF_MATRIX_DUMP(diagonal[i+BO+MHO][j+BO]=) k = n + calc_weight(i+BO, j+BO, v1, v2); // (5) |
---|
730 | if (l<k) k = l; |
---|
731 | if (m<k) k = m; // k = minimum of all paths |
---|
732 | |
---|
733 | n = zzc[j]; // minimum of same row; one column to the right |
---|
734 | zzc[j] = k; // minimum of all paths to this matrix position |
---|
735 | zzd[j] = m; // minimum of those two paths, where gap was inserted into slave |
---|
736 | } |
---|
737 | } |
---|
738 | |
---|
739 | #undef BO |
---|
740 | |
---|
741 | // connect rightmost column of first half (column ctri) |
---|
742 | // with leftmost column of second half (column ctri+1) |
---|
743 | |
---|
744 | zzd[v4] = zzc[v4]; |
---|
745 | |
---|
746 | int ctrc = INT_MAX; |
---|
747 | int flag = 0; |
---|
748 | |
---|
749 | for (int j=(ctri ? 0 : 1); j<=v4; j++) { |
---|
750 | int k; |
---|
751 | IF_MATRIX_DUMP(vertical[ctri+MHO][j]=) |
---|
752 | k = zza[j] + zzc[j]; // sum up both calculations (=diagonal=no gaps) |
---|
753 | |
---|
754 | if (k<ctrc || ((k==ctrc) && (zza[j]!=zzb[j]) && (zzc[j]==zzd[j]))) { |
---|
755 | ctrc = k; |
---|
756 | ctrj = j; |
---|
757 | flag = 1; |
---|
758 | } |
---|
759 | } |
---|
760 | |
---|
761 | for (int j=v4; j>=(ctri ? 0 : 1); j--) { |
---|
762 | int k; |
---|
763 | IF_MATRIX_DUMP(verticalOpen[ctri+MHO][j]=) |
---|
764 | k = zzb[j] + zzd[j] // paths where gaps were inserted into slave (left and right side!) |
---|
765 | - slave_gap_open(j); // subtract gap_open-penalty which was added twice (at left and right end of gap) |
---|
766 | |
---|
767 | if (k<ctrc) { |
---|
768 | ctrc = k; |
---|
769 | ctrj = j; |
---|
770 | flag = 2; |
---|
771 | } |
---|
772 | } |
---|
773 | |
---|
774 | fa_assert(flag>=1 && flag<=2); |
---|
775 | |
---|
776 | #undef MHO |
---|
777 | |
---|
778 | #ifdef MATRIX_DUMP |
---|
779 | if (display_matrix) { |
---|
780 | dumpMatrix(v1, v2, v3, v4, ctri); |
---|
781 | } |
---|
782 | #endif |
---|
783 | |
---|
784 | // Conquer recursively around midpoint |
---|
785 | |
---|
786 | if (flag==1) { // Type 1 gaps |
---|
787 | diff(v1, v2, ctri, ctrj, st, master_gap_open(v1+ctri)); // includes midpoint ctri and ctrj |
---|
788 | diff(v1+ctri, v2+ctrj, v3-ctri, v4-ctrj, master_gap_open(v1+ctri), en); |
---|
789 | } |
---|
790 | else { |
---|
791 | diff(v1, v2, ctri-1, ctrj, st, 0); // includes midpoint ctrj |
---|
792 | |
---|
793 | if (last_print<0 && print_ptr>0) { // Delete 2 |
---|
794 | last_print = decr_displ(print_ptr-1, 2); |
---|
795 | } |
---|
796 | else { |
---|
797 | last_print = set_displ(print_ptr++, -2); |
---|
798 | } |
---|
799 | |
---|
800 | diff(v1+ctri+1, v2+ctrj, v3-ctri-1, v4-ctrj, 0, en); |
---|
801 | } |
---|
802 | |
---|
803 | deep--; |
---|
804 | return ctrc; // Return the score of the best alignment |
---|
805 | } |
---|
806 | |
---|
807 | static void do_align(int& score, long act_seq_length) { |
---|
808 | pos1 = pos2 = 0; |
---|
809 | |
---|
810 | // clear statistics |
---|
811 | |
---|
812 | for (int i=1; i<=act_seq_length; ++i) { |
---|
813 | for (int j=0; j<MAX_BASETYPES; ++j) { |
---|
814 | naa1[j][i] = naa2[j][i]=0; |
---|
815 | naas[j][i]=0; |
---|
816 | } |
---|
817 | } |
---|
818 | |
---|
819 | // create position statistics for each group |
---|
820 | // [here every group contains only one seq] |
---|
821 | |
---|
822 | int l1 = 0; |
---|
823 | int l2 = 0; |
---|
824 | for (int i=1; i<=nseqs; ++i) { |
---|
825 | if (group[i]==1) { |
---|
826 | fst_list[++pos1]=i; |
---|
827 | for (int j=1; j<=seqlen_array[i]; ++j) { |
---|
828 | unsigned char b = seq_array[i][j]; |
---|
829 | if (b<128) { |
---|
830 | ++naa1[b][j]; |
---|
831 | ++naa1[0][j]; |
---|
832 | } |
---|
833 | } |
---|
834 | if (seqlen_array[i]>l1) l1=seqlen_array[i]; |
---|
835 | } |
---|
836 | else if (group[i]==2) { |
---|
837 | snd_list[++pos2]=i; |
---|
838 | for (int j=1; j<=seqlen_array[i]; ++j) { |
---|
839 | unsigned char b = seq_array[i][j]; |
---|
840 | if (b<128) { |
---|
841 | ++naa2[b][j]; |
---|
842 | ++naa2[0][j]; |
---|
843 | } |
---|
844 | } |
---|
845 | if (seqlen_array[i]>l2) l2=seqlen_array[i]; |
---|
846 | } |
---|
847 | } |
---|
848 | |
---|
849 | if (pos1>=pos2) { |
---|
850 | int t_arr[MAX_BASETYPES]; |
---|
851 | |
---|
852 | for (int i=1; i<=pos2; ++i) alist[i]=snd_list[i]; |
---|
853 | for (int n=1; n<=l1; ++n) { |
---|
854 | for (int i=1; i<MAX_BASETYPES; ++i) t_arr[i]=0; |
---|
855 | for (int i=1; i<MAX_BASETYPES; ++i) { |
---|
856 | if (naa1[i][n]>0) { |
---|
857 | for (int j=1; j<MAX_BASETYPES; ++j) { // LOOP_VECTORIZED |
---|
858 | t_arr[j] += (weights[i][j]*naa1[i][n]); |
---|
859 | } |
---|
860 | } |
---|
861 | } |
---|
862 | int k = naa1[0][n]; |
---|
863 | if (k>0) { |
---|
864 | for (int i=1; i<MAX_BASETYPES; ++i) { |
---|
865 | naas[i][n]=t_arr[i]/k; |
---|
866 | } |
---|
867 | } |
---|
868 | } |
---|
869 | } |
---|
870 | else { |
---|
871 | fa_assert(0); // should never occur if MAXN==2 |
---|
872 | } |
---|
873 | |
---|
874 | score = diff(1, 1, l1, l2, master_gap_open(1), master_gap_open(l1+1)); // Myers and Miller alignment now |
---|
875 | } |
---|
876 | |
---|
877 | static int add_ggaps(long /* max_seq_length */) { |
---|
878 | int pos = 1; |
---|
879 | int to_do = print_ptr; |
---|
880 | |
---|
881 | for (int i=0; i<to_do; ++i) { // was: 1 .. <=to_do |
---|
882 | if (displ[i]==0) { |
---|
883 | result[1][pos]=result[2][pos]='*'; |
---|
884 | ++pos; |
---|
885 | } |
---|
886 | else { |
---|
887 | int k = displ[i]; |
---|
888 | if (k>0) { |
---|
889 | for (int j=0; j<=k-1; ++j) { // LOOP_VECTORIZED =0[>=9<11] =1 // completely fails with 9.x and 10.x; 1x with 4.9.2 + 5.4 + 5.5 + 6.5 + 7.2 + 7.5 + 8.1 + 8.5 |
---|
890 | result[2][pos+j]='*'; |
---|
891 | result[1][pos+j]='-'; |
---|
892 | } |
---|
893 | pos += k; |
---|
894 | } |
---|
895 | else { |
---|
896 | k = (displ[i]<0) ? displ[i] * -1 : displ[i]; |
---|
897 | for (int j=0; j<=k-1; ++j) { // LOOP_VECTORIZED =0[>=9<11] =1 // completely fails with 9.x and 10.x; 1x with 4.9.2 + 5.4 + 5.5 + 6.5 + 7.2 + 7.5 + 8.1 + 8.5 |
---|
898 | result[1][pos+j]='*'; |
---|
899 | result[2][pos+j]='-'; |
---|
900 | } |
---|
901 | pos += k; |
---|
902 | } |
---|
903 | } |
---|
904 | } |
---|
905 | |
---|
906 | #ifdef DEBUG |
---|
907 | result[1][pos] = 0; |
---|
908 | result[2][pos] = 0; |
---|
909 | #endif |
---|
910 | |
---|
911 | return pos-1; |
---|
912 | } |
---|
913 | |
---|
914 | |
---|
915 | static int res_index(const char *t, char c) { |
---|
916 | if (t[0]==c) return UNKNOWN_ACID; |
---|
917 | for (int i=1; t[i]; i++) { |
---|
918 | if (t[i]==c) return i; |
---|
919 | } |
---|
920 | return 0; |
---|
921 | } |
---|
922 | |
---|
923 | static void p_encode(const unsigned char *seq, unsigned char *naseq, int l) { |
---|
924 | // code seq as ints .. use -2 for gap |
---|
925 | bool warned = false; |
---|
926 | |
---|
927 | for (int i=1; i<=l; i++) { |
---|
928 | int c = res_index(amino_acid_order, seq[i]); |
---|
929 | |
---|
930 | if (!c) { |
---|
931 | if (seq[i] == '*') { |
---|
932 | c = -2; |
---|
933 | } |
---|
934 | else { |
---|
935 | if (!warned && seq[i] != '?') { // consensus contains ? and it means X |
---|
936 | char buf[100]; |
---|
937 | sprintf(buf, "Illegal character '%c' in sequence data", seq[i]); |
---|
938 | aw_message(buf); |
---|
939 | warned = true; |
---|
940 | } |
---|
941 | c = res_index(amino_acid_order, 'X'); |
---|
942 | } |
---|
943 | } |
---|
944 | |
---|
945 | fa_assert(c>0 || c == -2); |
---|
946 | naseq[i] = c; |
---|
947 | } |
---|
948 | } |
---|
949 | |
---|
950 | static void n_encode(const unsigned char *seq, unsigned char *naseq, int l) { |
---|
951 | // code seq as ints .. use -2 for gap |
---|
952 | bool warned = false; |
---|
953 | for (int i=1; i<=l; i++) { |
---|
954 | int c = res_index(nucleic_acid_order, seq[i]); |
---|
955 | |
---|
956 | if (!c) { |
---|
957 | if (!warned && seq[i] != '?') { |
---|
958 | char buf[100]; |
---|
959 | sprintf(buf, "Illegal character '%c' in sequence data", seq[i]); |
---|
960 | aw_message(buf); |
---|
961 | warned = true; |
---|
962 | } |
---|
963 | c = res_index(nucleic_acid_order, 'N'); |
---|
964 | } |
---|
965 | |
---|
966 | fa_assert(c>0); |
---|
967 | naseq[i] = c; |
---|
968 | } |
---|
969 | } |
---|
970 | |
---|
971 | ARB_ERROR ClustalV_align(int is_dna, |
---|
972 | int weighted, |
---|
973 | const char *seq1, |
---|
974 | int length1, |
---|
975 | const char *seq2, |
---|
976 | int length2, |
---|
977 | const int *gapsBefore1, // size of array = length1+1 |
---|
978 | int max_seq_length, |
---|
979 | // result params: |
---|
980 | const char*& res1, |
---|
981 | const char*& res2, |
---|
982 | int& reslen, |
---|
983 | int& score) |
---|
984 | { |
---|
985 | ARB_ERROR error; |
---|
986 | gaps_before_position = gapsBefore1; |
---|
987 | |
---|
988 | if (!module_initialized) { // initialize only once |
---|
989 | dnaflag = is_dna; |
---|
990 | is_weight = weighted; |
---|
991 | |
---|
992 | error = init_myers(max_seq_length); |
---|
993 | if (!error) error = init_show_pair(max_seq_length); |
---|
994 | make_pamo(0); |
---|
995 | fill_pam(); |
---|
996 | |
---|
997 | for (int i=1; i<=2 && !error; i++) { |
---|
998 | seq_array[i] = (unsigned char*)ckalloc((max_seq_length+2)*sizeof(char), error); |
---|
999 | result[i] = (char*)ckalloc((max_seq_length+2)*sizeof(char), error); |
---|
1000 | group[i] = i; |
---|
1001 | } |
---|
1002 | |
---|
1003 | if (!error) module_initialized = true; |
---|
1004 | } |
---|
1005 | else { |
---|
1006 | if (dnaflag!=is_dna || is_weight!=weighted) { |
---|
1007 | error = "Please call ClustalV_exit() between calls that differ in\n" |
---|
1008 | "one of the following parameters:\n" |
---|
1009 | " is_dna, weighted"; |
---|
1010 | } |
---|
1011 | } |
---|
1012 | |
---|
1013 | if (!error) { |
---|
1014 | nseqs = 2; |
---|
1015 | print_ptr = 0; |
---|
1016 | |
---|
1017 | #if defined(DEBUG) && 1 |
---|
1018 | memset(&seq_array[1][1], 0, max_seq_length*sizeof(seq_array[1][1])); |
---|
1019 | memset(&seq_array[2][1], 0, max_seq_length*sizeof(seq_array[2][1])); |
---|
1020 | memset(&displ[1], 0xff, max_seq_length*sizeof(displ[1])); |
---|
1021 | seq_array[1][0] = '_'; |
---|
1022 | seq_array[2][0] = '_'; |
---|
1023 | #endif |
---|
1024 | |
---|
1025 | { |
---|
1026 | typedef void (*encoder)(const unsigned char*, unsigned char*, int); |
---|
1027 | encoder encode = dnaflag ? n_encode : p_encode; |
---|
1028 | |
---|
1029 | encode((const unsigned char*)(seq1-1), seq_array[1], length1); |
---|
1030 | seqlen_array[1] = length1; |
---|
1031 | encode((const unsigned char*)(seq2-1), seq_array[2], length2); |
---|
1032 | seqlen_array[2] = length2; |
---|
1033 | |
---|
1034 | do_align(score, max(length1, length2)); |
---|
1035 | int alignedLength = add_ggaps(max_seq_length); |
---|
1036 | |
---|
1037 | result[1][alignedLength+1] = 0; |
---|
1038 | result[2][alignedLength+1] = 0; |
---|
1039 | |
---|
1040 | res1 = result[1]+1; |
---|
1041 | res2 = result[2]+1; |
---|
1042 | reslen = alignedLength; |
---|
1043 | } |
---|
1044 | } |
---|
1045 | |
---|
1046 | return error; |
---|
1047 | } |
---|
1048 | |
---|
1049 | void ClustalV_exit() { |
---|
1050 | if (module_initialized) { |
---|
1051 | module_initialized = false; |
---|
1052 | |
---|
1053 | for (int i=1; i<=2; i++) { |
---|
1054 | free(result[i]); |
---|
1055 | free(seq_array[i]); |
---|
1056 | } |
---|
1057 | exit_show_pair(); |
---|
1058 | exit_myers(); |
---|
1059 | } |
---|
1060 | } |
---|
1061 | |
---|
1062 | // -------------------------------------------------------------------------------- |
---|
1063 | |
---|
1064 | #ifdef UNIT_TESTS |
---|
1065 | #ifndef TEST_UNIT_H |
---|
1066 | #include <test_unit.h> |
---|
1067 | #endif |
---|
1068 | |
---|
1069 | static arb_test::match_expectation clustal_aligns(const char *i1, const char *i2, const char *expected1, const char *expected2, int expected_score) { |
---|
1070 | size_t l1 = strlen(i1); |
---|
1071 | size_t l2 = strlen(i2); |
---|
1072 | |
---|
1073 | const char *result1 = NULp; |
---|
1074 | const char *result2 = NULp; |
---|
1075 | int result_len; // test result value? |
---|
1076 | int score; |
---|
1077 | |
---|
1078 | int gaps_size = l1+1; |
---|
1079 | int no_gaps_before1[gaps_size]; |
---|
1080 | memset(no_gaps_before1, 0, gaps_size*sizeof(*no_gaps_before1)); |
---|
1081 | |
---|
1082 | ARB_ERROR error = ClustalV_align(0, 0, |
---|
1083 | i1, l1, |
---|
1084 | i2, l2, |
---|
1085 | no_gaps_before1, |
---|
1086 | max(l1, l2), |
---|
1087 | result1, |
---|
1088 | result2, |
---|
1089 | result_len, |
---|
1090 | score); |
---|
1091 | |
---|
1092 | using namespace arb_test; |
---|
1093 | expectation_group expected; |
---|
1094 | |
---|
1095 | expected.add(doesnt_report_error(error.deliver())); |
---|
1096 | expected.add(that(result1).is_equal_to(expected1)); |
---|
1097 | expected.add(that(result2).is_equal_to(expected2)); |
---|
1098 | expected.add(that(score).is_equal_to(expected_score)); |
---|
1099 | |
---|
1100 | return all().ofgroup(expected); |
---|
1101 | } |
---|
1102 | |
---|
1103 | #define TEST_CLUSTAL_ALIGNS(i1,i2,o1,o2,score) TEST_EXPECTATION(clustal_aligns(i1,i2,o1,o2,score)) |
---|
1104 | |
---|
1105 | void TEST_clustalV() { |
---|
1106 | TEST_CLUSTAL_ALIGNS("ACGTTGCAACGT", |
---|
1107 | "ACGTTGCAACGT", |
---|
1108 | "************", |
---|
1109 | "************", |
---|
1110 | 138); |
---|
1111 | |
---|
1112 | TEST_CLUSTAL_ALIGNS("ACGTTAACGT", |
---|
1113 | "ACGTTGCAACGT", |
---|
1114 | "*****--*****", |
---|
1115 | "************", |
---|
1116 | 207); |
---|
1117 | |
---|
1118 | TEST_CLUSTAL_ALIGNS("ACGTTGCAACGT", |
---|
1119 | "ACCAACGT", |
---|
1120 | "************", |
---|
1121 | "*----*******", |
---|
1122 | 156); |
---|
1123 | |
---|
1124 | TEST_CLUSTAL_ALIGNS("ACGTTGCAACGT", |
---|
1125 | "AGCTGTCACAGT", |
---|
1126 | "************", |
---|
1127 | "************", |
---|
1128 | 187); |
---|
1129 | |
---|
1130 | TEST_CLUSTAL_ALIGNS("ACGTTGCAACGT", |
---|
1131 | "ACGTACGTACGT", |
---|
1132 | "************", |
---|
1133 | "************", |
---|
1134 | 164); |
---|
1135 | |
---|
1136 | TEST_CLUSTAL_ALIGNS("ACGTTGCAACGT", |
---|
1137 | "ACGTACGT", |
---|
1138 | "************", |
---|
1139 | "****----****", |
---|
1140 | 162); |
---|
1141 | |
---|
1142 | TEST_CLUSTAL_ALIGNS("ACGTTGCAACGT", |
---|
1143 | "AACGGTTC", |
---|
1144 | "************", |
---|
1145 | // "ACGTTGCAACGT", |
---|
1146 | // "A----ACGGTTC", |
---|
1147 | "*----*******", |
---|
1148 | 193); |
---|
1149 | } |
---|
1150 | |
---|
1151 | #endif // UNIT_TESTS |
---|
1152 | |
---|
1153 | // -------------------------------------------------------------------------------- |
---|