1 | #include <ctype.h> |
---|
2 | #include <stdlib.h> |
---|
3 | #include <memory.h> |
---|
4 | #include <malloc.h> |
---|
5 | #include <string.h> |
---|
6 | #include <stdio.h> |
---|
7 | #include <limits.h> |
---|
8 | |
---|
9 | #include <arbdb.h> |
---|
10 | #include <aw_root.hxx> |
---|
11 | |
---|
12 | #include "awtc_ClustalV.hxx" |
---|
13 | #include "awtc_seq_search.hxx" |
---|
14 | |
---|
15 | /* ---------------------------------------------------------------- */ |
---|
16 | |
---|
17 | #define MASTER_GAP_OPEN 50 |
---|
18 | #define CHEAP_GAP_OPEN 20 // penalty for creating a gap (if we have gaps in master) |
---|
19 | |
---|
20 | #define DEFAULT_GAP_OPEN 30 // penalty for creating a gap |
---|
21 | |
---|
22 | #define MASTER_GAP_EXTEND 18 |
---|
23 | #define CHEAP_GAP_EXTEND 5 // penalty for extending a gap (if we have gaps in master) |
---|
24 | |
---|
25 | #define DEFAULT_GAP_EXTEND 10 // penalty for extending a gap |
---|
26 | |
---|
27 | #define DEFAULT_IMPROBABLY_MUTATION 10 // penalty for mutations |
---|
28 | #define DEFAULT_PROBABLY_MUTATION 4 // penalty for special mutations (A<->G,C<->U/T) (only if 'weighted') |
---|
29 | |
---|
30 | #define DYNAMIC_PENALTIES // otherwise you get FIXED PENALTIES (=no cheap penalties) |
---|
31 | |
---|
32 | /* ---------------------------------------------------------------- */ |
---|
33 | |
---|
34 | #define MAX_GAP_OPEN_DISCOUNT (DEFAULT_GAP_OPEN-CHEAP_GAP_OPEN) // maximum subtracted from DEFAULT_GAP_OPEN |
---|
35 | #define MAX_GAP_EXTEND_DISCOUNT (DEFAULT_GAP_EXTEND-CHEAP_GAP_EXTEND) // maximum subtracted from DEFAULT_GAP_EXTEND |
---|
36 | |
---|
37 | #define MAXN 2 /* Maximum number of sequences (both groups) */ |
---|
38 | |
---|
39 | #if (MAXN==2) |
---|
40 | #define MAXN_2(xxx) xxx |
---|
41 | #define MAXN_2_assert(xxx) awtc_assert(xxx) |
---|
42 | #else |
---|
43 | #define MAXN_2(xxx) |
---|
44 | #define MAXN_2_assert(xxx) |
---|
45 | #endif |
---|
46 | |
---|
47 | #define TRUE 1 |
---|
48 | #define FALSE 0 |
---|
49 | |
---|
50 | static GB_ERROR error; |
---|
51 | static int module_initialized = 0; |
---|
52 | |
---|
53 | typedef int Boolean; |
---|
54 | |
---|
55 | static Boolean dnaflag; |
---|
56 | static Boolean is_weight; |
---|
57 | |
---|
58 | static int xover; |
---|
59 | static int little_pam; |
---|
60 | static int big_pam; |
---|
61 | static int pamo[210]; |
---|
62 | static int pam[21][21]; |
---|
63 | |
---|
64 | static int pos1; |
---|
65 | static int pos2; |
---|
66 | static int **naa1; // naa1[basetype][position] counts bases for each position of all sequences in group1 |
---|
67 | static int **naa2; // naa2[basetype][position] same for group2 |
---|
68 | static int **naas; // |
---|
69 | static int seqlen_array[MAXN+1];// length of all sequences |
---|
70 | static char *seq_array[MAXN+1]; // the sequences |
---|
71 | static int group[MAXN+1]; // group of sequence |
---|
72 | static int alist[MAXN+1]; // indices of sequences to be aligned |
---|
73 | static int fst_list[MAXN+1]; |
---|
74 | static int snd_list[MAXN+1]; |
---|
75 | static int nseqs; // # of sequences |
---|
76 | static int weights[21][21]; // weights[b1][b2] : penalty for mutation from base 'b1' to base 'b2' |
---|
77 | static int *displ; // displ==0 -> base in both , displ<0 -> displ gaps in slave, displ>0 -> displ gaps in master |
---|
78 | static int *zza, // column (left->right) of align matrix (minimum of all paths to this matrix element) |
---|
79 | *zzb, // -------------- " ------------------- (minimum of all paths, where gap inserted into slave) |
---|
80 | *zzc, // column (left<-right) of align matrix (minimum of all paths to this matrix element) |
---|
81 | *zzd; // -------------- " ------------------- (minimum of all paths, where gap inserted into slave) |
---|
82 | /*static int gap_open; |
---|
83 | static int gap_extend;*/ |
---|
84 | static int print_ptr; |
---|
85 | static int last_print; |
---|
86 | static const int *gapsBeforePosition; |
---|
87 | |
---|
88 | //#define MATRIX_DUMP |
---|
89 | |
---|
90 | #ifdef MATRIX_DUMP |
---|
91 | # define IF_MATRIX_DUMP(xxx) xxx |
---|
92 | # define DISPLAY_MATRIX_SIZE 500 |
---|
93 | static int vertical [DISPLAY_MATRIX_SIZE+2][DISPLAY_MATRIX_SIZE+2]; |
---|
94 | static int verticalOpen [DISPLAY_MATRIX_SIZE+2][DISPLAY_MATRIX_SIZE+2]; |
---|
95 | static int diagonal [DISPLAY_MATRIX_SIZE+2][DISPLAY_MATRIX_SIZE+2]; |
---|
96 | static int horizontal [DISPLAY_MATRIX_SIZE+2][DISPLAY_MATRIX_SIZE+2]; |
---|
97 | static int horizontalOpen [DISPLAY_MATRIX_SIZE+2][DISPLAY_MATRIX_SIZE+2]; |
---|
98 | #else |
---|
99 | # define IF_MATRIX_DUMP(xxx) |
---|
100 | #endif |
---|
101 | |
---|
102 | static inline int master_gap_open(int beforePosition) |
---|
103 | { |
---|
104 | #ifdef DYNAMIC_PENALTIES |
---|
105 | long gaps = gapsBeforePosition[beforePosition-1]; |
---|
106 | return (gaps)? MASTER_GAP_OPEN - MAX_GAP_OPEN_DISCOUNT : MASTER_GAP_OPEN; |
---|
107 | |
---|
108 | /* return |
---|
109 | gaps >= MAX_GAP_OPEN_DISCOUNT |
---|
110 | ? DEFAULT_GAP_OPEN-MAX_GAP_OPEN_DISCOUNT |
---|
111 | : DEFAULT_GAP_OPEN-gaps;*/ |
---|
112 | #else |
---|
113 | return DEFAULT_GAP_OPEN; |
---|
114 | #endif |
---|
115 | } |
---|
116 | static inline int master_gap_extend(int beforePosition) |
---|
117 | { |
---|
118 | #ifdef DYNAMIC_PENALTIES |
---|
119 | long gaps = gapsBeforePosition[beforePosition-1]; |
---|
120 | |
---|
121 | return (gaps)? MASTER_GAP_EXTEND - MAX_GAP_EXTEND_DISCOUNT : MASTER_GAP_EXTEND; |
---|
122 | /* return |
---|
123 | gaps >= MAX_GAP_EXTEND_DISCOUNT |
---|
124 | ? DEFAULT_GAP_EXTEND-MAX_GAP_EXTEND_DISCOUNT |
---|
125 | : DEFAULT_GAP_EXTEND-gaps;*/ |
---|
126 | #else |
---|
127 | return DEFAULT_GAP_EXTEND; |
---|
128 | #endif |
---|
129 | } |
---|
130 | |
---|
131 | static inline int master_gapAtWithOpenPenalty(int atPosition, int length, int penalty) |
---|
132 | { |
---|
133 | if (length<=0) |
---|
134 | return 0; |
---|
135 | |
---|
136 | int beforePosition = atPosition, |
---|
137 | afterPosition = atPosition-1; |
---|
138 | |
---|
139 | while (length--) |
---|
140 | { |
---|
141 | int p1,p2; |
---|
142 | |
---|
143 | if ((p1=master_gap_extend(beforePosition)) < (p2=master_gap_extend(afterPosition+1)) && |
---|
144 | beforePosition>1) |
---|
145 | { |
---|
146 | penalty += p1; |
---|
147 | beforePosition--; |
---|
148 | } |
---|
149 | else |
---|
150 | { |
---|
151 | penalty += p2; |
---|
152 | afterPosition++; |
---|
153 | } |
---|
154 | } |
---|
155 | |
---|
156 | return penalty; |
---|
157 | } |
---|
158 | |
---|
159 | static inline int master_gapAt(int atPosition, int length) |
---|
160 | { |
---|
161 | return master_gapAtWithOpenPenalty(atPosition, length, master_gap_open(atPosition)); |
---|
162 | } |
---|
163 | |
---|
164 | static inline int slave_gap_open(int /*beforePosition*/) |
---|
165 | { |
---|
166 | return DEFAULT_GAP_OPEN; |
---|
167 | } |
---|
168 | |
---|
169 | static inline int slave_gap_extend(int /*beforePosition*/) |
---|
170 | { |
---|
171 | return DEFAULT_GAP_EXTEND; |
---|
172 | } |
---|
173 | |
---|
174 | static inline int slave_gapAtWithOpenPenalty(int atPosition, int length, int penalty) |
---|
175 | { |
---|
176 | return length<=0 ? 0 : penalty + length*slave_gap_extend(atPosition); |
---|
177 | } |
---|
178 | |
---|
179 | static inline int slave_gapAt(int atPosition, int length) |
---|
180 | { |
---|
181 | return slave_gapAtWithOpenPenalty(atPosition, length, slave_gap_open(atPosition)); |
---|
182 | } |
---|
183 | |
---|
184 | #define UNKNOWN_ACID 255 |
---|
185 | static const char *amino_acid_order = "XCSTPAGNDEQHRKMILVFYW"; |
---|
186 | |
---|
187 | #define NUCLEIDS 16 |
---|
188 | static const char *nucleic_acid_order = "-ACGTUMRWSYKVHDBN"; |
---|
189 | static const char *nucleic_maybe_A = "-A----AAA---AAA-A"; |
---|
190 | static const char *nucleic_maybe_C = "--C---C--CC-CC-CC"; |
---|
191 | static const char *nucleic_maybe_G = "---G---G-G-GG-GGG"; |
---|
192 | static const char *nucleic_maybe_T = "----T---T-TT-TTTT"; |
---|
193 | static const char *nucleic_maybe_U = "-----U--U-UU-UUUU"; |
---|
194 | static const char *nucleic_maybe[6] = { NULL, nucleic_maybe_A, nucleic_maybe_C, nucleic_maybe_G, nucleic_maybe_T, nucleic_maybe_U }; |
---|
195 | |
---|
196 | /* |
---|
197 | * M = A or C S = G or C V = A or G or C N = A or C or G or T |
---|
198 | * R = A or G Y = C or T H = A or C or T |
---|
199 | * W = A or T K = G or T D = A or G or T |
---|
200 | */ |
---|
201 | |
---|
202 | #define cheap_if(cond) ((cond) ? 1 : 2) |
---|
203 | static int baseCmp(char c1, char c2) // c1,c2 == 1=A,2=C (==index of character in nucleic_acid_order[]) |
---|
204 | // returns 0 for equal |
---|
205 | // 1 for probably mutations |
---|
206 | // 2 for improbably mutations |
---|
207 | { |
---|
208 | #define COMPARABLE_BASES 5 |
---|
209 | |
---|
210 | if (c1==c2) return 0; |
---|
211 | |
---|
212 | if (c2<c1) // swap |
---|
213 | { |
---|
214 | int c3 = c1; |
---|
215 | |
---|
216 | c1 = c2; |
---|
217 | c2 = c3; |
---|
218 | } |
---|
219 | |
---|
220 | if (c2<=COMPARABLE_BASES) |
---|
221 | { |
---|
222 | awtc_assert(c1<=COMPARABLE_BASES); |
---|
223 | |
---|
224 | switch(c1) |
---|
225 | { |
---|
226 | case 0: return 2; |
---|
227 | case 1: return cheap_if(c2==3); // A->G |
---|
228 | case 2: return cheap_if(c2==4 || c2==5); // C->T/U |
---|
229 | case 3: return cheap_if(c2==1); // G->A |
---|
230 | case 4: if (c2==5) return 0; // T->U |
---|
231 | case 5: if (c2==4) return 0; // U->T |
---|
232 | return cheap_if(c2==2); // T/U->C |
---|
233 | default: awtc_assert(0); |
---|
234 | } |
---|
235 | } |
---|
236 | |
---|
237 | int i; |
---|
238 | int bestMatch = 3; |
---|
239 | |
---|
240 | if (c1<=COMPARABLE_BASES) |
---|
241 | { |
---|
242 | for (i=1; i<=COMPARABLE_BASES; i++) |
---|
243 | { |
---|
244 | if (isalpha(nucleic_maybe[i][c2])) // 'c2' maybe a 'i' |
---|
245 | { |
---|
246 | int match = baseCmp(c1,i); |
---|
247 | if (match<bestMatch) bestMatch = match; |
---|
248 | } |
---|
249 | } |
---|
250 | } |
---|
251 | else |
---|
252 | { |
---|
253 | for (i=1; i<=COMPARABLE_BASES; i++) |
---|
254 | { |
---|
255 | if (isalpha(nucleic_maybe[i][c1])) // 'c1' maybe a 'i' |
---|
256 | { |
---|
257 | int match = baseCmp(i,c2); |
---|
258 | if (match<bestMatch) bestMatch = match; |
---|
259 | } |
---|
260 | } |
---|
261 | } |
---|
262 | |
---|
263 | awtc_assert(bestMatch>=0 && bestMatch<=2); |
---|
264 | return bestMatch; |
---|
265 | } |
---|
266 | #undef cheap_if |
---|
267 | |
---|
268 | int AWTC_baseMatch(char c1, char c2) // c1,c2 == ACGUTRS... |
---|
269 | // returns 0 for equal |
---|
270 | // 1 for probably mutations |
---|
271 | // 2 for improbably mutations |
---|
272 | // -1 if one char is illegal |
---|
273 | { |
---|
274 | char *p1 = strchr(nucleic_acid_order, c1); |
---|
275 | char *p2 = strchr(nucleic_acid_order, c2); |
---|
276 | |
---|
277 | awtc_assert(c1); |
---|
278 | awtc_assert(c2); |
---|
279 | |
---|
280 | if (p1 && p2) return baseCmp(p1-nucleic_acid_order, p2-nucleic_acid_order); |
---|
281 | return -1; |
---|
282 | } |
---|
283 | |
---|
284 | static int getPenalty(char c1, char c2) /* c1,c2 = A=0,C=1,... s.o. */ |
---|
285 | { |
---|
286 | switch (baseCmp(c1,c2)) |
---|
287 | { |
---|
288 | case 1: return DEFAULT_PROBABLY_MUTATION; |
---|
289 | case 2: return DEFAULT_IMPROBABLY_MUTATION; |
---|
290 | default: break; |
---|
291 | } |
---|
292 | |
---|
293 | return 0; |
---|
294 | } |
---|
295 | |
---|
296 | static char *result[3]; // result buffers |
---|
297 | |
---|
298 | static char pam250mt[]={ |
---|
299 | 12, |
---|
300 | 0, 2, |
---|
301 | -2, 1, 3, |
---|
302 | -3, 1, 0, 6, |
---|
303 | -2, 1, 1, 1, 2, |
---|
304 | -3, 1, 0,-1, 1, 5, |
---|
305 | -4, 1, 0,-1, 0, 0, 2, |
---|
306 | -5, 0, 0,-1, 0, 1, 2, 4, |
---|
307 | -5, 0, 0,-1, 0, 0, 1, 3, 4, |
---|
308 | -5,-1,-1, 0, 0,-1, 1, 2, 2, 4, |
---|
309 | -3,-1,-1, 0,-1,-2, 2, 1, 1, 3, 6, |
---|
310 | -4, 0,-1, 0,-2,-3, 0,-1,-1, 1, 2, 6, |
---|
311 | -5, 0, 0,-1,-1,-2, 1, 0, 0, 1, 0, 3, 5, |
---|
312 | -5,-2,-1,-2,-1,-3,-2,-3,-2,-1,-2, 0, 0, 6, |
---|
313 | -2,-1, 0,-2,-1,-3,-2,-2,-2,-2,-2,-2,-2, 2, 5, |
---|
314 | -6,-3,-2,-3,-2,-4,-3,-4,-3,-2,-2,-3,-3, 4, 2, 6, |
---|
315 | -2,-1, 0,-1, 0,-1,-2,-2,-2,-2,-2,-2,-2, 2, 4, 2, 4, |
---|
316 | -4,-3,-3,-5,-4,-5,-4,-6,-5,-5,-2,-4,-5, 0, 1, 2,-1, 9, |
---|
317 | 0,-3,-3,-5,-3,-5,-2,-4,-4,-4, 0,-4,-4,-2,-1,-1,-2, 7,10, |
---|
318 | -8,-2,-5,-6,-6,-7,-4,-7,-7,-5,-3, 2,-3,-4,-5,-2,-6, 0, 0,17 }; |
---|
319 | |
---|
320 | /* |
---|
321 | static char idmat[]={ |
---|
322 | 10, |
---|
323 | 0, 10, |
---|
324 | 0, 0, 10, |
---|
325 | 0, 0, 0, 10, |
---|
326 | 0, 0, 0, 0, 10, |
---|
327 | 0, 0, 0, 0, 1, 10, |
---|
328 | 0, 0, 0, 0, 0, 0, 10, |
---|
329 | 0, 0, 0, 0, 0, 0, 0, 10, |
---|
330 | 0, 0, 0, 0, 0, 0, 0, 0, 10, |
---|
331 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, |
---|
332 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, |
---|
333 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, |
---|
334 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, |
---|
335 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, |
---|
336 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, |
---|
337 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, |
---|
338 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, |
---|
339 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, |
---|
340 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,10, |
---|
341 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,10}; |
---|
342 | */ |
---|
343 | |
---|
344 | /* |
---|
345 | static char pam100mt[]={ |
---|
346 | 14, |
---|
347 | -1, 6, |
---|
348 | -5, 2, 7, |
---|
349 | -6, 1, -1, 10, |
---|
350 | -5, 2, 2, 1, 6, |
---|
351 | -8, 1, -3, -3, 1, 8, |
---|
352 | -8, 2, 0, -3, -1, -1, 7, |
---|
353 | -11,-1, -2, -4, -1, -1, 4, 8, |
---|
354 | -11,-2, -3, -3, 0, -2, 1, 5, 8, |
---|
355 | -11,-3, -3, -1, -2, -5,-1, 1, 4, 9, |
---|
356 | -6,-4, -5, -2, -5, -7, 2, -1, -2, 4,11, |
---|
357 | -6,-1, -4, -2, -5, -8,-3, -6, -5, 1, 1, 10, |
---|
358 | -11,-2, -1, -4, -4, -5, 1, -2, -2, -1,-3, 3, 8, |
---|
359 | -11,-4, -2, -6, -3, -8,-5, -8, -6, -2,-7, -2, 1, 13, |
---|
360 | -5,-4, -1, -6, -3, -7,-4, -6, -5, -5,-7, -4, -4, 2, 9, |
---|
361 | -12,-7, -5, -5, -5, -8,-6, -9, -7, -3,-5, -7, -6, 4, 2, 9, |
---|
362 | -4,-4, -1, -4, 0, -4,-5, -6, -5, -5,-6, -6, -6, 1, 5, 1, 8, |
---|
363 | -10,-5, -6, -9, -7, -8,-6,-11,-11,-10,-4, -7,-11, -2, 0, 0, -5,12, |
---|
364 | -2,-6, -6,-11, -6,-11,-3, -9, -7, -9,-1,-10,-10, -8, -4,-5, -6, 6,13, |
---|
365 | -13,-4,-10,-11,-11,-13,-8,-13,-14,-11,-7, 1, -9,-11,-12,-7,-14,-2,-2,19 }; |
---|
366 | */ |
---|
367 | |
---|
368 | static char *matptr = pam250mt; |
---|
369 | |
---|
370 | #if defined(DEBUG) && 0 |
---|
371 | static GB_ERROR p_decode(const char *naseq, char *seq, int l) |
---|
372 | { |
---|
373 | int len = strlen(amino_acid_order); |
---|
374 | |
---|
375 | for (int i=1; i<=l && naseq[i]; i++) |
---|
376 | { |
---|
377 | awtc_assert(naseq[i]<len); |
---|
378 | seq[i] = amino_acid_order[naseq[i]]; |
---|
379 | } |
---|
380 | |
---|
381 | return 0; |
---|
382 | } |
---|
383 | |
---|
384 | static GB_ERROR n_decode(const char *naseq, char *seq, int l) |
---|
385 | { |
---|
386 | int len = strlen(nucleic_acid_order); |
---|
387 | |
---|
388 | for (int i=1; i<=l && naseq[i]; i++) |
---|
389 | { |
---|
390 | awtc_assert(naseq[i]<len); |
---|
391 | seq[i] = nucleic_acid_order[naseq[i]]; |
---|
392 | } |
---|
393 | |
---|
394 | return 0; |
---|
395 | } |
---|
396 | #endif |
---|
397 | |
---|
398 | static inline GB_ERROR MAXLENtooSmall() |
---|
399 | { |
---|
400 | return GB_export_error("AWTC_ClustalV-aligner: MAXLEN is dimensioned to small for this sequence"); |
---|
401 | } |
---|
402 | |
---|
403 | static inline void *ckalloc(size_t bytes) |
---|
404 | { |
---|
405 | void *ret = malloc(bytes); |
---|
406 | |
---|
407 | if (!ret) error = "out of memory"; |
---|
408 | return ret; |
---|
409 | } |
---|
410 | |
---|
411 | static void init_myers(long max_seq_length) |
---|
412 | { |
---|
413 | int i; |
---|
414 | |
---|
415 | if (error) return; |
---|
416 | |
---|
417 | naa1 = (int **)ckalloc(21 * sizeof (int *) ); |
---|
418 | for(i=0;i<21;i++) naa1[i]=(int *)ckalloc( (max_seq_length+1)*sizeof (int)); |
---|
419 | if (error) return; |
---|
420 | |
---|
421 | naa2 = (int **)ckalloc(21 * sizeof (int *) ); |
---|
422 | for(i=0;i<21;i++) naa2[i]=(int *)ckalloc( (max_seq_length+1)*sizeof (int)); |
---|
423 | if (error) return; |
---|
424 | |
---|
425 | naas = (int **)ckalloc(21 * sizeof (int *) ); |
---|
426 | for(i=0;i<21;i++) naas[i]=(int *)ckalloc( (max_seq_length+1)*sizeof (int)); |
---|
427 | if (error) return; |
---|
428 | } |
---|
429 | |
---|
430 | static void make_pamo(int nv) |
---|
431 | { |
---|
432 | register int i,c; |
---|
433 | |
---|
434 | little_pam=big_pam=matptr[0]; |
---|
435 | for(i=0;i<210;++i) { |
---|
436 | c=matptr[i]; |
---|
437 | little_pam=(little_pam<c) ? little_pam : c; |
---|
438 | big_pam=(big_pam>c) ? big_pam : c; |
---|
439 | } |
---|
440 | for(i=0;i<210;++i) |
---|
441 | pamo[i]= matptr[i]-little_pam; |
---|
442 | nv -= little_pam; |
---|
443 | big_pam -= little_pam; |
---|
444 | xover = big_pam - nv; |
---|
445 | /* |
---|
446 | fprintf(stdout,"\n\nxover= %d, big_pam = %d, little_pam=%d, nv = %d\n\n" |
---|
447 | ,xover,big_pam,little_pam,nv); |
---|
448 | */ |
---|
449 | } |
---|
450 | |
---|
451 | static void fill_pam() |
---|
452 | { |
---|
453 | register int i,j,pos; |
---|
454 | |
---|
455 | pos=0; |
---|
456 | |
---|
457 | for(i=0;i<20;++i) |
---|
458 | for(j=0;j<=i;++j) |
---|
459 | pam[i][j]=pamo[pos++]; |
---|
460 | |
---|
461 | for(i=0;i<20;++i) |
---|
462 | for(j=0;j<=i;++j) |
---|
463 | pam[j][i]=pam[i][j]; |
---|
464 | |
---|
465 | if(dnaflag) |
---|
466 | { |
---|
467 | xover=4; |
---|
468 | big_pam=8; |
---|
469 | for(i=0;i<=NUCLEIDS;++i) |
---|
470 | for(j=0;j<=NUCLEIDS;++j) |
---|
471 | weights[i][j] = getPenalty(i,j); |
---|
472 | } |
---|
473 | else { |
---|
474 | for(i=1;i<21;++i) |
---|
475 | for(j=1;j<21;++j) { |
---|
476 | weights[i][j] = big_pam - pam[i-1][j-1]; |
---|
477 | } |
---|
478 | for(i=0;i<21;++i) { |
---|
479 | weights[0][i] = xover; |
---|
480 | weights[i][0] = xover; |
---|
481 | } |
---|
482 | } |
---|
483 | } |
---|
484 | |
---|
485 | static void exit_myers(void) |
---|
486 | { |
---|
487 | int i; |
---|
488 | for (i=0; i<21; i++) delete(naas[i]); |
---|
489 | delete(naas); |
---|
490 | for (i=0; i<21; i++) delete(naa2[i]); |
---|
491 | delete(naa2); |
---|
492 | for (i=0; i<21; i++) delete(naa1[i]); |
---|
493 | delete(naa1); |
---|
494 | } |
---|
495 | |
---|
496 | static void init_show_pair(long max_seq_length) |
---|
497 | { |
---|
498 | displ = (int *) ckalloc( (2*max_seq_length +1) * sizeof (int) ); |
---|
499 | |
---|
500 | zza = (int *)ckalloc( (max_seq_length+1) * sizeof (int) ); |
---|
501 | zzb = (int *)ckalloc( (max_seq_length+1) * sizeof (int) ); |
---|
502 | |
---|
503 | zzc = (int *)ckalloc( (max_seq_length+1) * sizeof (int) ); |
---|
504 | zzd = (int *)ckalloc( (max_seq_length+1) * sizeof (int) ); |
---|
505 | } |
---|
506 | |
---|
507 | void exit_show_pair(void) |
---|
508 | { |
---|
509 | delete(zzd); |
---|
510 | delete(zzc); |
---|
511 | delete(zzb); |
---|
512 | delete(zza); |
---|
513 | delete(displ); |
---|
514 | } |
---|
515 | |
---|
516 | static inline void add(int v) // insert 'v' gaps into master ??? |
---|
517 | { |
---|
518 | if (last_print<0 && print_ptr>0) { |
---|
519 | displ[print_ptr-1] = v; |
---|
520 | displ[print_ptr++] = last_print; |
---|
521 | } |
---|
522 | else { |
---|
523 | last_print = displ[print_ptr++] = v; |
---|
524 | } |
---|
525 | } |
---|
526 | |
---|
527 | static MAXN_2(inline) int calc_weight(int iat,int jat,int v1,int v2) |
---|
528 | { |
---|
529 | #if (MAXN==2) |
---|
530 | awtc_assert(pos1==1 && pos2==1); |
---|
531 | int j = seq_array[alist[1]][v2+jat-1]; |
---|
532 | return j>0 ? naas[j][v1+iat-1] : 0; |
---|
533 | #else |
---|
534 | int sum,i,j,lookn,ret; |
---|
535 | int ipos,jpos; |
---|
536 | |
---|
537 | ipos = v1 + iat -1; |
---|
538 | jpos = v2 + jat -1; |
---|
539 | |
---|
540 | ret = 0; |
---|
541 | sum = lookn = 0; |
---|
542 | |
---|
543 | if(pos1>=pos2) |
---|
544 | { |
---|
545 | for(i=1;i<=pos2;++i) |
---|
546 | { |
---|
547 | j=seq_array[alist[i]][jpos]; |
---|
548 | if(j>0) |
---|
549 | { |
---|
550 | sum += naas[j][ipos]; |
---|
551 | ++lookn; |
---|
552 | } |
---|
553 | } |
---|
554 | } |
---|
555 | else |
---|
556 | { |
---|
557 | for(i=1;i<=pos1;++i) |
---|
558 | { |
---|
559 | j=seq_array[alist[i]][ipos]; |
---|
560 | if(j>0) |
---|
561 | { |
---|
562 | sum += naas[j][jpos]; |
---|
563 | ++lookn; |
---|
564 | } |
---|
565 | } |
---|
566 | } |
---|
567 | if (sum>0) ret = sum/lookn; |
---|
568 | return ret; |
---|
569 | #endif |
---|
570 | } |
---|
571 | |
---|
572 | #ifdef MATRIX_DUMP |
---|
573 | static inline char *nstr(char *cp, int length) |
---|
574 | { |
---|
575 | char *s = lstr(cp,length); |
---|
576 | n_decode(s-1, s-1, length); |
---|
577 | return s; |
---|
578 | } |
---|
579 | |
---|
580 | static inline void dumpMatrix(int x0, int y0, int breite, int hoehe, int mitte_vert) |
---|
581 | { |
---|
582 | int b; |
---|
583 | int h; |
---|
584 | char *sl = (char*)malloc(hoehe+3); |
---|
585 | char *ma = (char*)malloc(breite+3); |
---|
586 | |
---|
587 | sprintf(ma, "-%s-", nstr(seq_array[1]+x0, breite)); |
---|
588 | sprintf(sl, "-%s-", nstr(seq_array[2]+y0, hoehe)); |
---|
589 | |
---|
590 | printf(" "); |
---|
591 | |
---|
592 | for (b=0; b<=mitte_vert; b++) |
---|
593 | printf("%5c", ma[b]); |
---|
594 | printf(" MID"); |
---|
595 | for (b++; b<=breite+1+1; b++) |
---|
596 | printf("%5c", ma[b-1]); |
---|
597 | |
---|
598 | for (h=0; h<=hoehe+1; h++) |
---|
599 | { |
---|
600 | printf("\n%c vertical: ", sl[h]); |
---|
601 | for (b=0; b<=breite+1+1; b++) printf("%5i", vertical[b][h]); |
---|
602 | printf("\n verticalOpen: "); |
---|
603 | for (b=0; b<=breite+1+1; b++) printf("%5i", verticalOpen[b][h]); |
---|
604 | printf("\n diagonal: "); |
---|
605 | for (b=0; b<=breite+1+1; b++) printf("%5i", diagonal[b][h]); |
---|
606 | printf("\n horizontal: "); |
---|
607 | for (b=0; b<=breite+1+1; b++) printf("%5i", horizontal[b][h]); |
---|
608 | printf("\n horizontalOpen:"); |
---|
609 | for (b=0; b<=breite+1+1; b++) printf("%5i", horizontalOpen[b][h]); |
---|
610 | printf("\n"); |
---|
611 | } |
---|
612 | |
---|
613 | printf("--------------------\n"); |
---|
614 | |
---|
615 | free(ma); |
---|
616 | free(sl); |
---|
617 | } |
---|
618 | #endif |
---|
619 | |
---|
620 | static int diff(int v1,int v2,int v3,int v4, int st,int en) |
---|
621 | /* v1,v3 master sequence (v1=offset, v3=length) |
---|
622 | * v2,v4 slave sequence (v2=offset, v4=length) |
---|
623 | * st,en gap_open-penalties for start and end of the sequence |
---|
624 | * |
---|
625 | * returns costs for inserted gaps |
---|
626 | */ |
---|
627 | { |
---|
628 | int ctrc,ctri,ctrj=0, |
---|
629 | i,j,k,l,m,n,p, |
---|
630 | flag; |
---|
631 | |
---|
632 | #ifdef DEBUG |
---|
633 | # ifdef MATRIX_DUMP |
---|
634 | int display_matrix = 0; |
---|
635 | |
---|
636 | awtc_assert(v3<=(DISPLAY_MATRIX_SIZE+2)); // width |
---|
637 | awtc_assert(v4<=(DISPLAY_MATRIX_SIZE)); // height |
---|
638 | |
---|
639 | # define DISPLAY_MATRIX_ELEMENTS ((DISPLAY_MATRIX_SIZE+2)*(DISPLAY_MATRIX_SIZE+2)) |
---|
640 | |
---|
641 | memset(vertical, -1, DISPLAY_MATRIX_ELEMENTS*sizeof(int)); |
---|
642 | memset(verticalOpen, -1, DISPLAY_MATRIX_ELEMENTS*sizeof(int)); |
---|
643 | memset(diagonal, -1, DISPLAY_MATRIX_ELEMENTS*sizeof(int)); |
---|
644 | memset(horizontal, -1, DISPLAY_MATRIX_ELEMENTS*sizeof(int)); |
---|
645 | memset(horizontalOpen, -1, DISPLAY_MATRIX_ELEMENTS*sizeof(int)); |
---|
646 | |
---|
647 | # endif |
---|
648 | #endif |
---|
649 | static int deep; |
---|
650 | deep++; |
---|
651 | |
---|
652 | #if (defined (DEBUG) && 0) |
---|
653 | { |
---|
654 | char *d; |
---|
655 | |
---|
656 | d = lstr(seq_array[1]+v1,v3); |
---|
657 | n_decode(d-1,d-1,v3); |
---|
658 | |
---|
659 | for (int cnt=0; cnt<deep; cnt++) putchar(' '); |
---|
660 | printf("master = '%s' (penalties left=%i right=%i)\n", d, st, en); |
---|
661 | d = lstr(seq_array[2]+v2,v4); |
---|
662 | n_decode(d-1,d-1,v4); |
---|
663 | |
---|
664 | for (cnt=0; cnt<deep; cnt++) putchar(' '); |
---|
665 | printf("slave = '%s'\n", d); |
---|
666 | } |
---|
667 | #endif |
---|
668 | |
---|
669 | // q = gap_open + gap_extend; // q is replaced for local positions |
---|
670 | |
---|
671 | if(v4<=0) { // if slave sequence is empty |
---|
672 | if(v3>0) { |
---|
673 | if(last_print<0 ) { |
---|
674 | last_print = displ[print_ptr-1] -= v3; // add .. |
---|
675 | } |
---|
676 | else { |
---|
677 | last_print = displ[print_ptr++] = -(v3); // .. or insert gap of length 'v3' into slave |
---|
678 | } |
---|
679 | } |
---|
680 | |
---|
681 | deep--; |
---|
682 | return master_gapAt(v1,v3); |
---|
683 | } |
---|
684 | |
---|
685 | if(v3<=1) { |
---|
686 | if(v3<=0) { // if master sequence is empty |
---|
687 | add(v4); // ??? insert gap length 'v4' into master ??? |
---|
688 | deep--; |
---|
689 | return slave_gapAt(v2,v4); |
---|
690 | } |
---|
691 | |
---|
692 | awtc_assert(v3==1); |
---|
693 | // if master length == 1 |
---|
694 | |
---|
695 | if (v4==1) |
---|
696 | { |
---|
697 | if(st>en) |
---|
698 | st=en; |
---|
699 | |
---|
700 | /***************if(!v4)*********BUG********************************/ |
---|
701 | |
---|
702 | ctrc = slave_gapAtWithOpenPenalty(v2,v4,st); |
---|
703 | ctrj = 0; |
---|
704 | |
---|
705 | for(j=1;j<=v4;++j) |
---|
706 | { |
---|
707 | k = slave_gapAt(v2, j-1) + calc_weight(1,j,v1,v2) + slave_gapAt(v2+j,v4-j); |
---|
708 | if(k<ctrc) { ctrc = k; ctrj = j; } |
---|
709 | } |
---|
710 | |
---|
711 | if(!ctrj) |
---|
712 | { |
---|
713 | add(v4); |
---|
714 | if(last_print<0) |
---|
715 | last_print = displ[print_ptr-1] -= 1; |
---|
716 | else |
---|
717 | last_print = displ[print_ptr++] = -1; |
---|
718 | } |
---|
719 | else |
---|
720 | { |
---|
721 | if(ctrj>1) |
---|
722 | add(ctrj-1); |
---|
723 | displ[print_ptr++] = last_print = 0; |
---|
724 | if(ctrj<v4) |
---|
725 | add(v4-ctrj); |
---|
726 | } |
---|
727 | |
---|
728 | deep--; |
---|
729 | return ctrc; |
---|
730 | } |
---|
731 | } |
---|
732 | |
---|
733 | awtc_assert(v3>=1 && v4>=1); |
---|
734 | |
---|
735 | // slave length >= 1 |
---|
736 | // master length >= 1 |
---|
737 | |
---|
738 | # define AF 0 |
---|
739 | |
---|
740 | // first column of matrix (slave side): |
---|
741 | IF_MATRIX_DUMP(vertical[0][0]=) |
---|
742 | zza[0] = 0; |
---|
743 | p = master_gap_open(v1); |
---|
744 | for(j=1; j<=v4; j++) |
---|
745 | { |
---|
746 | p += master_gap_extend(v1); |
---|
747 | IF_MATRIX_DUMP(vertical[0][j]=) |
---|
748 | zza[j] = p; |
---|
749 | IF_MATRIX_DUMP(verticalOpen[0][j]=) |
---|
750 | zzb[j] = p + master_gap_open(v1); |
---|
751 | } |
---|
752 | |
---|
753 | // left half of the matrix |
---|
754 | p = st; |
---|
755 | ctri = v3 / 2; |
---|
756 | for(i=1; i<=ctri; i++) |
---|
757 | { |
---|
758 | n = zza[0]; |
---|
759 | p += master_gap_extend(v1+i+AF); |
---|
760 | k = p; |
---|
761 | IF_MATRIX_DUMP(vertical[i][0]=) |
---|
762 | zza[0] = k; |
---|
763 | l = p + master_gap_open(v1+i+AF); |
---|
764 | |
---|
765 | for(j=1; j<=v4; j++) |
---|
766 | { |
---|
767 | // from above (gap in master (behind position i)) |
---|
768 | IF_MATRIX_DUMP(verticalOpen[i][j]=) k += master_gap_open(v1+i+AF)+master_gap_extend(v1+i+AF); // (1) |
---|
769 | IF_MATRIX_DUMP(vertical[i][j]=) l += master_gap_extend(v1+i+AF); // (2) |
---|
770 | if(k<l) l = k; // l=min((1),(2)) |
---|
771 | |
---|
772 | // from left (gap in slave (behind position j)) |
---|
773 | IF_MATRIX_DUMP(horizontalOpen[i][j]=) k = zza[j] + slave_gap_open(v2+j+AF)+slave_gap_extend(v2+j+AF); // (3) |
---|
774 | IF_MATRIX_DUMP(horizontal[i][j]=) m = zzb[j] + slave_gap_extend(v2+j+AF); // (4) |
---|
775 | if(k<m) m = k; // m=min((3),(4)) |
---|
776 | |
---|
777 | // diagonal (no gaps) |
---|
778 | IF_MATRIX_DUMP(diagonal[i][j]=) k = n + calc_weight(i,j,v1,v2); // (5) |
---|
779 | if(l<k) k = l; |
---|
780 | if(m<k) k = m; // k = minimum of all paths |
---|
781 | |
---|
782 | n = zza[j]; // minimum of same row; one column to the left |
---|
783 | zza[j] = k; // minimum of all paths to this matrix position |
---|
784 | zzb[j] = m; // minimum of those two paths, where gap was inserted into slave |
---|
785 | } |
---|
786 | } |
---|
787 | |
---|
788 | # define MHO 1 // X-Offset for second half of matrix-arrays (only used to insert MID-column into dumpMatrix()) |
---|
789 | # define BO 1 // Offset for second half of matrix (cause now the indices i and j are smaller as above) |
---|
790 | |
---|
791 | // last column of matrix: |
---|
792 | |
---|
793 | zzb[0]=zza[0]; |
---|
794 | IF_MATRIX_DUMP(vertical[v3+1+MHO][v4+1]=) |
---|
795 | zzc[v4]=0; |
---|
796 | p = master_gap_open(v1+v3); |
---|
797 | for(j=v4-1; j>-1; j--) |
---|
798 | { |
---|
799 | p += master_gap_extend(v1+v3); |
---|
800 | IF_MATRIX_DUMP(vertical[v3+1+MHO][j+BO]=) |
---|
801 | zzc[j] = p; |
---|
802 | IF_MATRIX_DUMP(verticalOpen[v3+1+MHO][j+BO]=) |
---|
803 | zzd[j] = p+master_gap_open(v1+v3); |
---|
804 | } |
---|
805 | |
---|
806 | // right half of matrix (backwards): |
---|
807 | p = en; |
---|
808 | for(i=v3-1;i>=ctri;i--) |
---|
809 | { |
---|
810 | n = zzc[v4]; |
---|
811 | p += master_gap_extend(v1+i); |
---|
812 | k = p; |
---|
813 | IF_MATRIX_DUMP(vertical[i+BO+MHO][v4+1]=) |
---|
814 | zzc[v4] = k; |
---|
815 | l = p+master_gap_open(v1+i); |
---|
816 | |
---|
817 | for(j=v4-1; j>=0; j--) |
---|
818 | { |
---|
819 | // from below (gap in master (in front of position (i+BO))) |
---|
820 | IF_MATRIX_DUMP(verticalOpen[i+BO+MHO][j+BO]=) k += master_gap_open(v1+i)+master_gap_extend(v1+i); // (1) |
---|
821 | IF_MATRIX_DUMP(vertical[i+BO+MHO][j+BO]=) l += master_gap_extend(v1+i); // (2) |
---|
822 | if(k<l) l = k; // l=min((1),(2)) |
---|
823 | |
---|
824 | // from right (gap in slave (in front of position (j+BO))) |
---|
825 | IF_MATRIX_DUMP(horizontalOpen[i+BO+MHO][j+BO]=) k = zzc[j] + slave_gap_open(v2+j) + slave_gap_extend(v2+j); // (3) |
---|
826 | IF_MATRIX_DUMP(horizontal[i+BO+MHO][j+BO]=) m = zzd[j] + slave_gap_extend(v2+j); // (4) |
---|
827 | if(k<m) m = k; // m=min((3),(4)) |
---|
828 | |
---|
829 | // diagonal (no gaps) |
---|
830 | IF_MATRIX_DUMP(diagonal[i+BO+MHO][j+BO]=) k = n + calc_weight(i+BO,j+BO,v1,v2); // (5) |
---|
831 | if(l<k) k = l; |
---|
832 | if(m<k) k = m; // k = minimum of all paths |
---|
833 | |
---|
834 | n = zzc[j]; // minimum of same row; one column to the right |
---|
835 | zzc[j] = k; // minimum of all paths to this matrix position |
---|
836 | zzd[j] = m; // minimum of those two paths, where gap was inserted into slave |
---|
837 | } |
---|
838 | } |
---|
839 | |
---|
840 | #undef BO |
---|
841 | |
---|
842 | // connect rightmost column of first half (column ctri) |
---|
843 | // with leftmost column of second half (column ctri+1) |
---|
844 | |
---|
845 | zzd[v4] = zzc[v4]; |
---|
846 | |
---|
847 | ctrc = INT_MAX; |
---|
848 | flag = 0; |
---|
849 | |
---|
850 | for(j=(ctri?0:1);j<=v4;j++) // was j=0;... (this was erroneous cause it splitted zero-length subsequences) |
---|
851 | { |
---|
852 | IF_MATRIX_DUMP(vertical[ctri+MHO][j]=) |
---|
853 | k = zza[j] + zzc[j]; // sum up both calculations (=diagonal=no gaps) |
---|
854 | |
---|
855 | if (k<ctrc || ((k==ctrc) && (zza[j]!=zzb[j]) && (zzc[j]==zzd[j]))) |
---|
856 | { |
---|
857 | ctrc = k; |
---|
858 | ctrj = j; |
---|
859 | flag = 1; |
---|
860 | } |
---|
861 | } |
---|
862 | |
---|
863 | for(j=v4;j>=(ctri?0:1);j--) |
---|
864 | { |
---|
865 | IF_MATRIX_DUMP(verticalOpen[ctri+MHO][j]=) |
---|
866 | k = zzb[j] + zzd[j] // paths where gaps were inserted into slave (left and right side!) |
---|
867 | - slave_gap_open(j); // subtract gap_open-penalty which was added twice (at left and right end of gap) |
---|
868 | |
---|
869 | if(k<ctrc) |
---|
870 | { |
---|
871 | ctrc = k; |
---|
872 | ctrj = j; |
---|
873 | flag = 2; |
---|
874 | } |
---|
875 | } |
---|
876 | |
---|
877 | awtc_assert(flag>=1 && flag<=2); |
---|
878 | |
---|
879 | #undef MHO |
---|
880 | |
---|
881 | #ifdef MATRIX_DUMP |
---|
882 | if (display_matrix) |
---|
883 | dumpMatrix(v1,v2,v3,v4,ctri); |
---|
884 | #endif |
---|
885 | |
---|
886 | /* Conquer recursively around midpoint */ |
---|
887 | |
---|
888 | if(flag==1) /* Type 1 gaps */ |
---|
889 | { |
---|
890 | diff(v1,v2,ctri,ctrj,st,master_gap_open(v1+ctri)); // includes midpoint ctri and ctrj |
---|
891 | diff(v1+ctri,v2+ctrj,v3-ctri,v4-ctrj,master_gap_open(v1+ctri),en); |
---|
892 | } |
---|
893 | else |
---|
894 | { |
---|
895 | diff(v1,v2,ctri-1,ctrj,st,0); // includes midpoint ctrj |
---|
896 | |
---|
897 | if(last_print<0) /* Delete 2 */ |
---|
898 | last_print = displ[print_ptr-1] -= 2; |
---|
899 | else |
---|
900 | last_print = displ[print_ptr++] = -2; |
---|
901 | |
---|
902 | diff(v1+ctri+1,v2+ctrj,v3-ctri-1,v4-ctrj,0,en); |
---|
903 | } |
---|
904 | |
---|
905 | deep--; |
---|
906 | return ctrc; /* Return the score of the best alignment */ |
---|
907 | } |
---|
908 | |
---|
909 | static void do_align( /*int v1,*/ int *score, long act_seq_length) |
---|
910 | { |
---|
911 | int i,j,k,l1,l2,n; |
---|
912 | int t_arr[21]; |
---|
913 | |
---|
914 | l1=l2=pos1=pos2=0; |
---|
915 | |
---|
916 | // clear statistics |
---|
917 | |
---|
918 | for(i=1;i<=act_seq_length;++i) |
---|
919 | { |
---|
920 | for(j=0;j<21;++j) naa1[j][i]=naa2[j][i]=0; |
---|
921 | for(j=1;j<21;++j) naas[j][i]=0; |
---|
922 | } |
---|
923 | |
---|
924 | // create position statistics for each group |
---|
925 | // [here every group contains only one seq] |
---|
926 | |
---|
927 | for(i=1;i<=nseqs;++i) |
---|
928 | { |
---|
929 | if(group[i]==1) |
---|
930 | { |
---|
931 | fst_list[++pos1]=i; |
---|
932 | for(j=1;j<=seqlen_array[i];++j) |
---|
933 | if(seq_array[i][j]>0) |
---|
934 | { |
---|
935 | ++naa1[seq_array[i][j]][j]; |
---|
936 | ++naa1[0][j]; |
---|
937 | } |
---|
938 | if(seqlen_array[i]>l1) l1=seqlen_array[i]; |
---|
939 | } |
---|
940 | else if(group[i]==2) |
---|
941 | { |
---|
942 | snd_list[++pos2]=i; |
---|
943 | for(j=1;j<=seqlen_array[i];++j) |
---|
944 | if(seq_array[i][j]>0) |
---|
945 | { |
---|
946 | ++naa2[seq_array[i][j]][j]; |
---|
947 | ++naa2[0][j]; |
---|
948 | } |
---|
949 | if(seqlen_array[i]>l2) l2=seqlen_array[i]; |
---|
950 | } |
---|
951 | } |
---|
952 | |
---|
953 | if(pos1>=pos2) |
---|
954 | { |
---|
955 | for(i=1;i<=pos2;++i) alist[i]=snd_list[i]; |
---|
956 | for(n=1;n<=l1;++n) |
---|
957 | { |
---|
958 | for(i=1;i<21;++i) t_arr[i]=0; |
---|
959 | for(i=1;i<21;++i) |
---|
960 | if(naa1[i][n]>0) |
---|
961 | for(j=1;j<21;++j) |
---|
962 | t_arr[j] += (weights[i][j]*naa1[i][n]); |
---|
963 | k = naa1[0][n]; |
---|
964 | if(k>0) |
---|
965 | for(i=1;i<21;++i) |
---|
966 | naas[i][n]=t_arr[i]/k; |
---|
967 | } |
---|
968 | } |
---|
969 | else |
---|
970 | { |
---|
971 | MAXN_2_assert(0); // should never occur if MAXN==2 |
---|
972 | #if (MAXN!=2) |
---|
973 | for(i=1;i<=pos1;++i) alist[i]=fst_list[i]; |
---|
974 | for(n=1;n<=l2;++n) |
---|
975 | { |
---|
976 | for(i=1;i<21;++i) t_arr[i]=0; |
---|
977 | for(i=1;i<21;++i) |
---|
978 | if(naa2[i][n]>0) |
---|
979 | for(j=1;j<21;++j) |
---|
980 | t_arr[j] += (weights[i][j]*naa2[i][n]); |
---|
981 | k = naa2[0][n]; |
---|
982 | if(k>0) |
---|
983 | for(i=1;i<21;++i) |
---|
984 | naas[i][n]=t_arr[i]/k; |
---|
985 | } |
---|
986 | #endif |
---|
987 | } |
---|
988 | |
---|
989 | *score=diff(1,1,l1,l2,master_gap_open(1),master_gap_open(l1+1)/*v1,v1*/); /* Myers and Miller alignment now */ |
---|
990 | } |
---|
991 | |
---|
992 | /* |
---|
993 | static void group_gap(int len,int sclass,char *seq) |
---|
994 | { |
---|
995 | int i,j,k,xtra; |
---|
996 | |
---|
997 | for(i=1;i<=nseqs;++i) |
---|
998 | if(group[i] == sclass) |
---|
999 | { |
---|
1000 | xtra = len - seqlen_array[i]; |
---|
1001 | if(xtra>0) |
---|
1002 | for(j=1;j<=xtra;++j) |
---|
1003 | seq_array[i][seqlen_array[i]+j] = -1; |
---|
1004 | for(j=1;j<=len;++j) |
---|
1005 | if(seq[j] == '-') { |
---|
1006 | for(k=len;k>=j+1;--k) |
---|
1007 | seq_array[i][k] = seq_array[i][k-1]; |
---|
1008 | seq_array[i][j] = -1; |
---|
1009 | } |
---|
1010 | seqlen_array[i] = len; |
---|
1011 | } |
---|
1012 | } |
---|
1013 | */ |
---|
1014 | |
---|
1015 | static int add_ggaps(long /*max_seq_length*/) |
---|
1016 | { |
---|
1017 | int i,j,k,pos,to_do; |
---|
1018 | |
---|
1019 | pos=1; |
---|
1020 | to_do=print_ptr; |
---|
1021 | |
---|
1022 | for(i=0;i<to_do;++i) // was: 1 .. <=to_do |
---|
1023 | { |
---|
1024 | if(displ[i]==0) |
---|
1025 | { |
---|
1026 | result[1][pos]=result[2][pos]='*'; |
---|
1027 | ++pos; |
---|
1028 | } |
---|
1029 | else |
---|
1030 | { |
---|
1031 | if((k=displ[i])>0) |
---|
1032 | { |
---|
1033 | for(j=0;j<=k-1;++j) |
---|
1034 | { |
---|
1035 | result[2][pos+j]='*'; |
---|
1036 | result[1][pos+j]='-'; |
---|
1037 | } |
---|
1038 | pos += k; |
---|
1039 | } |
---|
1040 | else |
---|
1041 | { |
---|
1042 | k = (displ[i]<0) ? displ[i] * -1 : displ[i]; |
---|
1043 | for(j=0;j<=k-1;++j) |
---|
1044 | { |
---|
1045 | result[1][pos+j]='*'; |
---|
1046 | result[2][pos+j]='-'; |
---|
1047 | } |
---|
1048 | pos += k; |
---|
1049 | } |
---|
1050 | } |
---|
1051 | } |
---|
1052 | |
---|
1053 | #ifdef DEBUG |
---|
1054 | result[1][pos] = 0; |
---|
1055 | result[2][pos] = 0; |
---|
1056 | #endif |
---|
1057 | |
---|
1058 | return pos-1; |
---|
1059 | } |
---|
1060 | |
---|
1061 | |
---|
1062 | static int res_index(const char *t,char c) |
---|
1063 | { |
---|
1064 | if (t[0]==c) |
---|
1065 | return UNKNOWN_ACID; |
---|
1066 | for (int i=1; t[i]; i++) |
---|
1067 | if (t[i]==c) |
---|
1068 | return i; |
---|
1069 | return 0; |
---|
1070 | } |
---|
1071 | |
---|
1072 | static GB_ERROR p_encode(const char *seq, char *naseq, int l) |
---|
1073 | { /* code seq as ints .. use -2 for gap */ |
---|
1074 | int i; |
---|
1075 | /* static char *aacids="CSTPAGNDEQHRKMILVFYW";*/ |
---|
1076 | |
---|
1077 | for(i=1; i<=l; i++) |
---|
1078 | { |
---|
1079 | int c = res_index(amino_acid_order,seq[i]); |
---|
1080 | |
---|
1081 | if (!c) |
---|
1082 | { |
---|
1083 | if (seq[i]!='?') { // consensus contains ??? |
---|
1084 | char buf[100]; |
---|
1085 | sprintf(buf, "Illegal character '%c' in sequence data", seq[i]); |
---|
1086 | aw_message(buf); |
---|
1087 | } |
---|
1088 | |
---|
1089 | c = res_index(amino_acid_order, 'X'); |
---|
1090 | } |
---|
1091 | |
---|
1092 | awtc_assert(c>0); |
---|
1093 | naseq[i] = c; |
---|
1094 | } |
---|
1095 | |
---|
1096 | return 0; |
---|
1097 | } |
---|
1098 | |
---|
1099 | static GB_ERROR n_encode(const char *seq,char *naseq,int l) |
---|
1100 | { /* code seq as ints .. use -2 for gap */ |
---|
1101 | int i; |
---|
1102 | /* static char *nucs="ACGTU"; */ |
---|
1103 | int warned = 0; |
---|
1104 | |
---|
1105 | for(i=1; i<=l; i++) |
---|
1106 | { |
---|
1107 | int c = res_index(nucleic_acid_order,seq[i]); |
---|
1108 | |
---|
1109 | if (!c) |
---|
1110 | { |
---|
1111 | if (seq[i]!='?') { // consensus contains ??? |
---|
1112 | if (!warned) { |
---|
1113 | char buf[100]; |
---|
1114 | sprintf(buf, "Illegal character '%c' in sequence data", seq[i]); |
---|
1115 | aw_message(buf); |
---|
1116 | warned = 1; |
---|
1117 | } |
---|
1118 | } |
---|
1119 | c = res_index(nucleic_acid_order, 'N'); |
---|
1120 | } |
---|
1121 | |
---|
1122 | awtc_assert(c>0); |
---|
1123 | naseq[i] = c; |
---|
1124 | } |
---|
1125 | |
---|
1126 | return 0; |
---|
1127 | } |
---|
1128 | |
---|
1129 | GB_ERROR AWTC_ClustalV_align(int is_dna, int weighted, |
---|
1130 | const char *seq1, int length1, |
---|
1131 | const char *seq2, int length2, |
---|
1132 | const int *gapsBefore1, |
---|
1133 | int max_seq_length, |
---|
1134 | char **resultPtr1, |
---|
1135 | char **resultPtr2, |
---|
1136 | int *resLengthPtr, |
---|
1137 | int *score) |
---|
1138 | { |
---|
1139 | error = 0; |
---|
1140 | gapsBeforePosition = gapsBefore1; |
---|
1141 | |
---|
1142 | if (!module_initialized) // initialize only once |
---|
1143 | { |
---|
1144 | dnaflag = is_dna; |
---|
1145 | is_weight = weighted; |
---|
1146 | |
---|
1147 | init_myers(max_seq_length); |
---|
1148 | init_show_pair(max_seq_length); |
---|
1149 | make_pamo(0); |
---|
1150 | fill_pam(); |
---|
1151 | |
---|
1152 | for (int i=1; i<=2; i++) |
---|
1153 | { |
---|
1154 | seq_array[i] = (char*)ckalloc((max_seq_length+2)*sizeof(char)); |
---|
1155 | result[i] = (char*)ckalloc((max_seq_length+2)*sizeof(char)); |
---|
1156 | group[i] = i; |
---|
1157 | } |
---|
1158 | |
---|
1159 | if (error) goto ende; |
---|
1160 | |
---|
1161 | module_initialized = 1; |
---|
1162 | } |
---|
1163 | else |
---|
1164 | { |
---|
1165 | if (dnaflag!=is_dna || is_weight!=weighted) |
---|
1166 | { |
---|
1167 | return "Please call AWTC_ClustalV_align_exit() between calls that differ in\n" |
---|
1168 | "one of the following parameters:\n" |
---|
1169 | " is_dna, weighted"; |
---|
1170 | } |
---|
1171 | } |
---|
1172 | |
---|
1173 | nseqs = 2; |
---|
1174 | print_ptr = 0; |
---|
1175 | |
---|
1176 | #if defined(DEBUG) && 1 |
---|
1177 | memset(&seq_array[1][1], 0, max_seq_length*sizeof(seq_array[1][1])); |
---|
1178 | memset(&seq_array[2][1], 0, max_seq_length*sizeof(seq_array[2][1])); |
---|
1179 | memset(&displ[1], 0xff, max_seq_length*sizeof(displ[1])); |
---|
1180 | seq_array[1][0] = '_'; |
---|
1181 | seq_array[2][0] = '_'; |
---|
1182 | #endif |
---|
1183 | |
---|
1184 | { |
---|
1185 | GB_ERROR (*encode)(const char*,char*,int) = dnaflag ? n_encode : p_encode; |
---|
1186 | |
---|
1187 | error = encode(seq1-1, seq_array[1], length1); |
---|
1188 | if (error) goto ende; |
---|
1189 | seqlen_array[1] = length1; |
---|
1190 | |
---|
1191 | error = encode(seq2-1, seq_array[2], length2); |
---|
1192 | if (error) goto ende; |
---|
1193 | seqlen_array[2] = length2; |
---|
1194 | |
---|
1195 | do_align(/*gap_open,*/ score,max(length1,length2)); |
---|
1196 | |
---|
1197 | int alignedLength = add_ggaps(max_seq_length); |
---|
1198 | |
---|
1199 | *resultPtr1 = result[1]+1; |
---|
1200 | *resultPtr2 = result[2]+1; |
---|
1201 | *resLengthPtr = alignedLength; |
---|
1202 | } |
---|
1203 | |
---|
1204 | ende: |
---|
1205 | |
---|
1206 | return error; |
---|
1207 | } |
---|
1208 | |
---|
1209 | void AWTC_ClustalV_align_exit(void) |
---|
1210 | { |
---|
1211 | if (module_initialized) |
---|
1212 | { |
---|
1213 | module_initialized = 0; |
---|
1214 | |
---|
1215 | for (int i=1; i<=2; i++) |
---|
1216 | { |
---|
1217 | free(result[i]); |
---|
1218 | free(seq_array[i]); |
---|
1219 | } |
---|
1220 | exit_show_pair(); |
---|
1221 | exit_myers(); |
---|
1222 | } |
---|
1223 | } |
---|
1224 | |
---|
1225 | |
---|
1226 | |
---|
1227 | |
---|
1228 | |
---|