1 | #include <ctype.h> |
---|
2 | #include <stdlib.h> |
---|
3 | #include <memory.h> |
---|
4 | // #include <malloc.h> |
---|
5 | #include <string.h> |
---|
6 | #include <stdio.h> |
---|
7 | #include <limits.h> |
---|
8 | |
---|
9 | #include <arbdb.h> |
---|
10 | #include <aw_root.hxx> |
---|
11 | |
---|
12 | #include "ClustalV.hxx" |
---|
13 | #include "awtc_seq_search.hxx" |
---|
14 | |
---|
15 | /* ---------------------------------------------------------------- */ |
---|
16 | |
---|
17 | #define MASTER_GAP_OPEN 50 |
---|
18 | #define CHEAP_GAP_OPEN 20 // penalty for creating a gap (if we have gaps in master) |
---|
19 | |
---|
20 | #define DEFAULT_GAP_OPEN 30 // penalty for creating a gap |
---|
21 | |
---|
22 | #define MASTER_GAP_EXTEND 18 |
---|
23 | #define CHEAP_GAP_EXTEND 5 // penalty for extending a gap (if we have gaps in master) |
---|
24 | |
---|
25 | #define DEFAULT_GAP_EXTEND 10 // penalty for extending a gap |
---|
26 | |
---|
27 | #define DEFAULT_IMPROBABLY_MUTATION 10 // penalty for mutations |
---|
28 | #define DEFAULT_PROBABLY_MUTATION 4 // penalty for special mutations (A<->G,C<->U/T) (only if 'weighted') |
---|
29 | |
---|
30 | #define DYNAMIC_PENALTIES // otherwise you get FIXED PENALTIES (=no cheap penalties) |
---|
31 | |
---|
32 | /* ---------------------------------------------------------------- */ |
---|
33 | |
---|
34 | #define MAX_GAP_OPEN_DISCOUNT (DEFAULT_GAP_OPEN-CHEAP_GAP_OPEN) // maximum subtracted from DEFAULT_GAP_OPEN |
---|
35 | #define MAX_GAP_EXTEND_DISCOUNT (DEFAULT_GAP_EXTEND-CHEAP_GAP_EXTEND) // maximum subtracted from DEFAULT_GAP_EXTEND |
---|
36 | |
---|
37 | #define MAXN 2 /* Maximum number of sequences (both groups) */ |
---|
38 | |
---|
39 | #if (MAXN==2) |
---|
40 | #define MAXN_2(xxx) xxx |
---|
41 | #define MAXN_2_assert(xxx) awtc_assert(xxx) |
---|
42 | #else |
---|
43 | #define MAXN_2(xxx) |
---|
44 | #define MAXN_2_assert(xxx) |
---|
45 | #endif |
---|
46 | |
---|
47 | #define TRUE 1 |
---|
48 | #define FALSE 0 |
---|
49 | |
---|
50 | static GB_ERROR error; |
---|
51 | static bool module_initialized = false; |
---|
52 | |
---|
53 | typedef int Boolean; |
---|
54 | |
---|
55 | static Boolean dnaflag; |
---|
56 | static Boolean is_weight; |
---|
57 | |
---|
58 | #define MAX_BASETYPES 21 |
---|
59 | |
---|
60 | static int xover; |
---|
61 | static int little_pam; |
---|
62 | static int big_pam; |
---|
63 | static int pamo[(MAX_BASETYPES-1)*MAX_BASETYPES/2]; |
---|
64 | static int pam[MAX_BASETYPES][MAX_BASETYPES]; |
---|
65 | |
---|
66 | static int pos1; |
---|
67 | static int pos2; |
---|
68 | static int **naa1; // naa1[basetype][position] counts bases for each position of all sequences in group1 |
---|
69 | static int **naa2; // naa2[basetype][position] same for group2 |
---|
70 | static int **naas; // |
---|
71 | static int seqlen_array[MAXN+1]; // length of all sequences |
---|
72 | static unsigned char *seq_array[MAXN+1]; // the sequences |
---|
73 | static int group[MAXN+1]; // group of sequence |
---|
74 | static int alist[MAXN+1]; // indices of sequences to be aligned |
---|
75 | static int fst_list[MAXN+1]; |
---|
76 | static int snd_list[MAXN+1]; |
---|
77 | |
---|
78 | static int nseqs; // # of sequences |
---|
79 | static int weights[MAX_BASETYPES][MAX_BASETYPES]; // weights[b1][b2] : penalty for mutation from base 'b1' to base 'b2' |
---|
80 | |
---|
81 | #if defined(DEBUG) |
---|
82 | size_t displ_size = 0; |
---|
83 | #endif // DEBUG |
---|
84 | |
---|
85 | static int *displ; // displ == 0 -> base in both , displ<0 -> displ gaps in slave, displ>0 -> displ gaps in master |
---|
86 | static int *zza; // column (left->right) of align matrix (minimum of all paths to this matrix element) |
---|
87 | static int *zzb; // -------------- " ------------------- (minimum of all paths, where gap inserted into slave) |
---|
88 | static int *zzc; // column (left<-right) of align matrix (minimum of all paths to this matrix element) |
---|
89 | static int *zzd; // -------------- " ------------------- (minimum of all paths, where gap inserted into slave) |
---|
90 | static int print_ptr; |
---|
91 | static int last_print; |
---|
92 | |
---|
93 | static const int *gapsBeforePosition; |
---|
94 | |
---|
95 | /* static int gap_open; |
---|
96 | static int gap_extend;*/ |
---|
97 | |
---|
98 | #if defined(DEBUG) |
---|
99 | // #define MATRIX_DUMP |
---|
100 | // #define DISPLAY_DIFF |
---|
101 | #endif // DEBUG |
---|
102 | |
---|
103 | #ifdef MATRIX_DUMP |
---|
104 | # define IF_MATRIX_DUMP(xxx) xxx |
---|
105 | # define DISPLAY_MATRIX_SIZE 3000 |
---|
106 | |
---|
107 | static int vertical [DISPLAY_MATRIX_SIZE+2][DISPLAY_MATRIX_SIZE+2]; |
---|
108 | static int verticalOpen [DISPLAY_MATRIX_SIZE+2][DISPLAY_MATRIX_SIZE+2]; |
---|
109 | static int diagonal [DISPLAY_MATRIX_SIZE+2][DISPLAY_MATRIX_SIZE+2]; |
---|
110 | static int horizontal [DISPLAY_MATRIX_SIZE+2][DISPLAY_MATRIX_SIZE+2]; |
---|
111 | static int horizontalOpen [DISPLAY_MATRIX_SIZE+2][DISPLAY_MATRIX_SIZE+2]; |
---|
112 | |
---|
113 | #else |
---|
114 | # define IF_MATRIX_DUMP(xxx) |
---|
115 | #endif |
---|
116 | |
---|
117 | static inline int master_gap_open(int beforePosition) |
---|
118 | { |
---|
119 | #ifdef DYNAMIC_PENALTIES |
---|
120 | long gaps = gapsBeforePosition[beforePosition-1]; |
---|
121 | return (gaps)? MASTER_GAP_OPEN - MAX_GAP_OPEN_DISCOUNT : MASTER_GAP_OPEN; |
---|
122 | |
---|
123 | /* return |
---|
124 | gaps >= MAX_GAP_OPEN_DISCOUNT |
---|
125 | ? DEFAULT_GAP_OPEN-MAX_GAP_OPEN_DISCOUNT |
---|
126 | : DEFAULT_GAP_OPEN-gaps;*/ |
---|
127 | #else |
---|
128 | return DEFAULT_GAP_OPEN; |
---|
129 | #endif |
---|
130 | } |
---|
131 | static inline int master_gap_extend(int beforePosition) |
---|
132 | { |
---|
133 | #ifdef DYNAMIC_PENALTIES |
---|
134 | long gaps = gapsBeforePosition[beforePosition-1]; |
---|
135 | |
---|
136 | return (gaps)? MASTER_GAP_EXTEND - MAX_GAP_EXTEND_DISCOUNT : MASTER_GAP_EXTEND; |
---|
137 | /* return |
---|
138 | gaps >= MAX_GAP_EXTEND_DISCOUNT |
---|
139 | ? DEFAULT_GAP_EXTEND-MAX_GAP_EXTEND_DISCOUNT |
---|
140 | : DEFAULT_GAP_EXTEND-gaps;*/ |
---|
141 | #else |
---|
142 | return DEFAULT_GAP_EXTEND; |
---|
143 | #endif |
---|
144 | } |
---|
145 | |
---|
146 | static inline int master_gapAtWithOpenPenalty(int atPosition, int length, int penalty) |
---|
147 | { |
---|
148 | if (length<=0) |
---|
149 | return 0; |
---|
150 | |
---|
151 | int beforePosition = atPosition, |
---|
152 | afterPosition = atPosition-1; |
---|
153 | |
---|
154 | while (length--) |
---|
155 | { |
---|
156 | int p1,p2; |
---|
157 | |
---|
158 | if ((p1=master_gap_extend(beforePosition)) < (p2=master_gap_extend(afterPosition+1)) && |
---|
159 | beforePosition>1) |
---|
160 | { |
---|
161 | penalty += p1; |
---|
162 | beforePosition--; |
---|
163 | } |
---|
164 | else |
---|
165 | { |
---|
166 | penalty += p2; |
---|
167 | afterPosition++; |
---|
168 | } |
---|
169 | } |
---|
170 | |
---|
171 | return penalty; |
---|
172 | } |
---|
173 | |
---|
174 | static inline int master_gapAt(int atPosition, int length) |
---|
175 | { |
---|
176 | return master_gapAtWithOpenPenalty(atPosition, length, master_gap_open(atPosition)); |
---|
177 | } |
---|
178 | |
---|
179 | static inline int slave_gap_open(int /*beforePosition*/) |
---|
180 | { |
---|
181 | return DEFAULT_GAP_OPEN; |
---|
182 | } |
---|
183 | |
---|
184 | static inline int slave_gap_extend(int /*beforePosition*/) |
---|
185 | { |
---|
186 | return DEFAULT_GAP_EXTEND; |
---|
187 | } |
---|
188 | |
---|
189 | static inline int slave_gapAtWithOpenPenalty(int atPosition, int length, int penalty) |
---|
190 | { |
---|
191 | return length<=0 ? 0 : penalty + length*slave_gap_extend(atPosition); |
---|
192 | } |
---|
193 | |
---|
194 | static inline int slave_gapAt(int atPosition, int length) |
---|
195 | { |
---|
196 | return slave_gapAtWithOpenPenalty(atPosition, length, slave_gap_open(atPosition)); |
---|
197 | } |
---|
198 | |
---|
199 | #define UNKNOWN_ACID 255 |
---|
200 | static const char *amino_acid_order = "XCSTPAGNDEQHRKMILVFYW"; |
---|
201 | |
---|
202 | #define NUCLEIDS 16 |
---|
203 | static const char *nucleic_acid_order = "-ACGTUMRWSYKVHDBN"; |
---|
204 | static const char *nucleic_maybe_A = "-A----AAA---AAA-A"; |
---|
205 | static const char *nucleic_maybe_C = "--C---C--CC-CC-CC"; |
---|
206 | static const char *nucleic_maybe_G = "---G---G-G-GG-GGG"; |
---|
207 | static const char *nucleic_maybe_T = "----T---T-TT-TTTT"; |
---|
208 | static const char *nucleic_maybe_U = "-----U--U-UU-UUUU"; |
---|
209 | static const char *nucleic_maybe[6] = { NULL, nucleic_maybe_A, nucleic_maybe_C, nucleic_maybe_G, nucleic_maybe_T, nucleic_maybe_U }; |
---|
210 | |
---|
211 | /* |
---|
212 | * M = A or C S = G or C V = A or G or C N = A or C or G or T |
---|
213 | * R = A or G Y = C or T H = A or C or T |
---|
214 | * W = A or T K = G or T D = A or G or T |
---|
215 | */ |
---|
216 | |
---|
217 | #define cheap_if(cond) ((cond) ? 1 : 2) |
---|
218 | static int baseCmp(unsigned char c1, unsigned char c2) // c1,c2 == 1=A,2=C (==index of character in nucleic_acid_order[]) |
---|
219 | // returns 0 for equal |
---|
220 | // 1 for probably mutations |
---|
221 | // 2 for improbably mutations |
---|
222 | { |
---|
223 | #define COMPARABLE_BASES 5 |
---|
224 | |
---|
225 | if (c1==c2) return 0; |
---|
226 | |
---|
227 | if (c2<c1) // swap |
---|
228 | { |
---|
229 | int c3 = c1; |
---|
230 | |
---|
231 | c1 = c2; |
---|
232 | c2 = c3; |
---|
233 | } |
---|
234 | |
---|
235 | if (c2<=COMPARABLE_BASES) |
---|
236 | { |
---|
237 | awtc_assert(c1<=COMPARABLE_BASES); |
---|
238 | |
---|
239 | switch(c1) |
---|
240 | { |
---|
241 | case 0: return 2; |
---|
242 | case 1: return cheap_if(c2==3); // A->G |
---|
243 | case 2: return cheap_if(c2==4 || c2==5); // C->T/U |
---|
244 | case 3: return cheap_if(c2==1); // G->A |
---|
245 | case 4: if (c2==5) return 0; // T->U |
---|
246 | case 5: if (c2==4) return 0; // U->T |
---|
247 | return cheap_if(c2==2); // T/U->C |
---|
248 | default: awtc_assert(0); |
---|
249 | } |
---|
250 | } |
---|
251 | |
---|
252 | int i; |
---|
253 | int bestMatch = 3; |
---|
254 | |
---|
255 | if (c1<=COMPARABLE_BASES) |
---|
256 | { |
---|
257 | for (i=1; i<=COMPARABLE_BASES; i++) |
---|
258 | { |
---|
259 | if (isalpha(nucleic_maybe[i][c2])) // 'c2' maybe a 'i' |
---|
260 | { |
---|
261 | int match = baseCmp(c1,i); |
---|
262 | if (match<bestMatch) bestMatch = match; |
---|
263 | } |
---|
264 | } |
---|
265 | } |
---|
266 | else |
---|
267 | { |
---|
268 | for (i=1; i<=COMPARABLE_BASES; i++) |
---|
269 | { |
---|
270 | if (isalpha(nucleic_maybe[i][c1])) // 'c1' maybe a 'i' |
---|
271 | { |
---|
272 | int match = baseCmp(i,c2); |
---|
273 | if (match<bestMatch) bestMatch = match; |
---|
274 | } |
---|
275 | } |
---|
276 | } |
---|
277 | |
---|
278 | awtc_assert(bestMatch>=0 && bestMatch<=2); |
---|
279 | return bestMatch; |
---|
280 | } |
---|
281 | #undef cheap_if |
---|
282 | |
---|
283 | int AWTC_baseMatch(char c1, char c2) // c1,c2 == ACGUTRS... |
---|
284 | // returns 0 for equal |
---|
285 | // 1 for probably mutations |
---|
286 | // 2 for improbably mutations |
---|
287 | // -1 if one char is illegal |
---|
288 | { |
---|
289 | const char *p1 = strchr(nucleic_acid_order, c1); |
---|
290 | const char *p2 = strchr(nucleic_acid_order, c2); |
---|
291 | |
---|
292 | awtc_assert(c1); |
---|
293 | awtc_assert(c2); |
---|
294 | |
---|
295 | if (p1 && p2) return baseCmp(p1-nucleic_acid_order, p2-nucleic_acid_order); |
---|
296 | return -1; |
---|
297 | } |
---|
298 | |
---|
299 | static int getPenalty(char c1, char c2) /* c1,c2 = A=0,C=1,... s.o. */ |
---|
300 | { |
---|
301 | switch (baseCmp(c1,c2)) |
---|
302 | { |
---|
303 | case 1: return DEFAULT_PROBABLY_MUTATION; |
---|
304 | case 2: return DEFAULT_IMPROBABLY_MUTATION; |
---|
305 | default: break; |
---|
306 | } |
---|
307 | |
---|
308 | return 0; |
---|
309 | } |
---|
310 | |
---|
311 | static char *result[3]; // result buffers |
---|
312 | |
---|
313 | static char pam250mt[]={ |
---|
314 | 12, |
---|
315 | 0, 2, |
---|
316 | -2, 1, 3, |
---|
317 | -3, 1, 0, 6, |
---|
318 | -2, 1, 1, 1, 2, |
---|
319 | -3, 1, 0,-1, 1, 5, |
---|
320 | -4, 1, 0,-1, 0, 0, 2, |
---|
321 | -5, 0, 0,-1, 0, 1, 2, 4, |
---|
322 | -5, 0, 0,-1, 0, 0, 1, 3, 4, |
---|
323 | -5,-1,-1, 0, 0,-1, 1, 2, 2, 4, |
---|
324 | -3,-1,-1, 0,-1,-2, 2, 1, 1, 3, 6, |
---|
325 | -4, 0,-1, 0,-2,-3, 0,-1,-1, 1, 2, 6, |
---|
326 | -5, 0, 0,-1,-1,-2, 1, 0, 0, 1, 0, 3, 5, |
---|
327 | -5,-2,-1,-2,-1,-3,-2,-3,-2,-1,-2, 0, 0, 6, |
---|
328 | -2,-1, 0,-2,-1,-3,-2,-2,-2,-2,-2,-2,-2, 2, 5, |
---|
329 | -6,-3,-2,-3,-2,-4,-3,-4,-3,-2,-2,-3,-3, 4, 2, 6, |
---|
330 | -2,-1, 0,-1, 0,-1,-2,-2,-2,-2,-2,-2,-2, 2, 4, 2, 4, |
---|
331 | -4,-3,-3,-5,-4,-5,-4,-6,-5,-5,-2,-4,-5, 0, 1, 2,-1, 9, |
---|
332 | 0,-3,-3,-5,-3,-5,-2,-4,-4,-4, 0,-4,-4,-2,-1,-1,-2, 7,10, |
---|
333 | -8,-2,-5,-6,-6,-7,-4,-7,-7,-5,-3, 2,-3,-4,-5,-2,-6, 0, 0,17 }; |
---|
334 | |
---|
335 | /* |
---|
336 | static char idmat[]={ |
---|
337 | 10, |
---|
338 | 0, 10, |
---|
339 | 0, 0, 10, |
---|
340 | 0, 0, 0, 10, |
---|
341 | 0, 0, 0, 0, 10, |
---|
342 | 0, 0, 0, 0, 1, 10, |
---|
343 | 0, 0, 0, 0, 0, 0, 10, |
---|
344 | 0, 0, 0, 0, 0, 0, 0, 10, |
---|
345 | 0, 0, 0, 0, 0, 0, 0, 0, 10, |
---|
346 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, |
---|
347 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, |
---|
348 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, |
---|
349 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, |
---|
350 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, |
---|
351 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, |
---|
352 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, |
---|
353 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, |
---|
354 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, |
---|
355 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,10, |
---|
356 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,10}; |
---|
357 | */ |
---|
358 | |
---|
359 | /* |
---|
360 | static char pam100mt[]={ |
---|
361 | 14, |
---|
362 | -1, 6, |
---|
363 | -5, 2, 7, |
---|
364 | -6, 1, -1, 10, |
---|
365 | -5, 2, 2, 1, 6, |
---|
366 | -8, 1, -3, -3, 1, 8, |
---|
367 | -8, 2, 0, -3, -1, -1, 7, |
---|
368 | -11,-1, -2, -4, -1, -1, 4, 8, |
---|
369 | -11,-2, -3, -3, 0, -2, 1, 5, 8, |
---|
370 | -11,-3, -3, -1, -2, -5,-1, 1, 4, 9, |
---|
371 | -6,-4, -5, -2, -5, -7, 2, -1, -2, 4,11, |
---|
372 | -6,-1, -4, -2, -5, -8,-3, -6, -5, 1, 1, 10, |
---|
373 | -11,-2, -1, -4, -4, -5, 1, -2, -2, -1,-3, 3, 8, |
---|
374 | -11,-4, -2, -6, -3, -8,-5, -8, -6, -2,-7, -2, 1, 13, |
---|
375 | -5,-4, -1, -6, -3, -7,-4, -6, -5, -5,-7, -4, -4, 2, 9, |
---|
376 | -12,-7, -5, -5, -5, -8,-6, -9, -7, -3,-5, -7, -6, 4, 2, 9, |
---|
377 | -4,-4, -1, -4, 0, -4,-5, -6, -5, -5,-6, -6, -6, 1, 5, 1, 8, |
---|
378 | -10,-5, -6, -9, -7, -8,-6,-11,-11,-10,-4, -7,-11, -2, 0, 0, -5,12, |
---|
379 | -2,-6, -6,-11, -6,-11,-3, -9, -7, -9,-1,-10,-10, -8, -4,-5, -6, 6,13, |
---|
380 | -13,-4,-10,-11,-11,-13,-8,-13,-14,-11,-7, 1, -9,-11,-12,-7,-14,-2,-2,19 }; |
---|
381 | */ |
---|
382 | |
---|
383 | static char *matptr = pam250mt; |
---|
384 | |
---|
385 | #if (defined(DISPLAY_DIFF) || defined(MATRIX_DUMP)) |
---|
386 | static GB_ERROR p_decode(const unsigned char *naseq, unsigned char *seq, int l) { |
---|
387 | int len = strlen(amino_acid_order); |
---|
388 | |
---|
389 | for (int i=1; i<=l && naseq[i]; i++) |
---|
390 | { |
---|
391 | awtc_assert(naseq[i]<len); |
---|
392 | seq[i] = amino_acid_order[naseq[i]]; |
---|
393 | } |
---|
394 | |
---|
395 | return 0; |
---|
396 | } |
---|
397 | |
---|
398 | static GB_ERROR n_decode(const unsigned char *naseq, unsigned char *seq, int l) { |
---|
399 | int len = strlen(nucleic_acid_order); |
---|
400 | |
---|
401 | for (int i=1; i<=l && naseq[i]; i++) |
---|
402 | { |
---|
403 | awtc_assert(naseq[i]<len); |
---|
404 | seq[i] = nucleic_acid_order[naseq[i]]; |
---|
405 | } |
---|
406 | |
---|
407 | return 0; |
---|
408 | } |
---|
409 | #endif |
---|
410 | |
---|
411 | static inline GB_ERROR MAXLENtooSmall() |
---|
412 | { |
---|
413 | return GB_export_error("AWTC_ClustalV-aligner: MAXLEN is dimensioned to small for this sequence"); |
---|
414 | } |
---|
415 | |
---|
416 | static inline void *ckalloc(size_t bytes) |
---|
417 | { |
---|
418 | void *ret = malloc(bytes); |
---|
419 | |
---|
420 | if (!ret) error = "out of memory"; |
---|
421 | return ret; |
---|
422 | } |
---|
423 | |
---|
424 | static void init_myers(long max_seq_length) { |
---|
425 | if (!error) { |
---|
426 | naa1 = (int **)ckalloc(MAX_BASETYPES * sizeof (int *) ); |
---|
427 | naa2 = (int **)ckalloc(MAX_BASETYPES * sizeof (int *) ); |
---|
428 | naas = (int **)ckalloc(MAX_BASETYPES * sizeof (int *) ); |
---|
429 | |
---|
430 | for(int i=0;i<MAX_BASETYPES && !error; i++) { |
---|
431 | naa1[i] = (int *)ckalloc((max_seq_length+1)*sizeof(int)); |
---|
432 | naa2[i] = (int *)ckalloc((max_seq_length+1)*sizeof(int)); |
---|
433 | naas[i] = (int *)ckalloc((max_seq_length+1)*sizeof(int)); |
---|
434 | } |
---|
435 | } |
---|
436 | } |
---|
437 | |
---|
438 | static void make_pamo(int nv) |
---|
439 | { |
---|
440 | int i,c; |
---|
441 | |
---|
442 | little_pam=big_pam=matptr[0]; |
---|
443 | for(i=0;i<210;++i) { |
---|
444 | c=matptr[i]; |
---|
445 | little_pam=(little_pam<c) ? little_pam : c; |
---|
446 | big_pam=(big_pam>c) ? big_pam : c; |
---|
447 | } |
---|
448 | for(i=0;i<210;++i) |
---|
449 | pamo[i]= matptr[i]-little_pam; |
---|
450 | nv -= little_pam; |
---|
451 | big_pam -= little_pam; |
---|
452 | xover = big_pam - nv; |
---|
453 | /* |
---|
454 | fprintf(stdout,"\n\nxover= %d, big_pam = %d, little_pam=%d, nv = %d\n\n" |
---|
455 | ,xover,big_pam,little_pam,nv); |
---|
456 | */ |
---|
457 | } |
---|
458 | |
---|
459 | static void fill_pam() |
---|
460 | { |
---|
461 | int i,j,pos; |
---|
462 | |
---|
463 | pos=0; |
---|
464 | |
---|
465 | for(i=0;i<20;++i) |
---|
466 | for(j=0;j<=i;++j) |
---|
467 | pam[i][j]=pamo[pos++]; |
---|
468 | |
---|
469 | for(i=0;i<20;++i) |
---|
470 | for(j=0;j<=i;++j) |
---|
471 | pam[j][i]=pam[i][j]; |
---|
472 | |
---|
473 | if(dnaflag) |
---|
474 | { |
---|
475 | xover=4; |
---|
476 | big_pam=8; |
---|
477 | for(i=0;i<=NUCLEIDS;++i) |
---|
478 | for(j=0;j<=NUCLEIDS;++j) |
---|
479 | weights[i][j] = getPenalty(i,j); |
---|
480 | } |
---|
481 | else { |
---|
482 | for(i=1;i<MAX_BASETYPES;++i) { |
---|
483 | for(j=1;j<MAX_BASETYPES;++j) { |
---|
484 | weights[i][j] = big_pam - pam[i-1][j-1]; |
---|
485 | } |
---|
486 | } |
---|
487 | for(i=0;i<MAX_BASETYPES;++i) { |
---|
488 | weights[0][i] = xover; |
---|
489 | weights[i][0] = xover; |
---|
490 | } |
---|
491 | } |
---|
492 | } |
---|
493 | |
---|
494 | static void exit_myers(void) { |
---|
495 | for (int i=0; i<MAX_BASETYPES; i++) { |
---|
496 | free(naas[i]); |
---|
497 | free(naa2[i]); |
---|
498 | free(naa1[i]); |
---|
499 | } |
---|
500 | free(naas); |
---|
501 | free(naa2); |
---|
502 | free(naa1); |
---|
503 | } |
---|
504 | |
---|
505 | static void init_show_pair(long max_seq_length) |
---|
506 | { |
---|
507 | #if defined(DEBUG) |
---|
508 | displ_size = (2*max_seq_length +1); |
---|
509 | #endif // DEBUG |
---|
510 | displ = (int *) ckalloc( (2*max_seq_length +1) * sizeof (int) ); |
---|
511 | last_print = 0; |
---|
512 | |
---|
513 | zza = (int *)ckalloc( (max_seq_length+1) * sizeof (int) ); |
---|
514 | zzb = (int *)ckalloc( (max_seq_length+1) * sizeof (int) ); |
---|
515 | |
---|
516 | zzc = (int *)ckalloc( (max_seq_length+1) * sizeof (int) ); |
---|
517 | zzd = (int *)ckalloc( (max_seq_length+1) * sizeof (int) ); |
---|
518 | } |
---|
519 | |
---|
520 | void exit_show_pair(void) |
---|
521 | { |
---|
522 | freeset(zzd, 0); |
---|
523 | freeset(zzc, 0); |
---|
524 | freeset(zzb, 0); |
---|
525 | freeset(zza, 0); |
---|
526 | freeset(displ, 0); |
---|
527 | #if defined(DEBUG) |
---|
528 | displ_size = 0; |
---|
529 | #endif // DEBUG |
---|
530 | } |
---|
531 | |
---|
532 | inline int set_displ(int offset, int value) { |
---|
533 | awtc_assert(offset >= 0 && offset<int(displ_size)); |
---|
534 | displ[offset] = value; |
---|
535 | return displ[offset]; |
---|
536 | } |
---|
537 | inline int decr_displ(int offset, int value) { |
---|
538 | awtc_assert(offset >= 0 && offset<int(displ_size)); |
---|
539 | displ[offset] -= value; |
---|
540 | return displ[offset]; |
---|
541 | } |
---|
542 | |
---|
543 | |
---|
544 | static inline void add(int v) // insert 'v' gaps into master ??? |
---|
545 | { |
---|
546 | if (last_print<0 && print_ptr>0) { |
---|
547 | set_displ(print_ptr-1,v); |
---|
548 | set_displ(print_ptr++,last_print); |
---|
549 | } |
---|
550 | else { |
---|
551 | last_print = set_displ(print_ptr++,v); |
---|
552 | } |
---|
553 | } |
---|
554 | |
---|
555 | static MAXN_2(inline) int calc_weight(int iat,int jat,int v1,int v2) |
---|
556 | { |
---|
557 | #if (MAXN==2) |
---|
558 | awtc_assert(pos1==1 && pos2==1); |
---|
559 | unsigned char j = seq_array[alist[1]][v2+jat-1]; |
---|
560 | return j<128 ? naas[j][v1+iat-1] : 0; |
---|
561 | #else |
---|
562 | int sum,i,lookn,ret; |
---|
563 | int ipos,jpos; |
---|
564 | |
---|
565 | ipos = v1 + iat -1; |
---|
566 | jpos = v2 + jat -1; |
---|
567 | |
---|
568 | ret = 0; |
---|
569 | sum = lookn = 0; |
---|
570 | |
---|
571 | if(pos1>=pos2) |
---|
572 | { |
---|
573 | for(i=1;i<=pos2;++i) |
---|
574 | { |
---|
575 | unsigned char j=seq_array[alist[i]][jpos]; |
---|
576 | if (j<128) { |
---|
577 | sum += naas[j][ipos]; |
---|
578 | ++lookn; |
---|
579 | } |
---|
580 | } |
---|
581 | } |
---|
582 | else |
---|
583 | { |
---|
584 | for(i=1;i<=pos1;++i) |
---|
585 | { |
---|
586 | unsigned char j = seq_array[alist[i]][ipos]; |
---|
587 | if (j<128) { |
---|
588 | sum += naas[j][jpos]; |
---|
589 | ++lookn; |
---|
590 | } |
---|
591 | } |
---|
592 | } |
---|
593 | if (sum>0) ret = sum/lookn; |
---|
594 | return ret; |
---|
595 | #endif |
---|
596 | } |
---|
597 | |
---|
598 | #ifdef MATRIX_DUMP |
---|
599 | |
---|
600 | static inline const unsigned char *lstr(const unsigned char *s, int len) { |
---|
601 | static unsigned char *lstr_ss = 0; |
---|
602 | |
---|
603 | freeset(lstr_ss, (unsigned char*)strndup((const char*)s,len)); |
---|
604 | return lstr_ss; |
---|
605 | } |
---|
606 | |
---|
607 | static inline const unsigned char *nstr(unsigned char *cp, int length) |
---|
608 | { |
---|
609 | const unsigned char *s = lstr(cp,length); |
---|
610 | (dnaflag ? n_decode : p_decode)(s-1, const_cast<unsigned char *>(s-1), length); |
---|
611 | return s; |
---|
612 | } |
---|
613 | |
---|
614 | static inline void dumpMatrix(int x0, int y0, int breite, int hoehe, int mitte_vert) |
---|
615 | { |
---|
616 | int b; |
---|
617 | int h; |
---|
618 | char *sl = (char*)malloc(hoehe+3); |
---|
619 | char *ma = (char*)malloc(breite+3); |
---|
620 | |
---|
621 | sprintf(ma, "-%s-", nstr(seq_array[1]+x0, breite)); |
---|
622 | sprintf(sl, "-%s-", nstr(seq_array[2]+y0, hoehe)); |
---|
623 | |
---|
624 | printf(" "); |
---|
625 | |
---|
626 | for (b=0; b<=mitte_vert; b++) |
---|
627 | printf("%5c", ma[b]); |
---|
628 | printf(" MID"); |
---|
629 | for (b++; b<=breite+1+1; b++) |
---|
630 | printf("%5c", ma[b-1]); |
---|
631 | |
---|
632 | for (h=0; h<=hoehe+1; h++) |
---|
633 | { |
---|
634 | printf("\n%c vertical: ", sl[h]); |
---|
635 | for (b=0; b<=breite+1+1; b++) printf("%5i", vertical[b][h]); |
---|
636 | printf("\n verticalOpen: "); |
---|
637 | for (b=0; b<=breite+1+1; b++) printf("%5i", verticalOpen[b][h]); |
---|
638 | printf("\n diagonal: "); |
---|
639 | for (b=0; b<=breite+1+1; b++) printf("%5i", diagonal[b][h]); |
---|
640 | printf("\n horizontal: "); |
---|
641 | for (b=0; b<=breite+1+1; b++) printf("%5i", horizontal[b][h]); |
---|
642 | printf("\n horizontalOpen:"); |
---|
643 | for (b=0; b<=breite+1+1; b++) printf("%5i", horizontalOpen[b][h]); |
---|
644 | printf("\n"); |
---|
645 | } |
---|
646 | |
---|
647 | printf("--------------------\n"); |
---|
648 | |
---|
649 | free(ma); |
---|
650 | free(sl); |
---|
651 | } |
---|
652 | #endif |
---|
653 | |
---|
654 | static int diff(int v1,int v2,int v3,int v4, int st,int en) |
---|
655 | /* v1,v3 master sequence (v1=offset, v3=length) |
---|
656 | * v2,v4 slave sequence (v2=offset, v4=length) |
---|
657 | * st,en gap_open-penalties for start and end of the sequence |
---|
658 | * |
---|
659 | * returns costs for inserted gaps |
---|
660 | */ |
---|
661 | { |
---|
662 | int ctrc,ctri,ctrj=0, |
---|
663 | i,j,k,l,m,n,p, |
---|
664 | flag; |
---|
665 | |
---|
666 | #ifdef DEBUG |
---|
667 | # ifdef MATRIX_DUMP |
---|
668 | int display_matrix = 0; |
---|
669 | |
---|
670 | awtc_assert(v3<=(DISPLAY_MATRIX_SIZE+2)); // width |
---|
671 | awtc_assert(v4<=(DISPLAY_MATRIX_SIZE)); // height |
---|
672 | |
---|
673 | # define DISPLAY_MATRIX_ELEMENTS ((DISPLAY_MATRIX_SIZE+2)*(DISPLAY_MATRIX_SIZE+2)) |
---|
674 | |
---|
675 | memset(vertical, -1, DISPLAY_MATRIX_ELEMENTS*sizeof(int)); |
---|
676 | memset(verticalOpen, -1, DISPLAY_MATRIX_ELEMENTS*sizeof(int)); |
---|
677 | memset(diagonal, -1, DISPLAY_MATRIX_ELEMENTS*sizeof(int)); |
---|
678 | memset(horizontal, -1, DISPLAY_MATRIX_ELEMENTS*sizeof(int)); |
---|
679 | memset(horizontalOpen, -1, DISPLAY_MATRIX_ELEMENTS*sizeof(int)); |
---|
680 | |
---|
681 | # endif |
---|
682 | #endif |
---|
683 | static int deep; |
---|
684 | deep++; |
---|
685 | |
---|
686 | #if (defined (DEBUG) && 0) |
---|
687 | { |
---|
688 | char *d; |
---|
689 | |
---|
690 | d = lstr(seq_array[1]+v1,v3); |
---|
691 | |
---|
692 | (dnaflag ? n_decode : p_decode)(d-1,d-1,v3); |
---|
693 | |
---|
694 | for (int cnt=0; cnt<deep; cnt++) putchar(' '); |
---|
695 | printf("master = '%s' (penalties left=%i right=%i)\n", d, st, en); |
---|
696 | |
---|
697 | d = lstr(seq_array[2]+v2,v4); |
---|
698 | (dnaflag ? n_decode : p_decode)(d-1,d-1,v4); |
---|
699 | |
---|
700 | for (cnt=0; cnt<deep; cnt++) putchar(' '); |
---|
701 | printf("slave = '%s'\n", d); |
---|
702 | } |
---|
703 | #endif |
---|
704 | |
---|
705 | // q = gap_open + gap_extend; // q is replaced for local positions |
---|
706 | |
---|
707 | if(v4<=0) { // if slave sequence is empty |
---|
708 | if(v3>0) { |
---|
709 | if (last_print<0 && print_ptr>0) { |
---|
710 | last_print = decr_displ(print_ptr-1,v3); // add .. |
---|
711 | } |
---|
712 | else { |
---|
713 | last_print = set_displ(print_ptr++,-(v3)); // .. or insert gap of length 'v3' into slave |
---|
714 | } |
---|
715 | } |
---|
716 | |
---|
717 | deep--; |
---|
718 | return master_gapAt(v1,v3); |
---|
719 | } |
---|
720 | |
---|
721 | if(v3<=1) { |
---|
722 | if(v3<=0) { // if master sequence is empty |
---|
723 | add(v4); // ??? insert gap length 'v4' into master ??? |
---|
724 | deep--; |
---|
725 | return slave_gapAt(v2,v4); |
---|
726 | } |
---|
727 | |
---|
728 | awtc_assert(v3==1); |
---|
729 | // if master length == 1 |
---|
730 | |
---|
731 | if (v4==1) |
---|
732 | { |
---|
733 | if(st>en) |
---|
734 | st=en; |
---|
735 | |
---|
736 | /***************if(!v4)*********BUG********************************/ |
---|
737 | |
---|
738 | ctrc = slave_gapAtWithOpenPenalty(v2,v4,st); |
---|
739 | ctrj = 0; |
---|
740 | |
---|
741 | for(j=1;j<=v4;++j) |
---|
742 | { |
---|
743 | k = slave_gapAt(v2, j-1) + calc_weight(1,j,v1,v2) + slave_gapAt(v2+j,v4-j); |
---|
744 | if(k<ctrc) { ctrc = k; ctrj = j; } |
---|
745 | } |
---|
746 | |
---|
747 | if(!ctrj) |
---|
748 | { |
---|
749 | add(v4); |
---|
750 | if(last_print<0 && print_ptr>0) |
---|
751 | last_print = decr_displ(print_ptr-1,1); |
---|
752 | else |
---|
753 | last_print = set_displ(print_ptr++,-1); |
---|
754 | } |
---|
755 | else |
---|
756 | { |
---|
757 | if (ctrj>1) add(ctrj-1); |
---|
758 | set_displ(print_ptr++,last_print = 0); |
---|
759 | if (ctrj<v4) add(v4-ctrj); |
---|
760 | } |
---|
761 | |
---|
762 | deep--; |
---|
763 | return ctrc; |
---|
764 | } |
---|
765 | } |
---|
766 | |
---|
767 | awtc_assert(v3>=1 && v4>=1); |
---|
768 | |
---|
769 | // slave length >= 1 |
---|
770 | // master length >= 1 |
---|
771 | |
---|
772 | # define AF 0 |
---|
773 | |
---|
774 | // first column of matrix (slave side): |
---|
775 | IF_MATRIX_DUMP(vertical[0][0]=) |
---|
776 | zza[0] = 0; |
---|
777 | p = master_gap_open(v1); |
---|
778 | for(j=1; j<=v4; j++) |
---|
779 | { |
---|
780 | p += master_gap_extend(v1); |
---|
781 | IF_MATRIX_DUMP(vertical[0][j]=) |
---|
782 | zza[j] = p; |
---|
783 | IF_MATRIX_DUMP(verticalOpen[0][j]=) |
---|
784 | zzb[j] = p + master_gap_open(v1); |
---|
785 | } |
---|
786 | |
---|
787 | // left half of the matrix |
---|
788 | p = st; |
---|
789 | ctri = v3 / 2; |
---|
790 | for(i=1; i<=ctri; i++) |
---|
791 | { |
---|
792 | n = zza[0]; |
---|
793 | p += master_gap_extend(v1+i+AF); |
---|
794 | k = p; |
---|
795 | IF_MATRIX_DUMP(vertical[i][0]=) |
---|
796 | zza[0] = k; |
---|
797 | l = p + master_gap_open(v1+i+AF); |
---|
798 | |
---|
799 | for(j=1; j<=v4; j++) |
---|
800 | { |
---|
801 | // from above (gap in master (behind position i)) |
---|
802 | IF_MATRIX_DUMP(verticalOpen[i][j]=) k += master_gap_open(v1+i+AF)+master_gap_extend(v1+i+AF); // (1) |
---|
803 | IF_MATRIX_DUMP(vertical[i][j]=) l += master_gap_extend(v1+i+AF); // (2) |
---|
804 | if(k<l) l = k; // l=min((1),(2)) |
---|
805 | |
---|
806 | // from left (gap in slave (behind position j)) |
---|
807 | IF_MATRIX_DUMP(horizontalOpen[i][j]=) k = zza[j] + slave_gap_open(v2+j+AF)+slave_gap_extend(v2+j+AF); // (3) |
---|
808 | IF_MATRIX_DUMP(horizontal[i][j]=) m = zzb[j] + slave_gap_extend(v2+j+AF); // (4) |
---|
809 | if(k<m) m = k; // m=min((3),(4)) |
---|
810 | |
---|
811 | // diagonal (no gaps) |
---|
812 | IF_MATRIX_DUMP(diagonal[i][j]=) k = n + calc_weight(i,j,v1,v2); // (5) |
---|
813 | if(l<k) k = l; |
---|
814 | if(m<k) k = m; // k = minimum of all paths |
---|
815 | |
---|
816 | n = zza[j]; // minimum of same row; one column to the left |
---|
817 | zza[j] = k; // minimum of all paths to this matrix position |
---|
818 | zzb[j] = m; // minimum of those two paths, where gap was inserted into slave |
---|
819 | } |
---|
820 | } |
---|
821 | |
---|
822 | # define MHO 1 // X-Offset for second half of matrix-arrays (only used to insert MID-column into dumpMatrix()) |
---|
823 | # define BO 1 // Offset for second half of matrix (cause now the indices i and j are smaller as above) |
---|
824 | |
---|
825 | // last column of matrix: |
---|
826 | |
---|
827 | zzb[0]=zza[0]; |
---|
828 | IF_MATRIX_DUMP(vertical[v3+1+MHO][v4+1]=) |
---|
829 | zzc[v4]=0; |
---|
830 | p = master_gap_open(v1+v3); |
---|
831 | for(j=v4-1; j>-1; j--) |
---|
832 | { |
---|
833 | p += master_gap_extend(v1+v3); |
---|
834 | IF_MATRIX_DUMP(vertical[v3+1+MHO][j+BO]=) |
---|
835 | zzc[j] = p; |
---|
836 | IF_MATRIX_DUMP(verticalOpen[v3+1+MHO][j+BO]=) |
---|
837 | zzd[j] = p+master_gap_open(v1+v3); |
---|
838 | } |
---|
839 | |
---|
840 | // right half of matrix (backwards): |
---|
841 | p = en; |
---|
842 | for(i=v3-1;i>=ctri;i--) |
---|
843 | { |
---|
844 | n = zzc[v4]; |
---|
845 | p += master_gap_extend(v1+i); |
---|
846 | k = p; |
---|
847 | IF_MATRIX_DUMP(vertical[i+BO+MHO][v4+1]=) |
---|
848 | zzc[v4] = k; |
---|
849 | l = p+master_gap_open(v1+i); |
---|
850 | |
---|
851 | for(j=v4-1; j>=0; j--) |
---|
852 | { |
---|
853 | // from below (gap in master (in front of position (i+BO))) |
---|
854 | IF_MATRIX_DUMP(verticalOpen[i+BO+MHO][j+BO]=) k += master_gap_open(v1+i)+master_gap_extend(v1+i); // (1) |
---|
855 | IF_MATRIX_DUMP(vertical[i+BO+MHO][j+BO]=) l += master_gap_extend(v1+i); // (2) |
---|
856 | if(k<l) l = k; // l=min((1),(2)) |
---|
857 | |
---|
858 | // from right (gap in slave (in front of position (j+BO))) |
---|
859 | IF_MATRIX_DUMP(horizontalOpen[i+BO+MHO][j+BO]=) k = zzc[j] + slave_gap_open(v2+j) + slave_gap_extend(v2+j); // (3) |
---|
860 | IF_MATRIX_DUMP(horizontal[i+BO+MHO][j+BO]=) m = zzd[j] + slave_gap_extend(v2+j); // (4) |
---|
861 | if(k<m) m = k; // m=min((3),(4)) |
---|
862 | |
---|
863 | // diagonal (no gaps) |
---|
864 | IF_MATRIX_DUMP(diagonal[i+BO+MHO][j+BO]=) k = n + calc_weight(i+BO,j+BO,v1,v2); // (5) |
---|
865 | if(l<k) k = l; |
---|
866 | if(m<k) k = m; // k = minimum of all paths |
---|
867 | |
---|
868 | n = zzc[j]; // minimum of same row; one column to the right |
---|
869 | zzc[j] = k; // minimum of all paths to this matrix position |
---|
870 | zzd[j] = m; // minimum of those two paths, where gap was inserted into slave |
---|
871 | } |
---|
872 | } |
---|
873 | |
---|
874 | #undef BO |
---|
875 | |
---|
876 | // connect rightmost column of first half (column ctri) |
---|
877 | // with leftmost column of second half (column ctri+1) |
---|
878 | |
---|
879 | zzd[v4] = zzc[v4]; |
---|
880 | |
---|
881 | ctrc = INT_MAX; |
---|
882 | flag = 0; |
---|
883 | |
---|
884 | for(j=(ctri?0:1);j<=v4;j++) // was j=0;... (this was erroneous cause it splitted zero-length subsequences) |
---|
885 | { |
---|
886 | IF_MATRIX_DUMP(vertical[ctri+MHO][j]=) |
---|
887 | k = zza[j] + zzc[j]; // sum up both calculations (=diagonal=no gaps) |
---|
888 | |
---|
889 | if (k<ctrc || ((k==ctrc) && (zza[j]!=zzb[j]) && (zzc[j]==zzd[j]))) |
---|
890 | { |
---|
891 | ctrc = k; |
---|
892 | ctrj = j; |
---|
893 | flag = 1; |
---|
894 | } |
---|
895 | } |
---|
896 | |
---|
897 | for(j=v4;j>=(ctri?0:1);j--) |
---|
898 | { |
---|
899 | IF_MATRIX_DUMP(verticalOpen[ctri+MHO][j]=) |
---|
900 | k = zzb[j] + zzd[j] // paths where gaps were inserted into slave (left and right side!) |
---|
901 | - slave_gap_open(j); // subtract gap_open-penalty which was added twice (at left and right end of gap) |
---|
902 | |
---|
903 | if(k<ctrc) |
---|
904 | { |
---|
905 | ctrc = k; |
---|
906 | ctrj = j; |
---|
907 | flag = 2; |
---|
908 | } |
---|
909 | } |
---|
910 | |
---|
911 | awtc_assert(flag>=1 && flag<=2); |
---|
912 | |
---|
913 | #undef MHO |
---|
914 | |
---|
915 | #ifdef MATRIX_DUMP |
---|
916 | if (display_matrix) |
---|
917 | dumpMatrix(v1,v2,v3,v4,ctri); |
---|
918 | #endif |
---|
919 | |
---|
920 | /* Conquer recursively around midpoint */ |
---|
921 | |
---|
922 | if(flag==1) /* Type 1 gaps */ |
---|
923 | { |
---|
924 | diff(v1,v2,ctri,ctrj,st,master_gap_open(v1+ctri)); // includes midpoint ctri and ctrj |
---|
925 | diff(v1+ctri,v2+ctrj,v3-ctri,v4-ctrj,master_gap_open(v1+ctri),en); |
---|
926 | } |
---|
927 | else |
---|
928 | { |
---|
929 | diff(v1,v2,ctri-1,ctrj,st,0); // includes midpoint ctrj |
---|
930 | |
---|
931 | if(last_print<0 && print_ptr>0) /* Delete 2 */ |
---|
932 | last_print = decr_displ(print_ptr-1,2); |
---|
933 | else |
---|
934 | last_print = set_displ(print_ptr++,-2); |
---|
935 | |
---|
936 | diff(v1+ctri+1,v2+ctrj,v3-ctri-1,v4-ctrj,0,en); |
---|
937 | } |
---|
938 | |
---|
939 | deep--; |
---|
940 | return ctrc; /* Return the score of the best alignment */ |
---|
941 | } |
---|
942 | |
---|
943 | static void do_align( /*int v1,*/ int *score, long act_seq_length) |
---|
944 | { |
---|
945 | int i,j,k,l1,l2,n; |
---|
946 | int t_arr[MAX_BASETYPES]; |
---|
947 | |
---|
948 | l1=l2=pos1=pos2=0; |
---|
949 | |
---|
950 | // clear statistics |
---|
951 | |
---|
952 | for(i=1;i<=act_seq_length;++i) { |
---|
953 | for(j=0;j<MAX_BASETYPES;++j) { |
---|
954 | naa1[j][i] = naa2[j][i]=0; |
---|
955 | naas[j][i]=0; |
---|
956 | } |
---|
957 | } |
---|
958 | |
---|
959 | // create position statistics for each group |
---|
960 | // [here every group contains only one seq] |
---|
961 | |
---|
962 | for (i=1;i<=nseqs;++i) { |
---|
963 | if (group[i]==1) { |
---|
964 | fst_list[++pos1]=i; |
---|
965 | for (j=1;j<=seqlen_array[i];++j) { |
---|
966 | unsigned char b = seq_array[i][j]; |
---|
967 | if (b<128) { |
---|
968 | ++naa1[b][j]; |
---|
969 | ++naa1[0][j]; |
---|
970 | } |
---|
971 | } |
---|
972 | if (seqlen_array[i]>l1) l1=seqlen_array[i]; |
---|
973 | } |
---|
974 | else if (group[i]==2) |
---|
975 | { |
---|
976 | snd_list[++pos2]=i; |
---|
977 | for (j=1;j<=seqlen_array[i];++j) { |
---|
978 | unsigned char b = seq_array[i][j]; |
---|
979 | if (b<128) { |
---|
980 | ++naa2[b][j]; |
---|
981 | ++naa2[0][j]; |
---|
982 | } |
---|
983 | } |
---|
984 | if (seqlen_array[i]>l2) l2=seqlen_array[i]; |
---|
985 | } |
---|
986 | } |
---|
987 | |
---|
988 | if(pos1>=pos2) { |
---|
989 | for(i=1;i<=pos2;++i) alist[i]=snd_list[i]; |
---|
990 | for(n=1;n<=l1;++n) |
---|
991 | { |
---|
992 | for(i=1;i<MAX_BASETYPES;++i) t_arr[i]=0; |
---|
993 | for(i=1;i<MAX_BASETYPES;++i) { |
---|
994 | if(naa1[i][n]>0) { |
---|
995 | for(j=1;j<MAX_BASETYPES;++j) { |
---|
996 | t_arr[j] += (weights[i][j]*naa1[i][n]); |
---|
997 | } |
---|
998 | } |
---|
999 | } |
---|
1000 | k = naa1[0][n]; |
---|
1001 | if (k>0) { |
---|
1002 | for (i=1;i<MAX_BASETYPES;++i) { |
---|
1003 | naas[i][n]=t_arr[i]/k; |
---|
1004 | } |
---|
1005 | } |
---|
1006 | } |
---|
1007 | } |
---|
1008 | else { |
---|
1009 | MAXN_2_assert(0); // should never occur if MAXN==2 |
---|
1010 | #if (MAXN!=2) |
---|
1011 | for (i=1;i<=pos1;++i) alist[i]=fst_list[i]; |
---|
1012 | for (n=1;n<=l2;++n) { |
---|
1013 | for (i=1;i<MAX_BASETYPES;++i) t_arr[i]=0; |
---|
1014 | for (i=1;i<MAX_BASETYPES;++i) |
---|
1015 | if (naa2[i][n]>0) |
---|
1016 | for (j=1;j<MAX_BASETYPES;++j) |
---|
1017 | t_arr[j] += (weights[i][j]*naa2[i][n]); |
---|
1018 | k = naa2[0][n]; |
---|
1019 | if (k>0) |
---|
1020 | for (i=1;i<MAX_BASETYPES;++i) |
---|
1021 | naas[i][n]=t_arr[i]/k; |
---|
1022 | } |
---|
1023 | #endif |
---|
1024 | } |
---|
1025 | |
---|
1026 | *score=diff(1,1,l1,l2,master_gap_open(1),master_gap_open(l1+1)/*v1,v1*/); /* Myers and Miller alignment now */ |
---|
1027 | } |
---|
1028 | |
---|
1029 | /* |
---|
1030 | static void group_gap(int len,int sclass,char *seq) |
---|
1031 | { |
---|
1032 | int i,j,k,xtra; |
---|
1033 | |
---|
1034 | for(i=1;i<=nseqs;++i) |
---|
1035 | if(group[i] == sclass) |
---|
1036 | { |
---|
1037 | xtra = len - seqlen_array[i]; |
---|
1038 | if(xtra>0) |
---|
1039 | for(j=1;j<=xtra;++j) |
---|
1040 | seq_array[i][seqlen_array[i]+j] = -1; |
---|
1041 | for(j=1;j<=len;++j) |
---|
1042 | if(seq[j] == '-') { |
---|
1043 | for(k=len;k>=j+1;--k) |
---|
1044 | seq_array[i][k] = seq_array[i][k-1]; |
---|
1045 | seq_array[i][j] = -1; |
---|
1046 | } |
---|
1047 | seqlen_array[i] = len; |
---|
1048 | } |
---|
1049 | } |
---|
1050 | */ |
---|
1051 | |
---|
1052 | static int add_ggaps(long /*max_seq_length*/) |
---|
1053 | { |
---|
1054 | int i,j,k,pos,to_do; |
---|
1055 | |
---|
1056 | pos=1; |
---|
1057 | to_do=print_ptr; |
---|
1058 | |
---|
1059 | for(i=0;i<to_do;++i) // was: 1 .. <=to_do |
---|
1060 | { |
---|
1061 | if(displ[i]==0) |
---|
1062 | { |
---|
1063 | result[1][pos]=result[2][pos]='*'; |
---|
1064 | ++pos; |
---|
1065 | } |
---|
1066 | else |
---|
1067 | { |
---|
1068 | if((k=displ[i])>0) |
---|
1069 | { |
---|
1070 | for(j=0;j<=k-1;++j) |
---|
1071 | { |
---|
1072 | result[2][pos+j]='*'; |
---|
1073 | result[1][pos+j]='-'; |
---|
1074 | } |
---|
1075 | pos += k; |
---|
1076 | } |
---|
1077 | else |
---|
1078 | { |
---|
1079 | k = (displ[i]<0) ? displ[i] * -1 : displ[i]; |
---|
1080 | for(j=0;j<=k-1;++j) |
---|
1081 | { |
---|
1082 | result[1][pos+j]='*'; |
---|
1083 | result[2][pos+j]='-'; |
---|
1084 | } |
---|
1085 | pos += k; |
---|
1086 | } |
---|
1087 | } |
---|
1088 | } |
---|
1089 | |
---|
1090 | #ifdef DEBUG |
---|
1091 | result[1][pos] = 0; |
---|
1092 | result[2][pos] = 0; |
---|
1093 | #endif |
---|
1094 | |
---|
1095 | return pos-1; |
---|
1096 | } |
---|
1097 | |
---|
1098 | |
---|
1099 | static int res_index(const char *t,char c) |
---|
1100 | { |
---|
1101 | if (t[0]==c) return UNKNOWN_ACID; |
---|
1102 | for (int i=1; t[i]; i++) { |
---|
1103 | if (t[i]==c) return i; |
---|
1104 | } |
---|
1105 | return 0; |
---|
1106 | } |
---|
1107 | |
---|
1108 | static GB_ERROR p_encode(const unsigned char *seq, unsigned char *naseq, int l) /* code seq as ints .. use -2 for gap */ { |
---|
1109 | bool warned = false; |
---|
1110 | |
---|
1111 | for(int i=1; i<=l; i++) { |
---|
1112 | int c = res_index(amino_acid_order,seq[i]); |
---|
1113 | |
---|
1114 | if (!c) { |
---|
1115 | if (seq[i] == '*') { |
---|
1116 | c = -2; |
---|
1117 | } |
---|
1118 | else { |
---|
1119 | if (!warned && seq[i] != '?') { // consensus contains ? and it means X |
---|
1120 | char buf[100]; |
---|
1121 | sprintf(buf, "Illegal character '%c' in sequence data", seq[i]); |
---|
1122 | aw_message(buf); |
---|
1123 | warned = true; |
---|
1124 | } |
---|
1125 | c = res_index(amino_acid_order, 'X'); |
---|
1126 | } |
---|
1127 | } |
---|
1128 | |
---|
1129 | awtc_assert(c>0 || c == -2); |
---|
1130 | naseq[i] = c; |
---|
1131 | } |
---|
1132 | |
---|
1133 | return 0; |
---|
1134 | } |
---|
1135 | |
---|
1136 | static GB_ERROR n_encode(const unsigned char *seq,unsigned char *naseq,int l) |
---|
1137 | { /* code seq as ints .. use -2 for gap */ |
---|
1138 | int i; |
---|
1139 | /* static char *nucs="ACGTU"; */ |
---|
1140 | int warned = 0; |
---|
1141 | |
---|
1142 | for(i=1; i<=l; i++) { |
---|
1143 | int c = res_index(nucleic_acid_order,seq[i]); |
---|
1144 | |
---|
1145 | if (!c) { |
---|
1146 | if (!warned && seq[i] != '?') { |
---|
1147 | char buf[100]; |
---|
1148 | sprintf(buf, "Illegal character '%c' in sequence data", seq[i]); |
---|
1149 | aw_message(buf); |
---|
1150 | warned = 1; |
---|
1151 | } |
---|
1152 | c = res_index(nucleic_acid_order, 'N'); |
---|
1153 | } |
---|
1154 | |
---|
1155 | awtc_assert(c>0); |
---|
1156 | naseq[i] = c; |
---|
1157 | } |
---|
1158 | |
---|
1159 | return 0; |
---|
1160 | } |
---|
1161 | |
---|
1162 | GB_ERROR AWTC_ClustalV_align(int is_dna, int weighted, |
---|
1163 | const char *seq1, int length1, |
---|
1164 | const char *seq2, int length2, |
---|
1165 | const int *gapsBefore1, |
---|
1166 | int max_seq_length, |
---|
1167 | char **resultPtr1, |
---|
1168 | char **resultPtr2, |
---|
1169 | int *resLengthPtr, |
---|
1170 | int *score) |
---|
1171 | { |
---|
1172 | error = 0; |
---|
1173 | gapsBeforePosition = gapsBefore1; |
---|
1174 | |
---|
1175 | if (!module_initialized) { // initialize only once |
---|
1176 | dnaflag = is_dna; |
---|
1177 | is_weight = weighted; |
---|
1178 | |
---|
1179 | init_myers(max_seq_length); |
---|
1180 | init_show_pair(max_seq_length); |
---|
1181 | make_pamo(0); |
---|
1182 | fill_pam(); |
---|
1183 | |
---|
1184 | for (int i=1; i<=2; i++) |
---|
1185 | { |
---|
1186 | seq_array[i] = (unsigned char*)ckalloc((max_seq_length+2)*sizeof(char)); |
---|
1187 | result[i] = (char*)ckalloc((max_seq_length+2)*sizeof(char)); |
---|
1188 | group[i] = i; |
---|
1189 | } |
---|
1190 | |
---|
1191 | if (!error) module_initialized = true; |
---|
1192 | } |
---|
1193 | else { |
---|
1194 | if (dnaflag!=is_dna || is_weight!=weighted) { |
---|
1195 | error = "Please call AWTC_ClustalV_align_exit() between calls that differ in\n" |
---|
1196 | "one of the following parameters:\n" |
---|
1197 | " is_dna, weighted"; |
---|
1198 | } |
---|
1199 | } |
---|
1200 | |
---|
1201 | if (!error) { |
---|
1202 | nseqs = 2; |
---|
1203 | print_ptr = 0; |
---|
1204 | |
---|
1205 | #if defined(DEBUG) && 1 |
---|
1206 | memset(&seq_array[1][1], 0, max_seq_length*sizeof(seq_array[1][1])); |
---|
1207 | memset(&seq_array[2][1], 0, max_seq_length*sizeof(seq_array[2][1])); |
---|
1208 | memset(&displ[1], 0xff, max_seq_length*sizeof(displ[1])); |
---|
1209 | seq_array[1][0] = '_'; |
---|
1210 | seq_array[2][0] = '_'; |
---|
1211 | #endif |
---|
1212 | |
---|
1213 | { |
---|
1214 | GB_ERROR (*encode)(const unsigned char*,unsigned char*,int) = dnaflag ? n_encode : p_encode; |
---|
1215 | |
---|
1216 | error = encode((const unsigned char*)(seq1-1), seq_array[1], length1); |
---|
1217 | if (!error) { |
---|
1218 | seqlen_array[1] = length1; |
---|
1219 | error = encode((const unsigned char*)(seq2-1), seq_array[2], length2); |
---|
1220 | if (!error) { |
---|
1221 | seqlen_array[2] = length2; |
---|
1222 | |
---|
1223 | do_align(/* gap_open,*/ score, max(length1,length2)); |
---|
1224 | int alignedLength = add_ggaps(max_seq_length); |
---|
1225 | |
---|
1226 | *resultPtr1 = result[1]+1; |
---|
1227 | *resultPtr2 = result[2]+1; |
---|
1228 | *resLengthPtr = alignedLength; |
---|
1229 | } |
---|
1230 | } |
---|
1231 | } |
---|
1232 | } |
---|
1233 | |
---|
1234 | return error; |
---|
1235 | } |
---|
1236 | |
---|
1237 | void AWTC_ClustalV_align_exit(void) |
---|
1238 | { |
---|
1239 | if (module_initialized) { |
---|
1240 | module_initialized = false; |
---|
1241 | |
---|
1242 | for (int i=1; i<=2; i++) { |
---|
1243 | free(result[i]); |
---|
1244 | free(seq_array[i]); |
---|
1245 | } |
---|
1246 | exit_show_pair(); |
---|
1247 | exit_myers(); |
---|
1248 | } |
---|
1249 | } |
---|