| 1 | #include <ctype.h> |
|---|
| 2 | #include <stdlib.h> |
|---|
| 3 | #include <memory.h> |
|---|
| 4 | // #include <malloc.h> |
|---|
| 5 | #include <string.h> |
|---|
| 6 | #include <stdio.h> |
|---|
| 7 | #include <limits.h> |
|---|
| 8 | |
|---|
| 9 | #include <arbdb.h> |
|---|
| 10 | #include <aw_root.hxx> |
|---|
| 11 | |
|---|
| 12 | #include "ClustalV.hxx" |
|---|
| 13 | #include "awtc_seq_search.hxx" |
|---|
| 14 | |
|---|
| 15 | /* ---------------------------------------------------------------- */ |
|---|
| 16 | |
|---|
| 17 | #define MASTER_GAP_OPEN 50 |
|---|
| 18 | #define CHEAP_GAP_OPEN 20 // penalty for creating a gap (if we have gaps in master) |
|---|
| 19 | |
|---|
| 20 | #define DEFAULT_GAP_OPEN 30 // penalty for creating a gap |
|---|
| 21 | |
|---|
| 22 | #define MASTER_GAP_EXTEND 18 |
|---|
| 23 | #define CHEAP_GAP_EXTEND 5 // penalty for extending a gap (if we have gaps in master) |
|---|
| 24 | |
|---|
| 25 | #define DEFAULT_GAP_EXTEND 10 // penalty for extending a gap |
|---|
| 26 | |
|---|
| 27 | #define DEFAULT_IMPROBABLY_MUTATION 10 // penalty for mutations |
|---|
| 28 | #define DEFAULT_PROBABLY_MUTATION 4 // penalty for special mutations (A<->G,C<->U/T) (only if 'weighted') |
|---|
| 29 | |
|---|
| 30 | #define DYNAMIC_PENALTIES // otherwise you get FIXED PENALTIES (=no cheap penalties) |
|---|
| 31 | |
|---|
| 32 | /* ---------------------------------------------------------------- */ |
|---|
| 33 | |
|---|
| 34 | #define MAX_GAP_OPEN_DISCOUNT (DEFAULT_GAP_OPEN-CHEAP_GAP_OPEN) // maximum subtracted from DEFAULT_GAP_OPEN |
|---|
| 35 | #define MAX_GAP_EXTEND_DISCOUNT (DEFAULT_GAP_EXTEND-CHEAP_GAP_EXTEND) // maximum subtracted from DEFAULT_GAP_EXTEND |
|---|
| 36 | |
|---|
| 37 | #define MAXN 2 /* Maximum number of sequences (both groups) */ |
|---|
| 38 | |
|---|
| 39 | #if (MAXN==2) |
|---|
| 40 | #define MAXN_2(xxx) xxx |
|---|
| 41 | #define MAXN_2_assert(xxx) awtc_assert(xxx) |
|---|
| 42 | #else |
|---|
| 43 | #define MAXN_2(xxx) |
|---|
| 44 | #define MAXN_2_assert(xxx) |
|---|
| 45 | #endif |
|---|
| 46 | |
|---|
| 47 | #define TRUE 1 |
|---|
| 48 | #define FALSE 0 |
|---|
| 49 | |
|---|
| 50 | static GB_ERROR error; |
|---|
| 51 | static bool module_initialized = false; |
|---|
| 52 | |
|---|
| 53 | typedef int Boolean; |
|---|
| 54 | |
|---|
| 55 | static Boolean dnaflag; |
|---|
| 56 | static Boolean is_weight; |
|---|
| 57 | |
|---|
| 58 | #define MAX_BASETYPES 21 |
|---|
| 59 | |
|---|
| 60 | static int xover; |
|---|
| 61 | static int little_pam; |
|---|
| 62 | static int big_pam; |
|---|
| 63 | static int pamo[(MAX_BASETYPES-1)*MAX_BASETYPES/2]; |
|---|
| 64 | static int pam[MAX_BASETYPES][MAX_BASETYPES]; |
|---|
| 65 | |
|---|
| 66 | static int pos1; |
|---|
| 67 | static int pos2; |
|---|
| 68 | static int **naa1; // naa1[basetype][position] counts bases for each position of all sequences in group1 |
|---|
| 69 | static int **naa2; // naa2[basetype][position] same for group2 |
|---|
| 70 | static int **naas; // |
|---|
| 71 | static int seqlen_array[MAXN+1]; // length of all sequences |
|---|
| 72 | static unsigned char *seq_array[MAXN+1]; // the sequences |
|---|
| 73 | static int group[MAXN+1]; // group of sequence |
|---|
| 74 | static int alist[MAXN+1]; // indices of sequences to be aligned |
|---|
| 75 | static int fst_list[MAXN+1]; |
|---|
| 76 | static int snd_list[MAXN+1]; |
|---|
| 77 | |
|---|
| 78 | static int nseqs; // # of sequences |
|---|
| 79 | static int weights[MAX_BASETYPES][MAX_BASETYPES]; // weights[b1][b2] : penalty for mutation from base 'b1' to base 'b2' |
|---|
| 80 | |
|---|
| 81 | #if defined(DEBUG) |
|---|
| 82 | size_t displ_size = 0; |
|---|
| 83 | #endif // DEBUG |
|---|
| 84 | |
|---|
| 85 | static int *displ; // displ == 0 -> base in both , displ<0 -> displ gaps in slave, displ>0 -> displ gaps in master |
|---|
| 86 | static int *zza; // column (left->right) of align matrix (minimum of all paths to this matrix element) |
|---|
| 87 | static int *zzb; // -------------- " ------------------- (minimum of all paths, where gap inserted into slave) |
|---|
| 88 | static int *zzc; // column (left<-right) of align matrix (minimum of all paths to this matrix element) |
|---|
| 89 | static int *zzd; // -------------- " ------------------- (minimum of all paths, where gap inserted into slave) |
|---|
| 90 | static int print_ptr; |
|---|
| 91 | static int last_print; |
|---|
| 92 | |
|---|
| 93 | static const int *gapsBeforePosition; |
|---|
| 94 | |
|---|
| 95 | /* static int gap_open; |
|---|
| 96 | static int gap_extend;*/ |
|---|
| 97 | |
|---|
| 98 | #if defined(DEBUG) |
|---|
| 99 | // #define MATRIX_DUMP |
|---|
| 100 | // #define DISPLAY_DIFF |
|---|
| 101 | #endif // DEBUG |
|---|
| 102 | |
|---|
| 103 | #ifdef MATRIX_DUMP |
|---|
| 104 | # define IF_MATRIX_DUMP(xxx) xxx |
|---|
| 105 | # define DISPLAY_MATRIX_SIZE 3000 |
|---|
| 106 | |
|---|
| 107 | static int vertical [DISPLAY_MATRIX_SIZE+2][DISPLAY_MATRIX_SIZE+2]; |
|---|
| 108 | static int verticalOpen [DISPLAY_MATRIX_SIZE+2][DISPLAY_MATRIX_SIZE+2]; |
|---|
| 109 | static int diagonal [DISPLAY_MATRIX_SIZE+2][DISPLAY_MATRIX_SIZE+2]; |
|---|
| 110 | static int horizontal [DISPLAY_MATRIX_SIZE+2][DISPLAY_MATRIX_SIZE+2]; |
|---|
| 111 | static int horizontalOpen [DISPLAY_MATRIX_SIZE+2][DISPLAY_MATRIX_SIZE+2]; |
|---|
| 112 | |
|---|
| 113 | #else |
|---|
| 114 | # define IF_MATRIX_DUMP(xxx) |
|---|
| 115 | #endif |
|---|
| 116 | |
|---|
| 117 | static inline int master_gap_open(int beforePosition) |
|---|
| 118 | { |
|---|
| 119 | #ifdef DYNAMIC_PENALTIES |
|---|
| 120 | long gaps = gapsBeforePosition[beforePosition-1]; |
|---|
| 121 | return (gaps)? MASTER_GAP_OPEN - MAX_GAP_OPEN_DISCOUNT : MASTER_GAP_OPEN; |
|---|
| 122 | |
|---|
| 123 | /* return |
|---|
| 124 | gaps >= MAX_GAP_OPEN_DISCOUNT |
|---|
| 125 | ? DEFAULT_GAP_OPEN-MAX_GAP_OPEN_DISCOUNT |
|---|
| 126 | : DEFAULT_GAP_OPEN-gaps;*/ |
|---|
| 127 | #else |
|---|
| 128 | return DEFAULT_GAP_OPEN; |
|---|
| 129 | #endif |
|---|
| 130 | } |
|---|
| 131 | static inline int master_gap_extend(int beforePosition) |
|---|
| 132 | { |
|---|
| 133 | #ifdef DYNAMIC_PENALTIES |
|---|
| 134 | long gaps = gapsBeforePosition[beforePosition-1]; |
|---|
| 135 | |
|---|
| 136 | return (gaps)? MASTER_GAP_EXTEND - MAX_GAP_EXTEND_DISCOUNT : MASTER_GAP_EXTEND; |
|---|
| 137 | /* return |
|---|
| 138 | gaps >= MAX_GAP_EXTEND_DISCOUNT |
|---|
| 139 | ? DEFAULT_GAP_EXTEND-MAX_GAP_EXTEND_DISCOUNT |
|---|
| 140 | : DEFAULT_GAP_EXTEND-gaps;*/ |
|---|
| 141 | #else |
|---|
| 142 | return DEFAULT_GAP_EXTEND; |
|---|
| 143 | #endif |
|---|
| 144 | } |
|---|
| 145 | |
|---|
| 146 | static inline int master_gapAtWithOpenPenalty(int atPosition, int length, int penalty) |
|---|
| 147 | { |
|---|
| 148 | if (length<=0) |
|---|
| 149 | return 0; |
|---|
| 150 | |
|---|
| 151 | int beforePosition = atPosition, |
|---|
| 152 | afterPosition = atPosition-1; |
|---|
| 153 | |
|---|
| 154 | while (length--) |
|---|
| 155 | { |
|---|
| 156 | int p1,p2; |
|---|
| 157 | |
|---|
| 158 | if ((p1=master_gap_extend(beforePosition)) < (p2=master_gap_extend(afterPosition+1)) && |
|---|
| 159 | beforePosition>1) |
|---|
| 160 | { |
|---|
| 161 | penalty += p1; |
|---|
| 162 | beforePosition--; |
|---|
| 163 | } |
|---|
| 164 | else |
|---|
| 165 | { |
|---|
| 166 | penalty += p2; |
|---|
| 167 | afterPosition++; |
|---|
| 168 | } |
|---|
| 169 | } |
|---|
| 170 | |
|---|
| 171 | return penalty; |
|---|
| 172 | } |
|---|
| 173 | |
|---|
| 174 | static inline int master_gapAt(int atPosition, int length) |
|---|
| 175 | { |
|---|
| 176 | return master_gapAtWithOpenPenalty(atPosition, length, master_gap_open(atPosition)); |
|---|
| 177 | } |
|---|
| 178 | |
|---|
| 179 | static inline int slave_gap_open(int /*beforePosition*/) |
|---|
| 180 | { |
|---|
| 181 | return DEFAULT_GAP_OPEN; |
|---|
| 182 | } |
|---|
| 183 | |
|---|
| 184 | static inline int slave_gap_extend(int /*beforePosition*/) |
|---|
| 185 | { |
|---|
| 186 | return DEFAULT_GAP_EXTEND; |
|---|
| 187 | } |
|---|
| 188 | |
|---|
| 189 | static inline int slave_gapAtWithOpenPenalty(int atPosition, int length, int penalty) |
|---|
| 190 | { |
|---|
| 191 | return length<=0 ? 0 : penalty + length*slave_gap_extend(atPosition); |
|---|
| 192 | } |
|---|
| 193 | |
|---|
| 194 | static inline int slave_gapAt(int atPosition, int length) |
|---|
| 195 | { |
|---|
| 196 | return slave_gapAtWithOpenPenalty(atPosition, length, slave_gap_open(atPosition)); |
|---|
| 197 | } |
|---|
| 198 | |
|---|
| 199 | #define UNKNOWN_ACID 255 |
|---|
| 200 | static const char *amino_acid_order = "XCSTPAGNDEQHRKMILVFYW"; |
|---|
| 201 | |
|---|
| 202 | #define NUCLEIDS 16 |
|---|
| 203 | static const char *nucleic_acid_order = "-ACGTUMRWSYKVHDBN"; |
|---|
| 204 | static const char *nucleic_maybe_A = "-A----AAA---AAA-A"; |
|---|
| 205 | static const char *nucleic_maybe_C = "--C---C--CC-CC-CC"; |
|---|
| 206 | static const char *nucleic_maybe_G = "---G---G-G-GG-GGG"; |
|---|
| 207 | static const char *nucleic_maybe_T = "----T---T-TT-TTTT"; |
|---|
| 208 | static const char *nucleic_maybe_U = "-----U--U-UU-UUUU"; |
|---|
| 209 | static const char *nucleic_maybe[6] = { NULL, nucleic_maybe_A, nucleic_maybe_C, nucleic_maybe_G, nucleic_maybe_T, nucleic_maybe_U }; |
|---|
| 210 | |
|---|
| 211 | /* |
|---|
| 212 | * M = A or C S = G or C V = A or G or C N = A or C or G or T |
|---|
| 213 | * R = A or G Y = C or T H = A or C or T |
|---|
| 214 | * W = A or T K = G or T D = A or G or T |
|---|
| 215 | */ |
|---|
| 216 | |
|---|
| 217 | #define cheap_if(cond) ((cond) ? 1 : 2) |
|---|
| 218 | static int baseCmp(unsigned char c1, unsigned char c2) // c1,c2 == 1=A,2=C (==index of character in nucleic_acid_order[]) |
|---|
| 219 | // returns 0 for equal |
|---|
| 220 | // 1 for probably mutations |
|---|
| 221 | // 2 for improbably mutations |
|---|
| 222 | { |
|---|
| 223 | #define COMPARABLE_BASES 5 |
|---|
| 224 | |
|---|
| 225 | if (c1==c2) return 0; |
|---|
| 226 | |
|---|
| 227 | if (c2<c1) // swap |
|---|
| 228 | { |
|---|
| 229 | int c3 = c1; |
|---|
| 230 | |
|---|
| 231 | c1 = c2; |
|---|
| 232 | c2 = c3; |
|---|
| 233 | } |
|---|
| 234 | |
|---|
| 235 | if (c2<=COMPARABLE_BASES) |
|---|
| 236 | { |
|---|
| 237 | awtc_assert(c1<=COMPARABLE_BASES); |
|---|
| 238 | |
|---|
| 239 | switch(c1) |
|---|
| 240 | { |
|---|
| 241 | case 0: return 2; |
|---|
| 242 | case 1: return cheap_if(c2==3); // A->G |
|---|
| 243 | case 2: return cheap_if(c2==4 || c2==5); // C->T/U |
|---|
| 244 | case 3: return cheap_if(c2==1); // G->A |
|---|
| 245 | case 4: if (c2==5) return 0; // T->U |
|---|
| 246 | case 5: if (c2==4) return 0; // U->T |
|---|
| 247 | return cheap_if(c2==2); // T/U->C |
|---|
| 248 | default: awtc_assert(0); |
|---|
| 249 | } |
|---|
| 250 | } |
|---|
| 251 | |
|---|
| 252 | int i; |
|---|
| 253 | int bestMatch = 3; |
|---|
| 254 | |
|---|
| 255 | if (c1<=COMPARABLE_BASES) |
|---|
| 256 | { |
|---|
| 257 | for (i=1; i<=COMPARABLE_BASES; i++) |
|---|
| 258 | { |
|---|
| 259 | if (isalpha(nucleic_maybe[i][c2])) // 'c2' maybe a 'i' |
|---|
| 260 | { |
|---|
| 261 | int match = baseCmp(c1,i); |
|---|
| 262 | if (match<bestMatch) bestMatch = match; |
|---|
| 263 | } |
|---|
| 264 | } |
|---|
| 265 | } |
|---|
| 266 | else |
|---|
| 267 | { |
|---|
| 268 | for (i=1; i<=COMPARABLE_BASES; i++) |
|---|
| 269 | { |
|---|
| 270 | if (isalpha(nucleic_maybe[i][c1])) // 'c1' maybe a 'i' |
|---|
| 271 | { |
|---|
| 272 | int match = baseCmp(i,c2); |
|---|
| 273 | if (match<bestMatch) bestMatch = match; |
|---|
| 274 | } |
|---|
| 275 | } |
|---|
| 276 | } |
|---|
| 277 | |
|---|
| 278 | awtc_assert(bestMatch>=0 && bestMatch<=2); |
|---|
| 279 | return bestMatch; |
|---|
| 280 | } |
|---|
| 281 | #undef cheap_if |
|---|
| 282 | |
|---|
| 283 | int AWTC_baseMatch(char c1, char c2) // c1,c2 == ACGUTRS... |
|---|
| 284 | // returns 0 for equal |
|---|
| 285 | // 1 for probably mutations |
|---|
| 286 | // 2 for improbably mutations |
|---|
| 287 | // -1 if one char is illegal |
|---|
| 288 | { |
|---|
| 289 | const char *p1 = strchr(nucleic_acid_order, c1); |
|---|
| 290 | const char *p2 = strchr(nucleic_acid_order, c2); |
|---|
| 291 | |
|---|
| 292 | awtc_assert(c1); |
|---|
| 293 | awtc_assert(c2); |
|---|
| 294 | |
|---|
| 295 | if (p1 && p2) return baseCmp(p1-nucleic_acid_order, p2-nucleic_acid_order); |
|---|
| 296 | return -1; |
|---|
| 297 | } |
|---|
| 298 | |
|---|
| 299 | static int getPenalty(char c1, char c2) /* c1,c2 = A=0,C=1,... s.o. */ |
|---|
| 300 | { |
|---|
| 301 | switch (baseCmp(c1,c2)) |
|---|
| 302 | { |
|---|
| 303 | case 1: return DEFAULT_PROBABLY_MUTATION; |
|---|
| 304 | case 2: return DEFAULT_IMPROBABLY_MUTATION; |
|---|
| 305 | default: break; |
|---|
| 306 | } |
|---|
| 307 | |
|---|
| 308 | return 0; |
|---|
| 309 | } |
|---|
| 310 | |
|---|
| 311 | static char *result[3]; // result buffers |
|---|
| 312 | |
|---|
| 313 | static char pam250mt[]={ |
|---|
| 314 | 12, |
|---|
| 315 | 0, 2, |
|---|
| 316 | -2, 1, 3, |
|---|
| 317 | -3, 1, 0, 6, |
|---|
| 318 | -2, 1, 1, 1, 2, |
|---|
| 319 | -3, 1, 0,-1, 1, 5, |
|---|
| 320 | -4, 1, 0,-1, 0, 0, 2, |
|---|
| 321 | -5, 0, 0,-1, 0, 1, 2, 4, |
|---|
| 322 | -5, 0, 0,-1, 0, 0, 1, 3, 4, |
|---|
| 323 | -5,-1,-1, 0, 0,-1, 1, 2, 2, 4, |
|---|
| 324 | -3,-1,-1, 0,-1,-2, 2, 1, 1, 3, 6, |
|---|
| 325 | -4, 0,-1, 0,-2,-3, 0,-1,-1, 1, 2, 6, |
|---|
| 326 | -5, 0, 0,-1,-1,-2, 1, 0, 0, 1, 0, 3, 5, |
|---|
| 327 | -5,-2,-1,-2,-1,-3,-2,-3,-2,-1,-2, 0, 0, 6, |
|---|
| 328 | -2,-1, 0,-2,-1,-3,-2,-2,-2,-2,-2,-2,-2, 2, 5, |
|---|
| 329 | -6,-3,-2,-3,-2,-4,-3,-4,-3,-2,-2,-3,-3, 4, 2, 6, |
|---|
| 330 | -2,-1, 0,-1, 0,-1,-2,-2,-2,-2,-2,-2,-2, 2, 4, 2, 4, |
|---|
| 331 | -4,-3,-3,-5,-4,-5,-4,-6,-5,-5,-2,-4,-5, 0, 1, 2,-1, 9, |
|---|
| 332 | 0,-3,-3,-5,-3,-5,-2,-4,-4,-4, 0,-4,-4,-2,-1,-1,-2, 7,10, |
|---|
| 333 | -8,-2,-5,-6,-6,-7,-4,-7,-7,-5,-3, 2,-3,-4,-5,-2,-6, 0, 0,17 }; |
|---|
| 334 | |
|---|
| 335 | /* |
|---|
| 336 | static char idmat[]={ |
|---|
| 337 | 10, |
|---|
| 338 | 0, 10, |
|---|
| 339 | 0, 0, 10, |
|---|
| 340 | 0, 0, 0, 10, |
|---|
| 341 | 0, 0, 0, 0, 10, |
|---|
| 342 | 0, 0, 0, 0, 1, 10, |
|---|
| 343 | 0, 0, 0, 0, 0, 0, 10, |
|---|
| 344 | 0, 0, 0, 0, 0, 0, 0, 10, |
|---|
| 345 | 0, 0, 0, 0, 0, 0, 0, 0, 10, |
|---|
| 346 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, |
|---|
| 347 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, |
|---|
| 348 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, |
|---|
| 349 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, |
|---|
| 350 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, |
|---|
| 351 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, |
|---|
| 352 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, |
|---|
| 353 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, |
|---|
| 354 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, |
|---|
| 355 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,10, |
|---|
| 356 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,10}; |
|---|
| 357 | */ |
|---|
| 358 | |
|---|
| 359 | /* |
|---|
| 360 | static char pam100mt[]={ |
|---|
| 361 | 14, |
|---|
| 362 | -1, 6, |
|---|
| 363 | -5, 2, 7, |
|---|
| 364 | -6, 1, -1, 10, |
|---|
| 365 | -5, 2, 2, 1, 6, |
|---|
| 366 | -8, 1, -3, -3, 1, 8, |
|---|
| 367 | -8, 2, 0, -3, -1, -1, 7, |
|---|
| 368 | -11,-1, -2, -4, -1, -1, 4, 8, |
|---|
| 369 | -11,-2, -3, -3, 0, -2, 1, 5, 8, |
|---|
| 370 | -11,-3, -3, -1, -2, -5,-1, 1, 4, 9, |
|---|
| 371 | -6,-4, -5, -2, -5, -7, 2, -1, -2, 4,11, |
|---|
| 372 | -6,-1, -4, -2, -5, -8,-3, -6, -5, 1, 1, 10, |
|---|
| 373 | -11,-2, -1, -4, -4, -5, 1, -2, -2, -1,-3, 3, 8, |
|---|
| 374 | -11,-4, -2, -6, -3, -8,-5, -8, -6, -2,-7, -2, 1, 13, |
|---|
| 375 | -5,-4, -1, -6, -3, -7,-4, -6, -5, -5,-7, -4, -4, 2, 9, |
|---|
| 376 | -12,-7, -5, -5, -5, -8,-6, -9, -7, -3,-5, -7, -6, 4, 2, 9, |
|---|
| 377 | -4,-4, -1, -4, 0, -4,-5, -6, -5, -5,-6, -6, -6, 1, 5, 1, 8, |
|---|
| 378 | -10,-5, -6, -9, -7, -8,-6,-11,-11,-10,-4, -7,-11, -2, 0, 0, -5,12, |
|---|
| 379 | -2,-6, -6,-11, -6,-11,-3, -9, -7, -9,-1,-10,-10, -8, -4,-5, -6, 6,13, |
|---|
| 380 | -13,-4,-10,-11,-11,-13,-8,-13,-14,-11,-7, 1, -9,-11,-12,-7,-14,-2,-2,19 }; |
|---|
| 381 | */ |
|---|
| 382 | |
|---|
| 383 | static char *matptr = pam250mt; |
|---|
| 384 | |
|---|
| 385 | #if (defined(DISPLAY_DIFF) || defined(MATRIX_DUMP)) |
|---|
| 386 | static GB_ERROR p_decode(const unsigned char *naseq, unsigned char *seq, int l) { |
|---|
| 387 | int len = strlen(amino_acid_order); |
|---|
| 388 | |
|---|
| 389 | for (int i=1; i<=l && naseq[i]; i++) |
|---|
| 390 | { |
|---|
| 391 | awtc_assert(naseq[i]<len); |
|---|
| 392 | seq[i] = amino_acid_order[naseq[i]]; |
|---|
| 393 | } |
|---|
| 394 | |
|---|
| 395 | return 0; |
|---|
| 396 | } |
|---|
| 397 | |
|---|
| 398 | static GB_ERROR n_decode(const unsigned char *naseq, unsigned char *seq, int l) { |
|---|
| 399 | int len = strlen(nucleic_acid_order); |
|---|
| 400 | |
|---|
| 401 | for (int i=1; i<=l && naseq[i]; i++) |
|---|
| 402 | { |
|---|
| 403 | awtc_assert(naseq[i]<len); |
|---|
| 404 | seq[i] = nucleic_acid_order[naseq[i]]; |
|---|
| 405 | } |
|---|
| 406 | |
|---|
| 407 | return 0; |
|---|
| 408 | } |
|---|
| 409 | #endif |
|---|
| 410 | |
|---|
| 411 | static inline GB_ERROR MAXLENtooSmall() |
|---|
| 412 | { |
|---|
| 413 | return GB_export_error("AWTC_ClustalV-aligner: MAXLEN is dimensioned to small for this sequence"); |
|---|
| 414 | } |
|---|
| 415 | |
|---|
| 416 | static inline void *ckalloc(size_t bytes) |
|---|
| 417 | { |
|---|
| 418 | void *ret = malloc(bytes); |
|---|
| 419 | |
|---|
| 420 | if (!ret) error = "out of memory"; |
|---|
| 421 | return ret; |
|---|
| 422 | } |
|---|
| 423 | |
|---|
| 424 | static void init_myers(long max_seq_length) { |
|---|
| 425 | if (!error) { |
|---|
| 426 | naa1 = (int **)ckalloc(MAX_BASETYPES * sizeof (int *) ); |
|---|
| 427 | naa2 = (int **)ckalloc(MAX_BASETYPES * sizeof (int *) ); |
|---|
| 428 | naas = (int **)ckalloc(MAX_BASETYPES * sizeof (int *) ); |
|---|
| 429 | |
|---|
| 430 | for(int i=0;i<MAX_BASETYPES && !error; i++) { |
|---|
| 431 | naa1[i] = (int *)ckalloc((max_seq_length+1)*sizeof(int)); |
|---|
| 432 | naa2[i] = (int *)ckalloc((max_seq_length+1)*sizeof(int)); |
|---|
| 433 | naas[i] = (int *)ckalloc((max_seq_length+1)*sizeof(int)); |
|---|
| 434 | } |
|---|
| 435 | } |
|---|
| 436 | } |
|---|
| 437 | |
|---|
| 438 | static void make_pamo(int nv) |
|---|
| 439 | { |
|---|
| 440 | int i,c; |
|---|
| 441 | |
|---|
| 442 | little_pam=big_pam=matptr[0]; |
|---|
| 443 | for(i=0;i<210;++i) { |
|---|
| 444 | c=matptr[i]; |
|---|
| 445 | little_pam=(little_pam<c) ? little_pam : c; |
|---|
| 446 | big_pam=(big_pam>c) ? big_pam : c; |
|---|
| 447 | } |
|---|
| 448 | for(i=0;i<210;++i) |
|---|
| 449 | pamo[i]= matptr[i]-little_pam; |
|---|
| 450 | nv -= little_pam; |
|---|
| 451 | big_pam -= little_pam; |
|---|
| 452 | xover = big_pam - nv; |
|---|
| 453 | /* |
|---|
| 454 | fprintf(stdout,"\n\nxover= %d, big_pam = %d, little_pam=%d, nv = %d\n\n" |
|---|
| 455 | ,xover,big_pam,little_pam,nv); |
|---|
| 456 | */ |
|---|
| 457 | } |
|---|
| 458 | |
|---|
| 459 | static void fill_pam() |
|---|
| 460 | { |
|---|
| 461 | int i,j,pos; |
|---|
| 462 | |
|---|
| 463 | pos=0; |
|---|
| 464 | |
|---|
| 465 | for(i=0;i<20;++i) |
|---|
| 466 | for(j=0;j<=i;++j) |
|---|
| 467 | pam[i][j]=pamo[pos++]; |
|---|
| 468 | |
|---|
| 469 | for(i=0;i<20;++i) |
|---|
| 470 | for(j=0;j<=i;++j) |
|---|
| 471 | pam[j][i]=pam[i][j]; |
|---|
| 472 | |
|---|
| 473 | if(dnaflag) |
|---|
| 474 | { |
|---|
| 475 | xover=4; |
|---|
| 476 | big_pam=8; |
|---|
| 477 | for(i=0;i<=NUCLEIDS;++i) |
|---|
| 478 | for(j=0;j<=NUCLEIDS;++j) |
|---|
| 479 | weights[i][j] = getPenalty(i,j); |
|---|
| 480 | } |
|---|
| 481 | else { |
|---|
| 482 | for(i=1;i<MAX_BASETYPES;++i) { |
|---|
| 483 | for(j=1;j<MAX_BASETYPES;++j) { |
|---|
| 484 | weights[i][j] = big_pam - pam[i-1][j-1]; |
|---|
| 485 | } |
|---|
| 486 | } |
|---|
| 487 | for(i=0;i<MAX_BASETYPES;++i) { |
|---|
| 488 | weights[0][i] = xover; |
|---|
| 489 | weights[i][0] = xover; |
|---|
| 490 | } |
|---|
| 491 | } |
|---|
| 492 | } |
|---|
| 493 | |
|---|
| 494 | static void exit_myers(void) { |
|---|
| 495 | for (int i=0; i<MAX_BASETYPES; i++) { |
|---|
| 496 | free(naas[i]); |
|---|
| 497 | free(naa2[i]); |
|---|
| 498 | free(naa1[i]); |
|---|
| 499 | } |
|---|
| 500 | free(naas); |
|---|
| 501 | free(naa2); |
|---|
| 502 | free(naa1); |
|---|
| 503 | } |
|---|
| 504 | |
|---|
| 505 | static void init_show_pair(long max_seq_length) |
|---|
| 506 | { |
|---|
| 507 | #if defined(DEBUG) |
|---|
| 508 | displ_size = (2*max_seq_length +1); |
|---|
| 509 | #endif // DEBUG |
|---|
| 510 | displ = (int *) ckalloc( (2*max_seq_length +1) * sizeof (int) ); |
|---|
| 511 | last_print = 0; |
|---|
| 512 | |
|---|
| 513 | zza = (int *)ckalloc( (max_seq_length+1) * sizeof (int) ); |
|---|
| 514 | zzb = (int *)ckalloc( (max_seq_length+1) * sizeof (int) ); |
|---|
| 515 | |
|---|
| 516 | zzc = (int *)ckalloc( (max_seq_length+1) * sizeof (int) ); |
|---|
| 517 | zzd = (int *)ckalloc( (max_seq_length+1) * sizeof (int) ); |
|---|
| 518 | } |
|---|
| 519 | |
|---|
| 520 | void exit_show_pair(void) |
|---|
| 521 | { |
|---|
| 522 | freeset(zzd, 0); |
|---|
| 523 | freeset(zzc, 0); |
|---|
| 524 | freeset(zzb, 0); |
|---|
| 525 | freeset(zza, 0); |
|---|
| 526 | freeset(displ, 0); |
|---|
| 527 | #if defined(DEBUG) |
|---|
| 528 | displ_size = 0; |
|---|
| 529 | #endif // DEBUG |
|---|
| 530 | } |
|---|
| 531 | |
|---|
| 532 | inline int set_displ(int offset, int value) { |
|---|
| 533 | awtc_assert(offset >= 0 && offset<int(displ_size)); |
|---|
| 534 | displ[offset] = value; |
|---|
| 535 | return displ[offset]; |
|---|
| 536 | } |
|---|
| 537 | inline int decr_displ(int offset, int value) { |
|---|
| 538 | awtc_assert(offset >= 0 && offset<int(displ_size)); |
|---|
| 539 | displ[offset] -= value; |
|---|
| 540 | return displ[offset]; |
|---|
| 541 | } |
|---|
| 542 | |
|---|
| 543 | |
|---|
| 544 | static inline void add(int v) // insert 'v' gaps into master ??? |
|---|
| 545 | { |
|---|
| 546 | if (last_print<0 && print_ptr>0) { |
|---|
| 547 | set_displ(print_ptr-1,v); |
|---|
| 548 | set_displ(print_ptr++,last_print); |
|---|
| 549 | } |
|---|
| 550 | else { |
|---|
| 551 | last_print = set_displ(print_ptr++,v); |
|---|
| 552 | } |
|---|
| 553 | } |
|---|
| 554 | |
|---|
| 555 | static MAXN_2(inline) int calc_weight(int iat,int jat,int v1,int v2) |
|---|
| 556 | { |
|---|
| 557 | #if (MAXN==2) |
|---|
| 558 | awtc_assert(pos1==1 && pos2==1); |
|---|
| 559 | unsigned char j = seq_array[alist[1]][v2+jat-1]; |
|---|
| 560 | return j<128 ? naas[j][v1+iat-1] : 0; |
|---|
| 561 | #else |
|---|
| 562 | int sum,i,lookn,ret; |
|---|
| 563 | int ipos,jpos; |
|---|
| 564 | |
|---|
| 565 | ipos = v1 + iat -1; |
|---|
| 566 | jpos = v2 + jat -1; |
|---|
| 567 | |
|---|
| 568 | ret = 0; |
|---|
| 569 | sum = lookn = 0; |
|---|
| 570 | |
|---|
| 571 | if(pos1>=pos2) |
|---|
| 572 | { |
|---|
| 573 | for(i=1;i<=pos2;++i) |
|---|
| 574 | { |
|---|
| 575 | unsigned char j=seq_array[alist[i]][jpos]; |
|---|
| 576 | if (j<128) { |
|---|
| 577 | sum += naas[j][ipos]; |
|---|
| 578 | ++lookn; |
|---|
| 579 | } |
|---|
| 580 | } |
|---|
| 581 | } |
|---|
| 582 | else |
|---|
| 583 | { |
|---|
| 584 | for(i=1;i<=pos1;++i) |
|---|
| 585 | { |
|---|
| 586 | unsigned char j = seq_array[alist[i]][ipos]; |
|---|
| 587 | if (j<128) { |
|---|
| 588 | sum += naas[j][jpos]; |
|---|
| 589 | ++lookn; |
|---|
| 590 | } |
|---|
| 591 | } |
|---|
| 592 | } |
|---|
| 593 | if (sum>0) ret = sum/lookn; |
|---|
| 594 | return ret; |
|---|
| 595 | #endif |
|---|
| 596 | } |
|---|
| 597 | |
|---|
| 598 | #ifdef MATRIX_DUMP |
|---|
| 599 | |
|---|
| 600 | static inline const unsigned char *lstr(const unsigned char *s, int len) { |
|---|
| 601 | static unsigned char *lstr_ss = 0; |
|---|
| 602 | |
|---|
| 603 | freeset(lstr_ss, (unsigned char*)strndup((const char*)s,len)); |
|---|
| 604 | return lstr_ss; |
|---|
| 605 | } |
|---|
| 606 | |
|---|
| 607 | static inline const unsigned char *nstr(unsigned char *cp, int length) |
|---|
| 608 | { |
|---|
| 609 | const unsigned char *s = lstr(cp,length); |
|---|
| 610 | (dnaflag ? n_decode : p_decode)(s-1, const_cast<unsigned char *>(s-1), length); |
|---|
| 611 | return s; |
|---|
| 612 | } |
|---|
| 613 | |
|---|
| 614 | static inline void dumpMatrix(int x0, int y0, int breite, int hoehe, int mitte_vert) |
|---|
| 615 | { |
|---|
| 616 | int b; |
|---|
| 617 | int h; |
|---|
| 618 | char *sl = (char*)malloc(hoehe+3); |
|---|
| 619 | char *ma = (char*)malloc(breite+3); |
|---|
| 620 | |
|---|
| 621 | sprintf(ma, "-%s-", nstr(seq_array[1]+x0, breite)); |
|---|
| 622 | sprintf(sl, "-%s-", nstr(seq_array[2]+y0, hoehe)); |
|---|
| 623 | |
|---|
| 624 | printf(" "); |
|---|
| 625 | |
|---|
| 626 | for (b=0; b<=mitte_vert; b++) |
|---|
| 627 | printf("%5c", ma[b]); |
|---|
| 628 | printf(" MID"); |
|---|
| 629 | for (b++; b<=breite+1+1; b++) |
|---|
| 630 | printf("%5c", ma[b-1]); |
|---|
| 631 | |
|---|
| 632 | for (h=0; h<=hoehe+1; h++) |
|---|
| 633 | { |
|---|
| 634 | printf("\n%c vertical: ", sl[h]); |
|---|
| 635 | for (b=0; b<=breite+1+1; b++) printf("%5i", vertical[b][h]); |
|---|
| 636 | printf("\n verticalOpen: "); |
|---|
| 637 | for (b=0; b<=breite+1+1; b++) printf("%5i", verticalOpen[b][h]); |
|---|
| 638 | printf("\n diagonal: "); |
|---|
| 639 | for (b=0; b<=breite+1+1; b++) printf("%5i", diagonal[b][h]); |
|---|
| 640 | printf("\n horizontal: "); |
|---|
| 641 | for (b=0; b<=breite+1+1; b++) printf("%5i", horizontal[b][h]); |
|---|
| 642 | printf("\n horizontalOpen:"); |
|---|
| 643 | for (b=0; b<=breite+1+1; b++) printf("%5i", horizontalOpen[b][h]); |
|---|
| 644 | printf("\n"); |
|---|
| 645 | } |
|---|
| 646 | |
|---|
| 647 | printf("--------------------\n"); |
|---|
| 648 | |
|---|
| 649 | free(ma); |
|---|
| 650 | free(sl); |
|---|
| 651 | } |
|---|
| 652 | #endif |
|---|
| 653 | |
|---|
| 654 | static int diff(int v1,int v2,int v3,int v4, int st,int en) |
|---|
| 655 | /* v1,v3 master sequence (v1=offset, v3=length) |
|---|
| 656 | * v2,v4 slave sequence (v2=offset, v4=length) |
|---|
| 657 | * st,en gap_open-penalties for start and end of the sequence |
|---|
| 658 | * |
|---|
| 659 | * returns costs for inserted gaps |
|---|
| 660 | */ |
|---|
| 661 | { |
|---|
| 662 | int ctrc,ctri,ctrj=0, |
|---|
| 663 | i,j,k,l,m,n,p, |
|---|
| 664 | flag; |
|---|
| 665 | |
|---|
| 666 | #ifdef DEBUG |
|---|
| 667 | # ifdef MATRIX_DUMP |
|---|
| 668 | int display_matrix = 0; |
|---|
| 669 | |
|---|
| 670 | awtc_assert(v3<=(DISPLAY_MATRIX_SIZE+2)); // width |
|---|
| 671 | awtc_assert(v4<=(DISPLAY_MATRIX_SIZE)); // height |
|---|
| 672 | |
|---|
| 673 | # define DISPLAY_MATRIX_ELEMENTS ((DISPLAY_MATRIX_SIZE+2)*(DISPLAY_MATRIX_SIZE+2)) |
|---|
| 674 | |
|---|
| 675 | memset(vertical, -1, DISPLAY_MATRIX_ELEMENTS*sizeof(int)); |
|---|
| 676 | memset(verticalOpen, -1, DISPLAY_MATRIX_ELEMENTS*sizeof(int)); |
|---|
| 677 | memset(diagonal, -1, DISPLAY_MATRIX_ELEMENTS*sizeof(int)); |
|---|
| 678 | memset(horizontal, -1, DISPLAY_MATRIX_ELEMENTS*sizeof(int)); |
|---|
| 679 | memset(horizontalOpen, -1, DISPLAY_MATRIX_ELEMENTS*sizeof(int)); |
|---|
| 680 | |
|---|
| 681 | # endif |
|---|
| 682 | #endif |
|---|
| 683 | static int deep; |
|---|
| 684 | deep++; |
|---|
| 685 | |
|---|
| 686 | #if (defined (DEBUG) && 0) |
|---|
| 687 | { |
|---|
| 688 | char *d; |
|---|
| 689 | |
|---|
| 690 | d = lstr(seq_array[1]+v1,v3); |
|---|
| 691 | |
|---|
| 692 | (dnaflag ? n_decode : p_decode)(d-1,d-1,v3); |
|---|
| 693 | |
|---|
| 694 | for (int cnt=0; cnt<deep; cnt++) putchar(' '); |
|---|
| 695 | printf("master = '%s' (penalties left=%i right=%i)\n", d, st, en); |
|---|
| 696 | |
|---|
| 697 | d = lstr(seq_array[2]+v2,v4); |
|---|
| 698 | (dnaflag ? n_decode : p_decode)(d-1,d-1,v4); |
|---|
| 699 | |
|---|
| 700 | for (cnt=0; cnt<deep; cnt++) putchar(' '); |
|---|
| 701 | printf("slave = '%s'\n", d); |
|---|
| 702 | } |
|---|
| 703 | #endif |
|---|
| 704 | |
|---|
| 705 | // q = gap_open + gap_extend; // q is replaced for local positions |
|---|
| 706 | |
|---|
| 707 | if(v4<=0) { // if slave sequence is empty |
|---|
| 708 | if(v3>0) { |
|---|
| 709 | if (last_print<0 && print_ptr>0) { |
|---|
| 710 | last_print = decr_displ(print_ptr-1,v3); // add .. |
|---|
| 711 | } |
|---|
| 712 | else { |
|---|
| 713 | last_print = set_displ(print_ptr++,-(v3)); // .. or insert gap of length 'v3' into slave |
|---|
| 714 | } |
|---|
| 715 | } |
|---|
| 716 | |
|---|
| 717 | deep--; |
|---|
| 718 | return master_gapAt(v1,v3); |
|---|
| 719 | } |
|---|
| 720 | |
|---|
| 721 | if(v3<=1) { |
|---|
| 722 | if(v3<=0) { // if master sequence is empty |
|---|
| 723 | add(v4); // ??? insert gap length 'v4' into master ??? |
|---|
| 724 | deep--; |
|---|
| 725 | return slave_gapAt(v2,v4); |
|---|
| 726 | } |
|---|
| 727 | |
|---|
| 728 | awtc_assert(v3==1); |
|---|
| 729 | // if master length == 1 |
|---|
| 730 | |
|---|
| 731 | if (v4==1) |
|---|
| 732 | { |
|---|
| 733 | if(st>en) |
|---|
| 734 | st=en; |
|---|
| 735 | |
|---|
| 736 | /***************if(!v4)*********BUG********************************/ |
|---|
| 737 | |
|---|
| 738 | ctrc = slave_gapAtWithOpenPenalty(v2,v4,st); |
|---|
| 739 | ctrj = 0; |
|---|
| 740 | |
|---|
| 741 | for(j=1;j<=v4;++j) |
|---|
| 742 | { |
|---|
| 743 | k = slave_gapAt(v2, j-1) + calc_weight(1,j,v1,v2) + slave_gapAt(v2+j,v4-j); |
|---|
| 744 | if(k<ctrc) { ctrc = k; ctrj = j; } |
|---|
| 745 | } |
|---|
| 746 | |
|---|
| 747 | if(!ctrj) |
|---|
| 748 | { |
|---|
| 749 | add(v4); |
|---|
| 750 | if(last_print<0 && print_ptr>0) |
|---|
| 751 | last_print = decr_displ(print_ptr-1,1); |
|---|
| 752 | else |
|---|
| 753 | last_print = set_displ(print_ptr++,-1); |
|---|
| 754 | } |
|---|
| 755 | else |
|---|
| 756 | { |
|---|
| 757 | if (ctrj>1) add(ctrj-1); |
|---|
| 758 | set_displ(print_ptr++,last_print = 0); |
|---|
| 759 | if (ctrj<v4) add(v4-ctrj); |
|---|
| 760 | } |
|---|
| 761 | |
|---|
| 762 | deep--; |
|---|
| 763 | return ctrc; |
|---|
| 764 | } |
|---|
| 765 | } |
|---|
| 766 | |
|---|
| 767 | awtc_assert(v3>=1 && v4>=1); |
|---|
| 768 | |
|---|
| 769 | // slave length >= 1 |
|---|
| 770 | // master length >= 1 |
|---|
| 771 | |
|---|
| 772 | # define AF 0 |
|---|
| 773 | |
|---|
| 774 | // first column of matrix (slave side): |
|---|
| 775 | IF_MATRIX_DUMP(vertical[0][0]=) |
|---|
| 776 | zza[0] = 0; |
|---|
| 777 | p = master_gap_open(v1); |
|---|
| 778 | for(j=1; j<=v4; j++) |
|---|
| 779 | { |
|---|
| 780 | p += master_gap_extend(v1); |
|---|
| 781 | IF_MATRIX_DUMP(vertical[0][j]=) |
|---|
| 782 | zza[j] = p; |
|---|
| 783 | IF_MATRIX_DUMP(verticalOpen[0][j]=) |
|---|
| 784 | zzb[j] = p + master_gap_open(v1); |
|---|
| 785 | } |
|---|
| 786 | |
|---|
| 787 | // left half of the matrix |
|---|
| 788 | p = st; |
|---|
| 789 | ctri = v3 / 2; |
|---|
| 790 | for(i=1; i<=ctri; i++) |
|---|
| 791 | { |
|---|
| 792 | n = zza[0]; |
|---|
| 793 | p += master_gap_extend(v1+i+AF); |
|---|
| 794 | k = p; |
|---|
| 795 | IF_MATRIX_DUMP(vertical[i][0]=) |
|---|
| 796 | zza[0] = k; |
|---|
| 797 | l = p + master_gap_open(v1+i+AF); |
|---|
| 798 | |
|---|
| 799 | for(j=1; j<=v4; j++) |
|---|
| 800 | { |
|---|
| 801 | // from above (gap in master (behind position i)) |
|---|
| 802 | IF_MATRIX_DUMP(verticalOpen[i][j]=) k += master_gap_open(v1+i+AF)+master_gap_extend(v1+i+AF); // (1) |
|---|
| 803 | IF_MATRIX_DUMP(vertical[i][j]=) l += master_gap_extend(v1+i+AF); // (2) |
|---|
| 804 | if(k<l) l = k; // l=min((1),(2)) |
|---|
| 805 | |
|---|
| 806 | // from left (gap in slave (behind position j)) |
|---|
| 807 | IF_MATRIX_DUMP(horizontalOpen[i][j]=) k = zza[j] + slave_gap_open(v2+j+AF)+slave_gap_extend(v2+j+AF); // (3) |
|---|
| 808 | IF_MATRIX_DUMP(horizontal[i][j]=) m = zzb[j] + slave_gap_extend(v2+j+AF); // (4) |
|---|
| 809 | if(k<m) m = k; // m=min((3),(4)) |
|---|
| 810 | |
|---|
| 811 | // diagonal (no gaps) |
|---|
| 812 | IF_MATRIX_DUMP(diagonal[i][j]=) k = n + calc_weight(i,j,v1,v2); // (5) |
|---|
| 813 | if(l<k) k = l; |
|---|
| 814 | if(m<k) k = m; // k = minimum of all paths |
|---|
| 815 | |
|---|
| 816 | n = zza[j]; // minimum of same row; one column to the left |
|---|
| 817 | zza[j] = k; // minimum of all paths to this matrix position |
|---|
| 818 | zzb[j] = m; // minimum of those two paths, where gap was inserted into slave |
|---|
| 819 | } |
|---|
| 820 | } |
|---|
| 821 | |
|---|
| 822 | # define MHO 1 // X-Offset for second half of matrix-arrays (only used to insert MID-column into dumpMatrix()) |
|---|
| 823 | # define BO 1 // Offset for second half of matrix (cause now the indices i and j are smaller as above) |
|---|
| 824 | |
|---|
| 825 | // last column of matrix: |
|---|
| 826 | |
|---|
| 827 | zzb[0]=zza[0]; |
|---|
| 828 | IF_MATRIX_DUMP(vertical[v3+1+MHO][v4+1]=) |
|---|
| 829 | zzc[v4]=0; |
|---|
| 830 | p = master_gap_open(v1+v3); |
|---|
| 831 | for(j=v4-1; j>-1; j--) |
|---|
| 832 | { |
|---|
| 833 | p += master_gap_extend(v1+v3); |
|---|
| 834 | IF_MATRIX_DUMP(vertical[v3+1+MHO][j+BO]=) |
|---|
| 835 | zzc[j] = p; |
|---|
| 836 | IF_MATRIX_DUMP(verticalOpen[v3+1+MHO][j+BO]=) |
|---|
| 837 | zzd[j] = p+master_gap_open(v1+v3); |
|---|
| 838 | } |
|---|
| 839 | |
|---|
| 840 | // right half of matrix (backwards): |
|---|
| 841 | p = en; |
|---|
| 842 | for(i=v3-1;i>=ctri;i--) |
|---|
| 843 | { |
|---|
| 844 | n = zzc[v4]; |
|---|
| 845 | p += master_gap_extend(v1+i); |
|---|
| 846 | k = p; |
|---|
| 847 | IF_MATRIX_DUMP(vertical[i+BO+MHO][v4+1]=) |
|---|
| 848 | zzc[v4] = k; |
|---|
| 849 | l = p+master_gap_open(v1+i); |
|---|
| 850 | |
|---|
| 851 | for(j=v4-1; j>=0; j--) |
|---|
| 852 | { |
|---|
| 853 | // from below (gap in master (in front of position (i+BO))) |
|---|
| 854 | IF_MATRIX_DUMP(verticalOpen[i+BO+MHO][j+BO]=) k += master_gap_open(v1+i)+master_gap_extend(v1+i); // (1) |
|---|
| 855 | IF_MATRIX_DUMP(vertical[i+BO+MHO][j+BO]=) l += master_gap_extend(v1+i); // (2) |
|---|
| 856 | if(k<l) l = k; // l=min((1),(2)) |
|---|
| 857 | |
|---|
| 858 | // from right (gap in slave (in front of position (j+BO))) |
|---|
| 859 | IF_MATRIX_DUMP(horizontalOpen[i+BO+MHO][j+BO]=) k = zzc[j] + slave_gap_open(v2+j) + slave_gap_extend(v2+j); // (3) |
|---|
| 860 | IF_MATRIX_DUMP(horizontal[i+BO+MHO][j+BO]=) m = zzd[j] + slave_gap_extend(v2+j); // (4) |
|---|
| 861 | if(k<m) m = k; // m=min((3),(4)) |
|---|
| 862 | |
|---|
| 863 | // diagonal (no gaps) |
|---|
| 864 | IF_MATRIX_DUMP(diagonal[i+BO+MHO][j+BO]=) k = n + calc_weight(i+BO,j+BO,v1,v2); // (5) |
|---|
| 865 | if(l<k) k = l; |
|---|
| 866 | if(m<k) k = m; // k = minimum of all paths |
|---|
| 867 | |
|---|
| 868 | n = zzc[j]; // minimum of same row; one column to the right |
|---|
| 869 | zzc[j] = k; // minimum of all paths to this matrix position |
|---|
| 870 | zzd[j] = m; // minimum of those two paths, where gap was inserted into slave |
|---|
| 871 | } |
|---|
| 872 | } |
|---|
| 873 | |
|---|
| 874 | #undef BO |
|---|
| 875 | |
|---|
| 876 | // connect rightmost column of first half (column ctri) |
|---|
| 877 | // with leftmost column of second half (column ctri+1) |
|---|
| 878 | |
|---|
| 879 | zzd[v4] = zzc[v4]; |
|---|
| 880 | |
|---|
| 881 | ctrc = INT_MAX; |
|---|
| 882 | flag = 0; |
|---|
| 883 | |
|---|
| 884 | for(j=(ctri?0:1);j<=v4;j++) // was j=0;... (this was erroneous cause it splitted zero-length subsequences) |
|---|
| 885 | { |
|---|
| 886 | IF_MATRIX_DUMP(vertical[ctri+MHO][j]=) |
|---|
| 887 | k = zza[j] + zzc[j]; // sum up both calculations (=diagonal=no gaps) |
|---|
| 888 | |
|---|
| 889 | if (k<ctrc || ((k==ctrc) && (zza[j]!=zzb[j]) && (zzc[j]==zzd[j]))) |
|---|
| 890 | { |
|---|
| 891 | ctrc = k; |
|---|
| 892 | ctrj = j; |
|---|
| 893 | flag = 1; |
|---|
| 894 | } |
|---|
| 895 | } |
|---|
| 896 | |
|---|
| 897 | for(j=v4;j>=(ctri?0:1);j--) |
|---|
| 898 | { |
|---|
| 899 | IF_MATRIX_DUMP(verticalOpen[ctri+MHO][j]=) |
|---|
| 900 | k = zzb[j] + zzd[j] // paths where gaps were inserted into slave (left and right side!) |
|---|
| 901 | - slave_gap_open(j); // subtract gap_open-penalty which was added twice (at left and right end of gap) |
|---|
| 902 | |
|---|
| 903 | if(k<ctrc) |
|---|
| 904 | { |
|---|
| 905 | ctrc = k; |
|---|
| 906 | ctrj = j; |
|---|
| 907 | flag = 2; |
|---|
| 908 | } |
|---|
| 909 | } |
|---|
| 910 | |
|---|
| 911 | awtc_assert(flag>=1 && flag<=2); |
|---|
| 912 | |
|---|
| 913 | #undef MHO |
|---|
| 914 | |
|---|
| 915 | #ifdef MATRIX_DUMP |
|---|
| 916 | if (display_matrix) |
|---|
| 917 | dumpMatrix(v1,v2,v3,v4,ctri); |
|---|
| 918 | #endif |
|---|
| 919 | |
|---|
| 920 | /* Conquer recursively around midpoint */ |
|---|
| 921 | |
|---|
| 922 | if(flag==1) /* Type 1 gaps */ |
|---|
| 923 | { |
|---|
| 924 | diff(v1,v2,ctri,ctrj,st,master_gap_open(v1+ctri)); // includes midpoint ctri and ctrj |
|---|
| 925 | diff(v1+ctri,v2+ctrj,v3-ctri,v4-ctrj,master_gap_open(v1+ctri),en); |
|---|
| 926 | } |
|---|
| 927 | else |
|---|
| 928 | { |
|---|
| 929 | diff(v1,v2,ctri-1,ctrj,st,0); // includes midpoint ctrj |
|---|
| 930 | |
|---|
| 931 | if(last_print<0 && print_ptr>0) /* Delete 2 */ |
|---|
| 932 | last_print = decr_displ(print_ptr-1,2); |
|---|
| 933 | else |
|---|
| 934 | last_print = set_displ(print_ptr++,-2); |
|---|
| 935 | |
|---|
| 936 | diff(v1+ctri+1,v2+ctrj,v3-ctri-1,v4-ctrj,0,en); |
|---|
| 937 | } |
|---|
| 938 | |
|---|
| 939 | deep--; |
|---|
| 940 | return ctrc; /* Return the score of the best alignment */ |
|---|
| 941 | } |
|---|
| 942 | |
|---|
| 943 | static void do_align( /*int v1,*/ int *score, long act_seq_length) |
|---|
| 944 | { |
|---|
| 945 | int i,j,k,l1,l2,n; |
|---|
| 946 | int t_arr[MAX_BASETYPES]; |
|---|
| 947 | |
|---|
| 948 | l1=l2=pos1=pos2=0; |
|---|
| 949 | |
|---|
| 950 | // clear statistics |
|---|
| 951 | |
|---|
| 952 | for(i=1;i<=act_seq_length;++i) { |
|---|
| 953 | for(j=0;j<MAX_BASETYPES;++j) { |
|---|
| 954 | naa1[j][i] = naa2[j][i]=0; |
|---|
| 955 | naas[j][i]=0; |
|---|
| 956 | } |
|---|
| 957 | } |
|---|
| 958 | |
|---|
| 959 | // create position statistics for each group |
|---|
| 960 | // [here every group contains only one seq] |
|---|
| 961 | |
|---|
| 962 | for (i=1;i<=nseqs;++i) { |
|---|
| 963 | if (group[i]==1) { |
|---|
| 964 | fst_list[++pos1]=i; |
|---|
| 965 | for (j=1;j<=seqlen_array[i];++j) { |
|---|
| 966 | unsigned char b = seq_array[i][j]; |
|---|
| 967 | if (b<128) { |
|---|
| 968 | ++naa1[b][j]; |
|---|
| 969 | ++naa1[0][j]; |
|---|
| 970 | } |
|---|
| 971 | } |
|---|
| 972 | if (seqlen_array[i]>l1) l1=seqlen_array[i]; |
|---|
| 973 | } |
|---|
| 974 | else if (group[i]==2) |
|---|
| 975 | { |
|---|
| 976 | snd_list[++pos2]=i; |
|---|
| 977 | for (j=1;j<=seqlen_array[i];++j) { |
|---|
| 978 | unsigned char b = seq_array[i][j]; |
|---|
| 979 | if (b<128) { |
|---|
| 980 | ++naa2[b][j]; |
|---|
| 981 | ++naa2[0][j]; |
|---|
| 982 | } |
|---|
| 983 | } |
|---|
| 984 | if (seqlen_array[i]>l2) l2=seqlen_array[i]; |
|---|
| 985 | } |
|---|
| 986 | } |
|---|
| 987 | |
|---|
| 988 | if(pos1>=pos2) { |
|---|
| 989 | for(i=1;i<=pos2;++i) alist[i]=snd_list[i]; |
|---|
| 990 | for(n=1;n<=l1;++n) |
|---|
| 991 | { |
|---|
| 992 | for(i=1;i<MAX_BASETYPES;++i) t_arr[i]=0; |
|---|
| 993 | for(i=1;i<MAX_BASETYPES;++i) { |
|---|
| 994 | if(naa1[i][n]>0) { |
|---|
| 995 | for(j=1;j<MAX_BASETYPES;++j) { |
|---|
| 996 | t_arr[j] += (weights[i][j]*naa1[i][n]); |
|---|
| 997 | } |
|---|
| 998 | } |
|---|
| 999 | } |
|---|
| 1000 | k = naa1[0][n]; |
|---|
| 1001 | if (k>0) { |
|---|
| 1002 | for (i=1;i<MAX_BASETYPES;++i) { |
|---|
| 1003 | naas[i][n]=t_arr[i]/k; |
|---|
| 1004 | } |
|---|
| 1005 | } |
|---|
| 1006 | } |
|---|
| 1007 | } |
|---|
| 1008 | else { |
|---|
| 1009 | MAXN_2_assert(0); // should never occur if MAXN==2 |
|---|
| 1010 | #if (MAXN!=2) |
|---|
| 1011 | for (i=1;i<=pos1;++i) alist[i]=fst_list[i]; |
|---|
| 1012 | for (n=1;n<=l2;++n) { |
|---|
| 1013 | for (i=1;i<MAX_BASETYPES;++i) t_arr[i]=0; |
|---|
| 1014 | for (i=1;i<MAX_BASETYPES;++i) |
|---|
| 1015 | if (naa2[i][n]>0) |
|---|
| 1016 | for (j=1;j<MAX_BASETYPES;++j) |
|---|
| 1017 | t_arr[j] += (weights[i][j]*naa2[i][n]); |
|---|
| 1018 | k = naa2[0][n]; |
|---|
| 1019 | if (k>0) |
|---|
| 1020 | for (i=1;i<MAX_BASETYPES;++i) |
|---|
| 1021 | naas[i][n]=t_arr[i]/k; |
|---|
| 1022 | } |
|---|
| 1023 | #endif |
|---|
| 1024 | } |
|---|
| 1025 | |
|---|
| 1026 | *score=diff(1,1,l1,l2,master_gap_open(1),master_gap_open(l1+1)/*v1,v1*/); /* Myers and Miller alignment now */ |
|---|
| 1027 | } |
|---|
| 1028 | |
|---|
| 1029 | /* |
|---|
| 1030 | static void group_gap(int len,int sclass,char *seq) |
|---|
| 1031 | { |
|---|
| 1032 | int i,j,k,xtra; |
|---|
| 1033 | |
|---|
| 1034 | for(i=1;i<=nseqs;++i) |
|---|
| 1035 | if(group[i] == sclass) |
|---|
| 1036 | { |
|---|
| 1037 | xtra = len - seqlen_array[i]; |
|---|
| 1038 | if(xtra>0) |
|---|
| 1039 | for(j=1;j<=xtra;++j) |
|---|
| 1040 | seq_array[i][seqlen_array[i]+j] = -1; |
|---|
| 1041 | for(j=1;j<=len;++j) |
|---|
| 1042 | if(seq[j] == '-') { |
|---|
| 1043 | for(k=len;k>=j+1;--k) |
|---|
| 1044 | seq_array[i][k] = seq_array[i][k-1]; |
|---|
| 1045 | seq_array[i][j] = -1; |
|---|
| 1046 | } |
|---|
| 1047 | seqlen_array[i] = len; |
|---|
| 1048 | } |
|---|
| 1049 | } |
|---|
| 1050 | */ |
|---|
| 1051 | |
|---|
| 1052 | static int add_ggaps(long /*max_seq_length*/) |
|---|
| 1053 | { |
|---|
| 1054 | int i,j,k,pos,to_do; |
|---|
| 1055 | |
|---|
| 1056 | pos=1; |
|---|
| 1057 | to_do=print_ptr; |
|---|
| 1058 | |
|---|
| 1059 | for(i=0;i<to_do;++i) // was: 1 .. <=to_do |
|---|
| 1060 | { |
|---|
| 1061 | if(displ[i]==0) |
|---|
| 1062 | { |
|---|
| 1063 | result[1][pos]=result[2][pos]='*'; |
|---|
| 1064 | ++pos; |
|---|
| 1065 | } |
|---|
| 1066 | else |
|---|
| 1067 | { |
|---|
| 1068 | if((k=displ[i])>0) |
|---|
| 1069 | { |
|---|
| 1070 | for(j=0;j<=k-1;++j) |
|---|
| 1071 | { |
|---|
| 1072 | result[2][pos+j]='*'; |
|---|
| 1073 | result[1][pos+j]='-'; |
|---|
| 1074 | } |
|---|
| 1075 | pos += k; |
|---|
| 1076 | } |
|---|
| 1077 | else |
|---|
| 1078 | { |
|---|
| 1079 | k = (displ[i]<0) ? displ[i] * -1 : displ[i]; |
|---|
| 1080 | for(j=0;j<=k-1;++j) |
|---|
| 1081 | { |
|---|
| 1082 | result[1][pos+j]='*'; |
|---|
| 1083 | result[2][pos+j]='-'; |
|---|
| 1084 | } |
|---|
| 1085 | pos += k; |
|---|
| 1086 | } |
|---|
| 1087 | } |
|---|
| 1088 | } |
|---|
| 1089 | |
|---|
| 1090 | #ifdef DEBUG |
|---|
| 1091 | result[1][pos] = 0; |
|---|
| 1092 | result[2][pos] = 0; |
|---|
| 1093 | #endif |
|---|
| 1094 | |
|---|
| 1095 | return pos-1; |
|---|
| 1096 | } |
|---|
| 1097 | |
|---|
| 1098 | |
|---|
| 1099 | static int res_index(const char *t,char c) |
|---|
| 1100 | { |
|---|
| 1101 | if (t[0]==c) return UNKNOWN_ACID; |
|---|
| 1102 | for (int i=1; t[i]; i++) { |
|---|
| 1103 | if (t[i]==c) return i; |
|---|
| 1104 | } |
|---|
| 1105 | return 0; |
|---|
| 1106 | } |
|---|
| 1107 | |
|---|
| 1108 | static GB_ERROR p_encode(const unsigned char *seq, unsigned char *naseq, int l) /* code seq as ints .. use -2 for gap */ { |
|---|
| 1109 | bool warned = false; |
|---|
| 1110 | |
|---|
| 1111 | for(int i=1; i<=l; i++) { |
|---|
| 1112 | int c = res_index(amino_acid_order,seq[i]); |
|---|
| 1113 | |
|---|
| 1114 | if (!c) { |
|---|
| 1115 | if (seq[i] == '*') { |
|---|
| 1116 | c = -2; |
|---|
| 1117 | } |
|---|
| 1118 | else { |
|---|
| 1119 | if (!warned && seq[i] != '?') { // consensus contains ? and it means X |
|---|
| 1120 | char buf[100]; |
|---|
| 1121 | sprintf(buf, "Illegal character '%c' in sequence data", seq[i]); |
|---|
| 1122 | aw_message(buf); |
|---|
| 1123 | warned = true; |
|---|
| 1124 | } |
|---|
| 1125 | c = res_index(amino_acid_order, 'X'); |
|---|
| 1126 | } |
|---|
| 1127 | } |
|---|
| 1128 | |
|---|
| 1129 | awtc_assert(c>0 || c == -2); |
|---|
| 1130 | naseq[i] = c; |
|---|
| 1131 | } |
|---|
| 1132 | |
|---|
| 1133 | return 0; |
|---|
| 1134 | } |
|---|
| 1135 | |
|---|
| 1136 | static GB_ERROR n_encode(const unsigned char *seq,unsigned char *naseq,int l) |
|---|
| 1137 | { /* code seq as ints .. use -2 for gap */ |
|---|
| 1138 | int i; |
|---|
| 1139 | /* static char *nucs="ACGTU"; */ |
|---|
| 1140 | int warned = 0; |
|---|
| 1141 | |
|---|
| 1142 | for(i=1; i<=l; i++) { |
|---|
| 1143 | int c = res_index(nucleic_acid_order,seq[i]); |
|---|
| 1144 | |
|---|
| 1145 | if (!c) { |
|---|
| 1146 | if (!warned && seq[i] != '?') { |
|---|
| 1147 | char buf[100]; |
|---|
| 1148 | sprintf(buf, "Illegal character '%c' in sequence data", seq[i]); |
|---|
| 1149 | aw_message(buf); |
|---|
| 1150 | warned = 1; |
|---|
| 1151 | } |
|---|
| 1152 | c = res_index(nucleic_acid_order, 'N'); |
|---|
| 1153 | } |
|---|
| 1154 | |
|---|
| 1155 | awtc_assert(c>0); |
|---|
| 1156 | naseq[i] = c; |
|---|
| 1157 | } |
|---|
| 1158 | |
|---|
| 1159 | return 0; |
|---|
| 1160 | } |
|---|
| 1161 | |
|---|
| 1162 | GB_ERROR AWTC_ClustalV_align(int is_dna, int weighted, |
|---|
| 1163 | const char *seq1, int length1, |
|---|
| 1164 | const char *seq2, int length2, |
|---|
| 1165 | const int *gapsBefore1, |
|---|
| 1166 | int max_seq_length, |
|---|
| 1167 | char **resultPtr1, |
|---|
| 1168 | char **resultPtr2, |
|---|
| 1169 | int *resLengthPtr, |
|---|
| 1170 | int *score) |
|---|
| 1171 | { |
|---|
| 1172 | error = 0; |
|---|
| 1173 | gapsBeforePosition = gapsBefore1; |
|---|
| 1174 | |
|---|
| 1175 | if (!module_initialized) { // initialize only once |
|---|
| 1176 | dnaflag = is_dna; |
|---|
| 1177 | is_weight = weighted; |
|---|
| 1178 | |
|---|
| 1179 | init_myers(max_seq_length); |
|---|
| 1180 | init_show_pair(max_seq_length); |
|---|
| 1181 | make_pamo(0); |
|---|
| 1182 | fill_pam(); |
|---|
| 1183 | |
|---|
| 1184 | for (int i=1; i<=2; i++) |
|---|
| 1185 | { |
|---|
| 1186 | seq_array[i] = (unsigned char*)ckalloc((max_seq_length+2)*sizeof(char)); |
|---|
| 1187 | result[i] = (char*)ckalloc((max_seq_length+2)*sizeof(char)); |
|---|
| 1188 | group[i] = i; |
|---|
| 1189 | } |
|---|
| 1190 | |
|---|
| 1191 | if (!error) module_initialized = true; |
|---|
| 1192 | } |
|---|
| 1193 | else { |
|---|
| 1194 | if (dnaflag!=is_dna || is_weight!=weighted) { |
|---|
| 1195 | error = "Please call AWTC_ClustalV_align_exit() between calls that differ in\n" |
|---|
| 1196 | "one of the following parameters:\n" |
|---|
| 1197 | " is_dna, weighted"; |
|---|
| 1198 | } |
|---|
| 1199 | } |
|---|
| 1200 | |
|---|
| 1201 | if (!error) { |
|---|
| 1202 | nseqs = 2; |
|---|
| 1203 | print_ptr = 0; |
|---|
| 1204 | |
|---|
| 1205 | #if defined(DEBUG) && 1 |
|---|
| 1206 | memset(&seq_array[1][1], 0, max_seq_length*sizeof(seq_array[1][1])); |
|---|
| 1207 | memset(&seq_array[2][1], 0, max_seq_length*sizeof(seq_array[2][1])); |
|---|
| 1208 | memset(&displ[1], 0xff, max_seq_length*sizeof(displ[1])); |
|---|
| 1209 | seq_array[1][0] = '_'; |
|---|
| 1210 | seq_array[2][0] = '_'; |
|---|
| 1211 | #endif |
|---|
| 1212 | |
|---|
| 1213 | { |
|---|
| 1214 | GB_ERROR (*encode)(const unsigned char*,unsigned char*,int) = dnaflag ? n_encode : p_encode; |
|---|
| 1215 | |
|---|
| 1216 | error = encode((const unsigned char*)(seq1-1), seq_array[1], length1); |
|---|
| 1217 | if (!error) { |
|---|
| 1218 | seqlen_array[1] = length1; |
|---|
| 1219 | error = encode((const unsigned char*)(seq2-1), seq_array[2], length2); |
|---|
| 1220 | if (!error) { |
|---|
| 1221 | seqlen_array[2] = length2; |
|---|
| 1222 | |
|---|
| 1223 | do_align(/* gap_open,*/ score, max(length1,length2)); |
|---|
| 1224 | int alignedLength = add_ggaps(max_seq_length); |
|---|
| 1225 | |
|---|
| 1226 | *resultPtr1 = result[1]+1; |
|---|
| 1227 | *resultPtr2 = result[2]+1; |
|---|
| 1228 | *resLengthPtr = alignedLength; |
|---|
| 1229 | } |
|---|
| 1230 | } |
|---|
| 1231 | } |
|---|
| 1232 | } |
|---|
| 1233 | |
|---|
| 1234 | return error; |
|---|
| 1235 | } |
|---|
| 1236 | |
|---|
| 1237 | void AWTC_ClustalV_align_exit(void) |
|---|
| 1238 | { |
|---|
| 1239 | if (module_initialized) { |
|---|
| 1240 | module_initialized = false; |
|---|
| 1241 | |
|---|
| 1242 | for (int i=1; i<=2; i++) { |
|---|
| 1243 | free(result[i]); |
|---|
| 1244 | free(seq_array[i]); |
|---|
| 1245 | } |
|---|
| 1246 | exit_show_pair(); |
|---|
| 1247 | exit_myers(); |
|---|
| 1248 | } |
|---|
| 1249 | } |
|---|