| 1 | #include <ctype.h> |
|---|
| 2 | #include <stdlib.h> |
|---|
| 3 | #include <memory.h> |
|---|
| 4 | #include <malloc.h> |
|---|
| 5 | #include <string.h> |
|---|
| 6 | #include <stdio.h> |
|---|
| 7 | #include <limits.h> |
|---|
| 8 | |
|---|
| 9 | #include <arbdb.h> |
|---|
| 10 | #include <aw_root.hxx> |
|---|
| 11 | |
|---|
| 12 | #include "awtc_ClustalV.hxx" |
|---|
| 13 | #include "awtc_seq_search.hxx" |
|---|
| 14 | |
|---|
| 15 | /* ---------------------------------------------------------------- */ |
|---|
| 16 | |
|---|
| 17 | #define MASTER_GAP_OPEN 50 |
|---|
| 18 | #define CHEAP_GAP_OPEN 20 // penalty for creating a gap (if we have gaps in master) |
|---|
| 19 | |
|---|
| 20 | #define DEFAULT_GAP_OPEN 30 // penalty for creating a gap |
|---|
| 21 | |
|---|
| 22 | #define MASTER_GAP_EXTEND 18 |
|---|
| 23 | #define CHEAP_GAP_EXTEND 5 // penalty for extending a gap (if we have gaps in master) |
|---|
| 24 | |
|---|
| 25 | #define DEFAULT_GAP_EXTEND 10 // penalty for extending a gap |
|---|
| 26 | |
|---|
| 27 | #define DEFAULT_IMPROBABLY_MUTATION 10 // penalty for mutations |
|---|
| 28 | #define DEFAULT_PROBABLY_MUTATION 4 // penalty for special mutations (A<->G,C<->U/T) (only if 'weighted') |
|---|
| 29 | |
|---|
| 30 | #define DYNAMIC_PENALTIES // otherwise you get FIXED PENALTIES (=no cheap penalties) |
|---|
| 31 | |
|---|
| 32 | /* ---------------------------------------------------------------- */ |
|---|
| 33 | |
|---|
| 34 | #define MAX_GAP_OPEN_DISCOUNT (DEFAULT_GAP_OPEN-CHEAP_GAP_OPEN) // maximum subtracted from DEFAULT_GAP_OPEN |
|---|
| 35 | #define MAX_GAP_EXTEND_DISCOUNT (DEFAULT_GAP_EXTEND-CHEAP_GAP_EXTEND) // maximum subtracted from DEFAULT_GAP_EXTEND |
|---|
| 36 | |
|---|
| 37 | #define MAXN 2 /* Maximum number of sequences (both groups) */ |
|---|
| 38 | |
|---|
| 39 | #if (MAXN==2) |
|---|
| 40 | #define MAXN_2(xxx) xxx |
|---|
| 41 | #define MAXN_2_assert(xxx) awtc_assert(xxx) |
|---|
| 42 | #else |
|---|
| 43 | #define MAXN_2(xxx) |
|---|
| 44 | #define MAXN_2_assert(xxx) |
|---|
| 45 | #endif |
|---|
| 46 | |
|---|
| 47 | #define TRUE 1 |
|---|
| 48 | #define FALSE 0 |
|---|
| 49 | |
|---|
| 50 | static GB_ERROR error; |
|---|
| 51 | static int module_initialized = 0; |
|---|
| 52 | |
|---|
| 53 | typedef int Boolean; |
|---|
| 54 | |
|---|
| 55 | static Boolean dnaflag; |
|---|
| 56 | static Boolean is_weight; |
|---|
| 57 | |
|---|
| 58 | static int xover; |
|---|
| 59 | static int little_pam; |
|---|
| 60 | static int big_pam; |
|---|
| 61 | static int pamo[210]; |
|---|
| 62 | static int pam[21][21]; |
|---|
| 63 | |
|---|
| 64 | static int pos1; |
|---|
| 65 | static int pos2; |
|---|
| 66 | static int **naa1; // naa1[basetype][position] counts bases for each position of all sequences in group1 |
|---|
| 67 | static int **naa2; // naa2[basetype][position] same for group2 |
|---|
| 68 | static int **naas; // |
|---|
| 69 | static int seqlen_array[MAXN+1];// length of all sequences |
|---|
| 70 | static char *seq_array[MAXN+1]; // the sequences |
|---|
| 71 | static int group[MAXN+1]; // group of sequence |
|---|
| 72 | static int alist[MAXN+1]; // indices of sequences to be aligned |
|---|
| 73 | static int fst_list[MAXN+1]; |
|---|
| 74 | static int snd_list[MAXN+1]; |
|---|
| 75 | static int nseqs; // # of sequences |
|---|
| 76 | static int weights[21][21]; // weights[b1][b2] : penalty for mutation from base 'b1' to base 'b2' |
|---|
| 77 | static int *displ; // displ==0 -> base in both , displ<0 -> displ gaps in slave, displ>0 -> displ gaps in master |
|---|
| 78 | static int *zza, // column (left->right) of align matrix (minimum of all paths to this matrix element) |
|---|
| 79 | *zzb, // -------------- " ------------------- (minimum of all paths, where gap inserted into slave) |
|---|
| 80 | *zzc, // column (left<-right) of align matrix (minimum of all paths to this matrix element) |
|---|
| 81 | *zzd; // -------------- " ------------------- (minimum of all paths, where gap inserted into slave) |
|---|
| 82 | /*static int gap_open; |
|---|
| 83 | static int gap_extend;*/ |
|---|
| 84 | static int print_ptr; |
|---|
| 85 | static int last_print; |
|---|
| 86 | static const int *gapsBeforePosition; |
|---|
| 87 | |
|---|
| 88 | //#define MATRIX_DUMP |
|---|
| 89 | |
|---|
| 90 | #ifdef MATRIX_DUMP |
|---|
| 91 | # define IF_MATRIX_DUMP(xxx) xxx |
|---|
| 92 | # define DISPLAY_MATRIX_SIZE 500 |
|---|
| 93 | static int vertical [DISPLAY_MATRIX_SIZE+2][DISPLAY_MATRIX_SIZE+2]; |
|---|
| 94 | static int verticalOpen [DISPLAY_MATRIX_SIZE+2][DISPLAY_MATRIX_SIZE+2]; |
|---|
| 95 | static int diagonal [DISPLAY_MATRIX_SIZE+2][DISPLAY_MATRIX_SIZE+2]; |
|---|
| 96 | static int horizontal [DISPLAY_MATRIX_SIZE+2][DISPLAY_MATRIX_SIZE+2]; |
|---|
| 97 | static int horizontalOpen [DISPLAY_MATRIX_SIZE+2][DISPLAY_MATRIX_SIZE+2]; |
|---|
| 98 | #else |
|---|
| 99 | # define IF_MATRIX_DUMP(xxx) |
|---|
| 100 | #endif |
|---|
| 101 | |
|---|
| 102 | static inline int master_gap_open(int beforePosition) |
|---|
| 103 | { |
|---|
| 104 | #ifdef DYNAMIC_PENALTIES |
|---|
| 105 | long gaps = gapsBeforePosition[beforePosition-1]; |
|---|
| 106 | return (gaps)? MASTER_GAP_OPEN - MAX_GAP_OPEN_DISCOUNT : MASTER_GAP_OPEN; |
|---|
| 107 | |
|---|
| 108 | /* return |
|---|
| 109 | gaps >= MAX_GAP_OPEN_DISCOUNT |
|---|
| 110 | ? DEFAULT_GAP_OPEN-MAX_GAP_OPEN_DISCOUNT |
|---|
| 111 | : DEFAULT_GAP_OPEN-gaps;*/ |
|---|
| 112 | #else |
|---|
| 113 | return DEFAULT_GAP_OPEN; |
|---|
| 114 | #endif |
|---|
| 115 | } |
|---|
| 116 | static inline int master_gap_extend(int beforePosition) |
|---|
| 117 | { |
|---|
| 118 | #ifdef DYNAMIC_PENALTIES |
|---|
| 119 | long gaps = gapsBeforePosition[beforePosition-1]; |
|---|
| 120 | |
|---|
| 121 | return (gaps)? MASTER_GAP_EXTEND - MAX_GAP_EXTEND_DISCOUNT : MASTER_GAP_EXTEND; |
|---|
| 122 | /* return |
|---|
| 123 | gaps >= MAX_GAP_EXTEND_DISCOUNT |
|---|
| 124 | ? DEFAULT_GAP_EXTEND-MAX_GAP_EXTEND_DISCOUNT |
|---|
| 125 | : DEFAULT_GAP_EXTEND-gaps;*/ |
|---|
| 126 | #else |
|---|
| 127 | return DEFAULT_GAP_EXTEND; |
|---|
| 128 | #endif |
|---|
| 129 | } |
|---|
| 130 | |
|---|
| 131 | static inline int master_gapAtWithOpenPenalty(int atPosition, int length, int penalty) |
|---|
| 132 | { |
|---|
| 133 | if (length<=0) |
|---|
| 134 | return 0; |
|---|
| 135 | |
|---|
| 136 | int beforePosition = atPosition, |
|---|
| 137 | afterPosition = atPosition-1; |
|---|
| 138 | |
|---|
| 139 | while (length--) |
|---|
| 140 | { |
|---|
| 141 | int p1,p2; |
|---|
| 142 | |
|---|
| 143 | if ((p1=master_gap_extend(beforePosition)) < (p2=master_gap_extend(afterPosition+1)) && |
|---|
| 144 | beforePosition>1) |
|---|
| 145 | { |
|---|
| 146 | penalty += p1; |
|---|
| 147 | beforePosition--; |
|---|
| 148 | } |
|---|
| 149 | else |
|---|
| 150 | { |
|---|
| 151 | penalty += p2; |
|---|
| 152 | afterPosition++; |
|---|
| 153 | } |
|---|
| 154 | } |
|---|
| 155 | |
|---|
| 156 | return penalty; |
|---|
| 157 | } |
|---|
| 158 | |
|---|
| 159 | static inline int master_gapAt(int atPosition, int length) |
|---|
| 160 | { |
|---|
| 161 | return master_gapAtWithOpenPenalty(atPosition, length, master_gap_open(atPosition)); |
|---|
| 162 | } |
|---|
| 163 | |
|---|
| 164 | static inline int slave_gap_open(int /*beforePosition*/) |
|---|
| 165 | { |
|---|
| 166 | return DEFAULT_GAP_OPEN; |
|---|
| 167 | } |
|---|
| 168 | |
|---|
| 169 | static inline int slave_gap_extend(int /*beforePosition*/) |
|---|
| 170 | { |
|---|
| 171 | return DEFAULT_GAP_EXTEND; |
|---|
| 172 | } |
|---|
| 173 | |
|---|
| 174 | static inline int slave_gapAtWithOpenPenalty(int atPosition, int length, int penalty) |
|---|
| 175 | { |
|---|
| 176 | return length<=0 ? 0 : penalty + length*slave_gap_extend(atPosition); |
|---|
| 177 | } |
|---|
| 178 | |
|---|
| 179 | static inline int slave_gapAt(int atPosition, int length) |
|---|
| 180 | { |
|---|
| 181 | return slave_gapAtWithOpenPenalty(atPosition, length, slave_gap_open(atPosition)); |
|---|
| 182 | } |
|---|
| 183 | |
|---|
| 184 | #define UNKNOWN_ACID 255 |
|---|
| 185 | static const char *amino_acid_order = "XCSTPAGNDEQHRKMILVFYW"; |
|---|
| 186 | |
|---|
| 187 | #define NUCLEIDS 16 |
|---|
| 188 | static const char *nucleic_acid_order = "-ACGTUMRWSYKVHDBN"; |
|---|
| 189 | static const char *nucleic_maybe_A = "-A----AAA---AAA-A"; |
|---|
| 190 | static const char *nucleic_maybe_C = "--C---C--CC-CC-CC"; |
|---|
| 191 | static const char *nucleic_maybe_G = "---G---G-G-GG-GGG"; |
|---|
| 192 | static const char *nucleic_maybe_T = "----T---T-TT-TTTT"; |
|---|
| 193 | static const char *nucleic_maybe_U = "-----U--U-UU-UUUU"; |
|---|
| 194 | static const char *nucleic_maybe[6] = { NULL, nucleic_maybe_A, nucleic_maybe_C, nucleic_maybe_G, nucleic_maybe_T, nucleic_maybe_U }; |
|---|
| 195 | |
|---|
| 196 | /* |
|---|
| 197 | * M = A or C S = G or C V = A or G or C N = A or C or G or T |
|---|
| 198 | * R = A or G Y = C or T H = A or C or T |
|---|
| 199 | * W = A or T K = G or T D = A or G or T |
|---|
| 200 | */ |
|---|
| 201 | |
|---|
| 202 | #define cheap_if(cond) ((cond) ? 1 : 2) |
|---|
| 203 | static int baseCmp(char c1, char c2) // c1,c2 == 1=A,2=C (==index of character in nucleic_acid_order[]) |
|---|
| 204 | // returns 0 for equal |
|---|
| 205 | // 1 for probably mutations |
|---|
| 206 | // 2 for improbably mutations |
|---|
| 207 | { |
|---|
| 208 | #define COMPARABLE_BASES 5 |
|---|
| 209 | |
|---|
| 210 | if (c1==c2) return 0; |
|---|
| 211 | |
|---|
| 212 | if (c2<c1) // swap |
|---|
| 213 | { |
|---|
| 214 | int c3 = c1; |
|---|
| 215 | |
|---|
| 216 | c1 = c2; |
|---|
| 217 | c2 = c3; |
|---|
| 218 | } |
|---|
| 219 | |
|---|
| 220 | if (c2<=COMPARABLE_BASES) |
|---|
| 221 | { |
|---|
| 222 | awtc_assert(c1<=COMPARABLE_BASES); |
|---|
| 223 | |
|---|
| 224 | switch(c1) |
|---|
| 225 | { |
|---|
| 226 | case 0: return 2; |
|---|
| 227 | case 1: return cheap_if(c2==3); // A->G |
|---|
| 228 | case 2: return cheap_if(c2==4 || c2==5); // C->T/U |
|---|
| 229 | case 3: return cheap_if(c2==1); // G->A |
|---|
| 230 | case 4: if (c2==5) return 0; // T->U |
|---|
| 231 | case 5: if (c2==4) return 0; // U->T |
|---|
| 232 | return cheap_if(c2==2); // T/U->C |
|---|
| 233 | default: awtc_assert(0); |
|---|
| 234 | } |
|---|
| 235 | } |
|---|
| 236 | |
|---|
| 237 | int i; |
|---|
| 238 | int bestMatch = 3; |
|---|
| 239 | |
|---|
| 240 | if (c1<=COMPARABLE_BASES) |
|---|
| 241 | { |
|---|
| 242 | for (i=1; i<=COMPARABLE_BASES; i++) |
|---|
| 243 | { |
|---|
| 244 | if (isalpha(nucleic_maybe[i][c2])) // 'c2' maybe a 'i' |
|---|
| 245 | { |
|---|
| 246 | int match = baseCmp(c1,i); |
|---|
| 247 | if (match<bestMatch) bestMatch = match; |
|---|
| 248 | } |
|---|
| 249 | } |
|---|
| 250 | } |
|---|
| 251 | else |
|---|
| 252 | { |
|---|
| 253 | for (i=1; i<=COMPARABLE_BASES; i++) |
|---|
| 254 | { |
|---|
| 255 | if (isalpha(nucleic_maybe[i][c1])) // 'c1' maybe a 'i' |
|---|
| 256 | { |
|---|
| 257 | int match = baseCmp(i,c2); |
|---|
| 258 | if (match<bestMatch) bestMatch = match; |
|---|
| 259 | } |
|---|
| 260 | } |
|---|
| 261 | } |
|---|
| 262 | |
|---|
| 263 | awtc_assert(bestMatch>=0 && bestMatch<=2); |
|---|
| 264 | return bestMatch; |
|---|
| 265 | } |
|---|
| 266 | #undef cheap_if |
|---|
| 267 | |
|---|
| 268 | int AWTC_baseMatch(char c1, char c2) // c1,c2 == ACGUTRS... |
|---|
| 269 | // returns 0 for equal |
|---|
| 270 | // 1 for probably mutations |
|---|
| 271 | // 2 for improbably mutations |
|---|
| 272 | // -1 if one char is illegal |
|---|
| 273 | { |
|---|
| 274 | char *p1 = strchr(nucleic_acid_order, c1); |
|---|
| 275 | char *p2 = strchr(nucleic_acid_order, c2); |
|---|
| 276 | |
|---|
| 277 | awtc_assert(c1); |
|---|
| 278 | awtc_assert(c2); |
|---|
| 279 | |
|---|
| 280 | if (p1 && p2) return baseCmp(p1-nucleic_acid_order, p2-nucleic_acid_order); |
|---|
| 281 | return -1; |
|---|
| 282 | } |
|---|
| 283 | |
|---|
| 284 | static int getPenalty(char c1, char c2) /* c1,c2 = A=0,C=1,... s.o. */ |
|---|
| 285 | { |
|---|
| 286 | switch (baseCmp(c1,c2)) |
|---|
| 287 | { |
|---|
| 288 | case 1: return DEFAULT_PROBABLY_MUTATION; |
|---|
| 289 | case 2: return DEFAULT_IMPROBABLY_MUTATION; |
|---|
| 290 | default: break; |
|---|
| 291 | } |
|---|
| 292 | |
|---|
| 293 | return 0; |
|---|
| 294 | } |
|---|
| 295 | |
|---|
| 296 | static char *result[3]; // result buffers |
|---|
| 297 | |
|---|
| 298 | static char pam250mt[]={ |
|---|
| 299 | 12, |
|---|
| 300 | 0, 2, |
|---|
| 301 | -2, 1, 3, |
|---|
| 302 | -3, 1, 0, 6, |
|---|
| 303 | -2, 1, 1, 1, 2, |
|---|
| 304 | -3, 1, 0,-1, 1, 5, |
|---|
| 305 | -4, 1, 0,-1, 0, 0, 2, |
|---|
| 306 | -5, 0, 0,-1, 0, 1, 2, 4, |
|---|
| 307 | -5, 0, 0,-1, 0, 0, 1, 3, 4, |
|---|
| 308 | -5,-1,-1, 0, 0,-1, 1, 2, 2, 4, |
|---|
| 309 | -3,-1,-1, 0,-1,-2, 2, 1, 1, 3, 6, |
|---|
| 310 | -4, 0,-1, 0,-2,-3, 0,-1,-1, 1, 2, 6, |
|---|
| 311 | -5, 0, 0,-1,-1,-2, 1, 0, 0, 1, 0, 3, 5, |
|---|
| 312 | -5,-2,-1,-2,-1,-3,-2,-3,-2,-1,-2, 0, 0, 6, |
|---|
| 313 | -2,-1, 0,-2,-1,-3,-2,-2,-2,-2,-2,-2,-2, 2, 5, |
|---|
| 314 | -6,-3,-2,-3,-2,-4,-3,-4,-3,-2,-2,-3,-3, 4, 2, 6, |
|---|
| 315 | -2,-1, 0,-1, 0,-1,-2,-2,-2,-2,-2,-2,-2, 2, 4, 2, 4, |
|---|
| 316 | -4,-3,-3,-5,-4,-5,-4,-6,-5,-5,-2,-4,-5, 0, 1, 2,-1, 9, |
|---|
| 317 | 0,-3,-3,-5,-3,-5,-2,-4,-4,-4, 0,-4,-4,-2,-1,-1,-2, 7,10, |
|---|
| 318 | -8,-2,-5,-6,-6,-7,-4,-7,-7,-5,-3, 2,-3,-4,-5,-2,-6, 0, 0,17 }; |
|---|
| 319 | |
|---|
| 320 | /* |
|---|
| 321 | static char idmat[]={ |
|---|
| 322 | 10, |
|---|
| 323 | 0, 10, |
|---|
| 324 | 0, 0, 10, |
|---|
| 325 | 0, 0, 0, 10, |
|---|
| 326 | 0, 0, 0, 0, 10, |
|---|
| 327 | 0, 0, 0, 0, 1, 10, |
|---|
| 328 | 0, 0, 0, 0, 0, 0, 10, |
|---|
| 329 | 0, 0, 0, 0, 0, 0, 0, 10, |
|---|
| 330 | 0, 0, 0, 0, 0, 0, 0, 0, 10, |
|---|
| 331 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, |
|---|
| 332 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, |
|---|
| 333 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, |
|---|
| 334 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, |
|---|
| 335 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, |
|---|
| 336 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, |
|---|
| 337 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, |
|---|
| 338 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, |
|---|
| 339 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 10, |
|---|
| 340 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,10, |
|---|
| 341 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,10}; |
|---|
| 342 | */ |
|---|
| 343 | |
|---|
| 344 | /* |
|---|
| 345 | static char pam100mt[]={ |
|---|
| 346 | 14, |
|---|
| 347 | -1, 6, |
|---|
| 348 | -5, 2, 7, |
|---|
| 349 | -6, 1, -1, 10, |
|---|
| 350 | -5, 2, 2, 1, 6, |
|---|
| 351 | -8, 1, -3, -3, 1, 8, |
|---|
| 352 | -8, 2, 0, -3, -1, -1, 7, |
|---|
| 353 | -11,-1, -2, -4, -1, -1, 4, 8, |
|---|
| 354 | -11,-2, -3, -3, 0, -2, 1, 5, 8, |
|---|
| 355 | -11,-3, -3, -1, -2, -5,-1, 1, 4, 9, |
|---|
| 356 | -6,-4, -5, -2, -5, -7, 2, -1, -2, 4,11, |
|---|
| 357 | -6,-1, -4, -2, -5, -8,-3, -6, -5, 1, 1, 10, |
|---|
| 358 | -11,-2, -1, -4, -4, -5, 1, -2, -2, -1,-3, 3, 8, |
|---|
| 359 | -11,-4, -2, -6, -3, -8,-5, -8, -6, -2,-7, -2, 1, 13, |
|---|
| 360 | -5,-4, -1, -6, -3, -7,-4, -6, -5, -5,-7, -4, -4, 2, 9, |
|---|
| 361 | -12,-7, -5, -5, -5, -8,-6, -9, -7, -3,-5, -7, -6, 4, 2, 9, |
|---|
| 362 | -4,-4, -1, -4, 0, -4,-5, -6, -5, -5,-6, -6, -6, 1, 5, 1, 8, |
|---|
| 363 | -10,-5, -6, -9, -7, -8,-6,-11,-11,-10,-4, -7,-11, -2, 0, 0, -5,12, |
|---|
| 364 | -2,-6, -6,-11, -6,-11,-3, -9, -7, -9,-1,-10,-10, -8, -4,-5, -6, 6,13, |
|---|
| 365 | -13,-4,-10,-11,-11,-13,-8,-13,-14,-11,-7, 1, -9,-11,-12,-7,-14,-2,-2,19 }; |
|---|
| 366 | */ |
|---|
| 367 | |
|---|
| 368 | static char *matptr = pam250mt; |
|---|
| 369 | |
|---|
| 370 | #if defined(DEBUG) && 0 |
|---|
| 371 | static GB_ERROR p_decode(const char *naseq, char *seq, int l) |
|---|
| 372 | { |
|---|
| 373 | int len = strlen(amino_acid_order); |
|---|
| 374 | |
|---|
| 375 | for (int i=1; i<=l && naseq[i]; i++) |
|---|
| 376 | { |
|---|
| 377 | awtc_assert(naseq[i]<len); |
|---|
| 378 | seq[i] = amino_acid_order[naseq[i]]; |
|---|
| 379 | } |
|---|
| 380 | |
|---|
| 381 | return 0; |
|---|
| 382 | } |
|---|
| 383 | |
|---|
| 384 | static GB_ERROR n_decode(const char *naseq, char *seq, int l) |
|---|
| 385 | { |
|---|
| 386 | int len = strlen(nucleic_acid_order); |
|---|
| 387 | |
|---|
| 388 | for (int i=1; i<=l && naseq[i]; i++) |
|---|
| 389 | { |
|---|
| 390 | awtc_assert(naseq[i]<len); |
|---|
| 391 | seq[i] = nucleic_acid_order[naseq[i]]; |
|---|
| 392 | } |
|---|
| 393 | |
|---|
| 394 | return 0; |
|---|
| 395 | } |
|---|
| 396 | #endif |
|---|
| 397 | |
|---|
| 398 | static inline GB_ERROR MAXLENtooSmall() |
|---|
| 399 | { |
|---|
| 400 | return GB_export_error("AWTC_ClustalV-aligner: MAXLEN is dimensioned to small for this sequence"); |
|---|
| 401 | } |
|---|
| 402 | |
|---|
| 403 | static inline void *ckalloc(size_t bytes) |
|---|
| 404 | { |
|---|
| 405 | void *ret = malloc(bytes); |
|---|
| 406 | |
|---|
| 407 | if (!ret) error = "out of memory"; |
|---|
| 408 | return ret; |
|---|
| 409 | } |
|---|
| 410 | |
|---|
| 411 | static void init_myers(long max_seq_length) |
|---|
| 412 | { |
|---|
| 413 | int i; |
|---|
| 414 | |
|---|
| 415 | if (error) return; |
|---|
| 416 | |
|---|
| 417 | naa1 = (int **)ckalloc(21 * sizeof (int *) ); |
|---|
| 418 | for(i=0;i<21;i++) naa1[i]=(int *)ckalloc( (max_seq_length+1)*sizeof (int)); |
|---|
| 419 | if (error) return; |
|---|
| 420 | |
|---|
| 421 | naa2 = (int **)ckalloc(21 * sizeof (int *) ); |
|---|
| 422 | for(i=0;i<21;i++) naa2[i]=(int *)ckalloc( (max_seq_length+1)*sizeof (int)); |
|---|
| 423 | if (error) return; |
|---|
| 424 | |
|---|
| 425 | naas = (int **)ckalloc(21 * sizeof (int *) ); |
|---|
| 426 | for(i=0;i<21;i++) naas[i]=(int *)ckalloc( (max_seq_length+1)*sizeof (int)); |
|---|
| 427 | if (error) return; |
|---|
| 428 | } |
|---|
| 429 | |
|---|
| 430 | static void make_pamo(int nv) |
|---|
| 431 | { |
|---|
| 432 | register int i,c; |
|---|
| 433 | |
|---|
| 434 | little_pam=big_pam=matptr[0]; |
|---|
| 435 | for(i=0;i<210;++i) { |
|---|
| 436 | c=matptr[i]; |
|---|
| 437 | little_pam=(little_pam<c) ? little_pam : c; |
|---|
| 438 | big_pam=(big_pam>c) ? big_pam : c; |
|---|
| 439 | } |
|---|
| 440 | for(i=0;i<210;++i) |
|---|
| 441 | pamo[i]= matptr[i]-little_pam; |
|---|
| 442 | nv -= little_pam; |
|---|
| 443 | big_pam -= little_pam; |
|---|
| 444 | xover = big_pam - nv; |
|---|
| 445 | /* |
|---|
| 446 | fprintf(stdout,"\n\nxover= %d, big_pam = %d, little_pam=%d, nv = %d\n\n" |
|---|
| 447 | ,xover,big_pam,little_pam,nv); |
|---|
| 448 | */ |
|---|
| 449 | } |
|---|
| 450 | |
|---|
| 451 | static void fill_pam() |
|---|
| 452 | { |
|---|
| 453 | register int i,j,pos; |
|---|
| 454 | |
|---|
| 455 | pos=0; |
|---|
| 456 | |
|---|
| 457 | for(i=0;i<20;++i) |
|---|
| 458 | for(j=0;j<=i;++j) |
|---|
| 459 | pam[i][j]=pamo[pos++]; |
|---|
| 460 | |
|---|
| 461 | for(i=0;i<20;++i) |
|---|
| 462 | for(j=0;j<=i;++j) |
|---|
| 463 | pam[j][i]=pam[i][j]; |
|---|
| 464 | |
|---|
| 465 | if(dnaflag) |
|---|
| 466 | { |
|---|
| 467 | xover=4; |
|---|
| 468 | big_pam=8; |
|---|
| 469 | for(i=0;i<=NUCLEIDS;++i) |
|---|
| 470 | for(j=0;j<=NUCLEIDS;++j) |
|---|
| 471 | weights[i][j] = getPenalty(i,j); |
|---|
| 472 | } |
|---|
| 473 | else { |
|---|
| 474 | for(i=1;i<21;++i) |
|---|
| 475 | for(j=1;j<21;++j) { |
|---|
| 476 | weights[i][j] = big_pam - pam[i-1][j-1]; |
|---|
| 477 | } |
|---|
| 478 | for(i=0;i<21;++i) { |
|---|
| 479 | weights[0][i] = xover; |
|---|
| 480 | weights[i][0] = xover; |
|---|
| 481 | } |
|---|
| 482 | } |
|---|
| 483 | } |
|---|
| 484 | |
|---|
| 485 | static void exit_myers(void) |
|---|
| 486 | { |
|---|
| 487 | int i; |
|---|
| 488 | for (i=0; i<21; i++) delete(naas[i]); |
|---|
| 489 | delete(naas); |
|---|
| 490 | for (i=0; i<21; i++) delete(naa2[i]); |
|---|
| 491 | delete(naa2); |
|---|
| 492 | for (i=0; i<21; i++) delete(naa1[i]); |
|---|
| 493 | delete(naa1); |
|---|
| 494 | } |
|---|
| 495 | |
|---|
| 496 | static void init_show_pair(long max_seq_length) |
|---|
| 497 | { |
|---|
| 498 | displ = (int *) ckalloc( (2*max_seq_length +1) * sizeof (int) ); |
|---|
| 499 | |
|---|
| 500 | zza = (int *)ckalloc( (max_seq_length+1) * sizeof (int) ); |
|---|
| 501 | zzb = (int *)ckalloc( (max_seq_length+1) * sizeof (int) ); |
|---|
| 502 | |
|---|
| 503 | zzc = (int *)ckalloc( (max_seq_length+1) * sizeof (int) ); |
|---|
| 504 | zzd = (int *)ckalloc( (max_seq_length+1) * sizeof (int) ); |
|---|
| 505 | } |
|---|
| 506 | |
|---|
| 507 | void exit_show_pair(void) |
|---|
| 508 | { |
|---|
| 509 | delete(zzd); |
|---|
| 510 | delete(zzc); |
|---|
| 511 | delete(zzb); |
|---|
| 512 | delete(zza); |
|---|
| 513 | delete(displ); |
|---|
| 514 | } |
|---|
| 515 | |
|---|
| 516 | static inline void add(int v) // insert 'v' gaps into master ??? |
|---|
| 517 | { |
|---|
| 518 | if (last_print<0 && print_ptr>0) { |
|---|
| 519 | displ[print_ptr-1] = v; |
|---|
| 520 | displ[print_ptr++] = last_print; |
|---|
| 521 | } |
|---|
| 522 | else { |
|---|
| 523 | last_print = displ[print_ptr++] = v; |
|---|
| 524 | } |
|---|
| 525 | } |
|---|
| 526 | |
|---|
| 527 | static MAXN_2(inline) int calc_weight(int iat,int jat,int v1,int v2) |
|---|
| 528 | { |
|---|
| 529 | #if (MAXN==2) |
|---|
| 530 | awtc_assert(pos1==1 && pos2==1); |
|---|
| 531 | int j = seq_array[alist[1]][v2+jat-1]; |
|---|
| 532 | return j>0 ? naas[j][v1+iat-1] : 0; |
|---|
| 533 | #else |
|---|
| 534 | int sum,i,j,lookn,ret; |
|---|
| 535 | int ipos,jpos; |
|---|
| 536 | |
|---|
| 537 | ipos = v1 + iat -1; |
|---|
| 538 | jpos = v2 + jat -1; |
|---|
| 539 | |
|---|
| 540 | ret = 0; |
|---|
| 541 | sum = lookn = 0; |
|---|
| 542 | |
|---|
| 543 | if(pos1>=pos2) |
|---|
| 544 | { |
|---|
| 545 | for(i=1;i<=pos2;++i) |
|---|
| 546 | { |
|---|
| 547 | j=seq_array[alist[i]][jpos]; |
|---|
| 548 | if(j>0) |
|---|
| 549 | { |
|---|
| 550 | sum += naas[j][ipos]; |
|---|
| 551 | ++lookn; |
|---|
| 552 | } |
|---|
| 553 | } |
|---|
| 554 | } |
|---|
| 555 | else |
|---|
| 556 | { |
|---|
| 557 | for(i=1;i<=pos1;++i) |
|---|
| 558 | { |
|---|
| 559 | j=seq_array[alist[i]][ipos]; |
|---|
| 560 | if(j>0) |
|---|
| 561 | { |
|---|
| 562 | sum += naas[j][jpos]; |
|---|
| 563 | ++lookn; |
|---|
| 564 | } |
|---|
| 565 | } |
|---|
| 566 | } |
|---|
| 567 | if (sum>0) ret = sum/lookn; |
|---|
| 568 | return ret; |
|---|
| 569 | #endif |
|---|
| 570 | } |
|---|
| 571 | |
|---|
| 572 | #ifdef MATRIX_DUMP |
|---|
| 573 | static inline char *nstr(char *cp, int length) |
|---|
| 574 | { |
|---|
| 575 | char *s = lstr(cp,length); |
|---|
| 576 | n_decode(s-1, s-1, length); |
|---|
| 577 | return s; |
|---|
| 578 | } |
|---|
| 579 | |
|---|
| 580 | static inline void dumpMatrix(int x0, int y0, int breite, int hoehe, int mitte_vert) |
|---|
| 581 | { |
|---|
| 582 | int b; |
|---|
| 583 | int h; |
|---|
| 584 | char *sl = (char*)malloc(hoehe+3); |
|---|
| 585 | char *ma = (char*)malloc(breite+3); |
|---|
| 586 | |
|---|
| 587 | sprintf(ma, "-%s-", nstr(seq_array[1]+x0, breite)); |
|---|
| 588 | sprintf(sl, "-%s-", nstr(seq_array[2]+y0, hoehe)); |
|---|
| 589 | |
|---|
| 590 | printf(" "); |
|---|
| 591 | |
|---|
| 592 | for (b=0; b<=mitte_vert; b++) |
|---|
| 593 | printf("%5c", ma[b]); |
|---|
| 594 | printf(" MID"); |
|---|
| 595 | for (b++; b<=breite+1+1; b++) |
|---|
| 596 | printf("%5c", ma[b-1]); |
|---|
| 597 | |
|---|
| 598 | for (h=0; h<=hoehe+1; h++) |
|---|
| 599 | { |
|---|
| 600 | printf("\n%c vertical: ", sl[h]); |
|---|
| 601 | for (b=0; b<=breite+1+1; b++) printf("%5i", vertical[b][h]); |
|---|
| 602 | printf("\n verticalOpen: "); |
|---|
| 603 | for (b=0; b<=breite+1+1; b++) printf("%5i", verticalOpen[b][h]); |
|---|
| 604 | printf("\n diagonal: "); |
|---|
| 605 | for (b=0; b<=breite+1+1; b++) printf("%5i", diagonal[b][h]); |
|---|
| 606 | printf("\n horizontal: "); |
|---|
| 607 | for (b=0; b<=breite+1+1; b++) printf("%5i", horizontal[b][h]); |
|---|
| 608 | printf("\n horizontalOpen:"); |
|---|
| 609 | for (b=0; b<=breite+1+1; b++) printf("%5i", horizontalOpen[b][h]); |
|---|
| 610 | printf("\n"); |
|---|
| 611 | } |
|---|
| 612 | |
|---|
| 613 | printf("--------------------\n"); |
|---|
| 614 | |
|---|
| 615 | free(ma); |
|---|
| 616 | free(sl); |
|---|
| 617 | } |
|---|
| 618 | #endif |
|---|
| 619 | |
|---|
| 620 | static int diff(int v1,int v2,int v3,int v4, int st,int en) |
|---|
| 621 | /* v1,v3 master sequence (v1=offset, v3=length) |
|---|
| 622 | * v2,v4 slave sequence (v2=offset, v4=length) |
|---|
| 623 | * st,en gap_open-penalties for start and end of the sequence |
|---|
| 624 | * |
|---|
| 625 | * returns costs for inserted gaps |
|---|
| 626 | */ |
|---|
| 627 | { |
|---|
| 628 | int ctrc,ctri,ctrj=0, |
|---|
| 629 | i,j,k,l,m,n,p, |
|---|
| 630 | flag; |
|---|
| 631 | |
|---|
| 632 | #ifdef DEBUG |
|---|
| 633 | # ifdef MATRIX_DUMP |
|---|
| 634 | int display_matrix = 0; |
|---|
| 635 | |
|---|
| 636 | awtc_assert(v3<=(DISPLAY_MATRIX_SIZE+2)); // width |
|---|
| 637 | awtc_assert(v4<=(DISPLAY_MATRIX_SIZE)); // height |
|---|
| 638 | |
|---|
| 639 | # define DISPLAY_MATRIX_ELEMENTS ((DISPLAY_MATRIX_SIZE+2)*(DISPLAY_MATRIX_SIZE+2)) |
|---|
| 640 | |
|---|
| 641 | memset(vertical, -1, DISPLAY_MATRIX_ELEMENTS*sizeof(int)); |
|---|
| 642 | memset(verticalOpen, -1, DISPLAY_MATRIX_ELEMENTS*sizeof(int)); |
|---|
| 643 | memset(diagonal, -1, DISPLAY_MATRIX_ELEMENTS*sizeof(int)); |
|---|
| 644 | memset(horizontal, -1, DISPLAY_MATRIX_ELEMENTS*sizeof(int)); |
|---|
| 645 | memset(horizontalOpen, -1, DISPLAY_MATRIX_ELEMENTS*sizeof(int)); |
|---|
| 646 | |
|---|
| 647 | # endif |
|---|
| 648 | #endif |
|---|
| 649 | static int deep; |
|---|
| 650 | deep++; |
|---|
| 651 | |
|---|
| 652 | #if (defined (DEBUG) && 0) |
|---|
| 653 | { |
|---|
| 654 | char *d; |
|---|
| 655 | |
|---|
| 656 | d = lstr(seq_array[1]+v1,v3); |
|---|
| 657 | n_decode(d-1,d-1,v3); |
|---|
| 658 | |
|---|
| 659 | for (int cnt=0; cnt<deep; cnt++) putchar(' '); |
|---|
| 660 | printf("master = '%s' (penalties left=%i right=%i)\n", d, st, en); |
|---|
| 661 | d = lstr(seq_array[2]+v2,v4); |
|---|
| 662 | n_decode(d-1,d-1,v4); |
|---|
| 663 | |
|---|
| 664 | for (cnt=0; cnt<deep; cnt++) putchar(' '); |
|---|
| 665 | printf("slave = '%s'\n", d); |
|---|
| 666 | } |
|---|
| 667 | #endif |
|---|
| 668 | |
|---|
| 669 | // q = gap_open + gap_extend; // q is replaced for local positions |
|---|
| 670 | |
|---|
| 671 | if(v4<=0) { // if slave sequence is empty |
|---|
| 672 | if(v3>0) { |
|---|
| 673 | if(last_print<0 ) { |
|---|
| 674 | last_print = displ[print_ptr-1] -= v3; // add .. |
|---|
| 675 | } |
|---|
| 676 | else { |
|---|
| 677 | last_print = displ[print_ptr++] = -(v3); // .. or insert gap of length 'v3' into slave |
|---|
| 678 | } |
|---|
| 679 | } |
|---|
| 680 | |
|---|
| 681 | deep--; |
|---|
| 682 | return master_gapAt(v1,v3); |
|---|
| 683 | } |
|---|
| 684 | |
|---|
| 685 | if(v3<=1) { |
|---|
| 686 | if(v3<=0) { // if master sequence is empty |
|---|
| 687 | add(v4); // ??? insert gap length 'v4' into master ??? |
|---|
| 688 | deep--; |
|---|
| 689 | return slave_gapAt(v2,v4); |
|---|
| 690 | } |
|---|
| 691 | |
|---|
| 692 | awtc_assert(v3==1); |
|---|
| 693 | // if master length == 1 |
|---|
| 694 | |
|---|
| 695 | if (v4==1) |
|---|
| 696 | { |
|---|
| 697 | if(st>en) |
|---|
| 698 | st=en; |
|---|
| 699 | |
|---|
| 700 | /***************if(!v4)*********BUG********************************/ |
|---|
| 701 | |
|---|
| 702 | ctrc = slave_gapAtWithOpenPenalty(v2,v4,st); |
|---|
| 703 | ctrj = 0; |
|---|
| 704 | |
|---|
| 705 | for(j=1;j<=v4;++j) |
|---|
| 706 | { |
|---|
| 707 | k = slave_gapAt(v2, j-1) + calc_weight(1,j,v1,v2) + slave_gapAt(v2+j,v4-j); |
|---|
| 708 | if(k<ctrc) { ctrc = k; ctrj = j; } |
|---|
| 709 | } |
|---|
| 710 | |
|---|
| 711 | if(!ctrj) |
|---|
| 712 | { |
|---|
| 713 | add(v4); |
|---|
| 714 | if(last_print<0) |
|---|
| 715 | last_print = displ[print_ptr-1] -= 1; |
|---|
| 716 | else |
|---|
| 717 | last_print = displ[print_ptr++] = -1; |
|---|
| 718 | } |
|---|
| 719 | else |
|---|
| 720 | { |
|---|
| 721 | if(ctrj>1) |
|---|
| 722 | add(ctrj-1); |
|---|
| 723 | displ[print_ptr++] = last_print = 0; |
|---|
| 724 | if(ctrj<v4) |
|---|
| 725 | add(v4-ctrj); |
|---|
| 726 | } |
|---|
| 727 | |
|---|
| 728 | deep--; |
|---|
| 729 | return ctrc; |
|---|
| 730 | } |
|---|
| 731 | } |
|---|
| 732 | |
|---|
| 733 | awtc_assert(v3>=1 && v4>=1); |
|---|
| 734 | |
|---|
| 735 | // slave length >= 1 |
|---|
| 736 | // master length >= 1 |
|---|
| 737 | |
|---|
| 738 | # define AF 0 |
|---|
| 739 | |
|---|
| 740 | // first column of matrix (slave side): |
|---|
| 741 | IF_MATRIX_DUMP(vertical[0][0]=) |
|---|
| 742 | zza[0] = 0; |
|---|
| 743 | p = master_gap_open(v1); |
|---|
| 744 | for(j=1; j<=v4; j++) |
|---|
| 745 | { |
|---|
| 746 | p += master_gap_extend(v1); |
|---|
| 747 | IF_MATRIX_DUMP(vertical[0][j]=) |
|---|
| 748 | zza[j] = p; |
|---|
| 749 | IF_MATRIX_DUMP(verticalOpen[0][j]=) |
|---|
| 750 | zzb[j] = p + master_gap_open(v1); |
|---|
| 751 | } |
|---|
| 752 | |
|---|
| 753 | // left half of the matrix |
|---|
| 754 | p = st; |
|---|
| 755 | ctri = v3 / 2; |
|---|
| 756 | for(i=1; i<=ctri; i++) |
|---|
| 757 | { |
|---|
| 758 | n = zza[0]; |
|---|
| 759 | p += master_gap_extend(v1+i+AF); |
|---|
| 760 | k = p; |
|---|
| 761 | IF_MATRIX_DUMP(vertical[i][0]=) |
|---|
| 762 | zza[0] = k; |
|---|
| 763 | l = p + master_gap_open(v1+i+AF); |
|---|
| 764 | |
|---|
| 765 | for(j=1; j<=v4; j++) |
|---|
| 766 | { |
|---|
| 767 | // from above (gap in master (behind position i)) |
|---|
| 768 | IF_MATRIX_DUMP(verticalOpen[i][j]=) k += master_gap_open(v1+i+AF)+master_gap_extend(v1+i+AF); // (1) |
|---|
| 769 | IF_MATRIX_DUMP(vertical[i][j]=) l += master_gap_extend(v1+i+AF); // (2) |
|---|
| 770 | if(k<l) l = k; // l=min((1),(2)) |
|---|
| 771 | |
|---|
| 772 | // from left (gap in slave (behind position j)) |
|---|
| 773 | IF_MATRIX_DUMP(horizontalOpen[i][j]=) k = zza[j] + slave_gap_open(v2+j+AF)+slave_gap_extend(v2+j+AF); // (3) |
|---|
| 774 | IF_MATRIX_DUMP(horizontal[i][j]=) m = zzb[j] + slave_gap_extend(v2+j+AF); // (4) |
|---|
| 775 | if(k<m) m = k; // m=min((3),(4)) |
|---|
| 776 | |
|---|
| 777 | // diagonal (no gaps) |
|---|
| 778 | IF_MATRIX_DUMP(diagonal[i][j]=) k = n + calc_weight(i,j,v1,v2); // (5) |
|---|
| 779 | if(l<k) k = l; |
|---|
| 780 | if(m<k) k = m; // k = minimum of all paths |
|---|
| 781 | |
|---|
| 782 | n = zza[j]; // minimum of same row; one column to the left |
|---|
| 783 | zza[j] = k; // minimum of all paths to this matrix position |
|---|
| 784 | zzb[j] = m; // minimum of those two paths, where gap was inserted into slave |
|---|
| 785 | } |
|---|
| 786 | } |
|---|
| 787 | |
|---|
| 788 | # define MHO 1 // X-Offset for second half of matrix-arrays (only used to insert MID-column into dumpMatrix()) |
|---|
| 789 | # define BO 1 // Offset for second half of matrix (cause now the indices i and j are smaller as above) |
|---|
| 790 | |
|---|
| 791 | // last column of matrix: |
|---|
| 792 | |
|---|
| 793 | zzb[0]=zza[0]; |
|---|
| 794 | IF_MATRIX_DUMP(vertical[v3+1+MHO][v4+1]=) |
|---|
| 795 | zzc[v4]=0; |
|---|
| 796 | p = master_gap_open(v1+v3); |
|---|
| 797 | for(j=v4-1; j>-1; j--) |
|---|
| 798 | { |
|---|
| 799 | p += master_gap_extend(v1+v3); |
|---|
| 800 | IF_MATRIX_DUMP(vertical[v3+1+MHO][j+BO]=) |
|---|
| 801 | zzc[j] = p; |
|---|
| 802 | IF_MATRIX_DUMP(verticalOpen[v3+1+MHO][j+BO]=) |
|---|
| 803 | zzd[j] = p+master_gap_open(v1+v3); |
|---|
| 804 | } |
|---|
| 805 | |
|---|
| 806 | // right half of matrix (backwards): |
|---|
| 807 | p = en; |
|---|
| 808 | for(i=v3-1;i>=ctri;i--) |
|---|
| 809 | { |
|---|
| 810 | n = zzc[v4]; |
|---|
| 811 | p += master_gap_extend(v1+i); |
|---|
| 812 | k = p; |
|---|
| 813 | IF_MATRIX_DUMP(vertical[i+BO+MHO][v4+1]=) |
|---|
| 814 | zzc[v4] = k; |
|---|
| 815 | l = p+master_gap_open(v1+i); |
|---|
| 816 | |
|---|
| 817 | for(j=v4-1; j>=0; j--) |
|---|
| 818 | { |
|---|
| 819 | // from below (gap in master (in front of position (i+BO))) |
|---|
| 820 | IF_MATRIX_DUMP(verticalOpen[i+BO+MHO][j+BO]=) k += master_gap_open(v1+i)+master_gap_extend(v1+i); // (1) |
|---|
| 821 | IF_MATRIX_DUMP(vertical[i+BO+MHO][j+BO]=) l += master_gap_extend(v1+i); // (2) |
|---|
| 822 | if(k<l) l = k; // l=min((1),(2)) |
|---|
| 823 | |
|---|
| 824 | // from right (gap in slave (in front of position (j+BO))) |
|---|
| 825 | IF_MATRIX_DUMP(horizontalOpen[i+BO+MHO][j+BO]=) k = zzc[j] + slave_gap_open(v2+j) + slave_gap_extend(v2+j); // (3) |
|---|
| 826 | IF_MATRIX_DUMP(horizontal[i+BO+MHO][j+BO]=) m = zzd[j] + slave_gap_extend(v2+j); // (4) |
|---|
| 827 | if(k<m) m = k; // m=min((3),(4)) |
|---|
| 828 | |
|---|
| 829 | // diagonal (no gaps) |
|---|
| 830 | IF_MATRIX_DUMP(diagonal[i+BO+MHO][j+BO]=) k = n + calc_weight(i+BO,j+BO,v1,v2); // (5) |
|---|
| 831 | if(l<k) k = l; |
|---|
| 832 | if(m<k) k = m; // k = minimum of all paths |
|---|
| 833 | |
|---|
| 834 | n = zzc[j]; // minimum of same row; one column to the right |
|---|
| 835 | zzc[j] = k; // minimum of all paths to this matrix position |
|---|
| 836 | zzd[j] = m; // minimum of those two paths, where gap was inserted into slave |
|---|
| 837 | } |
|---|
| 838 | } |
|---|
| 839 | |
|---|
| 840 | #undef BO |
|---|
| 841 | |
|---|
| 842 | // connect rightmost column of first half (column ctri) |
|---|
| 843 | // with leftmost column of second half (column ctri+1) |
|---|
| 844 | |
|---|
| 845 | zzd[v4] = zzc[v4]; |
|---|
| 846 | |
|---|
| 847 | ctrc = INT_MAX; |
|---|
| 848 | flag = 0; |
|---|
| 849 | |
|---|
| 850 | for(j=(ctri?0:1);j<=v4;j++) // was j=0;... (this was erroneous cause it splitted zero-length subsequences) |
|---|
| 851 | { |
|---|
| 852 | IF_MATRIX_DUMP(vertical[ctri+MHO][j]=) |
|---|
| 853 | k = zza[j] + zzc[j]; // sum up both calculations (=diagonal=no gaps) |
|---|
| 854 | |
|---|
| 855 | if (k<ctrc || ((k==ctrc) && (zza[j]!=zzb[j]) && (zzc[j]==zzd[j]))) |
|---|
| 856 | { |
|---|
| 857 | ctrc = k; |
|---|
| 858 | ctrj = j; |
|---|
| 859 | flag = 1; |
|---|
| 860 | } |
|---|
| 861 | } |
|---|
| 862 | |
|---|
| 863 | for(j=v4;j>=(ctri?0:1);j--) |
|---|
| 864 | { |
|---|
| 865 | IF_MATRIX_DUMP(verticalOpen[ctri+MHO][j]=) |
|---|
| 866 | k = zzb[j] + zzd[j] // paths where gaps were inserted into slave (left and right side!) |
|---|
| 867 | - slave_gap_open(j); // subtract gap_open-penalty which was added twice (at left and right end of gap) |
|---|
| 868 | |
|---|
| 869 | if(k<ctrc) |
|---|
| 870 | { |
|---|
| 871 | ctrc = k; |
|---|
| 872 | ctrj = j; |
|---|
| 873 | flag = 2; |
|---|
| 874 | } |
|---|
| 875 | } |
|---|
| 876 | |
|---|
| 877 | awtc_assert(flag>=1 && flag<=2); |
|---|
| 878 | |
|---|
| 879 | #undef MHO |
|---|
| 880 | |
|---|
| 881 | #ifdef MATRIX_DUMP |
|---|
| 882 | if (display_matrix) |
|---|
| 883 | dumpMatrix(v1,v2,v3,v4,ctri); |
|---|
| 884 | #endif |
|---|
| 885 | |
|---|
| 886 | /* Conquer recursively around midpoint */ |
|---|
| 887 | |
|---|
| 888 | if(flag==1) /* Type 1 gaps */ |
|---|
| 889 | { |
|---|
| 890 | diff(v1,v2,ctri,ctrj,st,master_gap_open(v1+ctri)); // includes midpoint ctri and ctrj |
|---|
| 891 | diff(v1+ctri,v2+ctrj,v3-ctri,v4-ctrj,master_gap_open(v1+ctri),en); |
|---|
| 892 | } |
|---|
| 893 | else |
|---|
| 894 | { |
|---|
| 895 | diff(v1,v2,ctri-1,ctrj,st,0); // includes midpoint ctrj |
|---|
| 896 | |
|---|
| 897 | if(last_print<0) /* Delete 2 */ |
|---|
| 898 | last_print = displ[print_ptr-1] -= 2; |
|---|
| 899 | else |
|---|
| 900 | last_print = displ[print_ptr++] = -2; |
|---|
| 901 | |
|---|
| 902 | diff(v1+ctri+1,v2+ctrj,v3-ctri-1,v4-ctrj,0,en); |
|---|
| 903 | } |
|---|
| 904 | |
|---|
| 905 | deep--; |
|---|
| 906 | return ctrc; /* Return the score of the best alignment */ |
|---|
| 907 | } |
|---|
| 908 | |
|---|
| 909 | static void do_align( /*int v1,*/ int *score, long act_seq_length) |
|---|
| 910 | { |
|---|
| 911 | int i,j,k,l1,l2,n; |
|---|
| 912 | int t_arr[21]; |
|---|
| 913 | |
|---|
| 914 | l1=l2=pos1=pos2=0; |
|---|
| 915 | |
|---|
| 916 | // clear statistics |
|---|
| 917 | |
|---|
| 918 | for(i=1;i<=act_seq_length;++i) |
|---|
| 919 | { |
|---|
| 920 | for(j=0;j<21;++j) naa1[j][i]=naa2[j][i]=0; |
|---|
| 921 | for(j=1;j<21;++j) naas[j][i]=0; |
|---|
| 922 | } |
|---|
| 923 | |
|---|
| 924 | // create position statistics for each group |
|---|
| 925 | // [here every group contains only one seq] |
|---|
| 926 | |
|---|
| 927 | for(i=1;i<=nseqs;++i) |
|---|
| 928 | { |
|---|
| 929 | if(group[i]==1) |
|---|
| 930 | { |
|---|
| 931 | fst_list[++pos1]=i; |
|---|
| 932 | for(j=1;j<=seqlen_array[i];++j) |
|---|
| 933 | if(seq_array[i][j]>0) |
|---|
| 934 | { |
|---|
| 935 | ++naa1[seq_array[i][j]][j]; |
|---|
| 936 | ++naa1[0][j]; |
|---|
| 937 | } |
|---|
| 938 | if(seqlen_array[i]>l1) l1=seqlen_array[i]; |
|---|
| 939 | } |
|---|
| 940 | else if(group[i]==2) |
|---|
| 941 | { |
|---|
| 942 | snd_list[++pos2]=i; |
|---|
| 943 | for(j=1;j<=seqlen_array[i];++j) |
|---|
| 944 | if(seq_array[i][j]>0) |
|---|
| 945 | { |
|---|
| 946 | ++naa2[seq_array[i][j]][j]; |
|---|
| 947 | ++naa2[0][j]; |
|---|
| 948 | } |
|---|
| 949 | if(seqlen_array[i]>l2) l2=seqlen_array[i]; |
|---|
| 950 | } |
|---|
| 951 | } |
|---|
| 952 | |
|---|
| 953 | if(pos1>=pos2) |
|---|
| 954 | { |
|---|
| 955 | for(i=1;i<=pos2;++i) alist[i]=snd_list[i]; |
|---|
| 956 | for(n=1;n<=l1;++n) |
|---|
| 957 | { |
|---|
| 958 | for(i=1;i<21;++i) t_arr[i]=0; |
|---|
| 959 | for(i=1;i<21;++i) |
|---|
| 960 | if(naa1[i][n]>0) |
|---|
| 961 | for(j=1;j<21;++j) |
|---|
| 962 | t_arr[j] += (weights[i][j]*naa1[i][n]); |
|---|
| 963 | k = naa1[0][n]; |
|---|
| 964 | if(k>0) |
|---|
| 965 | for(i=1;i<21;++i) |
|---|
| 966 | naas[i][n]=t_arr[i]/k; |
|---|
| 967 | } |
|---|
| 968 | } |
|---|
| 969 | else |
|---|
| 970 | { |
|---|
| 971 | MAXN_2_assert(0); // should never occur if MAXN==2 |
|---|
| 972 | #if (MAXN!=2) |
|---|
| 973 | for(i=1;i<=pos1;++i) alist[i]=fst_list[i]; |
|---|
| 974 | for(n=1;n<=l2;++n) |
|---|
| 975 | { |
|---|
| 976 | for(i=1;i<21;++i) t_arr[i]=0; |
|---|
| 977 | for(i=1;i<21;++i) |
|---|
| 978 | if(naa2[i][n]>0) |
|---|
| 979 | for(j=1;j<21;++j) |
|---|
| 980 | t_arr[j] += (weights[i][j]*naa2[i][n]); |
|---|
| 981 | k = naa2[0][n]; |
|---|
| 982 | if(k>0) |
|---|
| 983 | for(i=1;i<21;++i) |
|---|
| 984 | naas[i][n]=t_arr[i]/k; |
|---|
| 985 | } |
|---|
| 986 | #endif |
|---|
| 987 | } |
|---|
| 988 | |
|---|
| 989 | *score=diff(1,1,l1,l2,master_gap_open(1),master_gap_open(l1+1)/*v1,v1*/); /* Myers and Miller alignment now */ |
|---|
| 990 | } |
|---|
| 991 | |
|---|
| 992 | /* |
|---|
| 993 | static void group_gap(int len,int sclass,char *seq) |
|---|
| 994 | { |
|---|
| 995 | int i,j,k,xtra; |
|---|
| 996 | |
|---|
| 997 | for(i=1;i<=nseqs;++i) |
|---|
| 998 | if(group[i] == sclass) |
|---|
| 999 | { |
|---|
| 1000 | xtra = len - seqlen_array[i]; |
|---|
| 1001 | if(xtra>0) |
|---|
| 1002 | for(j=1;j<=xtra;++j) |
|---|
| 1003 | seq_array[i][seqlen_array[i]+j] = -1; |
|---|
| 1004 | for(j=1;j<=len;++j) |
|---|
| 1005 | if(seq[j] == '-') { |
|---|
| 1006 | for(k=len;k>=j+1;--k) |
|---|
| 1007 | seq_array[i][k] = seq_array[i][k-1]; |
|---|
| 1008 | seq_array[i][j] = -1; |
|---|
| 1009 | } |
|---|
| 1010 | seqlen_array[i] = len; |
|---|
| 1011 | } |
|---|
| 1012 | } |
|---|
| 1013 | */ |
|---|
| 1014 | |
|---|
| 1015 | static int add_ggaps(long /*max_seq_length*/) |
|---|
| 1016 | { |
|---|
| 1017 | int i,j,k,pos,to_do; |
|---|
| 1018 | |
|---|
| 1019 | pos=1; |
|---|
| 1020 | to_do=print_ptr; |
|---|
| 1021 | |
|---|
| 1022 | for(i=0;i<to_do;++i) // was: 1 .. <=to_do |
|---|
| 1023 | { |
|---|
| 1024 | if(displ[i]==0) |
|---|
| 1025 | { |
|---|
| 1026 | result[1][pos]=result[2][pos]='*'; |
|---|
| 1027 | ++pos; |
|---|
| 1028 | } |
|---|
| 1029 | else |
|---|
| 1030 | { |
|---|
| 1031 | if((k=displ[i])>0) |
|---|
| 1032 | { |
|---|
| 1033 | for(j=0;j<=k-1;++j) |
|---|
| 1034 | { |
|---|
| 1035 | result[2][pos+j]='*'; |
|---|
| 1036 | result[1][pos+j]='-'; |
|---|
| 1037 | } |
|---|
| 1038 | pos += k; |
|---|
| 1039 | } |
|---|
| 1040 | else |
|---|
| 1041 | { |
|---|
| 1042 | k = (displ[i]<0) ? displ[i] * -1 : displ[i]; |
|---|
| 1043 | for(j=0;j<=k-1;++j) |
|---|
| 1044 | { |
|---|
| 1045 | result[1][pos+j]='*'; |
|---|
| 1046 | result[2][pos+j]='-'; |
|---|
| 1047 | } |
|---|
| 1048 | pos += k; |
|---|
| 1049 | } |
|---|
| 1050 | } |
|---|
| 1051 | } |
|---|
| 1052 | |
|---|
| 1053 | #ifdef DEBUG |
|---|
| 1054 | result[1][pos] = 0; |
|---|
| 1055 | result[2][pos] = 0; |
|---|
| 1056 | #endif |
|---|
| 1057 | |
|---|
| 1058 | return pos-1; |
|---|
| 1059 | } |
|---|
| 1060 | |
|---|
| 1061 | |
|---|
| 1062 | static int res_index(const char *t,char c) |
|---|
| 1063 | { |
|---|
| 1064 | if (t[0]==c) |
|---|
| 1065 | return UNKNOWN_ACID; |
|---|
| 1066 | for (int i=1; t[i]; i++) |
|---|
| 1067 | if (t[i]==c) |
|---|
| 1068 | return i; |
|---|
| 1069 | return 0; |
|---|
| 1070 | } |
|---|
| 1071 | |
|---|
| 1072 | static GB_ERROR p_encode(const char *seq, char *naseq, int l) |
|---|
| 1073 | { /* code seq as ints .. use -2 for gap */ |
|---|
| 1074 | int i; |
|---|
| 1075 | /* static char *aacids="CSTPAGNDEQHRKMILVFYW";*/ |
|---|
| 1076 | |
|---|
| 1077 | for(i=1; i<=l; i++) |
|---|
| 1078 | { |
|---|
| 1079 | int c = res_index(amino_acid_order,seq[i]); |
|---|
| 1080 | |
|---|
| 1081 | if (!c) |
|---|
| 1082 | { |
|---|
| 1083 | if (seq[i]!='?') { // consensus contains ??? |
|---|
| 1084 | char buf[100]; |
|---|
| 1085 | sprintf(buf, "Illegal character '%c' in sequence data", seq[i]); |
|---|
| 1086 | aw_message(buf); |
|---|
| 1087 | } |
|---|
| 1088 | |
|---|
| 1089 | c = res_index(amino_acid_order, 'X'); |
|---|
| 1090 | } |
|---|
| 1091 | |
|---|
| 1092 | awtc_assert(c>0); |
|---|
| 1093 | naseq[i] = c; |
|---|
| 1094 | } |
|---|
| 1095 | |
|---|
| 1096 | return 0; |
|---|
| 1097 | } |
|---|
| 1098 | |
|---|
| 1099 | static GB_ERROR n_encode(const char *seq,char *naseq,int l) |
|---|
| 1100 | { /* code seq as ints .. use -2 for gap */ |
|---|
| 1101 | int i; |
|---|
| 1102 | /* static char *nucs="ACGTU"; */ |
|---|
| 1103 | int warned = 0; |
|---|
| 1104 | |
|---|
| 1105 | for(i=1; i<=l; i++) |
|---|
| 1106 | { |
|---|
| 1107 | int c = res_index(nucleic_acid_order,seq[i]); |
|---|
| 1108 | |
|---|
| 1109 | if (!c) |
|---|
| 1110 | { |
|---|
| 1111 | if (seq[i]!='?') { // consensus contains ??? |
|---|
| 1112 | if (!warned) { |
|---|
| 1113 | char buf[100]; |
|---|
| 1114 | sprintf(buf, "Illegal character '%c' in sequence data", seq[i]); |
|---|
| 1115 | aw_message(buf); |
|---|
| 1116 | warned = 1; |
|---|
| 1117 | } |
|---|
| 1118 | } |
|---|
| 1119 | c = res_index(nucleic_acid_order, 'N'); |
|---|
| 1120 | } |
|---|
| 1121 | |
|---|
| 1122 | awtc_assert(c>0); |
|---|
| 1123 | naseq[i] = c; |
|---|
| 1124 | } |
|---|
| 1125 | |
|---|
| 1126 | return 0; |
|---|
| 1127 | } |
|---|
| 1128 | |
|---|
| 1129 | GB_ERROR AWTC_ClustalV_align(int is_dna, int weighted, |
|---|
| 1130 | const char *seq1, int length1, |
|---|
| 1131 | const char *seq2, int length2, |
|---|
| 1132 | const int *gapsBefore1, |
|---|
| 1133 | int max_seq_length, |
|---|
| 1134 | char **resultPtr1, |
|---|
| 1135 | char **resultPtr2, |
|---|
| 1136 | int *resLengthPtr, |
|---|
| 1137 | int *score) |
|---|
| 1138 | { |
|---|
| 1139 | error = 0; |
|---|
| 1140 | gapsBeforePosition = gapsBefore1; |
|---|
| 1141 | |
|---|
| 1142 | if (!module_initialized) // initialize only once |
|---|
| 1143 | { |
|---|
| 1144 | dnaflag = is_dna; |
|---|
| 1145 | is_weight = weighted; |
|---|
| 1146 | |
|---|
| 1147 | init_myers(max_seq_length); |
|---|
| 1148 | init_show_pair(max_seq_length); |
|---|
| 1149 | make_pamo(0); |
|---|
| 1150 | fill_pam(); |
|---|
| 1151 | |
|---|
| 1152 | for (int i=1; i<=2; i++) |
|---|
| 1153 | { |
|---|
| 1154 | seq_array[i] = (char*)ckalloc((max_seq_length+2)*sizeof(char)); |
|---|
| 1155 | result[i] = (char*)ckalloc((max_seq_length+2)*sizeof(char)); |
|---|
| 1156 | group[i] = i; |
|---|
| 1157 | } |
|---|
| 1158 | |
|---|
| 1159 | if (error) goto ende; |
|---|
| 1160 | |
|---|
| 1161 | module_initialized = 1; |
|---|
| 1162 | } |
|---|
| 1163 | else |
|---|
| 1164 | { |
|---|
| 1165 | if (dnaflag!=is_dna || is_weight!=weighted) |
|---|
| 1166 | { |
|---|
| 1167 | return "Please call AWTC_ClustalV_align_exit() between calls that differ in\n" |
|---|
| 1168 | "one of the following parameters:\n" |
|---|
| 1169 | " is_dna, weighted"; |
|---|
| 1170 | } |
|---|
| 1171 | } |
|---|
| 1172 | |
|---|
| 1173 | nseqs = 2; |
|---|
| 1174 | print_ptr = 0; |
|---|
| 1175 | |
|---|
| 1176 | #if defined(DEBUG) && 1 |
|---|
| 1177 | memset(&seq_array[1][1], 0, max_seq_length*sizeof(seq_array[1][1])); |
|---|
| 1178 | memset(&seq_array[2][1], 0, max_seq_length*sizeof(seq_array[2][1])); |
|---|
| 1179 | memset(&displ[1], 0xff, max_seq_length*sizeof(displ[1])); |
|---|
| 1180 | seq_array[1][0] = '_'; |
|---|
| 1181 | seq_array[2][0] = '_'; |
|---|
| 1182 | #endif |
|---|
| 1183 | |
|---|
| 1184 | { |
|---|
| 1185 | GB_ERROR (*encode)(const char*,char*,int) = dnaflag ? n_encode : p_encode; |
|---|
| 1186 | |
|---|
| 1187 | error = encode(seq1-1, seq_array[1], length1); |
|---|
| 1188 | if (error) goto ende; |
|---|
| 1189 | seqlen_array[1] = length1; |
|---|
| 1190 | |
|---|
| 1191 | error = encode(seq2-1, seq_array[2], length2); |
|---|
| 1192 | if (error) goto ende; |
|---|
| 1193 | seqlen_array[2] = length2; |
|---|
| 1194 | |
|---|
| 1195 | do_align(/*gap_open,*/ score,max(length1,length2)); |
|---|
| 1196 | |
|---|
| 1197 | int alignedLength = add_ggaps(max_seq_length); |
|---|
| 1198 | |
|---|
| 1199 | *resultPtr1 = result[1]+1; |
|---|
| 1200 | *resultPtr2 = result[2]+1; |
|---|
| 1201 | *resLengthPtr = alignedLength; |
|---|
| 1202 | } |
|---|
| 1203 | |
|---|
| 1204 | ende: |
|---|
| 1205 | |
|---|
| 1206 | return error; |
|---|
| 1207 | } |
|---|
| 1208 | |
|---|
| 1209 | void AWTC_ClustalV_align_exit(void) |
|---|
| 1210 | { |
|---|
| 1211 | if (module_initialized) |
|---|
| 1212 | { |
|---|
| 1213 | module_initialized = 0; |
|---|
| 1214 | |
|---|
| 1215 | for (int i=1; i<=2; i++) |
|---|
| 1216 | { |
|---|
| 1217 | free(result[i]); |
|---|
| 1218 | free(seq_array[i]); |
|---|
| 1219 | } |
|---|
| 1220 | exit_show_pair(); |
|---|
| 1221 | exit_myers(); |
|---|
| 1222 | } |
|---|
| 1223 | } |
|---|
| 1224 | |
|---|
| 1225 | |
|---|
| 1226 | |
|---|
| 1227 | |
|---|
| 1228 | |
|---|