| 1 | // =============================================================== // |
|---|
| 2 | // // |
|---|
| 3 | // File : ClustalV.cxx // |
|---|
| 4 | // Purpose : // |
|---|
| 5 | // // |
|---|
| 6 | // Coded based on ClustalV by Ralf Westram (coder@reallysoft.de) // |
|---|
| 7 | // Institute of Microbiology (Technical University Munich) // |
|---|
| 8 | // http://www.arb-home.de/ // |
|---|
| 9 | // // |
|---|
| 10 | // =============================================================== // |
|---|
| 11 | |
|---|
| 12 | #include "ClustalV.hxx" |
|---|
| 13 | #include "seq_search.hxx" |
|---|
| 14 | |
|---|
| 15 | #include <cctype> |
|---|
| 16 | #include <climits> |
|---|
| 17 | |
|---|
| 18 | |
|---|
| 19 | // ---------------------------------------------------------------- |
|---|
| 20 | |
|---|
| 21 | #define MASTER_GAP_OPEN 50 |
|---|
| 22 | #define CHEAP_GAP_OPEN 20 // penalty for creating a gap (if we have gaps in master) |
|---|
| 23 | |
|---|
| 24 | #define DEFAULT_GAP_OPEN 30 // penalty for creating a gap |
|---|
| 25 | |
|---|
| 26 | #define MASTER_GAP_EXTEND 18 |
|---|
| 27 | #define CHEAP_GAP_EXTEND 5 // penalty for extending a gap (if we have gaps in master) |
|---|
| 28 | |
|---|
| 29 | #define DEFAULT_GAP_EXTEND 10 // penalty for extending a gap |
|---|
| 30 | |
|---|
| 31 | #define DEFAULT_IMPROBABLY_MUTATION 10 // penalty for mutations |
|---|
| 32 | #define DEFAULT_PROBABLY_MUTATION 4 // penalty for special mutations (A<->G,C<->U/T) (only if 'weighted') |
|---|
| 33 | |
|---|
| 34 | #define DYNAMIC_PENALTIES // otherwise you get FIXED PENALTIES (=no cheap penalties) |
|---|
| 35 | |
|---|
| 36 | // ---------------------------------------------------------------- |
|---|
| 37 | |
|---|
| 38 | #define MAX_GAP_OPEN_DISCOUNT (DEFAULT_GAP_OPEN-CHEAP_GAP_OPEN) // maximum subtracted from DEFAULT_GAP_OPEN |
|---|
| 39 | #define MAX_GAP_EXTEND_DISCOUNT (DEFAULT_GAP_EXTEND-CHEAP_GAP_EXTEND) // maximum subtracted from DEFAULT_GAP_EXTEND |
|---|
| 40 | |
|---|
| 41 | #define MAXN 2 // Maximum number of sequences (both groups) |
|---|
| 42 | |
|---|
| 43 | static bool module_initialized = false; |
|---|
| 44 | |
|---|
| 45 | typedef int Boolean; |
|---|
| 46 | |
|---|
| 47 | static Boolean dnaflag; |
|---|
| 48 | static Boolean is_weight; |
|---|
| 49 | |
|---|
| 50 | #define MAX_BASETYPES 21 |
|---|
| 51 | |
|---|
| 52 | static int xover; |
|---|
| 53 | static int little_pam; |
|---|
| 54 | static int big_pam; |
|---|
| 55 | static int pamo[(MAX_BASETYPES-1)*MAX_BASETYPES/2]; |
|---|
| 56 | static int pam[MAX_BASETYPES][MAX_BASETYPES]; |
|---|
| 57 | |
|---|
| 58 | static int pos1; |
|---|
| 59 | static int pos2; |
|---|
| 60 | static int **naa1; // naa1[basetype][position] counts bases for each position of all sequences in group1 |
|---|
| 61 | static int **naa2; // naa2[basetype][position] same for group2 |
|---|
| 62 | static int **naas; // |
|---|
| 63 | static int seqlen_array[MAXN+1]; // length of all sequences |
|---|
| 64 | static unsigned char *seq_array[MAXN+1]; // the sequences |
|---|
| 65 | static int group[MAXN+1]; // group of sequence |
|---|
| 66 | static int alist[MAXN+1]; // indices of sequences to be aligned |
|---|
| 67 | static int fst_list[MAXN+1]; |
|---|
| 68 | static int snd_list[MAXN+1]; |
|---|
| 69 | |
|---|
| 70 | static int nseqs; // # of sequences |
|---|
| 71 | static int weights[MAX_BASETYPES][MAX_BASETYPES]; // weights[b1][b2] : penalty for mutation from base 'b1' to base 'b2' |
|---|
| 72 | |
|---|
| 73 | #if defined(ASSERTION_USED) |
|---|
| 74 | static size_t displ_size = 0; |
|---|
| 75 | #endif // ASSERTION_USED |
|---|
| 76 | |
|---|
| 77 | static int *displ; // displ == 0 -> base in both , displ<0 -> displ gaps in slave, displ>0 -> displ gaps in master |
|---|
| 78 | static int *zza; // column (left->right) of align matrix (minimum of all paths to this matrix element) |
|---|
| 79 | static int *zzb; // -------------- " ------------------- (minimum of all paths, where gap inserted into slave) |
|---|
| 80 | static int *zzc; // column (left<-right) of align matrix (minimum of all paths to this matrix element) |
|---|
| 81 | static int *zzd; // -------------- " ------------------- (minimum of all paths, where gap inserted into slave) |
|---|
| 82 | static int print_ptr; |
|---|
| 83 | static int last_print; |
|---|
| 84 | |
|---|
| 85 | static const int *gaps_before_position; |
|---|
| 86 | |
|---|
| 87 | #if defined(DEBUG) |
|---|
| 88 | // #define MATRIX_DUMP |
|---|
| 89 | // #define DISPLAY_DIFF |
|---|
| 90 | #endif // DEBUG |
|---|
| 91 | |
|---|
| 92 | #ifdef MATRIX_DUMP |
|---|
| 93 | # define IF_MATRIX_DUMP(xxx) xxx |
|---|
| 94 | # define DISPLAY_MATRIX_SIZE 3000 |
|---|
| 95 | |
|---|
| 96 | static int vertical [DISPLAY_MATRIX_SIZE+2][DISPLAY_MATRIX_SIZE+2]; |
|---|
| 97 | static int verticalOpen [DISPLAY_MATRIX_SIZE+2][DISPLAY_MATRIX_SIZE+2]; |
|---|
| 98 | static int diagonal [DISPLAY_MATRIX_SIZE+2][DISPLAY_MATRIX_SIZE+2]; |
|---|
| 99 | static int horizontal [DISPLAY_MATRIX_SIZE+2][DISPLAY_MATRIX_SIZE+2]; |
|---|
| 100 | static int horizontalOpen [DISPLAY_MATRIX_SIZE+2][DISPLAY_MATRIX_SIZE+2]; |
|---|
| 101 | |
|---|
| 102 | #else |
|---|
| 103 | # define IF_MATRIX_DUMP(xxx) |
|---|
| 104 | #endif |
|---|
| 105 | |
|---|
| 106 | inline int master_gap_open(int beforePosition) { |
|---|
| 107 | #ifdef DYNAMIC_PENALTIES |
|---|
| 108 | long gaps = gaps_before_position[beforePosition-1]; |
|---|
| 109 | return gaps ? MASTER_GAP_OPEN - MAX_GAP_OPEN_DISCOUNT : MASTER_GAP_OPEN; |
|---|
| 110 | |
|---|
| 111 | /* return |
|---|
| 112 | gaps >= MAX_GAP_OPEN_DISCOUNT |
|---|
| 113 | ? DEFAULT_GAP_OPEN-MAX_GAP_OPEN_DISCOUNT |
|---|
| 114 | : DEFAULT_GAP_OPEN-gaps; */ |
|---|
| 115 | #else |
|---|
| 116 | return DEFAULT_GAP_OPEN; |
|---|
| 117 | #endif |
|---|
| 118 | } |
|---|
| 119 | inline int master_gap_extend(int beforePosition) { |
|---|
| 120 | #ifdef DYNAMIC_PENALTIES |
|---|
| 121 | long gaps = gaps_before_position[beforePosition-1]; |
|---|
| 122 | return gaps ? MASTER_GAP_EXTEND - MAX_GAP_EXTEND_DISCOUNT : MASTER_GAP_EXTEND; |
|---|
| 123 | /* return |
|---|
| 124 | gaps >= MAX_GAP_EXTEND_DISCOUNT |
|---|
| 125 | ? DEFAULT_GAP_EXTEND-MAX_GAP_EXTEND_DISCOUNT |
|---|
| 126 | : DEFAULT_GAP_EXTEND-gaps; */ |
|---|
| 127 | #else |
|---|
| 128 | return DEFAULT_GAP_EXTEND; |
|---|
| 129 | #endif |
|---|
| 130 | } |
|---|
| 131 | |
|---|
| 132 | inline int master_gapAtWithOpenPenalty(int atPosition, int length, int penalty) { |
|---|
| 133 | if (length<=0) return 0; |
|---|
| 134 | |
|---|
| 135 | int beforePosition = atPosition; |
|---|
| 136 | int afterPosition = atPosition-1; |
|---|
| 137 | |
|---|
| 138 | while (length--) { |
|---|
| 139 | int p1 = master_gap_extend(beforePosition); |
|---|
| 140 | int p2 = master_gap_extend(afterPosition+1); |
|---|
| 141 | |
|---|
| 142 | if (p1<p2 && beforePosition>1) { |
|---|
| 143 | penalty += p1; |
|---|
| 144 | beforePosition--; |
|---|
| 145 | } |
|---|
| 146 | else { |
|---|
| 147 | penalty += p2; |
|---|
| 148 | afterPosition++; |
|---|
| 149 | } |
|---|
| 150 | } |
|---|
| 151 | |
|---|
| 152 | return penalty; |
|---|
| 153 | } |
|---|
| 154 | |
|---|
| 155 | inline int master_gapAt(int atPosition, int length) { |
|---|
| 156 | return master_gapAtWithOpenPenalty(atPosition, length, master_gap_open(atPosition)); |
|---|
| 157 | } |
|---|
| 158 | |
|---|
| 159 | inline int slave_gap_open(int /* beforePosition */) { |
|---|
| 160 | return DEFAULT_GAP_OPEN; |
|---|
| 161 | } |
|---|
| 162 | |
|---|
| 163 | inline int slave_gap_extend(int /* beforePosition */) { |
|---|
| 164 | return DEFAULT_GAP_EXTEND; |
|---|
| 165 | } |
|---|
| 166 | |
|---|
| 167 | inline int slave_gapAtWithOpenPenalty(int atPosition, int length, int penalty) { |
|---|
| 168 | return length<=0 ? 0 : penalty + length*slave_gap_extend(atPosition); |
|---|
| 169 | } |
|---|
| 170 | |
|---|
| 171 | inline int slave_gapAt(int atPosition, int length) { |
|---|
| 172 | return slave_gapAtWithOpenPenalty(atPosition, length, slave_gap_open(atPosition)); |
|---|
| 173 | } |
|---|
| 174 | |
|---|
| 175 | #define UNKNOWN_ACID 255 |
|---|
| 176 | static const char *amino_acid_order = "XCSTPAGNDEQHRKMILVFYW"; |
|---|
| 177 | |
|---|
| 178 | #define NUCLEIDS 16 |
|---|
| 179 | static const char *nucleic_acid_order = "-ACGTUMRWSYKVHDBN"; |
|---|
| 180 | static const char *nucleic_maybe_A = "-A----AAA---AAA-A"; |
|---|
| 181 | static const char *nucleic_maybe_C = "--C---C--CC-CC-CC"; |
|---|
| 182 | static const char *nucleic_maybe_G = "---G---G-G-GG-GGG"; |
|---|
| 183 | static const char *nucleic_maybe_T = "----T---T-TT-TTTT"; |
|---|
| 184 | static const char *nucleic_maybe_U = "-----U--U-UU-UUUU"; |
|---|
| 185 | static const char *nucleic_maybe[6] = { NULp, nucleic_maybe_A, nucleic_maybe_C, nucleic_maybe_G, nucleic_maybe_T, nucleic_maybe_U }; |
|---|
| 186 | |
|---|
| 187 | /* |
|---|
| 188 | * M = A or C S = G or C V = A or G or C N = A or C or G or T |
|---|
| 189 | * R = A or G Y = C or T H = A or C or T |
|---|
| 190 | * W = A or T K = G or T D = A or G or T |
|---|
| 191 | */ |
|---|
| 192 | |
|---|
| 193 | #define cheap_if(cond) ((cond) ? 1 : 2) |
|---|
| 194 | static int baseCmp(unsigned char c1, unsigned char c2) { |
|---|
| 195 | // c1,c2 == 1=A,2=C (==index of character in nucleic_acid_order[]) |
|---|
| 196 | // returns 0 for equal |
|---|
| 197 | // 1 for probably mutations |
|---|
| 198 | // 2 for improbably mutations |
|---|
| 199 | #define COMPARABLE_BASES 5 |
|---|
| 200 | |
|---|
| 201 | if (c1==c2) return 0; |
|---|
| 202 | |
|---|
| 203 | if (c2<c1) { // swap |
|---|
| 204 | int c3 = c1; |
|---|
| 205 | |
|---|
| 206 | c1 = c2; |
|---|
| 207 | c2 = c3; |
|---|
| 208 | } |
|---|
| 209 | |
|---|
| 210 | if (c2<=COMPARABLE_BASES) { |
|---|
| 211 | fa_assert(c1<=COMPARABLE_BASES); |
|---|
| 212 | |
|---|
| 213 | switch (c1) { |
|---|
| 214 | case 0: return 2; |
|---|
| 215 | case 1: return cheap_if(c2==3); // A->G |
|---|
| 216 | case 2: return cheap_if(c2==4 || c2==5); // C->T/U |
|---|
| 217 | case 3: return cheap_if(c2==1); // G->A |
|---|
| 218 | case 4: if (c2==5) return 0; // T->U |
|---|
| 219 | FALLTHROUGH; |
|---|
| 220 | case 5: if (c2==4) return 0; // U->T |
|---|
| 221 | return cheap_if(c2==2); // T/U->C |
|---|
| 222 | default: fa_assert(0); |
|---|
| 223 | } |
|---|
| 224 | } |
|---|
| 225 | |
|---|
| 226 | int bestMatch = 3; |
|---|
| 227 | if (c1<=COMPARABLE_BASES) { |
|---|
| 228 | for (int i=1; i<=COMPARABLE_BASES; i++) { |
|---|
| 229 | if (isalpha(nucleic_maybe[i][c2])) { // 'c2' maybe a 'i' |
|---|
| 230 | int match = baseCmp(c1, i); |
|---|
| 231 | if (match<bestMatch) bestMatch = match; |
|---|
| 232 | } |
|---|
| 233 | } |
|---|
| 234 | } |
|---|
| 235 | else { |
|---|
| 236 | for (int i=1; i<=COMPARABLE_BASES; i++) { |
|---|
| 237 | if (isalpha(nucleic_maybe[i][c1])) { // 'c1' maybe a 'i' |
|---|
| 238 | int match = baseCmp(i, c2); |
|---|
| 239 | if (match<bestMatch) bestMatch = match; |
|---|
| 240 | } |
|---|
| 241 | } |
|---|
| 242 | } |
|---|
| 243 | |
|---|
| 244 | fa_assert(bestMatch>=0 && bestMatch<=2); |
|---|
| 245 | return bestMatch; |
|---|
| 246 | } |
|---|
| 247 | #undef cheap_if |
|---|
| 248 | |
|---|
| 249 | int baseMatch(char c1, char c2) { |
|---|
| 250 | // c1,c2 == ACGUTRS... |
|---|
| 251 | // returns 0 for equal |
|---|
| 252 | // 1 for probably mutations |
|---|
| 253 | // 2 for improbably mutations |
|---|
| 254 | // -1 if one char is illegal |
|---|
| 255 | |
|---|
| 256 | const char *p1 = strchr(nucleic_acid_order, c1); |
|---|
| 257 | const char *p2 = strchr(nucleic_acid_order, c2); |
|---|
| 258 | |
|---|
| 259 | fa_assert(c1); |
|---|
| 260 | fa_assert(c2); |
|---|
| 261 | |
|---|
| 262 | if (p1 && p2) return baseCmp(p1-nucleic_acid_order, p2-nucleic_acid_order); |
|---|
| 263 | return -1; |
|---|
| 264 | } |
|---|
| 265 | |
|---|
| 266 | static int getPenalty(char c1, char c2) { |
|---|
| 267 | // c1,c2 = A=0,C=1,... s.o. |
|---|
| 268 | switch (baseCmp(c1, c2)) { |
|---|
| 269 | case 1: return DEFAULT_PROBABLY_MUTATION; |
|---|
| 270 | case 2: return DEFAULT_IMPROBABLY_MUTATION; |
|---|
| 271 | default: break; |
|---|
| 272 | } |
|---|
| 273 | |
|---|
| 274 | return 0; |
|---|
| 275 | } |
|---|
| 276 | |
|---|
| 277 | static char *result[3]; // result buffers |
|---|
| 278 | |
|---|
| 279 | static char pam250mt[] = { |
|---|
| 280 | 12, |
|---|
| 281 | 0, 2, |
|---|
| 282 | -2, 1, 3, |
|---|
| 283 | -3, 1, 0, 6, |
|---|
| 284 | -2, 1, 1, 1, 2, |
|---|
| 285 | -3, 1, 0, -1, 1, 5, |
|---|
| 286 | -4, 1, 0, -1, 0, 0, 2, |
|---|
| 287 | -5, 0, 0, -1, 0, 1, 2, 4, |
|---|
| 288 | -5, 0, 0, -1, 0, 0, 1, 3, 4, |
|---|
| 289 | -5, -1, -1, 0, 0, -1, 1, 2, 2, 4, |
|---|
| 290 | -3, -1, -1, 0, -1, -2, 2, 1, 1, 3, 6, |
|---|
| 291 | -4, 0, -1, 0, -2, -3, 0, -1, -1, 1, 2, 6, |
|---|
| 292 | -5, 0, 0, -1, -1, -2, 1, 0, 0, 1, 0, 3, 5, |
|---|
| 293 | -5, -2, -1, -2, -1, -3, -2, -3, -2, -1, -2, 0, 0, 6, |
|---|
| 294 | -2, -1, 0, -2, -1, -3, -2, -2, -2, -2, -2, -2, -2, 2, 5, |
|---|
| 295 | -6, -3, -2, -3, -2, -4, -3, -4, -3, -2, -2, -3, -3, 4, 2, 6, |
|---|
| 296 | -2, -1, 0, -1, 0, -1, -2, -2, -2, -2, -2, -2, -2, 2, 4, 2, 4, |
|---|
| 297 | -4, -3, -3, -5, -4, -5, -4, -6, -5, -5, -2, -4, -5, 0, 1, 2, -1, 9, |
|---|
| 298 | 0, -3, -3, -5, -3, -5, -2, -4, -4, -4, 0, -4, -4, -2, -1, -1, -2, 7, 10, |
|---|
| 299 | -8, -2, -5, -6, -6, -7, -4, -7, -7, -5, -3, 2, -3, -4, -5, -2, -6, 0, 0, 17 |
|---|
| 300 | }; |
|---|
| 301 | |
|---|
| 302 | static char *matptr = pam250mt; |
|---|
| 303 | |
|---|
| 304 | #if (defined(DISPLAY_DIFF) || defined(MATRIX_DUMP)) |
|---|
| 305 | static void p_decode(const unsigned char *naseq, unsigned char *seq, int l) { |
|---|
| 306 | int len = strlen(amino_acid_order); |
|---|
| 307 | |
|---|
| 308 | for (int i=1; i<=l && naseq[i]; i++) { |
|---|
| 309 | fa_assert(naseq[i]<len); |
|---|
| 310 | seq[i] = amino_acid_order[naseq[i]]; |
|---|
| 311 | } |
|---|
| 312 | } |
|---|
| 313 | |
|---|
| 314 | static void n_decode(const unsigned char *naseq, unsigned char *seq, int l) { |
|---|
| 315 | int len = strlen(nucleic_acid_order); |
|---|
| 316 | |
|---|
| 317 | for (int i=1; i<=l && naseq[i]; i++) { |
|---|
| 318 | fa_assert(naseq[i]<len); |
|---|
| 319 | seq[i] = nucleic_acid_order[naseq[i]]; |
|---|
| 320 | } |
|---|
| 321 | } |
|---|
| 322 | #endif |
|---|
| 323 | |
|---|
| 324 | inline ARB_ERROR MAXLENtooSmall() { |
|---|
| 325 | return "ClustalV-aligner: MAXLEN is dimensioned to small for this sequence"; |
|---|
| 326 | } |
|---|
| 327 | |
|---|
| 328 | inline void *ckalloc(size_t bytes, ARB_ERROR& error) { |
|---|
| 329 | if (error) return NULp; |
|---|
| 330 | |
|---|
| 331 | void *ret = malloc(bytes); |
|---|
| 332 | |
|---|
| 333 | if (!ret) error = "out of memory"; |
|---|
| 334 | return ret; |
|---|
| 335 | } |
|---|
| 336 | |
|---|
| 337 | static ARB_ERROR init_myers(long max_seq_length) { |
|---|
| 338 | ARB_ERROR error; |
|---|
| 339 | |
|---|
| 340 | naa1 = (int **)ckalloc(MAX_BASETYPES * sizeof (int *), error); |
|---|
| 341 | naa2 = (int **)ckalloc(MAX_BASETYPES * sizeof (int *), error); |
|---|
| 342 | naas = (int **)ckalloc(MAX_BASETYPES * sizeof (int *), error); |
|---|
| 343 | |
|---|
| 344 | for (int i=0; i<MAX_BASETYPES && !error; i++) { |
|---|
| 345 | naa1[i] = (int *)ckalloc((max_seq_length+1)*sizeof(int), error); |
|---|
| 346 | naa2[i] = (int *)ckalloc((max_seq_length+1)*sizeof(int), error); |
|---|
| 347 | naas[i] = (int *)ckalloc((max_seq_length+1)*sizeof(int), error); |
|---|
| 348 | } |
|---|
| 349 | return error; |
|---|
| 350 | } |
|---|
| 351 | |
|---|
| 352 | static void make_pamo(int nv) { |
|---|
| 353 | little_pam=big_pam=matptr[0]; |
|---|
| 354 | for (int i=0; i<210; ++i) { // LOOP_VECTORIZED |
|---|
| 355 | int c=matptr[i]; |
|---|
| 356 | little_pam=(little_pam<c) ? little_pam : c; |
|---|
| 357 | big_pam=(big_pam>c) ? big_pam : c; |
|---|
| 358 | } |
|---|
| 359 | for (int i=0; i<210; ++i) { // LOOP_VECTORIZED |
|---|
| 360 | pamo[i] = matptr[i]-little_pam; |
|---|
| 361 | } |
|---|
| 362 | nv -= little_pam; |
|---|
| 363 | big_pam -= little_pam; |
|---|
| 364 | xover = big_pam - nv; |
|---|
| 365 | /* |
|---|
| 366 | fprintf(stdout,"\n\nxover= %d, big_pam = %d, little_pam=%d, nv = %d\n\n" |
|---|
| 367 | ,xover,big_pam,little_pam,nv); |
|---|
| 368 | */ |
|---|
| 369 | } |
|---|
| 370 | |
|---|
| 371 | static void fill_pam() { |
|---|
| 372 | int pos = 0; |
|---|
| 373 | |
|---|
| 374 | for (int i=0; i<20; ++i) { |
|---|
| 375 | for (int j=0; j<=i; ++j) { |
|---|
| 376 | pam[i][j]=pamo[pos++]; |
|---|
| 377 | } |
|---|
| 378 | } |
|---|
| 379 | |
|---|
| 380 | for (int i=0; i<20; ++i) { |
|---|
| 381 | for (int j=0; j<=i; ++j) { // LOOP_VECTORIZED[!<8.1] |
|---|
| 382 | pam[j][i]=pam[i][j]; |
|---|
| 383 | } |
|---|
| 384 | } |
|---|
| 385 | |
|---|
| 386 | if (dnaflag) { |
|---|
| 387 | xover=4; |
|---|
| 388 | big_pam=8; |
|---|
| 389 | for (int i=0; i<=NUCLEIDS; ++i) { |
|---|
| 390 | for (int j=0; j<=NUCLEIDS; ++j) { |
|---|
| 391 | weights[i][j] = getPenalty(i, j); |
|---|
| 392 | } |
|---|
| 393 | } |
|---|
| 394 | } |
|---|
| 395 | else { |
|---|
| 396 | for (int i=1; i<MAX_BASETYPES; ++i) { |
|---|
| 397 | for (int j=1; j<MAX_BASETYPES; ++j) { // LOOP_VECTORIZED |
|---|
| 398 | weights[i][j] = big_pam - pam[i-1][j-1]; |
|---|
| 399 | } |
|---|
| 400 | } |
|---|
| 401 | for (int i=0; i<MAX_BASETYPES; ++i) { |
|---|
| 402 | weights[0][i] = xover; |
|---|
| 403 | weights[i][0] = xover; |
|---|
| 404 | } |
|---|
| 405 | } |
|---|
| 406 | } |
|---|
| 407 | |
|---|
| 408 | static void exit_myers() { |
|---|
| 409 | for (int i=0; i<MAX_BASETYPES; i++) { |
|---|
| 410 | free(naas[i]); |
|---|
| 411 | free(naa2[i]); |
|---|
| 412 | free(naa1[i]); |
|---|
| 413 | } |
|---|
| 414 | free(naas); |
|---|
| 415 | free(naa2); |
|---|
| 416 | free(naa1); |
|---|
| 417 | } |
|---|
| 418 | |
|---|
| 419 | static ARB_ERROR init_show_pair(long max_seq_length) { |
|---|
| 420 | ARB_ERROR error; |
|---|
| 421 | |
|---|
| 422 | #if defined(ASSERTION_USED) |
|---|
| 423 | displ_size = (2*max_seq_length + 1); |
|---|
| 424 | #endif // ASSERTION_USED |
|---|
| 425 | |
|---|
| 426 | displ = (int *) ckalloc((2*max_seq_length + 1) * sizeof (int), error); |
|---|
| 427 | last_print = 0; |
|---|
| 428 | |
|---|
| 429 | zza = (int *)ckalloc((max_seq_length+1) * sizeof (int), error); |
|---|
| 430 | zzb = (int *)ckalloc((max_seq_length+1) * sizeof (int), error); |
|---|
| 431 | |
|---|
| 432 | zzc = (int *)ckalloc((max_seq_length+1) * sizeof (int), error); |
|---|
| 433 | zzd = (int *)ckalloc((max_seq_length+1) * sizeof (int), error); |
|---|
| 434 | |
|---|
| 435 | return error; |
|---|
| 436 | } |
|---|
| 437 | |
|---|
| 438 | static void exit_show_pair() { |
|---|
| 439 | freenull(zzd); |
|---|
| 440 | freenull(zzc); |
|---|
| 441 | freenull(zzb); |
|---|
| 442 | freenull(zza); |
|---|
| 443 | freenull(displ); |
|---|
| 444 | #if defined(ASSERTION_USED) |
|---|
| 445 | displ_size = 0; |
|---|
| 446 | #endif // ASSERTION_USED |
|---|
| 447 | } |
|---|
| 448 | |
|---|
| 449 | inline int set_displ(int offset, int value) { |
|---|
| 450 | fa_assert(offset >= 0 && offset<int(displ_size)); |
|---|
| 451 | displ[offset] = value; |
|---|
| 452 | return displ[offset]; |
|---|
| 453 | } |
|---|
| 454 | inline int decr_displ(int offset, int value) { |
|---|
| 455 | fa_assert(offset >= 0 && offset<int(displ_size)); |
|---|
| 456 | displ[offset] -= value; |
|---|
| 457 | return displ[offset]; |
|---|
| 458 | } |
|---|
| 459 | |
|---|
| 460 | |
|---|
| 461 | inline void add(int v) { |
|---|
| 462 | // insert 'v' gaps into master ??? |
|---|
| 463 | if (last_print<0 && print_ptr>0) { |
|---|
| 464 | set_displ(print_ptr-1, v); |
|---|
| 465 | set_displ(print_ptr++, last_print); |
|---|
| 466 | } |
|---|
| 467 | else { |
|---|
| 468 | last_print = set_displ(print_ptr++, v); |
|---|
| 469 | } |
|---|
| 470 | } |
|---|
| 471 | |
|---|
| 472 | inline int calc_weight(int iat, int jat, int v1, int v2) { |
|---|
| 473 | fa_assert(pos1==1 && pos2==1); |
|---|
| 474 | unsigned char j = seq_array[alist[1]][v2+jat-1]; |
|---|
| 475 | return j<128 ? naas[j][v1+iat-1] : 0; |
|---|
| 476 | } |
|---|
| 477 | |
|---|
| 478 | #ifdef MATRIX_DUMP |
|---|
| 479 | |
|---|
| 480 | inline const unsigned char *lstr(const unsigned char *s, int len) { |
|---|
| 481 | static unsigned char *lstr_ss = 0; |
|---|
| 482 | |
|---|
| 483 | freeset(lstr_ss, (unsigned char*)strndup((const char*)s, len)); |
|---|
| 484 | return lstr_ss; |
|---|
| 485 | } |
|---|
| 486 | |
|---|
| 487 | inline const unsigned char *nstr(unsigned char *cp, int length) { |
|---|
| 488 | const unsigned char *s = lstr(cp, length); |
|---|
| 489 | (dnaflag ? n_decode : p_decode)(s-1, const_cast<unsigned char *>(s-1), length); |
|---|
| 490 | return s; |
|---|
| 491 | } |
|---|
| 492 | |
|---|
| 493 | inline void dumpMatrix(int x0, int y0, int breite, int hoehe, int mitte_vert) { |
|---|
| 494 | char *sl = ARB_alloc<char>(hoehe+3); |
|---|
| 495 | char *ma = ARB_alloc<char>(breite+3); |
|---|
| 496 | |
|---|
| 497 | sprintf(ma, "-%s-", nstr(seq_array[1]+x0, breite)); |
|---|
| 498 | sprintf(sl, "-%s-", nstr(seq_array[2]+y0, hoehe)); |
|---|
| 499 | |
|---|
| 500 | printf(" "); |
|---|
| 501 | |
|---|
| 502 | { |
|---|
| 503 | int b; |
|---|
| 504 | for (b=0; b<=mitte_vert; b++) { |
|---|
| 505 | printf("%5c", ma[b]); |
|---|
| 506 | } |
|---|
| 507 | printf(" MID"); |
|---|
| 508 | for (b++; b<=breite+1+1; b++) { |
|---|
| 509 | printf("%5c", ma[b-1]); |
|---|
| 510 | } |
|---|
| 511 | } |
|---|
| 512 | |
|---|
| 513 | for (int h=0; h<=hoehe+1; h++) { |
|---|
| 514 | printf("\n%c vertical: ", sl[h]); |
|---|
| 515 | for (int b=0; b<=breite+1+1; b++) printf("%5i", vertical[b][h]); |
|---|
| 516 | printf("\n verticalOpen: "); |
|---|
| 517 | for (int b=0; b<=breite+1+1; b++) printf("%5i", verticalOpen[b][h]); |
|---|
| 518 | printf("\n diagonal: "); |
|---|
| 519 | for (int b=0; b<=breite+1+1; b++) printf("%5i", diagonal[b][h]); |
|---|
| 520 | printf("\n horizontal: "); |
|---|
| 521 | for (int b=0; b<=breite+1+1; b++) printf("%5i", horizontal[b][h]); |
|---|
| 522 | printf("\n horizontalOpen:"); |
|---|
| 523 | for (int b=0; b<=breite+1+1; b++) printf("%5i", horizontalOpen[b][h]); |
|---|
| 524 | printf("\n"); |
|---|
| 525 | } |
|---|
| 526 | |
|---|
| 527 | printf("--------------------\n"); |
|---|
| 528 | |
|---|
| 529 | free(ma); |
|---|
| 530 | free(sl); |
|---|
| 531 | } |
|---|
| 532 | #endif |
|---|
| 533 | |
|---|
| 534 | static int diff(int v1, int v2, int v3, int v4, int st, int en) { |
|---|
| 535 | /* v1,v3 master sequence (v1=offset, v3=length) |
|---|
| 536 | * v2,v4 slave sequence (v2=offset, v4=length) |
|---|
| 537 | * st,en gap_open-penalties for start and end of the sequence |
|---|
| 538 | * |
|---|
| 539 | * returns costs for inserted gaps |
|---|
| 540 | */ |
|---|
| 541 | #ifdef DEBUG |
|---|
| 542 | # ifdef MATRIX_DUMP |
|---|
| 543 | int display_matrix = 0; |
|---|
| 544 | |
|---|
| 545 | fa_assert(v3<=(DISPLAY_MATRIX_SIZE+2)); // width |
|---|
| 546 | fa_assert(v4<=(DISPLAY_MATRIX_SIZE)); // height |
|---|
| 547 | |
|---|
| 548 | # define DISPLAY_MATRIX_ELEMENTS ((DISPLAY_MATRIX_SIZE+2)*(DISPLAY_MATRIX_SIZE+2)) |
|---|
| 549 | |
|---|
| 550 | memset(vertical, -1, DISPLAY_MATRIX_ELEMENTS*sizeof(int)); |
|---|
| 551 | memset(verticalOpen, -1, DISPLAY_MATRIX_ELEMENTS*sizeof(int)); |
|---|
| 552 | memset(diagonal, -1, DISPLAY_MATRIX_ELEMENTS*sizeof(int)); |
|---|
| 553 | memset(horizontal, -1, DISPLAY_MATRIX_ELEMENTS*sizeof(int)); |
|---|
| 554 | memset(horizontalOpen, -1, DISPLAY_MATRIX_ELEMENTS*sizeof(int)); |
|---|
| 555 | |
|---|
| 556 | # endif |
|---|
| 557 | #endif |
|---|
| 558 | static int deep; |
|---|
| 559 | deep++; |
|---|
| 560 | |
|---|
| 561 | #if (defined (DEBUG) && 0) |
|---|
| 562 | { |
|---|
| 563 | char *d = lstr(seq_array[1]+v1, v3); |
|---|
| 564 | |
|---|
| 565 | (dnaflag ? n_decode : p_decode)(d-1, d-1, v3); |
|---|
| 566 | |
|---|
| 567 | for (int cnt=0; cnt<deep; cnt++) putchar(' '); |
|---|
| 568 | printf("master = '%s' (penalties left=%i right=%i)\n", d, st, en); |
|---|
| 569 | |
|---|
| 570 | d = lstr(seq_array[2]+v2, v4); |
|---|
| 571 | (dnaflag ? n_decode : p_decode)(d-1, d-1, v4); |
|---|
| 572 | |
|---|
| 573 | for (cnt=0; cnt<deep; cnt++) putchar(' '); |
|---|
| 574 | printf("slave = '%s'\n", d); |
|---|
| 575 | } |
|---|
| 576 | #endif |
|---|
| 577 | |
|---|
| 578 | if (v4<=0) { // if slave sequence is empty |
|---|
| 579 | if (v3>0) { |
|---|
| 580 | if (last_print<0 && print_ptr>0) { |
|---|
| 581 | last_print = decr_displ(print_ptr-1, v3); // add .. |
|---|
| 582 | } |
|---|
| 583 | else { |
|---|
| 584 | last_print = set_displ(print_ptr++, -(v3)); // .. or insert gap of length 'v3' into slave |
|---|
| 585 | } |
|---|
| 586 | } |
|---|
| 587 | |
|---|
| 588 | deep--; |
|---|
| 589 | return master_gapAt(v1, v3); |
|---|
| 590 | } |
|---|
| 591 | |
|---|
| 592 | int ctrj = 0; |
|---|
| 593 | if (v3<=1) { |
|---|
| 594 | if (v3<=0) { // if master sequence is empty |
|---|
| 595 | add(v4); // ??? insert gap length 'v4' into master ??? |
|---|
| 596 | deep--; |
|---|
| 597 | return slave_gapAt(v2, v4); |
|---|
| 598 | } |
|---|
| 599 | |
|---|
| 600 | fa_assert(v3==1); |
|---|
| 601 | // if master length == 1 |
|---|
| 602 | |
|---|
| 603 | if (v4==1) { |
|---|
| 604 | if (st>en) st=en; |
|---|
| 605 | |
|---|
| 606 | //!*************if(!v4)*********BUG******************************* |
|---|
| 607 | |
|---|
| 608 | int ctrc = slave_gapAtWithOpenPenalty(v2, v4, st); |
|---|
| 609 | ctrj = 0; |
|---|
| 610 | |
|---|
| 611 | for (int j=1; j<=v4; ++j) { |
|---|
| 612 | int k = slave_gapAt(v2, j-1) + calc_weight(1, j, v1, v2) + slave_gapAt(v2+j, v4-j); |
|---|
| 613 | if (k<ctrc) { ctrc = k; ctrj = j; } |
|---|
| 614 | } |
|---|
| 615 | |
|---|
| 616 | if (!ctrj) { |
|---|
| 617 | add(v4); |
|---|
| 618 | if (last_print<0 && print_ptr>0) { |
|---|
| 619 | last_print = decr_displ(print_ptr-1, 1); |
|---|
| 620 | } |
|---|
| 621 | else { |
|---|
| 622 | last_print = set_displ(print_ptr++, -1); |
|---|
| 623 | } |
|---|
| 624 | } |
|---|
| 625 | else { |
|---|
| 626 | if (ctrj>1) add(ctrj-1); |
|---|
| 627 | set_displ(print_ptr++, last_print = 0); |
|---|
| 628 | if (ctrj<v4) add(v4-ctrj); |
|---|
| 629 | } |
|---|
| 630 | |
|---|
| 631 | deep--; |
|---|
| 632 | return ctrc; |
|---|
| 633 | } |
|---|
| 634 | } |
|---|
| 635 | |
|---|
| 636 | fa_assert(v3>=1 && v4>=1); |
|---|
| 637 | |
|---|
| 638 | // slave length >= 1 |
|---|
| 639 | // master length >= 1 |
|---|
| 640 | |
|---|
| 641 | # define AF 0 |
|---|
| 642 | |
|---|
| 643 | // first column of matrix (slave side): |
|---|
| 644 | IF_MATRIX_DUMP(vertical[0][0]=) |
|---|
| 645 | zza[0] = 0; |
|---|
| 646 | int p = master_gap_open(v1); |
|---|
| 647 | for (int j=1; j<=v4; j++) { |
|---|
| 648 | p += master_gap_extend(v1); |
|---|
| 649 | IF_MATRIX_DUMP(vertical[0][j]=) |
|---|
| 650 | zza[j] = p; |
|---|
| 651 | IF_MATRIX_DUMP(verticalOpen[0][j]=) |
|---|
| 652 | zzb[j] = p + master_gap_open(v1); |
|---|
| 653 | } |
|---|
| 654 | |
|---|
| 655 | // left half of the matrix |
|---|
| 656 | p = st; |
|---|
| 657 | int ctri = v3 / 2; |
|---|
| 658 | for (int i=1; i<=ctri; i++) { |
|---|
| 659 | int n = zza[0]; |
|---|
| 660 | p += master_gap_extend(v1+i+AF); |
|---|
| 661 | int k = p; |
|---|
| 662 | IF_MATRIX_DUMP(vertical[i][0]=) |
|---|
| 663 | zza[0] = k; |
|---|
| 664 | int l = p + master_gap_open(v1+i+AF); |
|---|
| 665 | |
|---|
| 666 | for (int j=1; j<=v4; j++) { |
|---|
| 667 | // from above (gap in master (behind position i)) |
|---|
| 668 | IF_MATRIX_DUMP(verticalOpen[i][j]=) k += master_gap_open(v1+i+AF)+master_gap_extend(v1+i+AF); // (1) |
|---|
| 669 | IF_MATRIX_DUMP(vertical[i][j]=) l += master_gap_extend(v1+i+AF); // (2) |
|---|
| 670 | if (k<l) l = k; // l=min((1),(2)) |
|---|
| 671 | |
|---|
| 672 | // from left (gap in slave (behind position j)) |
|---|
| 673 | IF_MATRIX_DUMP(horizontalOpen[i][j]=) k = zza[j] + slave_gap_open(v2+j+AF)+slave_gap_extend(v2+j+AF); // (3) |
|---|
| 674 | int m; |
|---|
| 675 | IF_MATRIX_DUMP(horizontal[i][j]=) m = zzb[j] + slave_gap_extend(v2+j+AF); // (4) |
|---|
| 676 | if (k<m) m = k; // m=min((3),(4)) |
|---|
| 677 | |
|---|
| 678 | // diagonal (no gaps) |
|---|
| 679 | IF_MATRIX_DUMP(diagonal[i][j]=) k = n + calc_weight(i, j, v1, v2); // (5) |
|---|
| 680 | if (l<k) k = l; |
|---|
| 681 | if (m<k) k = m; // k = minimum of all paths |
|---|
| 682 | |
|---|
| 683 | n = zza[j]; // minimum of same row; one column to the left |
|---|
| 684 | zza[j] = k; // minimum of all paths to this matrix position |
|---|
| 685 | zzb[j] = m; // minimum of those two paths, where gap was inserted into slave |
|---|
| 686 | } |
|---|
| 687 | } |
|---|
| 688 | |
|---|
| 689 | # define MHO 1 // X-Offset for second half of matrix-arrays (only used to insert MID-column into dumpMatrix()) |
|---|
| 690 | # define BO 1 // Offset for second half of matrix (cause now the indices i and j are smaller as above) |
|---|
| 691 | |
|---|
| 692 | // last column of matrix: |
|---|
| 693 | |
|---|
| 694 | zzb[0]=zza[0]; |
|---|
| 695 | IF_MATRIX_DUMP(vertical[v3+1+MHO][v4+1]=) |
|---|
| 696 | zzc[v4]=0; |
|---|
| 697 | p = master_gap_open(v1+v3); |
|---|
| 698 | for (int j=v4-1; j>-1; j--) { |
|---|
| 699 | p += master_gap_extend(v1+v3); |
|---|
| 700 | IF_MATRIX_DUMP(vertical[v3+1+MHO][j+BO]=) |
|---|
| 701 | zzc[j] = p; |
|---|
| 702 | IF_MATRIX_DUMP(verticalOpen[v3+1+MHO][j+BO]=) |
|---|
| 703 | zzd[j] = p+master_gap_open(v1+v3); |
|---|
| 704 | } |
|---|
| 705 | |
|---|
| 706 | // right half of matrix (backwards): |
|---|
| 707 | p = en; |
|---|
| 708 | for (int i=v3-1; i>=ctri; i--) { |
|---|
| 709 | int n = zzc[v4]; |
|---|
| 710 | p += master_gap_extend(v1+i); |
|---|
| 711 | int k = p; |
|---|
| 712 | IF_MATRIX_DUMP(vertical[i+BO+MHO][v4+1]=) |
|---|
| 713 | zzc[v4] = k; |
|---|
| 714 | int l = p+master_gap_open(v1+i); |
|---|
| 715 | |
|---|
| 716 | for (int j=v4-1; j>=0; j--) { |
|---|
| 717 | // from below (gap in master (in front of position (i+BO))) |
|---|
| 718 | IF_MATRIX_DUMP(verticalOpen[i+BO+MHO][j+BO]=) k += master_gap_open(v1+i)+master_gap_extend(v1+i); // (1) |
|---|
| 719 | IF_MATRIX_DUMP(vertical[i+BO+MHO][j+BO]=) l += master_gap_extend(v1+i); // (2) |
|---|
| 720 | if (k<l) l = k; // l=min((1),(2)) |
|---|
| 721 | |
|---|
| 722 | // from right (gap in slave (in front of position (j+BO))) |
|---|
| 723 | IF_MATRIX_DUMP(horizontalOpen[i+BO+MHO][j+BO]=) k = zzc[j] + slave_gap_open(v2+j) + slave_gap_extend(v2+j); // (3) |
|---|
| 724 | int m; |
|---|
| 725 | IF_MATRIX_DUMP(horizontal[i+BO+MHO][j+BO]=) m = zzd[j] + slave_gap_extend(v2+j); // (4) |
|---|
| 726 | if (k<m) m = k; // m=min((3),(4)) |
|---|
| 727 | |
|---|
| 728 | // diagonal (no gaps) |
|---|
| 729 | IF_MATRIX_DUMP(diagonal[i+BO+MHO][j+BO]=) k = n + calc_weight(i+BO, j+BO, v1, v2); // (5) |
|---|
| 730 | if (l<k) k = l; |
|---|
| 731 | if (m<k) k = m; // k = minimum of all paths |
|---|
| 732 | |
|---|
| 733 | n = zzc[j]; // minimum of same row; one column to the right |
|---|
| 734 | zzc[j] = k; // minimum of all paths to this matrix position |
|---|
| 735 | zzd[j] = m; // minimum of those two paths, where gap was inserted into slave |
|---|
| 736 | } |
|---|
| 737 | } |
|---|
| 738 | |
|---|
| 739 | #undef BO |
|---|
| 740 | |
|---|
| 741 | // connect rightmost column of first half (column ctri) |
|---|
| 742 | // with leftmost column of second half (column ctri+1) |
|---|
| 743 | |
|---|
| 744 | zzd[v4] = zzc[v4]; |
|---|
| 745 | |
|---|
| 746 | int ctrc = INT_MAX; |
|---|
| 747 | int flag = 0; |
|---|
| 748 | |
|---|
| 749 | for (int j=(ctri ? 0 : 1); j<=v4; j++) { |
|---|
| 750 | int k; |
|---|
| 751 | IF_MATRIX_DUMP(vertical[ctri+MHO][j]=) |
|---|
| 752 | k = zza[j] + zzc[j]; // sum up both calculations (=diagonal=no gaps) |
|---|
| 753 | |
|---|
| 754 | if (k<ctrc || ((k==ctrc) && (zza[j]!=zzb[j]) && (zzc[j]==zzd[j]))) { |
|---|
| 755 | ctrc = k; |
|---|
| 756 | ctrj = j; |
|---|
| 757 | flag = 1; |
|---|
| 758 | } |
|---|
| 759 | } |
|---|
| 760 | |
|---|
| 761 | for (int j=v4; j>=(ctri ? 0 : 1); j--) { |
|---|
| 762 | int k; |
|---|
| 763 | IF_MATRIX_DUMP(verticalOpen[ctri+MHO][j]=) |
|---|
| 764 | k = zzb[j] + zzd[j] // paths where gaps were inserted into slave (left and right side!) |
|---|
| 765 | - slave_gap_open(j); // subtract gap_open-penalty which was added twice (at left and right end of gap) |
|---|
| 766 | |
|---|
| 767 | if (k<ctrc) { |
|---|
| 768 | ctrc = k; |
|---|
| 769 | ctrj = j; |
|---|
| 770 | flag = 2; |
|---|
| 771 | } |
|---|
| 772 | } |
|---|
| 773 | |
|---|
| 774 | fa_assert(flag>=1 && flag<=2); |
|---|
| 775 | |
|---|
| 776 | #undef MHO |
|---|
| 777 | |
|---|
| 778 | #ifdef MATRIX_DUMP |
|---|
| 779 | if (display_matrix) { |
|---|
| 780 | dumpMatrix(v1, v2, v3, v4, ctri); |
|---|
| 781 | } |
|---|
| 782 | #endif |
|---|
| 783 | |
|---|
| 784 | // Conquer recursively around midpoint |
|---|
| 785 | |
|---|
| 786 | if (flag==1) { // Type 1 gaps |
|---|
| 787 | diff(v1, v2, ctri, ctrj, st, master_gap_open(v1+ctri)); // includes midpoint ctri and ctrj |
|---|
| 788 | diff(v1+ctri, v2+ctrj, v3-ctri, v4-ctrj, master_gap_open(v1+ctri), en); |
|---|
| 789 | } |
|---|
| 790 | else { |
|---|
| 791 | diff(v1, v2, ctri-1, ctrj, st, 0); // includes midpoint ctrj |
|---|
| 792 | |
|---|
| 793 | if (last_print<0 && print_ptr>0) { // Delete 2 |
|---|
| 794 | last_print = decr_displ(print_ptr-1, 2); |
|---|
| 795 | } |
|---|
| 796 | else { |
|---|
| 797 | last_print = set_displ(print_ptr++, -2); |
|---|
| 798 | } |
|---|
| 799 | |
|---|
| 800 | diff(v1+ctri+1, v2+ctrj, v3-ctri-1, v4-ctrj, 0, en); |
|---|
| 801 | } |
|---|
| 802 | |
|---|
| 803 | deep--; |
|---|
| 804 | return ctrc; // Return the score of the best alignment |
|---|
| 805 | } |
|---|
| 806 | |
|---|
| 807 | static void do_align(int& score, long act_seq_length) { |
|---|
| 808 | pos1 = pos2 = 0; |
|---|
| 809 | |
|---|
| 810 | // clear statistics |
|---|
| 811 | |
|---|
| 812 | for (int i=1; i<=act_seq_length; ++i) { |
|---|
| 813 | for (int j=0; j<MAX_BASETYPES; ++j) { |
|---|
| 814 | naa1[j][i] = naa2[j][i]=0; |
|---|
| 815 | naas[j][i]=0; |
|---|
| 816 | } |
|---|
| 817 | } |
|---|
| 818 | |
|---|
| 819 | // create position statistics for each group |
|---|
| 820 | // [here every group contains only one seq] |
|---|
| 821 | |
|---|
| 822 | int l1 = 0; |
|---|
| 823 | int l2 = 0; |
|---|
| 824 | for (int i=1; i<=nseqs; ++i) { |
|---|
| 825 | if (group[i]==1) { |
|---|
| 826 | fst_list[++pos1]=i; |
|---|
| 827 | for (int j=1; j<=seqlen_array[i]; ++j) { |
|---|
| 828 | unsigned char b = seq_array[i][j]; |
|---|
| 829 | if (b<128) { |
|---|
| 830 | ++naa1[b][j]; |
|---|
| 831 | ++naa1[0][j]; |
|---|
| 832 | } |
|---|
| 833 | } |
|---|
| 834 | if (seqlen_array[i]>l1) l1=seqlen_array[i]; |
|---|
| 835 | } |
|---|
| 836 | else if (group[i]==2) { |
|---|
| 837 | snd_list[++pos2]=i; |
|---|
| 838 | for (int j=1; j<=seqlen_array[i]; ++j) { |
|---|
| 839 | unsigned char b = seq_array[i][j]; |
|---|
| 840 | if (b<128) { |
|---|
| 841 | ++naa2[b][j]; |
|---|
| 842 | ++naa2[0][j]; |
|---|
| 843 | } |
|---|
| 844 | } |
|---|
| 845 | if (seqlen_array[i]>l2) l2=seqlen_array[i]; |
|---|
| 846 | } |
|---|
| 847 | } |
|---|
| 848 | |
|---|
| 849 | if (pos1>=pos2) { |
|---|
| 850 | int t_arr[MAX_BASETYPES]; |
|---|
| 851 | |
|---|
| 852 | for (int i=1; i<=pos2; ++i) alist[i]=snd_list[i]; |
|---|
| 853 | for (int n=1; n<=l1; ++n) { |
|---|
| 854 | for (int i=1; i<MAX_BASETYPES; ++i) t_arr[i]=0; |
|---|
| 855 | for (int i=1; i<MAX_BASETYPES; ++i) { |
|---|
| 856 | if (naa1[i][n]>0) { |
|---|
| 857 | for (int j=1; j<MAX_BASETYPES; ++j) { // LOOP_VECTORIZED |
|---|
| 858 | t_arr[j] += (weights[i][j]*naa1[i][n]); |
|---|
| 859 | } |
|---|
| 860 | } |
|---|
| 861 | } |
|---|
| 862 | int k = naa1[0][n]; |
|---|
| 863 | if (k>0) { |
|---|
| 864 | for (int i=1; i<MAX_BASETYPES; ++i) { |
|---|
| 865 | naas[i][n]=t_arr[i]/k; |
|---|
| 866 | } |
|---|
| 867 | } |
|---|
| 868 | } |
|---|
| 869 | } |
|---|
| 870 | else { |
|---|
| 871 | fa_assert(0); // should never occur if MAXN==2 |
|---|
| 872 | } |
|---|
| 873 | |
|---|
| 874 | score = diff(1, 1, l1, l2, master_gap_open(1), master_gap_open(l1+1)); // Myers and Miller alignment now |
|---|
| 875 | } |
|---|
| 876 | |
|---|
| 877 | static int add_ggaps(long /* max_seq_length */) { |
|---|
| 878 | int pos = 1; |
|---|
| 879 | int to_do = print_ptr; |
|---|
| 880 | |
|---|
| 881 | for (int i=0; i<to_do; ++i) { // was: 1 .. <=to_do |
|---|
| 882 | if (displ[i]==0) { |
|---|
| 883 | result[1][pos]=result[2][pos]='*'; |
|---|
| 884 | ++pos; |
|---|
| 885 | } |
|---|
| 886 | else { |
|---|
| 887 | int k = displ[i]; |
|---|
| 888 | if (k>0) { |
|---|
| 889 | for (int j=0; j<=k-1; ++j) { // LOOP_VECTORIZED =0[>=9<11] =1 // completely fails with 9.x and 10.x; 1x with 4.9.2 + 5.4 + 5.5 + 6.5 + 7.2 + 7.5 + 8.1 + 8.5 |
|---|
| 890 | result[2][pos+j]='*'; |
|---|
| 891 | result[1][pos+j]='-'; |
|---|
| 892 | } |
|---|
| 893 | pos += k; |
|---|
| 894 | } |
|---|
| 895 | else { |
|---|
| 896 | k = (displ[i]<0) ? displ[i] * -1 : displ[i]; |
|---|
| 897 | for (int j=0; j<=k-1; ++j) { // LOOP_VECTORIZED =0[>=9<11] =1 // completely fails with 9.x and 10.x; 1x with 4.9.2 + 5.4 + 5.5 + 6.5 + 7.2 + 7.5 + 8.1 + 8.5 |
|---|
| 898 | result[1][pos+j]='*'; |
|---|
| 899 | result[2][pos+j]='-'; |
|---|
| 900 | } |
|---|
| 901 | pos += k; |
|---|
| 902 | } |
|---|
| 903 | } |
|---|
| 904 | } |
|---|
| 905 | |
|---|
| 906 | #ifdef DEBUG |
|---|
| 907 | result[1][pos] = 0; |
|---|
| 908 | result[2][pos] = 0; |
|---|
| 909 | #endif |
|---|
| 910 | |
|---|
| 911 | return pos-1; |
|---|
| 912 | } |
|---|
| 913 | |
|---|
| 914 | |
|---|
| 915 | static int res_index(const char *t, char c) { |
|---|
| 916 | if (t[0]==c) return UNKNOWN_ACID; |
|---|
| 917 | for (int i=1; t[i]; i++) { |
|---|
| 918 | if (t[i]==c) return i; |
|---|
| 919 | } |
|---|
| 920 | return 0; |
|---|
| 921 | } |
|---|
| 922 | |
|---|
| 923 | static void p_encode(const unsigned char *seq, unsigned char *naseq, int l) { |
|---|
| 924 | // code seq as ints .. use -2 for gap |
|---|
| 925 | bool warned = false; |
|---|
| 926 | |
|---|
| 927 | for (int i=1; i<=l; i++) { |
|---|
| 928 | int c = res_index(amino_acid_order, seq[i]); |
|---|
| 929 | |
|---|
| 930 | if (!c) { |
|---|
| 931 | if (seq[i] == '*') { |
|---|
| 932 | c = -2; |
|---|
| 933 | } |
|---|
| 934 | else { |
|---|
| 935 | if (!warned && seq[i] != '?') { // consensus contains ? and it means X |
|---|
| 936 | char buf[100]; |
|---|
| 937 | sprintf(buf, "Illegal character '%c' in sequence data", seq[i]); |
|---|
| 938 | aw_message(buf); |
|---|
| 939 | warned = true; |
|---|
| 940 | } |
|---|
| 941 | c = res_index(amino_acid_order, 'X'); |
|---|
| 942 | } |
|---|
| 943 | } |
|---|
| 944 | |
|---|
| 945 | fa_assert(c>0 || c == -2); |
|---|
| 946 | naseq[i] = c; |
|---|
| 947 | } |
|---|
| 948 | } |
|---|
| 949 | |
|---|
| 950 | static void n_encode(const unsigned char *seq, unsigned char *naseq, int l) { |
|---|
| 951 | // code seq as ints .. use -2 for gap |
|---|
| 952 | bool warned = false; |
|---|
| 953 | for (int i=1; i<=l; i++) { |
|---|
| 954 | int c = res_index(nucleic_acid_order, seq[i]); |
|---|
| 955 | |
|---|
| 956 | if (!c) { |
|---|
| 957 | if (!warned && seq[i] != '?') { |
|---|
| 958 | char buf[100]; |
|---|
| 959 | sprintf(buf, "Illegal character '%c' in sequence data", seq[i]); |
|---|
| 960 | aw_message(buf); |
|---|
| 961 | warned = true; |
|---|
| 962 | } |
|---|
| 963 | c = res_index(nucleic_acid_order, 'N'); |
|---|
| 964 | } |
|---|
| 965 | |
|---|
| 966 | fa_assert(c>0); |
|---|
| 967 | naseq[i] = c; |
|---|
| 968 | } |
|---|
| 969 | } |
|---|
| 970 | |
|---|
| 971 | ARB_ERROR ClustalV_align(int is_dna, |
|---|
| 972 | int weighted, |
|---|
| 973 | const char *seq1, |
|---|
| 974 | int length1, |
|---|
| 975 | const char *seq2, |
|---|
| 976 | int length2, |
|---|
| 977 | const int *gapsBefore1, // size of array = length1+1 |
|---|
| 978 | int max_seq_length, |
|---|
| 979 | // result params: |
|---|
| 980 | const char*& res1, |
|---|
| 981 | const char*& res2, |
|---|
| 982 | int& reslen, |
|---|
| 983 | int& score) |
|---|
| 984 | { |
|---|
| 985 | ARB_ERROR error; |
|---|
| 986 | gaps_before_position = gapsBefore1; |
|---|
| 987 | |
|---|
| 988 | if (!module_initialized) { // initialize only once |
|---|
| 989 | dnaflag = is_dna; |
|---|
| 990 | is_weight = weighted; |
|---|
| 991 | |
|---|
| 992 | error = init_myers(max_seq_length); |
|---|
| 993 | if (!error) error = init_show_pair(max_seq_length); |
|---|
| 994 | make_pamo(0); |
|---|
| 995 | fill_pam(); |
|---|
| 996 | |
|---|
| 997 | for (int i=1; i<=2 && !error; i++) { |
|---|
| 998 | seq_array[i] = (unsigned char*)ckalloc((max_seq_length+2)*sizeof(char), error); |
|---|
| 999 | result[i] = (char*)ckalloc((max_seq_length+2)*sizeof(char), error); |
|---|
| 1000 | group[i] = i; |
|---|
| 1001 | } |
|---|
| 1002 | |
|---|
| 1003 | if (!error) module_initialized = true; |
|---|
| 1004 | } |
|---|
| 1005 | else { |
|---|
| 1006 | if (dnaflag!=is_dna || is_weight!=weighted) { |
|---|
| 1007 | error = "Please call ClustalV_exit() between calls that differ in\n" |
|---|
| 1008 | "one of the following parameters:\n" |
|---|
| 1009 | " is_dna, weighted"; |
|---|
| 1010 | } |
|---|
| 1011 | } |
|---|
| 1012 | |
|---|
| 1013 | if (!error) { |
|---|
| 1014 | nseqs = 2; |
|---|
| 1015 | print_ptr = 0; |
|---|
| 1016 | |
|---|
| 1017 | #if defined(DEBUG) && 1 |
|---|
| 1018 | memset(&seq_array[1][1], 0, max_seq_length*sizeof(seq_array[1][1])); |
|---|
| 1019 | memset(&seq_array[2][1], 0, max_seq_length*sizeof(seq_array[2][1])); |
|---|
| 1020 | memset(&displ[1], 0xff, max_seq_length*sizeof(displ[1])); |
|---|
| 1021 | seq_array[1][0] = '_'; |
|---|
| 1022 | seq_array[2][0] = '_'; |
|---|
| 1023 | #endif |
|---|
| 1024 | |
|---|
| 1025 | { |
|---|
| 1026 | typedef void (*encoder)(const unsigned char*, unsigned char*, int); |
|---|
| 1027 | encoder encode = dnaflag ? n_encode : p_encode; |
|---|
| 1028 | |
|---|
| 1029 | encode((const unsigned char*)(seq1-1), seq_array[1], length1); |
|---|
| 1030 | seqlen_array[1] = length1; |
|---|
| 1031 | encode((const unsigned char*)(seq2-1), seq_array[2], length2); |
|---|
| 1032 | seqlen_array[2] = length2; |
|---|
| 1033 | |
|---|
| 1034 | do_align(score, max(length1, length2)); |
|---|
| 1035 | int alignedLength = add_ggaps(max_seq_length); |
|---|
| 1036 | |
|---|
| 1037 | result[1][alignedLength+1] = 0; |
|---|
| 1038 | result[2][alignedLength+1] = 0; |
|---|
| 1039 | |
|---|
| 1040 | res1 = result[1]+1; |
|---|
| 1041 | res2 = result[2]+1; |
|---|
| 1042 | reslen = alignedLength; |
|---|
| 1043 | } |
|---|
| 1044 | } |
|---|
| 1045 | |
|---|
| 1046 | return error; |
|---|
| 1047 | } |
|---|
| 1048 | |
|---|
| 1049 | void ClustalV_exit() { |
|---|
| 1050 | if (module_initialized) { |
|---|
| 1051 | module_initialized = false; |
|---|
| 1052 | |
|---|
| 1053 | for (int i=1; i<=2; i++) { |
|---|
| 1054 | free(result[i]); |
|---|
| 1055 | free(seq_array[i]); |
|---|
| 1056 | } |
|---|
| 1057 | exit_show_pair(); |
|---|
| 1058 | exit_myers(); |
|---|
| 1059 | } |
|---|
| 1060 | } |
|---|
| 1061 | |
|---|
| 1062 | // -------------------------------------------------------------------------------- |
|---|
| 1063 | |
|---|
| 1064 | #ifdef UNIT_TESTS |
|---|
| 1065 | #ifndef TEST_UNIT_H |
|---|
| 1066 | #include <test_unit.h> |
|---|
| 1067 | #endif |
|---|
| 1068 | |
|---|
| 1069 | static arb_test::match_expectation clustal_aligns(const char *i1, const char *i2, const char *expected1, const char *expected2, int expected_score) { |
|---|
| 1070 | size_t l1 = strlen(i1); |
|---|
| 1071 | size_t l2 = strlen(i2); |
|---|
| 1072 | |
|---|
| 1073 | const char *result1 = NULp; |
|---|
| 1074 | const char *result2 = NULp; |
|---|
| 1075 | int result_len; // test result value? |
|---|
| 1076 | int score; |
|---|
| 1077 | |
|---|
| 1078 | int gaps_size = l1+1; |
|---|
| 1079 | int no_gaps_before1[gaps_size]; |
|---|
| 1080 | memset(no_gaps_before1, 0, gaps_size*sizeof(*no_gaps_before1)); |
|---|
| 1081 | |
|---|
| 1082 | ARB_ERROR error = ClustalV_align(0, 0, |
|---|
| 1083 | i1, l1, |
|---|
| 1084 | i2, l2, |
|---|
| 1085 | no_gaps_before1, |
|---|
| 1086 | max(l1, l2), |
|---|
| 1087 | result1, |
|---|
| 1088 | result2, |
|---|
| 1089 | result_len, |
|---|
| 1090 | score); |
|---|
| 1091 | |
|---|
| 1092 | using namespace arb_test; |
|---|
| 1093 | expectation_group expected; |
|---|
| 1094 | |
|---|
| 1095 | expected.add(doesnt_report_error(error.deliver())); |
|---|
| 1096 | expected.add(that(result1).is_equal_to(expected1)); |
|---|
| 1097 | expected.add(that(result2).is_equal_to(expected2)); |
|---|
| 1098 | expected.add(that(score).is_equal_to(expected_score)); |
|---|
| 1099 | |
|---|
| 1100 | return all().ofgroup(expected); |
|---|
| 1101 | } |
|---|
| 1102 | |
|---|
| 1103 | #define TEST_CLUSTAL_ALIGNS(i1,i2,o1,o2,score) TEST_EXPECTATION(clustal_aligns(i1,i2,o1,o2,score)) |
|---|
| 1104 | |
|---|
| 1105 | void TEST_clustalV() { |
|---|
| 1106 | TEST_CLUSTAL_ALIGNS("ACGTTGCAACGT", |
|---|
| 1107 | "ACGTTGCAACGT", |
|---|
| 1108 | "************", |
|---|
| 1109 | "************", |
|---|
| 1110 | 138); |
|---|
| 1111 | |
|---|
| 1112 | TEST_CLUSTAL_ALIGNS("ACGTTAACGT", |
|---|
| 1113 | "ACGTTGCAACGT", |
|---|
| 1114 | "*****--*****", |
|---|
| 1115 | "************", |
|---|
| 1116 | 207); |
|---|
| 1117 | |
|---|
| 1118 | TEST_CLUSTAL_ALIGNS("ACGTTGCAACGT", |
|---|
| 1119 | "ACCAACGT", |
|---|
| 1120 | "************", |
|---|
| 1121 | "*----*******", |
|---|
| 1122 | 156); |
|---|
| 1123 | |
|---|
| 1124 | TEST_CLUSTAL_ALIGNS("ACGTTGCAACGT", |
|---|
| 1125 | "AGCTGTCACAGT", |
|---|
| 1126 | "************", |
|---|
| 1127 | "************", |
|---|
| 1128 | 187); |
|---|
| 1129 | |
|---|
| 1130 | TEST_CLUSTAL_ALIGNS("ACGTTGCAACGT", |
|---|
| 1131 | "ACGTACGTACGT", |
|---|
| 1132 | "************", |
|---|
| 1133 | "************", |
|---|
| 1134 | 164); |
|---|
| 1135 | |
|---|
| 1136 | TEST_CLUSTAL_ALIGNS("ACGTTGCAACGT", |
|---|
| 1137 | "ACGTACGT", |
|---|
| 1138 | "************", |
|---|
| 1139 | "****----****", |
|---|
| 1140 | 162); |
|---|
| 1141 | |
|---|
| 1142 | TEST_CLUSTAL_ALIGNS("ACGTTGCAACGT", |
|---|
| 1143 | "AACGGTTC", |
|---|
| 1144 | "************", |
|---|
| 1145 | // "ACGTTGCAACGT", |
|---|
| 1146 | // "A----ACGGTTC", |
|---|
| 1147 | "*----*******", |
|---|
| 1148 | 193); |
|---|
| 1149 | } |
|---|
| 1150 | |
|---|
| 1151 | #endif // UNIT_TESTS |
|---|
| 1152 | |
|---|
| 1153 | // -------------------------------------------------------------------------------- |
|---|