| 1 | #include <stdio.h> |
|---|
| 2 | #include <stdlib.h> |
|---|
| 3 | #include <memory.h> |
|---|
| 4 | #include <malloc.h> |
|---|
| 5 | #include <string.h> |
|---|
| 6 | |
|---|
| 7 | #include <math.h> |
|---|
| 8 | |
|---|
| 9 | #include <arbdb.h> |
|---|
| 10 | #include <arbdbt.h> |
|---|
| 11 | |
|---|
| 12 | #include <aw_root.hxx> |
|---|
| 13 | #include <aw_device.hxx> |
|---|
| 14 | #include <aw_window.hxx> |
|---|
| 15 | #include <aw_preset.hxx> |
|---|
| 16 | #include <awt.hxx> |
|---|
| 17 | |
|---|
| 18 | #include <awt_tree.hxx> |
|---|
| 19 | #include "dist.hxx" |
|---|
| 20 | #include <BI_helix.hxx> |
|---|
| 21 | #include <awt_csp.hxx> |
|---|
| 22 | |
|---|
| 23 | #include "ph_matr.hxx" |
|---|
| 24 | #include "ph_mldist.hxx" |
|---|
| 25 | |
|---|
| 26 | #define epsilon 0.000001/* a small number */ |
|---|
| 27 | |
|---|
| 28 | |
|---|
| 29 | void ph_mldist::givens(ph_ml_matrix a,long i,long j,long n,double ctheta,double stheta,GB_BOOL left) |
|---|
| 30 | { |
|---|
| 31 | /* Givens transform at i,j for 1..n with angle theta */ |
|---|
| 32 | long k; |
|---|
| 33 | double d; |
|---|
| 34 | |
|---|
| 35 | for (k = 0; k < n; k++) { |
|---|
| 36 | if (left) { |
|---|
| 37 | d = ctheta * a[i - 1][k] + stheta * a[j - 1][k]; |
|---|
| 38 | a[j - 1][k] = ctheta * a[j - 1][k] - stheta * a[i - 1][k]; |
|---|
| 39 | a[i - 1][k] = d; |
|---|
| 40 | } else { |
|---|
| 41 | d = ctheta * a[k][i - 1] + stheta * a[k][j - 1]; |
|---|
| 42 | a[k][j - 1] = ctheta * a[k][j - 1] - stheta * a[k][i - 1]; |
|---|
| 43 | a[k][i - 1] = d; |
|---|
| 44 | } |
|---|
| 45 | } |
|---|
| 46 | } /* givens */ |
|---|
| 47 | |
|---|
| 48 | void ph_mldist::coeffs(double x,double y,double *c,double *s,double accuracy) |
|---|
| 49 | { |
|---|
| 50 | /* compute cosine and sine of theta */ |
|---|
| 51 | double root; |
|---|
| 52 | |
|---|
| 53 | root = sqrt(x * x + y * y); |
|---|
| 54 | if (root < accuracy) { |
|---|
| 55 | *c = 1.0; |
|---|
| 56 | *s = 0.0; |
|---|
| 57 | } else { |
|---|
| 58 | *c = x / root; |
|---|
| 59 | *s = y / root; |
|---|
| 60 | } |
|---|
| 61 | } /* coeffs */ |
|---|
| 62 | |
|---|
| 63 | void ph_mldist::tridiag(ph_ml_matrix a,long n,double accuracy) |
|---|
| 64 | { |
|---|
| 65 | /* Givens tridiagonalization */ |
|---|
| 66 | long i, j; |
|---|
| 67 | double s, c; |
|---|
| 68 | |
|---|
| 69 | for (i = 2; i < n; i++) { |
|---|
| 70 | for (j = i + 1; j <= n; j++) { |
|---|
| 71 | coeffs(a[i - 2][i - 1], a[i - 2][j - 1], &c, &s, accuracy); |
|---|
| 72 | givens(a, i, j, n, c, s, GB_TRUE); |
|---|
| 73 | givens(a, i, j, n, c, s, GB_FALSE); |
|---|
| 74 | givens(eigvecs, i, j, n, c, s, GB_TRUE); |
|---|
| 75 | } |
|---|
| 76 | } |
|---|
| 77 | } /* tridiag */ |
|---|
| 78 | |
|---|
| 79 | void ph_mldist::shiftqr(ph_ml_matrix a, long n, double accuracy) |
|---|
| 80 | { |
|---|
| 81 | /* QR eigenvalue-finder */ |
|---|
| 82 | long i, j; |
|---|
| 83 | double approx, s, c, d, TEMP, TEMP1; |
|---|
| 84 | |
|---|
| 85 | for (i = n; i >= 2; i--) { |
|---|
| 86 | do { |
|---|
| 87 | TEMP = a[i - 2][i - 2] - a[i - 1][i - 1]; |
|---|
| 88 | TEMP1 = a[i - 1][i - 2]; |
|---|
| 89 | d = sqrt(TEMP * TEMP + TEMP1 * TEMP1); |
|---|
| 90 | approx = a[i - 2][i - 2] + a[i - 1][i - 1]; |
|---|
| 91 | if (a[i - 1][i - 1] < a[i - 2][i - 2]) |
|---|
| 92 | approx = (approx - d) / 2.0; |
|---|
| 93 | else |
|---|
| 94 | approx = (approx + d) / 2.0; |
|---|
| 95 | for (j = 0; j < i; j++) |
|---|
| 96 | a[j][j] -= approx; |
|---|
| 97 | for (j = 1; j < i; j++) { |
|---|
| 98 | coeffs(a[j - 1][j - 1], a[j][j - 1], &c, &s, accuracy); |
|---|
| 99 | givens(a, j, j + 1, i, c, s, GB_TRUE); |
|---|
| 100 | givens(a, j, j + 1, i, c, s, GB_FALSE); |
|---|
| 101 | givens(eigvecs, j, j + 1, n, c, s, GB_TRUE); |
|---|
| 102 | } |
|---|
| 103 | for (j = 0; j < i; j++) |
|---|
| 104 | a[j][j] += approx; |
|---|
| 105 | } while (fabs(a[i - 1][i - 2]) > accuracy); |
|---|
| 106 | } |
|---|
| 107 | } /* shiftqr */ |
|---|
| 108 | |
|---|
| 109 | |
|---|
| 110 | void ph_mldist::qreigen(ph_ml_matrix proba,long n) |
|---|
| 111 | { |
|---|
| 112 | /* QR eigenvector/eigenvalue method for symmetric matrix */ |
|---|
| 113 | double accuracy; |
|---|
| 114 | long i, j; |
|---|
| 115 | |
|---|
| 116 | accuracy = 1.0e-6; |
|---|
| 117 | for (i = 0; i < n; i++) { |
|---|
| 118 | for (j = 0; j < n; j++) |
|---|
| 119 | eigvecs[i][j] = 0.0; |
|---|
| 120 | eigvecs[i][i] = 1.0; |
|---|
| 121 | } |
|---|
| 122 | tridiag(proba, n, accuracy); |
|---|
| 123 | shiftqr(proba, n, accuracy); |
|---|
| 124 | for (i = 0; i < n; i++) |
|---|
| 125 | eig[i] = proba[i][i]; |
|---|
| 126 | for (i = 0; i < n_states; i++) { |
|---|
| 127 | for (j = 0; j < n_states; j++) |
|---|
| 128 | proba[i][j] = sqrt(pi[j]) * eigvecs[i][j]; |
|---|
| 129 | } |
|---|
| 130 | /* proba[i][j] is the value of U' times pi^(1/2) */ |
|---|
| 131 | } /* qreigen */ |
|---|
| 132 | |
|---|
| 133 | |
|---|
| 134 | /* pameigen */ |
|---|
| 135 | |
|---|
| 136 | void ph_mldist::build_exptteig(double tt){ |
|---|
| 137 | int m; |
|---|
| 138 | for (m = 0; m < n_states; m++) { |
|---|
| 139 | exptteig[m] = exp(tt * eig[m]); |
|---|
| 140 | } |
|---|
| 141 | } |
|---|
| 142 | |
|---|
| 143 | void ph_mldist::predict(double /*tt*/, long nb1,long nb2) |
|---|
| 144 | { |
|---|
| 145 | /* make contribution to prediction of this aa pair */ |
|---|
| 146 | long m; |
|---|
| 147 | double q; |
|---|
| 148 | double TEMP; |
|---|
| 149 | for (m = n_states-1; m >=0; m--) { |
|---|
| 150 | q = prob[m][nb1] * prob[m][nb2] * exptteig[m]; |
|---|
| 151 | p += q; |
|---|
| 152 | TEMP = eig[m]; |
|---|
| 153 | dp += TEMP * q; |
|---|
| 154 | d2p += TEMP * TEMP * q; |
|---|
| 155 | } |
|---|
| 156 | } /* predict */ |
|---|
| 157 | |
|---|
| 158 | void ph_mldist::build_predikt_table(int pos){ |
|---|
| 159 | int b1, b2; |
|---|
| 160 | double tt = pos_2_tt(pos); |
|---|
| 161 | build_exptteig(tt); |
|---|
| 162 | akt_slopes = slopes[pos] = (ph_pml_matrix *) calloc(sizeof(ph_pml_matrix), 1); |
|---|
| 163 | akt_curves = curves[pos] = (ph_pml_matrix *) calloc(sizeof(ph_pml_matrix), 1); |
|---|
| 164 | akt_infs = infs[pos] = (ph_bool_matrix *) calloc(sizeof(ph_bool_matrix), 1); |
|---|
| 165 | |
|---|
| 166 | for (b1 = 0; b1 < this->n_states; b1++) { |
|---|
| 167 | for (b2 = 0; b2 <= b1; b2++) { |
|---|
| 168 | p = 0.0; |
|---|
| 169 | dp = 0.0; |
|---|
| 170 | d2p = 0.0; |
|---|
| 171 | predict(tt, b1, b2); |
|---|
| 172 | |
|---|
| 173 | if (p > 0.0){ |
|---|
| 174 | double ip = 1.0/p; |
|---|
| 175 | akt_slopes[0][b1][b2] = dp * ip; |
|---|
| 176 | akt_curves[0][b1][b2] = d2p * ip - dp * dp * (ip * ip); |
|---|
| 177 | akt_infs[0][b1][b2] = 0; |
|---|
| 178 | akt_slopes[0][b2][b1] = akt_slopes[0][b1][b2]; |
|---|
| 179 | akt_curves[0][b2][b1] = akt_curves[0][b1][b2]; |
|---|
| 180 | akt_infs[0][b2][b1] = 0; |
|---|
| 181 | }else{ |
|---|
| 182 | akt_infs[0][b1][b2] = 1; |
|---|
| 183 | akt_infs[0][b2][b1] = 1; |
|---|
| 184 | } |
|---|
| 185 | } |
|---|
| 186 | } |
|---|
| 187 | } |
|---|
| 188 | |
|---|
| 189 | int ph_mldist::tt_2_pos(double tt) { |
|---|
| 190 | int pos = (int)(tt * fracchange * PH_ML_RESOLUTION); |
|---|
| 191 | if (pos >= PH_ML_RESOLUTION * PH_ML_MAX_DIST ) |
|---|
| 192 | pos = PH_ML_RESOLUTION * PH_ML_MAX_DIST - 1; |
|---|
| 193 | if (pos < 0) |
|---|
| 194 | pos = 0; |
|---|
| 195 | return pos; |
|---|
| 196 | } |
|---|
| 197 | |
|---|
| 198 | double ph_mldist::pos_2_tt(int pos) { |
|---|
| 199 | double tt = pos / (fracchange * PH_ML_RESOLUTION); |
|---|
| 200 | return tt+epsilon; |
|---|
| 201 | } |
|---|
| 202 | |
|---|
| 203 | void ph_mldist::build_akt_predikt(double tt) |
|---|
| 204 | { |
|---|
| 205 | /* take an aktual slope from the hash table, else calculate a new one */ |
|---|
| 206 | int pos = tt_2_pos(tt); |
|---|
| 207 | if (!slopes[pos]){ |
|---|
| 208 | build_predikt_table(pos); |
|---|
| 209 | } |
|---|
| 210 | akt_slopes = slopes[pos]; |
|---|
| 211 | akt_curves = curves[pos]; |
|---|
| 212 | akt_infs = infs[pos]; |
|---|
| 213 | return; |
|---|
| 214 | |
|---|
| 215 | } |
|---|
| 216 | |
|---|
| 217 | const char *ph_mldist::makedists() |
|---|
| 218 | { |
|---|
| 219 | /* compute the distances */ |
|---|
| 220 | long i, j, k, iterations; |
|---|
| 221 | double delta, slope, curv; |
|---|
| 222 | int b1=0, b2=0; |
|---|
| 223 | double tt=0; |
|---|
| 224 | int pos; |
|---|
| 225 | |
|---|
| 226 | for (i = 0; i < spp; i++) { |
|---|
| 227 | matrix->set(i,i,0.0); |
|---|
| 228 | { |
|---|
| 229 | double gauge = (double)i/(double)spp; |
|---|
| 230 | if (aw_status(gauge*gauge)) return "Aborted"; |
|---|
| 231 | } |
|---|
| 232 | { |
|---|
| 233 | /* move all unknown characters to del */ |
|---|
| 234 | ap_pro *seq1 = entries[i]->sequence_protein->sequence; |
|---|
| 235 | for (k = 0; k <chars ; k++) { |
|---|
| 236 | b1 = seq1[k]; |
|---|
| 237 | if (b1 <=val) continue; |
|---|
| 238 | if (b1 == asx || b1 == glx) continue; |
|---|
| 239 | seq1[k] = del; |
|---|
| 240 | } |
|---|
| 241 | } |
|---|
| 242 | |
|---|
| 243 | for (j = 0; j < i ; j++) { |
|---|
| 244 | tt = 1.0; |
|---|
| 245 | delta = tt / 2.0; |
|---|
| 246 | iterations = 0; |
|---|
| 247 | do { |
|---|
| 248 | slope = 0.0; |
|---|
| 249 | curv = 0.0; |
|---|
| 250 | pos = tt_2_pos(tt); |
|---|
| 251 | tt = pos_2_tt(pos); |
|---|
| 252 | build_akt_predikt(tt); |
|---|
| 253 | ap_pro *seq1 = entries[i]->sequence_protein->sequence; |
|---|
| 254 | ap_pro *seq2 = entries[j]->sequence_protein->sequence; |
|---|
| 255 | for (k = chars; k >0; k--) { |
|---|
| 256 | b1 = *(seq1++); |
|---|
| 257 | b2 = *(seq2++); |
|---|
| 258 | if (predict_infinity(b1,b2)){ |
|---|
| 259 | break; |
|---|
| 260 | } |
|---|
| 261 | slope += predict_slope(b1,b2); |
|---|
| 262 | curv += predict_curve(b1,b2); |
|---|
| 263 | } |
|---|
| 264 | iterations++; |
|---|
| 265 | if (!predict_infinity(b1,b2)) { |
|---|
| 266 | if (curv < 0.0) { |
|---|
| 267 | tt -= slope / curv; |
|---|
| 268 | if (tt > 10000.0) { |
|---|
| 269 | aw_message(GB_export_error("INFINITE DISTANCE BETWEEN SPECIES %ld AND %ld; -1.0 WAS WRITTEN\n", i, j)); |
|---|
| 270 | tt = -1.0 / fracchange; |
|---|
| 271 | break; |
|---|
| 272 | } |
|---|
| 273 | int npos = tt_2_pos(tt); |
|---|
| 274 | int d = npos - pos; if (d<0) d=-d; |
|---|
| 275 | if (d<=1){ // cannot optimize |
|---|
| 276 | break; |
|---|
| 277 | } |
|---|
| 278 | |
|---|
| 279 | } else { |
|---|
| 280 | if ((slope > 0.0 && delta < 0.0) || (slope < 0.0 && delta > 0.0)) |
|---|
| 281 | delta /= -2; |
|---|
| 282 | if (tt + delta < 0 && tt<= epsilon) { |
|---|
| 283 | break; |
|---|
| 284 | } |
|---|
| 285 | tt += delta; |
|---|
| 286 | } |
|---|
| 287 | } else { |
|---|
| 288 | delta /= -2; |
|---|
| 289 | tt += delta; |
|---|
| 290 | if (tt < 0) tt = 0; |
|---|
| 291 | } |
|---|
| 292 | } while (iterations < 20); |
|---|
| 293 | } |
|---|
| 294 | matrix->set(i,j,fracchange * tt); |
|---|
| 295 | } |
|---|
| 296 | return 0; |
|---|
| 297 | } /* makedists */ |
|---|
| 298 | |
|---|
| 299 | |
|---|
| 300 | void ph_mldist::clean_slopes(){ |
|---|
| 301 | int i; |
|---|
| 302 | if (slopes) { |
|---|
| 303 | for (i=0;i<PH_ML_RESOLUTION*PH_ML_MAX_DIST;i++) { |
|---|
| 304 | delete slopes[i]; slopes[i] = 0; |
|---|
| 305 | delete curves[i]; curves[i] = 0; |
|---|
| 306 | delete infs[i]; infs[i] = 0; |
|---|
| 307 | } |
|---|
| 308 | } |
|---|
| 309 | akt_slopes = 0; |
|---|
| 310 | akt_curves = 0; |
|---|
| 311 | akt_infs = 0; |
|---|
| 312 | } |
|---|
| 313 | |
|---|
| 314 | ph_mldist::~ph_mldist(){ |
|---|
| 315 | clean_slopes(); |
|---|
| 316 | } |
|---|
| 317 | |
|---|
| 318 | ph_mldist::ph_mldist(long nentries, PHENTRY **entriesi, long seq_len, AP_smatrix *matrixi){ |
|---|
| 319 | memset((char *)this,0,sizeof(ph_mldist)); |
|---|
| 320 | entries = entriesi; |
|---|
| 321 | matrix = matrixi; |
|---|
| 322 | |
|---|
| 323 | spp = nentries; |
|---|
| 324 | chars = seq_len; |
|---|
| 325 | |
|---|
| 326 | //maketrans(); |
|---|
| 327 | qreigen(prob, 20L); |
|---|
| 328 | } |
|---|
| 329 | |
|---|