| 1 | #include <stdio.h> |
|---|
| 2 | #include <stdlib.h> |
|---|
| 3 | #include <string.h> |
|---|
| 4 | // #include <malloc.h> |
|---|
| 5 | #include <memory.h> |
|---|
| 6 | #include <arbdb.h> |
|---|
| 7 | #include <arbdbt.h> |
|---|
| 8 | #include "awt_tree.hxx" |
|---|
| 9 | #include "awt_nei.hxx" |
|---|
| 10 | |
|---|
| 11 | #define CHECK_NAN(x) if ( (!(x>=0.0)) && (!(x<0.0)) ) *(int *)0=0; |
|---|
| 12 | |
|---|
| 13 | PH_NEIGHBOUR_DIST::PH_NEIGHBOUR_DIST(void) |
|---|
| 14 | { |
|---|
| 15 | memset((char *)this,0,sizeof(PH_NEIGHBOUR_DIST)); |
|---|
| 16 | } |
|---|
| 17 | |
|---|
| 18 | |
|---|
| 19 | AP_FLOAT PH_NEIGHBOURJOINING::get_max_di(AP_FLOAT **m) // O(n*2) |
|---|
| 20 | { |
|---|
| 21 | long i,j; |
|---|
| 22 | AP_FLOAT max = 0.0; |
|---|
| 23 | for (i=0;i<size; i++){ |
|---|
| 24 | for (j=0;j<i;j++){ |
|---|
| 25 | if (m[i][j] > max) max = m[i][j]; |
|---|
| 26 | } |
|---|
| 27 | } |
|---|
| 28 | return max; |
|---|
| 29 | } |
|---|
| 30 | |
|---|
| 31 | void PH_NEIGHBOURJOINING::remove_taxa_from_dist_list(long i) // O(n/2) |
|---|
| 32 | { |
|---|
| 33 | long a,j; |
|---|
| 34 | PH_NEIGHBOUR_DIST *nd; |
|---|
| 35 | for (a=0;a<swap_size;a++) { |
|---|
| 36 | j= swap_tab[a]; |
|---|
| 37 | if (i==j) continue; |
|---|
| 38 | if (j<i) { |
|---|
| 39 | nd = &(dist_matrix[i][j]); |
|---|
| 40 | }else{ |
|---|
| 41 | nd = &(dist_matrix[j][i]); |
|---|
| 42 | } |
|---|
| 43 | nd->remove(); |
|---|
| 44 | net_divergence[j] -= nd->val; // corr net divergence |
|---|
| 45 | } |
|---|
| 46 | } |
|---|
| 47 | void PH_NEIGHBOURJOINING::add_taxa_to_dist_list(long i) // O(n/2) |
|---|
| 48 | { |
|---|
| 49 | long a; |
|---|
| 50 | long pos,j; |
|---|
| 51 | PH_NEIGHBOUR_DIST *nd; |
|---|
| 52 | AP_FLOAT my_nd = 0.0; |
|---|
| 53 | for (a=0;a<swap_size;a++) { |
|---|
| 54 | j= swap_tab[a]; |
|---|
| 55 | if (i==j) continue; |
|---|
| 56 | if (j<i) { |
|---|
| 57 | nd = &(dist_matrix[i][j]); |
|---|
| 58 | }else{ |
|---|
| 59 | nd = &(dist_matrix[j][i]); |
|---|
| 60 | } |
|---|
| 61 | ph_assert(!nd->previous); |
|---|
| 62 | pos = (int)(nd->val*dist_list_corr); |
|---|
| 63 | if (pos>= dist_list_size) { |
|---|
| 64 | pos = dist_list_size-1; |
|---|
| 65 | }else if (pos<0) |
|---|
| 66 | pos = 0; |
|---|
| 67 | nd->add(&(dist_list[pos])); |
|---|
| 68 | |
|---|
| 69 | net_divergence[j] += nd->val; // corr net divergence |
|---|
| 70 | my_nd += nd->val; |
|---|
| 71 | } |
|---|
| 72 | net_divergence[i] = my_nd; |
|---|
| 73 | } |
|---|
| 74 | |
|---|
| 75 | AP_FLOAT PH_NEIGHBOURJOINING::get_max_net_divergence(void) // O(n/2) |
|---|
| 76 | { |
|---|
| 77 | long a,i; |
|---|
| 78 | AP_FLOAT max = 0.0; |
|---|
| 79 | for (a=0;a<swap_size;a++){ |
|---|
| 80 | i = swap_tab[a]; |
|---|
| 81 | if (net_divergence[i] > max) max = net_divergence[i]; |
|---|
| 82 | } |
|---|
| 83 | return max; |
|---|
| 84 | } |
|---|
| 85 | |
|---|
| 86 | void PH_NEIGHBOURJOINING::remove_taxa_from_swap_tab(long i) // O(n/2) |
|---|
| 87 | { |
|---|
| 88 | long a; |
|---|
| 89 | long *source,*dest; |
|---|
| 90 | source = dest = swap_tab; |
|---|
| 91 | for (a=0;a<swap_size;a++){ |
|---|
| 92 | if (swap_tab[a] == i){ |
|---|
| 93 | source++; |
|---|
| 94 | }else{ |
|---|
| 95 | *(dest++) = *(source++); |
|---|
| 96 | } |
|---|
| 97 | } |
|---|
| 98 | swap_size --; |
|---|
| 99 | } |
|---|
| 100 | |
|---|
| 101 | PH_NEIGHBOURJOINING::PH_NEIGHBOURJOINING(AP_FLOAT **m, long isize) |
|---|
| 102 | { |
|---|
| 103 | long i,j; |
|---|
| 104 | |
|---|
| 105 | size = isize; |
|---|
| 106 | swap_size = size; // init swap tab |
|---|
| 107 | swap_tab = new long[size]; |
|---|
| 108 | for (i=0;i<swap_size;i++) swap_tab[i] = i; |
|---|
| 109 | |
|---|
| 110 | net_divergence = (AP_FLOAT *)calloc(sizeof(AP_FLOAT),(size_t)size); |
|---|
| 111 | |
|---|
| 112 | |
|---|
| 113 | dist_list_size = size; // hope te be the best |
|---|
| 114 | dist_list = new PH_NEIGHBOUR_DIST[dist_list_size];// the roots, no elems |
|---|
| 115 | dist_list_corr = (dist_list_size-2.0)/get_max_di(m); |
|---|
| 116 | |
|---|
| 117 | dist_matrix = new PH_NEIGHBOUR_DIST*[size]; |
|---|
| 118 | for (i=0;i<size;i++) { |
|---|
| 119 | dist_matrix[i] = new PH_NEIGHBOUR_DIST[i]; |
|---|
| 120 | for (j=0;j<i;j++){ |
|---|
| 121 | dist_matrix[i][j].val = m[i][j]; |
|---|
| 122 | dist_matrix[i][j].i = i; |
|---|
| 123 | dist_matrix[i][j].j = j; |
|---|
| 124 | } |
|---|
| 125 | } |
|---|
| 126 | for (i=0;i<size;i++){ |
|---|
| 127 | swap_size = i; // to calculate the correct net divergence |
|---|
| 128 | add_taxa_to_dist_list(i); // add to dist list and add n.d. |
|---|
| 129 | } |
|---|
| 130 | swap_size = size; |
|---|
| 131 | } |
|---|
| 132 | |
|---|
| 133 | PH_NEIGHBOURJOINING::~PH_NEIGHBOURJOINING(void) |
|---|
| 134 | { |
|---|
| 135 | delete [] dist_matrix; |
|---|
| 136 | delete [] dist_list; |
|---|
| 137 | free((char *)net_divergence); |
|---|
| 138 | delete [] swap_tab; |
|---|
| 139 | } |
|---|
| 140 | |
|---|
| 141 | void PH_NEIGHBOURJOINING::get_min_ij(long& mini, long& minj) // O(n*n/speedup) |
|---|
| 142 | { |
|---|
| 143 | AP_FLOAT maxri = get_max_net_divergence(); // O(n/2) |
|---|
| 144 | PH_NEIGHBOUR_DIST *dl; |
|---|
| 145 | long stat = 0; |
|---|
| 146 | AP_FLOAT x; |
|---|
| 147 | AP_FLOAT minval; |
|---|
| 148 | minval = 100000.0; |
|---|
| 149 | AP_FLOAT N_1 = 1.0/(swap_size-2.0); |
|---|
| 150 | maxri = maxri*2.0*N_1; |
|---|
| 151 | long pos; |
|---|
| 152 | get_last_ij(mini,minj); |
|---|
| 153 | |
|---|
| 154 | for (pos=0;pos<dist_list_size;pos++){ |
|---|
| 155 | if (minval < pos/dist_list_corr - maxri) break; |
|---|
| 156 | // no way to get a better minimum |
|---|
| 157 | dl = dist_list[pos].next; // first entry does not contain information |
|---|
| 158 | for (;dl;dl=dl->next){ |
|---|
| 159 | x = (net_divergence[dl->i] + net_divergence[dl->j])*N_1; |
|---|
| 160 | if (dl->val-x<minval) { |
|---|
| 161 | minval = dl->val -x; |
|---|
| 162 | minj = dl->i; |
|---|
| 163 | mini = dl->j; |
|---|
| 164 | } |
|---|
| 165 | stat++; |
|---|
| 166 | } |
|---|
| 167 | } |
|---|
| 168 | |
|---|
| 169 | |
|---|
| 170 | //printf("stat %li of %li mini %li minj %li\n", |
|---|
| 171 | // stat,swap_size*(swap_size-1)/2,mini,minj); |
|---|
| 172 | } |
|---|
| 173 | |
|---|
| 174 | void PH_NEIGHBOURJOINING::join_nodes(long i,long j,AP_FLOAT &leftl,AP_FLOAT& rightl) |
|---|
| 175 | { |
|---|
| 176 | PH_NEIGHBOUR_DIST **d = dist_matrix; |
|---|
| 177 | AP_FLOAT dji; |
|---|
| 178 | |
|---|
| 179 | AP_FLOAT dist = get_dist(i,j); |
|---|
| 180 | |
|---|
| 181 | leftl = dist*.5 + (net_divergence[i] - net_divergence[j])*.5/ |
|---|
| 182 | (swap_size - 2.0); |
|---|
| 183 | rightl = dist - leftl; |
|---|
| 184 | |
|---|
| 185 | remove_taxa_from_dist_list(j); |
|---|
| 186 | remove_taxa_from_swap_tab(j); |
|---|
| 187 | remove_taxa_from_dist_list(i); |
|---|
| 188 | |
|---|
| 189 | long a,k; |
|---|
| 190 | dji = d[j][i].val; |
|---|
| 191 | for (a=0;a<swap_size;a++) { |
|---|
| 192 | k = swap_tab[a]; |
|---|
| 193 | if (k==i) continue; // k == j not possible |
|---|
| 194 | if (k>i) { |
|---|
| 195 | if (k>j) { |
|---|
| 196 | d[k][i].val = .5*(d[k][i].val + d[k][j].val - dji); |
|---|
| 197 | }else{ |
|---|
| 198 | d[k][i].val = .5*(d[k][i].val + d[j][k].val - dji); |
|---|
| 199 | } |
|---|
| 200 | }else{ |
|---|
| 201 | d[i][k].val = 0.5 * (d[i][k].val + d[j][k].val - dji); |
|---|
| 202 | |
|---|
| 203 | } |
|---|
| 204 | } |
|---|
| 205 | add_taxa_to_dist_list(i); |
|---|
| 206 | } |
|---|
| 207 | |
|---|
| 208 | void PH_NEIGHBOURJOINING::get_last_ij(long& i, long& j) |
|---|
| 209 | { |
|---|
| 210 | i = swap_tab[0]; |
|---|
| 211 | j = swap_tab[1]; |
|---|
| 212 | } |
|---|
| 213 | |
|---|
| 214 | AP_FLOAT PH_NEIGHBOURJOINING::get_dist(long i, long j) |
|---|
| 215 | { |
|---|
| 216 | return dist_matrix[j][i].val; |
|---|
| 217 | } |
|---|
| 218 | |
|---|
| 219 | GBT_TREE *neighbourjoining(char **names, AP_FLOAT **m, long size, size_t structure_size) |
|---|
| 220 | { |
|---|
| 221 | // structure_size >= sizeof(GBT_TREE); |
|---|
| 222 | // lower triangular matrix |
|---|
| 223 | // size: size of matrix |
|---|
| 224 | |
|---|
| 225 | |
|---|
| 226 | PH_NEIGHBOURJOINING *nj = new PH_NEIGHBOURJOINING(m,size); |
|---|
| 227 | long i; |
|---|
| 228 | long a,b; |
|---|
| 229 | GBT_TREE **nodes; |
|---|
| 230 | AP_FLOAT ll,rl; |
|---|
| 231 | nodes = (GBT_TREE **)calloc(sizeof(GBT_TREE *),(size_t)size); |
|---|
| 232 | for (i=0;i<size;i++) { |
|---|
| 233 | nodes[i] = (GBT_TREE *)calloc(structure_size,1); |
|---|
| 234 | nodes[i]->name = strdup(names[i]); |
|---|
| 235 | nodes[i]->is_leaf = GB_TRUE; |
|---|
| 236 | } |
|---|
| 237 | |
|---|
| 238 | for (i=0;i<size-2;i++) { |
|---|
| 239 | nj->get_min_ij(a,b); |
|---|
| 240 | nj->join_nodes(a,b,ll,rl); |
|---|
| 241 | GBT_TREE *father = (GBT_TREE *)calloc(structure_size,1); |
|---|
| 242 | father->leftson = nodes[a]; |
|---|
| 243 | father->rightson = nodes[b]; |
|---|
| 244 | father->leftlen = ll; |
|---|
| 245 | father->rightlen = rl; |
|---|
| 246 | nodes[a]->father = father; |
|---|
| 247 | nodes[b]->father = father; |
|---|
| 248 | nodes[a] = father; |
|---|
| 249 | } |
|---|
| 250 | nj->get_last_ij(a,b); |
|---|
| 251 | AP_FLOAT dist = nj->get_dist(a,b); |
|---|
| 252 | ll = dist*0.5; |
|---|
| 253 | rl = dist*0.5; |
|---|
| 254 | |
|---|
| 255 | GBT_TREE *father = (GBT_TREE *)calloc(structure_size,1); |
|---|
| 256 | father->leftson = nodes[a]; |
|---|
| 257 | father->rightson = nodes[b]; |
|---|
| 258 | father->leftlen = ll; |
|---|
| 259 | father->rightlen = rl; |
|---|
| 260 | nodes[a]->father = father; |
|---|
| 261 | nodes[b]->father = father; |
|---|
| 262 | |
|---|
| 263 | delete nj; |
|---|
| 264 | free((char*)nodes); |
|---|
| 265 | return father; |
|---|
| 266 | } |
|---|