| 1 | #include "muscle.h" |
|---|
| 2 | #include "tree.h" |
|---|
| 3 | #include "distcalc.h" |
|---|
| 4 | |
|---|
| 5 | // UPGMA clustering in O(N^2) time and space. |
|---|
| 6 | |
|---|
| 7 | #define TRACE 0 |
|---|
| 8 | |
|---|
| 9 | #define MIN(x, y) ((x) < (y) ? (x) : (y)) |
|---|
| 10 | #define MAX(x, y) ((x) > (y) ? (x) : (y)) |
|---|
| 11 | #define AVG(x, y) (((x) + (y))/2) |
|---|
| 12 | |
|---|
| 13 | static unsigned g_uLeafCount; |
|---|
| 14 | static unsigned g_uTriangleSize; |
|---|
| 15 | static unsigned g_uInternalNodeCount; |
|---|
| 16 | static unsigned g_uInternalNodeIndex; |
|---|
| 17 | |
|---|
| 18 | // Triangular distance matrix is g_Dist, which is allocated |
|---|
| 19 | // as a one-dimensional vector of length g_uTriangleSize. |
|---|
| 20 | // TriangleSubscript(i,j) maps row,column=i,j to the subscript |
|---|
| 21 | // into this vector. |
|---|
| 22 | // Row / column coordinates are a bit messy. |
|---|
| 23 | // Initially they are leaf indexes 0..N-1. |
|---|
| 24 | // But each time we create a new node (=new cluster, new subtree), |
|---|
| 25 | // we re-use one of the two rows that become available (the children |
|---|
| 26 | // of the new node). This saves memory. |
|---|
| 27 | // We keep track of this through the g_uNodeIndex vector. |
|---|
| 28 | static dist_t *g_Dist; |
|---|
| 29 | |
|---|
| 30 | // Distance to nearest neighbor in row i of distance matrix. |
|---|
| 31 | // Subscript is distance matrix row. |
|---|
| 32 | static dist_t *g_MinDist; |
|---|
| 33 | |
|---|
| 34 | // Nearest neighbor to row i of distance matrix. |
|---|
| 35 | // Subscript is distance matrix row. |
|---|
| 36 | static unsigned *g_uNearestNeighbor; |
|---|
| 37 | |
|---|
| 38 | // Node index of row i in distance matrix. |
|---|
| 39 | // Node indexes are 0..N-1 for leaves, N..2N-2 for internal nodes. |
|---|
| 40 | // Subscript is distance matrix row. |
|---|
| 41 | static unsigned *g_uNodeIndex; |
|---|
| 42 | |
|---|
| 43 | // The following vectors are defined on internal nodes, |
|---|
| 44 | // subscripts are internal node index 0..N-2. |
|---|
| 45 | // For g_uLeft/Right, value is the node index 0 .. 2N-2 |
|---|
| 46 | // because a child can be internal or leaf. |
|---|
| 47 | static unsigned *g_uLeft; |
|---|
| 48 | static unsigned *g_uRight; |
|---|
| 49 | static dist_t *g_Height; |
|---|
| 50 | static dist_t *g_LeftLength; |
|---|
| 51 | static dist_t *g_RightLength; |
|---|
| 52 | |
|---|
| 53 | static inline unsigned TriangleSubscript(unsigned uIndex1, unsigned uIndex2) |
|---|
| 54 | { |
|---|
| 55 | #if DEBUG |
|---|
| 56 | if (uIndex1 >= g_uLeafCount || uIndex2 >= g_uLeafCount) |
|---|
| 57 | Quit("TriangleSubscript(%u,%u) %u", uIndex1, uIndex2, g_uLeafCount); |
|---|
| 58 | #endif |
|---|
| 59 | unsigned v; |
|---|
| 60 | if (uIndex1 >= uIndex2) |
|---|
| 61 | v = uIndex2 + (uIndex1*(uIndex1 - 1))/2; |
|---|
| 62 | else |
|---|
| 63 | v = uIndex1 + (uIndex2*(uIndex2 - 1))/2; |
|---|
| 64 | assert(v < (g_uLeafCount*(g_uLeafCount - 1))/2); |
|---|
| 65 | return v; |
|---|
| 66 | } |
|---|
| 67 | |
|---|
| 68 | static void ListState() |
|---|
| 69 | { |
|---|
| 70 | Log("Dist matrix\n"); |
|---|
| 71 | Log(" "); |
|---|
| 72 | for (unsigned i = 0; i < g_uLeafCount; ++i) |
|---|
| 73 | { |
|---|
| 74 | if (uInsane == g_uNodeIndex[i]) |
|---|
| 75 | continue; |
|---|
| 76 | Log(" %5u", g_uNodeIndex[i]); |
|---|
| 77 | } |
|---|
| 78 | Log("\n"); |
|---|
| 79 | |
|---|
| 80 | for (unsigned i = 0; i < g_uLeafCount; ++i) |
|---|
| 81 | { |
|---|
| 82 | if (uInsane == g_uNodeIndex[i]) |
|---|
| 83 | continue; |
|---|
| 84 | Log("%5u ", g_uNodeIndex[i]); |
|---|
| 85 | for (unsigned j = 0; j < g_uLeafCount; ++j) |
|---|
| 86 | { |
|---|
| 87 | if (uInsane == g_uNodeIndex[j]) |
|---|
| 88 | continue; |
|---|
| 89 | if (i == j) |
|---|
| 90 | Log(" "); |
|---|
| 91 | else |
|---|
| 92 | { |
|---|
| 93 | unsigned v = TriangleSubscript(i, j); |
|---|
| 94 | Log("%5.2g ", g_Dist[v]); |
|---|
| 95 | } |
|---|
| 96 | } |
|---|
| 97 | Log("\n"); |
|---|
| 98 | } |
|---|
| 99 | |
|---|
| 100 | Log("\n"); |
|---|
| 101 | Log(" i Node NrNb Dist\n"); |
|---|
| 102 | Log("----- ----- ----- --------\n"); |
|---|
| 103 | for (unsigned i = 0; i < g_uLeafCount; ++i) |
|---|
| 104 | { |
|---|
| 105 | if (uInsane == g_uNodeIndex[i]) |
|---|
| 106 | continue; |
|---|
| 107 | Log("%5u %5u %5u %8.3f\n", |
|---|
| 108 | i, |
|---|
| 109 | g_uNodeIndex[i], |
|---|
| 110 | g_uNearestNeighbor[i], |
|---|
| 111 | g_MinDist[i]); |
|---|
| 112 | } |
|---|
| 113 | |
|---|
| 114 | Log("\n"); |
|---|
| 115 | Log(" Node L R Height LLength RLength\n"); |
|---|
| 116 | Log("----- ----- ----- ------ ------- -------\n"); |
|---|
| 117 | for (unsigned i = 0; i <= g_uInternalNodeIndex; ++i) |
|---|
| 118 | Log("%5u %5u %5u %6.2g %6.2g %6.2g\n", |
|---|
| 119 | i, |
|---|
| 120 | g_uLeft[i], |
|---|
| 121 | g_uRight[i], |
|---|
| 122 | g_Height[i], |
|---|
| 123 | g_LeftLength[i], |
|---|
| 124 | g_RightLength[i]); |
|---|
| 125 | } |
|---|
| 126 | |
|---|
| 127 | void UPGMA2(const DistCalc &DC, Tree &tree, LINKAGE Linkage) |
|---|
| 128 | { |
|---|
| 129 | g_uLeafCount = DC.GetCount(); |
|---|
| 130 | |
|---|
| 131 | g_uTriangleSize = (g_uLeafCount*(g_uLeafCount - 1))/2; |
|---|
| 132 | g_uInternalNodeCount = g_uLeafCount - 1; |
|---|
| 133 | |
|---|
| 134 | g_Dist = new dist_t[g_uTriangleSize]; |
|---|
| 135 | |
|---|
| 136 | g_uNodeIndex = new unsigned[g_uLeafCount]; |
|---|
| 137 | g_uNearestNeighbor = new unsigned[g_uLeafCount]; |
|---|
| 138 | g_MinDist = new dist_t[g_uLeafCount]; |
|---|
| 139 | unsigned *Ids = new unsigned [g_uLeafCount]; |
|---|
| 140 | char **Names = new char *[g_uLeafCount]; |
|---|
| 141 | |
|---|
| 142 | g_uLeft = new unsigned[g_uInternalNodeCount]; |
|---|
| 143 | g_uRight = new unsigned[g_uInternalNodeCount]; |
|---|
| 144 | g_Height = new dist_t[g_uInternalNodeCount]; |
|---|
| 145 | g_LeftLength = new dist_t[g_uInternalNodeCount]; |
|---|
| 146 | g_RightLength = new dist_t[g_uInternalNodeCount]; |
|---|
| 147 | |
|---|
| 148 | for (unsigned i = 0; i < g_uLeafCount; ++i) |
|---|
| 149 | { |
|---|
| 150 | g_MinDist[i] = BIG_DIST; |
|---|
| 151 | g_uNodeIndex[i] = i; |
|---|
| 152 | g_uNearestNeighbor[i] = uInsane; |
|---|
| 153 | Ids[i] = DC.GetId(i); |
|---|
| 154 | Names[i] = strsave(DC.GetName(i)); |
|---|
| 155 | } |
|---|
| 156 | |
|---|
| 157 | for (unsigned i = 0; i < g_uInternalNodeCount; ++i) |
|---|
| 158 | { |
|---|
| 159 | g_uLeft[i] = uInsane; |
|---|
| 160 | g_uRight[i] = uInsane; |
|---|
| 161 | g_LeftLength[i] = BIG_DIST; |
|---|
| 162 | g_RightLength[i] = BIG_DIST; |
|---|
| 163 | g_Height[i] = BIG_DIST; |
|---|
| 164 | } |
|---|
| 165 | |
|---|
| 166 | // Compute initial NxN triangular distance matrix. |
|---|
| 167 | // Store minimum distance for each full (not triangular) row. |
|---|
| 168 | // Loop from 1, not 0, because "row" is 0, 1 ... i-1, |
|---|
| 169 | // so nothing to do when i=0. |
|---|
| 170 | for (unsigned i = 1; i < g_uLeafCount; ++i) |
|---|
| 171 | { |
|---|
| 172 | dist_t *Row = g_Dist + TriangleSubscript(i, 0); |
|---|
| 173 | DC.CalcDistRange(i, Row); |
|---|
| 174 | for (unsigned j = 0; j < i; ++j) |
|---|
| 175 | { |
|---|
| 176 | const dist_t d = Row[j]; |
|---|
| 177 | if (d < g_MinDist[i]) |
|---|
| 178 | { |
|---|
| 179 | g_MinDist[i] = d; |
|---|
| 180 | g_uNearestNeighbor[i] = j; |
|---|
| 181 | } |
|---|
| 182 | if (d < g_MinDist[j]) |
|---|
| 183 | { |
|---|
| 184 | g_MinDist[j] = d; |
|---|
| 185 | g_uNearestNeighbor[j] = i; |
|---|
| 186 | } |
|---|
| 187 | } |
|---|
| 188 | } |
|---|
| 189 | |
|---|
| 190 | #if TRACE |
|---|
| 191 | Log("Initial state:\n"); |
|---|
| 192 | ListState(); |
|---|
| 193 | #endif |
|---|
| 194 | |
|---|
| 195 | for (g_uInternalNodeIndex = 0; g_uInternalNodeIndex < g_uLeafCount - 1; |
|---|
| 196 | ++g_uInternalNodeIndex) |
|---|
| 197 | { |
|---|
| 198 | #if TRACE |
|---|
| 199 | Log("\n"); |
|---|
| 200 | Log("Internal node index %5u\n", g_uInternalNodeIndex); |
|---|
| 201 | Log("-------------------------\n"); |
|---|
| 202 | #endif |
|---|
| 203 | |
|---|
| 204 | // Find nearest neighbors |
|---|
| 205 | unsigned Lmin = uInsane; |
|---|
| 206 | unsigned Rmin = uInsane; |
|---|
| 207 | dist_t dtMinDist = BIG_DIST; |
|---|
| 208 | for (unsigned j = 0; j < g_uLeafCount; ++j) |
|---|
| 209 | { |
|---|
| 210 | if (uInsane == g_uNodeIndex[j]) |
|---|
| 211 | continue; |
|---|
| 212 | |
|---|
| 213 | dist_t d = g_MinDist[j]; |
|---|
| 214 | if (d < dtMinDist) |
|---|
| 215 | { |
|---|
| 216 | dtMinDist = d; |
|---|
| 217 | Lmin = j; |
|---|
| 218 | Rmin = g_uNearestNeighbor[j]; |
|---|
| 219 | assert(uInsane != Rmin); |
|---|
| 220 | assert(uInsane != g_uNodeIndex[Rmin]); |
|---|
| 221 | } |
|---|
| 222 | } |
|---|
| 223 | |
|---|
| 224 | assert(Lmin != uInsane); |
|---|
| 225 | assert(Rmin != uInsane); |
|---|
| 226 | assert(dtMinDist != BIG_DIST); |
|---|
| 227 | |
|---|
| 228 | #if TRACE |
|---|
| 229 | Log("Nearest neighbors Lmin %u[=%u] Rmin %u[=%u] dist %.3g\n", |
|---|
| 230 | Lmin, |
|---|
| 231 | g_uNodeIndex[Lmin], |
|---|
| 232 | Rmin, |
|---|
| 233 | g_uNodeIndex[Rmin], |
|---|
| 234 | dtMinDist); |
|---|
| 235 | #endif |
|---|
| 236 | |
|---|
| 237 | // Compute distances to new node |
|---|
| 238 | // New node overwrites row currently assigned to Lmin |
|---|
| 239 | dist_t dtNewMinDist = BIG_DIST; |
|---|
| 240 | unsigned uNewNearestNeighbor = uInsane; |
|---|
| 241 | for (unsigned j = 0; j < g_uLeafCount; ++j) |
|---|
| 242 | { |
|---|
| 243 | if (j == Lmin || j == Rmin) |
|---|
| 244 | continue; |
|---|
| 245 | if (uInsane == g_uNodeIndex[j]) |
|---|
| 246 | continue; |
|---|
| 247 | |
|---|
| 248 | const unsigned vL = TriangleSubscript(Lmin, j); |
|---|
| 249 | const unsigned vR = TriangleSubscript(Rmin, j); |
|---|
| 250 | const dist_t dL = g_Dist[vL]; |
|---|
| 251 | const dist_t dR = g_Dist[vR]; |
|---|
| 252 | dist_t dtNewDist; |
|---|
| 253 | |
|---|
| 254 | switch (Linkage) |
|---|
| 255 | { |
|---|
| 256 | case LINKAGE_Avg: |
|---|
| 257 | dtNewDist = AVG(dL, dR); |
|---|
| 258 | break; |
|---|
| 259 | |
|---|
| 260 | case LINKAGE_Min: |
|---|
| 261 | dtNewDist = MIN(dL, dR); |
|---|
| 262 | break; |
|---|
| 263 | |
|---|
| 264 | case LINKAGE_Max: |
|---|
| 265 | dtNewDist = MAX(dL, dR); |
|---|
| 266 | break; |
|---|
| 267 | |
|---|
| 268 | case LINKAGE_Biased: |
|---|
| 269 | dtNewDist = g_dSUEFF*AVG(dL, dR) + (1 - g_dSUEFF)*MIN(dL, dR); |
|---|
| 270 | break; |
|---|
| 271 | |
|---|
| 272 | default: |
|---|
| 273 | Quit("UPGMA2: Invalid LINKAGE_%u", Linkage); |
|---|
| 274 | } |
|---|
| 275 | |
|---|
| 276 | // Nasty special case. |
|---|
| 277 | // If nearest neighbor of j is Lmin or Rmin, then make the new |
|---|
| 278 | // node (which overwrites the row currently occupied by Lmin) |
|---|
| 279 | // the nearest neighbor. This situation can occur when there are |
|---|
| 280 | // equal distances in the matrix. If we don't make this fix, |
|---|
| 281 | // the nearest neighbor pointer for j would become invalid. |
|---|
| 282 | // (We don't need to test for == Lmin, because in that case |
|---|
| 283 | // the net change needed is zero due to the change in row |
|---|
| 284 | // numbering). |
|---|
| 285 | if (g_uNearestNeighbor[j] == Rmin) |
|---|
| 286 | g_uNearestNeighbor[j] = Lmin; |
|---|
| 287 | |
|---|
| 288 | #if TRACE |
|---|
| 289 | Log("New dist to %u = (%u/%.3g + %u/%.3g)/2 = %.3g\n", |
|---|
| 290 | j, Lmin, dL, Rmin, dR, dtNewDist); |
|---|
| 291 | #endif |
|---|
| 292 | g_Dist[vL] = dtNewDist; |
|---|
| 293 | if (dtNewDist < dtNewMinDist) |
|---|
| 294 | { |
|---|
| 295 | dtNewMinDist = dtNewDist; |
|---|
| 296 | uNewNearestNeighbor = j; |
|---|
| 297 | } |
|---|
| 298 | } |
|---|
| 299 | |
|---|
| 300 | assert(g_uInternalNodeIndex < g_uLeafCount - 1 || BIG_DIST != dtNewMinDist); |
|---|
| 301 | assert(g_uInternalNodeIndex < g_uLeafCount - 1 || uInsane != uNewNearestNeighbor); |
|---|
| 302 | |
|---|
| 303 | const unsigned v = TriangleSubscript(Lmin, Rmin); |
|---|
| 304 | const dist_t dLR = g_Dist[v]; |
|---|
| 305 | const dist_t dHeightNew = dLR/2; |
|---|
| 306 | const unsigned uLeft = g_uNodeIndex[Lmin]; |
|---|
| 307 | const unsigned uRight = g_uNodeIndex[Rmin]; |
|---|
| 308 | const dist_t HeightLeft = |
|---|
| 309 | uLeft < g_uLeafCount ? 0 : g_Height[uLeft - g_uLeafCount]; |
|---|
| 310 | const dist_t HeightRight = |
|---|
| 311 | uRight < g_uLeafCount ? 0 : g_Height[uRight - g_uLeafCount]; |
|---|
| 312 | |
|---|
| 313 | g_uLeft[g_uInternalNodeIndex] = uLeft; |
|---|
| 314 | g_uRight[g_uInternalNodeIndex] = uRight; |
|---|
| 315 | g_LeftLength[g_uInternalNodeIndex] = dHeightNew - HeightLeft; |
|---|
| 316 | g_RightLength[g_uInternalNodeIndex] = dHeightNew - HeightRight; |
|---|
| 317 | g_Height[g_uInternalNodeIndex] = dHeightNew; |
|---|
| 318 | |
|---|
| 319 | // Row for left child overwritten by row for new node |
|---|
| 320 | g_uNodeIndex[Lmin] = g_uLeafCount + g_uInternalNodeIndex; |
|---|
| 321 | g_uNearestNeighbor[Lmin] = uNewNearestNeighbor; |
|---|
| 322 | g_MinDist[Lmin] = dtNewMinDist; |
|---|
| 323 | |
|---|
| 324 | // Delete row for right child |
|---|
| 325 | g_uNodeIndex[Rmin] = uInsane; |
|---|
| 326 | |
|---|
| 327 | #if TRACE |
|---|
| 328 | Log("\nInternalNodeIndex=%u Lmin=%u Rmin=%u\n", |
|---|
| 329 | g_uInternalNodeIndex, Lmin, Rmin); |
|---|
| 330 | ListState(); |
|---|
| 331 | #endif |
|---|
| 332 | } |
|---|
| 333 | |
|---|
| 334 | unsigned uRoot = g_uLeafCount - 2; |
|---|
| 335 | tree.Create(g_uLeafCount, uRoot, g_uLeft, g_uRight, g_LeftLength, g_RightLength, |
|---|
| 336 | Ids, Names); |
|---|
| 337 | |
|---|
| 338 | #if TRACE |
|---|
| 339 | tree.LogMe(); |
|---|
| 340 | #endif |
|---|
| 341 | |
|---|
| 342 | delete[] g_Dist; |
|---|
| 343 | |
|---|
| 344 | delete[] g_uNodeIndex; |
|---|
| 345 | delete[] g_uNearestNeighbor; |
|---|
| 346 | delete[] g_MinDist; |
|---|
| 347 | delete[] g_Height; |
|---|
| 348 | |
|---|
| 349 | delete[] g_uLeft; |
|---|
| 350 | delete[] g_uRight; |
|---|
| 351 | delete[] g_LeftLength; |
|---|
| 352 | delete[] g_RightLength; |
|---|
| 353 | |
|---|
| 354 | for (unsigned i = 0; i < g_uLeafCount; ++i) |
|---|
| 355 | free(Names[i]); |
|---|
| 356 | delete[] Names; |
|---|
| 357 | delete[] Ids; |
|---|
| 358 | } |
|---|
| 359 | |
|---|
| 360 | class DistCalcTest : public DistCalc |
|---|
| 361 | { |
|---|
| 362 | virtual void CalcDistRange(unsigned i, dist_t Dist[]) const |
|---|
| 363 | { |
|---|
| 364 | static dist_t TestDist[5][5] = |
|---|
| 365 | { |
|---|
| 366 | 0, 2, 14, 14, 20, |
|---|
| 367 | 2, 0, 14, 14, 20, |
|---|
| 368 | 14, 14, 0, 4, 20, |
|---|
| 369 | 14, 14, 4, 0, 20, |
|---|
| 370 | 20, 20, 20, 20, 0, |
|---|
| 371 | }; |
|---|
| 372 | for (unsigned j = 0; j < i; ++j) |
|---|
| 373 | Dist[j] = TestDist[i][j]; |
|---|
| 374 | } |
|---|
| 375 | virtual unsigned GetCount() const |
|---|
| 376 | { |
|---|
| 377 | return 5; |
|---|
| 378 | } |
|---|
| 379 | virtual unsigned GetId(unsigned i) const |
|---|
| 380 | { |
|---|
| 381 | return i; |
|---|
| 382 | } |
|---|
| 383 | virtual const char *GetName(unsigned i) const |
|---|
| 384 | { |
|---|
| 385 | return "name"; |
|---|
| 386 | } |
|---|
| 387 | }; |
|---|
| 388 | |
|---|
| 389 | void Test() |
|---|
| 390 | { |
|---|
| 391 | SetListFileName("c:\\tmp\\lobster.log", false); |
|---|
| 392 | DistCalcTest DC; |
|---|
| 393 | Tree tree; |
|---|
| 394 | UPGMA2(DC, tree, LINKAGE_Avg); |
|---|
| 395 | } |
|---|