| 1 | /*** |
|---|
| 2 | Code for implementing HMMer's "BLOSUM weighting" algorithm. |
|---|
| 3 | |
|---|
| 4 | The algorithm was deduced by reverse-engineering the HMMer code. |
|---|
| 5 | |
|---|
| 6 | The HMMer documentation refers to BLOSUM weighting as "Henikoff |
|---|
| 7 | simple filter weighting" |
|---|
| 8 | |
|---|
| 9 | The name BLOSUM implied to me that HMMer would be using a |
|---|
| 10 | substitution probability matrix to compute distances, but this |
|---|
| 11 | turned out not to be the case. |
|---|
| 12 | |
|---|
| 13 | It is notable, not to say puzzling, that the HMMer BLOSUM weighting |
|---|
| 14 | algorithm is guaranteed to produce an integral NIC (number-of-independent- |
|---|
| 15 | counts, also known as effective sequence count). Presumably Eddy must |
|---|
| 16 | have known this, though he doesn't comment on it and he computes & stores |
|---|
| 17 | the value in a float. |
|---|
| 18 | |
|---|
| 19 | Here's the algorithm: |
|---|
| 20 | |
|---|
| 21 | Distances between two sequences are based on the average of a simple |
|---|
| 22 | binary equal (one) / not equal (zero) at each position. The only thing |
|---|
| 23 | that has anything to do with BLOSUM in this calculation is an obscure |
|---|
| 24 | (to me) constant value of 0.62. The sequences are clustered using this |
|---|
| 25 | distance. If the pairwise identity (fraction of identical positions) |
|---|
| 26 | is less than 0.62, they get assigned to disjoint clusters; the final |
|---|
| 27 | number of disjoint clusters is the NIC. This makes some intuitive sense: |
|---|
| 28 | I would interpret this by saying that if a set of sequences are close |
|---|
| 29 | enough they count as one sequence. The weight for each sequence within a |
|---|
| 30 | disjoint cluster is then determined to be 1 / (clustersize), from which it |
|---|
| 31 | follows that the sum of all weights is equal to the number of disjoint |
|---|
| 32 | clusters and is thus guaranteed to be an integer value. It is exactly this |
|---|
| 33 | sum that HMMer uses for the NIC, by default. |
|---|
| 34 | |
|---|
| 35 | The individual BLOSUM sequence weights are not used for anything else in |
|---|
| 36 | HMMer, unless you specify that BLOSUM weighting should override the default |
|---|
| 37 | GSC weighting. GSC weighting uses a different clustering algorithm to |
|---|
| 38 | determine relative weights. The BLOSUM NIC is then distributed over the |
|---|
| 39 | GSC tree according to those relative weights. |
|---|
| 40 | ***/ |
|---|
| 41 | |
|---|
| 42 | #include "muscle.h" |
|---|
| 43 | #include "msa.h" |
|---|
| 44 | #include "cluster.h" |
|---|
| 45 | #include "distfunc.h" |
|---|
| 46 | |
|---|
| 47 | // Set weights of all sequences in the subtree under given node. |
|---|
| 48 | void MSA::SetBLOSUMSubtreeWeight(const ClusterNode *ptrNode, double dWeight) const |
|---|
| 49 | { |
|---|
| 50 | if (0 == ptrNode) |
|---|
| 51 | return; |
|---|
| 52 | |
|---|
| 53 | const ClusterNode *ptrRight = ptrNode->GetRight(); |
|---|
| 54 | const ClusterNode *ptrLeft = ptrNode->GetLeft(); |
|---|
| 55 | |
|---|
| 56 | // If leaf, set weight |
|---|
| 57 | if (0 == ptrRight && 0 == ptrLeft) |
|---|
| 58 | { |
|---|
| 59 | unsigned uIndex = ptrNode->GetIndex(); |
|---|
| 60 | WEIGHT w = DoubleToWeight(dWeight); |
|---|
| 61 | m_Weights[uIndex] = w; |
|---|
| 62 | return; |
|---|
| 63 | } |
|---|
| 64 | |
|---|
| 65 | // Otherwise, recursively set subtrees |
|---|
| 66 | SetBLOSUMSubtreeWeight(ptrLeft, dWeight); |
|---|
| 67 | SetBLOSUMSubtreeWeight(ptrRight, dWeight); |
|---|
| 68 | } |
|---|
| 69 | |
|---|
| 70 | // Traverse a subtree looking for clusters where all |
|---|
| 71 | // the leaves are sufficiently similar that they |
|---|
| 72 | // should be weighted as a group, i.e. given a weight |
|---|
| 73 | // of 1/N where N is the cluster size. The idea is |
|---|
| 74 | // to avoid sample bias where we have closely related |
|---|
| 75 | // sequences in the input alignment. |
|---|
| 76 | // The weight at a node is the distance between |
|---|
| 77 | // the two closest sequences in the left and right |
|---|
| 78 | // subtrees under that node. "Sufficiently similar" |
|---|
| 79 | // is defined as being where that minimum distance |
|---|
| 80 | // is less than the dMinDist threshhold. I don't know |
|---|
| 81 | // why the clustering is done using a minimum rather |
|---|
| 82 | // than a maximum or average, either of which would |
|---|
| 83 | // seem more natural to me. |
|---|
| 84 | // Return value is number of groups under this node. |
|---|
| 85 | // A "group" is the cluster found under a node with a |
|---|
| 86 | // weight less than the minimum. |
|---|
| 87 | unsigned MSA::SetBLOSUMNodeWeight(const ClusterNode *ptrNode, double dMinDist) const |
|---|
| 88 | { |
|---|
| 89 | if (0 == ptrNode) |
|---|
| 90 | return 0; |
|---|
| 91 | |
|---|
| 92 | if (ptrNode->GetWeight() < dMinDist) |
|---|
| 93 | { |
|---|
| 94 | unsigned uClusterSize = ptrNode->GetClusterSize(); |
|---|
| 95 | assert(uClusterSize > 0); |
|---|
| 96 | double dWeight = 1.0 / uClusterSize; |
|---|
| 97 | SetBLOSUMSubtreeWeight(ptrNode, dWeight); |
|---|
| 98 | return 1; |
|---|
| 99 | } |
|---|
| 100 | |
|---|
| 101 | const ClusterNode *ptrLeft = ptrNode->GetLeft(); |
|---|
| 102 | const ClusterNode *ptrRight = ptrNode->GetRight(); |
|---|
| 103 | |
|---|
| 104 | unsigned uLeftGroupCount = SetBLOSUMNodeWeight(ptrLeft, dMinDist); |
|---|
| 105 | unsigned uRightGroupCount = SetBLOSUMNodeWeight(ptrRight, dMinDist); |
|---|
| 106 | |
|---|
| 107 | return uLeftGroupCount + uRightGroupCount; |
|---|
| 108 | } |
|---|
| 109 | |
|---|
| 110 | // Return value is the group count, i.e. the effective number |
|---|
| 111 | // of distinctly different sequences. |
|---|
| 112 | unsigned MSA::CalcBLOSUMWeights(ClusterTree &BlosumCluster) const |
|---|
| 113 | { |
|---|
| 114 | // Build distance matrix |
|---|
| 115 | DistFunc DF; |
|---|
| 116 | unsigned uSeqCount = GetSeqCount(); |
|---|
| 117 | DF.SetCount(uSeqCount); |
|---|
| 118 | for (unsigned i = 0; i < uSeqCount; ++i) |
|---|
| 119 | for (unsigned j = i+1; j < uSeqCount; ++j) |
|---|
| 120 | { |
|---|
| 121 | double dDist = GetPctIdentityPair(i, j); |
|---|
| 122 | assert(dDist >= 0.0 && dDist <= 1.0); |
|---|
| 123 | DF.SetDist(i, j, (float) (1.0 - dDist)); |
|---|
| 124 | } |
|---|
| 125 | |
|---|
| 126 | // Cluster based on the distance function |
|---|
| 127 | BlosumCluster.Create(DF); |
|---|
| 128 | |
|---|
| 129 | // Return value is HMMer's "effective sequence count". |
|---|
| 130 | return SetBLOSUMNodeWeight(BlosumCluster.GetRoot(), 1.0 - BLOSUM_DIST); |
|---|
| 131 | } |
|---|