1 | /*** |
---|
2 | Code for implementing HMMer's "BLOSUM weighting" algorithm. |
---|
3 | |
---|
4 | The algorithm was deduced by reverse-engineering the HMMer code. |
---|
5 | |
---|
6 | The HMMer documentation refers to BLOSUM weighting as "Henikoff |
---|
7 | simple filter weighting" |
---|
8 | |
---|
9 | The name BLOSUM implied to me that HMMer would be using a |
---|
10 | substitution probability matrix to compute distances, but this |
---|
11 | turned out not to be the case. |
---|
12 | |
---|
13 | It is notable, not to say puzzling, that the HMMer BLOSUM weighting |
---|
14 | algorithm is guaranteed to produce an integral NIC (number-of-independent- |
---|
15 | counts, also known as effective sequence count). Presumably Eddy must |
---|
16 | have known this, though he doesn't comment on it and he computes & stores |
---|
17 | the value in a float. |
---|
18 | |
---|
19 | Here's the algorithm: |
---|
20 | |
---|
21 | Distances between two sequences are based on the average of a simple |
---|
22 | binary equal (one) / not equal (zero) at each position. The only thing |
---|
23 | that has anything to do with BLOSUM in this calculation is an obscure |
---|
24 | (to me) constant value of 0.62. The sequences are clustered using this |
---|
25 | distance. If the pairwise identity (fraction of identical positions) |
---|
26 | is less than 0.62, they get assigned to disjoint clusters; the final |
---|
27 | number of disjoint clusters is the NIC. This makes some intuitive sense: |
---|
28 | I would interpret this by saying that if a set of sequences are close |
---|
29 | enough they count as one sequence. The weight for each sequence within a |
---|
30 | disjoint cluster is then determined to be 1 / (clustersize), from which it |
---|
31 | follows that the sum of all weights is equal to the number of disjoint |
---|
32 | clusters and is thus guaranteed to be an integer value. It is exactly this |
---|
33 | sum that HMMer uses for the NIC, by default. |
---|
34 | |
---|
35 | The individual BLOSUM sequence weights are not used for anything else in |
---|
36 | HMMer, unless you specify that BLOSUM weighting should override the default |
---|
37 | GSC weighting. GSC weighting uses a different clustering algorithm to |
---|
38 | determine relative weights. The BLOSUM NIC is then distributed over the |
---|
39 | GSC tree according to those relative weights. |
---|
40 | ***/ |
---|
41 | |
---|
42 | #include "muscle.h" |
---|
43 | #include "msa.h" |
---|
44 | #include "cluster.h" |
---|
45 | #include "distfunc.h" |
---|
46 | |
---|
47 | // Set weights of all sequences in the subtree under given node. |
---|
48 | void MSA::SetBLOSUMSubtreeWeight(const ClusterNode *ptrNode, double dWeight) const |
---|
49 | { |
---|
50 | if (0 == ptrNode) |
---|
51 | return; |
---|
52 | |
---|
53 | const ClusterNode *ptrRight = ptrNode->GetRight(); |
---|
54 | const ClusterNode *ptrLeft = ptrNode->GetLeft(); |
---|
55 | |
---|
56 | // If leaf, set weight |
---|
57 | if (0 == ptrRight && 0 == ptrLeft) |
---|
58 | { |
---|
59 | unsigned uIndex = ptrNode->GetIndex(); |
---|
60 | WEIGHT w = DoubleToWeight(dWeight); |
---|
61 | m_Weights[uIndex] = w; |
---|
62 | return; |
---|
63 | } |
---|
64 | |
---|
65 | // Otherwise, recursively set subtrees |
---|
66 | SetBLOSUMSubtreeWeight(ptrLeft, dWeight); |
---|
67 | SetBLOSUMSubtreeWeight(ptrRight, dWeight); |
---|
68 | } |
---|
69 | |
---|
70 | // Traverse a subtree looking for clusters where all |
---|
71 | // the leaves are sufficiently similar that they |
---|
72 | // should be weighted as a group, i.e. given a weight |
---|
73 | // of 1/N where N is the cluster size. The idea is |
---|
74 | // to avoid sample bias where we have closely related |
---|
75 | // sequences in the input alignment. |
---|
76 | // The weight at a node is the distance between |
---|
77 | // the two closest sequences in the left and right |
---|
78 | // subtrees under that node. "Sufficiently similar" |
---|
79 | // is defined as being where that minimum distance |
---|
80 | // is less than the dMinDist threshhold. I don't know |
---|
81 | // why the clustering is done using a minimum rather |
---|
82 | // than a maximum or average, either of which would |
---|
83 | // seem more natural to me. |
---|
84 | // Return value is number of groups under this node. |
---|
85 | // A "group" is the cluster found under a node with a |
---|
86 | // weight less than the minimum. |
---|
87 | unsigned MSA::SetBLOSUMNodeWeight(const ClusterNode *ptrNode, double dMinDist) const |
---|
88 | { |
---|
89 | if (0 == ptrNode) |
---|
90 | return 0; |
---|
91 | |
---|
92 | if (ptrNode->GetWeight() < dMinDist) |
---|
93 | { |
---|
94 | unsigned uClusterSize = ptrNode->GetClusterSize(); |
---|
95 | assert(uClusterSize > 0); |
---|
96 | double dWeight = 1.0 / uClusterSize; |
---|
97 | SetBLOSUMSubtreeWeight(ptrNode, dWeight); |
---|
98 | return 1; |
---|
99 | } |
---|
100 | |
---|
101 | const ClusterNode *ptrLeft = ptrNode->GetLeft(); |
---|
102 | const ClusterNode *ptrRight = ptrNode->GetRight(); |
---|
103 | |
---|
104 | unsigned uLeftGroupCount = SetBLOSUMNodeWeight(ptrLeft, dMinDist); |
---|
105 | unsigned uRightGroupCount = SetBLOSUMNodeWeight(ptrRight, dMinDist); |
---|
106 | |
---|
107 | return uLeftGroupCount + uRightGroupCount; |
---|
108 | } |
---|
109 | |
---|
110 | // Return value is the group count, i.e. the effective number |
---|
111 | // of distinctly different sequences. |
---|
112 | unsigned MSA::CalcBLOSUMWeights(ClusterTree &BlosumCluster) const |
---|
113 | { |
---|
114 | // Build distance matrix |
---|
115 | DistFunc DF; |
---|
116 | unsigned uSeqCount = GetSeqCount(); |
---|
117 | DF.SetCount(uSeqCount); |
---|
118 | for (unsigned i = 0; i < uSeqCount; ++i) |
---|
119 | for (unsigned j = i+1; j < uSeqCount; ++j) |
---|
120 | { |
---|
121 | double dDist = GetPctIdentityPair(i, j); |
---|
122 | assert(dDist >= 0.0 && dDist <= 1.0); |
---|
123 | DF.SetDist(i, j, (float) (1.0 - dDist)); |
---|
124 | } |
---|
125 | |
---|
126 | // Cluster based on the distance function |
---|
127 | BlosumCluster.Create(DF); |
---|
128 | |
---|
129 | // Return value is HMMer's "effective sequence count". |
---|
130 | return SetBLOSUMNodeWeight(BlosumCluster.GetRoot(), 1.0 - BLOSUM_DIST); |
---|
131 | } |
---|