1 | /*** |
---|
2 | Gerstein/Sonnhammer/Chothia ad hoc sequence weighting. |
---|
3 | The algorithm was deduced by reverse-engineering the |
---|
4 | HMMer code. |
---|
5 | |
---|
6 | I used an alternative representation that I prefer over |
---|
7 | HMMer's. The HMMer code is full of tree manipulations |
---|
8 | that do something to the left child and then the equivalent |
---|
9 | thing to the right child. It was clear that there must be |
---|
10 | a re-formulation that does everything once for each node, |
---|
11 | which would reduce the number of operations expressed |
---|
12 | in the code by a factor of two. This gives a more elegant |
---|
13 | and less error-prone way to code it. |
---|
14 | |
---|
15 | These notes explain the correspondence between my design |
---|
16 | and Eddy's. |
---|
17 | |
---|
18 | HMMer stores a data structure phylo_s for each non-leaf |
---|
19 | node in the cluster tree. This structure contains the |
---|
20 | following fields: |
---|
21 | |
---|
22 | diff Weight of the node |
---|
23 | lblen Left branch length |
---|
24 | rblen Right branch length |
---|
25 | |
---|
26 | The lblen and rblen branch lengths are calculated as: |
---|
27 | |
---|
28 | this.lblen = this.diff - left.diff |
---|
29 | this.rblen = this.diff - right.diff |
---|
30 | |
---|
31 | My code stores one ClusterNode data structure per node |
---|
32 | in the cluster tree, including leaves. I store only the |
---|
33 | weight. I can recover the HMMer branch length fields |
---|
34 | in a trivial O(1) calculation as follows: |
---|
35 | |
---|
36 | lblen = Node.GetWeight() - Node.GetLeft()->GetWeight() |
---|
37 | rblen = Node.GetWeight() - Node.GetRight()->GetWeight() |
---|
38 | |
---|
39 | For the GSC weights calculation, HMMer constructs the |
---|
40 | following vectors, which have entries for all nodes, |
---|
41 | including leaves: |
---|
42 | |
---|
43 | lwt Left weight |
---|
44 | rwt Right weight |
---|
45 | |
---|
46 | The "left weight" is calculated as the sum of the weights in |
---|
47 | all the nodes reachable through the left branch, including |
---|
48 | the node itself. (This is not immediately obvious from the |
---|
49 | code, which does the calculation using branch lengths rather |
---|
50 | than weights, but this is an equivalent, and to my mind clearer, |
---|
51 | statement of what they are). Similarly, the "right weight" is |
---|
52 | the sum of all weights reachable via the right branch. I define |
---|
53 | the "cluster weight" to be the summed weight of all nodes in the |
---|
54 | subtree under the node, including the node itself. I provide |
---|
55 | a function Node.GetClusterWeight() which calculates the cluster |
---|
56 | weight using a O(ln N) recursion through the tree. The lwt and |
---|
57 | rwt values can be recovered as follows: |
---|
58 | |
---|
59 | lwt = Node.GetLeft()->GetClusterWeight() |
---|
60 | + Node.GetWeight() |
---|
61 | |
---|
62 | lwt = Node.GetLeft()->GetClusterWeight() |
---|
63 | + Node.GetWeight() |
---|
64 | |
---|
65 | HMMer calculates a further vector fwt as follows. |
---|
66 | |
---|
67 | this.fwt = parent.fwt * parent.lwt / (parent.lwt + parent.rwt) |
---|
68 | |
---|
69 | This applies to nodes reached via a left branch, for nodes reached |
---|
70 | via a right branch: |
---|
71 | |
---|
72 | this.fwt = parent.fwt * parent.rwt / (parent.lwt + parent.rwt) |
---|
73 | |
---|
74 | The values of fwt at the leaf nodes are the final GSC weights. |
---|
75 | We derive the various terms using our equivalents. |
---|
76 | |
---|
77 | parent.lwt = Parent.GetLeft()->GetClusterWeight() |
---|
78 | + Parent.GetWeight() |
---|
79 | |
---|
80 | parent.rwt = Parent.GetRight()->GetClusterWeight() |
---|
81 | + Parent.GetWeight() |
---|
82 | |
---|
83 | parent.lwt + parent.rwt = |
---|
84 | { Parent.GetLeft()->GetClusterWeight() |
---|
85 | + Parent.GetRight()->GetClusterWeight() |
---|
86 | + Parent.GetWeight() } |
---|
87 | + Parent.GetWeight() |
---|
88 | |
---|
89 | We recognize the term {...} as the cluster weight of the |
---|
90 | parent, so |
---|
91 | |
---|
92 | parent.lwt + parent.rwt |
---|
93 | = Parent.GetClusterWeight() |
---|
94 | + Parent.GetWeight() |
---|
95 | |
---|
96 | As you would expect, repeating this exercise for parent.rwt gives |
---|
97 | exactly the same expression. |
---|
98 | |
---|
99 | The GSC weights (fwt) are stored in the Weight2 field of the cluster |
---|
100 | tree, the Weight field stores the original (BLOSUM) weights used |
---|
101 | as input to this algorithm. |
---|
102 | ***/ |
---|
103 | |
---|
104 | #include "muscle.h" |
---|
105 | #include "msa.h" |
---|
106 | #include "cluster.h" |
---|
107 | #include "distfunc.h" |
---|
108 | |
---|
109 | // Set weights of all sequences in the subtree under given node. |
---|
110 | void MSA::SetSubtreeWeight2(const ClusterNode *ptrNode) const |
---|
111 | { |
---|
112 | if (0 == ptrNode) |
---|
113 | return; |
---|
114 | |
---|
115 | const ClusterNode *ptrRight = ptrNode->GetRight(); |
---|
116 | const ClusterNode *ptrLeft = ptrNode->GetLeft(); |
---|
117 | |
---|
118 | // If leaf, set weight |
---|
119 | if (0 == ptrRight && 0 == ptrLeft) |
---|
120 | { |
---|
121 | unsigned uIndex = ptrNode->GetIndex(); |
---|
122 | double dWeight = ptrNode->GetWeight2(); |
---|
123 | WEIGHT w = DoubleToWeight(dWeight); |
---|
124 | m_Weights[uIndex] = w; |
---|
125 | return; |
---|
126 | } |
---|
127 | |
---|
128 | // Otherwise, recursively set subtrees |
---|
129 | SetSubtreeWeight2(ptrLeft); |
---|
130 | SetSubtreeWeight2(ptrRight); |
---|
131 | } |
---|
132 | |
---|
133 | void MSA::SetSubtreeGSCWeight(ClusterNode *ptrNode) const |
---|
134 | { |
---|
135 | if (0 == ptrNode) |
---|
136 | return; |
---|
137 | |
---|
138 | ClusterNode *ptrParent = ptrNode->GetParent(); |
---|
139 | double dParentWeight2 = ptrParent->GetWeight2(); |
---|
140 | double dParentClusterWeight = ptrParent->GetClusterWeight(); |
---|
141 | if (0.0 == dParentClusterWeight) |
---|
142 | { |
---|
143 | double dThisClusterSize = ptrNode->GetClusterSize(); |
---|
144 | double dParentClusterSize = ptrParent->GetClusterSize(); |
---|
145 | double dWeight2 = |
---|
146 | dParentWeight2*dThisClusterSize/dParentClusterSize; |
---|
147 | ptrNode->SetWeight2(dWeight2); |
---|
148 | } |
---|
149 | else |
---|
150 | { |
---|
151 | // Could cache cluster weights for better performance. |
---|
152 | // We calculate cluster weight of each node twice, so this |
---|
153 | // would give x2 improvement. |
---|
154 | // As weighting is not very expensive, we don't care. |
---|
155 | double dThisClusterWeight = ptrNode->GetClusterWeight(); |
---|
156 | double dParentWeight = ptrParent->GetWeight(); |
---|
157 | |
---|
158 | double dNum = dThisClusterWeight + dParentWeight; |
---|
159 | double dDenom = dParentClusterWeight + dParentWeight; |
---|
160 | double dWeight2 = dParentWeight2*(dNum/dDenom); |
---|
161 | |
---|
162 | ptrNode->SetWeight2(dWeight2); |
---|
163 | } |
---|
164 | |
---|
165 | SetSubtreeGSCWeight(ptrNode->GetLeft()); |
---|
166 | SetSubtreeGSCWeight(ptrNode->GetRight()); |
---|
167 | } |
---|
168 | |
---|
169 | void MSA::SetGSCWeights() const |
---|
170 | { |
---|
171 | ClusterTree CT; |
---|
172 | CalcBLOSUMWeights(CT); |
---|
173 | |
---|
174 | // Calculate weights and store in tree. |
---|
175 | ClusterNode *ptrRoot = CT.GetRoot(); |
---|
176 | ptrRoot->SetWeight2(1.0); |
---|
177 | SetSubtreeGSCWeight(ptrRoot->GetLeft()); |
---|
178 | SetSubtreeGSCWeight(ptrRoot->GetRight()); |
---|
179 | |
---|
180 | // Copy weights from tree to MSA. |
---|
181 | SetSubtreeWeight2(ptrRoot); |
---|
182 | } |
---|
183 | |
---|
184 | void MSA::ListWeights() const |
---|
185 | { |
---|
186 | const unsigned uSeqCount = GetSeqCount(); |
---|
187 | Log("Weights:\n"); |
---|
188 | WEIGHT wTotal = 0; |
---|
189 | for (unsigned n = 0; n < uSeqCount; ++n) |
---|
190 | { |
---|
191 | wTotal += GetSeqWeight(n); |
---|
192 | Log("%6.3f %s\n", GetSeqWeight(n), GetSeqName(n)); |
---|
193 | } |
---|
194 | Log("Total weights = %6.3f, should be 1.0\n", wTotal); |
---|
195 | } |
---|