source: branches/nameserver/GDE/MUSCLE/src/setgscweights.cpp

Last change on this file was 10390, checked in by aboeckma, 11 years ago

added muscle sourcles amd makefile

File size: 5.9 KB
Line 
1/***
2Gerstein/Sonnhammer/Chothia ad hoc sequence weighting.
3The algorithm was deduced by reverse-engineering the
4HMMer code.
5
6I used an alternative representation that I prefer over
7HMMer's. The HMMer code is full of tree manipulations
8that do something to the left child and then the equivalent
9thing to the right child. It was clear that there must be
10a re-formulation that does everything once for each node,
11which would reduce the number of operations expressed
12in the code by a factor of two. This gives a more elegant
13and less error-prone way to code it.
14
15These notes explain the correspondence between my design
16and Eddy's.
17
18HMMer stores a data structure phylo_s for each non-leaf
19node in the cluster tree. This structure contains the
20following fields:
21
22        diff            Weight of the node
23        lblen           Left branch length
24        rblen           Right branch length
25
26The lblen and rblen branch lengths are calculated as:
27
28        this.lblen = this.diff - left.diff
29        this.rblen = this.diff - right.diff
30
31My code stores one ClusterNode data structure per node
32in the cluster tree, including leaves. I store only the
33weight. I can recover the HMMer branch length fields
34in a trivial O(1) calculation as follows:
35
36        lblen = Node.GetWeight() - Node.GetLeft()->GetWeight()
37        rblen = Node.GetWeight() - Node.GetRight()->GetWeight()
38
39For the GSC weights calculation, HMMer constructs the
40following vectors, which have entries for all nodes,
41including leaves:
42
43        lwt             Left weight
44        rwt             Right weight
45
46The "left weight" is calculated as the sum of the weights in
47all the nodes reachable through the left branch, including
48the node itself. (This is not immediately obvious from the
49code, which does the calculation using branch lengths rather
50than weights, but this is an equivalent, and to my mind clearer,
51statement of what they are). Similarly, the "right weight" is
52the sum of all weights reachable via the right branch. I define
53the "cluster weight" to be the summed weight of all nodes in the
54subtree under the node, including the node itself. I provide
55a function Node.GetClusterWeight() which calculates the cluster
56weight using a O(ln N) recursion through the tree. The lwt and
57rwt values can be recovered as follows:
58
59        lwt             = Node.GetLeft()->GetClusterWeight()
60                        + Node.GetWeight()
61
62        lwt             = Node.GetLeft()->GetClusterWeight()
63                        + Node.GetWeight()
64
65HMMer calculates a further vector fwt as follows.
66
67        this.fwt = parent.fwt * parent.lwt / (parent.lwt + parent.rwt)
68
69This applies to nodes reached via a left branch, for nodes reached
70via a right branch:
71
72        this.fwt = parent.fwt * parent.rwt / (parent.lwt + parent.rwt)
73
74The values of fwt at the leaf nodes are the final GSC weights.
75We derive the various terms using our equivalents.
76
77        parent.lwt      = Parent.GetLeft()->GetClusterWeight()
78                                + Parent.GetWeight()
79
80        parent.rwt      = Parent.GetRight()->GetClusterWeight()
81                                + Parent.GetWeight()
82
83        parent.lwt + parent.rwt =
84                                { Parent.GetLeft()->GetClusterWeight()
85                                + Parent.GetRight()->GetClusterWeight()
86                                + Parent.GetWeight() }
87                                + Parent.GetWeight()
88
89We recognize the term {...} as the cluster weight of the
90parent, so
91
92        parent.lwt + parent.rwt
93                                = Parent.GetClusterWeight()
94                                + Parent.GetWeight()
95
96As you would expect, repeating this exercise for parent.rwt gives
97exactly the same expression.
98
99The GSC weights (fwt) are stored in the Weight2 field of the cluster
100tree, the Weight field stores the original (BLOSUM) weights used
101as input to this algorithm.
102***/
103
104#include "muscle.h"
105#include "msa.h"
106#include "cluster.h"
107#include "distfunc.h"
108
109// Set weights of all sequences in the subtree under given node.
110void MSA::SetSubtreeWeight2(const ClusterNode *ptrNode) const
111        {
112        if (0 == ptrNode)
113                return;
114
115        const ClusterNode *ptrRight = ptrNode->GetRight();
116        const ClusterNode *ptrLeft = ptrNode->GetLeft();
117
118// If leaf, set weight
119        if (0 == ptrRight && 0 == ptrLeft)
120                {
121                unsigned uIndex = ptrNode->GetIndex();
122                double dWeight = ptrNode->GetWeight2();
123                WEIGHT w = DoubleToWeight(dWeight);
124                m_Weights[uIndex] = w;
125                return;
126                }
127
128// Otherwise, recursively set subtrees
129        SetSubtreeWeight2(ptrLeft);
130        SetSubtreeWeight2(ptrRight);
131        }
132
133void MSA::SetSubtreeGSCWeight(ClusterNode *ptrNode) const
134        {
135        if (0 == ptrNode)
136                return;
137
138        ClusterNode *ptrParent = ptrNode->GetParent();
139        double dParentWeight2 = ptrParent->GetWeight2();
140        double dParentClusterWeight = ptrParent->GetClusterWeight();
141        if (0.0 == dParentClusterWeight)
142                {
143                double dThisClusterSize = ptrNode->GetClusterSize();
144                double dParentClusterSize = ptrParent->GetClusterSize();
145                double dWeight2 =
146                  dParentWeight2*dThisClusterSize/dParentClusterSize;
147                ptrNode->SetWeight2(dWeight2);
148                }
149        else
150                {
151        // Could cache cluster weights for better performance.
152        // We calculate cluster weight of each node twice, so this
153        // would give x2 improvement.
154        // As weighting is not very expensive, we don't care.
155                double dThisClusterWeight = ptrNode->GetClusterWeight();
156                double dParentWeight = ptrParent->GetWeight();
157
158                double dNum = dThisClusterWeight + dParentWeight;
159                double dDenom = dParentClusterWeight + dParentWeight;
160                double dWeight2 = dParentWeight2*(dNum/dDenom);
161
162                ptrNode->SetWeight2(dWeight2);
163                }
164
165        SetSubtreeGSCWeight(ptrNode->GetLeft());
166        SetSubtreeGSCWeight(ptrNode->GetRight());
167        }
168
169void MSA::SetGSCWeights() const
170        {
171        ClusterTree CT;
172        CalcBLOSUMWeights(CT);
173
174// Calculate weights and store in tree.
175        ClusterNode *ptrRoot = CT.GetRoot();
176        ptrRoot->SetWeight2(1.0);
177        SetSubtreeGSCWeight(ptrRoot->GetLeft());
178        SetSubtreeGSCWeight(ptrRoot->GetRight());
179
180// Copy weights from tree to MSA.
181        SetSubtreeWeight2(ptrRoot);
182        }
183 
184void MSA::ListWeights() const
185        {
186        const unsigned uSeqCount = GetSeqCount();
187        Log("Weights:\n");
188        WEIGHT wTotal = 0;
189        for (unsigned n = 0; n < uSeqCount; ++n)
190                {
191                wTotal += GetSeqWeight(n);
192                Log("%6.3f %s\n", GetSeqWeight(n), GetSeqName(n));
193                }
194        Log("Total weights = %6.3f, should be 1.0\n", wTotal);
195        }
Note: See TracBrowser for help on using the repository browser.