source: branches/help/GDE/MUSCLE/src/setblosumweights.cpp

Last change on this file was 10390, checked in by aboeckma, 11 years ago

added muscle sourcles amd makefile

File size: 4.8 KB
Line 
1/***
2Code for implementing HMMer's "BLOSUM weighting" algorithm.
3
4The algorithm was deduced by reverse-engineering the HMMer code.
5
6The HMMer documentation refers to BLOSUM weighting as "Henikoff
7simple filter weighting"
8
9The name BLOSUM implied to me that HMMer would be using a
10substitution probability matrix to compute distances, but this
11turned out not to be the case.
12
13It is notable, not to say puzzling, that the HMMer BLOSUM weighting
14algorithm  is guaranteed to produce an integral NIC (number-of-indepdent-
15counts, also known as effective sequence count). Presumably Eddy must
16have known this, though he doesn't comment on it and he computes & stores
17the value in a float.
18
19Here's the algorithm:
20
21Distances between two sequences are based on the average of a simple
22binary equal (one) / not equal (zero) at each position. The only thing
23that has  anything to do with BLOSUM in this calculation is an obscure
24(to me) constant  value of 0.62. The sequences are clustered using this
25distance. If the pairwise identity (fraction of  identical positions)
26is less than 0.62, they get assigned to disjoint clusters, the final
27number of disjoint clusters is the NIC. This makes some intuitive sense:
28I would interpret this by saying that if a set of sequences are close
29enough they count as one sequence. The weight for each sequence within a
30disjoint cluster is then determined to be 1 / (clustersize), from which it
31follows that the sum of all weights is equal to the number of disjoint
32clusters and is thus guaranteed to be an integer value. It is exactly this
33sum that HMMer uses for the NIC, by default.
34
35The individual BLOSUM sequence weights are not used for anything else in
36HMMer, unless you specify that BLOSUM weighting should override the default
37GSC  weighting. GSC weighting uses a different clustering algorithm to
38determine  relative weights. The BLOSUM NIC is then distributed over the
39GSC tree according to those relative weights.
40***/
41
42#include "muscle.h"
43#include "msa.h"
44#include "cluster.h"
45#include "distfunc.h"
46
47// Set weights of all sequences in the subtree under given node.
48void MSA::SetBLOSUMSubtreeWeight(const ClusterNode *ptrNode, double dWeight) const
49        {
50        if (0 == ptrNode)
51                return;
52
53        const ClusterNode *ptrRight = ptrNode->GetRight();
54        const ClusterNode *ptrLeft = ptrNode->GetLeft();
55
56// If leaf, set weight
57        if (0 == ptrRight && 0 == ptrLeft)
58                {
59                unsigned uIndex = ptrNode->GetIndex();
60                WEIGHT w = DoubleToWeight(dWeight);
61                m_Weights[uIndex] = w;
62                return;
63                }
64
65// Otherwise, recursively set subtrees
66        SetBLOSUMSubtreeWeight(ptrLeft, dWeight);
67        SetBLOSUMSubtreeWeight(ptrRight, dWeight);
68        }
69
70// Traverse a subtree looking for clusters where all
71// the leaves are sufficiently similar that they
72// should be weighted as a group, i.e. given a weight
73// of 1/N where N is the cluster size. The idea is
74// to avoid sample bias where we have closely related
75// sequences in the input alignment.
76// The weight at a node is the distance between
77// the two closest sequences in the left and right
78// subtrees under that node. "Sufficiently similar"
79// is defined as being where that minimum distance
80// is less than the dMinDist threshhold. I don't know
81// why the clustering is done using a minimum rather
82// than a maximum or average, either of which would
83// seem more natural to me.
84// Return value is number of groups under this node.
85// A "group" is the cluster found under a node with a
86// weight less than the minimum.
87unsigned MSA::SetBLOSUMNodeWeight(const ClusterNode *ptrNode, double dMinDist) const
88        {
89        if (0 == ptrNode)
90                return 0;
91
92        if (ptrNode->GetWeight() < dMinDist)
93                {
94                unsigned uClusterSize = ptrNode->GetClusterSize(); 
95                assert(uClusterSize > 0);
96                double dWeight = 1.0 / uClusterSize;
97                SetBLOSUMSubtreeWeight(ptrNode, dWeight);
98                return 1;
99                }
100
101        const ClusterNode *ptrLeft = ptrNode->GetLeft();
102        const ClusterNode *ptrRight = ptrNode->GetRight();
103
104        unsigned uLeftGroupCount = SetBLOSUMNodeWeight(ptrLeft, dMinDist);
105        unsigned uRightGroupCount = SetBLOSUMNodeWeight(ptrRight, dMinDist);
106
107        return uLeftGroupCount + uRightGroupCount;
108        }
109
110// Return value is the group count, i.e. the effective number
111// of distinctly different sequences.
112unsigned MSA::CalcBLOSUMWeights(ClusterTree &BlosumCluster) const
113        {
114// Build distance matrix
115        DistFunc DF;
116        unsigned uSeqCount = GetSeqCount();
117        DF.SetCount(uSeqCount);
118        for (unsigned i = 0; i < uSeqCount; ++i)
119                for (unsigned j = i+1; j < uSeqCount; ++j)
120                        {
121                        double dDist = GetPctIdentityPair(i, j);
122                        assert(dDist >= 0.0 && dDist <= 1.0);
123                        DF.SetDist(i, j, (float) (1.0 - dDist));
124                        }
125
126// Cluster based on the distance function
127        BlosumCluster.Create(DF);
128
129// Return value is HMMer's "effective sequence count".
130        return SetBLOSUMNodeWeight(BlosumCluster.GetRoot(), 1.0 - BLOSUM_DIST);
131        }
Note: See TracBrowser for help on using the repository browser.