source: branches/alilink/GDE/MUSCLE/src/henikoffweightpb.cpp

Last change on this file was 10390, checked in by aboeckma, 12 years ago

added muscle sourcles amd makefile

File size: 3.6 KB
Line 
1#include "muscle.h"
2#include "msa.h"
3
4/***
5Compute Henikoff weights.
6Steven Henikoff and Jorja G. Henikoff (1994), Position-based sequence weights.
7J. Mol. Biol., 243(4):574-578.
8
9Award each different residue an equal share of the weight, and then to divide up
10that weight equally among the sequences sharing the same residue. So if in a
11position of a multiple alignment, r different residues are represented, a residue
12represented in only one sequence contributes a score of 1/r to that sequence, whereas a
13residue represented in s sequences contributes a score of 1/rs to each of the s
14sequences. For each sequence, the contributions from each position are summed to give
15a sequence weight.
16
17Here we use the variant from PSI-BLAST, which (a) treats gaps as a 21st letter,
18and (b) ignores columns that are perfectly conserved.
19
20>>> WARNING -- I SUSPECT THIS DOESN'T WORK CORRECTLY <<<
21***/
22
23void MSA::CalcHenikoffWeightsColPB(unsigned uColIndex) const
24        {
25        const unsigned uSeqCount = GetSeqCount();
26
27// Compute letter counts in this column
28        unsigned uLetterCount[MAX_ALPHA+1];
29        memset(uLetterCount, 0, (MAX_ALPHA+1)*sizeof(unsigned));
30        unsigned uLetter;
31        for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
32                {
33                if (IsGap(uSeqIndex, uColIndex) || IsWildcard(uSeqIndex, uColIndex))
34                        uLetter = MAX_ALPHA;
35                else
36                        uLetter = GetLetter(uSeqIndex, uColIndex);
37                ++(uLetterCount[uLetter]);
38                }
39
40// Check for special case of perfect conservation
41        for (unsigned uLetter = 0; uLetter < MAX_ALPHA+1; ++uLetter)
42                {
43                unsigned uCount = uLetterCount[uLetter];
44                if (uCount > 0)
45                        {
46                // Perfectly conserved?
47                        if (uCount == uSeqCount)
48                                return;
49                        else
50                        // If count > 0 but less than nr. sequences, can't be conserved
51                                break;
52                        }
53                }
54
55// Compute weight contributions
56        for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
57                {
58                unsigned uLetter;
59                if (IsGap(uSeqIndex, uColIndex) || IsWildcard(uSeqIndex, uColIndex))
60                        uLetter = MAX_ALPHA;
61                else
62                        uLetter = GetLetter(uSeqIndex, uColIndex);
63                const unsigned uCount = uLetterCount[uLetter];
64                m_Weights[uSeqIndex] += (WEIGHT) (1.0/uCount);
65                }
66        }
67
68bool MSA::IsGapSeq(unsigned uSeqIndex) const
69        {
70        const unsigned uColCount = GetColCount();
71        for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
72                if (!IsGap(uSeqIndex, uColIndex))
73                        return false;
74        return true;
75        }
76
77void MSA::SetUniformWeights() const
78        {
79        const unsigned uSeqCount = GetSeqCount();
80        if (0 == uSeqCount)
81                return;
82
83        const WEIGHT w = (WEIGHT) (1.0 / uSeqCount);
84        for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
85                m_Weights[uSeqIndex] = w;
86        }
87
88void MSA::SetHenikoffWeightsPB() const
89        {
90        const unsigned uColCount = GetColCount();
91        const unsigned uSeqCount = GetSeqCount();
92
93        if (0 == uSeqCount)
94                return;
95        else if (1 == uSeqCount)
96                {
97                m_Weights[0] = 1.0;
98                return;
99                }
100        else if (2 == uSeqCount)
101                {
102                m_Weights[0] = (WEIGHT) 0.5;
103                m_Weights[1] = (WEIGHT) 0.5;
104                return;
105                }
106
107        for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
108                m_Weights[uSeqIndex] = 0.0;
109
110        for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex)
111                CalcHenikoffWeightsColPB(uColIndex);
112
113// Set all-gap seqs weight to 0
114        for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex)
115                if (IsGapSeq(uSeqIndex))
116                        m_Weights[uSeqIndex] = 0.0;
117
118// Check for special case of identical sequences, which will cause all
119// columns to be skipped becasue they're perfectly conserved.
120        if (VectorIsZero(m_Weights, uSeqCount))
121                VectorSet(m_Weights, uSeqCount, 1.0);
122
123        Normalize(m_Weights, uSeqCount);
124        }
Note: See TracBrowser for help on using the repository browser.