1 | #include "muscle.h" |
---|
2 | #include "msa.h" |
---|
3 | |
---|
4 | /*** |
---|
5 | Compute Henikoff weights. |
---|
6 | Steven Henikoff and Jorja G. Henikoff (1994), Position-based sequence weights. |
---|
7 | J. Mol. Biol., 243(4):574-578. |
---|
8 | |
---|
9 | Award each different residue an equal share of the weight, and then to divide up |
---|
10 | that weight equally among the sequences sharing the same residue. So if in a |
---|
11 | position of a multiple alignment, r different residues are represented, a residue |
---|
12 | represented in only one sequence contributes a score of 1/r to that sequence, whereas a |
---|
13 | residue represented in s sequences contributes a score of 1/rs to each of the s |
---|
14 | sequences. For each sequence, the contributions from each position are summed to give |
---|
15 | a sequence weight. |
---|
16 | |
---|
17 | See also HenikoffWeightPB. |
---|
18 | ***/ |
---|
19 | |
---|
20 | void MSA::CalcHenikoffWeightsCol(unsigned uColIndex) const |
---|
21 | { |
---|
22 | const unsigned uSeqCount = GetSeqCount(); |
---|
23 | |
---|
24 | // Compute letter counts in this column |
---|
25 | unsigned uLetterCount[MAX_ALPHA]; |
---|
26 | memset(uLetterCount, 0, sizeof(uLetterCount)); |
---|
27 | unsigned uDifferentLetterCount = 0; |
---|
28 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
---|
29 | { |
---|
30 | unsigned uLetter = GetLetterEx(uSeqIndex, uColIndex); |
---|
31 | if (uLetter >= 20) |
---|
32 | continue; |
---|
33 | unsigned uNewCount = uLetterCount[uLetter] + 1; |
---|
34 | uLetterCount[uLetter] = uNewCount; |
---|
35 | if (1 == uNewCount) |
---|
36 | ++uDifferentLetterCount; |
---|
37 | } |
---|
38 | |
---|
39 | // Compute weight contributions |
---|
40 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
---|
41 | { |
---|
42 | unsigned uLetter = GetLetterEx(uSeqIndex, uColIndex); |
---|
43 | if (uLetter >= 20) |
---|
44 | continue; |
---|
45 | const unsigned uCount = uLetterCount[uLetter]; |
---|
46 | unsigned uDenom = uCount*uDifferentLetterCount; |
---|
47 | if (uDenom == 0) |
---|
48 | continue; |
---|
49 | m_Weights[uSeqIndex] += (WEIGHT) (1.0/uDenom); |
---|
50 | } |
---|
51 | } |
---|
52 | |
---|
53 | void MSA::SetHenikoffWeights() const |
---|
54 | { |
---|
55 | const unsigned uColCount = GetColCount(); |
---|
56 | const unsigned uSeqCount = GetSeqCount(); |
---|
57 | |
---|
58 | if (0 == uSeqCount) |
---|
59 | return; |
---|
60 | else if (1 == uSeqCount) |
---|
61 | { |
---|
62 | m_Weights[0] = (WEIGHT) 1.0; |
---|
63 | return; |
---|
64 | } |
---|
65 | else if (2 == uSeqCount) |
---|
66 | { |
---|
67 | m_Weights[0] = (WEIGHT) 0.5; |
---|
68 | m_Weights[1] = (WEIGHT) 0.5; |
---|
69 | return; |
---|
70 | } |
---|
71 | |
---|
72 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
---|
73 | m_Weights[uSeqIndex] = 0.0; |
---|
74 | |
---|
75 | for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) |
---|
76 | CalcHenikoffWeightsCol(uColIndex); |
---|
77 | |
---|
78 | // Set all-gap seqs weight to 0 |
---|
79 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
---|
80 | if (IsGapSeq(uSeqIndex)) |
---|
81 | m_Weights[uSeqIndex] = 0.0; |
---|
82 | |
---|
83 | Normalize(m_Weights, uSeqCount); |
---|
84 | } |
---|