1 | #include "muscle.h" |
---|
2 | #include "msa.h" |
---|
3 | |
---|
4 | /*** |
---|
5 | Compute Henikoff weights. |
---|
6 | Steven Henikoff and Jorja G. Henikoff (1994), Position-based sequence weights. |
---|
7 | J. Mol. Biol., 243(4):574-578. |
---|
8 | |
---|
9 | Award each different residue an equal share of the weight, and then to divide up |
---|
10 | that weight equally among the sequences sharing the same residue. So if in a |
---|
11 | position of a multiple alignment, r different residues are represented, a residue |
---|
12 | represented in only one sequence contributes a score of 1/r to that sequence, whereas a |
---|
13 | residue represented in s sequences contributes a score of 1/rs to each of the s |
---|
14 | sequences. For each sequence, the contributions from each position are summed to give |
---|
15 | a sequence weight. |
---|
16 | |
---|
17 | Here we use the variant from PSI-BLAST, which (a) treats gaps as a 21st letter, |
---|
18 | and (b) ignores columns that are perfectly conserved. |
---|
19 | |
---|
20 | >>> WARNING -- I SUSPECT THIS DOESN'T WORK CORRECTLY <<< |
---|
21 | ***/ |
---|
22 | |
---|
23 | void MSA::CalcHenikoffWeightsColPB(unsigned uColIndex) const |
---|
24 | { |
---|
25 | const unsigned uSeqCount = GetSeqCount(); |
---|
26 | |
---|
27 | // Compute letter counts in this column |
---|
28 | unsigned uLetterCount[MAX_ALPHA+1]; |
---|
29 | memset(uLetterCount, 0, (MAX_ALPHA+1)*sizeof(unsigned)); |
---|
30 | unsigned uLetter; |
---|
31 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
---|
32 | { |
---|
33 | if (IsGap(uSeqIndex, uColIndex) || IsWildcard(uSeqIndex, uColIndex)) |
---|
34 | uLetter = MAX_ALPHA; |
---|
35 | else |
---|
36 | uLetter = GetLetter(uSeqIndex, uColIndex); |
---|
37 | ++(uLetterCount[uLetter]); |
---|
38 | } |
---|
39 | |
---|
40 | // Check for special case of perfect conservation |
---|
41 | for (unsigned uLetter = 0; uLetter < MAX_ALPHA+1; ++uLetter) |
---|
42 | { |
---|
43 | unsigned uCount = uLetterCount[uLetter]; |
---|
44 | if (uCount > 0) |
---|
45 | { |
---|
46 | // Perfectly conserved? |
---|
47 | if (uCount == uSeqCount) |
---|
48 | return; |
---|
49 | else |
---|
50 | // If count > 0 but less than nr. sequences, can't be conserved |
---|
51 | break; |
---|
52 | } |
---|
53 | } |
---|
54 | |
---|
55 | // Compute weight contributions |
---|
56 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
---|
57 | { |
---|
58 | unsigned uLetter; |
---|
59 | if (IsGap(uSeqIndex, uColIndex) || IsWildcard(uSeqIndex, uColIndex)) |
---|
60 | uLetter = MAX_ALPHA; |
---|
61 | else |
---|
62 | uLetter = GetLetter(uSeqIndex, uColIndex); |
---|
63 | const unsigned uCount = uLetterCount[uLetter]; |
---|
64 | m_Weights[uSeqIndex] += (WEIGHT) (1.0/uCount); |
---|
65 | } |
---|
66 | } |
---|
67 | |
---|
68 | bool MSA::IsGapSeq(unsigned uSeqIndex) const |
---|
69 | { |
---|
70 | const unsigned uColCount = GetColCount(); |
---|
71 | for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) |
---|
72 | if (!IsGap(uSeqIndex, uColIndex)) |
---|
73 | return false; |
---|
74 | return true; |
---|
75 | } |
---|
76 | |
---|
77 | void MSA::SetUniformWeights() const |
---|
78 | { |
---|
79 | const unsigned uSeqCount = GetSeqCount(); |
---|
80 | if (0 == uSeqCount) |
---|
81 | return; |
---|
82 | |
---|
83 | const WEIGHT w = (WEIGHT) (1.0 / uSeqCount); |
---|
84 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
---|
85 | m_Weights[uSeqIndex] = w; |
---|
86 | } |
---|
87 | |
---|
88 | void MSA::SetHenikoffWeightsPB() const |
---|
89 | { |
---|
90 | const unsigned uColCount = GetColCount(); |
---|
91 | const unsigned uSeqCount = GetSeqCount(); |
---|
92 | |
---|
93 | if (0 == uSeqCount) |
---|
94 | return; |
---|
95 | else if (1 == uSeqCount) |
---|
96 | { |
---|
97 | m_Weights[0] = 1.0; |
---|
98 | return; |
---|
99 | } |
---|
100 | else if (2 == uSeqCount) |
---|
101 | { |
---|
102 | m_Weights[0] = (WEIGHT) 0.5; |
---|
103 | m_Weights[1] = (WEIGHT) 0.5; |
---|
104 | return; |
---|
105 | } |
---|
106 | |
---|
107 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
---|
108 | m_Weights[uSeqIndex] = 0.0; |
---|
109 | |
---|
110 | for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) |
---|
111 | CalcHenikoffWeightsColPB(uColIndex); |
---|
112 | |
---|
113 | // Set all-gap seqs weight to 0 |
---|
114 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
---|
115 | if (IsGapSeq(uSeqIndex)) |
---|
116 | m_Weights[uSeqIndex] = 0.0; |
---|
117 | |
---|
118 | // Check for special case of identical sequences, which will cause all |
---|
119 | // columns to be skipped becasue they're perfectly conserved. |
---|
120 | if (VectorIsZero(m_Weights, uSeqCount)) |
---|
121 | VectorSet(m_Weights, uSeqCount, 1.0); |
---|
122 | |
---|
123 | Normalize(m_Weights, uSeqCount); |
---|
124 | } |
---|