1 | #include "muscle.h" |
---|
2 | #include "msa.h" |
---|
3 | #include "profile.h" |
---|
4 | |
---|
5 | #define TRACE 0 |
---|
6 | |
---|
7 | static void LogF(FCOUNT f) |
---|
8 | { |
---|
9 | if (f > -0.00001 && f < 0.00001) |
---|
10 | Log(" "); |
---|
11 | else |
---|
12 | Log(" %5.3f", f); |
---|
13 | } |
---|
14 | |
---|
15 | static const char *LocalScoreToStr(SCORE s) |
---|
16 | { |
---|
17 | static char str[16]; |
---|
18 | if (s < -1e10 || s > 1e10) |
---|
19 | return " *"; |
---|
20 | sprintf(str, "%5.1f", s); |
---|
21 | return str; |
---|
22 | } |
---|
23 | |
---|
24 | #if DOUBLE_AFFINE |
---|
25 | void ListProfile(const ProfPos *Prof, unsigned uLength, const MSA *ptrMSA) |
---|
26 | { |
---|
27 | Log(" Pos Occ LL LG GL GG Open Close Open2 Clos2\n"); |
---|
28 | Log(" --- --- -- -- -- -- ---- ----- ----- -----\n"); |
---|
29 | for (unsigned n = 0; n < uLength; ++n) |
---|
30 | { |
---|
31 | const ProfPos &PP = Prof[n]; |
---|
32 | Log("%5u", n); |
---|
33 | LogF(PP.m_fOcc); |
---|
34 | LogF(PP.m_LL); |
---|
35 | LogF(PP.m_LG); |
---|
36 | LogF(PP.m_GL); |
---|
37 | LogF(PP.m_GG); |
---|
38 | Log(" %s", LocalScoreToStr(-PP.m_scoreGapOpen)); |
---|
39 | Log(" %s", LocalScoreToStr(-PP.m_scoreGapClose)); |
---|
40 | Log(" %s", LocalScoreToStr(-PP.m_scoreGapOpen2)); |
---|
41 | Log(" %s", LocalScoreToStr(-PP.m_scoreGapClose2)); |
---|
42 | if (0 != ptrMSA) |
---|
43 | { |
---|
44 | const unsigned uSeqCount = ptrMSA->GetSeqCount(); |
---|
45 | Log(" "); |
---|
46 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
---|
47 | Log("%c", ptrMSA->GetChar(uSeqIndex, n)); |
---|
48 | } |
---|
49 | Log("\n"); |
---|
50 | } |
---|
51 | |
---|
52 | Log("\n"); |
---|
53 | Log(" Pos G"); |
---|
54 | for (unsigned n = 0; n < g_AlphaSize; ++n) |
---|
55 | Log(" %c", LetterExToChar(n)); |
---|
56 | Log("\n"); |
---|
57 | Log(" --- -"); |
---|
58 | for (unsigned n = 0; n < g_AlphaSize; ++n) |
---|
59 | Log(" -----"); |
---|
60 | Log("\n"); |
---|
61 | |
---|
62 | for (unsigned n = 0; n < uLength; ++n) |
---|
63 | { |
---|
64 | const ProfPos &PP = Prof[n]; |
---|
65 | Log("%5u", n); |
---|
66 | if (-1 == PP.m_uResidueGroup) |
---|
67 | Log(" -", PP.m_uResidueGroup); |
---|
68 | else |
---|
69 | Log(" %d", PP.m_uResidueGroup); |
---|
70 | |
---|
71 | for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter) |
---|
72 | { |
---|
73 | FCOUNT f = PP.m_fcCounts[uLetter]; |
---|
74 | if (f == 0.0) |
---|
75 | Log(" "); |
---|
76 | else |
---|
77 | Log(" %5.3f", f); |
---|
78 | } |
---|
79 | if (0 != ptrMSA) |
---|
80 | { |
---|
81 | const unsigned uSeqCount = ptrMSA->GetSeqCount(); |
---|
82 | Log(" "); |
---|
83 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
---|
84 | Log("%c", ptrMSA->GetChar(uSeqIndex, n)); |
---|
85 | } |
---|
86 | Log("\n"); |
---|
87 | } |
---|
88 | } |
---|
89 | #endif // DOUBLE_AFFINE |
---|
90 | |
---|
91 | #if SINGLE_AFFINE |
---|
92 | void ListProfile(const ProfPos *Prof, unsigned uLength, const MSA *ptrMSA) |
---|
93 | { |
---|
94 | Log(" Pos Occ LL LG GL GG Open Close\n"); |
---|
95 | Log(" --- --- -- -- -- -- ---- -----\n"); |
---|
96 | for (unsigned n = 0; n < uLength; ++n) |
---|
97 | { |
---|
98 | const ProfPos &PP = Prof[n]; |
---|
99 | Log("%5u", n); |
---|
100 | LogF(PP.m_fOcc); |
---|
101 | LogF(PP.m_LL); |
---|
102 | LogF(PP.m_LG); |
---|
103 | LogF(PP.m_GL); |
---|
104 | LogF(PP.m_GG); |
---|
105 | Log(" %5.1f", -PP.m_scoreGapOpen); |
---|
106 | Log(" %5.1f", -PP.m_scoreGapClose); |
---|
107 | if (0 != ptrMSA) |
---|
108 | { |
---|
109 | const unsigned uSeqCount = ptrMSA->GetSeqCount(); |
---|
110 | Log(" "); |
---|
111 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
---|
112 | Log("%c", ptrMSA->GetChar(uSeqIndex, n)); |
---|
113 | } |
---|
114 | Log("\n"); |
---|
115 | } |
---|
116 | |
---|
117 | Log("\n"); |
---|
118 | Log(" Pos G"); |
---|
119 | for (unsigned n = 0; n < g_AlphaSize; ++n) |
---|
120 | Log(" %c", LetterExToChar(n)); |
---|
121 | Log("\n"); |
---|
122 | Log(" --- -"); |
---|
123 | for (unsigned n = 0; n < g_AlphaSize; ++n) |
---|
124 | Log(" -----"); |
---|
125 | Log("\n"); |
---|
126 | |
---|
127 | for (unsigned n = 0; n < uLength; ++n) |
---|
128 | { |
---|
129 | const ProfPos &PP = Prof[n]; |
---|
130 | Log("%5u", n); |
---|
131 | if (-1 == PP.m_uResidueGroup) |
---|
132 | Log(" -", PP.m_uResidueGroup); |
---|
133 | else |
---|
134 | Log(" %d", PP.m_uResidueGroup); |
---|
135 | |
---|
136 | for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter) |
---|
137 | { |
---|
138 | FCOUNT f = PP.m_fcCounts[uLetter]; |
---|
139 | if (f == 0.0) |
---|
140 | Log(" "); |
---|
141 | else |
---|
142 | Log(" %5.3f", f); |
---|
143 | } |
---|
144 | if (0 != ptrMSA) |
---|
145 | { |
---|
146 | const unsigned uSeqCount = ptrMSA->GetSeqCount(); |
---|
147 | Log(" "); |
---|
148 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
---|
149 | Log("%c", ptrMSA->GetChar(uSeqIndex, n)); |
---|
150 | } |
---|
151 | Log("\n"); |
---|
152 | } |
---|
153 | } |
---|
154 | #endif |
---|
155 | |
---|
156 | void SortCounts(const FCOUNT fcCounts[], unsigned SortOrder[]) |
---|
157 | { |
---|
158 | static unsigned InitialSortOrder[MAX_ALPHA] = |
---|
159 | { |
---|
160 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 |
---|
161 | }; |
---|
162 | memcpy(SortOrder, InitialSortOrder, g_AlphaSize*sizeof(unsigned)); |
---|
163 | |
---|
164 | bool bAny = true; |
---|
165 | while (bAny) |
---|
166 | { |
---|
167 | bAny = false; |
---|
168 | for (unsigned n = 0; n < g_AlphaSize - 1; ++n) |
---|
169 | { |
---|
170 | unsigned i1 = SortOrder[n]; |
---|
171 | unsigned i2 = SortOrder[n+1]; |
---|
172 | if (fcCounts[i1] < fcCounts[i2]) |
---|
173 | { |
---|
174 | SortOrder[n+1] = i1; |
---|
175 | SortOrder[n] = i2; |
---|
176 | bAny = true; |
---|
177 | } |
---|
178 | } |
---|
179 | } |
---|
180 | } |
---|
181 | |
---|
182 | static unsigned AminoGroupFromFCounts(const FCOUNT fcCounts[]) |
---|
183 | { |
---|
184 | bool bAny = false; |
---|
185 | unsigned uConsensusResidueGroup = RESIDUE_GROUP_MULTIPLE; |
---|
186 | for (unsigned uLetter = 0; uLetter < 20; ++uLetter) |
---|
187 | { |
---|
188 | if (0 == fcCounts[uLetter]) |
---|
189 | continue; |
---|
190 | const unsigned uResidueGroup = ResidueGroup[uLetter]; |
---|
191 | if (bAny) |
---|
192 | { |
---|
193 | if (uResidueGroup != uConsensusResidueGroup) |
---|
194 | return RESIDUE_GROUP_MULTIPLE; |
---|
195 | } |
---|
196 | else |
---|
197 | { |
---|
198 | bAny = true; |
---|
199 | uConsensusResidueGroup = uResidueGroup; |
---|
200 | } |
---|
201 | } |
---|
202 | return uConsensusResidueGroup; |
---|
203 | } |
---|
204 | |
---|
205 | static unsigned NucleoGroupFromFCounts(const FCOUNT fcCounts[]) |
---|
206 | { |
---|
207 | bool bAny = false; |
---|
208 | unsigned uConsensusResidueGroup = RESIDUE_GROUP_MULTIPLE; |
---|
209 | for (unsigned uLetter = 0; uLetter < 4; ++uLetter) |
---|
210 | { |
---|
211 | if (0 == fcCounts[uLetter]) |
---|
212 | continue; |
---|
213 | const unsigned uResidueGroup = uLetter; |
---|
214 | if (bAny) |
---|
215 | { |
---|
216 | if (uResidueGroup != uConsensusResidueGroup) |
---|
217 | return RESIDUE_GROUP_MULTIPLE; |
---|
218 | } |
---|
219 | else |
---|
220 | { |
---|
221 | bAny = true; |
---|
222 | uConsensusResidueGroup = uResidueGroup; |
---|
223 | } |
---|
224 | } |
---|
225 | return uConsensusResidueGroup; |
---|
226 | } |
---|
227 | |
---|
228 | unsigned ResidueGroupFromFCounts(const FCOUNT fcCounts[]) |
---|
229 | { |
---|
230 | switch (g_Alpha) |
---|
231 | { |
---|
232 | case ALPHA_Amino: |
---|
233 | return AminoGroupFromFCounts(fcCounts); |
---|
234 | |
---|
235 | case ALPHA_DNA: |
---|
236 | case ALPHA_RNA: |
---|
237 | return NucleoGroupFromFCounts(fcCounts); |
---|
238 | } |
---|
239 | Quit("ResidueGroupFromFCounts: bad alpha"); |
---|
240 | return 0; |
---|
241 | } |
---|
242 | |
---|
243 | ProfPos *ProfileFromMSA(const MSA &a) |
---|
244 | { |
---|
245 | const unsigned uSeqCount = a.GetSeqCount(); |
---|
246 | const unsigned uColCount = a.GetColCount(); |
---|
247 | |
---|
248 | // Yuck -- cast away const (inconsistent design here). |
---|
249 | SetMSAWeightsMuscle((MSA &) a); |
---|
250 | |
---|
251 | ProfPos *Pos = new ProfPos[uColCount]; |
---|
252 | |
---|
253 | unsigned uHydrophobicRunLength = 0; |
---|
254 | for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) |
---|
255 | { |
---|
256 | ProfPos &PP = Pos[uColIndex]; |
---|
257 | |
---|
258 | PP.m_bAllGaps = a.IsGapColumn(uColIndex); |
---|
259 | |
---|
260 | FCOUNT fcGapStart; |
---|
261 | FCOUNT fcGapEnd; |
---|
262 | FCOUNT fcGapExtend; |
---|
263 | FCOUNT fOcc; |
---|
264 | a.GetFractionalWeightedCounts(uColIndex, g_bNormalizeCounts, PP.m_fcCounts, |
---|
265 | &fcGapStart, &fcGapEnd, &fcGapExtend, &fOcc, |
---|
266 | &PP.m_LL, &PP.m_LG, &PP.m_GL, &PP.m_GG); |
---|
267 | PP.m_fOcc = fOcc; |
---|
268 | |
---|
269 | SortCounts(PP.m_fcCounts, PP.m_uSortOrder); |
---|
270 | |
---|
271 | PP.m_uResidueGroup = ResidueGroupFromFCounts(PP.m_fcCounts); |
---|
272 | |
---|
273 | for (unsigned i = 0; i < g_AlphaSize; ++i) |
---|
274 | { |
---|
275 | SCORE scoreSum = 0; |
---|
276 | for (unsigned j = 0; j < g_AlphaSize; ++j) |
---|
277 | scoreSum += PP.m_fcCounts[j]*(*g_ptrScoreMatrix)[i][j]; |
---|
278 | PP.m_AAScores[i] = scoreSum; |
---|
279 | } |
---|
280 | |
---|
281 | SCORE sStartOcc = (SCORE) (1.0 - fcGapStart); |
---|
282 | SCORE sEndOcc = (SCORE) (1.0 - fcGapEnd); |
---|
283 | |
---|
284 | PP.m_fcStartOcc = sStartOcc; |
---|
285 | PP.m_fcEndOcc = sEndOcc; |
---|
286 | |
---|
287 | PP.m_scoreGapOpen = sStartOcc*g_scoreGapOpen/2; |
---|
288 | PP.m_scoreGapClose = sEndOcc*g_scoreGapOpen/2; |
---|
289 | #if DOUBLE_AFFINE |
---|
290 | PP.m_scoreGapOpen2 = sStartOcc*g_scoreGapOpen2/2; |
---|
291 | PP.m_scoreGapClose2 = sEndOcc*g_scoreGapOpen2/2; |
---|
292 | #endif |
---|
293 | // PP.m_scoreGapExtend = (SCORE) ((1.0 - fcGapExtend)*scoreGapExtend); |
---|
294 | |
---|
295 | #if PAF |
---|
296 | if (ALHPA_Amino == g_Alpha && sStartOcc > 0.5) |
---|
297 | { |
---|
298 | extern SCORE PAFactor(const FCOUNT fcCounts[]); |
---|
299 | SCORE paf = PAFactor(PP.m_fcCounts); |
---|
300 | PP.m_scoreGapOpen *= paf; |
---|
301 | PP.m_scoreGapClose *= paf; |
---|
302 | } |
---|
303 | #endif |
---|
304 | } |
---|
305 | |
---|
306 | #if HYDRO |
---|
307 | if (ALPHA_Amino == g_Alpha) |
---|
308 | Hydro(Pos, uColCount); |
---|
309 | #endif |
---|
310 | |
---|
311 | #if TRACE |
---|
312 | { |
---|
313 | Log("ProfileFromMSA\n"); |
---|
314 | ListProfile(Pos, uColCount, &a); |
---|
315 | } |
---|
316 | #endif |
---|
317 | return Pos; |
---|
318 | } |
---|