1 | #include "muscle.h" |
---|
2 | #include "msa.h" |
---|
3 | #include "textfile.h" |
---|
4 | #include "seq.h" |
---|
5 | #include <math.h> |
---|
6 | |
---|
7 | const unsigned DEFAULT_SEQ_LENGTH = 500; |
---|
8 | |
---|
9 | unsigned MSA::m_uIdCount = 0; |
---|
10 | |
---|
11 | MSA::MSA() |
---|
12 | { |
---|
13 | m_uSeqCount = 0; |
---|
14 | m_uColCount = 0; |
---|
15 | |
---|
16 | m_szSeqs = 0; |
---|
17 | m_szNames = 0; |
---|
18 | m_Weights = 0; |
---|
19 | |
---|
20 | m_IdToSeqIndex = 0; |
---|
21 | m_SeqIndexToId = 0; |
---|
22 | |
---|
23 | m_uCacheSeqCount = 0; |
---|
24 | m_uCacheSeqLength = 0; |
---|
25 | } |
---|
26 | |
---|
27 | MSA::~MSA() |
---|
28 | { |
---|
29 | Free(); |
---|
30 | } |
---|
31 | |
---|
32 | void MSA::Free() |
---|
33 | { |
---|
34 | for (unsigned n = 0; n < m_uSeqCount; ++n) |
---|
35 | { |
---|
36 | delete[] m_szSeqs[n]; |
---|
37 | delete[] m_szNames[n]; |
---|
38 | } |
---|
39 | |
---|
40 | delete[] m_szSeqs; |
---|
41 | delete[] m_szNames; |
---|
42 | delete[] m_Weights; |
---|
43 | delete[] m_IdToSeqIndex; |
---|
44 | delete[] m_SeqIndexToId; |
---|
45 | |
---|
46 | m_uSeqCount = 0; |
---|
47 | m_uColCount = 0; |
---|
48 | |
---|
49 | m_szSeqs = 0; |
---|
50 | m_szNames = 0; |
---|
51 | m_Weights = 0; |
---|
52 | |
---|
53 | m_IdToSeqIndex = 0; |
---|
54 | m_SeqIndexToId = 0; |
---|
55 | } |
---|
56 | |
---|
57 | void MSA::SetSize(unsigned uSeqCount, unsigned uColCount) |
---|
58 | { |
---|
59 | Free(); |
---|
60 | |
---|
61 | m_uSeqCount = uSeqCount; |
---|
62 | m_uCacheSeqLength = uColCount; |
---|
63 | m_uColCount = 0; |
---|
64 | |
---|
65 | if (0 == uSeqCount && 0 == uColCount) |
---|
66 | return; |
---|
67 | |
---|
68 | m_szSeqs = new char *[uSeqCount]; |
---|
69 | m_szNames = new char *[uSeqCount]; |
---|
70 | m_Weights = new WEIGHT[uSeqCount]; |
---|
71 | |
---|
72 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
---|
73 | { |
---|
74 | m_szSeqs[uSeqIndex] = new char[uColCount+1]; |
---|
75 | m_szNames[uSeqIndex] = 0; |
---|
76 | #if DEBUG |
---|
77 | m_Weights[uSeqIndex] = BTInsane; |
---|
78 | memset(m_szSeqs[uSeqIndex], '?', uColCount); |
---|
79 | #endif |
---|
80 | m_szSeqs[uSeqIndex][uColCount] = 0; |
---|
81 | } |
---|
82 | |
---|
83 | if (m_uIdCount > 0) |
---|
84 | { |
---|
85 | m_IdToSeqIndex = new unsigned[m_uIdCount]; |
---|
86 | m_SeqIndexToId = new unsigned[m_uSeqCount]; |
---|
87 | #if DEBUG |
---|
88 | memset(m_IdToSeqIndex, 0xff, m_uIdCount*sizeof(unsigned)); |
---|
89 | memset(m_SeqIndexToId, 0xff, m_uSeqCount*sizeof(unsigned)); |
---|
90 | #endif |
---|
91 | } |
---|
92 | } |
---|
93 | |
---|
94 | void MSA::LogMe() const |
---|
95 | { |
---|
96 | if (0 == GetColCount()) |
---|
97 | { |
---|
98 | Log("MSA empty\n"); |
---|
99 | return; |
---|
100 | } |
---|
101 | |
---|
102 | const unsigned uColsPerLine = 50; |
---|
103 | unsigned uLinesPerSeq = (GetColCount() - 1)/uColsPerLine + 1; |
---|
104 | for (unsigned n = 0; n < uLinesPerSeq; ++n) |
---|
105 | { |
---|
106 | unsigned i; |
---|
107 | unsigned iStart = n*uColsPerLine; |
---|
108 | unsigned iEnd = GetColCount(); |
---|
109 | if (iEnd - iStart + 1 > uColsPerLine) |
---|
110 | iEnd = iStart + uColsPerLine; |
---|
111 | Log(" "); |
---|
112 | for (i = iStart; i < iEnd; ++i) |
---|
113 | Log("%u", i%10); |
---|
114 | Log("\n"); |
---|
115 | Log(" "); |
---|
116 | for (i = iStart; i + 9 < iEnd; i += 10) |
---|
117 | Log("%-10u", i); |
---|
118 | if (n == uLinesPerSeq - 1) |
---|
119 | Log(" %-10u", GetColCount()); |
---|
120 | Log("\n"); |
---|
121 | for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex) |
---|
122 | { |
---|
123 | Log("%12.12s", m_szNames[uSeqIndex]); |
---|
124 | if (m_Weights[uSeqIndex] != BTInsane) |
---|
125 | Log(" (%5.3f)", m_Weights[uSeqIndex]); |
---|
126 | else |
---|
127 | Log(" "); |
---|
128 | Log(" "); |
---|
129 | for (i = iStart; i < iEnd; ++i) |
---|
130 | Log("%c", GetChar(uSeqIndex, i)); |
---|
131 | if (0 != m_SeqIndexToId) |
---|
132 | Log(" [%5u]", m_SeqIndexToId[uSeqIndex]); |
---|
133 | Log("\n"); |
---|
134 | } |
---|
135 | Log("\n\n"); |
---|
136 | } |
---|
137 | } |
---|
138 | |
---|
139 | char MSA::GetChar(unsigned uSeqIndex, unsigned uIndex) const |
---|
140 | { |
---|
141 | // TODO: Performance cost? |
---|
142 | if (uSeqIndex >= m_uSeqCount || uIndex >= m_uColCount) |
---|
143 | Quit("MSA::GetChar(%u/%u,%u/%u)", |
---|
144 | uSeqIndex, m_uSeqCount, uIndex, m_uColCount); |
---|
145 | |
---|
146 | char c = m_szSeqs[uSeqIndex][uIndex]; |
---|
147 | // assert(IsLegalChar(c)); |
---|
148 | return c; |
---|
149 | } |
---|
150 | |
---|
151 | unsigned MSA::GetLetter(unsigned uSeqIndex, unsigned uIndex) const |
---|
152 | { |
---|
153 | // TODO: Performance cost? |
---|
154 | char c = GetChar(uSeqIndex, uIndex); |
---|
155 | unsigned uLetter = CharToLetter(c); |
---|
156 | if (uLetter >= 20) |
---|
157 | { |
---|
158 | char c = ' '; |
---|
159 | if (uSeqIndex < m_uSeqCount && uIndex < m_uColCount) |
---|
160 | c = m_szSeqs[uSeqIndex][uIndex]; |
---|
161 | Quit("MSA::GetLetter(%u/%u, %u/%u)='%c'/%u", |
---|
162 | uSeqIndex, m_uSeqCount, uIndex, m_uColCount, c, uLetter); |
---|
163 | } |
---|
164 | return uLetter; |
---|
165 | } |
---|
166 | |
---|
167 | unsigned MSA::GetLetterEx(unsigned uSeqIndex, unsigned uIndex) const |
---|
168 | { |
---|
169 | // TODO: Performance cost? |
---|
170 | char c = GetChar(uSeqIndex, uIndex); |
---|
171 | unsigned uLetter = CharToLetterEx(c); |
---|
172 | return uLetter; |
---|
173 | } |
---|
174 | |
---|
175 | void MSA::SetSeqName(unsigned uSeqIndex, const char szName[]) |
---|
176 | { |
---|
177 | if (uSeqIndex >= m_uSeqCount) |
---|
178 | Quit("MSA::SetSeqName(%u, %s), count=%u", uSeqIndex, m_uSeqCount); |
---|
179 | delete[] m_szNames[uSeqIndex]; |
---|
180 | int n = (int) strlen(szName) + 1; |
---|
181 | m_szNames[uSeqIndex] = new char[n]; |
---|
182 | memcpy(m_szNames[uSeqIndex], szName, n); |
---|
183 | } |
---|
184 | |
---|
185 | const char *MSA::GetSeqName(unsigned uSeqIndex) const |
---|
186 | { |
---|
187 | if (uSeqIndex >= m_uSeqCount) |
---|
188 | Quit("MSA::GetSeqName(%u), count=%u", uSeqIndex, m_uSeqCount); |
---|
189 | return m_szNames[uSeqIndex]; |
---|
190 | } |
---|
191 | |
---|
192 | bool MSA::IsGap(unsigned uSeqIndex, unsigned uIndex) const |
---|
193 | { |
---|
194 | char c = GetChar(uSeqIndex, uIndex); |
---|
195 | return IsGapChar(c); |
---|
196 | } |
---|
197 | |
---|
198 | bool MSA::IsWildcard(unsigned uSeqIndex, unsigned uIndex) const |
---|
199 | { |
---|
200 | char c = GetChar(uSeqIndex, uIndex); |
---|
201 | return IsWildcardChar(c); |
---|
202 | } |
---|
203 | |
---|
204 | void MSA::SetChar(unsigned uSeqIndex, unsigned uIndex, char c) |
---|
205 | { |
---|
206 | if (uSeqIndex >= m_uSeqCount || uIndex > m_uCacheSeqLength) |
---|
207 | Quit("MSA::SetChar(%u,%u)", uSeqIndex, uIndex); |
---|
208 | |
---|
209 | if (uIndex == m_uCacheSeqLength) |
---|
210 | { |
---|
211 | const unsigned uNewCacheSeqLength = m_uCacheSeqLength + DEFAULT_SEQ_LENGTH; |
---|
212 | for (unsigned n = 0; n < m_uSeqCount; ++n) |
---|
213 | { |
---|
214 | char *ptrNewSeq = new char[uNewCacheSeqLength+1]; |
---|
215 | memcpy(ptrNewSeq, m_szSeqs[n], m_uCacheSeqLength); |
---|
216 | memset(ptrNewSeq + m_uCacheSeqLength, '?', DEFAULT_SEQ_LENGTH); |
---|
217 | ptrNewSeq[uNewCacheSeqLength] = 0; |
---|
218 | delete[] m_szSeqs[n]; |
---|
219 | m_szSeqs[n] = ptrNewSeq; |
---|
220 | } |
---|
221 | |
---|
222 | m_uColCount = uIndex; |
---|
223 | m_uCacheSeqLength = uNewCacheSeqLength; |
---|
224 | } |
---|
225 | |
---|
226 | if (uIndex >= m_uColCount) |
---|
227 | m_uColCount = uIndex + 1; |
---|
228 | m_szSeqs[uSeqIndex][uIndex] = c; |
---|
229 | } |
---|
230 | |
---|
231 | void MSA::GetSeq(unsigned uSeqIndex, Seq &seq) const |
---|
232 | { |
---|
233 | assert(uSeqIndex < m_uSeqCount); |
---|
234 | |
---|
235 | seq.Clear(); |
---|
236 | |
---|
237 | for (unsigned n = 0; n < m_uColCount; ++n) |
---|
238 | if (!IsGap(uSeqIndex, n)) |
---|
239 | { |
---|
240 | char c = GetChar(uSeqIndex, n); |
---|
241 | if (!isalpha(c)) |
---|
242 | Quit("Invalid character '%c' in sequence", c); |
---|
243 | c = toupper(c); |
---|
244 | seq.push_back(c); |
---|
245 | } |
---|
246 | const char *ptrName = GetSeqName(uSeqIndex); |
---|
247 | seq.SetName(ptrName); |
---|
248 | } |
---|
249 | |
---|
250 | bool MSA::HasGap() const |
---|
251 | { |
---|
252 | for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) |
---|
253 | for (unsigned n = 0; n < GetColCount(); ++n) |
---|
254 | if (IsGap(uSeqIndex, n)) |
---|
255 | return true; |
---|
256 | return false; |
---|
257 | } |
---|
258 | |
---|
259 | bool MSA::IsLegalLetter(unsigned uLetter) const |
---|
260 | { |
---|
261 | return uLetter < 20; |
---|
262 | } |
---|
263 | |
---|
264 | void MSA::SetSeqCount(unsigned uSeqCount) |
---|
265 | { |
---|
266 | Free(); |
---|
267 | SetSize(uSeqCount, DEFAULT_SEQ_LENGTH); |
---|
268 | } |
---|
269 | |
---|
270 | void MSA::CopyCol(unsigned uFromCol, unsigned uToCol) |
---|
271 | { |
---|
272 | assert(uFromCol < GetColCount()); |
---|
273 | assert(uToCol < GetColCount()); |
---|
274 | if (uFromCol == uToCol) |
---|
275 | return; |
---|
276 | |
---|
277 | for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) |
---|
278 | { |
---|
279 | const char c = GetChar(uSeqIndex, uFromCol); |
---|
280 | SetChar(uSeqIndex, uToCol, c); |
---|
281 | } |
---|
282 | } |
---|
283 | |
---|
284 | void MSA::Copy(const MSA &msa) |
---|
285 | { |
---|
286 | Free(); |
---|
287 | const unsigned uSeqCount = msa.GetSeqCount(); |
---|
288 | const unsigned uColCount = msa.GetColCount(); |
---|
289 | SetSize(uSeqCount, uColCount); |
---|
290 | |
---|
291 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
---|
292 | { |
---|
293 | SetSeqName(uSeqIndex, msa.GetSeqName(uSeqIndex)); |
---|
294 | const unsigned uId = msa.GetSeqId(uSeqIndex); |
---|
295 | SetSeqId(uSeqIndex, uId); |
---|
296 | for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) |
---|
297 | { |
---|
298 | const char c = msa.GetChar(uSeqIndex, uColIndex); |
---|
299 | SetChar(uSeqIndex, uColIndex, c); |
---|
300 | } |
---|
301 | } |
---|
302 | } |
---|
303 | |
---|
304 | bool MSA::IsGapColumn(unsigned uColIndex) const |
---|
305 | { |
---|
306 | assert(GetSeqCount() > 0); |
---|
307 | for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) |
---|
308 | if (!IsGap(uSeqIndex, uColIndex)) |
---|
309 | return false; |
---|
310 | return true; |
---|
311 | } |
---|
312 | |
---|
313 | bool MSA::GetSeqIndex(const char *ptrSeqName, unsigned *ptruSeqIndex) const |
---|
314 | { |
---|
315 | for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) |
---|
316 | if (0 == stricmp(ptrSeqName, GetSeqName(uSeqIndex))) |
---|
317 | { |
---|
318 | *ptruSeqIndex = uSeqIndex; |
---|
319 | return true; |
---|
320 | } |
---|
321 | return false; |
---|
322 | } |
---|
323 | |
---|
324 | void MSA::DeleteCol(unsigned uColIndex) |
---|
325 | { |
---|
326 | assert(uColIndex < m_uColCount); |
---|
327 | size_t n = m_uColCount - uColIndex; |
---|
328 | if (n > 0) |
---|
329 | { |
---|
330 | for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) |
---|
331 | { |
---|
332 | char *ptrSeq = m_szSeqs[uSeqIndex]; |
---|
333 | memmove(ptrSeq + uColIndex, ptrSeq + uColIndex + 1, n); |
---|
334 | } |
---|
335 | } |
---|
336 | --m_uColCount; |
---|
337 | } |
---|
338 | |
---|
339 | void MSA::DeleteColumns(unsigned uColIndex, unsigned uColCount) |
---|
340 | { |
---|
341 | for (unsigned n = 0; n < uColCount; ++n) |
---|
342 | DeleteCol(uColIndex); |
---|
343 | } |
---|
344 | |
---|
345 | void MSA::FromFile(TextFile &File) |
---|
346 | { |
---|
347 | FromFASTAFile(File); |
---|
348 | } |
---|
349 | |
---|
350 | // Weights sum to 1, WCounts sum to NIC |
---|
351 | WEIGHT MSA::GetSeqWeight(unsigned uSeqIndex) const |
---|
352 | { |
---|
353 | assert(uSeqIndex < m_uSeqCount); |
---|
354 | WEIGHT w = m_Weights[uSeqIndex]; |
---|
355 | if (w == wInsane) |
---|
356 | Quit("Seq weight not set"); |
---|
357 | return w; |
---|
358 | } |
---|
359 | |
---|
360 | void MSA::SetSeqWeight(unsigned uSeqIndex, WEIGHT w) const |
---|
361 | { |
---|
362 | assert(uSeqIndex < m_uSeqCount); |
---|
363 | m_Weights[uSeqIndex] = w; |
---|
364 | } |
---|
365 | |
---|
366 | void MSA::NormalizeWeights(WEIGHT wDesiredTotal) const |
---|
367 | { |
---|
368 | WEIGHT wTotal = 0; |
---|
369 | for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex) |
---|
370 | wTotal += m_Weights[uSeqIndex]; |
---|
371 | |
---|
372 | if (0 == wTotal) |
---|
373 | return; |
---|
374 | |
---|
375 | const WEIGHT f = wDesiredTotal/wTotal; |
---|
376 | for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex) |
---|
377 | m_Weights[uSeqIndex] *= f; |
---|
378 | } |
---|
379 | |
---|
380 | void MSA::CalcWeights() const |
---|
381 | { |
---|
382 | Quit("Calc weights not implemented"); |
---|
383 | } |
---|
384 | |
---|
385 | static void FmtChar(char c, unsigned uWidth) |
---|
386 | { |
---|
387 | Log("%c", c); |
---|
388 | for (unsigned n = 0; n < uWidth - 1; ++n) |
---|
389 | Log(" "); |
---|
390 | } |
---|
391 | |
---|
392 | static void FmtInt(unsigned u, unsigned uWidth) |
---|
393 | { |
---|
394 | static char szStr[1024]; |
---|
395 | assert(uWidth < sizeof(szStr)); |
---|
396 | if (u > 0) |
---|
397 | sprintf(szStr, "%u", u); |
---|
398 | else |
---|
399 | strcpy(szStr, "."); |
---|
400 | Log(szStr); |
---|
401 | unsigned n = (unsigned) strlen(szStr); |
---|
402 | if (n < uWidth) |
---|
403 | for (unsigned i = 0; i < uWidth - n; ++i) |
---|
404 | Log(" "); |
---|
405 | } |
---|
406 | |
---|
407 | static void FmtInt0(unsigned u, unsigned uWidth) |
---|
408 | { |
---|
409 | static char szStr[1024]; |
---|
410 | assert(uWidth < sizeof(szStr)); |
---|
411 | sprintf(szStr, "%u", u); |
---|
412 | Log(szStr); |
---|
413 | unsigned n = (unsigned) strlen(szStr); |
---|
414 | if (n < uWidth) |
---|
415 | for (unsigned i = 0; i < uWidth - n; ++i) |
---|
416 | Log(" "); |
---|
417 | } |
---|
418 | |
---|
419 | static void FmtPad(unsigned n) |
---|
420 | { |
---|
421 | for (unsigned i = 0; i < n; ++i) |
---|
422 | Log(" "); |
---|
423 | } |
---|
424 | |
---|
425 | void MSA::FromSeq(const Seq &s) |
---|
426 | { |
---|
427 | unsigned uSeqLength = s.Length(); |
---|
428 | SetSize(1, uSeqLength); |
---|
429 | SetSeqName(0, s.GetName()); |
---|
430 | if (0 != m_SeqIndexToId) |
---|
431 | SetSeqId(0, s.GetId()); |
---|
432 | for (unsigned n = 0; n < uSeqLength; ++n) |
---|
433 | SetChar(0, n, s[n]); |
---|
434 | } |
---|
435 | |
---|
436 | unsigned MSA::GetCharCount(unsigned uSeqIndex, unsigned uColIndex) const |
---|
437 | { |
---|
438 | assert(uSeqIndex < GetSeqCount()); |
---|
439 | assert(uColIndex < GetColCount()); |
---|
440 | |
---|
441 | unsigned uCol = 0; |
---|
442 | for (unsigned n = 0; n <= uColIndex; ++n) |
---|
443 | if (!IsGap(uSeqIndex, n)) |
---|
444 | ++uCol; |
---|
445 | return uCol; |
---|
446 | } |
---|
447 | |
---|
448 | void MSA::CopySeq(unsigned uToSeqIndex, const MSA &msaFrom, unsigned uFromSeqIndex) |
---|
449 | { |
---|
450 | assert(uToSeqIndex < m_uSeqCount); |
---|
451 | const unsigned uColCount = msaFrom.GetColCount(); |
---|
452 | assert(m_uColCount == uColCount || |
---|
453 | (0 == m_uColCount && uColCount <= m_uCacheSeqLength)); |
---|
454 | |
---|
455 | memcpy(m_szSeqs[uToSeqIndex], msaFrom.GetSeqBuffer(uFromSeqIndex), uColCount); |
---|
456 | SetSeqName(uToSeqIndex, msaFrom.GetSeqName(uFromSeqIndex)); |
---|
457 | if (0 == m_uColCount) |
---|
458 | m_uColCount = uColCount; |
---|
459 | } |
---|
460 | |
---|
461 | const char *MSA::GetSeqBuffer(unsigned uSeqIndex) const |
---|
462 | { |
---|
463 | assert(uSeqIndex < m_uSeqCount); |
---|
464 | return m_szSeqs[uSeqIndex]; |
---|
465 | } |
---|
466 | |
---|
467 | void MSA::DeleteSeq(unsigned uSeqIndex) |
---|
468 | { |
---|
469 | assert(uSeqIndex < m_uSeqCount); |
---|
470 | |
---|
471 | delete m_szSeqs[uSeqIndex]; |
---|
472 | delete m_szNames[uSeqIndex]; |
---|
473 | |
---|
474 | const unsigned uBytesToMove = (m_uSeqCount - uSeqIndex)*sizeof(char *); |
---|
475 | if (uBytesToMove > 0) |
---|
476 | { |
---|
477 | memmove(m_szSeqs + uSeqIndex, m_szSeqs + uSeqIndex + 1, uBytesToMove); |
---|
478 | memmove(m_szNames + uSeqIndex, m_szNames + uSeqIndex + 1, uBytesToMove); |
---|
479 | } |
---|
480 | |
---|
481 | --m_uSeqCount; |
---|
482 | |
---|
483 | delete[] m_Weights; |
---|
484 | m_Weights = 0; |
---|
485 | } |
---|
486 | |
---|
487 | bool MSA::IsEmptyCol(unsigned uColIndex) const |
---|
488 | { |
---|
489 | const unsigned uSeqCount = GetSeqCount(); |
---|
490 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
---|
491 | if (!IsGap(uSeqIndex, uColIndex)) |
---|
492 | return false; |
---|
493 | return true; |
---|
494 | } |
---|
495 | |
---|
496 | //void MSA::DeleteEmptyCols(bool bProgress) |
---|
497 | // { |
---|
498 | // unsigned uColCount = GetColCount(); |
---|
499 | // for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) |
---|
500 | // { |
---|
501 | // if (IsEmptyCol(uColIndex)) |
---|
502 | // { |
---|
503 | // if (bProgress) |
---|
504 | // { |
---|
505 | // Log("Deleting col %u of %u\n", uColIndex, uColCount); |
---|
506 | // printf("Deleting col %u of %u\n", uColIndex, uColCount); |
---|
507 | // } |
---|
508 | // DeleteCol(uColIndex); |
---|
509 | // --uColCount; |
---|
510 | // } |
---|
511 | // } |
---|
512 | // } |
---|
513 | |
---|
514 | unsigned MSA::AlignedColIndexToColIndex(unsigned uAlignedColIndex) const |
---|
515 | { |
---|
516 | Quit("MSA::AlignedColIndexToColIndex not implemented"); |
---|
517 | return 0; |
---|
518 | } |
---|
519 | |
---|
520 | WEIGHT MSA::GetTotalSeqWeight() const |
---|
521 | { |
---|
522 | WEIGHT wTotal = 0; |
---|
523 | const unsigned uSeqCount = GetSeqCount(); |
---|
524 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
---|
525 | wTotal += m_Weights[uSeqIndex]; |
---|
526 | return wTotal; |
---|
527 | } |
---|
528 | |
---|
529 | bool MSA::SeqsEq(const MSA &a1, unsigned uSeqIndex1, const MSA &a2, |
---|
530 | unsigned uSeqIndex2) |
---|
531 | { |
---|
532 | Seq s1; |
---|
533 | Seq s2; |
---|
534 | |
---|
535 | a1.GetSeq(uSeqIndex1, s1); |
---|
536 | a2.GetSeq(uSeqIndex2, s2); |
---|
537 | |
---|
538 | s1.StripGaps(); |
---|
539 | s2.StripGaps(); |
---|
540 | |
---|
541 | return s1.EqIgnoreCase(s2); |
---|
542 | } |
---|
543 | |
---|
544 | unsigned MSA::GetSeqLength(unsigned uSeqIndex) const |
---|
545 | { |
---|
546 | assert(uSeqIndex < GetSeqCount()); |
---|
547 | |
---|
548 | const unsigned uColCount = GetColCount(); |
---|
549 | unsigned uLength = 0; |
---|
550 | for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) |
---|
551 | if (!IsGap(uSeqIndex, uColIndex)) |
---|
552 | ++uLength; |
---|
553 | return uLength; |
---|
554 | } |
---|
555 | |
---|
556 | void MSA::GetPWID(unsigned uSeqIndex1, unsigned uSeqIndex2, double *ptrPWID, |
---|
557 | unsigned *ptruPosCount) const |
---|
558 | { |
---|
559 | assert(uSeqIndex1 < GetSeqCount()); |
---|
560 | assert(uSeqIndex2 < GetSeqCount()); |
---|
561 | |
---|
562 | unsigned uSameCount = 0; |
---|
563 | unsigned uPosCount = 0; |
---|
564 | const unsigned uColCount = GetColCount(); |
---|
565 | for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) |
---|
566 | { |
---|
567 | char c1 = GetChar(uSeqIndex1, uColIndex); |
---|
568 | if (IsGapChar(c1)) |
---|
569 | continue; |
---|
570 | char c2 = GetChar(uSeqIndex2, uColIndex); |
---|
571 | if (IsGapChar(c2)) |
---|
572 | continue; |
---|
573 | ++uPosCount; |
---|
574 | if (c1 == c2) |
---|
575 | ++uSameCount; |
---|
576 | } |
---|
577 | *ptruPosCount = uPosCount; |
---|
578 | if (uPosCount > 0) |
---|
579 | *ptrPWID = 100.0 * (double) uSameCount / (double) uPosCount; |
---|
580 | else |
---|
581 | *ptrPWID = 0; |
---|
582 | } |
---|
583 | |
---|
584 | void MSA::UnWeight() |
---|
585 | { |
---|
586 | for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) |
---|
587 | m_Weights[uSeqIndex] = BTInsane; |
---|
588 | } |
---|
589 | |
---|
590 | unsigned MSA::UniqueResidueTypes(unsigned uColIndex) const |
---|
591 | { |
---|
592 | assert(uColIndex < GetColCount()); |
---|
593 | |
---|
594 | unsigned Counts[MAX_ALPHA]; |
---|
595 | memset(Counts, 0, sizeof(Counts)); |
---|
596 | const unsigned uSeqCount = GetSeqCount(); |
---|
597 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
---|
598 | { |
---|
599 | if (IsGap(uSeqIndex, uColIndex) || IsWildcard(uSeqIndex, uColIndex)) |
---|
600 | continue; |
---|
601 | const unsigned uLetter = GetLetter(uSeqIndex, uColIndex); |
---|
602 | ++(Counts[uLetter]); |
---|
603 | } |
---|
604 | unsigned uUniqueCount = 0; |
---|
605 | for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter) |
---|
606 | if (Counts[uLetter] > 0) |
---|
607 | ++uUniqueCount; |
---|
608 | return uUniqueCount; |
---|
609 | } |
---|
610 | |
---|
611 | double MSA::GetOcc(unsigned uColIndex) const |
---|
612 | { |
---|
613 | unsigned uGapCount = 0; |
---|
614 | for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) |
---|
615 | if (IsGap(uSeqIndex, uColIndex)) |
---|
616 | ++uGapCount; |
---|
617 | unsigned uSeqCount = GetSeqCount(); |
---|
618 | return (double) (uSeqCount - uGapCount) / (double) uSeqCount; |
---|
619 | } |
---|
620 | |
---|
621 | void MSA::ToFile(TextFile &File) const |
---|
622 | { |
---|
623 | if (g_bMSF) |
---|
624 | ToMSFFile(File); |
---|
625 | else if (g_bAln) |
---|
626 | ToAlnFile(File); |
---|
627 | else if (g_bHTML) |
---|
628 | ToHTMLFile(File); |
---|
629 | else if (g_bPHYS) |
---|
630 | ToPhySequentialFile(File); |
---|
631 | else if (g_bPHYI) |
---|
632 | ToPhyInterleavedFile(File); |
---|
633 | else |
---|
634 | ToFASTAFile(File); |
---|
635 | if (0 != g_pstrScoreFileName) |
---|
636 | WriteScoreFile(*this); |
---|
637 | } |
---|
638 | |
---|
639 | bool MSA::ColumnHasGap(unsigned uColIndex) const |
---|
640 | { |
---|
641 | const unsigned uSeqCount = GetSeqCount(); |
---|
642 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
---|
643 | if (IsGap(uSeqIndex, uColIndex)) |
---|
644 | return true; |
---|
645 | return false; |
---|
646 | } |
---|
647 | |
---|
648 | void MSA::SetIdCount(unsigned uIdCount) |
---|
649 | { |
---|
650 | //if (m_uIdCount != 0) |
---|
651 | // Quit("MSA::SetIdCount: may only be called once"); |
---|
652 | |
---|
653 | if (m_uIdCount > 0) |
---|
654 | { |
---|
655 | if (uIdCount > m_uIdCount) |
---|
656 | Quit("MSA::SetIdCount: cannot increase count"); |
---|
657 | return; |
---|
658 | } |
---|
659 | m_uIdCount = uIdCount; |
---|
660 | } |
---|
661 | |
---|
662 | void MSA::SetSeqId(unsigned uSeqIndex, unsigned uId) |
---|
663 | { |
---|
664 | assert(uSeqIndex < m_uSeqCount); |
---|
665 | assert(uId < m_uIdCount); |
---|
666 | if (0 == m_SeqIndexToId) |
---|
667 | { |
---|
668 | if (0 == m_uIdCount) |
---|
669 | Quit("MSA::SetSeqId, SetIdCount has not been called"); |
---|
670 | m_IdToSeqIndex = new unsigned[m_uIdCount]; |
---|
671 | m_SeqIndexToId = new unsigned[m_uSeqCount]; |
---|
672 | |
---|
673 | memset(m_IdToSeqIndex, 0xff, m_uIdCount*sizeof(unsigned)); |
---|
674 | memset(m_SeqIndexToId, 0xff, m_uSeqCount*sizeof(unsigned)); |
---|
675 | } |
---|
676 | m_SeqIndexToId[uSeqIndex] = uId; |
---|
677 | m_IdToSeqIndex[uId] = uSeqIndex; |
---|
678 | } |
---|
679 | |
---|
680 | unsigned MSA::GetSeqIndex(unsigned uId) const |
---|
681 | { |
---|
682 | assert(uId < m_uIdCount); |
---|
683 | assert(0 != m_IdToSeqIndex); |
---|
684 | unsigned uSeqIndex = m_IdToSeqIndex[uId]; |
---|
685 | assert(uSeqIndex < m_uSeqCount); |
---|
686 | return uSeqIndex; |
---|
687 | } |
---|
688 | |
---|
689 | bool MSA::GetSeqIndex(unsigned uId, unsigned *ptruIndex) const |
---|
690 | { |
---|
691 | for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex) |
---|
692 | { |
---|
693 | if (uId == m_SeqIndexToId[uSeqIndex]) |
---|
694 | { |
---|
695 | *ptruIndex = uSeqIndex; |
---|
696 | return true; |
---|
697 | } |
---|
698 | } |
---|
699 | return false; |
---|
700 | } |
---|
701 | |
---|
702 | unsigned MSA::GetSeqId(unsigned uSeqIndex) const |
---|
703 | { |
---|
704 | assert(uSeqIndex < m_uSeqCount); |
---|
705 | unsigned uId = m_SeqIndexToId[uSeqIndex]; |
---|
706 | assert(uId < m_uIdCount); |
---|
707 | return uId; |
---|
708 | } |
---|
709 | |
---|
710 | bool MSA::WeightsSet() const |
---|
711 | { |
---|
712 | return BTInsane != m_Weights[0]; |
---|
713 | } |
---|
714 | |
---|
715 | void MSASubsetByIds(const MSA &msaIn, const unsigned Ids[], unsigned uIdCount, |
---|
716 | MSA &msaOut) |
---|
717 | { |
---|
718 | const unsigned uColCount = msaIn.GetColCount(); |
---|
719 | msaOut.SetSize(uIdCount, uColCount); |
---|
720 | for (unsigned uSeqIndexOut = 0; uSeqIndexOut < uIdCount; ++uSeqIndexOut) |
---|
721 | { |
---|
722 | const unsigned uId = Ids[uSeqIndexOut]; |
---|
723 | |
---|
724 | const unsigned uSeqIndexIn = msaIn.GetSeqIndex(uId); |
---|
725 | const char *ptrName = msaIn.GetSeqName(uSeqIndexIn); |
---|
726 | |
---|
727 | msaOut.SetSeqId(uSeqIndexOut, uId); |
---|
728 | msaOut.SetSeqName(uSeqIndexOut, ptrName); |
---|
729 | |
---|
730 | for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) |
---|
731 | { |
---|
732 | const char c = msaIn.GetChar(uSeqIndexIn, uColIndex); |
---|
733 | msaOut.SetChar(uSeqIndexOut, uColIndex, c); |
---|
734 | } |
---|
735 | } |
---|
736 | } |
---|
737 | |
---|
738 | // Caller must allocate ptrSeq and ptrLabel as new char[n]. |
---|
739 | void MSA::AppendSeq(char *ptrSeq, unsigned uSeqLength, char *ptrLabel) |
---|
740 | { |
---|
741 | if (m_uSeqCount > m_uCacheSeqCount) |
---|
742 | Quit("Internal error MSA::AppendSeq"); |
---|
743 | if (m_uSeqCount == m_uCacheSeqCount) |
---|
744 | ExpandCache(m_uSeqCount + 4, uSeqLength); |
---|
745 | m_szSeqs[m_uSeqCount] = ptrSeq; |
---|
746 | m_szNames[m_uSeqCount] = ptrLabel; |
---|
747 | ++m_uSeqCount; |
---|
748 | } |
---|
749 | |
---|
750 | void MSA::ExpandCache(unsigned uSeqCount, unsigned uColCount) |
---|
751 | { |
---|
752 | if (m_IdToSeqIndex != 0 || m_SeqIndexToId != 0 || uSeqCount < m_uSeqCount) |
---|
753 | Quit("Internal error MSA::ExpandCache"); |
---|
754 | |
---|
755 | if (m_uSeqCount > 0 && uColCount != m_uColCount) |
---|
756 | Quit("Internal error MSA::ExpandCache, ColCount changed"); |
---|
757 | |
---|
758 | char **NewSeqs = new char *[uSeqCount]; |
---|
759 | char **NewNames = new char *[uSeqCount]; |
---|
760 | WEIGHT *NewWeights = new WEIGHT[uSeqCount]; |
---|
761 | |
---|
762 | for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex) |
---|
763 | { |
---|
764 | NewSeqs[uSeqIndex] = m_szSeqs[uSeqIndex]; |
---|
765 | NewNames[uSeqIndex] = m_szNames[uSeqIndex]; |
---|
766 | NewWeights[uSeqIndex] = m_Weights[uSeqIndex]; |
---|
767 | } |
---|
768 | |
---|
769 | for (unsigned uSeqIndex = m_uSeqCount; uSeqIndex < uSeqCount; ++uSeqIndex) |
---|
770 | { |
---|
771 | char *Seq = new char[uColCount]; |
---|
772 | NewSeqs[uSeqIndex] = Seq; |
---|
773 | #if DEBUG |
---|
774 | memset(Seq, '?', uColCount); |
---|
775 | #endif |
---|
776 | } |
---|
777 | |
---|
778 | delete[] m_szSeqs; |
---|
779 | delete[] m_szNames; |
---|
780 | delete[] m_Weights; |
---|
781 | |
---|
782 | m_szSeqs = NewSeqs; |
---|
783 | m_szNames = NewNames; |
---|
784 | m_Weights = NewWeights; |
---|
785 | |
---|
786 | m_uCacheSeqCount = uSeqCount; |
---|
787 | m_uCacheSeqLength = uColCount; |
---|
788 | m_uColCount = uColCount; |
---|
789 | } |
---|
790 | |
---|
791 | void MSA::FixAlpha() |
---|
792 | { |
---|
793 | ClearInvalidLetterWarning(); |
---|
794 | for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex) |
---|
795 | { |
---|
796 | for (unsigned uColIndex = 0; uColIndex < m_uColCount; ++uColIndex) |
---|
797 | { |
---|
798 | char c = GetChar(uSeqIndex, uColIndex); |
---|
799 | if (!IsResidueChar(c) && !IsGapChar(c)) |
---|
800 | { |
---|
801 | char w = GetWildcardChar(); |
---|
802 | // Warning("Invalid letter '%c', replaced by '%c'", c, w); |
---|
803 | InvalidLetterWarning(c, w); |
---|
804 | SetChar(uSeqIndex, uColIndex, w); |
---|
805 | } |
---|
806 | } |
---|
807 | } |
---|
808 | ReportInvalidLetters(); |
---|
809 | } |
---|
810 | |
---|
811 | ALPHA MSA::GuessAlpha() const |
---|
812 | { |
---|
813 | // If at least MIN_NUCLEO_PCT of the first CHAR_COUNT non-gap |
---|
814 | // letters belong to the nucleotide alphabet, guess nucleo. |
---|
815 | // Otherwise amino. |
---|
816 | const unsigned CHAR_COUNT = 100; |
---|
817 | const unsigned MIN_NUCLEO_PCT = 95; |
---|
818 | |
---|
819 | const unsigned uSeqCount = GetSeqCount(); |
---|
820 | const unsigned uColCount = GetColCount(); |
---|
821 | if (0 == uSeqCount) |
---|
822 | return ALPHA_Amino; |
---|
823 | |
---|
824 | unsigned uDNACount = 0; |
---|
825 | unsigned uRNACount = 0; |
---|
826 | unsigned uTotal = 0; |
---|
827 | unsigned i = 0; |
---|
828 | for (;;) |
---|
829 | { |
---|
830 | unsigned uSeqIndex = i/uColCount; |
---|
831 | if (uSeqIndex >= uSeqCount) |
---|
832 | break; |
---|
833 | unsigned uColIndex = i%uColCount; |
---|
834 | ++i; |
---|
835 | char c = GetChar(uSeqIndex, uColIndex); |
---|
836 | if (IsGapChar(c)) |
---|
837 | continue; |
---|
838 | if (IsDNA(c)) |
---|
839 | ++uDNACount; |
---|
840 | if (IsRNA(c)) |
---|
841 | ++uRNACount; |
---|
842 | ++uTotal; |
---|
843 | if (uTotal >= CHAR_COUNT) |
---|
844 | break; |
---|
845 | } |
---|
846 | if (uTotal != 0 && ((uRNACount*100)/uTotal) >= MIN_NUCLEO_PCT) |
---|
847 | return ALPHA_RNA; |
---|
848 | if (uTotal != 0 && ((uDNACount*100)/uTotal) >= MIN_NUCLEO_PCT) |
---|
849 | return ALPHA_DNA; |
---|
850 | return ALPHA_Amino; |
---|
851 | } |
---|