| 1 | #include "muscle.h" |
|---|
| 2 | #include "msa.h" |
|---|
| 3 | #include "textfile.h" |
|---|
| 4 | #include "seq.h" |
|---|
| 5 | #include <math.h> |
|---|
| 6 | |
|---|
| 7 | const unsigned DEFAULT_SEQ_LENGTH = 500; |
|---|
| 8 | |
|---|
| 9 | unsigned MSA::m_uIdCount = 0; |
|---|
| 10 | |
|---|
| 11 | MSA::MSA() |
|---|
| 12 | { |
|---|
| 13 | m_uSeqCount = 0; |
|---|
| 14 | m_uColCount = 0; |
|---|
| 15 | |
|---|
| 16 | m_szSeqs = 0; |
|---|
| 17 | m_szNames = 0; |
|---|
| 18 | m_Weights = 0; |
|---|
| 19 | |
|---|
| 20 | m_IdToSeqIndex = 0; |
|---|
| 21 | m_SeqIndexToId = 0; |
|---|
| 22 | |
|---|
| 23 | m_uCacheSeqCount = 0; |
|---|
| 24 | m_uCacheSeqLength = 0; |
|---|
| 25 | } |
|---|
| 26 | |
|---|
| 27 | MSA::~MSA() |
|---|
| 28 | { |
|---|
| 29 | Free(); |
|---|
| 30 | } |
|---|
| 31 | |
|---|
| 32 | void MSA::Free() |
|---|
| 33 | { |
|---|
| 34 | for (unsigned n = 0; n < m_uSeqCount; ++n) |
|---|
| 35 | { |
|---|
| 36 | delete[] m_szSeqs[n]; |
|---|
| 37 | delete[] m_szNames[n]; |
|---|
| 38 | } |
|---|
| 39 | |
|---|
| 40 | delete[] m_szSeqs; |
|---|
| 41 | delete[] m_szNames; |
|---|
| 42 | delete[] m_Weights; |
|---|
| 43 | delete[] m_IdToSeqIndex; |
|---|
| 44 | delete[] m_SeqIndexToId; |
|---|
| 45 | |
|---|
| 46 | m_uSeqCount = 0; |
|---|
| 47 | m_uColCount = 0; |
|---|
| 48 | |
|---|
| 49 | m_szSeqs = 0; |
|---|
| 50 | m_szNames = 0; |
|---|
| 51 | m_Weights = 0; |
|---|
| 52 | |
|---|
| 53 | m_IdToSeqIndex = 0; |
|---|
| 54 | m_SeqIndexToId = 0; |
|---|
| 55 | } |
|---|
| 56 | |
|---|
| 57 | void MSA::SetSize(unsigned uSeqCount, unsigned uColCount) |
|---|
| 58 | { |
|---|
| 59 | Free(); |
|---|
| 60 | |
|---|
| 61 | m_uSeqCount = uSeqCount; |
|---|
| 62 | m_uCacheSeqLength = uColCount; |
|---|
| 63 | m_uColCount = 0; |
|---|
| 64 | |
|---|
| 65 | if (0 == uSeqCount && 0 == uColCount) |
|---|
| 66 | return; |
|---|
| 67 | |
|---|
| 68 | m_szSeqs = new char *[uSeqCount]; |
|---|
| 69 | m_szNames = new char *[uSeqCount]; |
|---|
| 70 | m_Weights = new WEIGHT[uSeqCount]; |
|---|
| 71 | |
|---|
| 72 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
|---|
| 73 | { |
|---|
| 74 | m_szSeqs[uSeqIndex] = new char[uColCount+1]; |
|---|
| 75 | m_szNames[uSeqIndex] = 0; |
|---|
| 76 | #if DEBUG |
|---|
| 77 | m_Weights[uSeqIndex] = BTInsane; |
|---|
| 78 | memset(m_szSeqs[uSeqIndex], '?', uColCount); |
|---|
| 79 | #endif |
|---|
| 80 | m_szSeqs[uSeqIndex][uColCount] = 0; |
|---|
| 81 | } |
|---|
| 82 | |
|---|
| 83 | if (m_uIdCount > 0) |
|---|
| 84 | { |
|---|
| 85 | m_IdToSeqIndex = new unsigned[m_uIdCount]; |
|---|
| 86 | m_SeqIndexToId = new unsigned[m_uSeqCount]; |
|---|
| 87 | #if DEBUG |
|---|
| 88 | memset(m_IdToSeqIndex, 0xff, m_uIdCount*sizeof(unsigned)); |
|---|
| 89 | memset(m_SeqIndexToId, 0xff, m_uSeqCount*sizeof(unsigned)); |
|---|
| 90 | #endif |
|---|
| 91 | } |
|---|
| 92 | } |
|---|
| 93 | |
|---|
| 94 | void MSA::LogMe() const |
|---|
| 95 | { |
|---|
| 96 | if (0 == GetColCount()) |
|---|
| 97 | { |
|---|
| 98 | Log("MSA empty\n"); |
|---|
| 99 | return; |
|---|
| 100 | } |
|---|
| 101 | |
|---|
| 102 | const unsigned uColsPerLine = 50; |
|---|
| 103 | unsigned uLinesPerSeq = (GetColCount() - 1)/uColsPerLine + 1; |
|---|
| 104 | for (unsigned n = 0; n < uLinesPerSeq; ++n) |
|---|
| 105 | { |
|---|
| 106 | unsigned i; |
|---|
| 107 | unsigned iStart = n*uColsPerLine; |
|---|
| 108 | unsigned iEnd = GetColCount(); |
|---|
| 109 | if (iEnd - iStart + 1 > uColsPerLine) |
|---|
| 110 | iEnd = iStart + uColsPerLine; |
|---|
| 111 | Log(" "); |
|---|
| 112 | for (i = iStart; i < iEnd; ++i) |
|---|
| 113 | Log("%u", i%10); |
|---|
| 114 | Log("\n"); |
|---|
| 115 | Log(" "); |
|---|
| 116 | for (i = iStart; i + 9 < iEnd; i += 10) |
|---|
| 117 | Log("%-10u", i); |
|---|
| 118 | if (n == uLinesPerSeq - 1) |
|---|
| 119 | Log(" %-10u", GetColCount()); |
|---|
| 120 | Log("\n"); |
|---|
| 121 | for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex) |
|---|
| 122 | { |
|---|
| 123 | Log("%12.12s", m_szNames[uSeqIndex]); |
|---|
| 124 | if (m_Weights[uSeqIndex] != BTInsane) |
|---|
| 125 | Log(" (%5.3f)", m_Weights[uSeqIndex]); |
|---|
| 126 | else |
|---|
| 127 | Log(" "); |
|---|
| 128 | Log(" "); |
|---|
| 129 | for (i = iStart; i < iEnd; ++i) |
|---|
| 130 | Log("%c", GetChar(uSeqIndex, i)); |
|---|
| 131 | if (0 != m_SeqIndexToId) |
|---|
| 132 | Log(" [%5u]", m_SeqIndexToId[uSeqIndex]); |
|---|
| 133 | Log("\n"); |
|---|
| 134 | } |
|---|
| 135 | Log("\n\n"); |
|---|
| 136 | } |
|---|
| 137 | } |
|---|
| 138 | |
|---|
| 139 | char MSA::GetChar(unsigned uSeqIndex, unsigned uIndex) const |
|---|
| 140 | { |
|---|
| 141 | // TODO: Performance cost? |
|---|
| 142 | if (uSeqIndex >= m_uSeqCount || uIndex >= m_uColCount) |
|---|
| 143 | Quit("MSA::GetChar(%u/%u,%u/%u)", |
|---|
| 144 | uSeqIndex, m_uSeqCount, uIndex, m_uColCount); |
|---|
| 145 | |
|---|
| 146 | char c = m_szSeqs[uSeqIndex][uIndex]; |
|---|
| 147 | // assert(IsLegalChar(c)); |
|---|
| 148 | return c; |
|---|
| 149 | } |
|---|
| 150 | |
|---|
| 151 | unsigned MSA::GetLetter(unsigned uSeqIndex, unsigned uIndex) const |
|---|
| 152 | { |
|---|
| 153 | // TODO: Performance cost? |
|---|
| 154 | char c = GetChar(uSeqIndex, uIndex); |
|---|
| 155 | unsigned uLetter = CharToLetter(c); |
|---|
| 156 | if (uLetter >= 20) |
|---|
| 157 | { |
|---|
| 158 | char c = ' '; |
|---|
| 159 | if (uSeqIndex < m_uSeqCount && uIndex < m_uColCount) |
|---|
| 160 | c = m_szSeqs[uSeqIndex][uIndex]; |
|---|
| 161 | Quit("MSA::GetLetter(%u/%u, %u/%u)='%c'/%u", |
|---|
| 162 | uSeqIndex, m_uSeqCount, uIndex, m_uColCount, c, uLetter); |
|---|
| 163 | } |
|---|
| 164 | return uLetter; |
|---|
| 165 | } |
|---|
| 166 | |
|---|
| 167 | unsigned MSA::GetLetterEx(unsigned uSeqIndex, unsigned uIndex) const |
|---|
| 168 | { |
|---|
| 169 | // TODO: Performance cost? |
|---|
| 170 | char c = GetChar(uSeqIndex, uIndex); |
|---|
| 171 | unsigned uLetter = CharToLetterEx(c); |
|---|
| 172 | return uLetter; |
|---|
| 173 | } |
|---|
| 174 | |
|---|
| 175 | void MSA::SetSeqName(unsigned uSeqIndex, const char szName[]) |
|---|
| 176 | { |
|---|
| 177 | if (uSeqIndex >= m_uSeqCount) |
|---|
| 178 | Quit("MSA::SetSeqName(%u, %s), count=%u", uSeqIndex, m_uSeqCount); |
|---|
| 179 | delete[] m_szNames[uSeqIndex]; |
|---|
| 180 | int n = (int) strlen(szName) + 1; |
|---|
| 181 | m_szNames[uSeqIndex] = new char[n]; |
|---|
| 182 | memcpy(m_szNames[uSeqIndex], szName, n); |
|---|
| 183 | } |
|---|
| 184 | |
|---|
| 185 | const char *MSA::GetSeqName(unsigned uSeqIndex) const |
|---|
| 186 | { |
|---|
| 187 | if (uSeqIndex >= m_uSeqCount) |
|---|
| 188 | Quit("MSA::GetSeqName(%u), count=%u", uSeqIndex, m_uSeqCount); |
|---|
| 189 | return m_szNames[uSeqIndex]; |
|---|
| 190 | } |
|---|
| 191 | |
|---|
| 192 | bool MSA::IsGap(unsigned uSeqIndex, unsigned uIndex) const |
|---|
| 193 | { |
|---|
| 194 | char c = GetChar(uSeqIndex, uIndex); |
|---|
| 195 | return IsGapChar(c); |
|---|
| 196 | } |
|---|
| 197 | |
|---|
| 198 | bool MSA::IsWildcard(unsigned uSeqIndex, unsigned uIndex) const |
|---|
| 199 | { |
|---|
| 200 | char c = GetChar(uSeqIndex, uIndex); |
|---|
| 201 | return IsWildcardChar(c); |
|---|
| 202 | } |
|---|
| 203 | |
|---|
| 204 | void MSA::SetChar(unsigned uSeqIndex, unsigned uIndex, char c) |
|---|
| 205 | { |
|---|
| 206 | if (uSeqIndex >= m_uSeqCount || uIndex > m_uCacheSeqLength) |
|---|
| 207 | Quit("MSA::SetChar(%u,%u)", uSeqIndex, uIndex); |
|---|
| 208 | |
|---|
| 209 | if (uIndex == m_uCacheSeqLength) |
|---|
| 210 | { |
|---|
| 211 | const unsigned uNewCacheSeqLength = m_uCacheSeqLength + DEFAULT_SEQ_LENGTH; |
|---|
| 212 | for (unsigned n = 0; n < m_uSeqCount; ++n) |
|---|
| 213 | { |
|---|
| 214 | char *ptrNewSeq = new char[uNewCacheSeqLength+1]; |
|---|
| 215 | memcpy(ptrNewSeq, m_szSeqs[n], m_uCacheSeqLength); |
|---|
| 216 | memset(ptrNewSeq + m_uCacheSeqLength, '?', DEFAULT_SEQ_LENGTH); |
|---|
| 217 | ptrNewSeq[uNewCacheSeqLength] = 0; |
|---|
| 218 | delete[] m_szSeqs[n]; |
|---|
| 219 | m_szSeqs[n] = ptrNewSeq; |
|---|
| 220 | } |
|---|
| 221 | |
|---|
| 222 | m_uColCount = uIndex; |
|---|
| 223 | m_uCacheSeqLength = uNewCacheSeqLength; |
|---|
| 224 | } |
|---|
| 225 | |
|---|
| 226 | if (uIndex >= m_uColCount) |
|---|
| 227 | m_uColCount = uIndex + 1; |
|---|
| 228 | m_szSeqs[uSeqIndex][uIndex] = c; |
|---|
| 229 | } |
|---|
| 230 | |
|---|
| 231 | void MSA::GetSeq(unsigned uSeqIndex, Seq &seq) const |
|---|
| 232 | { |
|---|
| 233 | assert(uSeqIndex < m_uSeqCount); |
|---|
| 234 | |
|---|
| 235 | seq.Clear(); |
|---|
| 236 | |
|---|
| 237 | for (unsigned n = 0; n < m_uColCount; ++n) |
|---|
| 238 | if (!IsGap(uSeqIndex, n)) |
|---|
| 239 | { |
|---|
| 240 | char c = GetChar(uSeqIndex, n); |
|---|
| 241 | if (!isalpha(c)) |
|---|
| 242 | Quit("Invalid character '%c' in sequence", c); |
|---|
| 243 | c = toupper(c); |
|---|
| 244 | seq.push_back(c); |
|---|
| 245 | } |
|---|
| 246 | const char *ptrName = GetSeqName(uSeqIndex); |
|---|
| 247 | seq.SetName(ptrName); |
|---|
| 248 | } |
|---|
| 249 | |
|---|
| 250 | bool MSA::HasGap() const |
|---|
| 251 | { |
|---|
| 252 | for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) |
|---|
| 253 | for (unsigned n = 0; n < GetColCount(); ++n) |
|---|
| 254 | if (IsGap(uSeqIndex, n)) |
|---|
| 255 | return true; |
|---|
| 256 | return false; |
|---|
| 257 | } |
|---|
| 258 | |
|---|
| 259 | bool MSA::IsLegalLetter(unsigned uLetter) const |
|---|
| 260 | { |
|---|
| 261 | return uLetter < 20; |
|---|
| 262 | } |
|---|
| 263 | |
|---|
| 264 | void MSA::SetSeqCount(unsigned uSeqCount) |
|---|
| 265 | { |
|---|
| 266 | Free(); |
|---|
| 267 | SetSize(uSeqCount, DEFAULT_SEQ_LENGTH); |
|---|
| 268 | } |
|---|
| 269 | |
|---|
| 270 | void MSA::CopyCol(unsigned uFromCol, unsigned uToCol) |
|---|
| 271 | { |
|---|
| 272 | assert(uFromCol < GetColCount()); |
|---|
| 273 | assert(uToCol < GetColCount()); |
|---|
| 274 | if (uFromCol == uToCol) |
|---|
| 275 | return; |
|---|
| 276 | |
|---|
| 277 | for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) |
|---|
| 278 | { |
|---|
| 279 | const char c = GetChar(uSeqIndex, uFromCol); |
|---|
| 280 | SetChar(uSeqIndex, uToCol, c); |
|---|
| 281 | } |
|---|
| 282 | } |
|---|
| 283 | |
|---|
| 284 | void MSA::Copy(const MSA &msa) |
|---|
| 285 | { |
|---|
| 286 | Free(); |
|---|
| 287 | const unsigned uSeqCount = msa.GetSeqCount(); |
|---|
| 288 | const unsigned uColCount = msa.GetColCount(); |
|---|
| 289 | SetSize(uSeqCount, uColCount); |
|---|
| 290 | |
|---|
| 291 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
|---|
| 292 | { |
|---|
| 293 | SetSeqName(uSeqIndex, msa.GetSeqName(uSeqIndex)); |
|---|
| 294 | const unsigned uId = msa.GetSeqId(uSeqIndex); |
|---|
| 295 | SetSeqId(uSeqIndex, uId); |
|---|
| 296 | for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) |
|---|
| 297 | { |
|---|
| 298 | const char c = msa.GetChar(uSeqIndex, uColIndex); |
|---|
| 299 | SetChar(uSeqIndex, uColIndex, c); |
|---|
| 300 | } |
|---|
| 301 | } |
|---|
| 302 | } |
|---|
| 303 | |
|---|
| 304 | bool MSA::IsGapColumn(unsigned uColIndex) const |
|---|
| 305 | { |
|---|
| 306 | assert(GetSeqCount() > 0); |
|---|
| 307 | for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) |
|---|
| 308 | if (!IsGap(uSeqIndex, uColIndex)) |
|---|
| 309 | return false; |
|---|
| 310 | return true; |
|---|
| 311 | } |
|---|
| 312 | |
|---|
| 313 | bool MSA::GetSeqIndex(const char *ptrSeqName, unsigned *ptruSeqIndex) const |
|---|
| 314 | { |
|---|
| 315 | for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) |
|---|
| 316 | if (0 == stricmp(ptrSeqName, GetSeqName(uSeqIndex))) |
|---|
| 317 | { |
|---|
| 318 | *ptruSeqIndex = uSeqIndex; |
|---|
| 319 | return true; |
|---|
| 320 | } |
|---|
| 321 | return false; |
|---|
| 322 | } |
|---|
| 323 | |
|---|
| 324 | void MSA::DeleteCol(unsigned uColIndex) |
|---|
| 325 | { |
|---|
| 326 | assert(uColIndex < m_uColCount); |
|---|
| 327 | size_t n = m_uColCount - uColIndex; |
|---|
| 328 | if (n > 0) |
|---|
| 329 | { |
|---|
| 330 | for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) |
|---|
| 331 | { |
|---|
| 332 | char *ptrSeq = m_szSeqs[uSeqIndex]; |
|---|
| 333 | memmove(ptrSeq + uColIndex, ptrSeq + uColIndex + 1, n); |
|---|
| 334 | } |
|---|
| 335 | } |
|---|
| 336 | --m_uColCount; |
|---|
| 337 | } |
|---|
| 338 | |
|---|
| 339 | void MSA::DeleteColumns(unsigned uColIndex, unsigned uColCount) |
|---|
| 340 | { |
|---|
| 341 | for (unsigned n = 0; n < uColCount; ++n) |
|---|
| 342 | DeleteCol(uColIndex); |
|---|
| 343 | } |
|---|
| 344 | |
|---|
| 345 | void MSA::FromFile(TextFile &File) |
|---|
| 346 | { |
|---|
| 347 | FromFASTAFile(File); |
|---|
| 348 | } |
|---|
| 349 | |
|---|
| 350 | // Weights sum to 1, WCounts sum to NIC |
|---|
| 351 | WEIGHT MSA::GetSeqWeight(unsigned uSeqIndex) const |
|---|
| 352 | { |
|---|
| 353 | assert(uSeqIndex < m_uSeqCount); |
|---|
| 354 | WEIGHT w = m_Weights[uSeqIndex]; |
|---|
| 355 | if (w == wInsane) |
|---|
| 356 | Quit("Seq weight not set"); |
|---|
| 357 | return w; |
|---|
| 358 | } |
|---|
| 359 | |
|---|
| 360 | void MSA::SetSeqWeight(unsigned uSeqIndex, WEIGHT w) const |
|---|
| 361 | { |
|---|
| 362 | assert(uSeqIndex < m_uSeqCount); |
|---|
| 363 | m_Weights[uSeqIndex] = w; |
|---|
| 364 | } |
|---|
| 365 | |
|---|
| 366 | void MSA::NormalizeWeights(WEIGHT wDesiredTotal) const |
|---|
| 367 | { |
|---|
| 368 | WEIGHT wTotal = 0; |
|---|
| 369 | for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex) |
|---|
| 370 | wTotal += m_Weights[uSeqIndex]; |
|---|
| 371 | |
|---|
| 372 | if (0 == wTotal) |
|---|
| 373 | return; |
|---|
| 374 | |
|---|
| 375 | const WEIGHT f = wDesiredTotal/wTotal; |
|---|
| 376 | for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex) |
|---|
| 377 | m_Weights[uSeqIndex] *= f; |
|---|
| 378 | } |
|---|
| 379 | |
|---|
| 380 | void MSA::CalcWeights() const |
|---|
| 381 | { |
|---|
| 382 | Quit("Calc weights not implemented"); |
|---|
| 383 | } |
|---|
| 384 | |
|---|
| 385 | static void FmtChar(char c, unsigned uWidth) |
|---|
| 386 | { |
|---|
| 387 | Log("%c", c); |
|---|
| 388 | for (unsigned n = 0; n < uWidth - 1; ++n) |
|---|
| 389 | Log(" "); |
|---|
| 390 | } |
|---|
| 391 | |
|---|
| 392 | static void FmtInt(unsigned u, unsigned uWidth) |
|---|
| 393 | { |
|---|
| 394 | static char szStr[1024]; |
|---|
| 395 | assert(uWidth < sizeof(szStr)); |
|---|
| 396 | if (u > 0) |
|---|
| 397 | sprintf(szStr, "%u", u); |
|---|
| 398 | else |
|---|
| 399 | strcpy(szStr, "."); |
|---|
| 400 | Log(szStr); |
|---|
| 401 | unsigned n = (unsigned) strlen(szStr); |
|---|
| 402 | if (n < uWidth) |
|---|
| 403 | for (unsigned i = 0; i < uWidth - n; ++i) |
|---|
| 404 | Log(" "); |
|---|
| 405 | } |
|---|
| 406 | |
|---|
| 407 | static void FmtInt0(unsigned u, unsigned uWidth) |
|---|
| 408 | { |
|---|
| 409 | static char szStr[1024]; |
|---|
| 410 | assert(uWidth < sizeof(szStr)); |
|---|
| 411 | sprintf(szStr, "%u", u); |
|---|
| 412 | Log(szStr); |
|---|
| 413 | unsigned n = (unsigned) strlen(szStr); |
|---|
| 414 | if (n < uWidth) |
|---|
| 415 | for (unsigned i = 0; i < uWidth - n; ++i) |
|---|
| 416 | Log(" "); |
|---|
| 417 | } |
|---|
| 418 | |
|---|
| 419 | static void FmtPad(unsigned n) |
|---|
| 420 | { |
|---|
| 421 | for (unsigned i = 0; i < n; ++i) |
|---|
| 422 | Log(" "); |
|---|
| 423 | } |
|---|
| 424 | |
|---|
| 425 | void MSA::FromSeq(const Seq &s) |
|---|
| 426 | { |
|---|
| 427 | unsigned uSeqLength = s.Length(); |
|---|
| 428 | SetSize(1, uSeqLength); |
|---|
| 429 | SetSeqName(0, s.GetName()); |
|---|
| 430 | if (0 != m_SeqIndexToId) |
|---|
| 431 | SetSeqId(0, s.GetId()); |
|---|
| 432 | for (unsigned n = 0; n < uSeqLength; ++n) |
|---|
| 433 | SetChar(0, n, s[n]); |
|---|
| 434 | } |
|---|
| 435 | |
|---|
| 436 | unsigned MSA::GetCharCount(unsigned uSeqIndex, unsigned uColIndex) const |
|---|
| 437 | { |
|---|
| 438 | assert(uSeqIndex < GetSeqCount()); |
|---|
| 439 | assert(uColIndex < GetColCount()); |
|---|
| 440 | |
|---|
| 441 | unsigned uCol = 0; |
|---|
| 442 | for (unsigned n = 0; n <= uColIndex; ++n) |
|---|
| 443 | if (!IsGap(uSeqIndex, n)) |
|---|
| 444 | ++uCol; |
|---|
| 445 | return uCol; |
|---|
| 446 | } |
|---|
| 447 | |
|---|
| 448 | void MSA::CopySeq(unsigned uToSeqIndex, const MSA &msaFrom, unsigned uFromSeqIndex) |
|---|
| 449 | { |
|---|
| 450 | assert(uToSeqIndex < m_uSeqCount); |
|---|
| 451 | const unsigned uColCount = msaFrom.GetColCount(); |
|---|
| 452 | assert(m_uColCount == uColCount || |
|---|
| 453 | (0 == m_uColCount && uColCount <= m_uCacheSeqLength)); |
|---|
| 454 | |
|---|
| 455 | memcpy(m_szSeqs[uToSeqIndex], msaFrom.GetSeqBuffer(uFromSeqIndex), uColCount); |
|---|
| 456 | SetSeqName(uToSeqIndex, msaFrom.GetSeqName(uFromSeqIndex)); |
|---|
| 457 | if (0 == m_uColCount) |
|---|
| 458 | m_uColCount = uColCount; |
|---|
| 459 | } |
|---|
| 460 | |
|---|
| 461 | const char *MSA::GetSeqBuffer(unsigned uSeqIndex) const |
|---|
| 462 | { |
|---|
| 463 | assert(uSeqIndex < m_uSeqCount); |
|---|
| 464 | return m_szSeqs[uSeqIndex]; |
|---|
| 465 | } |
|---|
| 466 | |
|---|
| 467 | void MSA::DeleteSeq(unsigned uSeqIndex) |
|---|
| 468 | { |
|---|
| 469 | assert(uSeqIndex < m_uSeqCount); |
|---|
| 470 | |
|---|
| 471 | delete m_szSeqs[uSeqIndex]; |
|---|
| 472 | delete m_szNames[uSeqIndex]; |
|---|
| 473 | |
|---|
| 474 | const unsigned uBytesToMove = (m_uSeqCount - uSeqIndex)*sizeof(char *); |
|---|
| 475 | if (uBytesToMove > 0) |
|---|
| 476 | { |
|---|
| 477 | memmove(m_szSeqs + uSeqIndex, m_szSeqs + uSeqIndex + 1, uBytesToMove); |
|---|
| 478 | memmove(m_szNames + uSeqIndex, m_szNames + uSeqIndex + 1, uBytesToMove); |
|---|
| 479 | } |
|---|
| 480 | |
|---|
| 481 | --m_uSeqCount; |
|---|
| 482 | |
|---|
| 483 | delete[] m_Weights; |
|---|
| 484 | m_Weights = 0; |
|---|
| 485 | } |
|---|
| 486 | |
|---|
| 487 | bool MSA::IsEmptyCol(unsigned uColIndex) const |
|---|
| 488 | { |
|---|
| 489 | const unsigned uSeqCount = GetSeqCount(); |
|---|
| 490 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
|---|
| 491 | if (!IsGap(uSeqIndex, uColIndex)) |
|---|
| 492 | return false; |
|---|
| 493 | return true; |
|---|
| 494 | } |
|---|
| 495 | |
|---|
| 496 | //void MSA::DeleteEmptyCols(bool bProgress) |
|---|
| 497 | // { |
|---|
| 498 | // unsigned uColCount = GetColCount(); |
|---|
| 499 | // for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) |
|---|
| 500 | // { |
|---|
| 501 | // if (IsEmptyCol(uColIndex)) |
|---|
| 502 | // { |
|---|
| 503 | // if (bProgress) |
|---|
| 504 | // { |
|---|
| 505 | // Log("Deleting col %u of %u\n", uColIndex, uColCount); |
|---|
| 506 | // printf("Deleting col %u of %u\n", uColIndex, uColCount); |
|---|
| 507 | // } |
|---|
| 508 | // DeleteCol(uColIndex); |
|---|
| 509 | // --uColCount; |
|---|
| 510 | // } |
|---|
| 511 | // } |
|---|
| 512 | // } |
|---|
| 513 | |
|---|
| 514 | unsigned MSA::AlignedColIndexToColIndex(unsigned uAlignedColIndex) const |
|---|
| 515 | { |
|---|
| 516 | Quit("MSA::AlignedColIndexToColIndex not implemented"); |
|---|
| 517 | return 0; |
|---|
| 518 | } |
|---|
| 519 | |
|---|
| 520 | WEIGHT MSA::GetTotalSeqWeight() const |
|---|
| 521 | { |
|---|
| 522 | WEIGHT wTotal = 0; |
|---|
| 523 | const unsigned uSeqCount = GetSeqCount(); |
|---|
| 524 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
|---|
| 525 | wTotal += m_Weights[uSeqIndex]; |
|---|
| 526 | return wTotal; |
|---|
| 527 | } |
|---|
| 528 | |
|---|
| 529 | bool MSA::SeqsEq(const MSA &a1, unsigned uSeqIndex1, const MSA &a2, |
|---|
| 530 | unsigned uSeqIndex2) |
|---|
| 531 | { |
|---|
| 532 | Seq s1; |
|---|
| 533 | Seq s2; |
|---|
| 534 | |
|---|
| 535 | a1.GetSeq(uSeqIndex1, s1); |
|---|
| 536 | a2.GetSeq(uSeqIndex2, s2); |
|---|
| 537 | |
|---|
| 538 | s1.StripGaps(); |
|---|
| 539 | s2.StripGaps(); |
|---|
| 540 | |
|---|
| 541 | return s1.EqIgnoreCase(s2); |
|---|
| 542 | } |
|---|
| 543 | |
|---|
| 544 | unsigned MSA::GetSeqLength(unsigned uSeqIndex) const |
|---|
| 545 | { |
|---|
| 546 | assert(uSeqIndex < GetSeqCount()); |
|---|
| 547 | |
|---|
| 548 | const unsigned uColCount = GetColCount(); |
|---|
| 549 | unsigned uLength = 0; |
|---|
| 550 | for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) |
|---|
| 551 | if (!IsGap(uSeqIndex, uColIndex)) |
|---|
| 552 | ++uLength; |
|---|
| 553 | return uLength; |
|---|
| 554 | } |
|---|
| 555 | |
|---|
| 556 | void MSA::GetPWID(unsigned uSeqIndex1, unsigned uSeqIndex2, double *ptrPWID, |
|---|
| 557 | unsigned *ptruPosCount) const |
|---|
| 558 | { |
|---|
| 559 | assert(uSeqIndex1 < GetSeqCount()); |
|---|
| 560 | assert(uSeqIndex2 < GetSeqCount()); |
|---|
| 561 | |
|---|
| 562 | unsigned uSameCount = 0; |
|---|
| 563 | unsigned uPosCount = 0; |
|---|
| 564 | const unsigned uColCount = GetColCount(); |
|---|
| 565 | for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) |
|---|
| 566 | { |
|---|
| 567 | char c1 = GetChar(uSeqIndex1, uColIndex); |
|---|
| 568 | if (IsGapChar(c1)) |
|---|
| 569 | continue; |
|---|
| 570 | char c2 = GetChar(uSeqIndex2, uColIndex); |
|---|
| 571 | if (IsGapChar(c2)) |
|---|
| 572 | continue; |
|---|
| 573 | ++uPosCount; |
|---|
| 574 | if (c1 == c2) |
|---|
| 575 | ++uSameCount; |
|---|
| 576 | } |
|---|
| 577 | *ptruPosCount = uPosCount; |
|---|
| 578 | if (uPosCount > 0) |
|---|
| 579 | *ptrPWID = 100.0 * (double) uSameCount / (double) uPosCount; |
|---|
| 580 | else |
|---|
| 581 | *ptrPWID = 0; |
|---|
| 582 | } |
|---|
| 583 | |
|---|
| 584 | void MSA::UnWeight() |
|---|
| 585 | { |
|---|
| 586 | for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) |
|---|
| 587 | m_Weights[uSeqIndex] = BTInsane; |
|---|
| 588 | } |
|---|
| 589 | |
|---|
| 590 | unsigned MSA::UniqueResidueTypes(unsigned uColIndex) const |
|---|
| 591 | { |
|---|
| 592 | assert(uColIndex < GetColCount()); |
|---|
| 593 | |
|---|
| 594 | unsigned Counts[MAX_ALPHA]; |
|---|
| 595 | memset(Counts, 0, sizeof(Counts)); |
|---|
| 596 | const unsigned uSeqCount = GetSeqCount(); |
|---|
| 597 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
|---|
| 598 | { |
|---|
| 599 | if (IsGap(uSeqIndex, uColIndex) || IsWildcard(uSeqIndex, uColIndex)) |
|---|
| 600 | continue; |
|---|
| 601 | const unsigned uLetter = GetLetter(uSeqIndex, uColIndex); |
|---|
| 602 | ++(Counts[uLetter]); |
|---|
| 603 | } |
|---|
| 604 | unsigned uUniqueCount = 0; |
|---|
| 605 | for (unsigned uLetter = 0; uLetter < g_AlphaSize; ++uLetter) |
|---|
| 606 | if (Counts[uLetter] > 0) |
|---|
| 607 | ++uUniqueCount; |
|---|
| 608 | return uUniqueCount; |
|---|
| 609 | } |
|---|
| 610 | |
|---|
| 611 | double MSA::GetOcc(unsigned uColIndex) const |
|---|
| 612 | { |
|---|
| 613 | unsigned uGapCount = 0; |
|---|
| 614 | for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) |
|---|
| 615 | if (IsGap(uSeqIndex, uColIndex)) |
|---|
| 616 | ++uGapCount; |
|---|
| 617 | unsigned uSeqCount = GetSeqCount(); |
|---|
| 618 | return (double) (uSeqCount - uGapCount) / (double) uSeqCount; |
|---|
| 619 | } |
|---|
| 620 | |
|---|
| 621 | void MSA::ToFile(TextFile &File) const |
|---|
| 622 | { |
|---|
| 623 | if (g_bMSF) |
|---|
| 624 | ToMSFFile(File); |
|---|
| 625 | else if (g_bAln) |
|---|
| 626 | ToAlnFile(File); |
|---|
| 627 | else if (g_bHTML) |
|---|
| 628 | ToHTMLFile(File); |
|---|
| 629 | else if (g_bPHYS) |
|---|
| 630 | ToPhySequentialFile(File); |
|---|
| 631 | else if (g_bPHYI) |
|---|
| 632 | ToPhyInterleavedFile(File); |
|---|
| 633 | else |
|---|
| 634 | ToFASTAFile(File); |
|---|
| 635 | if (0 != g_pstrScoreFileName) |
|---|
| 636 | WriteScoreFile(*this); |
|---|
| 637 | } |
|---|
| 638 | |
|---|
| 639 | bool MSA::ColumnHasGap(unsigned uColIndex) const |
|---|
| 640 | { |
|---|
| 641 | const unsigned uSeqCount = GetSeqCount(); |
|---|
| 642 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
|---|
| 643 | if (IsGap(uSeqIndex, uColIndex)) |
|---|
| 644 | return true; |
|---|
| 645 | return false; |
|---|
| 646 | } |
|---|
| 647 | |
|---|
| 648 | void MSA::SetIdCount(unsigned uIdCount) |
|---|
| 649 | { |
|---|
| 650 | //if (m_uIdCount != 0) |
|---|
| 651 | // Quit("MSA::SetIdCount: may only be called once"); |
|---|
| 652 | |
|---|
| 653 | if (m_uIdCount > 0) |
|---|
| 654 | { |
|---|
| 655 | if (uIdCount > m_uIdCount) |
|---|
| 656 | Quit("MSA::SetIdCount: cannot increase count"); |
|---|
| 657 | return; |
|---|
| 658 | } |
|---|
| 659 | m_uIdCount = uIdCount; |
|---|
| 660 | } |
|---|
| 661 | |
|---|
| 662 | void MSA::SetSeqId(unsigned uSeqIndex, unsigned uId) |
|---|
| 663 | { |
|---|
| 664 | assert(uSeqIndex < m_uSeqCount); |
|---|
| 665 | assert(uId < m_uIdCount); |
|---|
| 666 | if (0 == m_SeqIndexToId) |
|---|
| 667 | { |
|---|
| 668 | if (0 == m_uIdCount) |
|---|
| 669 | Quit("MSA::SetSeqId, SetIdCount has not been called"); |
|---|
| 670 | m_IdToSeqIndex = new unsigned[m_uIdCount]; |
|---|
| 671 | m_SeqIndexToId = new unsigned[m_uSeqCount]; |
|---|
| 672 | |
|---|
| 673 | memset(m_IdToSeqIndex, 0xff, m_uIdCount*sizeof(unsigned)); |
|---|
| 674 | memset(m_SeqIndexToId, 0xff, m_uSeqCount*sizeof(unsigned)); |
|---|
| 675 | } |
|---|
| 676 | m_SeqIndexToId[uSeqIndex] = uId; |
|---|
| 677 | m_IdToSeqIndex[uId] = uSeqIndex; |
|---|
| 678 | } |
|---|
| 679 | |
|---|
| 680 | unsigned MSA::GetSeqIndex(unsigned uId) const |
|---|
| 681 | { |
|---|
| 682 | assert(uId < m_uIdCount); |
|---|
| 683 | assert(0 != m_IdToSeqIndex); |
|---|
| 684 | unsigned uSeqIndex = m_IdToSeqIndex[uId]; |
|---|
| 685 | assert(uSeqIndex < m_uSeqCount); |
|---|
| 686 | return uSeqIndex; |
|---|
| 687 | } |
|---|
| 688 | |
|---|
| 689 | bool MSA::GetSeqIndex(unsigned uId, unsigned *ptruIndex) const |
|---|
| 690 | { |
|---|
| 691 | for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex) |
|---|
| 692 | { |
|---|
| 693 | if (uId == m_SeqIndexToId[uSeqIndex]) |
|---|
| 694 | { |
|---|
| 695 | *ptruIndex = uSeqIndex; |
|---|
| 696 | return true; |
|---|
| 697 | } |
|---|
| 698 | } |
|---|
| 699 | return false; |
|---|
| 700 | } |
|---|
| 701 | |
|---|
| 702 | unsigned MSA::GetSeqId(unsigned uSeqIndex) const |
|---|
| 703 | { |
|---|
| 704 | assert(uSeqIndex < m_uSeqCount); |
|---|
| 705 | unsigned uId = m_SeqIndexToId[uSeqIndex]; |
|---|
| 706 | assert(uId < m_uIdCount); |
|---|
| 707 | return uId; |
|---|
| 708 | } |
|---|
| 709 | |
|---|
| 710 | bool MSA::WeightsSet() const |
|---|
| 711 | { |
|---|
| 712 | return BTInsane != m_Weights[0]; |
|---|
| 713 | } |
|---|
| 714 | |
|---|
| 715 | void MSASubsetByIds(const MSA &msaIn, const unsigned Ids[], unsigned uIdCount, |
|---|
| 716 | MSA &msaOut) |
|---|
| 717 | { |
|---|
| 718 | const unsigned uColCount = msaIn.GetColCount(); |
|---|
| 719 | msaOut.SetSize(uIdCount, uColCount); |
|---|
| 720 | for (unsigned uSeqIndexOut = 0; uSeqIndexOut < uIdCount; ++uSeqIndexOut) |
|---|
| 721 | { |
|---|
| 722 | const unsigned uId = Ids[uSeqIndexOut]; |
|---|
| 723 | |
|---|
| 724 | const unsigned uSeqIndexIn = msaIn.GetSeqIndex(uId); |
|---|
| 725 | const char *ptrName = msaIn.GetSeqName(uSeqIndexIn); |
|---|
| 726 | |
|---|
| 727 | msaOut.SetSeqId(uSeqIndexOut, uId); |
|---|
| 728 | msaOut.SetSeqName(uSeqIndexOut, ptrName); |
|---|
| 729 | |
|---|
| 730 | for (unsigned uColIndex = 0; uColIndex < uColCount; ++uColIndex) |
|---|
| 731 | { |
|---|
| 732 | const char c = msaIn.GetChar(uSeqIndexIn, uColIndex); |
|---|
| 733 | msaOut.SetChar(uSeqIndexOut, uColIndex, c); |
|---|
| 734 | } |
|---|
| 735 | } |
|---|
| 736 | } |
|---|
| 737 | |
|---|
| 738 | // Caller must allocate ptrSeq and ptrLabel as new char[n]. |
|---|
| 739 | void MSA::AppendSeq(char *ptrSeq, unsigned uSeqLength, char *ptrLabel) |
|---|
| 740 | { |
|---|
| 741 | if (m_uSeqCount > m_uCacheSeqCount) |
|---|
| 742 | Quit("Internal error MSA::AppendSeq"); |
|---|
| 743 | if (m_uSeqCount == m_uCacheSeqCount) |
|---|
| 744 | ExpandCache(m_uSeqCount + 4, uSeqLength); |
|---|
| 745 | m_szSeqs[m_uSeqCount] = ptrSeq; |
|---|
| 746 | m_szNames[m_uSeqCount] = ptrLabel; |
|---|
| 747 | ++m_uSeqCount; |
|---|
| 748 | } |
|---|
| 749 | |
|---|
| 750 | void MSA::ExpandCache(unsigned uSeqCount, unsigned uColCount) |
|---|
| 751 | { |
|---|
| 752 | if (m_IdToSeqIndex != 0 || m_SeqIndexToId != 0 || uSeqCount < m_uSeqCount) |
|---|
| 753 | Quit("Internal error MSA::ExpandCache"); |
|---|
| 754 | |
|---|
| 755 | if (m_uSeqCount > 0 && uColCount != m_uColCount) |
|---|
| 756 | Quit("Internal error MSA::ExpandCache, ColCount changed"); |
|---|
| 757 | |
|---|
| 758 | char **NewSeqs = new char *[uSeqCount]; |
|---|
| 759 | char **NewNames = new char *[uSeqCount]; |
|---|
| 760 | WEIGHT *NewWeights = new WEIGHT[uSeqCount]; |
|---|
| 761 | |
|---|
| 762 | for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex) |
|---|
| 763 | { |
|---|
| 764 | NewSeqs[uSeqIndex] = m_szSeqs[uSeqIndex]; |
|---|
| 765 | NewNames[uSeqIndex] = m_szNames[uSeqIndex]; |
|---|
| 766 | NewWeights[uSeqIndex] = m_Weights[uSeqIndex]; |
|---|
| 767 | } |
|---|
| 768 | |
|---|
| 769 | for (unsigned uSeqIndex = m_uSeqCount; uSeqIndex < uSeqCount; ++uSeqIndex) |
|---|
| 770 | { |
|---|
| 771 | char *Seq = new char[uColCount]; |
|---|
| 772 | NewSeqs[uSeqIndex] = Seq; |
|---|
| 773 | #if DEBUG |
|---|
| 774 | memset(Seq, '?', uColCount); |
|---|
| 775 | #endif |
|---|
| 776 | } |
|---|
| 777 | |
|---|
| 778 | delete[] m_szSeqs; |
|---|
| 779 | delete[] m_szNames; |
|---|
| 780 | delete[] m_Weights; |
|---|
| 781 | |
|---|
| 782 | m_szSeqs = NewSeqs; |
|---|
| 783 | m_szNames = NewNames; |
|---|
| 784 | m_Weights = NewWeights; |
|---|
| 785 | |
|---|
| 786 | m_uCacheSeqCount = uSeqCount; |
|---|
| 787 | m_uCacheSeqLength = uColCount; |
|---|
| 788 | m_uColCount = uColCount; |
|---|
| 789 | } |
|---|
| 790 | |
|---|
| 791 | void MSA::FixAlpha() |
|---|
| 792 | { |
|---|
| 793 | ClearInvalidLetterWarning(); |
|---|
| 794 | for (unsigned uSeqIndex = 0; uSeqIndex < m_uSeqCount; ++uSeqIndex) |
|---|
| 795 | { |
|---|
| 796 | for (unsigned uColIndex = 0; uColIndex < m_uColCount; ++uColIndex) |
|---|
| 797 | { |
|---|
| 798 | char c = GetChar(uSeqIndex, uColIndex); |
|---|
| 799 | if (!IsResidueChar(c) && !IsGapChar(c)) |
|---|
| 800 | { |
|---|
| 801 | char w = GetWildcardChar(); |
|---|
| 802 | // Warning("Invalid letter '%c', replaced by '%c'", c, w); |
|---|
| 803 | InvalidLetterWarning(c, w); |
|---|
| 804 | SetChar(uSeqIndex, uColIndex, w); |
|---|
| 805 | } |
|---|
| 806 | } |
|---|
| 807 | } |
|---|
| 808 | ReportInvalidLetters(); |
|---|
| 809 | } |
|---|
| 810 | |
|---|
| 811 | ALPHA MSA::GuessAlpha() const |
|---|
| 812 | { |
|---|
| 813 | // If at least MIN_NUCLEO_PCT of the first CHAR_COUNT non-gap |
|---|
| 814 | // letters belong to the nucleotide alphabet, guess nucleo. |
|---|
| 815 | // Otherwise amino. |
|---|
| 816 | const unsigned CHAR_COUNT = 100; |
|---|
| 817 | const unsigned MIN_NUCLEO_PCT = 95; |
|---|
| 818 | |
|---|
| 819 | const unsigned uSeqCount = GetSeqCount(); |
|---|
| 820 | const unsigned uColCount = GetColCount(); |
|---|
| 821 | if (0 == uSeqCount) |
|---|
| 822 | return ALPHA_Amino; |
|---|
| 823 | |
|---|
| 824 | unsigned uDNACount = 0; |
|---|
| 825 | unsigned uRNACount = 0; |
|---|
| 826 | unsigned uTotal = 0; |
|---|
| 827 | unsigned i = 0; |
|---|
| 828 | for (;;) |
|---|
| 829 | { |
|---|
| 830 | unsigned uSeqIndex = i/uColCount; |
|---|
| 831 | if (uSeqIndex >= uSeqCount) |
|---|
| 832 | break; |
|---|
| 833 | unsigned uColIndex = i%uColCount; |
|---|
| 834 | ++i; |
|---|
| 835 | char c = GetChar(uSeqIndex, uColIndex); |
|---|
| 836 | if (IsGapChar(c)) |
|---|
| 837 | continue; |
|---|
| 838 | if (IsDNA(c)) |
|---|
| 839 | ++uDNACount; |
|---|
| 840 | if (IsRNA(c)) |
|---|
| 841 | ++uRNACount; |
|---|
| 842 | ++uTotal; |
|---|
| 843 | if (uTotal >= CHAR_COUNT) |
|---|
| 844 | break; |
|---|
| 845 | } |
|---|
| 846 | if (uTotal != 0 && ((uRNACount*100)/uTotal) >= MIN_NUCLEO_PCT) |
|---|
| 847 | return ALPHA_RNA; |
|---|
| 848 | if (uTotal != 0 && ((uDNACount*100)/uTotal) >= MIN_NUCLEO_PCT) |
|---|
| 849 | return ALPHA_DNA; |
|---|
| 850 | return ALPHA_Amino; |
|---|
| 851 | } |
|---|