| 1 | #include "muscle.h" |
|---|
| 2 | #include "seq.h" |
|---|
| 3 | #include "textfile.h" |
|---|
| 4 | #include "msa.h" |
|---|
| 5 | //#include <ctype.h> |
|---|
| 6 | |
|---|
| 7 | const size_t MAX_FASTA_LINE = 16000; |
|---|
| 8 | |
|---|
| 9 | void Seq::SetName(const char *ptrName) |
|---|
| 10 | { |
|---|
| 11 | delete[] m_ptrName; |
|---|
| 12 | size_t n = strlen(ptrName) + 1; |
|---|
| 13 | m_ptrName = new char[n]; |
|---|
| 14 | strcpy(m_ptrName, ptrName); |
|---|
| 15 | } |
|---|
| 16 | |
|---|
| 17 | void Seq::ToFASTAFile(TextFile &File) const |
|---|
| 18 | { |
|---|
| 19 | File.PutFormat(">%s\n", m_ptrName); |
|---|
| 20 | unsigned uColCount = Length(); |
|---|
| 21 | for (unsigned n = 0; n < uColCount; ++n) |
|---|
| 22 | { |
|---|
| 23 | if (n > 0 && n%60 == 0) |
|---|
| 24 | File.PutString("\n"); |
|---|
| 25 | File.PutChar(at(n)); |
|---|
| 26 | } |
|---|
| 27 | File.PutString("\n"); |
|---|
| 28 | } |
|---|
| 29 | |
|---|
| 30 | // Return true on end-of-file |
|---|
| 31 | bool Seq::FromFASTAFile(TextFile &File) |
|---|
| 32 | { |
|---|
| 33 | Clear(); |
|---|
| 34 | |
|---|
| 35 | char szLine[MAX_FASTA_LINE]; |
|---|
| 36 | bool bEof = File.GetLine(szLine, sizeof(szLine)); |
|---|
| 37 | if (bEof) |
|---|
| 38 | return true; |
|---|
| 39 | if ('>' != szLine[0]) |
|---|
| 40 | Quit("Expecting '>' in FASTA file %s line %u", |
|---|
| 41 | File.GetFileName(), File.GetLineNr()); |
|---|
| 42 | |
|---|
| 43 | size_t n = strlen(szLine); |
|---|
| 44 | if (1 == n) |
|---|
| 45 | Quit("Missing annotation following '>' in FASTA file %s line %u", |
|---|
| 46 | File.GetFileName(), File.GetLineNr()); |
|---|
| 47 | |
|---|
| 48 | m_ptrName = new char[n]; |
|---|
| 49 | strcpy(m_ptrName, szLine + 1); |
|---|
| 50 | |
|---|
| 51 | TEXTFILEPOS Pos = File.GetPos(); |
|---|
| 52 | for (;;) |
|---|
| 53 | { |
|---|
| 54 | bEof = File.GetLine(szLine, sizeof(szLine)); |
|---|
| 55 | if (bEof) |
|---|
| 56 | { |
|---|
| 57 | if (0 == size()) |
|---|
| 58 | { |
|---|
| 59 | Quit("Empty sequence in FASTA file %s line %u", |
|---|
| 60 | File.GetFileName(), File.GetLineNr()); |
|---|
| 61 | return true; |
|---|
| 62 | } |
|---|
| 63 | return false; |
|---|
| 64 | } |
|---|
| 65 | if ('>' == szLine[0]) |
|---|
| 66 | { |
|---|
| 67 | if (0 == size()) |
|---|
| 68 | Quit("Empty sequence in FASTA file %s line %u", |
|---|
| 69 | File.GetFileName(), File.GetLineNr()); |
|---|
| 70 | // Rewind to beginning of this line, it's the start of the |
|---|
| 71 | // next sequence. |
|---|
| 72 | File.SetPos(Pos); |
|---|
| 73 | return false; |
|---|
| 74 | } |
|---|
| 75 | const char *ptrChar = szLine; |
|---|
| 76 | while (char c = *ptrChar++) |
|---|
| 77 | { |
|---|
| 78 | if (isspace(c)) |
|---|
| 79 | continue; |
|---|
| 80 | if (IsGapChar(c)) |
|---|
| 81 | continue; |
|---|
| 82 | if (!IsResidueChar(c)) |
|---|
| 83 | { |
|---|
| 84 | if (isprint(c)) |
|---|
| 85 | { |
|---|
| 86 | char w = GetWildcardChar(); |
|---|
| 87 | Warning("Invalid residue '%c' in FASTA file %s line %d, replaced by '%c'", |
|---|
| 88 | c, File.GetFileName(), File.GetLineNr(), w); |
|---|
| 89 | c = w; |
|---|
| 90 | } |
|---|
| 91 | else |
|---|
| 92 | Quit("Invalid byte hex %02x in FASTA file %s line %d", |
|---|
| 93 | (unsigned char) c, File.GetFileName(), File.GetLineNr()); |
|---|
| 94 | } |
|---|
| 95 | c = toupper(c); |
|---|
| 96 | push_back(c); |
|---|
| 97 | } |
|---|
| 98 | Pos = File.GetPos(); |
|---|
| 99 | } |
|---|
| 100 | } |
|---|
| 101 | |
|---|
| 102 | void Seq::ExtractUngapped(MSA &msa) const |
|---|
| 103 | { |
|---|
| 104 | msa.Clear(); |
|---|
| 105 | unsigned uColCount = Length(); |
|---|
| 106 | msa.SetSize(1, 1); |
|---|
| 107 | unsigned uUngappedPos = 0; |
|---|
| 108 | for (unsigned n = 0; n < uColCount; ++n) |
|---|
| 109 | { |
|---|
| 110 | char c = at(n); |
|---|
| 111 | if (!IsGapChar(c)) |
|---|
| 112 | msa.SetChar(0, uUngappedPos++, c); |
|---|
| 113 | } |
|---|
| 114 | msa.SetSeqName(0, m_ptrName); |
|---|
| 115 | } |
|---|
| 116 | |
|---|
| 117 | void Seq::Copy(const Seq &rhs) |
|---|
| 118 | { |
|---|
| 119 | clear(); |
|---|
| 120 | const unsigned uLength = rhs.Length(); |
|---|
| 121 | for (unsigned uColIndex = 0; uColIndex < uLength; ++uColIndex) |
|---|
| 122 | push_back(rhs.at(uColIndex)); |
|---|
| 123 | const char *ptrName = rhs.GetName(); |
|---|
| 124 | size_t n = strlen(ptrName) + 1; |
|---|
| 125 | m_ptrName = new char[n]; |
|---|
| 126 | strcpy(m_ptrName, ptrName); |
|---|
| 127 | SetId(rhs.GetId()); |
|---|
| 128 | } |
|---|
| 129 | |
|---|
| 130 | void Seq::CopyReversed(const Seq &rhs) |
|---|
| 131 | { |
|---|
| 132 | clear(); |
|---|
| 133 | const unsigned uLength = rhs.Length(); |
|---|
| 134 | const unsigned uBase = rhs.Length() - 1; |
|---|
| 135 | for (unsigned uColIndex = 0; uColIndex < uLength; ++uColIndex) |
|---|
| 136 | push_back(rhs.at(uBase - uColIndex)); |
|---|
| 137 | const char *ptrName = rhs.GetName(); |
|---|
| 138 | size_t n = strlen(ptrName) + 1; |
|---|
| 139 | m_ptrName = new char[n]; |
|---|
| 140 | strcpy(m_ptrName, ptrName); |
|---|
| 141 | } |
|---|
| 142 | |
|---|
| 143 | void Seq::StripGaps() |
|---|
| 144 | { |
|---|
| 145 | for (CharVect::iterator p = begin(); p != end(); ) |
|---|
| 146 | { |
|---|
| 147 | char c = *p; |
|---|
| 148 | if (IsGapChar(c)) |
|---|
| 149 | erase(p); |
|---|
| 150 | else |
|---|
| 151 | ++p; |
|---|
| 152 | } |
|---|
| 153 | } |
|---|
| 154 | |
|---|
| 155 | void Seq::StripGapsAndWhitespace() |
|---|
| 156 | { |
|---|
| 157 | for (CharVect::iterator p = begin(); p != end(); ) |
|---|
| 158 | { |
|---|
| 159 | char c = *p; |
|---|
| 160 | if (isspace(c) || IsGapChar(c)) |
|---|
| 161 | erase(p); |
|---|
| 162 | else |
|---|
| 163 | ++p; |
|---|
| 164 | } |
|---|
| 165 | } |
|---|
| 166 | |
|---|
| 167 | void Seq::ToUpper() |
|---|
| 168 | { |
|---|
| 169 | for (CharVect::iterator p = begin(); p != end(); ++p) |
|---|
| 170 | { |
|---|
| 171 | char c = *p; |
|---|
| 172 | if (islower(c)) |
|---|
| 173 | *p = toupper(c); |
|---|
| 174 | } |
|---|
| 175 | } |
|---|
| 176 | |
|---|
| 177 | unsigned Seq::GetLetter(unsigned uIndex) const |
|---|
| 178 | { |
|---|
| 179 | assert(uIndex < Length()); |
|---|
| 180 | char c = operator[](uIndex); |
|---|
| 181 | return CharToLetter(c); |
|---|
| 182 | } |
|---|
| 183 | |
|---|
| 184 | bool Seq::EqIgnoreCase(const Seq &s) const |
|---|
| 185 | { |
|---|
| 186 | const unsigned n = Length(); |
|---|
| 187 | if (n != s.Length()) |
|---|
| 188 | return false; |
|---|
| 189 | for (unsigned i = 0; i < n; ++i) |
|---|
| 190 | { |
|---|
| 191 | const char c1 = at(i); |
|---|
| 192 | const char c2 = s.at(i); |
|---|
| 193 | if (IsGapChar(c1)) |
|---|
| 194 | { |
|---|
| 195 | if (!IsGapChar(c2)) |
|---|
| 196 | return false; |
|---|
| 197 | } |
|---|
| 198 | else |
|---|
| 199 | { |
|---|
| 200 | if (toupper(c1) != toupper(c2)) |
|---|
| 201 | return false; |
|---|
| 202 | } |
|---|
| 203 | } |
|---|
| 204 | return true; |
|---|
| 205 | } |
|---|
| 206 | |
|---|
| 207 | bool Seq::Eq(const Seq &s) const |
|---|
| 208 | { |
|---|
| 209 | const unsigned n = Length(); |
|---|
| 210 | if (n != s.Length()) |
|---|
| 211 | return false; |
|---|
| 212 | for (unsigned i = 0; i < n; ++i) |
|---|
| 213 | { |
|---|
| 214 | const char c1 = at(i); |
|---|
| 215 | const char c2 = s.at(i); |
|---|
| 216 | if (c1 != c2) |
|---|
| 217 | return false; |
|---|
| 218 | } |
|---|
| 219 | return true; |
|---|
| 220 | } |
|---|
| 221 | |
|---|
| 222 | bool Seq::EqIgnoreCaseAndGaps(const Seq &s) const |
|---|
| 223 | { |
|---|
| 224 | const unsigned uThisLength = Length(); |
|---|
| 225 | const unsigned uOtherLength = s.Length(); |
|---|
| 226 | |
|---|
| 227 | unsigned uThisPos = 0; |
|---|
| 228 | unsigned uOtherPos = 0; |
|---|
| 229 | |
|---|
| 230 | int cThis; |
|---|
| 231 | int cOther; |
|---|
| 232 | for (;;) |
|---|
| 233 | { |
|---|
| 234 | if (uThisPos == uThisLength && uOtherPos == uOtherLength) |
|---|
| 235 | break; |
|---|
| 236 | |
|---|
| 237 | // Set cThis to next non-gap character in this string |
|---|
| 238 | // or -1 if end-of-string. |
|---|
| 239 | for (;;) |
|---|
| 240 | { |
|---|
| 241 | if (uThisPos == uThisLength) |
|---|
| 242 | { |
|---|
| 243 | cThis = -1; |
|---|
| 244 | break; |
|---|
| 245 | } |
|---|
| 246 | else |
|---|
| 247 | { |
|---|
| 248 | cThis = at(uThisPos); |
|---|
| 249 | ++uThisPos; |
|---|
| 250 | if (!IsGapChar(cThis)) |
|---|
| 251 | { |
|---|
| 252 | cThis = toupper(cThis); |
|---|
| 253 | break; |
|---|
| 254 | } |
|---|
| 255 | } |
|---|
| 256 | } |
|---|
| 257 | |
|---|
| 258 | // Set cOther to next non-gap character in s |
|---|
| 259 | // or -1 if end-of-string. |
|---|
| 260 | for (;;) |
|---|
| 261 | { |
|---|
| 262 | if (uOtherPos == uOtherLength) |
|---|
| 263 | { |
|---|
| 264 | cOther = -1; |
|---|
| 265 | break; |
|---|
| 266 | } |
|---|
| 267 | else |
|---|
| 268 | { |
|---|
| 269 | cOther = s.at(uOtherPos); |
|---|
| 270 | ++uOtherPos; |
|---|
| 271 | if (!IsGapChar(cOther)) |
|---|
| 272 | { |
|---|
| 273 | cOther = toupper(cOther); |
|---|
| 274 | break; |
|---|
| 275 | } |
|---|
| 276 | } |
|---|
| 277 | } |
|---|
| 278 | |
|---|
| 279 | // Compare characters are corresponding ungapped position |
|---|
| 280 | if (cThis != cOther) |
|---|
| 281 | return false; |
|---|
| 282 | } |
|---|
| 283 | return true; |
|---|
| 284 | } |
|---|
| 285 | |
|---|
| 286 | unsigned Seq::GetUngappedLength() const |
|---|
| 287 | { |
|---|
| 288 | unsigned uUngappedLength = 0; |
|---|
| 289 | for (CharVect::const_iterator p = begin(); p != end(); ++p) |
|---|
| 290 | { |
|---|
| 291 | char c = *p; |
|---|
| 292 | if (!IsGapChar(c)) |
|---|
| 293 | ++uUngappedLength; |
|---|
| 294 | } |
|---|
| 295 | return uUngappedLength; |
|---|
| 296 | } |
|---|
| 297 | |
|---|
| 298 | void Seq::LogMe() const |
|---|
| 299 | { |
|---|
| 300 | Log(">%s\n", m_ptrName); |
|---|
| 301 | const unsigned n = Length(); |
|---|
| 302 | for (unsigned i = 0; i < n; ++i) |
|---|
| 303 | Log("%c", at(i)); |
|---|
| 304 | Log("\n"); |
|---|
| 305 | } |
|---|
| 306 | |
|---|
| 307 | void Seq::FromString(const char *pstrSeq, const char *pstrName) |
|---|
| 308 | { |
|---|
| 309 | clear(); |
|---|
| 310 | const unsigned uLength = (unsigned) strlen(pstrSeq); |
|---|
| 311 | for (unsigned uColIndex = 0; uColIndex < uLength; ++uColIndex) |
|---|
| 312 | push_back(pstrSeq[uColIndex]); |
|---|
| 313 | size_t n = strlen(pstrName) + 1; |
|---|
| 314 | m_ptrName = new char[n]; |
|---|
| 315 | strcpy(m_ptrName, pstrName); |
|---|
| 316 | } |
|---|
| 317 | |
|---|
| 318 | bool Seq::HasGap() const |
|---|
| 319 | { |
|---|
| 320 | for (CharVect::const_iterator p = begin(); p != end(); ++p) |
|---|
| 321 | { |
|---|
| 322 | char c = *p; |
|---|
| 323 | if (IsGapChar(c)) |
|---|
| 324 | return true; |
|---|
| 325 | } |
|---|
| 326 | return false; |
|---|
| 327 | } |
|---|
| 328 | |
|---|
| 329 | void Seq::FixAlpha() |
|---|
| 330 | { |
|---|
| 331 | for (CharVect::iterator p = begin(); p != end(); ++p) |
|---|
| 332 | { |
|---|
| 333 | char c = *p; |
|---|
| 334 | if (!IsResidueChar(c)) |
|---|
| 335 | { |
|---|
| 336 | char w = GetWildcardChar(); |
|---|
| 337 | // Warning("Invalid residue '%c', replaced by '%c'", c, w); |
|---|
| 338 | InvalidLetterWarning(c, w); |
|---|
| 339 | *p = w; |
|---|
| 340 | } |
|---|
| 341 | } |
|---|
| 342 | } |
|---|