| 1 | #include "muscle.h" |
|---|
| 2 | #include <stdio.h> |
|---|
| 3 | #include <ctype.h> |
|---|
| 4 | #include "msa.h" |
|---|
| 5 | #include "textfile.h" |
|---|
| 6 | |
|---|
| 7 | const unsigned uCharsPerLine = 60; |
|---|
| 8 | const int MIN_NAME = 10; |
|---|
| 9 | const int MAX_NAME = 32; |
|---|
| 10 | |
|---|
| 11 | static char GetAlnConsensusChar(const MSA &a, unsigned uColIndex); |
|---|
| 12 | |
|---|
| 13 | void MSA::ToAlnFile(TextFile &File) const |
|---|
| 14 | { |
|---|
| 15 | if (g_bClwStrict) |
|---|
| 16 | File.PutString("CLUSTAL W (1.81) multiple sequence alignment\n"); |
|---|
| 17 | else |
|---|
| 18 | { |
|---|
| 19 | File.PutString("MUSCLE (" |
|---|
| 20 | SHORT_VERSION ")" |
|---|
| 21 | " multiple sequence alignment\n"); |
|---|
| 22 | File.PutString("\n"); |
|---|
| 23 | } |
|---|
| 24 | |
|---|
| 25 | int iLongestNameLength = 0; |
|---|
| 26 | for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) |
|---|
| 27 | { |
|---|
| 28 | const char *ptrName = GetSeqName(uSeqIndex); |
|---|
| 29 | const char *ptrBlank = strchr(ptrName, ' '); |
|---|
| 30 | int iLength; |
|---|
| 31 | if (0 != ptrBlank) |
|---|
| 32 | iLength = (int) (ptrBlank - ptrName); |
|---|
| 33 | else |
|---|
| 34 | iLength = (int) strlen(ptrName); |
|---|
| 35 | if (iLength > iLongestNameLength) |
|---|
| 36 | iLongestNameLength = iLength; |
|---|
| 37 | } |
|---|
| 38 | if (iLongestNameLength > MAX_NAME) |
|---|
| 39 | iLongestNameLength = MAX_NAME; |
|---|
| 40 | if (iLongestNameLength < MIN_NAME) |
|---|
| 41 | iLongestNameLength = MIN_NAME; |
|---|
| 42 | |
|---|
| 43 | unsigned uLineCount = (GetColCount() - 1)/uCharsPerLine + 1; |
|---|
| 44 | for (unsigned uLineIndex = 0; uLineIndex < uLineCount; ++uLineIndex) |
|---|
| 45 | { |
|---|
| 46 | File.PutString("\n"); |
|---|
| 47 | unsigned uStartColIndex = uLineIndex*uCharsPerLine; |
|---|
| 48 | unsigned uEndColIndex = uStartColIndex + uCharsPerLine - 1; |
|---|
| 49 | if (uEndColIndex >= GetColCount()) |
|---|
| 50 | uEndColIndex = GetColCount() - 1; |
|---|
| 51 | char Name[MAX_NAME+1]; |
|---|
| 52 | for (unsigned uSeqIndex = 0; uSeqIndex < GetSeqCount(); ++uSeqIndex) |
|---|
| 53 | { |
|---|
| 54 | const char *ptrName = GetSeqName(uSeqIndex); |
|---|
| 55 | const char *ptrBlank = strchr(ptrName, ' '); |
|---|
| 56 | int iLength; |
|---|
| 57 | if (0 != ptrBlank) |
|---|
| 58 | iLength = (int) (ptrBlank - ptrName); |
|---|
| 59 | else |
|---|
| 60 | iLength = (int) strlen(ptrName); |
|---|
| 61 | if (iLength > MAX_NAME) |
|---|
| 62 | iLength = MAX_NAME; |
|---|
| 63 | memset(Name, ' ', MAX_NAME); |
|---|
| 64 | memcpy(Name, ptrName, iLength); |
|---|
| 65 | Name[iLongestNameLength] = 0; |
|---|
| 66 | |
|---|
| 67 | File.PutFormat("%s ", Name); |
|---|
| 68 | for (unsigned uColIndex = uStartColIndex; uColIndex <= uEndColIndex; |
|---|
| 69 | ++uColIndex) |
|---|
| 70 | { |
|---|
| 71 | const char c = GetChar(uSeqIndex, uColIndex); |
|---|
| 72 | File.PutFormat("%c", toupper(c)); |
|---|
| 73 | } |
|---|
| 74 | File.PutString("\n"); |
|---|
| 75 | } |
|---|
| 76 | |
|---|
| 77 | memset(Name, ' ', MAX_NAME); |
|---|
| 78 | Name[iLongestNameLength] = 0; |
|---|
| 79 | File.PutFormat("%s ", Name); |
|---|
| 80 | for (unsigned uColIndex = uStartColIndex; uColIndex <= uEndColIndex; |
|---|
| 81 | ++uColIndex) |
|---|
| 82 | { |
|---|
| 83 | const char c = GetAlnConsensusChar(*this, uColIndex); |
|---|
| 84 | File.PutChar(c); |
|---|
| 85 | } |
|---|
| 86 | File.PutString("\n"); |
|---|
| 87 | } |
|---|
| 88 | } |
|---|
| 89 | |
|---|
| 90 | static char GetAlnConsensusChar(const MSA &a, unsigned uColIndex) |
|---|
| 91 | { |
|---|
| 92 | const unsigned uSeqCount = a.GetSeqCount(); |
|---|
| 93 | unsigned BitMap = 0; |
|---|
| 94 | unsigned Count = 0; |
|---|
| 95 | for (unsigned uSeqIndex = 0; uSeqIndex < uSeqCount; ++uSeqIndex) |
|---|
| 96 | { |
|---|
| 97 | unsigned uLetter = a.GetLetterEx(uSeqIndex, uColIndex); |
|---|
| 98 | assert(uLetter < 32); |
|---|
| 99 | unsigned Bit = (1 << uLetter); |
|---|
| 100 | if (!(BitMap & Bit)) |
|---|
| 101 | ++Count; |
|---|
| 102 | BitMap |= Bit; |
|---|
| 103 | } |
|---|
| 104 | |
|---|
| 105 | // '*' indicates positions which have a single, fully conserved residue |
|---|
| 106 | if (1 == Count) |
|---|
| 107 | return '*'; |
|---|
| 108 | |
|---|
| 109 | if (ALPHA_Amino != g_Alpha) |
|---|
| 110 | return ' '; |
|---|
| 111 | |
|---|
| 112 | #define B(a) (1 << AX_##a) |
|---|
| 113 | #define S2(a, b) S(B(a) | B(b)) |
|---|
| 114 | #define S3(a, b, c) S(B(a) | B(b) | B(c)) |
|---|
| 115 | #define S4(a, b, c, d) S(B(a) | B(b) | B(c) | B(d)) |
|---|
| 116 | #define S(w) if (0 == (BitMap & ~(w)) && (BitMap & (w)) != 0) return ':'; |
|---|
| 117 | |
|---|
| 118 | #define W3(a, b, c) W(B(a) | B(b) | B(c)) |
|---|
| 119 | #define W4(a, b, c, d) W(B(a) | B(b) | B(c) | B(d)) |
|---|
| 120 | #define W5(a, b, c, d, e) W(B(a) | B(b) | B(c) | B(d) | B(e)) |
|---|
| 121 | #define W6(a, b, c, d, e, f) W(B(a) | B(b) | B(c) | B(d) | B(e) | B(f)) |
|---|
| 122 | #define W(w) if (0 == (BitMap & ~(w)) && (BitMap & (w)) != 0) return '.'; |
|---|
| 123 | |
|---|
| 124 | // ':' indicates that one of the following 'strong' |
|---|
| 125 | // groups is fully conserved |
|---|
| 126 | // STA |
|---|
| 127 | // NEQK |
|---|
| 128 | // NHQK |
|---|
| 129 | // NDEQ |
|---|
| 130 | // QHRK |
|---|
| 131 | // MILV |
|---|
| 132 | // MILF |
|---|
| 133 | // HY |
|---|
| 134 | // FYW |
|---|
| 135 | // |
|---|
| 136 | S3(S, T, A) |
|---|
| 137 | S4(N, E, Q, K) |
|---|
| 138 | S4(N, H, Q, K) |
|---|
| 139 | S4(N, D, E, Q) |
|---|
| 140 | S4(M, I, L, V) |
|---|
| 141 | S4(M, I, L, F) |
|---|
| 142 | S2(H, Y) |
|---|
| 143 | S3(F, Y, W) |
|---|
| 144 | |
|---|
| 145 | // '.' indicates that one of the following 'weaker' |
|---|
| 146 | // groups is fully conserved |
|---|
| 147 | // CSA |
|---|
| 148 | // ATV |
|---|
| 149 | // SAG |
|---|
| 150 | // STNK |
|---|
| 151 | // STPA |
|---|
| 152 | // SGND |
|---|
| 153 | // SNDEQK |
|---|
| 154 | // NDEQHK |
|---|
| 155 | // NEQHRK |
|---|
| 156 | // FVLIM |
|---|
| 157 | // HFY |
|---|
| 158 | W3(C, S, A) |
|---|
| 159 | W3(A, T, V) |
|---|
| 160 | W3(S, A, G) |
|---|
| 161 | W4(S, T, N, K) |
|---|
| 162 | W4(S, T, P, A) |
|---|
| 163 | W4(S, G, N, D) |
|---|
| 164 | W6(S, N, D, E, Q, K) |
|---|
| 165 | W6(N, W, Q, H, R, K) |
|---|
| 166 | W5(F, V, L, I, M) |
|---|
| 167 | W3(H, F, Y) |
|---|
| 168 | |
|---|
| 169 | return ' '; |
|---|
| 170 | } |
|---|