1 | // ================================================================ // |
---|
2 | // // |
---|
3 | // File : iupac.cxx // |
---|
4 | // Purpose : // |
---|
5 | // // |
---|
6 | // Institute of Microbiology (Technical University Munich) // |
---|
7 | // http://www.arb-home.de/ // |
---|
8 | // // |
---|
9 | // ================================================================ // |
---|
10 | |
---|
11 | #include "iupac.h" |
---|
12 | #include <cctype> |
---|
13 | |
---|
14 | #define awt_assert(bed) arb_assert(bed) |
---|
15 | |
---|
16 | #define IUPAC_EMPTY "" // changed from " " to "" (2009-03-19 -- ralf) |
---|
17 | #define ILL_CODE char(26) |
---|
18 | |
---|
19 | using namespace iupac; |
---|
20 | |
---|
21 | namespace iupac { |
---|
22 | #define _____________ {NULp,0} |
---|
23 | |
---|
24 | const Nuc_Group nuc_group[26][2] = { |
---|
25 | { { "A", 1 }, { "A", 1 } }, // A |
---|
26 | { { "CGT", 3 }, { "CGU", 3 } }, // B |
---|
27 | { { "C", 1 }, { "C", 1 } }, // C |
---|
28 | { { "AGT", 3 }, { "AGU", 3 } }, // D |
---|
29 | { _____________, _____________ }, // E |
---|
30 | { _____________, _____________ }, // F |
---|
31 | { { "G", 1 }, { "G", 1 } }, // G |
---|
32 | { { "ACT", 3 }, { "ACU", 3 } }, // H |
---|
33 | { _____________, _____________ }, // I |
---|
34 | { _____________, _____________ }, // J |
---|
35 | { { "GT", 2 }, { "GU", 2 } }, // K |
---|
36 | { _____________, _____________ }, // L |
---|
37 | { { "AC", 2 }, { "AC", 2 } }, // M |
---|
38 | { { "ACGT", 1 }, { "ACGU", 1 } }, // N |
---|
39 | { _____________, _____________ }, // O |
---|
40 | { _____________, _____________ }, // P |
---|
41 | { _____________, _____________ }, // Q |
---|
42 | { { "AG", 2 }, { "AG", 2 } }, // R |
---|
43 | { { "CG", 2 }, { "CG", 2 } }, // S |
---|
44 | { { "T", 1 }, { "U", 1 } }, // T |
---|
45 | { { "T", 1 }, { "U", 1 } }, // U |
---|
46 | { { "ACG", 3 }, { "ACG", 3 } }, // V |
---|
47 | { { "AT", 2 }, { "AU", 2 } }, // W |
---|
48 | { _____________, _____________ }, // X |
---|
49 | { { "CT", 2 }, { "CU", 2 } }, // Y |
---|
50 | { _____________, _____________ }, // Z |
---|
51 | |
---|
52 | // (In each single string the nucs have to be sorted alphabetically) |
---|
53 | }; |
---|
54 | |
---|
55 | #undef _____________ |
---|
56 | |
---|
57 | static const char *aminoGroupMembers[AA_GROUP_COUNT] = { |
---|
58 | // Note: first character has to be the "representative character" of the group |
---|
59 | "XOU", // AA_GROUP_NONE |
---|
60 | "AGPST", // AA_GROUP_ALPHA |
---|
61 | "DBENQZ", // AA_GROUP_BETA |
---|
62 | "HKR", // AA_GROUP_GAMMA |
---|
63 | "IJLMV", // AA_GROUP_DELTA |
---|
64 | "FWY", // AA_GROUP_EPSILON |
---|
65 | "C", // AA_GROUP_ZETA |
---|
66 | }; |
---|
67 | |
---|
68 | static Amino_Group amino_group[26]; |
---|
69 | |
---|
70 | class Setup { // setup static data in this module |
---|
71 | void setup_amino_group() { |
---|
72 | for (int c = 0; c<26; ++c) amino_group[c] = AA_GROUP_ILLEGAL; |
---|
73 | for (Amino_Group g = AA_GROUP_NONE; g<AA_GROUP_COUNT; g = Amino_Group(g+1)) { |
---|
74 | const char *members = aminoGroupMembers[g]; |
---|
75 | for (int p = 0; members[p]; ++p) { |
---|
76 | int c = members[p]; |
---|
77 | awt_assert(c >= 'A' && c <= 'Z'); |
---|
78 | amino_group[c-'A'] = g; |
---|
79 | } |
---|
80 | } |
---|
81 | } |
---|
82 | public: |
---|
83 | Setup() { setup_amino_group(); } |
---|
84 | }; |
---|
85 | static Setup perform; |
---|
86 | |
---|
87 | }; |
---|
88 | |
---|
89 | Amino_Group iupac::get_amino_group_for(char aa) { |
---|
90 | aa = toupper(aa); |
---|
91 | if (aa<'A' || aa>'Z') { return AA_GROUP_ILLEGAL; } |
---|
92 | return amino_group[aa-'A']; |
---|
93 | } |
---|
94 | |
---|
95 | char iupac::get_amino_consensus_char(Amino_Group ag) { |
---|
96 | if (ag>AA_GROUP_NONE && ag<AA_GROUP_ILLEGAL) { |
---|
97 | return aminoGroupMembers[ag][0]; |
---|
98 | } |
---|
99 | return '?'; |
---|
100 | } |
---|
101 | |
---|
102 | static char IUPAC_add[26][26]; // uses T |
---|
103 | static int IUPAC_add_initialized = 0; |
---|
104 | |
---|
105 | static void initialize_IUPAC_add() { |
---|
106 | int c1, c2; |
---|
107 | |
---|
108 | for (c1=0; c1<26; c1++) { |
---|
109 | const char *decoded1 = nuc_group[c1][0].members; |
---|
110 | |
---|
111 | if (!decoded1) { |
---|
112 | for (c2=0; c2<=c1; c2++) { |
---|
113 | IUPAC_add[c1][c2] = ILL_CODE; |
---|
114 | IUPAC_add[c2][c1] = ILL_CODE; |
---|
115 | } |
---|
116 | } |
---|
117 | else { |
---|
118 | IUPAC_add[c1][c1] = char(c1); // add char to same char |
---|
119 | |
---|
120 | for (c2=0; c2<c1; c2++) { |
---|
121 | if (strchr(decoded1, 'A'+c2)) { // char is already contained in this IUPAC |
---|
122 | IUPAC_add[c1][c2] = char(c1); |
---|
123 | } |
---|
124 | else { |
---|
125 | const char *decoded2 = nuc_group[c2][0].members; |
---|
126 | |
---|
127 | if (!decoded2) { // char is illegal |
---|
128 | IUPAC_add[c1][c2] = ILL_CODE; |
---|
129 | } |
---|
130 | else { |
---|
131 | #define MAX_MIXED 5 |
---|
132 | char mixed[MAX_MIXED]; |
---|
133 | char *mp = mixed; |
---|
134 | const char *d1 = decoded1; |
---|
135 | const char *d2 = decoded2; |
---|
136 | |
---|
137 | while (1) { |
---|
138 | char z1 = *d1; |
---|
139 | char z2 = *d2; |
---|
140 | |
---|
141 | if (!z1 && !z2) break; |
---|
142 | |
---|
143 | if (z1==z2) { |
---|
144 | *mp++ = z1; |
---|
145 | d1++; |
---|
146 | d2++; |
---|
147 | } |
---|
148 | else if (!z2 || (z1 && z1<z2)) { |
---|
149 | *mp++ = z1; |
---|
150 | d1++; |
---|
151 | } |
---|
152 | else { |
---|
153 | awt_assert(!z1 || (z2 && z2<z1)); |
---|
154 | *mp++ = z2; |
---|
155 | d2++; |
---|
156 | } |
---|
157 | } |
---|
158 | |
---|
159 | awt_assert((mp-mixed)<MAX_MIXED); |
---|
160 | *mp++ = 0; |
---|
161 | |
---|
162 | #if !defined(NDEBUG) && 0 |
---|
163 | printf("Mix '%s' + '%s' = '%s'\n", decoded1, decoded2, mixed); |
---|
164 | #endif |
---|
165 | |
---|
166 | int c3; |
---|
167 | |
---|
168 | for (c3=0; c3<26; c3++) { |
---|
169 | if (nuc_group[c3][0].members && strcmp(mixed, nuc_group[c3][0].members)==0) { |
---|
170 | IUPAC_add[c1][c2] = char(c3); |
---|
171 | break; |
---|
172 | } |
---|
173 | } |
---|
174 | |
---|
175 | if (c3>=26) { |
---|
176 | IUPAC_add[c1][c2] = 0; |
---|
177 | } |
---|
178 | } |
---|
179 | } |
---|
180 | |
---|
181 | if (IUPAC_add[c1][c2]==('U'-'A')) { |
---|
182 | IUPAC_add[c1][c2] = 'T'-'A'; |
---|
183 | } |
---|
184 | |
---|
185 | IUPAC_add[c2][c1] = IUPAC_add[c1][c2]; |
---|
186 | } |
---|
187 | } |
---|
188 | |
---|
189 | } |
---|
190 | } |
---|
191 | |
---|
192 | char iupac::encode(const char bases[], GB_alignment_type ali) { |
---|
193 | if (!IUPAC_add_initialized) { |
---|
194 | initialize_IUPAC_add(); |
---|
195 | IUPAC_add_initialized = 1; |
---|
196 | } |
---|
197 | |
---|
198 | if (ali==GB_AT_AA) { |
---|
199 | return '?'; |
---|
200 | } |
---|
201 | |
---|
202 | int i = 0; |
---|
203 | unsigned char c1; |
---|
204 | |
---|
205 | while (1) { |
---|
206 | c1 = bases[i++]; |
---|
207 | if (!c1) return '-'; |
---|
208 | if (isalpha(c1)) break; |
---|
209 | } |
---|
210 | |
---|
211 | c1 = toupper(c1)-'A'; |
---|
212 | unsigned char c = IUPAC_add[c1][c1]; |
---|
213 | |
---|
214 | while (c!=ILL_CODE && bases[i]) { |
---|
215 | if (isalpha(bases[i])) { |
---|
216 | int c2 = toupper(bases[i])-'A'; |
---|
217 | c = IUPAC_add[c][c2]; |
---|
218 | } |
---|
219 | i++; |
---|
220 | } |
---|
221 | |
---|
222 | if (c==ILL_CODE) { |
---|
223 | return '-'; |
---|
224 | } |
---|
225 | |
---|
226 | c += 'A'; |
---|
227 | if (c=='T' && ali==GB_AT_RNA) c = 'U'; |
---|
228 | return c; |
---|
229 | } |
---|
230 | |
---|
231 | char iupac::combine(char c1, char c2, GB_alignment_type ali) { |
---|
232 | char buffer[3]; |
---|
233 | buffer[0] = c1; |
---|
234 | buffer[1] = c2; |
---|
235 | buffer[2] = 0; |
---|
236 | return iupac::encode(buffer, ali); |
---|
237 | } |
---|
238 | |
---|
239 | const char* iupac::decode(char iupac, GB_alignment_type ali, bool decode_amino_iupac_groups) { |
---|
240 | if (!isalpha(iupac)) { |
---|
241 | return IUPAC_EMPTY; |
---|
242 | } |
---|
243 | |
---|
244 | if (ali==GB_AT_AA) { |
---|
245 | if (decode_amino_iupac_groups) { |
---|
246 | Amino_Group group = get_amino_group_for(iupac); |
---|
247 | if (group == AA_GROUP_NONE) { |
---|
248 | return ""; |
---|
249 | } |
---|
250 | else { |
---|
251 | awt_assert(group >= AA_GROUP_NONE && group<AA_GROUP_COUNT); |
---|
252 | return aminoGroupMembers[group]; |
---|
253 | } |
---|
254 | } |
---|
255 | return "?"; |
---|
256 | } |
---|
257 | |
---|
258 | const char *decoded = nuc_group[toupper(iupac)-'A'][ali==GB_AT_RNA ? 1 : 0].members; |
---|
259 | |
---|
260 | return decoded ? decoded : IUPAC_EMPTY; |
---|
261 | } |
---|
262 | |
---|
263 | // -------------------------------------------------------------------------------- |
---|
264 | |
---|
265 | #ifdef UNIT_TESTS |
---|
266 | |
---|
267 | #include <test_unit.h> |
---|
268 | |
---|
269 | void TEST_amino_groups() { |
---|
270 | TEST_EXPECT_EQUAL(decode('x', GB_AT_AA, true), ""); |
---|
271 | TEST_EXPECT_EQUAL(decode('A', GB_AT_AA, true), "AGPST"); |
---|
272 | |
---|
273 | // each character (A-Z) shall be member of a group: |
---|
274 | for (char c = 'A'; c <= 'Z'; ++c) { |
---|
275 | Amino_Group group = get_amino_group_for(c); |
---|
276 | TEST_EXPECT_LESS(group, AA_GROUP_COUNT); |
---|
277 | TEST_EXPECT_DIFFERENT(group, AA_GROUP_ILLEGAL); |
---|
278 | TEST_EXPECT_IN_RANGE(group, AA_GROUP_NONE, AA_GROUP_ZETA); |
---|
279 | |
---|
280 | { |
---|
281 | const char * const members = aminoGroupMembers[group]; |
---|
282 | const char * const found = strchr(members, c); |
---|
283 | TEST_EXPECT_EQUAL(found ? found[0] : 0, c); // char has to be a member of its group |
---|
284 | } |
---|
285 | |
---|
286 | if (group != AA_GROUP_NONE) { |
---|
287 | const char * const decoded = decode(c, GB_AT_AA, true); |
---|
288 | const char * const found = strchr(decoded, c); |
---|
289 | |
---|
290 | TEST_EXPECT_EQUAL(found ? found[0] : 0, c); // check char is |
---|
291 | } |
---|
292 | } |
---|
293 | |
---|
294 | // no character may be member of two groups |
---|
295 | for (Amino_Group group = AA_GROUP_NONE; group < AA_GROUP_COUNT; group = Amino_Group(group+1)) { |
---|
296 | const char * const member = aminoGroupMembers[group]; |
---|
297 | for (int pos = 0; member[pos]; ++pos) { |
---|
298 | Amino_Group groupOfChar = get_amino_group_for(member[pos]); |
---|
299 | TEST_EXPECT_EQUAL(groupOfChar, group); |
---|
300 | } |
---|
301 | } |
---|
302 | } |
---|
303 | |
---|
304 | void TEST_nuc_groups() { |
---|
305 | for (int base = 0; base<26; base++) { |
---|
306 | TEST_EXPECT_EQUAL(nuc_group[base][0].count, nuc_group[base][1].count); |
---|
307 | for (int alitype = 0; alitype<2; alitype++) { |
---|
308 | const Nuc_Group& group = nuc_group[base][alitype]; |
---|
309 | if (group.members) { |
---|
310 | if ((base+'A') == 'N') { |
---|
311 | TEST_EXPECT_EQUAL__BROKEN(group.count, strlen(group.members), 1); |
---|
312 | // @@@ fails because count is 1 for "ACGT" [N] |
---|
313 | // maybe be expected by resolve_IUPAC_target_string (fix this first) |
---|
314 | TEST_EXPECT_EQUAL(group.count, 1U); // fixture for behavior |
---|
315 | } |
---|
316 | else { |
---|
317 | TEST_EXPECT_EQUAL(group.count, strlen(group.members)); |
---|
318 | } |
---|
319 | |
---|
320 | for (size_t pos = 1; pos<group.count; ++pos) { |
---|
321 | TEST_EXPECT_LESS(group.members[pos-1], group.members[pos]); |
---|
322 | } |
---|
323 | } |
---|
324 | else { |
---|
325 | TEST_REJECT(group.count); |
---|
326 | } |
---|
327 | } |
---|
328 | } |
---|
329 | } |
---|
330 | |
---|
331 | #endif // UNIT_TESTS |
---|