1 | // ==================================================================== // |
---|
2 | // // |
---|
3 | // File : probe_match_parser.cxx // |
---|
4 | // Purpose : parse the results of a probe match // |
---|
5 | // // |
---|
6 | // // |
---|
7 | // Coded by Ralf Westram (coder@reallysoft.de) in June 2004 // |
---|
8 | // Copyright Department of Microbiology (Technical University Munich) // |
---|
9 | // // |
---|
10 | // Visit our web site at: http://www.arb-home.de/ // |
---|
11 | // // |
---|
12 | // ==================================================================== // |
---|
13 | |
---|
14 | #include "probe_match_parser.hxx" |
---|
15 | |
---|
16 | #include <arbdbt.h> |
---|
17 | #include <arb_defs.h> |
---|
18 | |
---|
19 | #include <cctype> |
---|
20 | #include <map> |
---|
21 | |
---|
22 | #define pm_assert(cond) arb_assert(cond) |
---|
23 | |
---|
24 | using namespace std; |
---|
25 | |
---|
26 | // ---------------- |
---|
27 | // column |
---|
28 | |
---|
29 | struct column { |
---|
30 | const char *title; // column title (pointer into ProbeMatch_impl::headline) |
---|
31 | int start_column, end_column; |
---|
32 | |
---|
33 | column() : title(NULp), start_column(-1), end_column(-1) {} |
---|
34 | column(const char *t, int sc, int ec) : title(t), start_column(sc), end_column(ec) {} |
---|
35 | }; |
---|
36 | |
---|
37 | // ------------------------- |
---|
38 | // ProbeMatch_impl |
---|
39 | |
---|
40 | typedef map<const char*, column, charpLess> ColumnMap; |
---|
41 | |
---|
42 | class ProbeMatch_impl : virtual Noncopyable { |
---|
43 | char *headline; |
---|
44 | ColumnMap columns; |
---|
45 | int probe_region_offset; // left index of probe region |
---|
46 | |
---|
47 | |
---|
48 | public: |
---|
49 | ProbeMatch_impl(const char *headline_, char **errPtr) : |
---|
50 | headline(NULp), |
---|
51 | probe_region_offset(-1) |
---|
52 | { |
---|
53 | pm_assert(headline_); |
---|
54 | headline = ARB_strdup(headline_); |
---|
55 | |
---|
56 | for (char *tok_start = strtok(headline, " "); tok_start; tok_start = strtok(NULp, " ")) { |
---|
57 | char *tok_end = strchr(tok_start, 0)-1; |
---|
58 | |
---|
59 | int startPos = tok_start-headline; |
---|
60 | int endPos = tok_end-headline; |
---|
61 | |
---|
62 | while (tok_end >= tok_start && tok_end[0] == '-') --tok_end; |
---|
63 | while (tok_start <= tok_end && tok_start[0] == '-') ++tok_start; |
---|
64 | pm_assert(tok_start <= tok_end); // otherwise column only contained '-' |
---|
65 | tok_end[1] = 0; |
---|
66 | |
---|
67 | columns[tok_start] = column(tok_start, startPos-2, endPos-2); // -2 because headline is 2 shorter than other lines |
---|
68 | } |
---|
69 | |
---|
70 | if (columns.empty()) *errPtr = ARB_strdup("No columns found"); |
---|
71 | } |
---|
72 | |
---|
73 | ~ProbeMatch_impl() { |
---|
74 | free(headline); |
---|
75 | } |
---|
76 | |
---|
77 | column *findColumn(const char *columntitle) { |
---|
78 | ColumnMap::iterator ci = columns.find(columntitle); |
---|
79 | if (ci == columns.end()) return NULp; |
---|
80 | return &(ci->second); |
---|
81 | } |
---|
82 | |
---|
83 | void set_probe_region_offset(int offset) { probe_region_offset = offset; } |
---|
84 | int get_probe_region_offset() const { return probe_region_offset; } |
---|
85 | }; |
---|
86 | |
---|
87 | // -------------------------- |
---|
88 | // ProbeMatchParser |
---|
89 | |
---|
90 | ProbeMatchParser::ProbeMatchParser(const char *probe_target, const char *headline) : |
---|
91 | pimpl(NULp), |
---|
92 | init_error(NULp) |
---|
93 | { |
---|
94 | if (!headline) { |
---|
95 | init_error = ARB_strdup("No headline given"); |
---|
96 | } |
---|
97 | else if (!probe_target) { |
---|
98 | init_error = ARB_strdup("No probe target given."); |
---|
99 | } |
---|
100 | else { |
---|
101 | pimpl = new ProbeMatch_impl(headline, &init_error); |
---|
102 | if (!init_error) { |
---|
103 | // modify target, so that it matches the target string in headline |
---|
104 | char *probe_target_copy = GBS_global_string_copy("'%s'", probe_target); // add single quotes |
---|
105 | for (int i = 0; probe_target_copy[i]; ++i) { |
---|
106 | probe_target_copy[i] = toupper(probe_target_copy[i]); |
---|
107 | if (probe_target_copy[i] == 'T') { // replace 'T' by 'U' |
---|
108 | probe_target_copy[i] = 'U'; |
---|
109 | } |
---|
110 | } |
---|
111 | |
---|
112 | // find that column and |
---|
113 | column *target_found = pimpl->findColumn(probe_target_copy); |
---|
114 | if (!target_found) { |
---|
115 | char *probe_rev_compl = ARB_strdup(probe_target_copy); |
---|
116 | GBT_reverseComplementNucSequence(probe_rev_compl, strlen(probe_rev_compl), 'U'); |
---|
117 | target_found = pimpl->findColumn(probe_rev_compl); |
---|
118 | free(probe_rev_compl); |
---|
119 | } |
---|
120 | |
---|
121 | if (target_found) { |
---|
122 | int probe_region_offset = target_found->start_column - 9; |
---|
123 | pimpl->set_probe_region_offset(probe_region_offset); |
---|
124 | } |
---|
125 | else { |
---|
126 | init_error = GBS_global_string_copy("Probe match parser failed (Could not find target '%s' in headline)", probe_target_copy); |
---|
127 | } |
---|
128 | free(probe_target_copy); |
---|
129 | } |
---|
130 | } |
---|
131 | } |
---|
132 | |
---|
133 | ProbeMatchParser::~ProbeMatchParser() { |
---|
134 | free(init_error); |
---|
135 | delete pimpl; |
---|
136 | } |
---|
137 | |
---|
138 | bool ProbeMatchParser::getColumnRange(const char *columnName, int *startCol, int *endCol) const { |
---|
139 | pm_assert(!init_error); |
---|
140 | column *col = pimpl->findColumn(columnName); |
---|
141 | if (!col) return false; |
---|
142 | |
---|
143 | *startCol = col->start_column; |
---|
144 | *endCol = col->end_column; |
---|
145 | return true; |
---|
146 | } |
---|
147 | |
---|
148 | bool ProbeMatchParser::is_gene_result() const { |
---|
149 | pm_assert(!init_error); |
---|
150 | return pimpl->findColumn("organism") && pimpl->findColumn("genename"); |
---|
151 | } |
---|
152 | |
---|
153 | int ProbeMatchParser::get_probe_region_offset() const { |
---|
154 | pm_assert(!init_error); |
---|
155 | return pimpl->get_probe_region_offset(); |
---|
156 | } |
---|
157 | |
---|
158 | // -------------------------- |
---|
159 | // ParsedProbeMatch |
---|
160 | |
---|
161 | ParsedProbeMatch::ParsedProbeMatch(const char *match_, const ProbeMatchParser& parser_) : |
---|
162 | parser(parser_), |
---|
163 | match(NULp), |
---|
164 | error(NULp) |
---|
165 | { |
---|
166 | if (match_) match = ARB_strdup(match_); |
---|
167 | else error = "No match given"; |
---|
168 | } |
---|
169 | |
---|
170 | ParsedProbeMatch::~ParsedProbeMatch() { |
---|
171 | free(match); |
---|
172 | } |
---|
173 | |
---|
174 | inline char *strpartdup(const char *str, int c1, int c2) { |
---|
175 | int len = c2-c1+1; |
---|
176 | |
---|
177 | pm_assert(str); |
---|
178 | pm_assert(c1 <= c2); |
---|
179 | pm_assert((int)strlen(str) > c2); |
---|
180 | |
---|
181 | return ARB_strndup(str+c1, len); |
---|
182 | } |
---|
183 | |
---|
184 | int ParsedProbeMatch::get_position() const { |
---|
185 | pm_assert(!error); |
---|
186 | int c1, c2; |
---|
187 | if (parser.getColumnRange("pos", &c1, &c2)) { |
---|
188 | char *content = strpartdup(match, c1, c2); |
---|
189 | int pos = bio2info(atoi(content)); |
---|
190 | free(content); |
---|
191 | return pos; |
---|
192 | } |
---|
193 | error = "no such column: 'pos'"; |
---|
194 | return -1; |
---|
195 | } |
---|
196 | |
---|
197 | const char *ParsedProbeMatch::get_probe_region() const { |
---|
198 | pm_assert(!error); |
---|
199 | int pro = parser.pimpl->get_probe_region_offset(); |
---|
200 | int matchlen = strlen(match); |
---|
201 | |
---|
202 | if (pro<matchlen) { |
---|
203 | return match+pro; |
---|
204 | } |
---|
205 | |
---|
206 | error = GBS_global_string("can't parse match info '%s'", match); |
---|
207 | return NULp; |
---|
208 | } |
---|
209 | |
---|
210 | char *ParsedProbeMatch::get_column_content(const char *columnName, bool chop_spaces) const { |
---|
211 | pm_assert(!error); |
---|
212 | int sc, ec; |
---|
213 | if (parser.getColumnRange(columnName, &sc, &ec)) { |
---|
214 | if (chop_spaces) { |
---|
215 | while (sc<ec && match[sc] == ' ') ++sc; |
---|
216 | while (sc<ec && match[ec] == ' ') --ec; |
---|
217 | } |
---|
218 | return strpartdup(match, sc, ec); |
---|
219 | } |
---|
220 | return NULp; |
---|
221 | } |
---|