1 | // ==================================================================== // |
---|
2 | // // |
---|
3 | // File : probe_match_parser.cxx // |
---|
4 | // Purpose : parse the results of a probe match // |
---|
5 | // // |
---|
6 | // // |
---|
7 | // Coded by Ralf Westram (coder@reallysoft.de) in June 2004 // |
---|
8 | // Copyright Department of Microbiology (Technical University Munich) // |
---|
9 | // // |
---|
10 | // Visit our web site at: http://www.arb-home.de/ // |
---|
11 | // // |
---|
12 | // ==================================================================== // |
---|
13 | |
---|
14 | #include <cstring> |
---|
15 | #include <cstdlib> |
---|
16 | #include <cstdio> |
---|
17 | #include <cctype> |
---|
18 | #include <map> |
---|
19 | |
---|
20 | #include <arbdb.h> |
---|
21 | #include <arbdbt.h> |
---|
22 | |
---|
23 | #define pm_assert(cond) arb_assert(cond) |
---|
24 | |
---|
25 | #include "probe_match_parser.hxx" |
---|
26 | |
---|
27 | using namespace std; |
---|
28 | |
---|
29 | // --------------- |
---|
30 | // column |
---|
31 | // --------------- |
---|
32 | |
---|
// Describes one column of a probe match result:
// its title and the first/last character index it occupies.
struct column {
    const char *title;        // column title (pointer into ProbeMatch_impl::headline)
    int         start_column; // index of first character (-1 = unset)
    int         end_column;   // index of last character  (-1 = unset)

    // creates an unset column (no title, both indices -1)
    column()
        : title(0),
          start_column(-1),
          end_column(-1)
    {}

    // creates a column named 't' ranging from index 'sc' to index 'ec'
    column(const char *t, int sc, int ec)
        : title(t),
          start_column(sc),
          end_column(ec)
    {}
};
---|
40 | |
---|
41 | // ------------------------ |
---|
42 | // ProbeMatch_impl |
---|
43 | // ------------------------ |
---|
44 | |
---|
// Strict weak ordering for C-strings: compares the string contents
// (via strcmp) instead of the pointer values.
struct ltstr {
    bool operator()(const char *lhs, const char *rhs) const {
        const int order = strcmp(lhs, rhs);
        return order < 0; // true if 'lhs' sorts before 'rhs'
    }
};
---|
50 | |
---|
51 | typedef map<const char*, column, ltstr> ColumnMap; |
---|
52 | |
---|
53 | class ProbeMatch_impl { |
---|
54 | char *headline; |
---|
55 | ColumnMap columns; |
---|
56 | int probe_region_offset; // left index of probe region |
---|
57 | |
---|
58 | |
---|
59 | public: |
---|
60 | ProbeMatch_impl(const char *headline_, char **errPtr) |
---|
61 | : headline(0) |
---|
62 | , probe_region_offset(-1) |
---|
63 | { |
---|
64 | pm_assert(headline_); |
---|
65 | headline = strdup(headline_); |
---|
66 | |
---|
67 | for (char *tok_start = strtok(headline, " "); tok_start; tok_start = strtok(0, " ")) { |
---|
68 | char *tok_end = strchr(tok_start, 0)-1; |
---|
69 | |
---|
70 | int startPos = tok_start-headline; |
---|
71 | int endPos = tok_end-headline; |
---|
72 | |
---|
73 | while (tok_end >= tok_start && tok_end[0] == '-') --tok_end; |
---|
74 | while (tok_start <= tok_end && tok_start[0] == '-') ++tok_start; |
---|
75 | pm_assert(tok_start <= tok_end); // otherwise column only contained '-' |
---|
76 | tok_end[1] = 0; |
---|
77 | |
---|
78 | columns[tok_start] = column(tok_start, startPos-2, endPos-2); // -2 because headline is 2 shorter than other lines |
---|
79 | } |
---|
80 | |
---|
81 | if (columns.empty()) *errPtr = strdup("No columns found"); |
---|
82 | } |
---|
83 | |
---|
84 | ~ProbeMatch_impl() { |
---|
85 | free(headline); |
---|
86 | } |
---|
87 | |
---|
88 | column *findColumn(const char *columntitle) { |
---|
89 | ColumnMap::iterator ci = columns.find(columntitle); |
---|
90 | if (ci == columns.end()) return 0; |
---|
91 | return &(ci->second); |
---|
92 | } |
---|
93 | |
---|
94 | void set_probe_region_offset(int offset) { probe_region_offset = offset; } |
---|
95 | int get_probe_region_offset() const { return probe_region_offset; } |
---|
96 | }; |
---|
97 | |
---|
98 | // ------------------------- |
---|
99 | // ProbeMatchParser |
---|
100 | // ------------------------- |
---|
101 | |
---|
102 | ProbeMatchParser::ProbeMatchParser(const char *probe_target, const char *headline) |
---|
103 | : pimpl(0), init_error(0) |
---|
104 | { |
---|
105 | if (!headline) { |
---|
106 | init_error = strdup("No headline given"); |
---|
107 | } |
---|
108 | else if (!probe_target) { |
---|
109 | init_error = strdup("No probe target given."); |
---|
110 | } |
---|
111 | else { |
---|
112 | pimpl = new ProbeMatch_impl(headline, &init_error); |
---|
113 | if (!init_error) { |
---|
114 | // modify target, so that it matches the target string in headline |
---|
115 | char *probe_target_copy = GBS_global_string_copy("'%s'", probe_target); // add single quotes |
---|
116 | for (int i = 0; probe_target_copy[i]; ++i) { |
---|
117 | probe_target_copy[i] = toupper(probe_target_copy[i]); |
---|
118 | if (probe_target_copy[i] == 'T') { // replace 'T' by 'U' |
---|
119 | probe_target_copy[i] = 'U'; |
---|
120 | } |
---|
121 | } |
---|
122 | |
---|
123 | // find that column and |
---|
124 | column *target_found = pimpl->findColumn(probe_target_copy); |
---|
125 | if (!target_found) { |
---|
126 | char *probe_rev_compl = strdup(probe_target_copy); |
---|
127 | GBT_reverseComplementNucSequence(probe_rev_compl, strlen(probe_rev_compl), 'U'); |
---|
128 | target_found = pimpl->findColumn(probe_rev_compl); |
---|
129 | free(probe_rev_compl); |
---|
130 | } |
---|
131 | |
---|
132 | if (target_found) { |
---|
133 | int probe_region_offset = target_found->start_column - 9; |
---|
134 | pimpl->set_probe_region_offset(probe_region_offset); |
---|
135 | } |
---|
136 | else { |
---|
137 | init_error = GBS_global_string_copy("Probe match parser failed (Could not find target '%s' in headline)", probe_target_copy); |
---|
138 | } |
---|
139 | free(probe_target_copy); |
---|
140 | } |
---|
141 | } |
---|
142 | } |
---|
143 | |
---|
144 | ProbeMatchParser::~ProbeMatchParser() { |
---|
145 | free(init_error); |
---|
146 | } |
---|
147 | |
---|
148 | bool ProbeMatchParser::getColumnRange(const char *columnName, int *startCol, int *endCol) const { |
---|
149 | pm_assert(!init_error); |
---|
150 | column *col = pimpl->findColumn(columnName); |
---|
151 | if (!col) return false; |
---|
152 | |
---|
153 | *startCol = col->start_column; |
---|
154 | *endCol = col->end_column; |
---|
155 | return true; |
---|
156 | } |
---|
157 | |
---|
158 | bool ProbeMatchParser::is_gene_result() const { |
---|
159 | pm_assert(!init_error); |
---|
160 | return pimpl->findColumn("organism") && pimpl->findColumn("genename"); |
---|
161 | } |
---|
162 | |
---|
163 | int ProbeMatchParser::get_probe_region_offset() const { |
---|
164 | pm_assert(!init_error); |
---|
165 | return pimpl->get_probe_region_offset(); |
---|
166 | } |
---|
167 | |
---|
168 | // ------------------------- |
---|
169 | // ParsedProbeMatch |
---|
170 | // ------------------------- |
---|
171 | |
---|
172 | ParsedProbeMatch::ParsedProbeMatch(const char *match_, const ProbeMatchParser& parser_) |
---|
173 | : parser(parser_), match(0), error(0) |
---|
174 | { |
---|
175 | if (match_) match = strdup(match_); |
---|
176 | else error = "No match given"; |
---|
177 | } |
---|
178 | |
---|
179 | ParsedProbeMatch::~ParsedProbeMatch() { |
---|
180 | free(match); |
---|
181 | } |
---|
182 | |
---|
183 | inline char *strpartdup(const char *str, int c1, int c2) { |
---|
184 | int len = c2-c1+1; |
---|
185 | |
---|
186 | pm_assert(str); |
---|
187 | pm_assert(c1 <= c2); |
---|
188 | pm_assert((int)strlen(str) > c2); |
---|
189 | |
---|
190 | char *buffer = (char*)malloc(len+1); |
---|
191 | memcpy(buffer, str+c1, len); |
---|
192 | buffer[len] = 0; |
---|
193 | return buffer; |
---|
194 | } |
---|
195 | |
---|
196 | int ParsedProbeMatch::get_position() const { |
---|
197 | pm_assert(!error); |
---|
198 | int c1, c2; |
---|
199 | if (parser.getColumnRange("pos", &c1, &c2)) { |
---|
200 | char *content = strpartdup(match, c1, c2); |
---|
201 | int pos = atoi(content); |
---|
202 | free(content); |
---|
203 | return pos; |
---|
204 | } |
---|
205 | error = "no such column: 'pos'"; |
---|
206 | return -1; |
---|
207 | } |
---|
208 | |
---|
209 | const char *ParsedProbeMatch::get_probe_region() const { |
---|
210 | pm_assert(!error); |
---|
211 | int pro = parser.pimpl->get_probe_region_offset(); |
---|
212 | int matchlen = strlen(match); |
---|
213 | |
---|
214 | if (pro<matchlen) { |
---|
215 | return match+pro; |
---|
216 | } |
---|
217 | |
---|
218 | error = GBS_global_string("can't parse match info '%s'", match); |
---|
219 | return 0; |
---|
220 | } |
---|
221 | |
---|
222 | char *ParsedProbeMatch::get_column_content(const char *columnName, bool chop_spaces) const { |
---|
223 | pm_assert(!error); |
---|
224 | int sc, ec; |
---|
225 | if (parser.getColumnRange(columnName, &sc, &ec)) { |
---|
226 | if (chop_spaces) { |
---|
227 | while (sc<ec && match[sc] == ' ') ++sc; |
---|
228 | while (sc<ec && match[ec] == ' ') --ec; |
---|
229 | } |
---|
230 | return strpartdup(match, sc, ec); |
---|
231 | } |
---|
232 | return 0; |
---|
233 | } |
---|