1 | // =============================================================== // |
---|
2 | // // |
---|
3 | // File : ps_convert_db.cxx // |
---|
4 | // Purpose : // |
---|
5 | // // |
---|
6 | // Coded by Wolfram Foerster in October 2002 // |
---|
7 | // Institute of Microbiology (Technical University Munich) // |
---|
8 | // http://www.arb-home.de/ // |
---|
9 | // // |
---|
10 | // =============================================================== // |
---|
11 | |
---|
12 | #include "ps_tools.hxx" |
---|
13 | #include "ps_database.hxx" |
---|
14 | #include "ps_pg_tree_functions.hxx" |
---|
15 | #include "ps_pg_specmap.hxx" |
---|
16 | |
---|
17 | #include <sys/times.h> |
---|
18 | |
---|
19 | |
---|
20 | // GLOBALS |
---|
21 | |
---|
22 | static PS_NodePtr __ROOT; |
---|
23 | static int __PROBE_LENGTH; |
---|
24 | static SpeciesID __MIN_ID; |
---|
25 | static SpeciesID __MAX_ID; |
---|
26 | |
---|
27 | static void PS_detect_probe_length(GBDATA *_ARB_node) { |
---|
28 | // recursively walk through database to first probe and get its length |
---|
29 | __PROBE_LENGTH = -1; |
---|
30 | |
---|
31 | // search for a probe |
---|
32 | while (__PROBE_LENGTH < 0) { |
---|
33 | GBDATA *ARB_group = GB_entry(_ARB_node, "group"); |
---|
34 | if (ARB_group) { // ps_node has probes |
---|
35 | GBDATA *probe = PG_get_first_probe(ARB_group); |
---|
36 | __PROBE_LENGTH = strlen(PG_read_probe(probe)); |
---|
37 | } |
---|
38 | else { // ps_node has no probes .. check its children |
---|
39 | GBDATA *ARB_child = PS_get_first_node(_ARB_node); |
---|
40 | while (ARB_child && (__PROBE_LENGTH < 0)) { |
---|
41 | PS_detect_probe_length(ARB_child); |
---|
42 | ARB_child = PS_get_next_node(ARB_child); |
---|
43 | } |
---|
44 | } |
---|
45 | } |
---|
46 | } |
---|
47 | |
---|
48 | PS_NodePtr PS_assert_inverse_path(const int _max_depth, const int _caller_ID, IDVector *_path) { |
---|
49 | // walk down the 'inverse path' creating empty nodes as necessary |
---|
50 | |
---|
51 | PS_NodePtr current_node = __ROOT; |
---|
52 | SpeciesID current_ID; |
---|
53 | |
---|
54 | // handle given path |
---|
55 | int c = 0; |
---|
56 | for (IDVectorCIter i = _path->begin(); i != _path->end(); ++i, ++c) { |
---|
57 | current_ID = *i; |
---|
58 | current_node = current_node->assertChild(current_ID); |
---|
59 | } |
---|
60 | |
---|
61 | // handle implicit path |
---|
62 | c = 0; |
---|
63 | for (current_ID = _caller_ID+1; current_ID <= _max_depth; ++current_ID, ++c) { |
---|
64 | current_node = current_node->assertChild(current_ID); |
---|
65 | } |
---|
66 | |
---|
67 | return current_node; |
---|
68 | } |
---|
69 | |
---|
70 | PS_NodePtr PS_assert_path(const int _caller_ID, IDVector *_path) { |
---|
71 | // walk down the 'path' creating empty nodes as necessary |
---|
72 | |
---|
73 | PS_NodePtr current_node = __ROOT; |
---|
74 | SpeciesID next_path_ID; |
---|
75 | |
---|
76 | // handle given path |
---|
77 | int c = 0; |
---|
78 | IDVectorCIter i = _path->begin(); |
---|
79 | next_path_ID = (i == _path->end()) ? -1 : *i; |
---|
80 | for (SpeciesID current_ID = __MIN_ID; current_ID <= _caller_ID; ++current_ID, ++c) { |
---|
81 | if (current_ID != next_path_ID) { |
---|
82 | current_node = current_node->assertChild(current_ID); |
---|
83 | } |
---|
84 | else { |
---|
85 | ++i; |
---|
86 | next_path_ID = (i == _path->end()) ? -1 : *i; |
---|
87 | } |
---|
88 | } |
---|
89 | |
---|
90 | return current_node; |
---|
91 | } |
---|
92 | |
---|
93 | void PS_extract_probe_data(GBDATA *_ARB_node, // position in ARB database |
---|
94 | int _max_depth, // count of species in ARB database |
---|
95 | int _depth, // current depth in tree |
---|
96 | const int _parent_ID, // SpeciesID of parent node |
---|
97 | IDVector *_inverse_path) { // list with IDs of the 'inverse path' |
---|
98 | |
---|
99 | // recursively walk through ARB-database and extract probe-data to own tree format |
---|
100 | // |
---|
101 | // * Insertion of nodes takes place after a branch is completed (that is |
---|
102 | // when ive reached a leaf in the ARB-database and im going 'back up' |
---|
103 | // out of the recursion). |
---|
104 | // |
---|
105 | // * Branches below _max_depth/2 will be moved up top by inserting nodes with |
---|
106 | // 'inverse' probes in the 'inverse' branch, therefore the _inverse_path |
---|
107 | // list is maintained with the SpeciesIDs of the 'inverse path'. |
---|
108 | // - SpeciesIDs between _parent_ID and current ID are 'missing' in the path |
---|
109 | // and are appended to the _inverse_path list |
---|
110 | // - SpeciesIDs greater than the current ID are implicit in the |
---|
111 | // 'inverse path' list and therefore not stored |
---|
112 | |
---|
113 | // |
---|
114 | // get SpeciesID |
---|
115 | // |
---|
116 | GBDATA *data = GB_entry(_ARB_node, "num"); |
---|
117 | const char *buffer = GB_read_char_pntr(data); |
---|
118 | SpeciesID id = atoi(buffer); |
---|
119 | |
---|
120 | // |
---|
121 | // get probe(s) |
---|
122 | // |
---|
123 | PS_ProbeSetPtr probes = NULp; |
---|
124 | GBDATA *ARB_group = GB_entry(_ARB_node, "group"); // access probe-group |
---|
125 | if (ARB_group) { |
---|
126 | data = PG_get_first_probe(ARB_group); // get first probe if exists |
---|
127 | |
---|
128 | if (data) probes = new PS_ProbeSet; // new probe set if probes exist |
---|
129 | |
---|
130 | while (data) { |
---|
131 | buffer = PG_read_probe(data); // get probe string |
---|
132 | PS_ProbePtr new_probe(new PS_Probe); // make new probe |
---|
133 | new_probe->length = __PROBE_LENGTH; // set probe length |
---|
134 | new_probe->quality = 100; // set probe quality |
---|
135 | new_probe->GC_content = 0; // eval probe for GC-content |
---|
136 | for (int i=0; i < __PROBE_LENGTH; ++i) { |
---|
137 | if ((buffer[i] == 'C') || (buffer[i] == 'G')) ++(new_probe->GC_content); |
---|
138 | } |
---|
139 | probes->insert(new_probe); // append probe to probe set |
---|
140 | data = PG_get_next_probe(data); // get next probe |
---|
141 | } |
---|
142 | } |
---|
143 | |
---|
144 | // |
---|
145 | // enlarge inverse path |
---|
146 | // |
---|
147 | for (int i=_parent_ID+1; ((i < id) && (i >= 0)); ++i) { |
---|
148 | _inverse_path->push_back(i); |
---|
149 | } |
---|
150 | |
---|
151 | // |
---|
152 | // insertion if ARB_node had probes |
---|
153 | // |
---|
154 | if (probes) { |
---|
155 | if (_depth <= (_max_depth >> 1)) { |
---|
156 | // |
---|
157 | // insert if 'above' half depth |
---|
158 | // |
---|
159 | PS_NodePtr current_node = PS_assert_path(id, _inverse_path); |
---|
160 | current_node->addProbes(probes->begin(), probes->end()); |
---|
161 | } |
---|
162 | else { |
---|
163 | // |
---|
164 | // insert if 'below' half depth |
---|
165 | // |
---|
166 | PS_NodePtr current_node = PS_assert_inverse_path(_max_depth, id, _inverse_path); |
---|
167 | current_node->addProbesInverted(probes->begin(), probes->end()); |
---|
168 | } |
---|
169 | } |
---|
170 | |
---|
171 | // |
---|
172 | // child(ren) |
---|
173 | // |
---|
174 | GBDATA *ARB_child = PS_get_first_node(_ARB_node); // get first child if exists |
---|
175 | while (ARB_child) { |
---|
176 | PS_extract_probe_data(ARB_child, _max_depth, _depth+1, id, _inverse_path); |
---|
177 | ARB_child = PS_get_next_node(ARB_child); |
---|
178 | } |
---|
179 | |
---|
180 | // |
---|
181 | // shrink inverse path |
---|
182 | // |
---|
183 | while ((_inverse_path->back() > _parent_ID) && (!_inverse_path->empty())) { |
---|
184 | _inverse_path->pop_back(); |
---|
185 | } |
---|
186 | } |
---|
187 | |
---|
188 | |
---|
189 | int main(int _argc, char *_argv[]) { |
---|
190 | GBDATA *gb_main = NULp; |
---|
191 | GB_ERROR error = NULp; |
---|
192 | |
---|
193 | // open probe-group-database |
---|
194 | if (_argc < 2) { |
---|
195 | printf("Missing arguments\n Usage %s <input database name>\n", _argv[0]); |
---|
196 | printf("output database will be named like input database but with the suffix '.wf' instead of '.arb'\n"); |
---|
197 | exit(1); |
---|
198 | } |
---|
199 | |
---|
200 | const char *DB_name = _argv[1]; |
---|
201 | |
---|
202 | // |
---|
203 | // open and check ARB database |
---|
204 | // |
---|
205 | struct tms before; |
---|
206 | times(&before); |
---|
207 | printf("Opening probe-group-database '%s'..\n ", DB_name); |
---|
208 | gb_main = GB_open(DB_name, "rwcN"); |
---|
209 | if (!gb_main) { |
---|
210 | error = GB_await_error(); |
---|
211 | GB_warning(error); |
---|
212 | exit(1); |
---|
213 | } |
---|
214 | printf("..loaded database (enter to continue) "); |
---|
215 | PS_print_time_diff(&before); |
---|
216 | |
---|
217 | GB_transaction ta(gb_main); |
---|
218 | GBDATA *group_tree = GB_entry(gb_main, "group_tree"); |
---|
219 | if (!group_tree) { |
---|
220 | printf("no 'group_tree' in database\n"); |
---|
221 | error = GB_export_error("no 'group_tree' in database"); // @@@ error is unused |
---|
222 | exit(1); |
---|
223 | } |
---|
224 | GBDATA *first_level_node = PS_get_first_node(group_tree); |
---|
225 | if (!first_level_node) { |
---|
226 | printf("no 'node' found in group_tree\n"); |
---|
227 | error = GB_export_error("no 'node' found in group_tree"); // @@@ error is unused |
---|
228 | exit(1); |
---|
229 | } |
---|
230 | |
---|
231 | // |
---|
232 | // read Name <-> ID mappings |
---|
233 | // |
---|
234 | times(&before); |
---|
235 | printf("init Species <-> ID - Map\n"); |
---|
236 | PG_initSpeciesMaps(gb_main); |
---|
237 | int species_count = PG_NumberSpecies(); |
---|
238 | printf("%i species in the map ", species_count); |
---|
239 | if (species_count >= 10) { |
---|
240 | printf("\nhere are the first 10 :\n"); |
---|
241 | int count = 0; |
---|
242 | for (ID2NameMapCIter i=__ID2NAME_MAP.begin(); count<10; ++i, ++count) { |
---|
243 | printf("[ %2i ] %s\n", i->first, i->second.c_str()); |
---|
244 | } |
---|
245 | } |
---|
246 | __MIN_ID = __ID2NAME_MAP.begin()->first; |
---|
247 | __MAX_ID = __ID2NAME_MAP.rbegin()->first; |
---|
248 | printf("IDs %i .. %i\n(enter to continue) ", __MIN_ID, __MAX_ID); |
---|
249 | PS_print_time_diff(&before); |
---|
250 | |
---|
251 | // |
---|
252 | // create output database |
---|
253 | // |
---|
254 | string output_DB_name(DB_name); |
---|
255 | size_t suffix_pos = output_DB_name.rfind(".arb"); |
---|
256 | if (suffix_pos != string::npos) { |
---|
257 | output_DB_name.erase(suffix_pos); |
---|
258 | } |
---|
259 | output_DB_name.append(".wf"); |
---|
260 | if (suffix_pos == string::npos) { |
---|
261 | printf("cannot find suffix '.arb' in database name '%s'\n", DB_name); |
---|
262 | printf("output file will be named '%s'\n", output_DB_name.c_str()); |
---|
263 | } |
---|
264 | PS_Database *ps_db = new PS_Database(output_DB_name.c_str(), PS_Database::WRITEONLY); |
---|
265 | |
---|
266 | // |
---|
267 | // copy mappings |
---|
268 | // |
---|
269 | ps_db->setMappings(__NAME2ID_MAP, __ID2NAME_MAP); |
---|
270 | |
---|
271 | // |
---|
272 | // extract data from ARB database |
---|
273 | // |
---|
274 | times(&before); |
---|
275 | printf("extracting probe-data...\n"); |
---|
276 | PS_detect_probe_length(group_tree); |
---|
277 | printf("probe_length = %d\n", __PROBE_LENGTH); |
---|
278 | |
---|
279 | __ROOT = ps_db->getRootNode(); |
---|
280 | first_level_node = PS_get_first_node(group_tree); |
---|
281 | unsigned int c = 0; |
---|
282 | IDVector *inverse_path = new IDVector; |
---|
283 | struct tms before_first_level_node; |
---|
284 | for (; first_level_node; ++c) { |
---|
285 | if (c % 100 == 0) { |
---|
286 | times(&before_first_level_node); |
---|
287 | printf("1st level node #%u ", c+1); |
---|
288 | } |
---|
289 | PS_extract_probe_data(first_level_node, species_count, 0, __MIN_ID-1, inverse_path); |
---|
290 | first_level_node = PS_get_next_node(first_level_node); |
---|
291 | if (c % 100 == 0) { |
---|
292 | PS_print_time_diff(&before_first_level_node, "this node ", " "); |
---|
293 | PS_print_time_diff(&before, "total ", "\n"); |
---|
294 | } |
---|
295 | } |
---|
296 | printf("done after %u 1st level nodes\n", c); |
---|
297 | printf("(enter to continue) "); |
---|
298 | PS_print_time_diff(&before); |
---|
299 | |
---|
300 | // |
---|
301 | // write database to file |
---|
302 | // |
---|
303 | times(&before); |
---|
304 | printf("writing probe-data to %s..\n", output_DB_name.c_str()); |
---|
305 | ps_db->save(); |
---|
306 | printf("..done saving (enter to continue) "); |
---|
307 | PS_print_time_diff(&before); |
---|
308 | |
---|
309 | delete inverse_path; |
---|
310 | delete ps_db; |
---|
311 | before.tms_utime = 0; |
---|
312 | before.tms_stime = 0; |
---|
313 | printf("total "); |
---|
314 | PS_print_time_diff(&before); |
---|
315 | return 0; |
---|
316 | } |
---|