| 1 | #include <cstdio> |
|---|
| 2 | #include <cstdlib> |
|---|
| 3 | |
|---|
| 4 | #include <sys/times.h> |
|---|
| 5 | |
|---|
| 6 | #include "ps_tools.hxx" |
|---|
| 7 | #include "ps_database.hxx" |
|---|
| 8 | #include "ps_pg_tree_functions.cxx" |
|---|
| 9 | |
|---|
| 10 | // GLOBALS |
|---|
| 11 | |
|---|
| 12 | PS_NodePtr __ROOT; |
|---|
| 13 | int __PROBE_LENGTH; |
|---|
| 14 | SpeciesID __MIN_ID; |
|---|
| 15 | SpeciesID __MAX_ID; |
|---|
| 16 | |
|---|
| 17 | void PS_detect_probe_length( GBDATA *_ARB_node ) { |
|---|
| 18 | // recursively walk through database to first probe and get its length |
|---|
| 19 | __PROBE_LENGTH = -1; |
|---|
| 20 | |
|---|
| 21 | // search for a probe |
|---|
| 22 | while (__PROBE_LENGTH < 0) { |
|---|
| 23 | GBDATA *ARB_group = GB_entry(_ARB_node, "group"); |
|---|
| 24 | if (ARB_group) { // ps_node has probes |
|---|
| 25 | GBDATA *probe = PG_get_first_probe( ARB_group ); |
|---|
| 26 | __PROBE_LENGTH = strlen(PG_read_probe(probe)); |
|---|
| 27 | } else { // ps_node has no probes .. check its children |
|---|
| 28 | GBDATA *ARB_child = PS_get_first_node( _ARB_node ); |
|---|
| 29 | while (ARB_child && (__PROBE_LENGTH < 0)) { |
|---|
| 30 | PS_detect_probe_length( ARB_child ); |
|---|
| 31 | ARB_child = PS_get_next_node( ARB_child ); |
|---|
| 32 | } |
|---|
| 33 | } |
|---|
| 34 | } |
|---|
| 35 | } |
|---|
| 36 | |
|---|
| 37 | PS_NodePtr PS_assert_inverse_path( const int _max_depth, const int _caller_ID, IDVector *_path ) { |
|---|
| 38 | // walk down the 'inverse path' creating empty nodes as necessary |
|---|
| 39 | |
|---|
| 40 | PS_NodePtr current_node = __ROOT; |
|---|
| 41 | SpeciesID current_ID; |
|---|
| 42 | |
|---|
| 43 | // handle given path |
|---|
| 44 | //printf( " %i : PS_assert_inverse_path (%i) [ given path :", _caller_ID, _path->size() ); |
|---|
| 45 | int c = 0; |
|---|
| 46 | for (IDVectorCIter i = _path->begin(); i != _path->end(); ++i,++c) { |
|---|
| 47 | current_ID = *i; |
|---|
| 48 | current_node = current_node->assertChild( current_ID ); |
|---|
| 49 | //if ((c % 20) == 0) printf( "\n" ); |
|---|
| 50 | //printf( " %3i",current_ID ); |
|---|
| 51 | } |
|---|
| 52 | |
|---|
| 53 | //printf( "\nimplicit path :" ); |
|---|
| 54 | |
|---|
| 55 | // handle implicit path |
|---|
| 56 | c = 0; |
|---|
| 57 | for (current_ID = _caller_ID+1; current_ID <= _max_depth; ++current_ID,++c) { |
|---|
| 58 | current_node = current_node->assertChild( current_ID ); |
|---|
| 59 | |
|---|
| 60 | //if ((c % 20) == 0) printf( "\n" ); |
|---|
| 61 | //printf( " %3i",current_ID ); |
|---|
| 62 | } |
|---|
| 63 | |
|---|
| 64 | //printf( " ] -> (%p,%i)\n", &(*current_node), current_node->getNum() ); |
|---|
| 65 | return current_node; |
|---|
| 66 | } |
|---|
| 67 | |
|---|
| 68 | PS_NodePtr PS_assert_path( const int _caller_ID, IDVector *_path ) { |
|---|
| 69 | // walk down the 'path' creating empty nodes as necessary |
|---|
| 70 | |
|---|
| 71 | PS_NodePtr current_node = __ROOT; |
|---|
| 72 | SpeciesID next_path_ID; |
|---|
| 73 | |
|---|
| 74 | // handle given path |
|---|
| 75 | //printf( " %i : PS_assert_path (%i) [ given path :", _caller_ID, _path->size() ); |
|---|
| 76 | int c = 0; |
|---|
| 77 | IDVectorCIter i = _path->begin(); |
|---|
| 78 | next_path_ID = (i == _path->end()) ? -1 : *i; |
|---|
| 79 | for (SpeciesID current_ID = __MIN_ID; current_ID <= _caller_ID; ++current_ID,++c) { |
|---|
| 80 | //if ((c % 20) == 0) printf( "\n" ); |
|---|
| 81 | if (current_ID != next_path_ID) { |
|---|
| 82 | //printf( " %3i",*i ); |
|---|
| 83 | current_node = current_node->assertChild( current_ID ); |
|---|
| 84 | } else { |
|---|
| 85 | ++i; |
|---|
| 86 | next_path_ID = (i == _path->end()) ? -1 : *i; |
|---|
| 87 | } |
|---|
| 88 | } |
|---|
| 89 | //printf( " ] -> (%p,%i)\n", &(*current_node), current_node->getNum() ); |
|---|
| 90 | |
|---|
| 91 | return current_node; |
|---|
| 92 | } |
|---|
| 93 | |
|---|
| 94 | void PS_extract_probe_data( GBDATA *_ARB_node, // position in ARB database |
|---|
| 95 | int _max_depth, // count of species in ARB database |
|---|
| 96 | int _depth, // current depth in tree |
|---|
| 97 | const int _parent_ID, // SpeciesID of parent node |
|---|
| 98 | IDVector *_inverse_path ) { // list with IDs of the 'inverse path' |
|---|
| 99 | |
|---|
| 100 | // recursively walk through ARB-database and extract probe-data to own tree format |
|---|
| 101 | // |
|---|
| 102 | // * Insertion of nodes takes place after a branch is completed (that is |
|---|
| 103 | // when ive reached a leaf in the ARB-database and im going 'back up' |
|---|
| 104 | // out of the recursion). |
|---|
| 105 | // |
|---|
| 106 | // * Branches below _max_depth/2 will be moved up top by inserting nodes with |
|---|
| 107 | // 'inverse' probes in the 'inverse' branch, therefore the _inverse_path |
|---|
| 108 | // list is maintained with the SpeciesIDs of the 'inverse path'. |
|---|
| 109 | // - SpeciesIDs between _parent_ID and current ID are 'missing' in the path |
|---|
| 110 | // and are appended to the _inverse_path list |
|---|
| 111 | // - SpeciesIDs greater than the current ID are implicit in the |
|---|
| 112 | // 'inverse path' list and therefore not stored |
|---|
| 113 | |
|---|
| 114 | // |
|---|
| 115 | // get SpeciesID |
|---|
| 116 | // |
|---|
| 117 | GBDATA *data = GB_entry(_ARB_node, "num"); |
|---|
| 118 | const char *buffer = GB_read_char_pntr( data ); |
|---|
| 119 | SpeciesID id = atoi( buffer ); |
|---|
| 120 | |
|---|
| 121 | // |
|---|
| 122 | // get probe(s) |
|---|
| 123 | // |
|---|
| 124 | PS_ProbeSetPtr probes = 0; |
|---|
| 125 | GBDATA *ARB_group = GB_entry(_ARB_node, "group"); // access probe-group |
|---|
| 126 | if (ARB_group) { |
|---|
| 127 | data = PG_get_first_probe( ARB_group ); // get first probe if exists |
|---|
| 128 | |
|---|
| 129 | if (data) probes = new PS_ProbeSet; // new probe set if probes exist |
|---|
| 130 | |
|---|
| 131 | while (data) { |
|---|
| 132 | buffer = PG_read_probe( data ); // get probe string |
|---|
| 133 | PS_ProbePtr new_probe(new PS_Probe); // make new probe |
|---|
| 134 | new_probe->length = __PROBE_LENGTH; // set probe length |
|---|
| 135 | new_probe->quality = 100; // set probe quality |
|---|
| 136 | new_probe->GC_content = 0; // eval probe for GC-content |
|---|
| 137 | for (int i=0; i < __PROBE_LENGTH; ++i) { |
|---|
| 138 | if ((buffer[i] == 'C') || (buffer[i] == 'G')) ++(new_probe->GC_content); |
|---|
| 139 | } |
|---|
| 140 | probes->insert( new_probe ); // append probe to probe set |
|---|
| 141 | data = PG_get_next_probe( data ); // get next probe |
|---|
| 142 | } |
|---|
| 143 | } |
|---|
| 144 | |
|---|
| 145 | // |
|---|
| 146 | // enlarge inverse path |
|---|
| 147 | // |
|---|
| 148 | //printf( "%i %i %i : enlarge path (%i) [ ", _depth, _parent_ID, id, _inverse_path->size() ); |
|---|
| 149 | for (int i=_parent_ID+1; ((i < id) && (i >= 0)); ++i) { |
|---|
| 150 | //printf( "%i ",i ); |
|---|
| 151 | _inverse_path->push_back(i); |
|---|
| 152 | } |
|---|
| 153 | //printf( "] (%i)\n", _inverse_path->size() ); |
|---|
| 154 | |
|---|
| 155 | // |
|---|
| 156 | // insertion if ARB_node had probes |
|---|
| 157 | // |
|---|
| 158 | if (probes) { |
|---|
| 159 | if (_depth <= (_max_depth >> 1)) { |
|---|
| 160 | // |
|---|
| 161 | // insert if 'above' half depth |
|---|
| 162 | // |
|---|
| 163 | PS_NodePtr current_node = PS_assert_path( id, _inverse_path ); |
|---|
| 164 | current_node->addProbes( probes->begin(), probes->end() ); |
|---|
| 165 | } else { |
|---|
| 166 | // |
|---|
| 167 | // insert if 'below' half depth |
|---|
| 168 | // |
|---|
| 169 | PS_NodePtr current_node = PS_assert_inverse_path( _max_depth, id, _inverse_path ); |
|---|
| 170 | current_node->addProbesInverted( probes->begin(), probes->end() ); |
|---|
| 171 | } |
|---|
| 172 | } |
|---|
| 173 | |
|---|
| 174 | // |
|---|
| 175 | // child(ren) |
|---|
| 176 | // |
|---|
| 177 | //printf( "%i %i %i : children\n", _depth, _parent_ID, id ); |
|---|
| 178 | GBDATA *ARB_child = PS_get_first_node( _ARB_node ); // get first child if exists |
|---|
| 179 | while (ARB_child) { |
|---|
| 180 | PS_extract_probe_data( ARB_child, _max_depth, _depth+1, id, _inverse_path ); |
|---|
| 181 | ARB_child = PS_get_next_node( ARB_child ); |
|---|
| 182 | } |
|---|
| 183 | |
|---|
| 184 | // |
|---|
| 185 | // shrink inverse path |
|---|
| 186 | // |
|---|
| 187 | //printf( "%i %i %i : shrink path (%i) [ ", _depth, _parent_ID, id, _inverse_path->size() ); |
|---|
| 188 | while ((_inverse_path->back() > _parent_ID) && (!_inverse_path->empty())) { |
|---|
| 189 | //printf( "%i ",_inverse_path->back() ); |
|---|
| 190 | _inverse_path->pop_back(); |
|---|
| 191 | } |
|---|
| 192 | //printf( "] (%i)\n", _inverse_path->size() ); |
|---|
| 193 | } |
|---|
| 194 | |
|---|
| 195 | |
|---|
| 196 | int main( int _argc, |
|---|
| 197 | char *_argv[] ) { |
|---|
| 198 | |
|---|
| 199 | GBDATA *ARB_main = 0; |
|---|
| 200 | GB_ERROR error = 0; |
|---|
| 201 | |
|---|
| 202 | // open probe-group-database |
|---|
| 203 | if (_argc < 2) { |
|---|
| 204 | printf( "Missing arguments\n Usage %s <input database name>\n", _argv[0] ); |
|---|
| 205 | printf( "output database will be named like input database but with the suffix '.wf' instead of '.arb'\n" ); |
|---|
| 206 | exit( 1 ); |
|---|
| 207 | } |
|---|
| 208 | |
|---|
| 209 | const char *DB_name = _argv[ 1 ]; |
|---|
| 210 | |
|---|
| 211 | // |
|---|
| 212 | // open and check ARB database |
|---|
| 213 | // |
|---|
| 214 | struct tms before; |
|---|
| 215 | times( &before ); |
|---|
| 216 | printf( "Opening probe-group-database '%s'..\n ", DB_name ); |
|---|
| 217 | ARB_main = GB_open( DB_name, "rwcN" ); |
|---|
| 218 | if (!ARB_main) { |
|---|
| 219 | error = GB_await_error(); |
|---|
| 220 | GB_warning(error); |
|---|
| 221 | exit(1); |
|---|
| 222 | } |
|---|
| 223 | printf( "..loaded database (enter to continue) " ); |
|---|
| 224 | PS_print_time_diff( &before ); |
|---|
| 225 | // getchar(); |
|---|
| 226 | |
|---|
| 227 | GB_transaction dummy( ARB_main ); |
|---|
| 228 | GBDATA *group_tree = GB_entry(ARB_main, "group_tree"); |
|---|
| 229 | if (!group_tree) { |
|---|
| 230 | printf( "no 'group_tree' in database\n" ); |
|---|
| 231 | error = GB_export_error( "no 'group_tree' in database" ); |
|---|
| 232 | exit(1); |
|---|
| 233 | } |
|---|
| 234 | GBDATA *first_level_node = PS_get_first_node( group_tree ); |
|---|
| 235 | if (!first_level_node) { |
|---|
| 236 | printf( "no 'node' found in group_tree\n" ); |
|---|
| 237 | error = GB_export_error( "no 'node' found in group_tree" ); |
|---|
| 238 | exit(1); |
|---|
| 239 | } |
|---|
| 240 | |
|---|
| 241 | // |
|---|
| 242 | // read Name <-> ID mappings |
|---|
| 243 | // |
|---|
| 244 | times( &before ); |
|---|
| 245 | printf( "init Species <-> ID - Map\n" ); |
|---|
| 246 | PG_initSpeciesMaps( ARB_main ); |
|---|
| 247 | int species_count = PG_NumberSpecies(); |
|---|
| 248 | printf( "%i species in the map ", species_count ); |
|---|
| 249 | if (species_count >= 10) { |
|---|
| 250 | printf( "\nhere are the first 10 :\n" ); |
|---|
| 251 | int count = 0; |
|---|
| 252 | for (ID2NameMapCIter i=__ID2NAME_MAP.begin(); count<10; ++i,++count) { |
|---|
| 253 | printf( "[ %2i ] %s\n", i->first, i->second.c_str() ); |
|---|
| 254 | } |
|---|
| 255 | } |
|---|
| 256 | __MIN_ID = __ID2NAME_MAP.begin()->first; |
|---|
| 257 | __MAX_ID = __ID2NAME_MAP.rbegin()->first; |
|---|
| 258 | printf( "IDs %i .. %i\n(enter to continue) ", __MIN_ID, __MAX_ID ); |
|---|
| 259 | PS_print_time_diff( &before ); |
|---|
| 260 | // getchar(); |
|---|
| 261 | |
|---|
| 262 | // |
|---|
| 263 | // create output database |
|---|
| 264 | // |
|---|
| 265 | string output_DB_name(DB_name); |
|---|
| 266 | size_t suffix_pos = output_DB_name.rfind( ".arb" ); |
|---|
| 267 | if (suffix_pos != string::npos) { |
|---|
| 268 | output_DB_name.erase( suffix_pos ); |
|---|
| 269 | } |
|---|
| 270 | output_DB_name.append( ".wf" ); |
|---|
| 271 | if (suffix_pos == string::npos) { |
|---|
| 272 | printf( "cannot find suffix '.arb' in database name '%s'\n", DB_name ); |
|---|
| 273 | printf( "output file will be named '%s'\n", output_DB_name.c_str() ); |
|---|
| 274 | } |
|---|
| 275 | PS_Database *ps_db = new PS_Database( output_DB_name.c_str(), PS_Database::WRITEONLY ); |
|---|
| 276 | |
|---|
| 277 | // |
|---|
| 278 | // copy mappings |
|---|
| 279 | // |
|---|
| 280 | ps_db->setMappings( __NAME2ID_MAP, __ID2NAME_MAP ); |
|---|
| 281 | |
|---|
| 282 | // |
|---|
| 283 | // extract data from ARB database |
|---|
| 284 | // |
|---|
| 285 | times( &before ); |
|---|
| 286 | printf( "extracting probe-data...\n" ); |
|---|
| 287 | PS_detect_probe_length( group_tree ); |
|---|
| 288 | printf( "probe_length = %d\n",__PROBE_LENGTH ); |
|---|
| 289 | |
|---|
| 290 | __ROOT = ps_db->getRootNode(); |
|---|
| 291 | first_level_node = PS_get_first_node( group_tree ); |
|---|
| 292 | unsigned int c = 0; |
|---|
| 293 | IDVector *inverse_path = new IDVector; |
|---|
| 294 | struct tms before_first_level_node; |
|---|
| 295 | for (; first_level_node; ++c) { |
|---|
| 296 | if (c % 100 == 0) { |
|---|
| 297 | times( &before_first_level_node ); |
|---|
| 298 | printf( "1st level node #%u ", c+1 ); |
|---|
| 299 | } |
|---|
| 300 | PS_extract_probe_data( first_level_node, species_count, 0, __MIN_ID-1, inverse_path ); |
|---|
| 301 | first_level_node = PS_get_next_node( first_level_node ); |
|---|
| 302 | if (c % 100 == 0) { |
|---|
| 303 | PS_print_time_diff( &before_first_level_node, "this node ", " " ); |
|---|
| 304 | PS_print_time_diff( &before, "total ", "\n" ); |
|---|
| 305 | } |
|---|
| 306 | } |
|---|
| 307 | printf( "done after %u 1st level nodes\n",c ); |
|---|
| 308 | printf( "(enter to continue) " ); |
|---|
| 309 | PS_print_time_diff( &before ); |
|---|
| 310 | // getchar(); |
|---|
| 311 | |
|---|
| 312 | // |
|---|
| 313 | // write database to file |
|---|
| 314 | // |
|---|
| 315 | times( &before ); |
|---|
| 316 | printf( "writing probe-data to %s..\n",output_DB_name.c_str() ); |
|---|
| 317 | ps_db->save(); |
|---|
| 318 | printf( "..done saving (enter to continue) " ); |
|---|
| 319 | PS_print_time_diff( &before ); |
|---|
| 320 | |
|---|
| 321 | delete inverse_path; |
|---|
| 322 | delete ps_db; |
|---|
| 323 | before.tms_utime = 0; |
|---|
| 324 | before.tms_stime = 0; |
|---|
| 325 | printf( "total " ); |
|---|
| 326 | PS_print_time_diff( &before ); |
|---|
| 327 | return 0; |
|---|
| 328 | } |
|---|