| 1 | // =============================================================== // |
|---|
| 2 | // // |
|---|
| 3 | // File : ps_get_probes.cxx // |
|---|
| 4 | // Purpose : // |
|---|
| 5 | // // |
|---|
| 6 | // Coded by Wolfram Foerster in October 2002 // |
|---|
| 7 | // Institute of Microbiology (Technical University Munich) // |
|---|
| 8 | // http://www.arb-home.de/ // |
|---|
| 9 | // // |
|---|
| 10 | // =============================================================== // |
|---|
| 11 | |
|---|
| 12 | #include "ps_pg_tree_functions.hxx" |
|---|
| 13 | #include "ps_filebuffer.hxx" |
|---|
| 14 | |
|---|
| 15 | #include <sys/times.h> |
|---|
| 16 | |
|---|
| 17 | // GLOBALS |
|---|
| 18 | |
|---|
| 19 | static char *__ARB_DB_NAME = NULp; |
|---|
| 20 | static GBDATA *__ARB_DB = NULp; |
|---|
| 21 | static GBDATA *__ARB_GROUP_TREE = NULp; |
|---|
| 22 | static GB_ERROR __ARB_ERROR = NULp; |
|---|
| 23 | |
|---|
| 24 | |
|---|
| 25 | bool PS_get_probe_for_path(IDSet& _path, unsigned int _GC_content, unsigned int _probe_length, char *_probe_data) { |
|---|
| 26 | GB_transaction ta(__ARB_DB); |
|---|
| 27 | SpeciesID num; |
|---|
| 28 | GBDATA *data; |
|---|
| 29 | GBDATA *arb_node; |
|---|
| 30 | GBDATA *arb_group; |
|---|
| 31 | IDSetCIter id; |
|---|
| 32 | unsigned int gc_content; |
|---|
| 33 | |
|---|
| 34 | // iterate over path |
|---|
| 35 | arb_node = __ARB_GROUP_TREE; |
|---|
| 36 | for (id = _path.begin(); |
|---|
| 37 | (id != _path.end()) && arb_node; |
|---|
| 38 | ++id) { |
|---|
| 39 | |
|---|
| 40 | // search for arb-node with num == id |
|---|
| 41 | arb_node = PS_get_first_node(arb_node); if (!arb_node) break; |
|---|
| 42 | data = GB_entry(arb_node, "num"); |
|---|
| 43 | num = atoi(GB_read_char_pntr(data)); |
|---|
| 44 | while (num != *id) { |
|---|
| 45 | // get next node |
|---|
| 46 | arb_node = PS_get_next_node(arb_node); |
|---|
| 47 | if (!arb_node) break; |
|---|
| 48 | // get num of arb-node |
|---|
| 49 | data = GB_entry(arb_node, "num"); |
|---|
| 50 | num = atoi(GB_read_char_pntr(data)); |
|---|
| 51 | } |
|---|
| 52 | if (!arb_node) break; |
|---|
| 53 | } |
|---|
| 54 | if (!arb_node) { |
|---|
| 55 | printf(" ERROR : failed to get node for ID (%i)\n", *id); |
|---|
| 56 | return false; |
|---|
| 57 | } |
|---|
| 58 | |
|---|
| 59 | // search for probe with GC-content == _GC_content |
|---|
| 60 | arb_group = GB_entry(arb_node, "group"); |
|---|
| 61 | if (!arb_group) { |
|---|
| 62 | printf(" ERROR : failed to get group of node"); |
|---|
| 63 | return false; |
|---|
| 64 | } |
|---|
| 65 | data = PG_get_first_probe(arb_group); |
|---|
| 66 | if (!data) { |
|---|
| 67 | printf(" ERROR : failed to get first probe of group of node"); |
|---|
| 68 | return false; |
|---|
| 69 | } |
|---|
| 70 | while (_probe_data[0] == '\x00') { |
|---|
| 71 | const char *buffer = PG_read_probe(data); // read probe data |
|---|
| 72 | gc_content = 0; |
|---|
| 73 | for (unsigned int i = 0; i < _probe_length; ++i) { // calc GC-content |
|---|
| 74 | if ((buffer[i] == 'C') || (buffer[i] == 'G')) ++gc_content; |
|---|
| 75 | } |
|---|
| 76 | if (gc_content == _GC_content) { // found matching probe ? |
|---|
| 77 | for (unsigned int i = 0; i < _probe_length; ++i) { // store probe data |
|---|
| 78 | _probe_data[i] = buffer[i]; |
|---|
| 79 | } |
|---|
| 80 | } |
|---|
| 81 | else { |
|---|
| 82 | data = PG_get_next_probe(data); // get next probe |
|---|
| 83 | if (!data) break; |
|---|
| 84 | } |
|---|
| 85 | } |
|---|
| 86 | if (!data) { |
|---|
| 87 | printf(" ERROR : failed to find probe with GC-content (%u)\n", _GC_content); |
|---|
| 88 | return false; |
|---|
| 89 | } |
|---|
| 90 | |
|---|
| 91 | return true; |
|---|
| 92 | } |
|---|
| 93 | |
|---|
| 94 | |
|---|
| 95 | // ==================================================== |
|---|
| 96 | // ==================================================== |
|---|
| 97 | int main(int argc, |
|---|
| 98 | char *argv[]) { |
|---|
| 99 | // |
|---|
| 100 | // check arguments |
|---|
| 101 | // |
|---|
| 102 | if (argc < 3) { |
|---|
| 103 | printf("Missing argument\n Usage %s <final candidates paths filename> <prefix to arb-databases>\n", argv[0]); |
|---|
| 104 | printf("Example:\n %s ~/data/850.final_candidates.paths ~/data/ssjun03_Eucarya_850.pg_\n", argv[0]); |
|---|
| 105 | exit(1); |
|---|
| 106 | } |
|---|
| 107 | |
|---|
| 108 | char *final_candidates_paths_filename = argv[1]; |
|---|
| 109 | char *arb_db_name_prefix = argv[2]; |
|---|
| 110 | char *temp_filename = (char *)malloc(strlen(final_candidates_paths_filename)+1+5); |
|---|
| 111 | strcpy(temp_filename, final_candidates_paths_filename); |
|---|
| 112 | strcat(temp_filename, ".temp"); |
|---|
| 113 | unlink(temp_filename); |
|---|
| 114 | printf("Opening temp-file '%s'..\n", temp_filename); |
|---|
| 115 | PS_FileBuffer *temp__file = new PS_FileBuffer(temp_filename, PS_FileBuffer::WRITEONLY); |
|---|
| 116 | |
|---|
| 117 | // |
|---|
| 118 | // candidates |
|---|
| 119 | // |
|---|
| 120 | printf("Opening candidates-paths-file '%s'..\n", final_candidates_paths_filename); |
|---|
| 121 | PS_FileBuffer *paths_file = new PS_FileBuffer(final_candidates_paths_filename, PS_FileBuffer::READONLY); |
|---|
| 122 | unsigned long paths_todo; |
|---|
| 123 | unsigned int probe_length_todo = 0; |
|---|
| 124 | unsigned int probe_buffer_length = 100; |
|---|
| 125 | char *probe_buffer = NULp; |
|---|
| 126 | |
|---|
| 127 | // read count of paths |
|---|
| 128 | paths_file->get_ulong(paths_todo); |
|---|
| 129 | temp__file->put_ulong(paths_todo); |
|---|
| 130 | |
|---|
| 131 | // read used probe lengths |
|---|
| 132 | unsigned int count; |
|---|
| 133 | set<unsigned int> probe_lengths; |
|---|
| 134 | paths_file->get_uint(count); |
|---|
| 135 | temp__file->put_uint(count); |
|---|
| 136 | printf("probe lengths :"); |
|---|
| 137 | for (unsigned int i = 0; i < count; ++i) { |
|---|
| 138 | unsigned int length; |
|---|
| 139 | paths_file->get_uint(length); |
|---|
| 140 | temp__file->put_uint(length); |
|---|
| 141 | probe_lengths.insert(length); |
|---|
| 142 | printf(" %u", length); |
|---|
| 143 | } |
|---|
| 144 | printf("\n"); |
|---|
| 145 | |
|---|
| 146 | // read candidates |
|---|
| 147 | while (paths_todo) { |
|---|
| 148 | printf("\npaths todo (%lu)\n", paths_todo--); |
|---|
| 149 | unsigned int probe_length; |
|---|
| 150 | unsigned int probe_GC_content; |
|---|
| 151 | unsigned int path_length; |
|---|
| 152 | IDSet path; |
|---|
| 153 | SpeciesID id; |
|---|
| 154 | |
|---|
| 155 | // read one candidate |
|---|
| 156 | paths_file->get_uint(probe_length); |
|---|
| 157 | temp__file->put_uint(probe_length); |
|---|
| 158 | paths_file->get_uint(probe_GC_content); |
|---|
| 159 | temp__file->put_uint(probe_GC_content); |
|---|
| 160 | printf(" probe length (%u) GC (%u)\n", probe_length, probe_GC_content); |
|---|
| 161 | paths_file->get_uint(path_length); |
|---|
| 162 | temp__file->put_uint(path_length); |
|---|
| 163 | printf(" path size (%u) ( ", path_length); |
|---|
| 164 | for (unsigned int i = 0; i < path_length; ++i) { |
|---|
| 165 | paths_file->get_int(id); |
|---|
| 166 | temp__file->put_int(id); |
|---|
| 167 | path.insert(id); |
|---|
| 168 | printf("%i ", id); |
|---|
| 169 | } |
|---|
| 170 | printf(")\n"); |
|---|
| 171 | if (!probe_buffer || (probe_length > probe_buffer_length)) { |
|---|
| 172 | if (probe_buffer) { // adjust buffer size |
|---|
| 173 | free(probe_buffer); |
|---|
| 174 | probe_buffer_length = 2 * probe_length; |
|---|
| 175 | } |
|---|
| 176 | probe_buffer = (char*)calloc(probe_buffer_length, sizeof(char)); |
|---|
| 177 | } |
|---|
| 178 | paths_file->get(probe_buffer, probe_length); |
|---|
| 179 | probe_buffer[probe_length] = '\x00'; |
|---|
| 180 | |
|---|
| 181 | // handle probe |
|---|
| 182 | if (probe_buffer[0] == '\x00') { |
|---|
| 183 | if (!probe_length_todo) { |
|---|
| 184 | printf("handling probes of length %u this time\n", probe_length); |
|---|
| 185 | probe_length_todo = probe_length; |
|---|
| 186 | // init. arb-db-filename |
|---|
| 187 | // +1 for \0, +7 for 'tmp.arb', +5 for max number of digits of unsigned int |
|---|
| 188 | __ARB_DB_NAME = (char*)malloc(strlen(arb_db_name_prefix)+1+7+5); |
|---|
| 189 | sprintf(__ARB_DB_NAME, "%s%utmp.arb", arb_db_name_prefix, probe_length); |
|---|
| 190 | printf("Opening ARB-Database '%s'..\n ", __ARB_DB_NAME); |
|---|
| 191 | __ARB_DB = GB_open(__ARB_DB_NAME, "rN"); |
|---|
| 192 | if (!__ARB_DB) { |
|---|
| 193 | __ARB_ERROR = GB_await_error(); |
|---|
| 194 | printf("%s\n", __ARB_ERROR); |
|---|
| 195 | return 1; |
|---|
| 196 | } |
|---|
| 197 | GB_transaction ta(__ARB_DB); |
|---|
| 198 | __ARB_GROUP_TREE = GB_entry(__ARB_DB, "group_tree"); |
|---|
| 199 | if (!__ARB_GROUP_TREE) { |
|---|
| 200 | printf("no 'group_tree' in database\n"); |
|---|
| 201 | return 1; |
|---|
| 202 | } |
|---|
| 203 | GBDATA *first_level_node = PS_get_first_node(__ARB_GROUP_TREE); |
|---|
| 204 | if (!first_level_node) { |
|---|
| 205 | printf("no 'node' found in group_tree\n"); |
|---|
| 206 | return 1; |
|---|
| 207 | } |
|---|
| 208 | } |
|---|
| 209 | if (probe_length_todo == probe_length) { |
|---|
| 210 | if (!PS_get_probe_for_path(path, probe_GC_content, probe_length, probe_buffer)) { |
|---|
| 211 | delete temp__file; |
|---|
| 212 | return 1; |
|---|
| 213 | } |
|---|
| 214 | printf(" probe data (%s) ==> updated\n", probe_buffer); |
|---|
| 215 | } |
|---|
| 216 | else { |
|---|
| 217 | printf(" probe data (%s) --> skipped\n", probe_buffer); |
|---|
| 218 | } |
|---|
| 219 | } |
|---|
| 220 | else { |
|---|
| 221 | printf(" probe data (%s) --> finished\n", probe_buffer); |
|---|
| 222 | } |
|---|
| 223 | temp__file->put(probe_buffer, probe_length); |
|---|
| 224 | } |
|---|
| 225 | probe_lengths.clear(); |
|---|
| 226 | paths_file->get_uint(count); |
|---|
| 227 | for (unsigned int i = 0; i < count; ++i) { |
|---|
| 228 | unsigned int length; |
|---|
| 229 | paths_file->get_uint(length); |
|---|
| 230 | if (length != probe_length_todo) probe_lengths.insert(length); |
|---|
| 231 | } |
|---|
| 232 | printf("remaining probe lengths :"); |
|---|
| 233 | temp__file->put_uint(probe_lengths.size()); |
|---|
| 234 | for (set<unsigned int>::iterator length = probe_lengths.begin(); |
|---|
| 235 | length != probe_lengths.end(); |
|---|
| 236 | ++length) { |
|---|
| 237 | temp__file->put_uint(*length); |
|---|
| 238 | printf(" %u", *length); |
|---|
| 239 | } |
|---|
| 240 | printf("\n"); |
|---|
| 241 | if (probe_buffer) free(probe_buffer); |
|---|
| 242 | if (__ARB_DB_NAME) free(__ARB_DB_NAME); |
|---|
| 243 | |
|---|
| 244 | printf("cleaning up... temp-file\n"); fflush(stdout); |
|---|
| 245 | delete temp__file; |
|---|
| 246 | printf("cleaning up... candidates-paths-file\n"); fflush(stdout); |
|---|
| 247 | delete paths_file; |
|---|
| 248 | printf("moving temp-file to candiates-paths-file\n"); fflush(stdout); |
|---|
| 249 | rename(temp_filename, final_candidates_paths_filename); |
|---|
| 250 | |
|---|
| 251 | // exit code == 0 if all probe lengths handled |
|---|
| 252 | // exit code == 1 if failure |
|---|
| 253 | // exit code >= 2 else |
|---|
| 254 | return probe_lengths.size()*2; |
|---|
| 255 | } |
|---|