1 | #include <stdio.h> |
---|
2 | #include <stdlib.h> |
---|
3 | #include <string.h> |
---|
4 | #include <sys/times.h> |
---|
5 | |
---|
6 | #ifndef PS_PG_TREE_FUNCTIONS |
---|
7 | #include "ps_pg_tree_functions.cxx" |
---|
8 | #endif |
---|
9 | |
---|
10 | #ifndef PS_FILEBUFFER_HXX |
---|
11 | #include "ps_filebuffer.hxx" |
---|
12 | #endif |
---|
13 | |
---|
14 | // GLOBALS |
---|
15 | |
---|
16 | char *__ARB_DB_NAME = 0; |
---|
17 | GBDATA *__ARB_DB = 0; |
---|
18 | GBDATA *__ARB_GROUP_TREE = 0; |
---|
19 | GB_ERROR __ARB_ERROR = 0; |
---|
20 | |
---|
21 | |
---|
22 | bool PS_get_probe_for_path( IDSet& _path, unsigned int _GC_content, unsigned int _probe_length, char *_probe_data ) { |
---|
23 | GB_transaction dummy( __ARB_DB ); |
---|
24 | SpeciesID num; |
---|
25 | GBDATA *data; |
---|
26 | GBDATA *arb_node; |
---|
27 | GBDATA *arb_group; |
---|
28 | IDSetCIter id; |
---|
29 | unsigned int gc_content; |
---|
30 | |
---|
31 | // iterate over path |
---|
32 | arb_node = __ARB_GROUP_TREE; |
---|
33 | for ( id = _path.begin(); |
---|
34 | (id != _path.end()) && arb_node; |
---|
35 | ++id ) { |
---|
36 | |
---|
37 | // search for arb-node with num == id |
---|
38 | arb_node = PS_get_first_node( arb_node ); if (!arb_node) break; |
---|
39 | data = GB_entry(arb_node, "num"); |
---|
40 | num = atoi( GB_read_char_pntr( data ) ); |
---|
41 | while (num != *id) { |
---|
42 | // get next node |
---|
43 | arb_node = PS_get_next_node( arb_node ); |
---|
44 | if (!arb_node) break; |
---|
45 | // get num of arb-node |
---|
46 | data = GB_entry(arb_node, "num"); |
---|
47 | num = atoi( GB_read_char_pntr( data ) ); |
---|
48 | } |
---|
49 | if (!arb_node) break; |
---|
50 | } |
---|
51 | if (!arb_node) { |
---|
52 | printf( " ERROR : failed to get node for ID (%i)\n", *id ); |
---|
53 | return false; |
---|
54 | } |
---|
55 | |
---|
56 | // search for probe with GC-content == _GC_content |
---|
57 | arb_group = GB_entry(arb_node, "group"); |
---|
58 | if (!arb_group) { |
---|
59 | printf( " ERROR : failed to get group of node" ); |
---|
60 | return false; |
---|
61 | } |
---|
62 | data = PG_get_first_probe( arb_group ); |
---|
63 | if (!data) { |
---|
64 | printf( " ERROR : failed to get first probe of group of node" ); |
---|
65 | return false; |
---|
66 | } |
---|
67 | while (_probe_data[0] == '\x00') { |
---|
68 | const char *buffer = PG_read_probe( data ); // read probe data |
---|
69 | gc_content = 0; |
---|
70 | for ( unsigned int i = 0; i < _probe_length; ++i ) { // calc GC-content |
---|
71 | if ((buffer[i] == 'C') || (buffer[i] == 'G')) ++gc_content; |
---|
72 | } |
---|
73 | if (gc_content == _GC_content) { // found matching probe ? |
---|
74 | for ( unsigned int i = 0; i < _probe_length; ++i ) { // store probe data |
---|
75 | _probe_data[i] = buffer[i]; |
---|
76 | } |
---|
77 | } else { |
---|
78 | data = PG_get_next_probe( data ); // get next probe |
---|
79 | if (!data) break; |
---|
80 | } |
---|
81 | } |
---|
82 | if (!data) { |
---|
83 | printf( " ERROR : failed to find probe with GC-content (%u)\n", _GC_content ); |
---|
84 | return false; |
---|
85 | } |
---|
86 | |
---|
87 | return true; |
---|
88 | } |
---|
89 | |
---|
90 | |
---|
91 | // ==================================================== |
---|
92 | // ==================================================== |
---|
93 | int main( int argc, |
---|
94 | char *argv[] ) { |
---|
95 | // |
---|
96 | // check arguments |
---|
97 | // |
---|
98 | if (argc < 3) { |
---|
99 | printf( "Missing argument\n Usage %s <final candidates paths filename> <prefix to arb-databases>\n", argv[0] ); |
---|
100 | printf( "Example:\n %s ~/data/850.final_candidates.paths ~/data/ssjun03_Eucarya_850.pg_\n", argv[0] ); |
---|
101 | exit( 1 ); |
---|
102 | } |
---|
103 | |
---|
104 | char *final_candidates_paths_filename = argv[1]; |
---|
105 | char *arb_db_name_prefix = argv[2]; |
---|
106 | char *temp_filename = (char *)malloc( strlen(final_candidates_paths_filename)+1+5 ); |
---|
107 | strcpy( temp_filename, final_candidates_paths_filename ); |
---|
108 | strcat( temp_filename, ".temp" ); |
---|
109 | unlink( temp_filename ); |
---|
110 | printf( "Opening temp-file '%s'..\n", temp_filename ); |
---|
111 | PS_FileBuffer *temp__file = new PS_FileBuffer( temp_filename, PS_FileBuffer::WRITEONLY ); |
---|
112 | |
---|
113 | // |
---|
114 | // candidates |
---|
115 | // |
---|
116 | printf( "Opening candidates-paths-file '%s'..\n", final_candidates_paths_filename ); |
---|
117 | PS_FileBuffer *paths_file = new PS_FileBuffer( final_candidates_paths_filename, PS_FileBuffer::READONLY ); |
---|
118 | unsigned long paths_todo; |
---|
119 | unsigned int probe_length_todo = 0; |
---|
120 | unsigned int probe_buffer_length = 100; |
---|
121 | char *probe_buffer = 0; |
---|
122 | |
---|
123 | // read count of paths |
---|
124 | paths_file->get_ulong( paths_todo ); |
---|
125 | temp__file->put_ulong( paths_todo ); |
---|
126 | |
---|
127 | // read used probe lengths |
---|
128 | unsigned int count; |
---|
129 | set<unsigned int> probe_lengths; |
---|
130 | paths_file->get_uint( count ); |
---|
131 | temp__file->put_uint( count ); |
---|
132 | printf( "probe lengths :" ); |
---|
133 | for ( unsigned int i = 0; i < count; ++i ) { |
---|
134 | unsigned int length; |
---|
135 | paths_file->get_uint( length ); |
---|
136 | temp__file->put_uint( length ); |
---|
137 | probe_lengths.insert( length ); |
---|
138 | printf( " %u", length ); |
---|
139 | } |
---|
140 | printf( "\n" ); |
---|
141 | |
---|
142 | // read candidates |
---|
143 | while (paths_todo) { |
---|
144 | printf( "\npaths todo (%lu)\n", paths_todo-- ); |
---|
145 | unsigned int probe_length; |
---|
146 | unsigned int probe_GC_content; |
---|
147 | unsigned int path_length; |
---|
148 | IDSet path; |
---|
149 | SpeciesID id; |
---|
150 | |
---|
151 | // read one candidate |
---|
152 | paths_file->get_uint( probe_length ); |
---|
153 | temp__file->put_uint( probe_length ); |
---|
154 | paths_file->get_uint( probe_GC_content ); |
---|
155 | temp__file->put_uint( probe_GC_content ); |
---|
156 | printf( " probe length (%u) GC (%u)\n", probe_length, probe_GC_content ); |
---|
157 | paths_file->get_uint( path_length ); |
---|
158 | temp__file->put_uint( path_length ); |
---|
159 | printf( " path size (%u) ( ", path_length ); |
---|
160 | for ( unsigned int i = 0; i < path_length; ++i ) { |
---|
161 | paths_file->get_int( id ); |
---|
162 | temp__file->put_int( id ); |
---|
163 | path.insert( id ); |
---|
164 | printf( "%i ", id ); |
---|
165 | } |
---|
166 | printf( ")\n" ); |
---|
167 | if (!probe_buffer || (probe_length > probe_buffer_length)) { |
---|
168 | if (probe_buffer) { // adjust buffer size |
---|
169 | free( probe_buffer ); |
---|
170 | probe_buffer_length = 2 * probe_length; |
---|
171 | } |
---|
172 | probe_buffer = (char*)calloc( probe_buffer_length,sizeof(char) ); |
---|
173 | } |
---|
174 | paths_file->get( probe_buffer, probe_length ); |
---|
175 | probe_buffer[ probe_length ] = '\x00'; |
---|
176 | |
---|
177 | // handle probe |
---|
178 | if (probe_buffer[0] == '\x00') { |
---|
179 | if (!probe_length_todo) { |
---|
180 | printf( "handling probes of length %u this time\n", probe_length ); |
---|
181 | probe_length_todo = probe_length; |
---|
182 | // init. arb-db-filename |
---|
183 | // +1 for \0, +7 for 'tmp.arb', +5 for max number of digits of unsigned int |
---|
184 | __ARB_DB_NAME = (char*)malloc( strlen(arb_db_name_prefix)+1+7+5 ); |
---|
185 | sprintf( __ARB_DB_NAME, "%s%utmp.arb", arb_db_name_prefix, probe_length ); |
---|
186 | printf( "Opening ARB-Database '%s'..\n ", __ARB_DB_NAME ); |
---|
187 | __ARB_DB = GB_open( __ARB_DB_NAME, "rN" ); |
---|
188 | if (!__ARB_DB) { |
---|
189 | __ARB_ERROR = GB_await_error(); |
---|
190 | printf( "%s\n", __ARB_ERROR ); |
---|
191 | return 1; |
---|
192 | } |
---|
193 | GB_transaction dummy( __ARB_DB ); |
---|
194 | __ARB_GROUP_TREE = GB_entry(__ARB_DB, "group_tree"); |
---|
195 | if (!__ARB_GROUP_TREE) { |
---|
196 | printf( "no 'group_tree' in database\n" ); |
---|
197 | return 1; |
---|
198 | } |
---|
199 | GBDATA *first_level_node = PS_get_first_node( __ARB_GROUP_TREE ); |
---|
200 | if (!first_level_node) { |
---|
201 | printf( "no 'node' found in group_tree\n" ); |
---|
202 | return 1; |
---|
203 | } |
---|
204 | } |
---|
205 | if (probe_length_todo == probe_length) { |
---|
206 | if (!PS_get_probe_for_path( path, probe_GC_content, probe_length, probe_buffer )) { |
---|
207 | delete temp__file; |
---|
208 | return 1; |
---|
209 | } |
---|
210 | printf( " probe data (%s) ==> updated\n", probe_buffer ); |
---|
211 | } else { |
---|
212 | printf( " probe data (%s) --> skipped\n", probe_buffer ); |
---|
213 | } |
---|
214 | } else { |
---|
215 | printf( " probe data (%s) --> finished\n", probe_buffer ); |
---|
216 | } |
---|
217 | temp__file->put( probe_buffer, probe_length ); |
---|
218 | } |
---|
219 | probe_lengths.clear(); |
---|
220 | paths_file->get_uint( count ); |
---|
221 | for ( unsigned int i = 0; i < count; ++i ) { |
---|
222 | unsigned int length; |
---|
223 | paths_file->get_uint( length ); |
---|
224 | if (length != probe_length_todo) probe_lengths.insert( length ); |
---|
225 | } |
---|
226 | printf( "remaining probe lengths :" ); |
---|
227 | temp__file->put_uint( probe_lengths.size() ); |
---|
228 | for ( set<unsigned int>::iterator length = probe_lengths.begin(); |
---|
229 | length != probe_lengths.end(); |
---|
230 | ++length ) { |
---|
231 | temp__file->put_uint( *length ); |
---|
232 | printf( " %u", *length ); |
---|
233 | } |
---|
234 | printf( "\n" ); |
---|
235 | if (probe_buffer) free( probe_buffer ); |
---|
236 | if (__ARB_DB_NAME) free( __ARB_DB_NAME ); |
---|
237 | |
---|
238 | printf( "cleaning up... temp-file\n" ); fflush( stdout ); |
---|
239 | delete temp__file; |
---|
240 | printf( "cleaning up... candidates-paths-file\n" ); fflush( stdout ); |
---|
241 | delete paths_file; |
---|
242 | printf( "moving temp-file to candiates-paths-file\n" ); fflush( stdout ); |
---|
243 | rename( temp_filename, final_candidates_paths_filename ); |
---|
244 | |
---|
245 | // exit code == 0 if all probe lengths handled |
---|
246 | // exit code == 1 if failure |
---|
247 | // exit code >= 2 else |
---|
248 | return probe_lengths.size()*2; |
---|
249 | } |
---|