// =============================================================== //
//                                                                 //
//   File      : arb_gene_probe.cxx                                //
//   Purpose   : Prepares a genome database for Gene-PT-Server     //
//                                                                 //
//   Institute of Microbiology (Technical University Munich)       //
//   http://www.arb-home.de/                                       //
//                                                                 //
// =============================================================== //
10 | |
---|
#include <arbdbt.h>
#include <adGene.h>

#include <functional>
#include <list>
#include <map>
#include <set>
#include <string>

#include <unistd.h>
#include <sys/types.h>
21 | |
---|
// Assertion used throughout this file; simply forwards to arb_assert().
#define gp_assert(cond) arb_assert(cond)

using namespace std;

// Optional diagnostics for debug builds (uncomment to activate).
// DUMP_OVERLAP_CALC compiles the dump() helpers of PositionPair and
// GenePositionMap; CREATE_DEBUG_FILES is not referenced in the code visible here.
#if defined(DEBUG)
// #define CREATE_DEBUG_FILES
// #define DUMP_OVERLAP_CALC
#endif // DEBUG
30 | |
---|
31 | // -------------------------------------------------------------------------------- |
---|
32 | |
---|
// Number of entries of each kind created so far (over all organisms).
// Each counter is post-incremented when an entry is created; the value before
// the increment forms the hex suffix of the entry's internal name.
static int gene_counter       = 0; // normal (non-split) genes -> "n<hex>"
static int split_gene_counter = 0; // split genes              -> "s<hex>"
static int intergene_counter  = 0; // intergene regions        -> "i<hex>"
36 | |
---|
// Strict ordering for C-string keys (used as comparator of FullNameMap).
//
// Normally it is sufficient to have any order, as long as it is strict, so the
// regular build orders the keys by address. UNIT_TESTS need a reproducible
// order which does not depend on the memory layout of DB elements; there the
// keys are ordered by their content.
struct nameOrder {
    bool operator()(const char *name1, const char *name2) const {
#if defined(UNIT_TESTS) // UT_DIFF
        return strcmp(name1, name2)<0; // slow, determined by species names
#else
        // fast, but depends on memory layout (e.g. on MEMORY_TEST in gb_memory.h).
        // std::less guarantees a strict total order for arbitrary pointers, whereas
        // subtracting pointers that point into different objects is undefined.
        return std::less<const char*>()(name1, name2);
#endif
    }
};
49 | |
---|
50 | typedef map<const char *, string, nameOrder> FullNameMap; |
---|
51 | static FullNameMap names; |
---|
52 | |
---|
53 | // -------------------------------------------------------------------------------- |
---|
54 | |
---|
// Closed position range [begin .. end] inside a genome.
struct PositionPair {
    int begin; // these positions are in range [0 .. genome_length-1]
    int end;

    static int genome_length; // length of the genome the positions refer to (used by check_legal)

#if defined(DEBUG)
    // Asserts that the range is non-empty and lies completely inside the genome.
    void check_legal() const {
        gp_assert(begin >= 0);
        gp_assert(begin <= end);
        gp_assert(end < genome_length);
    }
#endif // DEBUG

    // The default constructed pair (-1, -1) is a placeholder, not a legal range.
    PositionPair() : begin(-1), end(-1) {}

    PositionPair(int begin_, int end_) : begin(begin_), end(end_) {
#if defined(DEBUG)
        check_legal();
#endif // DEBUG
    }

    // Number of positions covered (both borders are inclusive).
    int length() const { return (end-begin)+1; }

    // True if both ranges share at least one position.
    bool overlapsWith(const PositionPair& other) const {
#if defined(DEBUG)
        check_legal();
        other.check_legal();
#endif // DEBUG
        return (other.begin <= end) && (begin <= other.end);
    }

#if defined(DUMP_OVERLAP_CALC)
    // Prints the range, prefixed with 'note'.
    void dump(const char *note) const {
        printf("%s begin=%i end=%i\n", note, begin, end);
    }
#endif // DUMP_OVERLAP_CALC
};

int PositionPair::genome_length = 0;
94 | |
---|
95 | typedef list<PositionPair> PositionPairList; |
---|
96 | |
---|
97 | struct ltNonOverlap { |
---|
98 | // sorting with this operator identifies all overlapping PositionPair's as "equal" |
---|
99 | bool operator ()(const PositionPair& p1, const PositionPair& p2) const { |
---|
100 | return p1.end < p2.begin; |
---|
101 | } |
---|
102 | }; |
---|
103 | |
---|
104 | class GenePositionMap { |
---|
105 | typedef set<PositionPair, ltNonOverlap> OverlappingGeneSet; |
---|
106 | |
---|
107 | OverlappingGeneSet usedRanges; |
---|
108 | unsigned long overlapSize; |
---|
109 | unsigned long geneSize; |
---|
110 | public: |
---|
111 | GenePositionMap() : overlapSize(0), geneSize(0) {} |
---|
112 | |
---|
113 | void announceGene(PositionPair gene); |
---|
114 | GB_ERROR buildIntergeneList(const PositionPair& wholeGenome, PositionPairList& intergeneList) const; |
---|
115 | unsigned long getOverlap() const { return overlapSize; } |
---|
116 | unsigned long getAllGeneSize() const { return geneSize; } |
---|
117 | |
---|
118 | #if defined(DUMP_OVERLAP_CALC) |
---|
119 | void dump() const; |
---|
120 | #endif // DUMP_OVERLAP_CALC |
---|
121 | }; |
---|
122 | |
---|
123 | // ____________________________________________________________ |
---|
124 | // start of implementation of class GenePositionMap: |
---|
125 | |
---|
126 | void GenePositionMap::announceGene(PositionPair gene) { |
---|
127 | OverlappingGeneSet::iterator found = usedRanges.find(gene); |
---|
128 | if (found == usedRanges.end()) { // gene does not overlap with currently known ranges |
---|
129 | usedRanges.insert(gene); // add to known ranges |
---|
130 | } |
---|
131 | else { |
---|
132 | // 'found' overlaps with 'gene' |
---|
133 | int gene_length = gene.length(); |
---|
134 | |
---|
135 | do { |
---|
136 | gp_assert(gene.overlapsWith(*found)); |
---|
137 | |
---|
138 | gene = PositionPair(min(found->begin, gene.begin), max(found->end, gene.end)); // calc combined range |
---|
139 | int combined_length = gene.length(); |
---|
140 | |
---|
141 | size_t overlap = (found->length()+gene_length)-combined_length; |
---|
142 | overlapSize += overlap; |
---|
143 | geneSize += gene_length; |
---|
144 | |
---|
145 | usedRanges.erase(found); |
---|
146 | |
---|
147 | gene_length = combined_length; |
---|
148 | found = usedRanges.find(gene); // search for further overlaps |
---|
149 | } while (found != usedRanges.end()); |
---|
150 | |
---|
151 | usedRanges.insert(gene); // insert the combined range |
---|
152 | } |
---|
153 | } |
---|
154 | |
---|
155 | GB_ERROR GenePositionMap::buildIntergeneList(const PositionPair& wholeGenome, PositionPairList& intergeneList) const { |
---|
156 | OverlappingGeneSet::iterator end = usedRanges.end(); |
---|
157 | OverlappingGeneSet::iterator curr = usedRanges.begin(); |
---|
158 | OverlappingGeneSet::iterator prev = end; |
---|
159 | |
---|
160 | if (curr == end) { // nothing defined -> use whole genome as one big intergene |
---|
161 | intergeneList.push_back(wholeGenome); |
---|
162 | } |
---|
163 | else { |
---|
164 | if (curr->begin > wholeGenome.begin) { // intergene before first gene range ? |
---|
165 | intergeneList.push_back(PositionPair(wholeGenome.begin, curr->begin-1)); |
---|
166 | } |
---|
167 | |
---|
168 | prev = curr; ++curr; |
---|
169 | |
---|
170 | while (curr != end) { |
---|
171 | if (prev->end < curr->begin) { |
---|
172 | if (prev->end != (curr->begin-1)) { // not directly adjacent |
---|
173 | intergeneList.push_back(PositionPair(prev->end+1, curr->begin-1)); |
---|
174 | } |
---|
175 | } |
---|
176 | else { |
---|
177 | return "Internal error: Overlapping gene ranges"; |
---|
178 | } |
---|
179 | |
---|
180 | prev = curr; ++curr; |
---|
181 | } |
---|
182 | |
---|
183 | if (prev != end && prev->end < wholeGenome.end) { |
---|
184 | intergeneList.push_back(PositionPair(prev->end+1, wholeGenome.end)); |
---|
185 | } |
---|
186 | } |
---|
187 | return NULp; |
---|
188 | } |
---|
189 | |
---|
#if defined(DUMP_OVERLAP_CALC)
// Diagnostic output: lists all stored ranges, followed by the overlap total.
void GenePositionMap::dump() const {
    printf("List of ranges used by genes:\n");

    const OverlappingGeneSet::const_iterator ranges_end = usedRanges.end();
    for (OverlappingGeneSet::const_iterator range = usedRanges.begin(); range != ranges_end; ++range) {
        range->dump("- ");
    }

    printf("Overlap: %lu bases\n", getOverlap());
}
#endif // DUMP_OVERLAP_CALC
199 | |
---|
200 | // -end- of implementation of class GenePositionMap. |
---|
201 | |
---|
202 | static GB_ERROR create_data_entry(GBDATA *gb_species2, const char *sequence, int seqlen) { |
---|
203 | GB_ERROR error = NULp; |
---|
204 | char *gene_sequence = new char[seqlen+1]; |
---|
205 | |
---|
206 | memcpy(gene_sequence, sequence, seqlen); // @@@ FIXME: avoid this copy! |
---|
207 | gene_sequence[seqlen] = 0; |
---|
208 | |
---|
209 | GBDATA *gb_ali = GB_create_container(gb_species2, "ali_ptgene"); |
---|
210 | if (!gb_ali) error = GB_await_error(); |
---|
211 | else error = GBT_write_string(gb_ali, "data", gene_sequence); |
---|
212 | |
---|
213 | delete [] gene_sequence; |
---|
214 | return error; |
---|
215 | } |
---|
216 | |
---|
#if defined(DEBUG)
// Asserts that 'name' contains no unescaped ';'.
// A backslash escapes the character behind it (so "\\;" and "\\\\" are fine).
// Compiles to nothing in non-DEBUG builds.
static void CHECK_SEMI_ESCAPED(const char *name) {
    while (*name) {
        gp_assert(*name != ';'); // oops, unescaped ';'
        // skip the escaped character - but never step over the terminating 0
        // (a trailing single backslash would otherwise make the scan run past the string)
        if (name[0] == '\\' && name[1]) ++name;
        ++name;
    }
}
#else
#define CHECK_SEMI_ESCAPED(s)
#endif // DEBUG
229 | |
---|
230 | |
---|
231 | static GBDATA *create_gene_species(GBDATA *gb_species_data2, const char *internal_name, const char *long_name, int abspos, const char *sequence, int length) { |
---|
232 | // Note: 'sequence' is not necessarily 0-terminated! |
---|
233 | |
---|
234 | #if defined(DEBUG) |
---|
235 | const char *firstSem = strchr(long_name, ';'); |
---|
236 | gp_assert(firstSem); |
---|
237 | CHECK_SEMI_ESCAPED(firstSem+1); |
---|
238 | #endif // DEBUG |
---|
239 | |
---|
240 | GB_ERROR error = GB_push_transaction(gb_species_data2); |
---|
241 | GBDATA *gb_species2 = NULp; |
---|
242 | |
---|
243 | if (!error) { |
---|
244 | gb_species2 = GB_create_container(gb_species_data2, "species"); |
---|
245 | if (!gb_species2) error = GB_await_error(); |
---|
246 | } |
---|
247 | |
---|
248 | if (!error) { |
---|
249 | GBDATA *gb_name = GB_create(gb_species2, "name", GB_STRING); |
---|
250 | |
---|
251 | if (!gb_name) error = GB_await_error(); |
---|
252 | else { |
---|
253 | error = GB_write_string(gb_name, internal_name); |
---|
254 | if (!error) { |
---|
255 | const char *static_internal_name = GB_read_char_pntr(gb_name); // use static copy from db as map-index (internal_name is temporary) |
---|
256 | error = create_data_entry(gb_species2, sequence, length); |
---|
257 | if (!error) { |
---|
258 | names[static_internal_name] = long_name; |
---|
259 | error = GBT_write_int(gb_species2, "abspos", abspos); |
---|
260 | } |
---|
261 | } |
---|
262 | } |
---|
263 | } |
---|
264 | |
---|
265 | error = GB_end_transaction(gb_species_data2, error); |
---|
266 | |
---|
267 | if (error) { // be more verbose : |
---|
268 | error = GBS_global_string("%s (internal_name='%s', long_name='%s')", error, internal_name, long_name); |
---|
269 | GB_export_error(error); |
---|
270 | gb_species2 = NULp; |
---|
271 | } |
---|
272 | |
---|
273 | return gb_species2; |
---|
274 | } |
---|
275 | |
---|
276 | static GB_ERROR create_genelike_entry(const char *internal_name, GBDATA *gb_species_data2, int start_pos, int end_pos, const char *ali_genome, const char *long_name) { |
---|
277 | GBDATA *gb_genespecies = create_gene_species(gb_species_data2, internal_name, long_name, start_pos, ali_genome+start_pos, end_pos-start_pos+1); |
---|
278 | return gb_genespecies ? NULp : GB_await_error(); |
---|
279 | } |
---|
280 | |
---|
281 | static GB_ERROR create_intergene(GBDATA *gb_species_data2, int start_pos, int end_pos, const char *ali_genome, const char *long_gene_name) { |
---|
282 | if (start_pos <= end_pos) { |
---|
283 | char internal_name[128]; |
---|
284 | sprintf(internal_name, "i%x", intergene_counter++); |
---|
285 | return create_genelike_entry(internal_name, gb_species_data2, start_pos, end_pos, ali_genome, long_gene_name); |
---|
286 | } |
---|
287 | return "Illegal inter-gene positions (start behind end)"; |
---|
288 | } |
---|
289 | |
---|
290 | static GB_ERROR create_gene(GBDATA *gb_species_data2, int start_pos, int end_pos, const char *ali_genome, const char *long_gene_name) { |
---|
291 | if (start_pos <= end_pos) { |
---|
292 | char internal_name[128]; |
---|
293 | sprintf(internal_name, "n%x", gene_counter++); |
---|
294 | return create_genelike_entry(internal_name, gb_species_data2, start_pos, end_pos, ali_genome, long_gene_name); |
---|
295 | } |
---|
296 | return "Illegal gene positions (start behind end)"; |
---|
297 | } |
---|
298 | |
---|
299 | static GB_ERROR create_split_gene(GBDATA *gb_species_data2, PositionPairList& part_list, const char *ali_genome, const char *long_gene_name) { |
---|
300 | GB_ERROR error = NULp; |
---|
301 | PositionPairList::iterator list_end = part_list.end(); |
---|
302 | |
---|
303 | int gene_size = 0; |
---|
304 | for (PositionPairList::iterator part = part_list.begin(); part != list_end; ++part) { |
---|
305 | int part_size = part->end-part->begin+1; |
---|
306 | gp_assert(part_size > 0); |
---|
307 | gene_size += part_size; |
---|
308 | } |
---|
309 | gp_assert(gene_size > 0); |
---|
310 | char *gene_sequence = new char[gene_size+1]; |
---|
311 | int gene_off = 0; |
---|
312 | |
---|
313 | char *split_pos_list = NULp; // contains split information: 'gene pos of part2,abs pos of part2;gene pos of part3,abs pos of part3;...' |
---|
314 | |
---|
315 | for (PositionPairList::iterator part = part_list.begin(); part != list_end;) { |
---|
316 | int part_size = part->end-part->begin+1; |
---|
317 | int genome_pos = part->begin; |
---|
318 | memcpy(gene_sequence+gene_off, ali_genome+part->begin, part_size); |
---|
319 | gene_off += part_size; |
---|
320 | |
---|
321 | ++part; |
---|
322 | |
---|
323 | if (!split_pos_list) { // first part |
---|
324 | split_pos_list = GBS_global_string_copy("%i", gene_off); // gene offset of part 2 |
---|
325 | } |
---|
326 | else { // next parts |
---|
327 | char *next_split_pos_list; |
---|
328 | if (part != list_end) { // not last |
---|
329 | next_split_pos_list = GBS_global_string_copy("%s,%i;%i", split_pos_list, genome_pos, gene_off); |
---|
330 | } |
---|
331 | else { // last part |
---|
332 | next_split_pos_list = GBS_global_string_copy("%s,%i", split_pos_list, genome_pos); |
---|
333 | } |
---|
334 | freeset(split_pos_list, next_split_pos_list); |
---|
335 | } |
---|
336 | } |
---|
337 | |
---|
338 | char internal_name[128]; |
---|
339 | sprintf(internal_name, "s%x", split_gene_counter++); |
---|
340 | |
---|
341 | const PositionPair& first_part = part_list.front(); |
---|
342 | GBDATA *gb_species2 = create_gene_species(gb_species_data2, internal_name, long_gene_name, first_part.begin, |
---|
343 | gene_sequence, first_part.end-first_part.begin+1); |
---|
344 | |
---|
345 | if (!gb_species2) error = GB_await_error(); |
---|
346 | else { |
---|
347 | #if defined(DEBUG) && 0 |
---|
348 | printf("split gene: long_gene_name='%s' internal_name='%s' split_pos_list='%s'\n", |
---|
349 | long_gene_name, internal_name, split_pos_list); |
---|
350 | #endif // DEBUG |
---|
351 | error = GBT_write_string(gb_species2, "splitpos", split_pos_list); |
---|
352 | } |
---|
353 | |
---|
354 | free(split_pos_list); |
---|
355 | delete [] gene_sequence; |
---|
356 | |
---|
357 | return error; |
---|
358 | } |
---|
359 | |
---|
360 | static GB_ERROR scan_gene_positions(GBDATA *gb_gene, PositionPairList& part_list) { |
---|
361 | GB_ERROR error = NULp; |
---|
362 | GEN_position *location = GEN_read_position(gb_gene); |
---|
363 | |
---|
364 | if (!location) error = GB_await_error(); |
---|
365 | else { |
---|
366 | GEN_sortAndMergeLocationParts(location); |
---|
367 | int parts = location->parts; |
---|
368 | for (int p = 0; p<parts; ++p) { |
---|
369 | part_list.push_back(PositionPair(location->start_pos[p]-1, location->stop_pos[p]-1)); |
---|
370 | } |
---|
371 | GEN_free_position(location); |
---|
372 | } |
---|
373 | return error; |
---|
374 | } |
---|
375 | |
---|
376 | static GB_ERROR insert_genes_of_organism(GBDATA *gb_organism, GBDATA *gb_species_data2) { |
---|
377 | // insert all genes of 'gb_organism' as pseudo-species |
---|
378 | // into new 'species_data' (gb_species_data2) |
---|
379 | |
---|
380 | GB_ERROR error = NULp; |
---|
381 | const char *organism_name = GBT_get_name(gb_organism); |
---|
382 | |
---|
383 | GenePositionMap geneRanges; |
---|
384 | |
---|
385 | int gene_counter_old = gene_counter; // used for statistics only (see end of function) |
---|
386 | int split_gene_counter_old = split_gene_counter; |
---|
387 | int intergene_counter_old = intergene_counter; |
---|
388 | |
---|
389 | GBDATA *gb_ali_genom = GBT_find_sequence(gb_organism, GENOM_ALIGNMENT); |
---|
390 | gp_assert(gb_ali_genom); // existence has to be checked by caller! |
---|
391 | |
---|
392 | const char *ali_genom = GB_read_char_pntr(gb_ali_genom); |
---|
393 | if (!ali_genom) error = GB_await_error(); |
---|
394 | PositionPair::genome_length = GB_read_count(gb_ali_genom); // this affects checks in PositionPair |
---|
395 | |
---|
396 | if (!organism_name && !error) { |
---|
397 | error = "encountered invalid organism (lacks 'name' entry)"; |
---|
398 | } |
---|
399 | |
---|
400 | for (GBDATA *gb_gene = GEN_first_gene(gb_organism); |
---|
401 | gb_gene && !error; |
---|
402 | gb_gene = GEN_next_gene(gb_gene)) |
---|
403 | { |
---|
404 | const char *gene_name = GBT_get_name(gb_gene); |
---|
405 | |
---|
406 | PositionPairList part_list; |
---|
407 | error = scan_gene_positions(gb_gene, part_list); |
---|
408 | |
---|
409 | if (!error && !gene_name) error = "encountered invalid gene (lacks 'name' entry)"; |
---|
410 | if (!error && part_list.empty()) error = "empty position list"; |
---|
411 | if (!error) { |
---|
412 | int split_count = part_list.size(); |
---|
413 | PositionPair first_part = *part_list.begin(); |
---|
414 | |
---|
415 | if (!error) { |
---|
416 | char *esc_gene_name = GBS_escape_string(gene_name, ";", '\\'); |
---|
417 | char *long_gene_name = GBS_global_string_copy("%s;%s", organism_name, esc_gene_name); |
---|
418 | if (split_count == 1) { // normal gene |
---|
419 | error = create_gene(gb_species_data2, first_part.begin, first_part.end, ali_genom, long_gene_name); |
---|
420 | geneRanges.announceGene(first_part); |
---|
421 | } |
---|
422 | else { // split gene |
---|
423 | error = create_split_gene(gb_species_data2, part_list, ali_genom, long_gene_name); |
---|
424 | |
---|
425 | for (PositionPairList::iterator p = part_list.begin(); p != part_list.end(); ++p) { |
---|
426 | geneRanges.announceGene(*p); |
---|
427 | } |
---|
428 | } |
---|
429 | free(long_gene_name); |
---|
430 | free(esc_gene_name); |
---|
431 | } |
---|
432 | } |
---|
433 | |
---|
434 | if (error && gene_name) error = GBS_global_string("in gene '%s': %s", gene_name, error); |
---|
435 | } |
---|
436 | |
---|
437 | if (!error) { // add intergenes |
---|
438 | PositionPairList intergenes; |
---|
439 | PositionPair wholeGenome(0, PositionPair::genome_length-1); |
---|
440 | error = geneRanges.buildIntergeneList(wholeGenome, intergenes); |
---|
441 | |
---|
442 | for (PositionPairList::iterator i = intergenes.begin(); !error && i != intergenes.end(); ++i) { |
---|
443 | char *long_intergene_name = GBS_global_string_copy("%s;intergene_%i_%i", organism_name, i->begin, i->end); |
---|
444 | error = create_intergene(gb_species_data2, i->begin, i->end, ali_genom, long_intergene_name); |
---|
445 | free(long_intergene_name); |
---|
446 | } |
---|
447 | } |
---|
448 | |
---|
449 | if (error && organism_name) error = GBS_global_string("in organism '%s': %s", organism_name, error); |
---|
450 | |
---|
451 | if (!error) { |
---|
452 | int new_genes = gene_counter-gene_counter_old; // only non-split genes |
---|
453 | int new_split_genes = split_gene_counter-split_gene_counter_old; |
---|
454 | int new_intergenes = intergene_counter-intergene_counter_old; |
---|
455 | |
---|
456 | unsigned long genesSize = geneRanges.getAllGeneSize(); |
---|
457 | unsigned long overlaps = geneRanges.getOverlap(); |
---|
458 | double data_grow = overlaps/double(PositionPair::genome_length)*100; |
---|
459 | double gene_overlap = overlaps/double(genesSize)*100; |
---|
460 | |
---|
461 | if (new_split_genes) { |
---|
462 | |
---|
463 | printf(" - %s: %i genes (%i split), %i intergenes", |
---|
464 | organism_name, new_genes+new_split_genes, new_split_genes, new_intergenes); |
---|
465 | } |
---|
466 | else { |
---|
467 | printf(" - %s: %i genes, %i intergenes", |
---|
468 | organism_name, new_genes, new_intergenes); |
---|
469 | } |
---|
470 | printf(" (data grow: %5.2f%%, gene overlap: %5.2f%%=%lu bp)\n", data_grow, gene_overlap, overlaps); |
---|
471 | } |
---|
472 | |
---|
473 | #if defined(DUMP_OVERLAP_CALC) |
---|
474 | geneRanges.dump(); |
---|
475 | #endif // DUMP_OVERLAP_CALC |
---|
476 | |
---|
477 | return error; |
---|
478 | } |
---|
479 | |
---|
480 | int ARB_main(int argc, char *argv[]) { |
---|
481 | |
---|
482 | printf("\n" |
---|
483 | "arb_gene_probe 1.2 -- (C) 2003/2004 The ARB-project\n" |
---|
484 | "written by Tom Littschwager, Bernd Spanfelner, Conny Wolf, Ralf Westram.\n"); |
---|
485 | |
---|
486 | if (argc != 3) { |
---|
487 | printf("Usage: arb_gene_probe input_database output_database\n"); |
---|
488 | printf(" Prepares a genome database for Gene-PT-Server\n"); |
---|
489 | return EXIT_FAILURE; |
---|
490 | } |
---|
491 | |
---|
492 | const char *inputname = argv[1]; |
---|
493 | const char *outputname = argv[2]; |
---|
494 | |
---|
495 | // GBK_terminate("test-crash of arb_gene_probe"); |
---|
496 | |
---|
497 | printf("Converting '%s' -> '%s' ..\n", inputname, outputname); |
---|
498 | |
---|
499 | GB_ERROR error = NULp; |
---|
500 | GB_shell shell; |
---|
501 | GBDATA *gb_main = GB_open(inputname, "rw"); // rootzeiger wird gesetzt |
---|
502 | if (!gb_main) { |
---|
503 | error = GBS_global_string("Database '%s' not found", inputname); |
---|
504 | } |
---|
505 | else { |
---|
506 | GB_request_undo_type(gb_main, GB_UNDO_NONE); // disable arbdb builtin undo |
---|
507 | GB_begin_transaction(gb_main); |
---|
508 | |
---|
509 | GBDATA *gb_species_data = GBT_get_species_data(gb_main); |
---|
510 | GBDATA *gb_species_data_new = GBT_create(gb_main, "species_data", 7); // create a second 'species_data' container |
---|
511 | |
---|
512 | if (!gb_species_data_new) error = GB_await_error(); |
---|
513 | |
---|
514 | int non_ali_genom_species = 0; |
---|
515 | int ali_genom_species = 0; |
---|
516 | |
---|
517 | for (GBDATA *gb_species = GBT_first_species_rel_species_data(gb_species_data); |
---|
518 | gb_species && !error; |
---|
519 | gb_species = GBT_next_species(gb_species)) |
---|
520 | { |
---|
521 | GBDATA *gb_ali_genom = GBT_find_sequence(gb_species, GENOM_ALIGNMENT); |
---|
522 | if (!gb_ali_genom) { |
---|
523 | // skip species w/o alignment 'GENOM_ALIGNMENT' (genome DBs often contain pseudo species) |
---|
524 | ++non_ali_genom_species; |
---|
525 | } |
---|
526 | else { |
---|
527 | error = insert_genes_of_organism(gb_species, gb_species_data_new); |
---|
528 | ++ali_genom_species; |
---|
529 | } |
---|
530 | } |
---|
531 | |
---|
532 | if (non_ali_genom_species) { |
---|
533 | printf("%i species had no alignment in '" GENOM_ALIGNMENT "' and have been skipped.\n", non_ali_genom_species); |
---|
534 | } |
---|
535 | if (!error && ali_genom_species == 0) { |
---|
536 | error = "no species with data in alignment '" GENOM_ALIGNMENT "' were found"; |
---|
537 | } |
---|
538 | |
---|
539 | if (!error) { |
---|
540 | printf("%i species had data in alignment '" GENOM_ALIGNMENT "'.\n" |
---|
541 | "Found %i genes (%i were split) and %i intergene regions.\n", |
---|
542 | ali_genom_species, gene_counter, split_gene_counter, intergene_counter); |
---|
543 | } |
---|
544 | |
---|
545 | if (!error) { |
---|
546 | error = GB_delete(gb_species_data); // delete first (old) 'species_data' container |
---|
547 | } |
---|
548 | |
---|
549 | if (!error) { |
---|
550 | // create map-string |
---|
551 | char* map_string; |
---|
552 | { |
---|
553 | FullNameMap::iterator NameEnd = names.end(); |
---|
554 | FullNameMap::iterator NameIter; |
---|
555 | |
---|
556 | size_t mapsize = 0; |
---|
557 | for (NameIter = names.begin(); NameIter != NameEnd; ++NameIter) { |
---|
558 | mapsize += strlen(NameIter->first)+NameIter->second.length()+2; |
---|
559 | } |
---|
560 | |
---|
561 | map_string = new char[mapsize+1]; |
---|
562 | size_t moff = 0; |
---|
563 | |
---|
564 | for (NameIter = names.begin(); NameIter != NameEnd; ++NameIter) { |
---|
565 | int len1 = strlen(NameIter->first); |
---|
566 | int len2 = NameIter->second.length(); |
---|
567 | |
---|
568 | memcpy(map_string+moff, NameIter->first, len1); |
---|
569 | map_string[moff+len1] = ';'; |
---|
570 | moff += len1+1; |
---|
571 | |
---|
572 | memcpy(map_string+moff, NameIter->second.c_str(), len2); |
---|
573 | map_string[moff+len2] = ';'; |
---|
574 | moff += len2+1; |
---|
575 | } |
---|
576 | map_string[moff] = 0; |
---|
577 | |
---|
578 | gp_assert(moff <= mapsize); |
---|
579 | } |
---|
580 | |
---|
581 | GBDATA *gb_gene_map = GB_create_container(gb_main, "gene_map"); |
---|
582 | if (!gb_gene_map) error = GB_await_error(); |
---|
583 | else error = GBT_write_string(gb_gene_map, "map_string", map_string); |
---|
584 | |
---|
585 | delete [] map_string; |
---|
586 | } |
---|
587 | |
---|
588 | if (!error) { |
---|
589 | // set default alignment for pt_server |
---|
590 | error = GBT_set_default_alignment(gb_main, "ali_ptgene"); |
---|
591 | |
---|
592 | if (!error) { |
---|
593 | GBDATA *gb_use = GB_search(gb_main, "presets/alignment/alignment_name", GB_STRING); |
---|
594 | if (!gb_use) error = GB_await_error(); |
---|
595 | else { |
---|
596 | GB_topSecurityLevel unsecured(gb_main); |
---|
597 | error = GB_write_string(gb_use, "ali_ptgene"); |
---|
598 | } |
---|
599 | } |
---|
600 | } |
---|
601 | |
---|
602 | error = GB_end_transaction(gb_main, error); |
---|
603 | |
---|
604 | if (!error) { |
---|
605 | printf("Saving '%s' ..\n", outputname); |
---|
606 | error = GB_save_as(gb_main, outputname, "bfm"); |
---|
607 | if (error) unlink(outputname); |
---|
608 | } |
---|
609 | |
---|
610 | GB_close(gb_main); |
---|
611 | } |
---|
612 | |
---|
613 | if (error) { |
---|
614 | printf("Error in arb_gene_probe: %s\n", error); |
---|
615 | return EXIT_FAILURE; |
---|
616 | } |
---|
617 | |
---|
618 | printf("arb_gene_probe done.\n"); |
---|
619 | return EXIT_SUCCESS; |
---|
620 | } |
---|
621 | |
---|