| 1 | // =============================================================== // |
|---|
| 2 | // // |
|---|
| 3 | // File : arb_gene_probe.cxx // |
|---|
| 4 | // Purpose : // |
|---|
| 5 | // // |
|---|
| 6 | // Institute of Microbiology (Technical University Munich) // |
|---|
| 7 | // http://www.arb-home.de/ // |
|---|
| 8 | // // |
|---|
| 9 | // =============================================================== // |
|---|
| 10 | |
|---|
#include <arbdbt.h>
#include <adGene.h>

#include <functional>
#include <list>
#include <map>
#include <set>
#include <string>

#include <unistd.h>
#include <sys/types.h>
|---|
| 21 | |
|---|
| 22 | #define gp_assert(cond) arb_assert(cond) |
|---|
| 23 | |
|---|
| 24 | using namespace std; |
|---|
| 25 | |
|---|
| 26 | #if defined(DEBUG) |
|---|
| 27 | // #define CREATE_DEBUG_FILES |
|---|
| 28 | // #define DUMP_OVERLAP_CALC |
|---|
| 29 | #endif // DEBUG |
|---|
| 30 | |
|---|
| 31 | // -------------------------------------------------------------------------------- |
|---|
| 32 | |
|---|
// Running totals of the pseudo-species generated so far, one counter per kind.
// Each counter is post-incremented at the moment an internal name is built
// from it, so its value always equals the number of names handed out.
static int gene_counter       = 0; // non-split genes   (internal names "n<hex>")
static int split_gene_counter = 0; // split genes       (internal names "s<hex>")
static int intergene_counter  = 0; // intergene regions (internal names "i<hex>")
|---|
| 36 | |
|---|
// Strict ordering for the 'const char *' keys of FullNameMap.
//
// Any strict order is sufficient for normal operation. For UNIT_TESTS the
// order has to be reproducible, i.e. it must not depend on the memory layout
// of the DB elements the keys point into.
struct nameOrder {
    bool operator()(const char *name1, const char *name2) const {
#if defined(UNIT_TESTS) // UT_DIFF
        return strcmp(name1, name2)<0; // slow, determined by species names
#else
        // Fast, but depends on memory layout (e.g. on MEMORY_TEST in gb_memory.h).
        // Keys point into unrelated allocations: subtracting such pointers is
        // undefined behavior, whereas std::less guarantees a strict total order
        // over arbitrary pointers.
        return std::less<const char *>()(name1, name2);
#endif
    }
};

// Maps the internal (generated) name of a pseudo-species to its long name.
// The key is not copied: it has to stay valid as long as the entry exists.
typedef std::map<const char *, std::string, nameOrder> FullNameMap;
static FullNameMap names;
|---|
| 52 | |
|---|
| 53 | // -------------------------------------------------------------------------------- |
|---|
| 54 | |
|---|
// Closed range [begin, end] of genome positions.
// Legal positions are in range [0 .. genome_length-1]; this is only verified
// in DEBUG builds (see check_legal).
struct PositionPair {
    int begin;
    int end;

    static int genome_length; // upper bound used by check_legal()

    // default: illegal pair (-1, -1)
    PositionPair() : begin(-1), end(-1) {}

    PositionPair(int begin_, int end_) : begin(begin_), end(end_) {
#if defined(DEBUG)
        check_legal();
#endif // DEBUG
    }

#if defined(DEBUG)
    // asserts that the range is non-empty, ordered and inside the genome
    void check_legal() const {
        gp_assert(begin >= 0);
        gp_assert(begin <= end);
        gp_assert(end < genome_length);
    }
#endif // DEBUG

    // number of positions covered (both borders included)
    int length() const {
        const int distance = end-begin;
        return distance+1;
    }

    // true if both ranges share at least one position
    bool overlapsWith(const PositionPair& other) const {
#if defined(DEBUG)
        check_legal();
        other.check_legal();
#endif // DEBUG
        return (other.begin <= end) && (begin <= other.end);
    }

#if defined(DUMP_OVERLAP_CALC)
    void dump(const char *note) const {
        printf("%s begin=%i end=%i\n", note, begin, end);
    }
#endif // DUMP_OVERLAP_CALC
};

int PositionPair::genome_length = 0;

typedef std::list<PositionPair> PositionPairList;
|---|
| 96 | |
|---|
| 97 | struct ltNonOverlap { |
|---|
| 98 | // sorting with this operator identifies all overlapping PositionPair's as "equal" |
|---|
| 99 | bool operator ()(const PositionPair& p1, const PositionPair& p2) const { |
|---|
| 100 | return p1.end < p2.begin; |
|---|
| 101 | } |
|---|
| 102 | }; |
|---|
| 103 | |
|---|
| 104 | class GenePositionMap { |
|---|
| 105 | typedef set<PositionPair, ltNonOverlap> OverlappingGeneSet; |
|---|
| 106 | |
|---|
| 107 | OverlappingGeneSet usedRanges; |
|---|
| 108 | unsigned long overlapSize; |
|---|
| 109 | unsigned long geneSize; |
|---|
| 110 | public: |
|---|
| 111 | GenePositionMap() : overlapSize(0), geneSize(0) {} |
|---|
| 112 | |
|---|
| 113 | void announceGene(PositionPair gene); |
|---|
| 114 | GB_ERROR buildIntergeneList(const PositionPair& wholeGenome, PositionPairList& intergeneList) const; |
|---|
| 115 | unsigned long getOverlap() const { return overlapSize; } |
|---|
| 116 | unsigned long getAllGeneSize() const { return geneSize; } |
|---|
| 117 | |
|---|
| 118 | #if defined(DUMP_OVERLAP_CALC) |
|---|
| 119 | void dump() const; |
|---|
| 120 | #endif // DUMP_OVERLAP_CALC |
|---|
| 121 | }; |
|---|
| 122 | |
|---|
| 123 | // ____________________________________________________________ |
|---|
| 124 | // start of implementation of class GenePositionMap: |
|---|
| 125 | |
|---|
| 126 | void GenePositionMap::announceGene(PositionPair gene) { |
|---|
| 127 | OverlappingGeneSet::iterator found = usedRanges.find(gene); |
|---|
| 128 | if (found == usedRanges.end()) { // gene does not overlap with currently known ranges |
|---|
| 129 | usedRanges.insert(gene); // add to known ranges |
|---|
| 130 | } |
|---|
| 131 | else { |
|---|
| 132 | // 'found' overlaps with 'gene' |
|---|
| 133 | int gene_length = gene.length(); |
|---|
| 134 | |
|---|
| 135 | do { |
|---|
| 136 | gp_assert(gene.overlapsWith(*found)); |
|---|
| 137 | |
|---|
| 138 | gene = PositionPair(min(found->begin, gene.begin), max(found->end, gene.end)); // calc combined range |
|---|
| 139 | int combined_length = gene.length(); |
|---|
| 140 | |
|---|
| 141 | size_t overlap = (found->length()+gene_length)-combined_length; |
|---|
| 142 | overlapSize += overlap; |
|---|
| 143 | geneSize += gene_length; |
|---|
| 144 | |
|---|
| 145 | usedRanges.erase(found); |
|---|
| 146 | |
|---|
| 147 | gene_length = combined_length; |
|---|
| 148 | found = usedRanges.find(gene); // search for further overlaps |
|---|
| 149 | } while (found != usedRanges.end()); |
|---|
| 150 | |
|---|
| 151 | usedRanges.insert(gene); // insert the combined range |
|---|
| 152 | } |
|---|
| 153 | } |
|---|
| 154 | |
|---|
| 155 | GB_ERROR GenePositionMap::buildIntergeneList(const PositionPair& wholeGenome, PositionPairList& intergeneList) const { |
|---|
| 156 | OverlappingGeneSet::iterator end = usedRanges.end(); |
|---|
| 157 | OverlappingGeneSet::iterator curr = usedRanges.begin(); |
|---|
| 158 | OverlappingGeneSet::iterator prev = end; |
|---|
| 159 | |
|---|
| 160 | if (curr == end) { // nothing defined -> use whole genome as one big intergene |
|---|
| 161 | intergeneList.push_back(wholeGenome); |
|---|
| 162 | } |
|---|
| 163 | else { |
|---|
| 164 | if (curr->begin > wholeGenome.begin) { // intergene before first gene range ? |
|---|
| 165 | intergeneList.push_back(PositionPair(wholeGenome.begin, curr->begin-1)); |
|---|
| 166 | } |
|---|
| 167 | |
|---|
| 168 | prev = curr; ++curr; |
|---|
| 169 | |
|---|
| 170 | while (curr != end) { |
|---|
| 171 | if (prev->end < curr->begin) { |
|---|
| 172 | if (prev->end != (curr->begin-1)) { // not directly adjacent |
|---|
| 173 | intergeneList.push_back(PositionPair(prev->end+1, curr->begin-1)); |
|---|
| 174 | } |
|---|
| 175 | } |
|---|
| 176 | else { |
|---|
| 177 | return "Internal error: Overlapping gene ranges"; |
|---|
| 178 | } |
|---|
| 179 | |
|---|
| 180 | prev = curr; ++curr; |
|---|
| 181 | } |
|---|
| 182 | |
|---|
| 183 | if (prev != end && prev->end < wholeGenome.end) { |
|---|
| 184 | intergeneList.push_back(PositionPair(prev->end+1, wholeGenome.end)); |
|---|
| 185 | } |
|---|
| 186 | } |
|---|
| 187 | return NULp; |
|---|
| 188 | } |
|---|
| 189 | |
|---|
#if defined(DUMP_OVERLAP_CALC)
// Debug aid: prints all stored ranges followed by the accumulated overlap.
void GenePositionMap::dump() const {
    printf("List of ranges used by genes:\n");

    OverlappingGeneSet::const_iterator range = usedRanges.begin();
    while (range != usedRanges.end()) {
        range->dump("- ");
        ++range;
    }

    printf("Overlap: %lu bases\n", overlapSize);
}
#endif // DUMP_OVERLAP_CALC
|---|
| 199 | |
|---|
| 200 | // -end- of implementation of class GenePositionMap. |
|---|
| 201 | |
|---|
| 202 | static GB_ERROR create_data_entry(GBDATA *gb_species2, const char *sequence, int seqlen) { |
|---|
| 203 | GB_ERROR error = NULp; |
|---|
| 204 | char *gene_sequence = new char[seqlen+1]; |
|---|
| 205 | |
|---|
| 206 | memcpy(gene_sequence, sequence, seqlen); // @@@ FIXME: avoid this copy! |
|---|
| 207 | gene_sequence[seqlen] = 0; |
|---|
| 208 | |
|---|
| 209 | GBDATA *gb_ali = GB_create_container(gb_species2, "ali_ptgene"); |
|---|
| 210 | if (!gb_ali) error = GB_await_error(); |
|---|
| 211 | else error = GBT_write_string(gb_ali, "data", gene_sequence); |
|---|
| 212 | |
|---|
| 213 | delete [] gene_sequence; |
|---|
| 214 | return error; |
|---|
| 215 | } |
|---|
| 216 | |
|---|
#if defined(DEBUG)
// Asserts that 'name' contains no unescaped ';'.
// A backslash escapes the character following it. In non-DEBUG builds the
// check expands to nothing.
static void CHECK_SEMI_ESCAPED(const char *name) {
    while (*name) {
        gp_assert(*name != ';'); // oops, unescaped ';'
        // skip the escaped character - but never step over the terminating
        // zero if the string ends with a single backslash
        if (*name == '\\' && name[1]) ++name;
        ++name;
    }
}
#else
#define CHECK_SEMI_ESCAPED(s)
#endif // DEBUG
|---|
| 229 | |
|---|
| 230 | |
|---|
| 231 | static GBDATA *create_gene_species(GBDATA *gb_species_data2, const char *internal_name, const char *long_name, int abspos, const char *sequence, int length) { |
|---|
| 232 | // Note: 'sequence' is not necessarily 0-terminated! |
|---|
| 233 | |
|---|
| 234 | #if defined(DEBUG) |
|---|
| 235 | const char *firstSem = strchr(long_name, ';'); |
|---|
| 236 | gp_assert(firstSem); |
|---|
| 237 | CHECK_SEMI_ESCAPED(firstSem+1); |
|---|
| 238 | #endif // DEBUG |
|---|
| 239 | |
|---|
| 240 | GB_ERROR error = GB_push_transaction(gb_species_data2); |
|---|
| 241 | GBDATA *gb_species2 = NULp; |
|---|
| 242 | |
|---|
| 243 | if (!error) { |
|---|
| 244 | gb_species2 = GB_create_container(gb_species_data2, "species"); |
|---|
| 245 | if (!gb_species2) error = GB_await_error(); |
|---|
| 246 | } |
|---|
| 247 | |
|---|
| 248 | if (!error) { |
|---|
| 249 | GBDATA *gb_name = GB_create(gb_species2, "name", GB_STRING); |
|---|
| 250 | |
|---|
| 251 | if (!gb_name) error = GB_await_error(); |
|---|
| 252 | else { |
|---|
| 253 | error = GB_write_string(gb_name, internal_name); |
|---|
| 254 | if (!error) { |
|---|
| 255 | const char *static_internal_name = GB_read_char_pntr(gb_name); // use static copy from db as map-index (internal_name is temporary) |
|---|
| 256 | error = create_data_entry(gb_species2, sequence, length); |
|---|
| 257 | if (!error) { |
|---|
| 258 | names[static_internal_name] = long_name; |
|---|
| 259 | error = GBT_write_int(gb_species2, "abspos", abspos); |
|---|
| 260 | } |
|---|
| 261 | } |
|---|
| 262 | } |
|---|
| 263 | } |
|---|
| 264 | |
|---|
| 265 | error = GB_end_transaction(gb_species_data2, error); |
|---|
| 266 | |
|---|
| 267 | if (error) { // be more verbose : |
|---|
| 268 | error = GBS_global_string("%s (internal_name='%s', long_name='%s')", error, internal_name, long_name); |
|---|
| 269 | GB_export_error(error); |
|---|
| 270 | gb_species2 = NULp; |
|---|
| 271 | } |
|---|
| 272 | |
|---|
| 273 | return gb_species2; |
|---|
| 274 | } |
|---|
| 275 | |
|---|
| 276 | static GB_ERROR create_genelike_entry(const char *internal_name, GBDATA *gb_species_data2, int start_pos, int end_pos, const char *ali_genome, const char *long_name) { |
|---|
| 277 | GBDATA *gb_genespecies = create_gene_species(gb_species_data2, internal_name, long_name, start_pos, ali_genome+start_pos, end_pos-start_pos+1); |
|---|
| 278 | return gb_genespecies ? NULp : GB_await_error(); |
|---|
| 279 | } |
|---|
| 280 | |
|---|
| 281 | static GB_ERROR create_intergene(GBDATA *gb_species_data2, int start_pos, int end_pos, const char *ali_genome, const char *long_gene_name) { |
|---|
| 282 | if (start_pos <= end_pos) { |
|---|
| 283 | char internal_name[128]; |
|---|
| 284 | sprintf(internal_name, "i%x", intergene_counter++); |
|---|
| 285 | return create_genelike_entry(internal_name, gb_species_data2, start_pos, end_pos, ali_genome, long_gene_name); |
|---|
| 286 | } |
|---|
| 287 | return "Illegal inter-gene positions (start behind end)"; |
|---|
| 288 | } |
|---|
| 289 | |
|---|
| 290 | static GB_ERROR create_gene(GBDATA *gb_species_data2, int start_pos, int end_pos, const char *ali_genome, const char *long_gene_name) { |
|---|
| 291 | if (start_pos <= end_pos) { |
|---|
| 292 | char internal_name[128]; |
|---|
| 293 | sprintf(internal_name, "n%x", gene_counter++); |
|---|
| 294 | return create_genelike_entry(internal_name, gb_species_data2, start_pos, end_pos, ali_genome, long_gene_name); |
|---|
| 295 | } |
|---|
| 296 | return "Illegal gene positions (start behind end)"; |
|---|
| 297 | } |
|---|
| 298 | |
|---|
| 299 | static GB_ERROR create_split_gene(GBDATA *gb_species_data2, PositionPairList& part_list, const char *ali_genome, const char *long_gene_name) { |
|---|
| 300 | GB_ERROR error = NULp; |
|---|
| 301 | PositionPairList::iterator list_end = part_list.end(); |
|---|
| 302 | |
|---|
| 303 | int gene_size = 0; |
|---|
| 304 | for (PositionPairList::iterator part = part_list.begin(); part != list_end; ++part) { |
|---|
| 305 | int part_size = part->end-part->begin+1; |
|---|
| 306 | gp_assert(part_size > 0); |
|---|
| 307 | gene_size += part_size; |
|---|
| 308 | } |
|---|
| 309 | gp_assert(gene_size > 0); |
|---|
| 310 | char *gene_sequence = new char[gene_size+1]; |
|---|
| 311 | int gene_off = 0; |
|---|
| 312 | |
|---|
| 313 | char *split_pos_list = NULp; // contains split information: 'gene pos of part2,abs pos of part2;gene pos of part3,abs pos of part3;...' |
|---|
| 314 | |
|---|
| 315 | for (PositionPairList::iterator part = part_list.begin(); part != list_end;) { |
|---|
| 316 | int part_size = part->end-part->begin+1; |
|---|
| 317 | int genome_pos = part->begin; |
|---|
| 318 | memcpy(gene_sequence+gene_off, ali_genome+part->begin, part_size); |
|---|
| 319 | gene_off += part_size; |
|---|
| 320 | |
|---|
| 321 | ++part; |
|---|
| 322 | |
|---|
| 323 | if (!split_pos_list) { // first part |
|---|
| 324 | split_pos_list = GBS_global_string_copy("%i", gene_off); // gene offset of part 2 |
|---|
| 325 | } |
|---|
| 326 | else { // next parts |
|---|
| 327 | char *next_split_pos_list; |
|---|
| 328 | if (part != list_end) { // not last |
|---|
| 329 | next_split_pos_list = GBS_global_string_copy("%s,%i;%i", split_pos_list, genome_pos, gene_off); |
|---|
| 330 | } |
|---|
| 331 | else { // last part |
|---|
| 332 | next_split_pos_list = GBS_global_string_copy("%s,%i", split_pos_list, genome_pos); |
|---|
| 333 | } |
|---|
| 334 | freeset(split_pos_list, next_split_pos_list); |
|---|
| 335 | } |
|---|
| 336 | } |
|---|
| 337 | |
|---|
| 338 | char internal_name[128]; |
|---|
| 339 | sprintf(internal_name, "s%x", split_gene_counter++); |
|---|
| 340 | |
|---|
| 341 | const PositionPair& first_part = part_list.front(); |
|---|
| 342 | GBDATA *gb_species2 = create_gene_species(gb_species_data2, internal_name, long_gene_name, first_part.begin, |
|---|
| 343 | gene_sequence, first_part.end-first_part.begin+1); |
|---|
| 344 | |
|---|
| 345 | if (!gb_species2) error = GB_await_error(); |
|---|
| 346 | else { |
|---|
| 347 | #if defined(DEBUG) && 0 |
|---|
| 348 | printf("split gene: long_gene_name='%s' internal_name='%s' split_pos_list='%s'\n", |
|---|
| 349 | long_gene_name, internal_name, split_pos_list); |
|---|
| 350 | #endif // DEBUG |
|---|
| 351 | error = GBT_write_string(gb_species2, "splitpos", split_pos_list); |
|---|
| 352 | } |
|---|
| 353 | |
|---|
| 354 | free(split_pos_list); |
|---|
| 355 | delete [] gene_sequence; |
|---|
| 356 | |
|---|
| 357 | return error; |
|---|
| 358 | } |
|---|
| 359 | |
|---|
| 360 | static GB_ERROR scan_gene_positions(GBDATA *gb_gene, PositionPairList& part_list) { |
|---|
| 361 | GB_ERROR error = NULp; |
|---|
| 362 | GEN_position *location = GEN_read_position(gb_gene); |
|---|
| 363 | |
|---|
| 364 | if (!location) error = GB_await_error(); |
|---|
| 365 | else { |
|---|
| 366 | GEN_sortAndMergeLocationParts(location); |
|---|
| 367 | int parts = location->parts; |
|---|
| 368 | for (int p = 0; p<parts; ++p) { |
|---|
| 369 | part_list.push_back(PositionPair(location->start_pos[p]-1, location->stop_pos[p]-1)); |
|---|
| 370 | } |
|---|
| 371 | GEN_free_position(location); |
|---|
| 372 | } |
|---|
| 373 | return error; |
|---|
| 374 | } |
|---|
| 375 | |
|---|
| 376 | static GB_ERROR insert_genes_of_organism(GBDATA *gb_organism, GBDATA *gb_species_data2) { |
|---|
| 377 | // insert all genes of 'gb_organism' as pseudo-species |
|---|
| 378 | // into new 'species_data' (gb_species_data2) |
|---|
| 379 | |
|---|
| 380 | GB_ERROR error = NULp; |
|---|
| 381 | const char *organism_name = GBT_get_name(gb_organism); |
|---|
| 382 | |
|---|
| 383 | GenePositionMap geneRanges; |
|---|
| 384 | |
|---|
| 385 | int gene_counter_old = gene_counter; // used for statistics only (see end of function) |
|---|
| 386 | int split_gene_counter_old = split_gene_counter; |
|---|
| 387 | int intergene_counter_old = intergene_counter; |
|---|
| 388 | |
|---|
| 389 | GBDATA *gb_ali_genom = GBT_find_sequence(gb_organism, GENOM_ALIGNMENT); |
|---|
| 390 | gp_assert(gb_ali_genom); // existence has to be checked by caller! |
|---|
| 391 | |
|---|
| 392 | const char *ali_genom = GB_read_char_pntr(gb_ali_genom); |
|---|
| 393 | if (!ali_genom) error = GB_await_error(); |
|---|
| 394 | PositionPair::genome_length = GB_read_count(gb_ali_genom); // this affects checks in PositionPair |
|---|
| 395 | |
|---|
| 396 | if (!organism_name && !error) { |
|---|
| 397 | error = "encountered invalid organism (lacks 'name' entry)"; |
|---|
| 398 | } |
|---|
| 399 | |
|---|
| 400 | for (GBDATA *gb_gene = GEN_first_gene(gb_organism); |
|---|
| 401 | gb_gene && !error; |
|---|
| 402 | gb_gene = GEN_next_gene(gb_gene)) |
|---|
| 403 | { |
|---|
| 404 | const char *gene_name = GBT_get_name(gb_gene); |
|---|
| 405 | |
|---|
| 406 | PositionPairList part_list; |
|---|
| 407 | error = scan_gene_positions(gb_gene, part_list); |
|---|
| 408 | |
|---|
| 409 | if (!error && !gene_name) error = "encountered invalid gene (lacks 'name' entry)"; |
|---|
| 410 | if (!error && part_list.empty()) error = "empty position list"; |
|---|
| 411 | if (!error) { |
|---|
| 412 | int split_count = part_list.size(); |
|---|
| 413 | PositionPair first_part = *part_list.begin(); |
|---|
| 414 | |
|---|
| 415 | if (!error) { |
|---|
| 416 | char *esc_gene_name = GBS_escape_string(gene_name, ";", '\\'); |
|---|
| 417 | char *long_gene_name = GBS_global_string_copy("%s;%s", organism_name, esc_gene_name); |
|---|
| 418 | if (split_count == 1) { // normal gene |
|---|
| 419 | error = create_gene(gb_species_data2, first_part.begin, first_part.end, ali_genom, long_gene_name); |
|---|
| 420 | geneRanges.announceGene(first_part); |
|---|
| 421 | } |
|---|
| 422 | else { // split gene |
|---|
| 423 | error = create_split_gene(gb_species_data2, part_list, ali_genom, long_gene_name); |
|---|
| 424 | |
|---|
| 425 | for (PositionPairList::iterator p = part_list.begin(); p != part_list.end(); ++p) { |
|---|
| 426 | geneRanges.announceGene(*p); |
|---|
| 427 | } |
|---|
| 428 | } |
|---|
| 429 | free(long_gene_name); |
|---|
| 430 | free(esc_gene_name); |
|---|
| 431 | } |
|---|
| 432 | } |
|---|
| 433 | |
|---|
| 434 | if (error && gene_name) error = GBS_global_string("in gene '%s': %s", gene_name, error); |
|---|
| 435 | } |
|---|
| 436 | |
|---|
| 437 | if (!error) { // add intergenes |
|---|
| 438 | PositionPairList intergenes; |
|---|
| 439 | PositionPair wholeGenome(0, PositionPair::genome_length-1); |
|---|
| 440 | error = geneRanges.buildIntergeneList(wholeGenome, intergenes); |
|---|
| 441 | |
|---|
| 442 | for (PositionPairList::iterator i = intergenes.begin(); !error && i != intergenes.end(); ++i) { |
|---|
| 443 | char *long_intergene_name = GBS_global_string_copy("%s;intergene_%i_%i", organism_name, i->begin, i->end); |
|---|
| 444 | error = create_intergene(gb_species_data2, i->begin, i->end, ali_genom, long_intergene_name); |
|---|
| 445 | free(long_intergene_name); |
|---|
| 446 | } |
|---|
| 447 | } |
|---|
| 448 | |
|---|
| 449 | if (error && organism_name) error = GBS_global_string("in organism '%s': %s", organism_name, error); |
|---|
| 450 | |
|---|
| 451 | if (!error) { |
|---|
| 452 | int new_genes = gene_counter-gene_counter_old; // only non-split genes |
|---|
| 453 | int new_split_genes = split_gene_counter-split_gene_counter_old; |
|---|
| 454 | int new_intergenes = intergene_counter-intergene_counter_old; |
|---|
| 455 | |
|---|
| 456 | unsigned long genesSize = geneRanges.getAllGeneSize(); |
|---|
| 457 | unsigned long overlaps = geneRanges.getOverlap(); |
|---|
| 458 | double data_grow = overlaps/double(PositionPair::genome_length)*100; |
|---|
| 459 | double gene_overlap = overlaps/double(genesSize)*100; |
|---|
| 460 | |
|---|
| 461 | if (new_split_genes) { |
|---|
| 462 | |
|---|
| 463 | printf(" - %s: %i genes (%i split), %i intergenes", |
|---|
| 464 | organism_name, new_genes+new_split_genes, new_split_genes, new_intergenes); |
|---|
| 465 | } |
|---|
| 466 | else { |
|---|
| 467 | printf(" - %s: %i genes, %i intergenes", |
|---|
| 468 | organism_name, new_genes, new_intergenes); |
|---|
| 469 | } |
|---|
| 470 | printf(" (data grow: %5.2f%%, gene overlap: %5.2f%%=%lu bp)\n", data_grow, gene_overlap, overlaps); |
|---|
| 471 | } |
|---|
| 472 | |
|---|
| 473 | #if defined(DUMP_OVERLAP_CALC) |
|---|
| 474 | geneRanges.dump(); |
|---|
| 475 | #endif // DUMP_OVERLAP_CALC |
|---|
| 476 | |
|---|
| 477 | return error; |
|---|
| 478 | } |
|---|
| 479 | |
|---|
| 480 | int ARB_main(int argc, char *argv[]) { |
|---|
| 481 | |
|---|
| 482 | printf("\n" |
|---|
| 483 | "arb_gene_probe 1.2 -- (C) 2003/2004 The ARB-project\n" |
|---|
| 484 | "written by Tom Littschwager, Bernd Spanfelner, Conny Wolf, Ralf Westram.\n"); |
|---|
| 485 | |
|---|
| 486 | if (argc != 3) { |
|---|
| 487 | printf("Usage: arb_gene_probe input_database output_database\n"); |
|---|
| 488 | printf(" Prepares a genome database for Gene-PT-Server\n"); |
|---|
| 489 | return EXIT_FAILURE; |
|---|
| 490 | } |
|---|
| 491 | |
|---|
| 492 | const char *inputname = argv[1]; |
|---|
| 493 | const char *outputname = argv[2]; |
|---|
| 494 | |
|---|
| 495 | // GBK_terminate("test-crash of arb_gene_probe"); |
|---|
| 496 | |
|---|
| 497 | printf("Converting '%s' -> '%s' ..\n", inputname, outputname); |
|---|
| 498 | |
|---|
| 499 | GB_ERROR error = NULp; |
|---|
| 500 | GB_shell shell; |
|---|
| 501 | GBDATA *gb_main = GB_open(inputname, "rw"); // rootzeiger wird gesetzt |
|---|
| 502 | if (!gb_main) { |
|---|
| 503 | error = GBS_global_string("Database '%s' not found", inputname); |
|---|
| 504 | } |
|---|
| 505 | else { |
|---|
| 506 | GB_request_undo_type(gb_main, GB_UNDO_NONE); // disable arbdb builtin undo |
|---|
| 507 | GB_begin_transaction(gb_main); |
|---|
| 508 | |
|---|
| 509 | GBDATA *gb_species_data = GBT_get_species_data(gb_main); |
|---|
| 510 | GBDATA *gb_species_data_new = GBT_create(gb_main, "species_data", 7); // create a second 'species_data' container |
|---|
| 511 | |
|---|
| 512 | if (!gb_species_data_new) error = GB_await_error(); |
|---|
| 513 | |
|---|
| 514 | int non_ali_genom_species = 0; |
|---|
| 515 | int ali_genom_species = 0; |
|---|
| 516 | |
|---|
| 517 | for (GBDATA *gb_species = GBT_first_species_rel_species_data(gb_species_data); |
|---|
| 518 | gb_species && !error; |
|---|
| 519 | gb_species = GBT_next_species(gb_species)) |
|---|
| 520 | { |
|---|
| 521 | GBDATA *gb_ali_genom = GBT_find_sequence(gb_species, GENOM_ALIGNMENT); |
|---|
| 522 | if (!gb_ali_genom) { |
|---|
| 523 | // skip species w/o alignment 'GENOM_ALIGNMENT' (genome DBs often contain pseudo species) |
|---|
| 524 | ++non_ali_genom_species; |
|---|
| 525 | } |
|---|
| 526 | else { |
|---|
| 527 | error = insert_genes_of_organism(gb_species, gb_species_data_new); |
|---|
| 528 | ++ali_genom_species; |
|---|
| 529 | } |
|---|
| 530 | } |
|---|
| 531 | |
|---|
| 532 | if (non_ali_genom_species) { |
|---|
| 533 | printf("%i species had no alignment in '" GENOM_ALIGNMENT "' and have been skipped.\n", non_ali_genom_species); |
|---|
| 534 | } |
|---|
| 535 | if (!error && ali_genom_species == 0) { |
|---|
| 536 | error = "no species with data in alignment '" GENOM_ALIGNMENT "' were found"; |
|---|
| 537 | } |
|---|
| 538 | |
|---|
| 539 | if (!error) { |
|---|
| 540 | printf("%i species had data in alignment '" GENOM_ALIGNMENT "'.\n" |
|---|
| 541 | "Found %i genes (%i were split) and %i intergene regions.\n", |
|---|
| 542 | ali_genom_species, gene_counter, split_gene_counter, intergene_counter); |
|---|
| 543 | } |
|---|
| 544 | |
|---|
| 545 | if (!error) { |
|---|
| 546 | error = GB_delete(gb_species_data); // delete first (old) 'species_data' container |
|---|
| 547 | } |
|---|
| 548 | |
|---|
| 549 | if (!error) { |
|---|
| 550 | // create map-string |
|---|
| 551 | char* map_string; |
|---|
| 552 | { |
|---|
| 553 | FullNameMap::iterator NameEnd = names.end(); |
|---|
| 554 | FullNameMap::iterator NameIter; |
|---|
| 555 | |
|---|
| 556 | size_t mapsize = 0; |
|---|
| 557 | for (NameIter = names.begin(); NameIter != NameEnd; ++NameIter) { |
|---|
| 558 | mapsize += strlen(NameIter->first)+NameIter->second.length()+2; |
|---|
| 559 | } |
|---|
| 560 | |
|---|
| 561 | map_string = new char[mapsize+1]; |
|---|
| 562 | size_t moff = 0; |
|---|
| 563 | |
|---|
| 564 | for (NameIter = names.begin(); NameIter != NameEnd; ++NameIter) { |
|---|
| 565 | int len1 = strlen(NameIter->first); |
|---|
| 566 | int len2 = NameIter->second.length(); |
|---|
| 567 | |
|---|
| 568 | memcpy(map_string+moff, NameIter->first, len1); |
|---|
| 569 | map_string[moff+len1] = ';'; |
|---|
| 570 | moff += len1+1; |
|---|
| 571 | |
|---|
| 572 | memcpy(map_string+moff, NameIter->second.c_str(), len2); |
|---|
| 573 | map_string[moff+len2] = ';'; |
|---|
| 574 | moff += len2+1; |
|---|
| 575 | } |
|---|
| 576 | map_string[moff] = 0; |
|---|
| 577 | |
|---|
| 578 | gp_assert(moff <= mapsize); |
|---|
| 579 | } |
|---|
| 580 | |
|---|
| 581 | GBDATA *gb_gene_map = GB_create_container(gb_main, "gene_map"); |
|---|
| 582 | if (!gb_gene_map) error = GB_await_error(); |
|---|
| 583 | else error = GBT_write_string(gb_gene_map, "map_string", map_string); |
|---|
| 584 | |
|---|
| 585 | delete [] map_string; |
|---|
| 586 | } |
|---|
| 587 | |
|---|
| 588 | if (!error) { |
|---|
| 589 | // set default alignment for pt_server |
|---|
| 590 | error = GBT_set_default_alignment(gb_main, "ali_ptgene"); |
|---|
| 591 | |
|---|
| 592 | if (!error) { |
|---|
| 593 | GBDATA *gb_use = GB_search(gb_main, "presets/alignment/alignment_name", GB_STRING); |
|---|
| 594 | if (!gb_use) error = GB_await_error(); |
|---|
| 595 | else { |
|---|
| 596 | GB_topSecurityLevel unsecured(gb_main); |
|---|
| 597 | error = GB_write_string(gb_use, "ali_ptgene"); |
|---|
| 598 | } |
|---|
| 599 | } |
|---|
| 600 | } |
|---|
| 601 | |
|---|
| 602 | error = GB_end_transaction(gb_main, error); |
|---|
| 603 | |
|---|
| 604 | if (!error) { |
|---|
| 605 | printf("Saving '%s' ..\n", outputname); |
|---|
| 606 | error = GB_save_as(gb_main, outputname, "bfm"); |
|---|
| 607 | if (error) unlink(outputname); |
|---|
| 608 | } |
|---|
| 609 | |
|---|
| 610 | GB_close(gb_main); |
|---|
| 611 | } |
|---|
| 612 | |
|---|
| 613 | if (error) { |
|---|
| 614 | printf("Error in arb_gene_probe: %s\n", error); |
|---|
| 615 | return EXIT_FAILURE; |
|---|
| 616 | } |
|---|
| 617 | |
|---|
| 618 | printf("arb_gene_probe done.\n"); |
|---|
| 619 | return EXIT_SUCCESS; |
|---|
| 620 | } |
|---|
| 621 | |
|---|