| 1 | // =============================================================== // |
|---|
| 2 | // // |
|---|
| 3 | // File : fast_aligner.cxx // |
|---|
| 4 | // Purpose : A fast aligner (not a multiple aligner!) // |
|---|
| 5 | // // |
|---|
| 6 | // Coded by Ralf Westram (coder@reallysoft.de) in 1998 // |
|---|
| 7 | // Institute of Microbiology (Technical University Munich) // |
|---|
| 8 | // http://www.arb-home.de/ // |
|---|
| 9 | // // |
|---|
| 10 | // =============================================================== // |
|---|
| 11 | |
|---|
| 12 | |
|---|
| 13 | #include "fast_aligner.hxx" |
|---|
| 14 | #include "ClustalV.hxx" |
|---|
| 15 | #include "seq_search.hxx" |
|---|
| 16 | |
|---|
| 17 | #include <island_hopping.h> |
|---|
| 18 | |
|---|
| 19 | #include <awtc_next_neighbours.hxx> |
|---|
| 20 | #include <awt_sel_boxes.hxx> |
|---|
| 21 | |
|---|
| 22 | #include <aw_awars.hxx> |
|---|
| 23 | #include <aw_root.hxx> |
|---|
| 24 | #include <aw_question.hxx> |
|---|
| 25 | |
|---|
| 26 | #include <arbdbt.h> |
|---|
| 27 | #include <ad_cb.h> |
|---|
| 28 | |
|---|
| 29 | #include <arb_defs.h> |
|---|
| 30 | #include <arb_progress.h> |
|---|
| 31 | #include <RangeList.h> |
|---|
| 32 | |
|---|
| 33 | #include <cctype> |
|---|
| 34 | #include <cmath> |
|---|
| 35 | #include <climits> |
|---|
| 36 | |
|---|
| 37 | #include <list> |
|---|
| 38 | #include <awt_config_manager.hxx> |
|---|
| 39 | #include <rootAsWin.h> |
|---|
| 40 | |
|---|
| 41 | // -------------------------------------------------------------------------------- |
|---|
| 42 | |
|---|
| 43 | #if defined(DEBUG) |
|---|
| 44 | // #define TRACE_CLUSTAL_DATA |
|---|
| 45 | // #define TRACE_ISLANDHOPPER_DATA |
|---|
| 46 | // #define TRACE_COMPRESSED_ALIGNMENT |
|---|
| 47 | // #define TRACE_RELATIVES |
|---|
| 48 | #endif // DEBUG |
|---|
| 49 | |
|---|
| 50 | // -------------------------------------------------------------------------------- |
|---|
| 51 | |
|---|
| 52 | enum FA_report { |
|---|
| 53 | FA_NO_REPORT, // no report |
|---|
| 54 | FA_TEMP_REPORT, // report to temporary entries |
|---|
| 55 | FA_REPORT, // report to resident entries |
|---|
| 56 | }; |
|---|
| 57 | |
|---|
| 58 | enum FA_range { |
|---|
| 59 | FA_WHOLE_SEQUENCE, // align whole sequence |
|---|
| 60 | FA_AROUND_CURSOR, // align xxx positions around current cursor position |
|---|
| 61 | FA_SELECTED_RANGE, // align selected range |
|---|
| 62 | FA_SAI_MULTI_RANGE, // align versus multi range define by SAI |
|---|
| 63 | }; |
|---|
| 64 | |
|---|
| 65 | enum FA_turn { |
|---|
| 66 | FA_TURN_NEVER, // never try to turn sequence |
|---|
| 67 | FA_TURN_INTERACTIVE, // try to turn, but query user |
|---|
| 68 | FA_TURN_ALWAYS, // turn if score is better |
|---|
| 69 | }; |
|---|
| 70 | |
|---|
| 71 | enum FA_reference { |
|---|
| 72 | FA_REF_EXPLICIT, // reference sequence explicitly specified |
|---|
| 73 | FA_REF_CONSENSUS, // use group consensus as reference |
|---|
| 74 | FA_REF_RELATIVES, // search next relatives by PT server |
|---|
| 75 | }; |
|---|
| 76 | |
|---|
| 77 | enum FA_alignTarget { |
|---|
| 78 | FA_CURRENT, // align current species |
|---|
| 79 | FA_MARKED, // align all marked species |
|---|
| 80 | FA_SELECTED, // align selected species (= range) |
|---|
| 81 | }; |
|---|
| 82 | |
|---|
| 83 | enum FA_errorAction { |
|---|
| 84 | FA_NO_ACTION, // do nothing |
|---|
| 85 | FA_MARK_FAILED, // mark failed species (unmark rest) |
|---|
| 86 | FA_MARK_ALIGNED, // mark aligned species (unmark rest) |
|---|
| 87 | }; |
|---|
| 88 | |
|---|
| 89 | struct AlignParams { |
|---|
| 90 | FA_report report; |
|---|
| 91 | bool showGapsMessages; // display messages about missing gaps in master? |
|---|
| 92 | PosRange range; // range to be aligned |
|---|
| 93 | }; |
|---|
| 94 | |
|---|
| 95 | struct SearchRelativeParams : virtual Noncopyable { |
|---|
| 96 | FamilyFinder *ff; |
|---|
| 97 | char *pt_server_alignment; // alignment used in pt_server (may differ from 'alignment') |
|---|
| 98 | int maxRelatives; // max # of relatives to use |
|---|
| 99 | |
|---|
| 100 | SearchRelativeParams(FamilyFinder *ff_, const char *pt_server_alignment_, int maxRelatives_) |
|---|
| 101 | : ff(ff_), |
|---|
| 102 | pt_server_alignment(strdup(pt_server_alignment_)), |
|---|
| 103 | maxRelatives(maxRelatives_) |
|---|
| 104 | {} |
|---|
| 105 | |
|---|
| 106 | ~SearchRelativeParams() { |
|---|
| 107 | free(pt_server_alignment); |
|---|
| 108 | delete(ff); |
|---|
| 109 | } |
|---|
| 110 | |
|---|
| 111 | FamilyFinder *getFamilyFinder() { return ff; } |
|---|
| 112 | }; |
|---|
| 113 | |
|---|
| 114 | // -------------------------------------------------------------------------------- |
|---|
| 115 | |
|---|
| 116 | #define GAP_CHAR '-' |
|---|
| 117 | #define QUALITY_NAME "ASC_ALIGNER_CLIENT_SCORE" |
|---|
| 118 | #define INSERTS_NAME "AMI_ALIGNER_MASTER_INSERTS" |
|---|
| 119 | |
|---|
| 120 | #define FA_AWAR_ROOT "faligner/" |
|---|
| 121 | #define FA_AWAR_TO_ALIGN FA_AWAR_ROOT "what" |
|---|
| 122 | #define FA_AWAR_REFERENCE FA_AWAR_ROOT "against" |
|---|
| 123 | #define FA_AWAR_REFERENCE_NAME FA_AWAR_ROOT "sagainst" |
|---|
| 124 | #define FA_AWAR_RANGE FA_AWAR_ROOT "range" |
|---|
| 125 | #define FA_AWAR_PROTECTION FA_AWAR_ROOT "protection" |
|---|
| 126 | #define FA_AWAR_AROUND FA_AWAR_ROOT "around" |
|---|
| 127 | #define FA_AWAR_MIRROR FA_AWAR_ROOT "mirror" |
|---|
| 128 | #define FA_AWAR_REPORT FA_AWAR_ROOT "report" |
|---|
| 129 | #define FA_AWAR_SHOW_GAPS_MESSAGES FA_AWAR_ROOT "show_gaps" |
|---|
| 130 | #define FA_AWAR_CONTINUE_ON_ERROR FA_AWAR_ROOT "continue_on_error" |
|---|
| 131 | #define FA_AWAR_ACTION_ON_ERROR FA_AWAR_ROOT "action_on_error" |
|---|
| 132 | #define FA_AWAR_USE_SECONDARY FA_AWAR_ROOT "use_secondary" |
|---|
| 133 | #define FA_AWAR_NEXT_RELATIVES FA_AWAR_ROOT "next_relatives" |
|---|
| 134 | #define FA_AWAR_RELATIVE_RANGE FA_AWAR_ROOT "relrange" |
|---|
| 135 | #define FA_AWAR_PT_SERVER_ALIGNMENT "tmp/" FA_AWAR_ROOT "relative_ali" |
|---|
| 136 | #define FA_AWAR_SAI_RANGE_NAME FA_AWAR_ROOT "sai/sainame" |
|---|
| 137 | #define FA_AWAR_SAI_RANGE_CHARS FA_AWAR_ROOT "sai/chars" |
|---|
| 138 | |
|---|
| 139 | #define FA_AWAR_ISLAND_HOPPING_ROOT "island_hopping/" |
|---|
| 140 | #define FA_AWAR_USE_ISLAND_HOPPING FA_AWAR_ISLAND_HOPPING_ROOT "use" |
|---|
| 141 | #define FA_AWAR_ESTIMATE_BASE_FREQ FA_AWAR_ISLAND_HOPPING_ROOT "estimate_base_freq" |
|---|
| 142 | #define FA_AWAR_BASE_FREQ_A FA_AWAR_ISLAND_HOPPING_ROOT "base_freq_a" |
|---|
| 143 | #define FA_AWAR_BASE_FREQ_C FA_AWAR_ISLAND_HOPPING_ROOT "base_freq_c" |
|---|
| 144 | #define FA_AWAR_BASE_FREQ_G FA_AWAR_ISLAND_HOPPING_ROOT "base_freq_g" |
|---|
| 145 | #define FA_AWAR_BASE_FREQ_T FA_AWAR_ISLAND_HOPPING_ROOT "base_freq_t" |
|---|
| 146 | #define FA_AWAR_SUBST_PARA_AC FA_AWAR_ISLAND_HOPPING_ROOT "subst_para_ac" |
|---|
| 147 | #define FA_AWAR_SUBST_PARA_AG FA_AWAR_ISLAND_HOPPING_ROOT "subst_para_ag" |
|---|
| 148 | #define FA_AWAR_SUBST_PARA_AT FA_AWAR_ISLAND_HOPPING_ROOT "subst_para_at" |
|---|
| 149 | #define FA_AWAR_SUBST_PARA_CG FA_AWAR_ISLAND_HOPPING_ROOT "subst_para_cg" |
|---|
| 150 | #define FA_AWAR_SUBST_PARA_CT FA_AWAR_ISLAND_HOPPING_ROOT "subst_para_ct" |
|---|
| 151 | #define FA_AWAR_SUBST_PARA_GT FA_AWAR_ISLAND_HOPPING_ROOT "subst_para_gt" |
|---|
| 152 | #define FA_AWAR_EXPECTED_DISTANCE FA_AWAR_ISLAND_HOPPING_ROOT "expected_dist" |
|---|
| 153 | #define FA_AWAR_STRUCTURE_SUPPLEMENT FA_AWAR_ISLAND_HOPPING_ROOT "struct_suppl" |
|---|
| 154 | #define FA_AWAR_THRESHOLD FA_AWAR_ISLAND_HOPPING_ROOT "threshold" |
|---|
| 155 | #define FA_AWAR_GAP_A FA_AWAR_ISLAND_HOPPING_ROOT "gapa" |
|---|
| 156 | #define FA_AWAR_GAP_B FA_AWAR_ISLAND_HOPPING_ROOT "gapb" |
|---|
| 157 | #define FA_AWAR_GAP_C FA_AWAR_ISLAND_HOPPING_ROOT "gapc" |
|---|
| 158 | |
|---|
| 159 | // -------------------------------------------------------------------------------- |
|---|
| 160 | |
|---|
| 161 | static IslandHopping *island_hopper = NULp; // NULp -> use fast aligner; else use island hopper |
|---|
| 162 | |
|---|
| 163 | static GB_alignment_type global_alignmentType = GB_AT_UNKNOWN; // type of actually aligned sequence |
|---|
| 164 | |
|---|
| 165 | static int currentSequenceNumber; // used for counter |
|---|
| 166 | static int overallSequenceNumber; |
|---|
| 167 | |
|---|
| 168 | // -------------------------------------------------------------------------------- |
|---|
| 169 | |
|---|
| 170 | inline ARB_ERROR species_not_found(GB_CSTR species_name) { |
|---|
| 171 | return GBS_global_string("No species '%s' found!", species_name); |
|---|
| 172 | } |
|---|
| 173 | |
|---|
| 174 | static ARB_ERROR reverseComplement(GBDATA *gb_species, GB_CSTR ali, int max_protection) { |
|---|
| 175 | GBDATA *gbd = GBT_find_sequence(gb_species, ali); |
|---|
| 176 | ARB_ERROR error = NULp; |
|---|
| 177 | |
|---|
| 178 | if (!gbd) { |
|---|
| 179 | error = GBS_global_string("No 'data' found for species '%s'", GBT_get_name_or_description(gb_species)); |
|---|
| 180 | } |
|---|
| 181 | else { |
|---|
| 182 | int my_protection = GB_read_security_write(gbd); |
|---|
| 183 | |
|---|
| 184 | if (my_protection<=max_protection) { // ok |
|---|
| 185 | char *seq = GB_read_string(gbd); |
|---|
| 186 | int length = GB_read_string_count(gbd); |
|---|
| 187 | GBDATA *gb_main = GB_get_root(gb_species); |
|---|
| 188 | |
|---|
| 189 | GB_alignment_type ali_type = GBT_get_alignment_type(gb_main, ali); |
|---|
| 190 | fa_assert(ali_type != GB_AT_UNKNOWN); |
|---|
| 191 | |
|---|
| 192 | char T_or_U; |
|---|
| 193 | error = GBT_determine_T_or_U(ali_type, &T_or_U, "reverse-complement"); |
|---|
| 194 | if (!error) { |
|---|
| 195 | GBT_reverseComplementNucSequence(seq, length, T_or_U); |
|---|
| 196 | error = GB_write_string(gbd, seq); |
|---|
| 197 | } |
|---|
| 198 | } |
|---|
| 199 | else { // protection error |
|---|
| 200 | error = GBS_global_string("Cannot reverse-complement species '%s' because of protection level", GBT_get_name_or_description(gb_species)); |
|---|
| 201 | } |
|---|
| 202 | |
|---|
| 203 | } |
|---|
| 204 | |
|---|
| 205 | return error; |
|---|
| 206 | } |
|---|
| 207 | |
|---|
| 208 | static void build_reverse_complement(AW_window *aw, const AlignDataAccess *data_access) { |
|---|
| 209 | GBDATA *gb_main = data_access->gb_main; |
|---|
| 210 | |
|---|
| 211 | GB_push_transaction(gb_main); |
|---|
| 212 | |
|---|
| 213 | AW_root *root = aw->get_root(); |
|---|
| 214 | FA_alignTarget revComplWhat = static_cast<FA_alignTarget>(root->awar(FA_AWAR_TO_ALIGN)->read_int()); |
|---|
| 215 | GB_CSTR alignment = data_access->alignment_name.c_str(); |
|---|
| 216 | ARB_ERROR error = NULp; |
|---|
| 217 | int max_protection = root->awar(FA_AWAR_PROTECTION)->read_int(); |
|---|
| 218 | |
|---|
| 219 | switch (revComplWhat) { |
|---|
| 220 | case FA_CURRENT: { // current species |
|---|
| 221 | GB_CSTR species_name = root->awar(AWAR_SPECIES_NAME)->read_string(); |
|---|
| 222 | GBDATA *gb_species = GBT_find_species(gb_main, species_name); |
|---|
| 223 | if (!gb_species) error = species_not_found(species_name); |
|---|
| 224 | if (!error) error = reverseComplement(gb_species, alignment, max_protection); |
|---|
| 225 | break; |
|---|
| 226 | } |
|---|
| 227 | case FA_MARKED: { // marked species |
|---|
| 228 | GBDATA *gb_species = GBT_first_marked_species(gb_main); |
|---|
| 229 | |
|---|
| 230 | if (!gb_species) { |
|---|
| 231 | error = "There is no marked species"; |
|---|
| 232 | } |
|---|
| 233 | while (gb_species) { |
|---|
| 234 | error = reverseComplement(gb_species, alignment, max_protection); |
|---|
| 235 | if (error) break; |
|---|
| 236 | gb_species = GBT_next_marked_species(gb_species); |
|---|
| 237 | } |
|---|
| 238 | break; |
|---|
| 239 | } |
|---|
| 240 | case FA_SELECTED: { // selected species (editor selection!) |
|---|
| 241 | |
|---|
| 242 | Aligner_get_first_selected_species get_first_selected_species = data_access->get_first_selected_species; |
|---|
| 243 | Aligner_get_next_selected_species get_next_selected_species = data_access->get_next_selected_species; |
|---|
| 244 | |
|---|
| 245 | int count = 0; |
|---|
| 246 | GBDATA *gb_species = get_first_selected_species(&count); |
|---|
| 247 | |
|---|
| 248 | if (!gb_species) { |
|---|
| 249 | error = "There is no selected species!"; |
|---|
| 250 | } |
|---|
| 251 | while (gb_species) { |
|---|
| 252 | error = reverseComplement(gb_species, alignment, max_protection); |
|---|
| 253 | if (error) break; |
|---|
| 254 | gb_species = get_next_selected_species(); |
|---|
| 255 | } |
|---|
| 256 | break; |
|---|
| 257 | } |
|---|
| 258 | default: { |
|---|
| 259 | fa_assert(0); |
|---|
| 260 | break; |
|---|
| 261 | } |
|---|
| 262 | } |
|---|
| 263 | |
|---|
| 264 | GB_end_transaction_show_error(gb_main, error, aw_message); |
|---|
| 265 | } |
|---|
| 266 | |
|---|
| 267 | // -------------------------------------------------------------------------------- |
|---|
| 268 | |
|---|
| 269 | class AliChange { // describes a local alignment change |
|---|
| 270 | const CompactedSubSequence& Old; |
|---|
| 271 | const CompactedSubSequence& New; |
|---|
| 272 | |
|---|
| 273 | public: |
|---|
| 274 | AliChange(const CompactedSubSequence& old_, const CompactedSubSequence& new_) |
|---|
| 275 | : Old(old_), New(new_) |
|---|
| 276 | { |
|---|
| 277 | fa_assert(Old.may_refer_to_same_part_as(New)); |
|---|
| 278 | } |
|---|
| 279 | |
|---|
| 280 | int follow(ExplicitRange& range) const { |
|---|
| 281 | ExplicitRange compressed(Old.compPosition(range.start()), |
|---|
| 282 | Old.compPosition(range.end())); |
|---|
| 283 | |
|---|
| 284 | int exp_start = New.expdPosition(compressed.start()); |
|---|
| 285 | int exp_end = New.expdPosition(compressed.end()); |
|---|
| 286 | |
|---|
| 287 | int gaps_before = New.no_of_gaps_before(compressed.start()); |
|---|
| 288 | int gaps_after = New.no_of_gaps_after(compressed.end()); |
|---|
| 289 | |
|---|
| 290 | fa_assert(gaps_before >= 0); |
|---|
| 291 | fa_assert(gaps_after >= 0); |
|---|
| 292 | |
|---|
| 293 | range = ExplicitRange(exp_start-gaps_before, exp_end+gaps_after); |
|---|
| 294 | |
|---|
| 295 | return compressed.size(); // number of bases |
|---|
| 296 | } |
|---|
| 297 | }; |
|---|
| 298 | |
|---|
| 299 | class LooseBases { |
|---|
| 300 | typedef std::list<ExplicitRange> Ranges; |
|---|
| 301 | |
|---|
| 302 | Ranges ranges; |
|---|
| 303 | |
|---|
| 304 | public: |
|---|
| 305 | |
|---|
| 306 | bool is_empty() const { return ranges.empty(); } |
|---|
| 307 | void clear() { ranges.clear(); } |
|---|
| 308 | |
|---|
| 309 | void memorize(ExplicitRange range) { |
|---|
| 310 | ranges.push_front(range); |
|---|
| 311 | } |
|---|
| 312 | ExplicitRange recall() { |
|---|
| 313 | ExplicitRange range = ranges.front(); |
|---|
| 314 | ranges.pop_front(); |
|---|
| 315 | return range; |
|---|
| 316 | } |
|---|
| 317 | int follow_ali_change(const AliChange& change) { |
|---|
| 318 | // transform positions according to changed alignment (oldSequence -> newSequence) |
|---|
| 319 | // returns the number of bases contained in 'this' |
|---|
| 320 | int basecount = 0; |
|---|
| 321 | if (!is_empty()) { |
|---|
| 322 | for (Ranges::iterator r = ranges.begin(); r != ranges.end(); ++r) { |
|---|
| 323 | basecount += change.follow(*r); |
|---|
| 324 | } |
|---|
| 325 | } |
|---|
| 326 | return basecount; |
|---|
| 327 | } |
|---|
| 328 | void append(LooseBases& loose) { ranges.splice(ranges.end(), loose.ranges); } |
|---|
| 329 | int follow_ali_change_and_append(LooseBases& loose, const AliChange& change) { |
|---|
| 330 | // returns the number of loose bases (added from 'loose') |
|---|
| 331 | int basecount = loose.follow_ali_change(change); |
|---|
| 332 | append(loose); |
|---|
| 333 | return basecount; |
|---|
| 334 | } |
|---|
| 335 | }; |
|---|
| 336 | |
|---|
| 337 | static LooseBases unaligned_bases; // if fast_align cannot align (no master bases) -> stores positions here |
|---|
| 338 | |
|---|
| 339 | |
|---|
| 340 | static const char *read_name(GBDATA *gbd) { |
|---|
| 341 | if (!gbd) return "GROUP-CONSENSUS"; |
|---|
| 342 | const char *name = GBT_get_name(gbd); |
|---|
| 343 | return name ? name : "<unnamed-species>"; |
|---|
| 344 | } |
|---|
| 345 | |
|---|
| 346 | inline int relatedBases(char base1, char base2) { |
|---|
| 347 | return baseMatch(base1, base2)==1; |
|---|
| 348 | } |
|---|
| 349 | |
|---|
| 350 | inline char alignQuality(char slave, char master) { |
|---|
| 351 | fa_assert(slave); |
|---|
| 352 | fa_assert(master); |
|---|
| 353 | |
|---|
| 354 | char result = '#'; |
|---|
| 355 | if (slave==master) result = '-'; // equal |
|---|
| 356 | else if (slave==GAP_CHAR) result = '+'; // inserted gap |
|---|
| 357 | else if (master==GAP_CHAR) result = '+'; // no gap in master |
|---|
| 358 | else if (relatedBases(slave, master)) result = '~'; // mutation (related bases) |
|---|
| 359 | |
|---|
| 360 | return result; // mutation (non-related bases) |
|---|
| 361 | } |
|---|
| 362 | |
|---|
| 363 | // ------------------------- |
|---|
| 364 | // Debugging stuff |
|---|
| 365 | |
|---|
| 366 | #ifdef DEBUG |
|---|
| 367 | static char *lstr(const char *s, int len) { |
|---|
| 368 | static int alloc; |
|---|
| 369 | static char *buffer; |
|---|
| 370 | |
|---|
| 371 | if (alloc<(len+1)) { |
|---|
| 372 | if (alloc) free(buffer); |
|---|
| 373 | ARB_alloc(buffer, alloc=len+100); |
|---|
| 374 | } |
|---|
| 375 | |
|---|
| 376 | memcpy(buffer, s, len); |
|---|
| 377 | buffer[len] = 0; |
|---|
| 378 | |
|---|
| 379 | return buffer; |
|---|
| 380 | } |
|---|
| 381 | |
|---|
| 382 | #define BUFLEN 120 |
|---|
| 383 | |
|---|
| 384 | inline char compareChar(char base1, char base2) { |
|---|
| 385 | return base1==base2 ? '=' : (relatedBases(base1, base2) ? 'x' : 'X'); |
|---|
| 386 | } |
|---|
| 387 | |
|---|
| 388 | #if defined(TRACE_COMPRESSED_ALIGNMENT) |
|---|
| 389 | |
|---|
| 390 | static void dump_n_compare_one(const char *seq1, const char *seq2, long len, long offset) { |
|---|
| 391 | fa_assert(len<=BUFLEN); |
|---|
| 392 | char compare[BUFLEN+1]; |
|---|
| 393 | |
|---|
| 394 | for (long l=0; l<len; l++) { |
|---|
| 395 | compare[l] = (is_ali_gap(seq1[l]) && is_ali_gap(seq2[l])) ? ' ' : compareChar(seq1[l], seq2[l]); |
|---|
| 396 | } |
|---|
| 397 | |
|---|
| 398 | compare[len] = 0; |
|---|
| 399 | |
|---|
| 400 | printf(" %li '%s'\n", offset, lstr(seq1, len)); |
|---|
| 401 | printf(" %li '%s'\n", offset, lstr(seq2, len)); |
|---|
| 402 | printf(" %li '%s'\n", offset, compare); |
|---|
| 403 | } |
|---|
| 404 | |
|---|
| 405 | inline void dump_rest(const char *seq, long len, int idx, long offset) { |
|---|
| 406 | printf(" Rest von Sequenz %i:\n", idx); |
|---|
| 407 | while (len>BUFLEN) { |
|---|
| 408 | printf(" %li '%s'\n", offset, lstr(seq, BUFLEN)); |
|---|
| 409 | seq += BUFLEN; |
|---|
| 410 | len -= BUFLEN; |
|---|
| 411 | offset += BUFLEN; |
|---|
| 412 | } |
|---|
| 413 | |
|---|
| 414 | fa_assert(len>0); |
|---|
| 415 | printf(" '%s'\n", lstr(seq, len)); |
|---|
| 416 | } |
|---|
| 417 | |
|---|
| 418 | static void dump_n_compare(const char *text, const char *seq1, long len1, const char *seq2, long len2) { |
|---|
| 419 | long offset = 0; |
|---|
| 420 | |
|---|
| 421 | printf(" Comparing %s:\n", text); |
|---|
| 422 | |
|---|
| 423 | while (len1>0 && len2>0) { |
|---|
| 424 | long done = 0; |
|---|
| 425 | |
|---|
| 426 | if (len1>=BUFLEN && len2>=BUFLEN) { |
|---|
| 427 | dump_n_compare_one(seq1, seq2, done=BUFLEN, offset); |
|---|
| 428 | } |
|---|
| 429 | else { |
|---|
| 430 | long min = len1<len2 ? len1 : len2; |
|---|
| 431 | dump_n_compare_one(seq1, seq2, done=min, offset); |
|---|
| 432 | } |
|---|
| 433 | |
|---|
| 434 | seq1 += done; |
|---|
| 435 | seq2 += done; |
|---|
| 436 | len1 -= done; |
|---|
| 437 | len2 -= done; |
|---|
| 438 | offset += done; |
|---|
| 439 | } |
|---|
| 440 | |
|---|
| 441 | if (len1>0) dump_rest(seq1, len1, 1, offset); |
|---|
| 442 | if (len2>0) dump_rest(seq2, len2, 2, offset); |
|---|
| 443 | printf(" -------------------\n"); |
|---|
| 444 | } |
|---|
| 445 | |
|---|
| 446 | static void dump_n_compare(const char *text, const CompactedSubSequence& seq1, const CompactedSubSequence& seq2) { |
|---|
| 447 | dump_n_compare(text, seq1.text(), seq1.length(), seq2.text(), seq2.length()); |
|---|
| 448 | } |
|---|
| 449 | #endif // TRACE_COMPRESSED_ALIGNMENT |
|---|
| 450 | |
|---|
| 451 | #undef BUFLEN |
|---|
| 452 | |
|---|
| 453 | inline void dumpSeq(const char *seq, long len, long pos) { |
|---|
| 454 | printf("'%s' ", lstr(seq, len)); |
|---|
| 455 | printf("(Pos=%li,Len=%li)", pos, len); |
|---|
| 456 | } |
|---|
| 457 | |
|---|
| 458 | #define dump() \ |
|---|
| 459 | do { \ |
|---|
| 460 | double sig = partSignificance(sequence().length(), slaveSequence.length(), bestLength); \ |
|---|
| 461 | \ |
|---|
| 462 | printf(" Score = %li (Significance=%f)\n" \ |
|---|
| 463 | " Master = ", bestScore, sig); \ |
|---|
| 464 | dumpSeq(bestMasterLeft.text(), bestLength, bestMasterLeft.leftOf()); \ |
|---|
| 465 | printf("\n" \ |
|---|
| 466 | " Slave = "); \ |
|---|
| 467 | dumpSeq(bestSlaveLeft.text(), bestLength, bestSlaveLeft.leftOf()); \ |
|---|
| 468 | printf("\n"); \ |
|---|
| 469 | } while (0) |
|---|
| 470 | |
|---|
| 471 | #endif //DEBUG |
|---|
| 472 | |
|---|
| 473 | |
|---|
| 474 | // ------------------------------------------------ |
|---|
| 475 | // INLINE-functions used in fast_align(): |
|---|
| 476 | |
|---|
| 477 | inline double log3(double d) { |
|---|
| 478 | return log(d)/log(3.0); |
|---|
| 479 | } |
|---|
| 480 | inline double partSignificance(long seq1len, long seq2len, long partlen) { |
|---|
| 481 | // returns log3 of significance of the part |
|---|
| 482 | // usage: partSignificance(...) < log3(maxAllowedSignificance) |
|---|
| 483 | return log3((seq1len-partlen)*(seq2len-partlen)) - partlen; |
|---|
| 484 | } |
|---|
| 485 | |
|---|
| 486 | inline ARB_ERROR bufferTooSmall() { |
|---|
| 487 | return "Cannot align - reserved buffer is to small"; |
|---|
| 488 | } |
|---|
| 489 | |
|---|
| 490 | inline long insertsToNextBase(AlignBuffer *alignBuffer, const SequencePosition& master) { |
|---|
| 491 | int inserts; |
|---|
| 492 | int nextBase; |
|---|
| 493 | |
|---|
| 494 | if (master.rightOf()>0) { |
|---|
| 495 | nextBase = master.expdPosition(); |
|---|
| 496 | } |
|---|
| 497 | else { |
|---|
| 498 | nextBase = master.sequence().expdPosition(master.sequence().length()); |
|---|
| 499 | } |
|---|
| 500 | inserts = nextBase-alignBuffer->offset(); |
|---|
| 501 | |
|---|
| 502 | return inserts; |
|---|
| 503 | } |
|---|
| 504 | |
|---|
| 505 | inline void insertBase(AlignBuffer *alignBuffer, |
|---|
| 506 | SequencePosition& master, SequencePosition& slave, |
|---|
| 507 | FastAlignReport *report) |
|---|
| 508 | { |
|---|
| 509 | char slaveBase = *slave.text(); |
|---|
| 510 | char masterBase = *master.text(); |
|---|
| 511 | |
|---|
| 512 | alignBuffer->set(slaveBase, alignQuality(slaveBase, masterBase)); |
|---|
| 513 | report->count_aligned_base(slaveBase!=masterBase); |
|---|
| 514 | ++slave; |
|---|
| 515 | ++master; |
|---|
| 516 | } |
|---|
| 517 | |
|---|
| 518 | inline void insertSlaveBases(AlignBuffer *alignBuffer, |
|---|
| 519 | SequencePosition& slave, |
|---|
| 520 | int length, |
|---|
| 521 | FastAlignReport *report) |
|---|
| 522 | { |
|---|
| 523 | alignBuffer->copy(slave.text(), alignQuality(*slave.text(), GAP_CHAR), length); |
|---|
| 524 | report->count_unaligned_base(length); |
|---|
| 525 | slave += length; |
|---|
| 526 | } |
|---|
| 527 | |
|---|
| 528 | inline void insertGap(AlignBuffer *alignBuffer, |
|---|
| 529 | SequencePosition& master, |
|---|
| 530 | FastAlignReport *report) |
|---|
| 531 | { |
|---|
| 532 | char masterBase = *master.text(); |
|---|
| 533 | |
|---|
| 534 | alignBuffer->set(GAP_CHAR, alignQuality(GAP_CHAR, masterBase)); |
|---|
| 535 | report->count_aligned_base(masterBase!=GAP_CHAR); |
|---|
| 536 | ++master; |
|---|
| 537 | } |
|---|
| 538 | |
|---|
| 539 | static ARB_ERROR insertClustalValigned(AlignBuffer *alignBuffer, |
|---|
| 540 | SequencePosition& master, |
|---|
| 541 | SequencePosition& slave, |
|---|
| 542 | const char *masterAlignment, const char *slaveAlignment, long alignmentLength, |
|---|
| 543 | FastAlignReport *report) |
|---|
| 544 | { |
|---|
| 545 | // inserts bases of 'slave' to 'alignBuffer' according to alignment in 'masterAlignment' and 'slaveAlignment' |
|---|
| 546 | #define ACID '*' // contents of 'masterAlignment' and 'slaveAlignment' |
|---|
| 547 | #define GAP '-' |
|---|
| 548 | |
|---|
| 549 | int pos; |
|---|
| 550 | int baseAtLeft = 0; // TRUE -> last position in alignBuffer contains a base |
|---|
| 551 | |
|---|
| 552 | for (pos=0; pos<alignmentLength; pos++) { |
|---|
| 553 | long insert = insertsToNextBase(alignBuffer, master); // in expanded seq |
|---|
| 554 | |
|---|
| 555 | if (masterAlignment[pos]==ACID) { |
|---|
| 556 | if (insert>0) { |
|---|
| 557 | if (insert>alignBuffer->free()) return bufferTooSmall(); |
|---|
| 558 | alignBuffer->set(GAP_CHAR, alignQuality(GAP_CHAR, GAP_CHAR), insert); |
|---|
| 559 | fa_assert(insertsToNextBase(alignBuffer, master)==0); |
|---|
| 560 | insert = 0; |
|---|
| 561 | } |
|---|
| 562 | |
|---|
| 563 | if (!alignBuffer->free()) return bufferTooSmall(); |
|---|
| 564 | if (slaveAlignment[pos]==ACID) { |
|---|
| 565 | insertBase(alignBuffer, master, slave, report); |
|---|
| 566 | baseAtLeft = 1; |
|---|
| 567 | } |
|---|
| 568 | else { |
|---|
| 569 | insertGap(alignBuffer, master, report); |
|---|
| 570 | baseAtLeft = 0; |
|---|
| 571 | } |
|---|
| 572 | } |
|---|
| 573 | else { |
|---|
| 574 | int slave_bases; |
|---|
| 575 | |
|---|
| 576 | fa_assert(masterAlignment[pos]==GAP); |
|---|
| 577 | for (slave_bases=1; pos+slave_bases<alignmentLength && masterAlignment[pos+slave_bases]==GAP; slave_bases++) { |
|---|
| 578 | ; // count gaps in master (= # of slave bases to insert) |
|---|
| 579 | } |
|---|
| 580 | if (!baseAtLeft && insert>slave_bases) { |
|---|
| 581 | int ins_gaps = insert-slave_bases; |
|---|
| 582 | |
|---|
| 583 | fa_assert(alignBuffer->free()>=ins_gaps); |
|---|
| 584 | alignBuffer->set(GAP_CHAR, alignQuality(GAP_CHAR, GAP_CHAR), ins_gaps); |
|---|
| 585 | insert -= ins_gaps; |
|---|
| 586 | } |
|---|
| 587 | |
|---|
| 588 | if (insert<slave_bases) { // master has less gaps than slave bases to insert |
|---|
| 589 | report->memorize_insertion(master.expdPosition(), slave_bases-insert); |
|---|
| 590 | } |
|---|
| 591 | else if (insert>slave_bases) { // master has more gaps than slave bases to insert |
|---|
| 592 | fa_assert(baseAtLeft); |
|---|
| 593 | } |
|---|
| 594 | |
|---|
| 595 | unaligned_bases.memorize(ExplicitRange(slave.expdPosition(), // memorize base positions without counterpart in master |
|---|
| 596 | slave.expdPosition(slave_bases-1))); |
|---|
| 597 | |
|---|
| 598 | if (slave_bases>alignBuffer->free()) return bufferTooSmall(); |
|---|
| 599 | insertSlaveBases(alignBuffer, slave, slave_bases, report); |
|---|
| 600 | pos += slave_bases-1; // -1 compensates for()-increment |
|---|
| 601 | baseAtLeft = 1; |
|---|
| 602 | } |
|---|
| 603 | } |
|---|
| 604 | |
|---|
| 605 | return NULp; |
|---|
| 606 | |
|---|
| 607 | #undef GAP |
|---|
| 608 | #undef ACID |
|---|
| 609 | } |
|---|
| 610 | |
|---|
| 611 | static ARB_ERROR insertAligned(AlignBuffer *alignBuffer, |
|---|
| 612 | SequencePosition& master, SequencePosition& slave, long partLength, |
|---|
| 613 | FastAlignReport *report) |
|---|
| 614 | { |
|---|
| 615 | // insert bases of 'slave' to 'alignBuffer' according to 'master' |
|---|
| 616 | if (partLength) { |
|---|
| 617 | long insert = insertsToNextBase(alignBuffer, master); |
|---|
| 618 | |
|---|
| 619 | fa_assert(partLength>=0); |
|---|
| 620 | |
|---|
| 621 | if (insert<0) { // insert gaps into master |
|---|
| 622 | long min_insert = insert; |
|---|
| 623 | |
|---|
| 624 | report->memorize_insertion(master.expdPosition(), -insert); |
|---|
| 625 | |
|---|
| 626 | while (insert<0 && partLength) { |
|---|
| 627 | if (insert<min_insert) min_insert = insert; |
|---|
| 628 | if (!alignBuffer->free()) { |
|---|
| 629 | return bufferTooSmall(); |
|---|
| 630 | } |
|---|
| 631 | insertBase(alignBuffer, master, slave, report); |
|---|
| 632 | partLength--; |
|---|
| 633 | insert = insertsToNextBase(alignBuffer, master); |
|---|
| 634 | } |
|---|
| 635 | |
|---|
| 636 | fa_assert(partLength>=0); |
|---|
| 637 | if (partLength==0) { // all inserted |
|---|
| 638 | return NULp; |
|---|
| 639 | } |
|---|
| 640 | } |
|---|
| 641 | |
|---|
| 642 | fa_assert(insert>=0); |
|---|
| 643 | |
|---|
| 644 | if (insert>0) { // insert gaps into slave |
|---|
| 645 | if (insert>alignBuffer->free()) return bufferTooSmall(); |
|---|
| 646 | alignBuffer->set(GAP_CHAR, alignQuality(GAP_CHAR, GAP_CHAR), insert); |
|---|
| 647 | fa_assert(insertsToNextBase(alignBuffer, master)==0); |
|---|
| 648 | } |
|---|
| 649 | |
|---|
| 650 | fa_assert(partLength>=0); |
|---|
| 651 | |
|---|
| 652 | while (partLength--) { |
|---|
| 653 | insert = insertsToNextBase(alignBuffer, master); |
|---|
| 654 | |
|---|
| 655 | fa_assert(insert>=0); |
|---|
| 656 | if (insert>0) { |
|---|
| 657 | if (insert>=alignBuffer->free()) return bufferTooSmall(); |
|---|
| 658 | alignBuffer->set(GAP_CHAR, alignQuality(GAP_CHAR, GAP_CHAR), insert); |
|---|
| 659 | } |
|---|
| 660 | else { |
|---|
| 661 | if (!alignBuffer->free()) { |
|---|
| 662 | return bufferTooSmall(); |
|---|
| 663 | } |
|---|
| 664 | } |
|---|
| 665 | |
|---|
| 666 | insertBase(alignBuffer, master, slave, report); |
|---|
| 667 | } |
|---|
| 668 | } |
|---|
| 669 | |
|---|
| 670 | return NULp; |
|---|
| 671 | } |
|---|
| 672 | |
|---|
| 673 | static ARB_ERROR cannot_fast_align(const CompactedSubSequence& master, long moffset, long mlength, |
|---|
| 674 | const CompactedSubSequence& slaveSequence, long soffset, long slength, |
|---|
| 675 | int max_seq_length, |
|---|
| 676 | AlignBuffer *alignBuffer, |
|---|
| 677 | FastAlignReport *report) |
|---|
| 678 | { |
|---|
| 679 | const char *mtext = master.text(moffset); |
|---|
| 680 | const char *stext = slaveSequence.text(soffset); |
|---|
| 681 | ARB_ERROR error = NULp; |
|---|
| 682 | |
|---|
| 683 | if (slength) { |
|---|
| 684 | if (mlength) { // if slave- and master-sequences contain bases, we call the slow aligner |
|---|
| 685 | #ifdef TRACE_CLUSTAL_DATA |
|---|
| 686 | printf("ClustalV-Align:\n"); |
|---|
| 687 | printf(" mseq = '%s'\n", lstr(mtext, mlength)); |
|---|
| 688 | printf(" sseq = '%s'\n", lstr(stext, slength)); |
|---|
| 689 | #endif // TRACE_CLUSTAL_DATA |
|---|
| 690 | |
|---|
| 691 | int is_dna = -1; |
|---|
| 692 | |
|---|
| 693 | switch (global_alignmentType) { |
|---|
| 694 | case GB_AT_RNA: |
|---|
| 695 | case GB_AT_DNA: is_dna = 1; break; |
|---|
| 696 | case GB_AT_AA: is_dna = 0; break; |
|---|
| 697 | default: error = "Unknown alignment type - aligner aborted"; break; |
|---|
| 698 | } |
|---|
| 699 | |
|---|
| 700 | const char *maligned, *saligned; |
|---|
| 701 | int len; |
|---|
| 702 | if (!error) { |
|---|
| 703 | int score; // unused |
|---|
| 704 | error = ClustalV_align(is_dna, |
|---|
| 705 | 1, |
|---|
| 706 | mtext, mlength, stext, slength, |
|---|
| 707 | master.gapsBefore(moffset), |
|---|
| 708 | max_seq_length, |
|---|
| 709 | maligned, saligned, len, score); |
|---|
| 710 | } |
|---|
| 711 | |
|---|
| 712 | if (!error) { |
|---|
| 713 | #ifdef TRACE_CLUSTAL_DATA |
|---|
| 714 | printf("ClustalV returns:\n"); |
|---|
| 715 | printf(" maligned = '%s'\n", lstr(maligned, len)); |
|---|
| 716 | printf(" saligned = '%s'\n", lstr(saligned, len)); |
|---|
| 717 | #endif // TRACE_CLUSTAL_DATA |
|---|
| 718 | |
|---|
| 719 | SequencePosition masterPos(master, moffset); |
|---|
| 720 | SequencePosition slavePos(slaveSequence, soffset); |
|---|
| 721 | |
|---|
| 722 | error = insertClustalValigned(alignBuffer, masterPos, slavePos, maligned, saligned, len, report); |
|---|
| 723 | |
|---|
| 724 | #if (defined(DEBUG) && 0) |
|---|
| 725 | |
|---|
| 726 | SequencePosition master2(master->sequence(), moffset); |
|---|
| 727 | SequencePosition slave2(slaveSequence, soffset); |
|---|
| 728 | char *cmp = new char[len]; |
|---|
| 729 | |
|---|
| 730 | for (int l=0; l<len; l++) { |
|---|
| 731 | int gaps = 0; |
|---|
| 732 | |
|---|
| 733 | if (maligned[l]=='*') { |
|---|
| 734 | maligned[l] = *master2.text(); |
|---|
| 735 | ++master2; |
|---|
| 736 | } |
|---|
| 737 | else { |
|---|
| 738 | gaps++; |
|---|
| 739 | } |
|---|
| 740 | |
|---|
| 741 | if (saligned[l]=='*') { |
|---|
| 742 | saligned[l] = *slave2.text(); |
|---|
| 743 | ++slave2; |
|---|
| 744 | } |
|---|
| 745 | else { |
|---|
| 746 | gaps++; |
|---|
| 747 | } |
|---|
| 748 | |
|---|
| 749 | cmp[l] = gaps || maligned[l]==saligned[l] ? '=' : 'X'; |
|---|
| 750 | } |
|---|
| 751 | |
|---|
| 752 | printf(" master = '%s'\n", lstr(maligned, len)); |
|---|
| 753 | printf(" slave = '%s'\n", lstr(saligned, len)); |
|---|
| 754 | printf(" '%s'\n", lstr(cmp, len)); |
|---|
| 755 | |
|---|
| 756 | delete [] cmp; |
|---|
| 757 | #endif |
|---|
| 758 | } |
|---|
| 759 | } |
|---|
| 760 | else { // master is empty here, so we just copy in the slave bases |
|---|
| 761 | if (slength<=alignBuffer->free()) { |
|---|
| 762 | unaligned_bases.memorize(ExplicitRange(slaveSequence.expdPosition(soffset), |
|---|
| 763 | slaveSequence.expdPosition(soffset+slength-1))); |
|---|
| 764 | |
|---|
| 765 | alignBuffer->copy(slaveSequence.text(soffset), '?', slength); // '?' means not aligned (just inserted) |
|---|
| 766 | // corrected by at alignBuffer->correctUnalignedPositions() |
|---|
| 767 | report->count_unaligned_base(slength); |
|---|
| 768 | } |
|---|
| 769 | else { |
|---|
| 770 | error = bufferTooSmall(); |
|---|
| 771 | } |
|---|
| 772 | } |
|---|
| 773 | } |
|---|
| 774 | |
|---|
| 775 | return error; |
|---|
| 776 | } |
|---|
| 777 | |
|---|
| 778 | // ------------------------------------ |
|---|
| 779 | // #define's for fast_align() |
|---|
| 780 | |
|---|
| 781 | #define TEST_BETTER_SCORE() \ |
|---|
| 782 | do { \ |
|---|
| 783 | if (score>bestScore) { \ |
|---|
| 784 | bestScore = score; \ |
|---|
| 785 | bestLength = masterRight.text() - masterLeft.text(); \ |
|---|
| 786 | bestMasterLeft = masterLeft; \ |
|---|
| 787 | bestSlaveLeft = slaveLeft; \ |
|---|
| 788 | } \ |
|---|
| 789 | } while (0) |
|---|
| 790 | |
|---|
| 791 | #define CAN_SCORE_LEFT() (masterLeft.leftOf() && slaveLeft.leftOf()) |
|---|
| 792 | #define CAN_SCORE_RIGHT() (masterRight.rightOf() && slaveRight.rightOf()) |
|---|
| 793 | |
|---|
| 794 | #define SCORE_LEFT() \ |
|---|
| 795 | do { \ |
|---|
| 796 | score += *(--masterLeft).text()==*(--slaveLeft).text() ? match : mismatch; \ |
|---|
| 797 | TEST_BETTER_SCORE(); \ |
|---|
| 798 | } while (0) |
|---|
| 799 | |
|---|
| 800 | #define SCORE_RIGHT() \ |
|---|
| 801 | do { \ |
|---|
| 802 | score += *(++masterRight).text()==*(++slaveRight).text() ? match : mismatch; \ |
|---|
| 803 | TEST_BETTER_SCORE(); \ |
|---|
| 804 | } while (0) |
|---|
| 805 | |
|---|
| 806 | |
|---|
| 807 | ARB_ERROR FastSearchSequence::fast_align(const CompactedSubSequence& slaveSequence, |
|---|
| 808 | AlignBuffer *alignBuffer, |
|---|
| 809 | int max_seq_length, |
|---|
| 810 | int match, int mismatch, |
|---|
| 811 | FastAlignReport *report) const |
|---|
| 812 | { |
|---|
| 813 | // aligns 'slaveSequence' to 'this' |
|---|
| 814 | // |
|---|
| 815 | // returns |
|---|
| 816 | // NULp -> all ok -> 'alignBuffer' contains aligned sequence |
|---|
| 817 | // or error -> failure -> no results |
|---|
| 818 | |
|---|
| 819 | ARB_ERROR error = NULp; |
|---|
| 820 | int aligned = 0; |
|---|
| 821 | |
|---|
| 822 | // set the following #if to zero to test ClustalV-Aligner (not fast_aligner) |
|---|
| 823 | #if 1 |
|---|
| 824 | |
|---|
| 825 | static double lowSignificance; |
|---|
| 826 | static int lowSignificanceInitialized; |
|---|
| 827 | |
|---|
| 828 | if (!lowSignificanceInitialized) { |
|---|
| 829 | lowSignificance = log3(0.01); |
|---|
| 830 | lowSignificanceInitialized = 1; |
|---|
| 831 | } |
|---|
| 832 | |
|---|
| 833 | SequencePosition slave(slaveSequence); |
|---|
| 834 | long bestScore=0; |
|---|
| 835 | SequencePosition bestMasterLeft(sequence()); |
|---|
| 836 | SequencePosition bestSlaveLeft(slaveSequence); |
|---|
| 837 | long bestLength=0; |
|---|
| 838 | |
|---|
| 839 | while (slave.rightOf()>=3) { // with all triples of slaveSequence |
|---|
| 840 | FastSearchOccurrence occurrence(*this, slave.text()); // "search" first |
|---|
| 841 | SequencePosition rightmostSlave = slave; |
|---|
| 842 | |
|---|
| 843 | while (occurrence.found()) { // with all occurrences of the triple |
|---|
| 844 | long score = match*3; |
|---|
| 845 | SequencePosition masterLeft(occurrence.sequence(), occurrence.offset()); |
|---|
| 846 | SequencePosition masterRight(occurrence.sequence(), occurrence.offset()+3); |
|---|
| 847 | SequencePosition slaveLeft(slave); |
|---|
| 848 | SequencePosition slaveRight(slave, 3); |
|---|
| 849 | |
|---|
| 850 | while (score>0) { |
|---|
| 851 | if (CAN_SCORE_LEFT()) { |
|---|
| 852 | SCORE_LEFT(); |
|---|
| 853 | } |
|---|
| 854 | else { |
|---|
| 855 | while (score>0 && CAN_SCORE_RIGHT()) { |
|---|
| 856 | SCORE_RIGHT(); |
|---|
| 857 | } |
|---|
| 858 | break; |
|---|
| 859 | } |
|---|
| 860 | |
|---|
| 861 | if (CAN_SCORE_RIGHT()) { |
|---|
| 862 | SCORE_RIGHT(); |
|---|
| 863 | } |
|---|
| 864 | else { |
|---|
| 865 | while (score>0 && CAN_SCORE_LEFT()) { |
|---|
| 866 | SCORE_LEFT(); |
|---|
| 867 | } |
|---|
| 868 | break; |
|---|
| 869 | } |
|---|
| 870 | } |
|---|
| 871 | |
|---|
| 872 | occurrence.gotoNext(); // "search" next |
|---|
| 873 | |
|---|
| 874 | if (rightmostSlave<slaveRight) { |
|---|
| 875 | rightmostSlave = slaveRight; |
|---|
| 876 | rightmostSlave -= 3; |
|---|
| 877 | } |
|---|
| 878 | } |
|---|
| 879 | |
|---|
| 880 | if (rightmostSlave>slave) slave = rightmostSlave; |
|---|
| 881 | else ++slave; |
|---|
| 882 | } |
|---|
| 883 | |
|---|
| 884 | if (bestLength) { |
|---|
| 885 | double sig = partSignificance(sequence().length(), slaveSequence.length(), bestLength); |
|---|
| 886 | |
|---|
| 887 | if (sig<lowSignificance) { |
|---|
| 888 | long masterLeftOf = bestMasterLeft.leftOf(); |
|---|
| 889 | long masterRightStart = masterLeftOf+bestLength; |
|---|
| 890 | long masterRightOf = bestMasterLeft.rightOf()-bestLength; |
|---|
| 891 | long slaveLeftOf = bestSlaveLeft.leftOf(); |
|---|
| 892 | long slaveRightStart = slaveLeftOf+bestLength; |
|---|
| 893 | long slaveRightOf = bestSlaveLeft.rightOf()-bestLength; |
|---|
| 894 | |
|---|
| 895 | #define MIN_ALIGNMENT_RANGE 4 |
|---|
| 896 | |
|---|
| 897 | if (!error) { |
|---|
| 898 | if (masterLeftOf >= MIN_ALIGNMENT_RANGE && slaveLeftOf >= MIN_ALIGNMENT_RANGE) { |
|---|
| 899 | CompactedSubSequence leftCompactedMaster(sequence(), 0, masterLeftOf); |
|---|
| 900 | FastSearchSequence leftMaster(leftCompactedMaster); |
|---|
| 901 | |
|---|
| 902 | error = leftMaster.fast_align(CompactedSubSequence(slaveSequence, 0, slaveLeftOf), |
|---|
| 903 | alignBuffer, max_seq_length, match, mismatch, report); |
|---|
| 904 | } |
|---|
| 905 | else if (slaveLeftOf>0) { |
|---|
| 906 | error = cannot_fast_align(sequence(), 0, masterLeftOf, |
|---|
| 907 | slaveSequence, 0, slaveLeftOf, |
|---|
| 908 | max_seq_length, alignBuffer, report); |
|---|
| 909 | } |
|---|
| 910 | |
|---|
| 911 | aligned = 1; |
|---|
| 912 | } |
|---|
| 913 | |
|---|
| 914 | // align part of slave sequence according to master sequence: |
|---|
| 915 | |
|---|
| 916 | if (!error) { |
|---|
| 917 | #if (defined(DEBUG) && 0) |
|---|
| 918 | long offset = alignBuffer->offset(); |
|---|
| 919 | long used; |
|---|
| 920 | #endif |
|---|
| 921 | error = insertAligned(alignBuffer, bestMasterLeft, bestSlaveLeft, bestLength, report); |
|---|
| 922 | #if (defined(DEBUG) && 0) |
|---|
| 923 | used = alignBuffer->offset()-offset; |
|---|
| 924 | printf("aligned '%s' (len=%li, address=%li)\n", lstr(alignBuffer->text()+offset, used), used, long(alignBuffer)); |
|---|
| 925 | #endif |
|---|
| 926 | aligned = 1; |
|---|
| 927 | } |
|---|
| 928 | |
|---|
| 929 | if (!error) { |
|---|
| 930 | if (masterRightOf >= MIN_ALIGNMENT_RANGE && slaveRightOf >= MIN_ALIGNMENT_RANGE) { |
|---|
| 931 | CompactedSubSequence rightCompactedMaster(sequence(), masterRightStart, masterRightOf); |
|---|
| 932 | FastSearchSequence rightMaster(rightCompactedMaster); |
|---|
| 933 | |
|---|
| 934 | error = rightMaster.fast_align(CompactedSubSequence(slaveSequence, slaveRightStart, slaveRightOf), |
|---|
| 935 | alignBuffer, |
|---|
| 936 | max_seq_length, match, mismatch, report); |
|---|
| 937 | } |
|---|
| 938 | else if (slaveRightOf>0) { |
|---|
| 939 | error = cannot_fast_align(sequence(), masterRightStart, masterRightOf, |
|---|
| 940 | slaveSequence, slaveRightStart, slaveRightOf, |
|---|
| 941 | max_seq_length, alignBuffer, report); |
|---|
| 942 | } |
|---|
| 943 | |
|---|
| 944 | aligned = 1; |
|---|
| 945 | } |
|---|
| 946 | |
|---|
| 947 | } |
|---|
| 948 | } |
|---|
| 949 | |
|---|
| 950 | #endif |
|---|
| 951 | |
|---|
| 952 | if (!aligned && !error) { |
|---|
| 953 | error = cannot_fast_align(sequence(), 0, sequence().length(), |
|---|
| 954 | slaveSequence, 0, slaveSequence.length(), |
|---|
| 955 | max_seq_length, alignBuffer, report); |
|---|
| 956 | } |
|---|
| 957 | |
|---|
| 958 | return error; |
|---|
| 959 | } |
|---|
| 960 | |
|---|
| 961 | #undef dump |
|---|
| 962 | #undef TEST_BETTER_SCORE |
|---|
| 963 | #undef CAN_SCORE_LEFT |
|---|
| 964 | #undef CAN_SCORE_RIGHT |
|---|
| 965 | #undef SCORE_LEFT |
|---|
| 966 | #undef SCORE_RIGHT |
|---|
| 967 | |
|---|
| 968 | inline long calcSequenceChecksum(const char *data, long length) { |
|---|
| 969 | return GB_checksum(data, length, 1, GAP_CHARS); |
|---|
| 970 | } |
|---|
| 971 | |
|---|
| 972 | static CompactedSubSequence *readCompactedSequence(GBDATA *gb_species, |
|---|
| 973 | const char *ali, |
|---|
| 974 | ARB_ERROR *errorPtr, |
|---|
| 975 | char **dataPtr, // if dataPtr is specified, it will be set to the WHOLE uncompacted sequence |
|---|
| 976 | long *seqChksum, // may be NULp (of part of sequence) |
|---|
| 977 | PosRange range) // if !range.is_whole() -> return only part of the sequence |
|---|
| 978 | { |
|---|
| 979 | ARB_ERROR error = NULp; |
|---|
| 980 | GBDATA *gbd; |
|---|
| 981 | CompactedSubSequence *seq = NULp; |
|---|
| 982 | |
|---|
| 983 | gbd = GBT_find_sequence(gb_species, ali); // get sequence |
|---|
| 984 | |
|---|
| 985 | if (gbd) { |
|---|
| 986 | long length = GB_read_string_count(gbd); |
|---|
| 987 | char *data = GB_read_string(gbd); |
|---|
| 988 | long partLength; |
|---|
| 989 | char *partData; |
|---|
| 990 | |
|---|
| 991 | if (dataPtr) { // make a copy of the whole uncompacted sequence (returned to caller) |
|---|
| 992 | *dataPtr = data; |
|---|
| 993 | } |
|---|
| 994 | |
|---|
| 995 | int firstColumn = range.start(); |
|---|
| 996 | if (range.is_part()) { // take only part of sequence |
|---|
| 997 | int lastColumn = range.end(); |
|---|
| 998 | |
|---|
| 999 | fa_assert(firstColumn>=0); |
|---|
| 1000 | fa_assert(firstColumn<=lastColumn); |
|---|
| 1001 | |
|---|
| 1002 | // include all surrounding gaps (@@@ this might cause problems with partial alignment) |
|---|
| 1003 | while (firstColumn>0 && is_ali_gap(data[firstColumn-1])) { |
|---|
| 1004 | firstColumn--; |
|---|
| 1005 | } |
|---|
| 1006 | if (lastColumn!=-1) { |
|---|
| 1007 | while (lastColumn<(length-1) && is_ali_gap(data[lastColumn+1])) lastColumn++; |
|---|
| 1008 | fa_assert(lastColumn<length); |
|---|
| 1009 | } |
|---|
| 1010 | |
|---|
| 1011 | partData = data+firstColumn; |
|---|
| 1012 | int slen = length-firstColumn; |
|---|
| 1013 | |
|---|
| 1014 | fa_assert(slen>=0); |
|---|
| 1015 | fa_assert((size_t)slen==strlen(partData)); |
|---|
| 1016 | |
|---|
| 1017 | if (lastColumn==-1) { // take all till end of sequence |
|---|
| 1018 | partLength = slen; |
|---|
| 1019 | } |
|---|
| 1020 | else { |
|---|
| 1021 | partLength = lastColumn-firstColumn+1; |
|---|
| 1022 | if (partLength>slen) partLength = slen; // cut rest, if we have any |
|---|
| 1023 | } |
|---|
| 1024 | } |
|---|
| 1025 | else { |
|---|
| 1026 | partLength = length; |
|---|
| 1027 | partData = data; |
|---|
| 1028 | } |
|---|
| 1029 | |
|---|
| 1030 | if (!error) { |
|---|
| 1031 | if (seqChksum) { |
|---|
| 1032 | *seqChksum = calcSequenceChecksum(partData, partLength); |
|---|
| 1033 | } |
|---|
| 1034 | |
|---|
| 1035 | seq = new CompactedSubSequence(partData, partLength, GBT_get_name_or_description(gb_species), firstColumn); |
|---|
| 1036 | } |
|---|
| 1037 | |
|---|
| 1038 | if (!dataPtr) { // free sequence only if user has not requested to get it |
|---|
| 1039 | free(data); |
|---|
| 1040 | } |
|---|
| 1041 | } |
|---|
| 1042 | else { |
|---|
| 1043 | error = GBS_global_string("No 'data' found for species '%s'", GBT_get_name_or_description(gb_species)); |
|---|
| 1044 | if (dataPtr) *dataPtr = NULp; // (user must not care to free data if we fail) |
|---|
| 1045 | } |
|---|
| 1046 | |
|---|
| 1047 | fa_assert(errorPtr); |
|---|
| 1048 | *errorPtr = error; |
|---|
| 1049 | |
|---|
| 1050 | return seq; |
|---|
| 1051 | } |
|---|
| 1052 | |
|---|
| 1053 | static ARB_ERROR writeStringToAlignment(GBDATA *gb_species, GB_CSTR alignment, GB_CSTR data_name, GB_CSTR str, bool temporary) { |
|---|
| 1054 | GBDATA *gb_ali = GB_search(gb_species, alignment, GB_DB); |
|---|
| 1055 | ARB_ERROR error = NULp; |
|---|
| 1056 | GBDATA *gb_name = GB_search(gb_ali, data_name, GB_STRING); |
|---|
| 1057 | |
|---|
| 1058 | if (gb_name) { |
|---|
| 1059 | fa_assert(GB_is_ancestor_of(gb_ali, gb_name)); |
|---|
| 1060 | error = GB_write_string(gb_name, str); |
|---|
| 1061 | if (temporary && !error) error = GB_set_temporary(gb_name); |
|---|
| 1062 | } |
|---|
| 1063 | else { |
|---|
| 1064 | error = GBS_global_string("Cannot create entry '%s' for '%s'", data_name, GBT_get_name_or_description(gb_species)); |
|---|
| 1065 | } |
|---|
| 1066 | |
|---|
| 1067 | return error; |
|---|
| 1068 | } |
|---|
| 1069 | |
|---|
| 1070 | // -------------------------------------------------------------------------------- |
|---|
| 1071 | |
|---|
| 1072 | static ARB_ERROR alignCompactedTo(CompactedSubSequence *toAlignSequence, |
|---|
| 1073 | const FastSearchSequence *alignTo, |
|---|
| 1074 | int max_seq_length, |
|---|
| 1075 | GB_CSTR alignment, |
|---|
| 1076 | long toAlignChksum, |
|---|
| 1077 | GBDATA *gb_toAlign, |
|---|
| 1078 | GBDATA *gb_alignTo, // may be NULp |
|---|
| 1079 | const AlignParams& ali_params) |
|---|
| 1080 | { |
|---|
| 1081 | // if only part of the sequence should be aligned, then this functions already gets only the part |
|---|
| 1082 | // (i.o.w.: toAlignSequence, alignTo and toAlignChksum refer to the partial sequence) |
|---|
| 1083 | AlignBuffer alignBuffer(max_seq_length); |
|---|
| 1084 | if (ali_params.range.start()>0) { |
|---|
| 1085 | alignBuffer.set(GAP_CHAR, alignQuality(GAP_CHAR, GAP_CHAR), ali_params.range.start()); |
|---|
| 1086 | } |
|---|
| 1087 | |
|---|
| 1088 | const char *master_name = read_name(gb_alignTo); |
|---|
| 1089 | |
|---|
| 1090 | FastAlignReport report(master_name, ali_params.showGapsMessages); |
|---|
| 1091 | |
|---|
| 1092 | { |
|---|
| 1093 | static GBDATA *last_gb_toAlign = NULp; |
|---|
| 1094 | if (gb_toAlign!=last_gb_toAlign) { |
|---|
| 1095 | last_gb_toAlign = gb_toAlign; |
|---|
| 1096 | currentSequenceNumber++; |
|---|
| 1097 | } |
|---|
| 1098 | } |
|---|
| 1099 | |
|---|
| 1100 | #ifdef TRACE_COMPRESSED_ALIGNMENT |
|---|
| 1101 | printf("alignCompactedTo(): master='%s' ", master_name); |
|---|
| 1102 | printf("slave='%s'\n", toAlignSequence->name()); |
|---|
| 1103 | #endif // TRACE_COMPRESSED_ALIGNMENT |
|---|
| 1104 | |
|---|
| 1105 | ARB_ERROR error = GB_pop_transaction(gb_toAlign); |
|---|
| 1106 | if (!error) { |
|---|
| 1107 | if (island_hopper) { |
|---|
| 1108 | error = island_hopper->do_align(); |
|---|
| 1109 | if (!error) { |
|---|
| 1110 | fa_assert(island_hopper->was_aligned()); |
|---|
| 1111 | |
|---|
| 1112 | #ifdef TRACE_ISLANDHOPPER_DATA |
|---|
| 1113 | printf("Island-Hopper returns:\n"); |
|---|
| 1114 | printf("maligned = '%s'\n", lstr(island_hopper->get_result_ref(), island_hopper->get_result_length())); |
|---|
| 1115 | printf("saligned = '%s'\n", lstr(island_hopper->get_result(), island_hopper->get_result_length())); |
|---|
| 1116 | #endif // TRACE_ISLANDHOPPER_DATA |
|---|
| 1117 | |
|---|
| 1118 | SequencePosition masterPos(alignTo->sequence(), 0); |
|---|
| 1119 | SequencePosition slavePos(*toAlignSequence, 0); |
|---|
| 1120 | |
|---|
| 1121 | error = insertClustalValigned(&alignBuffer, masterPos, slavePos, island_hopper->get_result_ref(), island_hopper->get_result(), island_hopper->get_result_length(), &report); |
|---|
| 1122 | } |
|---|
| 1123 | } |
|---|
| 1124 | else { |
|---|
| 1125 | error = alignTo->fast_align(*toAlignSequence, &alignBuffer, max_seq_length, 2, -10, &report); // <- here we align |
|---|
| 1126 | } |
|---|
| 1127 | } |
|---|
| 1128 | |
|---|
| 1129 | if (!error) { |
|---|
| 1130 | alignBuffer.correctUnalignedPositions(); |
|---|
| 1131 | if (alignBuffer.free()) { |
|---|
| 1132 | alignBuffer.set('-', alignQuality(GAP_CHAR, GAP_CHAR), alignBuffer.free()); // rest of alignBuffer is set to '-' |
|---|
| 1133 | } |
|---|
| 1134 | alignBuffer.restoreDots(*toAlignSequence); |
|---|
| 1135 | } |
|---|
| 1136 | |
|---|
| 1137 | #ifdef TRACE_COMPRESSED_ALIGNMENT |
|---|
| 1138 | if (!error) { |
|---|
| 1139 | CompactedSubSequence alignedSlave(alignBuffer.text(), alignBuffer.length(), toAlignSequence->name()); |
|---|
| 1140 | dump_n_compare("reference vs. aligned:", alignTo->sequence(), alignedSlave); |
|---|
| 1141 | } |
|---|
| 1142 | #endif // TRACE_COMPRESSED_ALIGNMENT |
|---|
| 1143 | |
|---|
| 1144 | { |
|---|
| 1145 | GB_ERROR err = GB_push_transaction(gb_toAlign); |
|---|
| 1146 | if (!error) error = err; |
|---|
| 1147 | } |
|---|
| 1148 | |
|---|
| 1149 | if (!error) { |
|---|
| 1150 | if (calcSequenceChecksum(alignBuffer.text(), alignBuffer.length())!=toAlignChksum) { // sequence-chksum changed |
|---|
| 1151 | error = "Internal aligner error (sequence checksum changed) -- aborted"; |
|---|
| 1152 | |
|---|
| 1153 | #ifdef TRACE_COMPRESSED_ALIGNMENT |
|---|
| 1154 | CompactedSubSequence alignedSlave(alignBuffer.text(), alignBuffer.length(), toAlignSequence->name()); |
|---|
| 1155 | dump_n_compare("Old Slave vs. new Slave", *toAlignSequence, alignedSlave); |
|---|
| 1156 | #endif // TRACE_COMPRESSED_ALIGNMENT |
|---|
| 1157 | } |
|---|
| 1158 | else { |
|---|
| 1159 | { |
|---|
| 1160 | GB_topSecurityLevel unsecured(gb_toAlign); |
|---|
| 1161 | GBDATA *gbd = GBT_add_data(gb_toAlign, alignment, "data", GB_STRING); |
|---|
| 1162 | |
|---|
| 1163 | if (!gbd) { |
|---|
| 1164 | error = "Can't find/create sequence data"; |
|---|
| 1165 | } |
|---|
| 1166 | else { |
|---|
| 1167 | if (ali_params.range.is_part()) { // we aligned just a part of the sequence |
|---|
| 1168 | char *buffer = GB_read_string(gbd); // so we read old sequence data |
|---|
| 1169 | long len = GB_read_string_count(gbd); |
|---|
| 1170 | if (!buffer) error = GB_await_error(); |
|---|
| 1171 | else { |
|---|
| 1172 | int lenToCopy = ali_params.range.size(); |
|---|
| 1173 | long wholeChksum = calcSequenceChecksum(buffer, len); |
|---|
| 1174 | |
|---|
| 1175 | memcpy(buffer+ali_params.range.start(), alignBuffer.text()+ali_params.range.start(), lenToCopy); // copy in the aligned part |
|---|
| 1176 | // @@@ genau um 1 position zu niedrig |
|---|
| 1177 | |
|---|
| 1178 | if (calcSequenceChecksum(buffer, len) != wholeChksum) { |
|---|
| 1179 | error = "Internal aligner error (sequence checksum changed) -- aborted"; |
|---|
| 1180 | # ifdef TRACE_COMPRESSED_ALIGNMENT |
|---|
| 1181 | char *buffer_org = GB_read_string(gbd); |
|---|
| 1182 | dump_n_compare("Old seq vs. new seq (slave)", buffer_org, len, buffer, len); |
|---|
| 1183 | free(buffer_org); |
|---|
| 1184 | # endif // TRACE_COMPRESSED_ALIGNMENT |
|---|
| 1185 | } |
|---|
| 1186 | else { |
|---|
| 1187 | GB_write_string(gbd, ""); |
|---|
| 1188 | error = GB_write_string(gbd, buffer); |
|---|
| 1189 | } |
|---|
| 1190 | } |
|---|
| 1191 | |
|---|
| 1192 | free(buffer); |
|---|
| 1193 | } |
|---|
| 1194 | else { |
|---|
| 1195 | alignBuffer.setDotsAtEOSequence(); |
|---|
| 1196 | error = GB_write_string(gbd, alignBuffer.text()); // aligned all -> write all |
|---|
| 1197 | } |
|---|
| 1198 | } |
|---|
| 1199 | } |
|---|
| 1200 | |
|---|
| 1201 | if (!error && ali_params.report != FA_NO_REPORT) { |
|---|
| 1202 | // create temp-entry for slave containing alignment quality: |
|---|
| 1203 | |
|---|
| 1204 | error = writeStringToAlignment(gb_toAlign, alignment, QUALITY_NAME, alignBuffer.quality(), ali_params.report==FA_TEMP_REPORT); |
|---|
| 1205 | if (!error) { |
|---|
| 1206 | // create temp-entry for master containing insert dots: |
|---|
| 1207 | |
|---|
| 1208 | int buflen = max_seq_length*2; |
|---|
| 1209 | char *buffer = ARB_alloc<char>(buflen+1); |
|---|
| 1210 | char *afterLast = buffer; |
|---|
| 1211 | |
|---|
| 1212 | if (!buffer) { |
|---|
| 1213 | error = "out of memory"; |
|---|
| 1214 | } |
|---|
| 1215 | else { |
|---|
| 1216 | memset(buffer, '-', buflen); |
|---|
| 1217 | buffer[buflen] = 0; |
|---|
| 1218 | |
|---|
| 1219 | const FastAlignInsertion *inserts = report.insertion(); |
|---|
| 1220 | while (inserts) { |
|---|
| 1221 | memset(buffer+inserts->offset(), '>', inserts->gaps()); |
|---|
| 1222 | afterLast = buffer+inserts->offset()+inserts->gaps(); |
|---|
| 1223 | inserts = inserts->next(); |
|---|
| 1224 | } |
|---|
| 1225 | |
|---|
| 1226 | *afterLast = 0; |
|---|
| 1227 | if (gb_alignTo) { |
|---|
| 1228 | error = writeStringToAlignment(gb_alignTo, alignment, INSERTS_NAME, buffer, ali_params.report==FA_TEMP_REPORT); |
|---|
| 1229 | } |
|---|
| 1230 | } |
|---|
| 1231 | } |
|---|
| 1232 | } |
|---|
| 1233 | } |
|---|
| 1234 | } |
|---|
| 1235 | return error; |
|---|
| 1236 | } |
|---|
| 1237 | |
|---|
| 1238 | ARB_ERROR FastAligner_delete_temp_entries(GBDATA *gb_species, const char *alignment) { |
|---|
| 1239 | fa_assert(gb_species); |
|---|
| 1240 | fa_assert(alignment); |
|---|
| 1241 | |
|---|
| 1242 | GBDATA *gb_ali = GB_search(gb_species, alignment, GB_FIND); |
|---|
| 1243 | ARB_ERROR error = NULp; |
|---|
| 1244 | |
|---|
| 1245 | if (gb_ali) { |
|---|
| 1246 | GBDATA *gb_name = GB_search(gb_ali, QUALITY_NAME, GB_FIND); |
|---|
| 1247 | if (gb_name) { |
|---|
| 1248 | error = GB_delete(gb_name); |
|---|
| 1249 | #if defined(DEBUG) |
|---|
| 1250 | printf("deleted '%s' of '%s' (gb_name=%p)\n", QUALITY_NAME, GBT_get_name_or_description(gb_species), gb_name); |
|---|
| 1251 | #endif |
|---|
| 1252 | } |
|---|
| 1253 | |
|---|
| 1254 | if (!error) { |
|---|
| 1255 | gb_name = GB_search(gb_ali, INSERTS_NAME, GB_FIND); |
|---|
| 1256 | if (gb_name) { |
|---|
| 1257 | error = GB_delete(gb_name); |
|---|
| 1258 | #if defined(DEBUG) |
|---|
| 1259 | printf("deleted '%s' of '%s' (gb_name=%p)\n", INSERTS_NAME, GBT_get_name_or_description(gb_species), gb_name); |
|---|
| 1260 | #endif |
|---|
| 1261 | } |
|---|
| 1262 | } |
|---|
| 1263 | } |
|---|
| 1264 | |
|---|
| 1265 | return error; |
|---|
| 1266 | } |
|---|
| 1267 | |
|---|
| 1268 | static ARB_ERROR align_error(ARB_ERROR olderr, GBDATA *gb_toAlign, GBDATA *gb_alignTo) { |
|---|
| 1269 | // used by alignTo() and alignToNextRelative() to transform errors coming from subroutines |
|---|
| 1270 | // can be used by higher functions |
|---|
| 1271 | |
|---|
| 1272 | const char *name_toAlign = read_name(gb_toAlign); |
|---|
| 1273 | const char *name_alignTo = read_name(gb_alignTo); |
|---|
| 1274 | |
|---|
| 1275 | fa_assert(olderr); |
|---|
| 1276 | |
|---|
| 1277 | return GBS_global_string("Error while aligning '%s' to '%s':\n%s", |
|---|
| 1278 | name_toAlign, name_alignTo, olderr.deliver()); |
|---|
| 1279 | } |
|---|
| 1280 | |
|---|
| 1281 | static ARB_ERROR alignTo(GBDATA *gb_toAlign, |
|---|
| 1282 | GB_CSTR alignment, |
|---|
| 1283 | const FastSearchSequence *alignTo, |
|---|
| 1284 | GBDATA *gb_alignTo, // may be NULp |
|---|
| 1285 | int max_seq_length, |
|---|
| 1286 | const AlignParams& ali_params) |
|---|
| 1287 | { |
|---|
| 1288 | ARB_ERROR error = NULp; |
|---|
| 1289 | long chksum = -1; |
|---|
| 1290 | |
|---|
| 1291 | CompactedSubSequence *toAlignSequence = readCompactedSequence(gb_toAlign, alignment, &error, NULp, &chksum, ali_params.range); |
|---|
| 1292 | |
|---|
| 1293 | if (island_hopper) { |
|---|
| 1294 | GBDATA *gb_seq = GBT_find_sequence(gb_toAlign, alignment); // get sequence |
|---|
| 1295 | if (gb_seq) { |
|---|
| 1296 | long length = GB_read_string_count(gb_seq); |
|---|
| 1297 | const char *data = GB_read_char_pntr(gb_seq); |
|---|
| 1298 | |
|---|
| 1299 | island_hopper->set_toAlign_sequence(data); |
|---|
| 1300 | island_hopper->set_alignment_length(length); |
|---|
| 1301 | } |
|---|
| 1302 | } |
|---|
| 1303 | |
|---|
| 1304 | |
|---|
| 1305 | |
|---|
| 1306 | if (!error) { |
|---|
| 1307 | error = alignCompactedTo(toAlignSequence, alignTo, max_seq_length, alignment, chksum, gb_toAlign, gb_alignTo, ali_params); |
|---|
| 1308 | if (error) error = align_error(error, gb_toAlign, gb_alignTo); |
|---|
| 1309 | delete toAlignSequence; |
|---|
| 1310 | } |
|---|
| 1311 | |
|---|
| 1312 | return error; |
|---|
| 1313 | } |
|---|
| 1314 | |
|---|
| 1315 | static ARB_ERROR alignToGroupConsensus(GBDATA *gb_toAlign, |
|---|
| 1316 | GB_CSTR alignment, |
|---|
| 1317 | Aligner_get_consensus_func get_consensus, |
|---|
| 1318 | int max_seq_length, |
|---|
| 1319 | const AlignParams& ali_params) |
|---|
| 1320 | { |
|---|
| 1321 | ARB_ERROR error = NULp; |
|---|
| 1322 | char *consensus = get_consensus(read_name(gb_toAlign), ali_params.range); |
|---|
| 1323 | size_t cons_len = strlen(consensus); |
|---|
| 1324 | |
|---|
| 1325 | fa_assert(cons_len); |
|---|
| 1326 | |
|---|
| 1327 | for (size_t i = 0; i<cons_len; ++i) { // translate consensus to be accepted by aligner |
|---|
| 1328 | switch (consensus[i]) { |
|---|
| 1329 | case '=': consensus[i] = '-'; break; |
|---|
| 1330 | default: break; |
|---|
| 1331 | } |
|---|
| 1332 | } |
|---|
| 1333 | |
|---|
| 1334 | CompactedSubSequence compacted(consensus, cons_len, "group consensus", ali_params.range.start()); |
|---|
| 1335 | |
|---|
| 1336 | { |
|---|
| 1337 | FastSearchSequence fast(compacted); |
|---|
| 1338 | error = alignTo(gb_toAlign, alignment, &fast, NULp, max_seq_length, ali_params); |
|---|
| 1339 | } |
|---|
| 1340 | |
|---|
| 1341 | free(consensus); |
|---|
| 1342 | |
|---|
| 1343 | return error; |
|---|
| 1344 | } |
|---|
| 1345 | |
|---|
| 1346 | static void appendNameAndUsedBasePositions(char **toString, GBDATA *gb_species, int usedBasePositions) { |
|---|
| 1347 | // if usedBasePositions == -1 -> unknown; |
|---|
| 1348 | |
|---|
| 1349 | char *currInfo; |
|---|
| 1350 | if (usedBasePositions<0) { |
|---|
| 1351 | currInfo = strdup(GBT_get_name_or_description(gb_species)); |
|---|
| 1352 | } |
|---|
| 1353 | else { |
|---|
| 1354 | fa_assert(usedBasePositions>0); // otherwise it should NOT be announced here! |
|---|
| 1355 | currInfo = GBS_global_string_copy("%s:%i", GBT_get_name_or_description(gb_species), usedBasePositions); |
|---|
| 1356 | } |
|---|
| 1357 | |
|---|
| 1358 | char *newString = NULp; |
|---|
| 1359 | if (*toString) { |
|---|
| 1360 | newString = GBS_global_string_copy("%s, %s", *toString, currInfo); |
|---|
| 1361 | } |
|---|
| 1362 | else { |
|---|
| 1363 | newString = currInfo; |
|---|
| 1364 | currInfo = NULp; // don't free |
|---|
| 1365 | } |
|---|
| 1366 | |
|---|
| 1367 | freeset(*toString, newString); |
|---|
| 1368 | free(currInfo); |
|---|
| 1369 | } |
|---|
| 1370 | |
|---|
| 1371 | inline int min(int i, int j) { return i<j ? i : j; } |
|---|
| 1372 | |
|---|
| 1373 | static ARB_ERROR alignToNextRelative(SearchRelativeParams& relSearch, |
|---|
| 1374 | int max_seq_length, |
|---|
| 1375 | FA_turn turnAllowed, |
|---|
| 1376 | GB_CSTR alignment, |
|---|
| 1377 | GBDATA *gb_toAlign, |
|---|
| 1378 | const AlignParams& ali_params) |
|---|
| 1379 | { |
|---|
| 1380 | CompactedSubSequence *toAlignSequence = NULp; |
|---|
| 1381 | bool use_different_pt_server_alignment = 0 != strcmp(relSearch.pt_server_alignment, alignment); |
|---|
| 1382 | |
|---|
| 1383 | ARB_ERROR error; |
|---|
| 1384 | long chksum; |
|---|
| 1385 | int relativesToTest = relSearch.maxRelatives*2; // get more relatives from pt-server (needed when use_different_pt_server_alignment == true) |
|---|
| 1386 | char **nearestRelative = new char*[relativesToTest+1]; |
|---|
| 1387 | int next_relatives; |
|---|
| 1388 | int i; |
|---|
| 1389 | GBDATA *gb_main = GB_get_root(gb_toAlign); |
|---|
| 1390 | |
|---|
| 1391 | if (use_different_pt_server_alignment) { |
|---|
| 1392 | turnAllowed = FA_TURN_NEVER; // makes no sense if we're using a different alignment for the pt_server |
|---|
| 1393 | } |
|---|
| 1394 | |
|---|
| 1395 | for (next_relatives=0; next_relatives<relativesToTest; next_relatives++) { |
|---|
| 1396 | nearestRelative[next_relatives] = NULp; |
|---|
| 1397 | } |
|---|
| 1398 | next_relatives = 0; |
|---|
| 1399 | |
|---|
| 1400 | bool restart = true; |
|---|
| 1401 | while (restart) { |
|---|
| 1402 | restart = false; |
|---|
| 1403 | |
|---|
| 1404 | char *findRelsBySeq = NULp; |
|---|
| 1405 | if (use_different_pt_server_alignment) { |
|---|
| 1406 | toAlignSequence = readCompactedSequence(gb_toAlign, alignment, &error, NULp, &chksum, ali_params.range); |
|---|
| 1407 | |
|---|
| 1408 | GBDATA *gbd = GBT_find_sequence(gb_toAlign, relSearch.pt_server_alignment); // use a different alignment for next relative search |
|---|
| 1409 | if (!gbd) { |
|---|
| 1410 | error = GBS_global_string("Species '%s' has no data in alignment '%s'", GBT_get_name_or_description(gb_toAlign), relSearch.pt_server_alignment); |
|---|
| 1411 | } |
|---|
| 1412 | else { |
|---|
| 1413 | findRelsBySeq = GB_read_string(gbd); |
|---|
| 1414 | } |
|---|
| 1415 | } |
|---|
| 1416 | else { |
|---|
| 1417 | toAlignSequence = readCompactedSequence(gb_toAlign, alignment, &error, &findRelsBySeq, &chksum, ali_params.range); |
|---|
| 1418 | } |
|---|
| 1419 | |
|---|
| 1420 | if (error) { |
|---|
| 1421 | delete toAlignSequence; |
|---|
| 1422 | return error; // @@@ leaks ? |
|---|
| 1423 | } |
|---|
| 1424 | |
|---|
| 1425 | while (next_relatives) { |
|---|
| 1426 | next_relatives--; |
|---|
| 1427 | freenull(nearestRelative[next_relatives]); |
|---|
| 1428 | } |
|---|
| 1429 | |
|---|
| 1430 | { |
|---|
| 1431 | // find relatives |
|---|
| 1432 | FamilyFinder *familyFinder = relSearch.getFamilyFinder(); |
|---|
| 1433 | const PosRange& range = familyFinder->get_TargetRange(); |
|---|
| 1434 | |
|---|
| 1435 | if (range.is_part()) { |
|---|
| 1436 | range.copy_corresponding_part(findRelsBySeq, findRelsBySeq, strlen(findRelsBySeq)); |
|---|
| 1437 | turnAllowed = FA_TURN_NEVER; // makes no sense if we're using partial relative search |
|---|
| 1438 | } |
|---|
| 1439 | |
|---|
| 1440 | error = familyFinder->searchFamily(findRelsBySeq, FF_FORWARD, relativesToTest+1, 0); // @@@ make min_score configurable |
|---|
| 1441 | |
|---|
| 1442 | // @@@ case where no relative found (due to min score) handle how ? abort ? warn ? |
|---|
| 1443 | |
|---|
| 1444 | double bestScore = 0; |
|---|
| 1445 | if (!error) { |
|---|
| 1446 | #if defined(DEBUG) |
|---|
| 1447 | double lastScore = -1; |
|---|
| 1448 | #if defined(TRACE_RELATIVES) |
|---|
| 1449 | fprintf(stderr, "List of relatives used for '%s':\n", GBT_get_name_or_description(gb_toAlign)); |
|---|
| 1450 | #endif // TRACE_RELATIVES |
|---|
| 1451 | #endif // DEBUG |
|---|
| 1452 | for (const FamilyList *fl = familyFinder->getFamilyList(); fl; fl=fl->next) { |
|---|
| 1453 | if (strcmp(toAlignSequence->name(), fl->name)!=0) { |
|---|
| 1454 | if (GBT_find_species(gb_main, fl->name)) { // @@@ |
|---|
| 1455 | double thisScore = familyFinder->uses_rel_matches() ? fl->rel_matches : fl->matches; |
|---|
| 1456 | #if defined(DEBUG) |
|---|
| 1457 | // check whether family list is sorted correctly |
|---|
| 1458 | fa_assert(lastScore < 0 || lastScore >= thisScore); |
|---|
| 1459 | lastScore = thisScore; |
|---|
| 1460 | #if defined(TRACE_RELATIVES) |
|---|
| 1461 | fprintf(stderr, "- %s (%5.2f)\n", fl->name, thisScore); |
|---|
| 1462 | #endif // TRACE_RELATIVES |
|---|
| 1463 | #endif // DEBUG |
|---|
| 1464 | if (thisScore>=bestScore) bestScore = thisScore; |
|---|
| 1465 | if (next_relatives<(relativesToTest+1)) { |
|---|
| 1466 | nearestRelative[next_relatives] = strdup(fl->name); |
|---|
| 1467 | next_relatives++; |
|---|
| 1468 | } |
|---|
| 1469 | } |
|---|
| 1470 | } |
|---|
| 1471 | } |
|---|
| 1472 | } |
|---|
| 1473 | |
|---|
| 1474 | if (!error && turnAllowed != FA_TURN_NEVER) { // test if mirrored sequence has better relatives |
|---|
| 1475 | char *mirroredSequence = strdup(findRelsBySeq); |
|---|
| 1476 | long length = strlen(mirroredSequence); |
|---|
| 1477 | double bestMirroredScore = 0; |
|---|
| 1478 | |
|---|
| 1479 | char T_or_U; |
|---|
| 1480 | error = GBT_determine_T_or_U(global_alignmentType, &T_or_U, "reverse-complement"); |
|---|
| 1481 | if (!error) { |
|---|
| 1482 | GBT_reverseComplementNucSequence(mirroredSequence, length, T_or_U); |
|---|
| 1483 | error = familyFinder->searchFamily(mirroredSequence, FF_FORWARD, relativesToTest+1, 0); // @@@ make min_score configurable |
|---|
| 1484 | } |
|---|
| 1485 | if (!error) { |
|---|
| 1486 | #if defined(DEBUG) |
|---|
| 1487 | double lastScore = -1; |
|---|
| 1488 | #if defined(TRACE_RELATIVES) |
|---|
| 1489 | fprintf(stderr, "List of relatives used for '%s' (turned seq):\n", GBT_get_name_or_description(gb_toAlign)); |
|---|
| 1490 | #endif // TRACE_RELATIVES |
|---|
| 1491 | #endif // DEBUG |
|---|
| 1492 | for (const FamilyList *fl = familyFinder->getFamilyList(); fl; fl = fl->next) { |
|---|
| 1493 | double thisScore = familyFinder->uses_rel_matches() ? fl->rel_matches : fl->matches; |
|---|
| 1494 | #if defined(DEBUG) |
|---|
| 1495 | // check whether family list is sorted correctly |
|---|
| 1496 | fa_assert(lastScore < 0 || lastScore >= thisScore); |
|---|
| 1497 | lastScore = thisScore; |
|---|
| 1498 | #if defined(TRACE_RELATIVES) |
|---|
| 1499 | fprintf(stderr, "- %s (%5.2f)\n", fl->name, thisScore); |
|---|
| 1500 | #endif // TRACE_RELATIVES |
|---|
| 1501 | #endif // DEBUG |
|---|
| 1502 | if (thisScore >= bestMirroredScore) { |
|---|
| 1503 | if (strcmp(toAlignSequence->name(), fl->name)!=0) { |
|---|
| 1504 | if (GBT_find_species(gb_main, fl->name)) bestMirroredScore = thisScore; // @@@ |
|---|
| 1505 | } |
|---|
| 1506 | } |
|---|
| 1507 | } |
|---|
| 1508 | } |
|---|
| 1509 | |
|---|
| 1510 | int turnIt = 0; |
|---|
| 1511 | if (bestMirroredScore>bestScore) { |
|---|
| 1512 | if (turnAllowed==FA_TURN_INTERACTIVE) { |
|---|
| 1513 | const char *message; |
|---|
| 1514 | if (familyFinder->uses_rel_matches()) { |
|---|
| 1515 | message = GBS_global_string("'%s' seems to be the other way round (score: %.1f%%, score if turned: %.1f%%)", |
|---|
| 1516 | toAlignSequence->name(), bestScore*100, bestMirroredScore*100); |
|---|
| 1517 | } |
|---|
| 1518 | else { |
|---|
| 1519 | message = GBS_global_string("'%s' seems to be the other way round (score: %li, score if turned: %li)", |
|---|
| 1520 | toAlignSequence->name(), long(bestScore+.5), long(bestMirroredScore+.5)); |
|---|
| 1521 | } |
|---|
| 1522 | turnIt = aw_question("fastali_turn_sequence", message, "Turn sequence,Leave sequence alone")==0; |
|---|
| 1523 | } |
|---|
| 1524 | else { |
|---|
| 1525 | fa_assert(turnAllowed == FA_TURN_ALWAYS); |
|---|
| 1526 | turnIt = 1; |
|---|
| 1527 | #if defined(TRACE_RELATIVES) |
|---|
| 1528 | fprintf(stderr, "Using turned sequence!\n"); |
|---|
| 1529 | #endif // TRACE_RELATIVES |
|---|
| 1530 | } |
|---|
| 1531 | } |
|---|
| 1532 | |
|---|
| 1533 | if (turnIt) { // write mirrored sequence |
|---|
| 1534 | GBDATA *gbd = GBT_find_sequence(gb_toAlign, alignment); |
|---|
| 1535 | GB_topSecurityLevel unsecured(gbd); |
|---|
| 1536 | error = GB_write_string(gbd, mirroredSequence); |
|---|
| 1537 | if (!error) { |
|---|
| 1538 | delete toAlignSequence; |
|---|
| 1539 | restart = true; // continue while loop |
|---|
| 1540 | } |
|---|
| 1541 | } |
|---|
| 1542 | |
|---|
| 1543 | free(mirroredSequence); |
|---|
| 1544 | } |
|---|
| 1545 | } |
|---|
| 1546 | free(findRelsBySeq); |
|---|
| 1547 | } |
|---|
| 1548 | |
|---|
| 1549 | if (!error) { |
|---|
| 1550 | if (!next_relatives) { |
|---|
| 1551 | char warning[200]; |
|---|
| 1552 | sprintf(warning, "No relative found for '%s'", toAlignSequence->name()); |
|---|
| 1553 | aw_message(warning); |
|---|
| 1554 | } |
|---|
| 1555 | else { |
|---|
| 1556 | // assuming relatives are sorted! (nearest to farthest) |
|---|
| 1557 | |
|---|
| 1558 | // get data pointers |
|---|
| 1559 | typedef GBDATA *GBDATAP; |
|---|
| 1560 | GBDATAP *gb_reference = new GBDATAP[relSearch.maxRelatives]; |
|---|
| 1561 | |
|---|
| 1562 | { |
|---|
| 1563 | for (i=0; i<relSearch.maxRelatives && i<next_relatives; i++) { |
|---|
| 1564 | GBDATA *gb_species = GBT_find_species(gb_main, nearestRelative[i]); |
|---|
| 1565 | if (!gb_species) { // pt-server seems not up to date! |
|---|
| 1566 | error = species_not_found(nearestRelative[i]); |
|---|
| 1567 | break; |
|---|
| 1568 | } |
|---|
| 1569 | |
|---|
| 1570 | GBDATA *gb_sequence = GBT_find_sequence(gb_species, alignment); |
|---|
| 1571 | if (gb_sequence) { // if relative has sequence data in the current alignment .. |
|---|
| 1572 | gb_reference[i] = gb_species; |
|---|
| 1573 | } |
|---|
| 1574 | else { // remove this relative |
|---|
| 1575 | free(nearestRelative[i]); |
|---|
| 1576 | for (int j = i+1; j<next_relatives; ++j) { |
|---|
| 1577 | nearestRelative[j-1] = nearestRelative[j]; |
|---|
| 1578 | } |
|---|
| 1579 | next_relatives--; |
|---|
| 1580 | nearestRelative[next_relatives] = NULp; |
|---|
| 1581 | i--; // redo same index |
|---|
| 1582 | } |
|---|
| 1583 | } |
|---|
| 1584 | |
|---|
| 1585 | // delete superfluous relatives |
|---|
| 1586 | for (; i<next_relatives; ++i) freenull(nearestRelative[i]); |
|---|
| 1587 | |
|---|
| 1588 | if (next_relatives>relSearch.maxRelatives) { |
|---|
| 1589 | next_relatives = relSearch.maxRelatives; |
|---|
| 1590 | } |
|---|
| 1591 | } |
|---|
| 1592 | |
|---|
| 1593 | // align |
|---|
| 1594 | |
|---|
| 1595 | if (!error) { |
|---|
| 1596 | CompactedSubSequence *alignToSequence = readCompactedSequence(gb_reference[0], alignment, &error, NULp, NULp, ali_params.range); |
|---|
| 1597 | fa_assert(alignToSequence); |
|---|
| 1598 | |
|---|
| 1599 | if (island_hopper) { |
|---|
| 1600 | GBDATA *gb_ref = GBT_find_sequence(gb_reference[0], alignment); // get reference sequence |
|---|
| 1601 | GBDATA *gb_align = GBT_find_sequence(gb_toAlign, alignment); // get sequence to align |
|---|
| 1602 | |
|---|
| 1603 | if (gb_ref && gb_align) { |
|---|
| 1604 | long length_ref = GB_read_string_count(gb_ref); |
|---|
| 1605 | const char *data; |
|---|
| 1606 | |
|---|
| 1607 | data = GB_read_char_pntr(gb_ref); |
|---|
| 1608 | island_hopper->set_ref_sequence(data); |
|---|
| 1609 | |
|---|
| 1610 | data = GB_read_char_pntr(gb_align); |
|---|
| 1611 | island_hopper->set_toAlign_sequence(data); |
|---|
| 1612 | |
|---|
| 1613 | island_hopper->set_alignment_length(length_ref); |
|---|
| 1614 | } |
|---|
| 1615 | } |
|---|
| 1616 | |
|---|
| 1617 | { |
|---|
| 1618 | FastSearchSequence referenceFastSeq(*alignToSequence); |
|---|
| 1619 | |
|---|
| 1620 | error = alignCompactedTo(toAlignSequence, &referenceFastSeq, |
|---|
| 1621 | max_seq_length, alignment, chksum, |
|---|
| 1622 | gb_toAlign, gb_reference[0], ali_params); |
|---|
| 1623 | } |
|---|
| 1624 | |
|---|
| 1625 | if (error) { |
|---|
| 1626 | error = align_error(error, gb_toAlign, gb_reference[0]); |
|---|
| 1627 | } |
|---|
| 1628 | else { |
|---|
| 1629 | char *used_relatives = NULp; |
|---|
| 1630 | |
|---|
| 1631 | if (unaligned_bases.is_empty()) { |
|---|
| 1632 | appendNameAndUsedBasePositions(&used_relatives, gb_reference[0], -1); |
|---|
| 1633 | } |
|---|
| 1634 | else { |
|---|
| 1635 | if (island_hopper) { |
|---|
| 1636 | appendNameAndUsedBasePositions(&used_relatives, gb_reference[0], -1); |
|---|
| 1637 | if (next_relatives>1) error = "Island hopping uses only one relative"; |
|---|
| 1638 | } |
|---|
| 1639 | else { |
|---|
| 1640 | LooseBases loose; |
|---|
| 1641 | LooseBases loose_for_next_relative; |
|---|
| 1642 | |
|---|
| 1643 | int unaligned_positions; |
|---|
| 1644 | { |
|---|
| 1645 | CompactedSubSequence *alignedSequence = readCompactedSequence(gb_toAlign, alignment, &error, NULp, NULp, ali_params.range); |
|---|
| 1646 | |
|---|
| 1647 | unaligned_positions = loose.follow_ali_change_and_append(unaligned_bases, AliChange(*toAlignSequence, *alignedSequence)); |
|---|
| 1648 | // now loose holds the unaligned (and recalculated) parts from last relative |
|---|
| 1649 | delete alignedSequence; |
|---|
| 1650 | } |
|---|
| 1651 | |
|---|
| 1652 | if (!error) { |
|---|
| 1653 | int toalign_positions = toAlignSequence->length(); |
|---|
| 1654 | if (unaligned_positions<toalign_positions) { |
|---|
| 1655 | appendNameAndUsedBasePositions(&used_relatives, gb_reference[0], toalign_positions-unaligned_positions); |
|---|
| 1656 | } |
|---|
| 1657 | } |
|---|
| 1658 | |
|---|
| 1659 | for (i=1; i<next_relatives && !error; i++) { |
|---|
| 1660 | loose.append(loose_for_next_relative); |
|---|
| 1661 | int unaligned_positions_for_next = 0; |
|---|
| 1662 | |
|---|
| 1663 | while (!loose.is_empty() && !error) { |
|---|
| 1664 | ExplicitRange partRange(intersection(loose.recall(), ali_params.range)); |
|---|
| 1665 | CompactedSubSequence *alignToPart = readCompactedSequence(gb_reference[i], alignment, &error, NULp, NULp, partRange); |
|---|
| 1666 | |
|---|
| 1667 | if (!error) { |
|---|
| 1668 | long part_chksum; |
|---|
| 1669 | CompactedSubSequence *toAlignPart = readCompactedSequence(gb_toAlign, alignment, &error, NULp, &part_chksum, partRange); |
|---|
| 1670 | |
|---|
| 1671 | fa_assert(contradicted(error, toAlignPart)); |
|---|
| 1672 | |
|---|
| 1673 | if (!error) { |
|---|
| 1674 | AlignParams loose_ali_params = { ali_params.report, ali_params.showGapsMessages, partRange }; |
|---|
| 1675 | |
|---|
| 1676 | FastSearchSequence referenceFastSeq(*alignToPart); |
|---|
| 1677 | error = alignCompactedTo(toAlignPart, &referenceFastSeq, |
|---|
| 1678 | max_seq_length, alignment, part_chksum, |
|---|
| 1679 | gb_toAlign, gb_reference[i], loose_ali_params); |
|---|
| 1680 | if (!error) { |
|---|
| 1681 | CompactedSubSequence *alignedPart = readCompactedSequence(gb_toAlign, alignment, &error, NULp, NULp, partRange); |
|---|
| 1682 | if (!error) { |
|---|
| 1683 | unaligned_positions_for_next += loose_for_next_relative.follow_ali_change_and_append(unaligned_bases, AliChange(*toAlignPart, *alignedPart)); |
|---|
| 1684 | } |
|---|
| 1685 | delete alignedPart; |
|---|
| 1686 | } |
|---|
| 1687 | } |
|---|
| 1688 | delete toAlignPart; |
|---|
| 1689 | } |
|---|
| 1690 | delete alignToPart; |
|---|
| 1691 | } |
|---|
| 1692 | |
|---|
| 1693 | if (!error) { |
|---|
| 1694 | fa_assert(unaligned_positions_for_next <= unaligned_positions); // means: number of unaligned positions has increased by use of relative |
|---|
| 1695 | if (unaligned_positions_for_next<unaligned_positions) { |
|---|
| 1696 | appendNameAndUsedBasePositions(&used_relatives, gb_reference[i], unaligned_positions-unaligned_positions_for_next); |
|---|
| 1697 | unaligned_positions = unaligned_positions_for_next; |
|---|
| 1698 | } |
|---|
| 1699 | } |
|---|
| 1700 | } |
|---|
| 1701 | } |
|---|
| 1702 | } |
|---|
| 1703 | |
|---|
| 1704 | if (!error) { // write used relatives to db-field |
|---|
| 1705 | error = GBT_write_string(gb_toAlign, "used_rels", used_relatives); |
|---|
| 1706 | } |
|---|
| 1707 | free(used_relatives); |
|---|
| 1708 | } |
|---|
| 1709 | |
|---|
| 1710 | delete alignToSequence; |
|---|
| 1711 | } |
|---|
| 1712 | |
|---|
| 1713 | delete [] gb_reference; |
|---|
| 1714 | } |
|---|
| 1715 | } |
|---|
| 1716 | |
|---|
| 1717 | delete toAlignSequence; |
|---|
| 1718 | |
|---|
| 1719 | for (i=0; i<next_relatives; i++) freenull(nearestRelative[i]); |
|---|
| 1720 | delete [] nearestRelative; |
|---|
| 1721 | |
|---|
| 1722 | return error; |
|---|
| 1723 | } |
|---|
| 1724 | |
|---|
| 1725 | // ------------------------ |
|---|
| 1726 | // AlignmentReference |
|---|
| 1727 | |
|---|
| 1728 | class AlignmentReference : virtual Noncopyable { |
|---|
| 1729 | GB_CSTR alignment; |
|---|
| 1730 | int max_seq_length; |
|---|
| 1731 | const AlignParams& ali_params; |
|---|
| 1732 | |
|---|
| 1733 | public: |
|---|
| 1734 | AlignmentReference(GB_CSTR alignment_, |
|---|
| 1735 | int max_seq_length_, |
|---|
| 1736 | const AlignParams& ali_params_) |
|---|
| 1737 | : alignment(alignment_), |
|---|
| 1738 | max_seq_length(max_seq_length_), |
|---|
| 1739 | ali_params(ali_params_) |
|---|
| 1740 | {} |
|---|
| 1741 | virtual ~AlignmentReference() {} |
|---|
| 1742 | |
|---|
| 1743 | virtual ARB_ERROR align_to(GBDATA *gb_toalign) const = 0; |
|---|
| 1744 | |
|---|
| 1745 | GB_CSTR get_alignment() const { return alignment; } |
|---|
| 1746 | int get_max_seq_length() const { return max_seq_length; } |
|---|
| 1747 | const AlignParams& get_ali_params() const { return ali_params; } |
|---|
| 1748 | }; |
|---|
| 1749 | |
|---|
| 1750 | |
|---|
| 1751 | // @@@ make alignTo a member of ExplicitReference (or of AlignmentReference) |
|---|
| 1752 | // @@@ let alignToGroupConsensus and alignToNextRelative use ExplicitReference |
|---|
| 1753 | |
|---|
| 1754 | |
|---|
| 1755 | class ExplicitReference: public AlignmentReference { // derived from a Noncopyable |
|---|
| 1756 | const FastSearchSequence *targetSequence; |
|---|
| 1757 | GBDATA *gb_alignTo; |
|---|
| 1758 | |
|---|
| 1759 | public: |
|---|
| 1760 | ExplicitReference(GB_CSTR alignment_, |
|---|
| 1761 | const FastSearchSequence *targetSequence_, |
|---|
| 1762 | GBDATA *gb_alignTo_, |
|---|
| 1763 | int max_seq_length_, |
|---|
| 1764 | const AlignParams& ali_params_) |
|---|
| 1765 | : AlignmentReference(alignment_, max_seq_length_, ali_params_), |
|---|
| 1766 | targetSequence(targetSequence_), |
|---|
| 1767 | gb_alignTo(gb_alignTo_) |
|---|
| 1768 | {} |
|---|
| 1769 | |
|---|
| 1770 | ARB_ERROR align_to(GBDATA *gb_toalign) const OVERRIDE { |
|---|
| 1771 | return alignTo(gb_toalign, get_alignment(), targetSequence, gb_alignTo, get_max_seq_length(), get_ali_params()); |
|---|
| 1772 | } |
|---|
| 1773 | }; |
|---|
| 1774 | |
|---|
| 1775 | // @@@ make alignToGroupConsensus a member of ConsensusReference |
|---|
| 1776 | |
|---|
| 1777 | class ConsensusReference: public AlignmentReference { |
|---|
| 1778 | Aligner_get_consensus_func get_consensus; |
|---|
| 1779 | |
|---|
| 1780 | public: |
|---|
| 1781 | ConsensusReference(GB_CSTR alignment_, |
|---|
| 1782 | Aligner_get_consensus_func get_consensus_, |
|---|
| 1783 | int max_seq_length_, |
|---|
| 1784 | const AlignParams& ali_params_) |
|---|
| 1785 | : AlignmentReference(alignment_, max_seq_length_, ali_params_), |
|---|
| 1786 | get_consensus(get_consensus_) |
|---|
| 1787 | {} |
|---|
| 1788 | |
|---|
| 1789 | ARB_ERROR align_to(GBDATA *gb_toalign) const OVERRIDE { |
|---|
| 1790 | return alignToGroupConsensus(gb_toalign, get_alignment(), get_consensus, get_max_seq_length(), get_ali_params()); |
|---|
| 1791 | } |
|---|
| 1792 | }; |
|---|
| 1793 | |
|---|
| 1794 | // @@@ make alignToNextRelative a member of SearchRelativesReference |
|---|
| 1795 | |
|---|
| 1796 | class SearchRelativesReference: public AlignmentReference { |
|---|
| 1797 | SearchRelativeParams& relSearch; |
|---|
| 1798 | FA_turn turnAllowed; |
|---|
| 1799 | |
|---|
| 1800 | public: |
|---|
| 1801 | SearchRelativesReference(SearchRelativeParams& relSearch_, |
|---|
| 1802 | int max_seq_length_, |
|---|
| 1803 | FA_turn turnAllowed_, |
|---|
| 1804 | GB_CSTR alignment_, |
|---|
| 1805 | const AlignParams& ali_params_) |
|---|
| 1806 | : AlignmentReference(alignment_, max_seq_length_, ali_params_), |
|---|
| 1807 | relSearch(relSearch_), |
|---|
| 1808 | turnAllowed(turnAllowed_) |
|---|
| 1809 | {} |
|---|
| 1810 | |
|---|
| 1811 | ARB_ERROR align_to(GBDATA *gb_toalign) const OVERRIDE { |
|---|
| 1812 | return alignToNextRelative(relSearch, get_max_seq_length(), turnAllowed, get_alignment(), gb_toalign, get_ali_params()); |
|---|
| 1813 | } |
|---|
| 1814 | }; |
|---|
| 1815 | |
|---|
| 1816 | |
|---|
| 1817 | // ---------------- |
|---|
| 1818 | // Aligner |
|---|
| 1819 | |
|---|
| 1820 | class Aligner : virtual Noncopyable { |
|---|
| 1821 | GBDATA *gb_main; |
|---|
| 1822 | |
|---|
| 1823 | // define alignment target(s): |
|---|
| 1824 | FA_alignTarget alignWhat; |
|---|
| 1825 | GB_CSTR alignment; // name of alignment to use |
|---|
| 1826 | GB_CSTR toalign; // name of species to align (used if alignWhat == FA_CURRENT) |
|---|
| 1827 | Aligner_get_first_selected_species get_first_selected_species; // used if alignWhat == FA_SELECTED |
|---|
| 1828 | Aligner_get_next_selected_species get_next_selected_species; // --- "" --- |
|---|
| 1829 | |
|---|
| 1830 | // define reference sequence(s): |
|---|
| 1831 | GB_CSTR reference; // name of reference species |
|---|
| 1832 | Aligner_get_consensus_func get_consensus; // function to get consensus seq |
|---|
| 1833 | SearchRelativeParams& relSearch; // params to search for relatives |
|---|
| 1834 | |
|---|
| 1835 | // general params: |
|---|
| 1836 | FA_turn turnAllowed; |
|---|
| 1837 | const AlignParams& ali_params; |
|---|
| 1838 | int maxProtection; // protection level |
|---|
| 1839 | |
|---|
| 1840 | // -------------------- new members |
|---|
| 1841 | int wasNotAllowedToAlign; // number of failures caused by wrong protection |
|---|
| 1842 | int err_count; // count errors |
|---|
| 1843 | bool continue_on_error; /* true -> run single alignments in separate transactions. |
|---|
| 1844 | * If one target fails, continue with rest. |
|---|
| 1845 | * false -> run all in one transaction |
|---|
| 1846 | * One fails -> all fail! |
|---|
| 1847 | */ |
|---|
| 1848 | FA_errorAction error_action; |
|---|
| 1849 | |
|---|
| 1850 | typedef std::list<GBDATA*> GBDATAlist; |
|---|
| 1851 | GBDATAlist species_to_mark; // species that will be marked after aligning |
|---|
| 1852 | |
|---|
| 1853 | ARB_ERROR alignToReference(GBDATA *gb_toalign, const AlignmentReference& ref); |
|---|
| 1854 | ARB_ERROR alignTargetsToReference(const AlignmentReference& ref, GBDATA *gb_species_data); |
|---|
| 1855 | |
|---|
| 1856 | ARB_ERROR alignToExplicitReference(GBDATA *gb_species_data, int max_seq_length); |
|---|
| 1857 | ARB_ERROR alignToConsensus(GBDATA *gb_species_data, int max_seq_length); |
|---|
| 1858 | ARB_ERROR alignToRelatives(GBDATA *gb_species_data, int max_seq_length); |
|---|
| 1859 | |
|---|
| 1860 | void triggerAction(GBDATA *gb_species, bool has_been_aligned) { |
|---|
| 1861 | bool mark = false; |
|---|
| 1862 | switch (error_action) { |
|---|
| 1863 | case FA_MARK_FAILED: mark = !has_been_aligned; break; |
|---|
| 1864 | case FA_MARK_ALIGNED: mark = has_been_aligned; break; |
|---|
| 1865 | case FA_NO_ACTION: mark = false; break; |
|---|
| 1866 | } |
|---|
| 1867 | if (mark) species_to_mark.push_back(gb_species); |
|---|
| 1868 | } |
|---|
| 1869 | |
|---|
| 1870 | public: |
|---|
| 1871 | |
|---|
| 1872 | // @@@ pass AlignmentReference from caller (replacing reference parameters) |
|---|
| 1873 | |
|---|
| 1874 | Aligner(GBDATA *gb_main_, |
|---|
| 1875 | |
|---|
| 1876 | // define alignment target(s): |
|---|
| 1877 | FA_alignTarget alignWhat_, |
|---|
| 1878 | GB_CSTR alignment_, // name of alignment to use |
|---|
| 1879 | GB_CSTR toalign_, // name of species to align (used if alignWhat == FA_CURRENT) |
|---|
| 1880 | Aligner_get_first_selected_species get_first_selected_species_, // used if alignWhat == FA_SELECTED |
|---|
| 1881 | Aligner_get_next_selected_species get_next_selected_species_, // --- "" --- |
|---|
| 1882 | |
|---|
| 1883 | // define reference sequence(s): |
|---|
| 1884 | GB_CSTR reference_, // name of reference species |
|---|
| 1885 | Aligner_get_consensus_func get_consensus_, // function to get consensus seq |
|---|
| 1886 | SearchRelativeParams& relSearch_, // params to search for relatives |
|---|
| 1887 | |
|---|
| 1888 | // general params: |
|---|
| 1889 | FA_turn turnAllowed_, |
|---|
| 1890 | const AlignParams& ali_params_, |
|---|
| 1891 | int maxProtection_, // protection level |
|---|
| 1892 | bool continue_on_error_, |
|---|
| 1893 | FA_errorAction error_action_) |
|---|
| 1894 | : gb_main(gb_main_), |
|---|
| 1895 | alignWhat(alignWhat_), |
|---|
| 1896 | alignment(alignment_), |
|---|
| 1897 | toalign(toalign_), |
|---|
| 1898 | get_first_selected_species(get_first_selected_species_), |
|---|
| 1899 | get_next_selected_species(get_next_selected_species_), |
|---|
| 1900 | reference(reference_), |
|---|
| 1901 | get_consensus(get_consensus_), |
|---|
| 1902 | relSearch(relSearch_), |
|---|
| 1903 | turnAllowed(turnAllowed_), |
|---|
| 1904 | ali_params(ali_params_), |
|---|
| 1905 | maxProtection(maxProtection_), |
|---|
| 1906 | wasNotAllowedToAlign(0), |
|---|
| 1907 | err_count(0), |
|---|
| 1908 | continue_on_error(continue_on_error_), |
|---|
| 1909 | error_action(continue_on_error ? error_action_ : FA_NO_ACTION) |
|---|
| 1910 | { |
|---|
| 1911 | gb_assert(alignment); |
|---|
| 1912 | } |
|---|
| 1913 | |
|---|
| 1914 | ARB_ERROR run(); |
|---|
| 1915 | }; |
|---|
| 1916 | |
|---|
| 1917 | ARB_ERROR Aligner::alignToReference(GBDATA *gb_toalign, const AlignmentReference& ref) { |
|---|
| 1918 | int myProtection; |
|---|
| 1919 | ARB_ERROR error; |
|---|
| 1920 | { |
|---|
| 1921 | GBDATA *gb_seq = GBT_find_sequence(gb_toalign, alignment); |
|---|
| 1922 | // if sequence not found (e.g. wrong alignment) => assume protection is low + let align_to() fail below |
|---|
| 1923 | myProtection = gb_seq ? GB_read_security_write(gb_seq) : 0; |
|---|
| 1924 | } |
|---|
| 1925 | |
|---|
| 1926 | if (myProtection<=maxProtection) { |
|---|
| 1927 | // Depending on 'continue_on_error' we either |
|---|
| 1928 | // * stop aligning if an error occurs or |
|---|
| 1929 | // * run the alignment of each species in its own transaction, ignore but report errors and |
|---|
| 1930 | // optionally mark aligned or failed species. |
|---|
| 1931 | |
|---|
| 1932 | if (continue_on_error) { |
|---|
| 1933 | fa_assert(GB_get_transaction_level(gb_main) == 1); |
|---|
| 1934 | error = GB_end_transaction(gb_main, error); // end global transaction |
|---|
| 1935 | } |
|---|
| 1936 | |
|---|
| 1937 | if (!error) { |
|---|
| 1938 | error = GB_push_transaction(gb_main); |
|---|
| 1939 | if (!error) error = ref.align_to(gb_toalign); |
|---|
| 1940 | error = GB_end_transaction(gb_main, error); |
|---|
| 1941 | |
|---|
| 1942 | if (error) err_count++; |
|---|
| 1943 | triggerAction(gb_toalign, !error); |
|---|
| 1944 | } |
|---|
| 1945 | |
|---|
| 1946 | if (continue_on_error) { |
|---|
| 1947 | if (error) { |
|---|
| 1948 | GB_warning(error.deliver()); |
|---|
| 1949 | error = NULp; |
|---|
| 1950 | } |
|---|
| 1951 | error = GB_begin_transaction(gb_main); // re-open global transaction |
|---|
| 1952 | } |
|---|
| 1953 | } |
|---|
| 1954 | else { |
|---|
| 1955 | wasNotAllowedToAlign++; |
|---|
| 1956 | triggerAction(gb_toalign, false); |
|---|
| 1957 | } |
|---|
| 1958 | |
|---|
| 1959 | return error; |
|---|
| 1960 | } |
|---|
| 1961 | |
|---|
| 1962 | ARB_ERROR Aligner::alignTargetsToReference(const AlignmentReference& ref, GBDATA *gb_species_data) { |
|---|
| 1963 | ARB_ERROR error; |
|---|
| 1964 | |
|---|
| 1965 | fa_assert(GB_get_transaction_level(gb_main) == 1); |
|---|
| 1966 | |
|---|
| 1967 | switch (alignWhat) { |
|---|
| 1968 | case FA_CURRENT: { // align one sequence |
|---|
| 1969 | fa_assert(toalign); |
|---|
| 1970 | |
|---|
| 1971 | GBDATA *gb_toalign = GBT_find_species_rel_species_data(gb_species_data, toalign); |
|---|
| 1972 | if (!gb_toalign) { |
|---|
| 1973 | error = species_not_found(toalign); |
|---|
| 1974 | } |
|---|
| 1975 | else { |
|---|
| 1976 | currentSequenceNumber = overallSequenceNumber = 1; |
|---|
| 1977 | error = alignToReference(gb_toalign, ref); |
|---|
| 1978 | } |
|---|
| 1979 | break; |
|---|
| 1980 | } |
|---|
| 1981 | case FA_MARKED: { // align all marked sequences |
|---|
| 1982 | int count = GBT_count_marked_species(gb_main); |
|---|
| 1983 | GBDATA *gb_species = GBT_first_marked_species_rel_species_data(gb_species_data); |
|---|
| 1984 | |
|---|
| 1985 | arb_progress progress("Aligning marked species", long(count)); |
|---|
| 1986 | progress.auto_subtitles("Species"); |
|---|
| 1987 | |
|---|
| 1988 | currentSequenceNumber = 1; |
|---|
| 1989 | overallSequenceNumber = count; |
|---|
| 1990 | |
|---|
| 1991 | while (gb_species && !error) { |
|---|
| 1992 | error = alignToReference(gb_species, ref); |
|---|
| 1993 | progress.inc_and_check_user_abort(error); |
|---|
| 1994 | gb_species = GBT_next_marked_species(gb_species); |
|---|
| 1995 | } |
|---|
| 1996 | break; |
|---|
| 1997 | } |
|---|
| 1998 | case FA_SELECTED: { // align all selected species |
|---|
| 1999 | int count; |
|---|
| 2000 | GBDATA *gb_species = get_first_selected_species(&count); |
|---|
| 2001 | |
|---|
| 2002 | |
|---|
| 2003 | currentSequenceNumber = 1; |
|---|
| 2004 | overallSequenceNumber = count; |
|---|
| 2005 | |
|---|
| 2006 | if (!gb_species) { |
|---|
| 2007 | aw_message("There is no selected species!"); |
|---|
| 2008 | } |
|---|
| 2009 | else { |
|---|
| 2010 | arb_progress progress("Aligning selected species", long(count)); |
|---|
| 2011 | progress.auto_subtitles("Species"); |
|---|
| 2012 | |
|---|
| 2013 | while (gb_species && !error) { |
|---|
| 2014 | error = alignToReference(gb_species, ref); |
|---|
| 2015 | progress.inc_and_check_user_abort(error); |
|---|
| 2016 | gb_species = get_next_selected_species(); |
|---|
| 2017 | } |
|---|
| 2018 | } |
|---|
| 2019 | break; |
|---|
| 2020 | } |
|---|
| 2021 | } |
|---|
| 2022 | |
|---|
| 2023 | fa_assert(GB_get_transaction_level(gb_main) == 1); |
|---|
| 2024 | return error; |
|---|
| 2025 | } |
|---|
| 2026 | |
|---|
| 2027 | ARB_ERROR Aligner::alignToExplicitReference(GBDATA *gb_species_data, int max_seq_length) { |
|---|
| 2028 | ARB_ERROR error; |
|---|
| 2029 | GBDATA *gb_reference = GBT_find_species_rel_species_data(gb_species_data, reference); |
|---|
| 2030 | |
|---|
| 2031 | if (!gb_reference) { |
|---|
| 2032 | error = species_not_found(reference); |
|---|
| 2033 | } |
|---|
| 2034 | else { |
|---|
| 2035 | long referenceChksum; |
|---|
| 2036 | CompactedSubSequence *referenceSeq = readCompactedSequence(gb_reference, alignment, &error, NULp, &referenceChksum, ali_params.range); |
|---|
| 2037 | |
|---|
| 2038 | if (island_hopper) { |
|---|
| 2039 | // @@@ setting island_hopper reference has to be done in called function (seems that it is NOT done for alignToConsensus and alignToRelatives). First get tests in place! |
|---|
| 2040 | |
|---|
| 2041 | GBDATA *gb_seq = GBT_find_sequence(gb_reference, alignment); // get sequence |
|---|
| 2042 | if (gb_seq) { |
|---|
| 2043 | long length = GB_read_string_count(gb_seq); |
|---|
| 2044 | const char *data = GB_read_char_pntr(gb_seq); |
|---|
| 2045 | |
|---|
| 2046 | island_hopper->set_ref_sequence(data); |
|---|
| 2047 | island_hopper->set_alignment_length(length); |
|---|
| 2048 | } |
|---|
| 2049 | } |
|---|
| 2050 | |
|---|
| 2051 | |
|---|
| 2052 | if (!error) { |
|---|
| 2053 | // @@@ do not pass FastSearchSequence to ExplicitReference, instead pass sequence and length (ExplicitReference shall create it itself) |
|---|
| 2054 | |
|---|
| 2055 | FastSearchSequence referenceFastSeq(*referenceSeq); |
|---|
| 2056 | ExplicitReference target(alignment, &referenceFastSeq, gb_reference, max_seq_length, ali_params); |
|---|
| 2057 | |
|---|
| 2058 | error = alignTargetsToReference(target, gb_species_data); |
|---|
| 2059 | } |
|---|
| 2060 | delete referenceSeq; |
|---|
| 2061 | } |
|---|
| 2062 | return error; |
|---|
| 2063 | } |
|---|
| 2064 | |
|---|
| 2065 | ARB_ERROR Aligner::alignToConsensus(GBDATA *gb_species_data, int max_seq_length) { |
|---|
| 2066 | return alignTargetsToReference(ConsensusReference(alignment, get_consensus, max_seq_length, ali_params), |
|---|
| 2067 | gb_species_data); |
|---|
| 2068 | } |
|---|
| 2069 | |
|---|
| 2070 | ARB_ERROR Aligner::alignToRelatives(GBDATA *gb_species_data, int max_seq_length) { |
|---|
| 2071 | return alignTargetsToReference(SearchRelativesReference(relSearch, max_seq_length, turnAllowed, alignment, ali_params), |
|---|
| 2072 | gb_species_data); |
|---|
| 2073 | } |
|---|
| 2074 | |
|---|
| 2075 | ARB_ERROR Aligner::run() { |
|---|
| 2076 | fa_assert(GB_get_transaction_level(gb_main) == 0); // no open transaction allowed here! |
|---|
| 2077 | ARB_ERROR error = GB_push_transaction(gb_main); |
|---|
| 2078 | |
|---|
| 2079 | // if neither 'reference' nor 'get_consensus' are specified -> use next relative for (each) sequence: |
|---|
| 2080 | bool search_by_pt_server = !reference && !get_consensus; |
|---|
| 2081 | |
|---|
| 2082 | err_count = 0; |
|---|
| 2083 | wasNotAllowedToAlign = 0; // incremented for every sequence which has higher protection level (and was not aligned) |
|---|
| 2084 | species_to_mark.clear(); |
|---|
| 2085 | |
|---|
| 2086 | fa_assert(!reference || !get_consensus); // can't do both modes |
|---|
| 2087 | |
|---|
| 2088 | if (turnAllowed != FA_TURN_NEVER) { |
|---|
| 2089 | if ((ali_params.range.is_part()) || !search_by_pt_server) { |
|---|
| 2090 | // if not selected 'Range/Whole sequence' or not selected 'Reference/Auto search..' |
|---|
| 2091 | turnAllowed = FA_TURN_NEVER; // then disable mirroring for the current call |
|---|
| 2092 | } |
|---|
| 2093 | } |
|---|
| 2094 | |
|---|
| 2095 | if (!error) { |
|---|
| 2096 | global_alignmentType = GBT_get_alignment_type(gb_main, alignment); |
|---|
| 2097 | fa_assert(global_alignmentType != GB_AT_UNKNOWN); |
|---|
| 2098 | |
|---|
| 2099 | if (search_by_pt_server) { |
|---|
| 2100 | GB_alignment_type pt_server_alignmentType = GBT_get_alignment_type(gb_main, relSearch.pt_server_alignment); |
|---|
| 2101 | if (pt_server_alignmentType == GB_AT_UNKNOWN) { |
|---|
| 2102 | error = GB_append_exportedError("failed to detect type of alignment to use for ptserver-search"); |
|---|
| 2103 | } |
|---|
| 2104 | else if (pt_server_alignmentType != GB_AT_RNA && pt_server_alignmentType != GB_AT_DNA) { |
|---|
| 2105 | error = "pt_servers only support RNA/DNA sequences.\n" |
|---|
| 2106 | "In the aligner window you may specify a RNA/DNA alignment \n" |
|---|
| 2107 | "and use a pt_server build on that alignment."; |
|---|
| 2108 | } |
|---|
| 2109 | } |
|---|
| 2110 | } |
|---|
| 2111 | |
|---|
| 2112 | if (!error) { |
|---|
| 2113 | GBDATA *gb_species_data = GBT_get_species_data(gb_main); |
|---|
| 2114 | int max_seq_length = GBT_get_alignment_len(gb_main, alignment); |
|---|
| 2115 | fa_assert(max_seq_length>0); |
|---|
| 2116 | |
|---|
| 2117 | if (reference) error = alignToExplicitReference(gb_species_data, max_seq_length); |
|---|
| 2118 | else if (get_consensus) error = alignToConsensus(gb_species_data, max_seq_length); |
|---|
| 2119 | else error = alignToRelatives(gb_species_data, max_seq_length); |
|---|
| 2120 | } |
|---|
| 2121 | |
|---|
| 2122 | ClustalV_exit(); |
|---|
| 2123 | unaligned_bases.clear(); |
|---|
| 2124 | |
|---|
| 2125 | error = GB_end_transaction(gb_main, error); // close global transaction |
|---|
| 2126 | |
|---|
| 2127 | if (wasNotAllowedToAlign>0) { |
|---|
| 2128 | const char *mess = GBS_global_string("%i species were not aligned (because of protection level)", wasNotAllowedToAlign); |
|---|
| 2129 | aw_message(mess); |
|---|
| 2130 | } |
|---|
| 2131 | |
|---|
| 2132 | if (err_count) { |
|---|
| 2133 | aw_message_if(error); |
|---|
| 2134 | error = GBS_global_string("Aligner produced %i error%c", err_count, err_count==1 ? '\0' : 's'); |
|---|
| 2135 | } |
|---|
| 2136 | |
|---|
| 2137 | if (error_action != FA_NO_ACTION) { |
|---|
| 2138 | fa_assert(continue_on_error); |
|---|
| 2139 | |
|---|
| 2140 | GB_transaction ta(gb_main); |
|---|
| 2141 | GBT_mark_all(gb_main, 0); |
|---|
| 2142 | for (GBDATAlist::iterator sp = species_to_mark.begin(); sp != species_to_mark.end(); ++sp) { |
|---|
| 2143 | GB_write_flag(*sp, 1); |
|---|
| 2144 | } |
|---|
| 2145 | |
|---|
| 2146 | const char *whatsMarked = (error_action == FA_MARK_ALIGNED) ? "aligned" : "failed"; |
|---|
| 2147 | size_t markCount = species_to_mark.size(); |
|---|
| 2148 | if (markCount>0) { |
|---|
| 2149 | const char *msg = GBS_global_string("%zu %s species %s been marked", |
|---|
| 2150 | markCount, |
|---|
| 2151 | whatsMarked, |
|---|
| 2152 | (markCount == 1) ? "has" : "have"); |
|---|
| 2153 | aw_message(msg); |
|---|
| 2154 | } |
|---|
| 2155 | } |
|---|
| 2156 | |
|---|
| 2157 | return error; |
|---|
| 2158 | } |
|---|
| 2159 | |
|---|
| 2160 | void FastAligner_start(AW_window *aw, const AlignDataAccess *data_access) { |
|---|
| 2161 | AW_root *root = aw->get_root(); |
|---|
| 2162 | char *reference = NULp; // align against next relatives |
|---|
| 2163 | char *toalign = NULp; // align marked species |
|---|
| 2164 | ARB_ERROR error = NULp; |
|---|
| 2165 | int get_consensus = 0; |
|---|
| 2166 | int pt_server_id = -1; |
|---|
| 2167 | |
|---|
| 2168 | Aligner_get_first_selected_species get_first_selected_species = NULp; |
|---|
| 2169 | Aligner_get_next_selected_species get_next_selected_species = NULp; |
|---|
| 2170 | |
|---|
| 2171 | fa_assert(!island_hopper); |
|---|
| 2172 | if (root->awar(FA_AWAR_USE_ISLAND_HOPPING)->read_int()) { |
|---|
| 2173 | island_hopper = new IslandHopping; |
|---|
| 2174 | if (root->awar(FA_AWAR_USE_SECONDARY)->read_int()) { |
|---|
| 2175 | char *helix_string = data_access->getHelixString(); |
|---|
| 2176 | if (helix_string) { |
|---|
| 2177 | island_hopper->set_helix(helix_string); |
|---|
| 2178 | free(helix_string); |
|---|
| 2179 | } |
|---|
| 2180 | else { |
|---|
| 2181 | error = "Warning: No HELIX found. Can't use secondary structure"; |
|---|
| 2182 | } |
|---|
| 2183 | } |
|---|
| 2184 | |
|---|
| 2185 | if (!error) { |
|---|
| 2186 | island_hopper->set_parameters(root->awar(FA_AWAR_ESTIMATE_BASE_FREQ)->read_int(), |
|---|
| 2187 | root->awar(FA_AWAR_BASE_FREQ_T)->read_float(), |
|---|
| 2188 | root->awar(FA_AWAR_BASE_FREQ_C)->read_float(), |
|---|
| 2189 | root->awar(FA_AWAR_BASE_FREQ_A)->read_float(), |
|---|
| 2190 | root->awar(FA_AWAR_BASE_FREQ_C)->read_float(), |
|---|
| 2191 | root->awar(FA_AWAR_SUBST_PARA_CT)->read_float(), |
|---|
| 2192 | root->awar(FA_AWAR_SUBST_PARA_AT)->read_float(), |
|---|
| 2193 | root->awar(FA_AWAR_SUBST_PARA_GT)->read_float(), |
|---|
| 2194 | root->awar(FA_AWAR_SUBST_PARA_AC)->read_float(), |
|---|
| 2195 | root->awar(FA_AWAR_SUBST_PARA_CG)->read_float(), |
|---|
| 2196 | root->awar(FA_AWAR_SUBST_PARA_AG)->read_float(), |
|---|
| 2197 | root->awar(FA_AWAR_EXPECTED_DISTANCE)->read_float(), |
|---|
| 2198 | root->awar(FA_AWAR_STRUCTURE_SUPPLEMENT)->read_float(), |
|---|
| 2199 | root->awar(FA_AWAR_GAP_A)->read_float(), |
|---|
| 2200 | root->awar(FA_AWAR_GAP_B)->read_float(), |
|---|
| 2201 | root->awar(FA_AWAR_GAP_C)->read_float(), |
|---|
| 2202 | root->awar(FA_AWAR_THRESHOLD)->read_float() |
|---|
| 2203 | ); |
|---|
| 2204 | } |
|---|
| 2205 | } |
|---|
| 2206 | |
|---|
| 2207 | FA_alignTarget alignWhat = static_cast<FA_alignTarget>(root->awar(FA_AWAR_TO_ALIGN)->read_int()); |
|---|
| 2208 | if (!error) { |
|---|
| 2209 | switch (alignWhat) { |
|---|
| 2210 | case FA_CURRENT: { // align current species |
|---|
| 2211 | toalign = root->awar(AWAR_SPECIES_NAME)->read_string(); |
|---|
| 2212 | break; |
|---|
| 2213 | } |
|---|
| 2214 | case FA_MARKED: { // align marked species |
|---|
| 2215 | break; |
|---|
| 2216 | } |
|---|
| 2217 | case FA_SELECTED: { // align selected species |
|---|
| 2218 | get_first_selected_species = data_access->get_first_selected_species; |
|---|
| 2219 | get_next_selected_species = data_access->get_next_selected_species; |
|---|
| 2220 | break; |
|---|
| 2221 | } |
|---|
| 2222 | default: { |
|---|
| 2223 | fa_assert(0); |
|---|
| 2224 | break; |
|---|
| 2225 | } |
|---|
| 2226 | } |
|---|
| 2227 | |
|---|
| 2228 | switch (static_cast<FA_reference>(root->awar(FA_AWAR_REFERENCE)->read_int())) { |
|---|
| 2229 | case FA_REF_EXPLICIT: // align against specified species |
|---|
| 2230 | reference = root->awar(FA_AWAR_REFERENCE_NAME)->read_string(); |
|---|
| 2231 | break; |
|---|
| 2232 | |
|---|
| 2233 | case FA_REF_CONSENSUS: // align against group consensus |
|---|
| 2234 | if (data_access->get_group_consensus) { |
|---|
| 2235 | get_consensus = 1; |
|---|
| 2236 | } |
|---|
| 2237 | else { |
|---|
| 2238 | error = "Can't get group consensus here."; |
|---|
| 2239 | } |
|---|
| 2240 | break; |
|---|
| 2241 | |
|---|
| 2242 | case FA_REF_RELATIVES: // align against species searched via pt_server |
|---|
| 2243 | pt_server_id = root->awar(AWAR_PT_SERVER)->read_int(); |
|---|
| 2244 | if (pt_server_id<0) { |
|---|
| 2245 | error = "No pt_server selected"; |
|---|
| 2246 | } |
|---|
| 2247 | break; |
|---|
| 2248 | |
|---|
| 2249 | default: fa_assert(0); |
|---|
| 2250 | break; |
|---|
| 2251 | } |
|---|
| 2252 | } |
|---|
| 2253 | |
|---|
| 2254 | RangeList ranges; |
|---|
| 2255 | bool autoRestrictRange4nextRelSearch = true; |
|---|
| 2256 | |
|---|
| 2257 | if (!error) { |
|---|
| 2258 | switch (static_cast<FA_range>(root->awar(FA_AWAR_RANGE)->read_int())) { |
|---|
| 2259 | case FA_WHOLE_SEQUENCE: |
|---|
| 2260 | autoRestrictRange4nextRelSearch = false; |
|---|
| 2261 | ranges.add(PosRange::whole()); |
|---|
| 2262 | break; |
|---|
| 2263 | |
|---|
| 2264 | case FA_AROUND_CURSOR: { |
|---|
| 2265 | int curpos = root->awar(AWAR_CURSOR_POSITION_LOCAL)->read_int(); |
|---|
| 2266 | int size = root->awar(FA_AWAR_AROUND)->read_int(); |
|---|
| 2267 | |
|---|
| 2268 | ranges.add(PosRange(curpos-size, curpos+size)); |
|---|
| 2269 | break; |
|---|
| 2270 | } |
|---|
| 2271 | case FA_SELECTED_RANGE: { |
|---|
| 2272 | PosRange range; |
|---|
| 2273 | if (!data_access->get_selected_range(range)) { |
|---|
| 2274 | error = "There is no selected species!"; |
|---|
| 2275 | } |
|---|
| 2276 | else { |
|---|
| 2277 | ranges.add(range); |
|---|
| 2278 | } |
|---|
| 2279 | break; |
|---|
| 2280 | } |
|---|
| 2281 | |
|---|
| 2282 | case FA_SAI_MULTI_RANGE: { |
|---|
| 2283 | GB_transaction ta(data_access->gb_main); |
|---|
| 2284 | |
|---|
| 2285 | char *sai_name = root->awar(FA_AWAR_SAI_RANGE_NAME)->read_string(); |
|---|
| 2286 | char *aliuse = GBT_get_default_alignment(data_access->gb_main); |
|---|
| 2287 | fa_assert(aliuse); |
|---|
| 2288 | |
|---|
| 2289 | GBDATA *gb_sai = GBT_expect_SAI(data_access->gb_main, sai_name); |
|---|
| 2290 | if (!gb_sai) error = GB_await_error(); |
|---|
| 2291 | else { |
|---|
| 2292 | GBDATA *gb_data = GBT_find_sequence(gb_sai, aliuse); |
|---|
| 2293 | if (!gb_data) { |
|---|
| 2294 | error = GB_have_error() |
|---|
| 2295 | ? GB_await_error() |
|---|
| 2296 | : GBS_global_string("SAI '%s' has no data in alignment '%s'", sai_name, aliuse); |
|---|
| 2297 | } |
|---|
| 2298 | else { |
|---|
| 2299 | char *sai_data = GB_read_string(gb_data); // @@@ NOT_ALL_SAI_HAVE_DATA |
|---|
| 2300 | char *set_bits = root->awar(FA_AWAR_SAI_RANGE_CHARS)->read_string(); |
|---|
| 2301 | |
|---|
| 2302 | ranges = build_RangeList_from_string(sai_data, set_bits, false); |
|---|
| 2303 | |
|---|
| 2304 | free(set_bits); |
|---|
| 2305 | free(sai_data); |
|---|
| 2306 | } |
|---|
| 2307 | } |
|---|
| 2308 | free(aliuse); |
|---|
| 2309 | free(sai_name); |
|---|
| 2310 | break; |
|---|
| 2311 | } |
|---|
| 2312 | } |
|---|
| 2313 | } |
|---|
| 2314 | |
|---|
| 2315 | if (!error) { |
|---|
| 2316 | const char *alignment_name = data_access->alignment_name.c_str(); |
|---|
| 2317 | for (RangeList::iterator r = ranges.begin(); r != ranges.end() && !error; ++r) { |
|---|
| 2318 | PosRange range = *r; |
|---|
| 2319 | |
|---|
| 2320 | GBDATA *gb_main = data_access->gb_main; |
|---|
| 2321 | long alignment_length; |
|---|
| 2322 | { |
|---|
| 2323 | GB_transaction ta(gb_main); |
|---|
| 2324 | alignment_length = GBT_get_alignment_len(gb_main, alignment_name); |
|---|
| 2325 | fa_assert(alignment_length>0); |
|---|
| 2326 | } |
|---|
| 2327 | |
|---|
| 2328 | char *pt_server_alignment = root->awar(FA_AWAR_PT_SERVER_ALIGNMENT)->read_string(); |
|---|
| 2329 | PosRange relRange = PosRange::whole(); // unrestricted! |
|---|
| 2330 | |
|---|
| 2331 | if (autoRestrictRange4nextRelSearch) { |
|---|
| 2332 | AW_awar *awar_relrange = root->awar(FA_AWAR_RELATIVE_RANGE); |
|---|
| 2333 | const char *relrange = awar_relrange->read_char_pntr(); |
|---|
| 2334 | if (relrange[0]) { |
|---|
| 2335 | int region_plus = atoi(relrange); |
|---|
| 2336 | |
|---|
| 2337 | fa_assert(range.is_part()); |
|---|
| 2338 | |
|---|
| 2339 | relRange = PosRange(range.start()-region_plus, range.end()+region_plus); // restricted |
|---|
| 2340 | awar_relrange->write_as_string(GBS_global_string("%i", region_plus)); // set awar to detected value (avoid misbehavior when it contains ' ' or similar) |
|---|
| 2341 | } |
|---|
| 2342 | } |
|---|
| 2343 | |
|---|
| 2344 | if (island_hopper) { |
|---|
| 2345 | island_hopper->set_range(ExplicitRange(range, alignment_length)); |
|---|
| 2346 | range = PosRange::whole(); |
|---|
| 2347 | } |
|---|
| 2348 | |
|---|
| 2349 | SearchRelativeParams relSearch(new PT_FamilyFinder(gb_main, |
|---|
| 2350 | pt_server_id, |
|---|
| 2351 | root->awar(AWAR_NN_OLIGO_LEN)->read_int(), |
|---|
| 2352 | root->awar(AWAR_NN_MISMATCHES)->read_int(), |
|---|
| 2353 | root->awar(AWAR_NN_FAST_MODE)->read_int(), |
|---|
| 2354 | root->awar(AWAR_NN_REL_MATCHES)->read_int(), |
|---|
| 2355 | RSS_BOTH_MIN), // old scaling as b4 [8520] @@@ make configurable |
|---|
| 2356 | pt_server_alignment, |
|---|
| 2357 | root->awar(FA_AWAR_NEXT_RELATIVES)->read_int()); |
|---|
| 2358 | |
|---|
| 2359 | relSearch.getFamilyFinder()->restrict_2_region(relRange); |
|---|
| 2360 | |
|---|
| 2361 | struct AlignParams ali_params = { |
|---|
| 2362 | static_cast<FA_report>(root->awar(FA_AWAR_REPORT)->read_int()), |
|---|
| 2363 | bool(root->awar(FA_AWAR_SHOW_GAPS_MESSAGES)->read_int()), |
|---|
| 2364 | range |
|---|
| 2365 | }; |
|---|
| 2366 | |
|---|
| 2367 | { |
|---|
| 2368 | arb_progress progress("FastAligner"); |
|---|
| 2369 | progress.allow_title_reuse(); |
|---|
| 2370 | |
|---|
| 2371 | int cont_on_error = root->awar(FA_AWAR_CONTINUE_ON_ERROR)->read_int(); |
|---|
| 2372 | |
|---|
| 2373 | Aligner aligner(gb_main, |
|---|
| 2374 | alignWhat, |
|---|
| 2375 | alignment_name, |
|---|
| 2376 | toalign, |
|---|
| 2377 | get_first_selected_species, |
|---|
| 2378 | get_next_selected_species, |
|---|
| 2379 | |
|---|
| 2380 | reference, |
|---|
| 2381 | get_consensus ? data_access->get_group_consensus : NULp, |
|---|
| 2382 | relSearch, |
|---|
| 2383 | |
|---|
| 2384 | static_cast<FA_turn>(root->awar(FA_AWAR_MIRROR)->read_int()), |
|---|
| 2385 | ali_params, |
|---|
| 2386 | root->awar(FA_AWAR_PROTECTION)->read_int(), |
|---|
| 2387 | cont_on_error, |
|---|
| 2388 | (FA_errorAction)root->awar(FA_AWAR_ACTION_ON_ERROR)->read_int()); |
|---|
| 2389 | error = aligner.run(); |
|---|
| 2390 | |
|---|
| 2391 | if (error && cont_on_error) { |
|---|
| 2392 | aw_message_if(error); |
|---|
| 2393 | error = NULp; |
|---|
| 2394 | } |
|---|
| 2395 | } |
|---|
| 2396 | |
|---|
| 2397 | free(pt_server_alignment); |
|---|
| 2398 | } |
|---|
| 2399 | } |
|---|
| 2400 | |
|---|
| 2401 | if (island_hopper) { |
|---|
| 2402 | delete island_hopper; |
|---|
| 2403 | island_hopper = NULp; |
|---|
| 2404 | } |
|---|
| 2405 | |
|---|
| 2406 | if (toalign) free(toalign); |
|---|
| 2407 | aw_message_if(error); |
|---|
| 2408 | if (data_access->do_refresh) data_access->refresh_display(); |
|---|
| 2409 | } |
|---|
| 2410 | |
|---|
| 2411 | static void nullcb() { } |
|---|
| 2412 | |
|---|
| 2413 | void FastAligner_create_variables(AW_root *root, AW_default db1) { |
|---|
| 2414 | root->awar_string(FA_AWAR_REFERENCE_NAME, "", db1); |
|---|
| 2415 | |
|---|
| 2416 | root->awar_int(FA_AWAR_TO_ALIGN, FA_CURRENT, db1); |
|---|
| 2417 | root->awar_int(FA_AWAR_REFERENCE, FA_REF_EXPLICIT, db1); |
|---|
| 2418 | root->awar_int(FA_AWAR_RANGE, FA_WHOLE_SEQUENCE, db1); |
|---|
| 2419 | |
|---|
| 2420 | AW_awar *ali_protect = root->awar_int(FA_AWAR_PROTECTION, 0, db1); |
|---|
| 2421 | if (ARB_in_novice_mode(root)) { |
|---|
| 2422 | ali_protect->write_int(0); // reset protection for noobs |
|---|
| 2423 | } |
|---|
| 2424 | |
|---|
| 2425 | root->awar_int(FA_AWAR_AROUND, 25, db1); |
|---|
| 2426 | root->awar_int(FA_AWAR_MIRROR, FA_TURN_INTERACTIVE, db1); |
|---|
| 2427 | root->awar_int(FA_AWAR_REPORT, FA_NO_REPORT, db1); |
|---|
| 2428 | root->awar_int(FA_AWAR_SHOW_GAPS_MESSAGES, 1, db1); |
|---|
| 2429 | root->awar_int(FA_AWAR_CONTINUE_ON_ERROR, 1, db1); |
|---|
| 2430 | root->awar_int(FA_AWAR_ACTION_ON_ERROR, FA_NO_ACTION, db1); |
|---|
| 2431 | root->awar_int(FA_AWAR_USE_SECONDARY, 0, db1); |
|---|
| 2432 | root->awar_int(AWAR_PT_SERVER, 0, db1); |
|---|
| 2433 | root->awar_int(FA_AWAR_NEXT_RELATIVES, 1, db1)->set_minmax(1, 100); |
|---|
| 2434 | |
|---|
| 2435 | root->awar_string(FA_AWAR_RELATIVE_RANGE, "", db1); |
|---|
| 2436 | root->awar_string(FA_AWAR_PT_SERVER_ALIGNMENT, root->awar(AWAR_DEFAULT_ALIGNMENT)->read_char_pntr(), db1); |
|---|
| 2437 | |
|---|
| 2438 | root->awar_string(FA_AWAR_SAI_RANGE_NAME, "", db1); |
|---|
| 2439 | root->awar_string(FA_AWAR_SAI_RANGE_CHARS, "x1", db1); |
|---|
| 2440 | |
|---|
| 2441 | // island hopping: |
|---|
| 2442 | |
|---|
| 2443 | root->awar_int(FA_AWAR_USE_ISLAND_HOPPING, 0, db1); |
|---|
| 2444 | |
|---|
| 2445 | root->awar_int(FA_AWAR_ESTIMATE_BASE_FREQ, 1, db1); |
|---|
| 2446 | |
|---|
| 2447 | root->awar_float(FA_AWAR_BASE_FREQ_A, 0.25, db1); |
|---|
| 2448 | root->awar_float(FA_AWAR_BASE_FREQ_C, 0.25, db1); |
|---|
| 2449 | root->awar_float(FA_AWAR_BASE_FREQ_G, 0.25, db1); |
|---|
| 2450 | root->awar_float(FA_AWAR_BASE_FREQ_T, 0.25, db1); |
|---|
| 2451 | |
|---|
| 2452 | root->awar_float(FA_AWAR_SUBST_PARA_AC, 1.0, db1); |
|---|
| 2453 | root->awar_float(FA_AWAR_SUBST_PARA_AG, 4.0, db1); |
|---|
| 2454 | root->awar_float(FA_AWAR_SUBST_PARA_AT, 1.0, db1); |
|---|
| 2455 | root->awar_float(FA_AWAR_SUBST_PARA_CG, 1.0, db1); |
|---|
| 2456 | root->awar_float(FA_AWAR_SUBST_PARA_CT, 4.0, db1); |
|---|
| 2457 | root->awar_float(FA_AWAR_SUBST_PARA_GT, 1.0, db1); |
|---|
| 2458 | |
|---|
| 2459 | root->awar_float(FA_AWAR_EXPECTED_DISTANCE, 0.3, db1); |
|---|
| 2460 | root->awar_float(FA_AWAR_STRUCTURE_SUPPLEMENT, 0.5, db1); |
|---|
| 2461 | root->awar_float(FA_AWAR_THRESHOLD, 0.005, db1); |
|---|
| 2462 | |
|---|
| 2463 | root->awar_float(FA_AWAR_GAP_A, 8.0, db1); |
|---|
| 2464 | root->awar_float(FA_AWAR_GAP_B, 4.0, db1); |
|---|
| 2465 | root->awar_float(FA_AWAR_GAP_C, 7.0, db1); |
|---|
| 2466 | |
|---|
| 2467 | AWTC_create_common_next_neighbour_vars(root, makeRootCallback(nullcb)); |
|---|
| 2468 | } |
|---|
| 2469 | |
|---|
| 2470 | void FastAligner_set_align_current(AW_root *root, AW_default db1) { |
|---|
| 2471 | root->awar_int(FA_AWAR_TO_ALIGN, FA_CURRENT, db1); |
|---|
| 2472 | } |
|---|
| 2473 | |
|---|
| 2474 | void FastAligner_set_reference_species(AW_root *root) { |
|---|
| 2475 | // sets the aligner reference species to current species |
|---|
| 2476 | char *specName = root->awar(AWAR_SPECIES_NAME)->read_string(); |
|---|
| 2477 | root->awar(FA_AWAR_REFERENCE_NAME)->write_string(specName); |
|---|
| 2478 | free(specName); |
|---|
| 2479 | } |
|---|
| 2480 | |
|---|
| 2481 | static AW_window *create_island_hopping_window(AW_root *root) { |
|---|
| 2482 | AW_window_simple *aws = new AW_window_simple; |
|---|
| 2483 | |
|---|
| 2484 | aws->init(root, "ISLAND_HOPPING_PARA", "Parameters for Island Hopping"); |
|---|
| 2485 | aws->load_xfig("faligner/islandhopping.fig"); |
|---|
| 2486 | |
|---|
| 2487 | aws->at("close"); |
|---|
| 2488 | aws->callback(AW_POPDOWN); |
|---|
| 2489 | aws->create_button("CLOSE", "CLOSE", "O"); |
|---|
| 2490 | |
|---|
| 2491 | aws->at("help"); |
|---|
| 2492 | aws->callback(makeHelpCallback("islandhopping.hlp")); |
|---|
| 2493 | aws->create_button("HELP", "HELP"); |
|---|
| 2494 | |
|---|
| 2495 | aws->at("use_secondary"); |
|---|
| 2496 | aws->label("Use secondary structure (only for re-align)"); |
|---|
| 2497 | aws->create_toggle(FA_AWAR_USE_SECONDARY); |
|---|
| 2498 | |
|---|
| 2499 | aws->at("freq"); |
|---|
| 2500 | aws->create_toggle_field(FA_AWAR_ESTIMATE_BASE_FREQ, "Base freq."); |
|---|
| 2501 | aws->insert_default_toggle("Estimate", "E", 1); |
|---|
| 2502 | aws->insert_toggle("Define here: ", "D", 0); |
|---|
| 2503 | aws->update_toggle_field(); |
|---|
| 2504 | |
|---|
| 2505 | aws->at("freq_a"); aws->label("A:"); aws->create_input_field(FA_AWAR_BASE_FREQ_A, 4); |
|---|
| 2506 | aws->at("freq_c"); aws->label("C:"); aws->create_input_field(FA_AWAR_BASE_FREQ_C, 4); |
|---|
| 2507 | aws->at("freq_g"); aws->label("G:"); aws->create_input_field(FA_AWAR_BASE_FREQ_G, 4); |
|---|
| 2508 | aws->at("freq_t"); aws->label("T:"); aws->create_input_field(FA_AWAR_BASE_FREQ_T, 4); |
|---|
| 2509 | |
|---|
| 2510 | int xpos[4], ypos[4]; |
|---|
| 2511 | { |
|---|
| 2512 | aws->button_length(1); |
|---|
| 2513 | |
|---|
| 2514 | int dummy; |
|---|
| 2515 | aws->at("h_a"); aws->get_at_position(&xpos[0], &dummy); aws->create_button(NULp, "A"); |
|---|
| 2516 | aws->at("h_c"); aws->get_at_position(&xpos[1], &dummy); aws->create_button(NULp, "C"); |
|---|
| 2517 | aws->at("h_g"); aws->get_at_position(&xpos[2], &dummy); aws->create_button(NULp, "G"); |
|---|
| 2518 | aws->at("h_t"); aws->get_at_position(&xpos[3], &dummy); aws->create_button(NULp, "T"); |
|---|
| 2519 | |
|---|
| 2520 | aws->at("v_a"); aws->get_at_position(&dummy, &ypos[0]); aws->create_button(NULp, "A"); |
|---|
| 2521 | aws->at("v_c"); aws->get_at_position(&dummy, &ypos[1]); aws->create_button(NULp, "C"); |
|---|
| 2522 | aws->at("v_g"); aws->get_at_position(&dummy, &ypos[2]); aws->create_button(NULp, "G"); |
|---|
| 2523 | aws->at("v_t"); aws->get_at_position(&dummy, &ypos[3]); aws->create_button(NULp, "T"); |
|---|
| 2524 | } |
|---|
| 2525 | |
|---|
| 2526 | aws->at("subst"); aws->create_button(NULp, "Substitution rate parameters:"); |
|---|
| 2527 | |
|---|
| 2528 | #define XOFF -25 |
|---|
| 2529 | #define YOFF 0 |
|---|
| 2530 | |
|---|
| 2531 | aws->at(xpos[1]+XOFF, ypos[0]+YOFF); aws->create_input_field(FA_AWAR_SUBST_PARA_AC, 4); |
|---|
| 2532 | aws->at(xpos[2]+XOFF, ypos[0]+YOFF); aws->create_input_field(FA_AWAR_SUBST_PARA_AG, 4); |
|---|
| 2533 | aws->at(xpos[3]+XOFF, ypos[0]+YOFF); aws->create_input_field(FA_AWAR_SUBST_PARA_AT, 4); |
|---|
| 2534 | aws->at(xpos[2]+XOFF, ypos[1]+YOFF); aws->create_input_field(FA_AWAR_SUBST_PARA_CG, 4); |
|---|
| 2535 | aws->at(xpos[3]+XOFF, ypos[1]+YOFF); aws->create_input_field(FA_AWAR_SUBST_PARA_CT, 4); |
|---|
| 2536 | aws->at(xpos[3]+XOFF, ypos[2]+YOFF); aws->create_input_field(FA_AWAR_SUBST_PARA_GT, 4); |
|---|
| 2537 | |
|---|
| 2538 | #undef XOFF |
|---|
| 2539 | #undef YOFF |
|---|
| 2540 | |
|---|
| 2541 | aws->label_length(22); |
|---|
| 2542 | |
|---|
| 2543 | aws->at("dist"); |
|---|
| 2544 | aws->label("Expected distance"); |
|---|
| 2545 | aws->create_input_field(FA_AWAR_EXPECTED_DISTANCE, 5); |
|---|
| 2546 | |
|---|
| 2547 | aws->at("supp"); |
|---|
| 2548 | aws->label("Structure supplement"); |
|---|
| 2549 | aws->create_input_field(FA_AWAR_STRUCTURE_SUPPLEMENT, 5); |
|---|
| 2550 | |
|---|
| 2551 | aws->at("thres"); |
|---|
| 2552 | aws->label("Threshold"); |
|---|
| 2553 | aws->create_input_field(FA_AWAR_THRESHOLD, 5); |
|---|
| 2554 | |
|---|
| 2555 | aws->label_length(10); |
|---|
| 2556 | |
|---|
| 2557 | aws->at("gapA"); |
|---|
| 2558 | aws->label("Gap A"); |
|---|
| 2559 | aws->create_input_field(FA_AWAR_GAP_A, 5); |
|---|
| 2560 | |
|---|
| 2561 | aws->at("gapB"); |
|---|
| 2562 | aws->label("Gap B"); |
|---|
| 2563 | aws->create_input_field(FA_AWAR_GAP_B, 5); |
|---|
| 2564 | |
|---|
| 2565 | aws->at("gapC"); |
|---|
| 2566 | aws->label("Gap C"); |
|---|
| 2567 | aws->create_input_field(FA_AWAR_GAP_C, 5); |
|---|
| 2568 | |
|---|
| 2569 | return aws; |
|---|
| 2570 | } |
|---|
| 2571 | |
|---|
| 2572 | static AW_window *create_family_settings_window(AW_root *root) { |
|---|
| 2573 | static AW_window_simple *aws = NULp; |
|---|
| 2574 | |
|---|
| 2575 | if (!aws) { |
|---|
| 2576 | aws = new AW_window_simple; |
|---|
| 2577 | |
|---|
| 2578 | aws->init(root, "FAMILY_PARAMS", "Family search parameters"); |
|---|
| 2579 | aws->load_xfig("faligner/family_settings.fig"); |
|---|
| 2580 | |
|---|
| 2581 | aws->at("close"); |
|---|
| 2582 | aws->callback(AW_POPDOWN); |
|---|
| 2583 | aws->create_button("CLOSE", "CLOSE", "O"); |
|---|
| 2584 | |
|---|
| 2585 | aws->at("help"); |
|---|
| 2586 | aws->callback(makeHelpCallback("next_neighbours_common.hlp")); |
|---|
| 2587 | aws->create_button("HELP", "HELP"); |
|---|
| 2588 | |
|---|
| 2589 | aws->auto_space(5, 5); |
|---|
| 2590 | AWTC_create_common_next_neighbour_fields(aws, 200); |
|---|
| 2591 | } |
|---|
| 2592 | |
|---|
| 2593 | return aws; |
|---|
| 2594 | } |
|---|
| 2595 | |
|---|
| 2596 | static AWT_config_mapping_def aligner_config_mapping[] = { |
|---|
| 2597 | { FA_AWAR_USE_ISLAND_HOPPING, "island" }, |
|---|
| 2598 | { FA_AWAR_TO_ALIGN, "target" }, |
|---|
| 2599 | { FA_AWAR_REFERENCE, "ref" }, |
|---|
| 2600 | { FA_AWAR_REFERENCE_NAME, "refname" }, |
|---|
| 2601 | { FA_AWAR_RELATIVE_RANGE, "relrange" }, |
|---|
| 2602 | { FA_AWAR_NEXT_RELATIVES, "relatives" }, |
|---|
| 2603 | { FA_AWAR_PT_SERVER_ALIGNMENT, "ptali" }, |
|---|
| 2604 | |
|---|
| 2605 | // relative-search specific parameters from subwindow (create_family_settings_window) |
|---|
| 2606 | // same as ../DB_UI/ui_species.cxx@RELATIVES_CONFIG |
|---|
| 2607 | { AWAR_NN_OLIGO_LEN, "oligolen" }, |
|---|
| 2608 | { AWAR_NN_MISMATCHES, "mismatches" }, |
|---|
| 2609 | { AWAR_NN_FAST_MODE, "fastmode" }, |
|---|
| 2610 | { AWAR_NN_REL_MATCHES, "relmatches" }, |
|---|
| 2611 | { AWAR_NN_REL_SCALING, "relscaling" }, |
|---|
| 2612 | |
|---|
| 2613 | // island-hopping parameters (create_island_hopping_window) |
|---|
| 2614 | { FA_AWAR_USE_SECONDARY, "use2nd" }, |
|---|
| 2615 | { FA_AWAR_ESTIMATE_BASE_FREQ, "estbasefreq" }, |
|---|
| 2616 | { FA_AWAR_BASE_FREQ_A, "freq_a" }, |
|---|
| 2617 | { FA_AWAR_BASE_FREQ_C, "freq_c" }, |
|---|
| 2618 | { FA_AWAR_BASE_FREQ_G, "freq_g" }, |
|---|
| 2619 | { FA_AWAR_BASE_FREQ_T, "freq_t" }, |
|---|
| 2620 | { FA_AWAR_SUBST_PARA_AC, "subst_ac" }, |
|---|
| 2621 | { FA_AWAR_SUBST_PARA_AG, "subst_ag" }, |
|---|
| 2622 | { FA_AWAR_SUBST_PARA_AT, "subst_at" }, |
|---|
| 2623 | { FA_AWAR_SUBST_PARA_CG, "subst_cg" }, |
|---|
| 2624 | { FA_AWAR_SUBST_PARA_CT, "subst_ct" }, |
|---|
| 2625 | { FA_AWAR_SUBST_PARA_GT, "subst_gt" }, |
|---|
| 2626 | { FA_AWAR_EXPECTED_DISTANCE, "distance" }, |
|---|
| 2627 | { FA_AWAR_STRUCTURE_SUPPLEMENT, "supplement" }, |
|---|
| 2628 | { FA_AWAR_THRESHOLD, "threshold" }, |
|---|
| 2629 | { FA_AWAR_GAP_A, "gap_a" }, |
|---|
| 2630 | { FA_AWAR_GAP_B, "gap_b" }, |
|---|
| 2631 | { FA_AWAR_GAP_C, "gap_c" }, |
|---|
| 2632 | |
|---|
| 2633 | { NULp, NULp } |
|---|
| 2634 | }; |
|---|
| 2635 | |
|---|
| 2636 | AW_window *FastAligner_create_window(AW_root *root, const AlignDataAccess *data_access) { |
|---|
| 2637 | AW_window_simple *aws = new AW_window_simple; |
|---|
| 2638 | |
|---|
| 2639 | aws->init(root, "INTEGRATED_ALIGNERS", INTEGRATED_ALIGNERS_TITLE); |
|---|
| 2640 | aws->load_xfig("faligner/faligner.fig"); |
|---|
| 2641 | |
|---|
| 2642 | aws->label_length(10); |
|---|
| 2643 | aws->button_length(10); |
|---|
| 2644 | |
|---|
| 2645 | aws->at("close"); |
|---|
| 2646 | aws->callback(AW_POPDOWN); |
|---|
| 2647 | aws->create_button("CLOSE", "CLOSE", "O"); |
|---|
| 2648 | |
|---|
| 2649 | aws->at("help"); |
|---|
| 2650 | aws->callback(makeHelpCallback("faligner.hlp")); |
|---|
| 2651 | aws->create_button("HELP", "HELP"); |
|---|
| 2652 | |
|---|
| 2653 | aws->at("aligner"); |
|---|
| 2654 | aws->create_toggle_field(FA_AWAR_USE_ISLAND_HOPPING, "Aligner"); |
|---|
| 2655 | aws->insert_default_toggle("Fast aligner", "F", 0); |
|---|
| 2656 | aws->sens_mask(AWM_EXP); |
|---|
| 2657 | aws->insert_toggle ("Island Hopping", "I", 1); |
|---|
| 2658 | aws->sens_mask(AWM_ALL); |
|---|
| 2659 | aws->update_toggle_field(); |
|---|
| 2660 | |
|---|
| 2661 | aws->button_length(12); |
|---|
| 2662 | aws->at("island_para"); |
|---|
| 2663 | aws->callback(create_island_hopping_window); |
|---|
| 2664 | aws->sens_mask(AWM_EXP); |
|---|
| 2665 | aws->create_button("island_para", "Parameters", ""); |
|---|
| 2666 | aws->sens_mask(AWM_ALL); |
|---|
| 2667 | |
|---|
| 2668 | aws->button_length(10); |
|---|
| 2669 | |
|---|
| 2670 | aws->at("rev_compl"); |
|---|
| 2671 | aws->callback(makeWindowCallback(build_reverse_complement, data_access)); |
|---|
| 2672 | aws->create_button("reverse_complement", "Turn now!", ""); |
|---|
| 2673 | |
|---|
| 2674 | aws->at("what"); |
|---|
| 2675 | aws->create_toggle_field(FA_AWAR_TO_ALIGN, "Align what?"); |
|---|
| 2676 | aws->insert_toggle ("Current Species:", "A", FA_CURRENT); |
|---|
| 2677 | aws->insert_default_toggle("Marked Species", "M", FA_MARKED); |
|---|
| 2678 | aws->insert_toggle ("Selected Species", "S", FA_SELECTED); |
|---|
| 2679 | aws->update_toggle_field(); |
|---|
| 2680 | |
|---|
| 2681 | aws->at("swhat"); |
|---|
| 2682 | aws->create_input_field(AWAR_SPECIES_NAME, 2); |
|---|
| 2683 | |
|---|
| 2684 | aws->at("against"); |
|---|
| 2685 | aws->create_toggle_field(FA_AWAR_REFERENCE, "Reference"); |
|---|
| 2686 | aws->insert_toggle ("Species by name:", "S", FA_REF_EXPLICIT); |
|---|
| 2687 | aws->insert_toggle ("Group consensus", "K", FA_REF_CONSENSUS); |
|---|
| 2688 | aws->insert_default_toggle("Auto search by pt_server:", "A", FA_REF_RELATIVES); |
|---|
| 2689 | aws->update_toggle_field(); |
|---|
| 2690 | |
|---|
| 2691 | aws->at("sagainst"); |
|---|
| 2692 | aws->create_input_field(FA_AWAR_REFERENCE_NAME, 2); |
|---|
| 2693 | |
|---|
| 2694 | aws->at("copy"); |
|---|
| 2695 | aws->callback(RootAsWindowCallback::simple(FastAligner_set_reference_species)); |
|---|
| 2696 | aws->create_button("Copy", "Copy", ""); |
|---|
| 2697 | |
|---|
| 2698 | aws->label_length(0); |
|---|
| 2699 | aws->at("pt_server"); |
|---|
| 2700 | awt_create_PTSERVER_selection_button(aws, AWAR_PT_SERVER); |
|---|
| 2701 | |
|---|
| 2702 | aws->label_length(23); |
|---|
| 2703 | aws->at("relrange"); |
|---|
| 2704 | aws->label("Data from range only, plus"); |
|---|
| 2705 | aws->create_input_field(FA_AWAR_RELATIVE_RANGE, 3); |
|---|
| 2706 | |
|---|
| 2707 | aws->at("relatives"); |
|---|
| 2708 | aws->label("Number of relatives to use"); |
|---|
| 2709 | aws->create_input_field(FA_AWAR_NEXT_RELATIVES, 3); |
|---|
| 2710 | |
|---|
| 2711 | aws->label_length(9); |
|---|
| 2712 | aws->at("use_ali"); |
|---|
| 2713 | aws->label("Alignment"); |
|---|
| 2714 | aws->create_input_field(FA_AWAR_PT_SERVER_ALIGNMENT, 12); |
|---|
| 2715 | |
|---|
| 2716 | aws->at("relSett"); |
|---|
| 2717 | aws->callback(create_family_settings_window); |
|---|
| 2718 | aws->create_autosize_button("Settings", "More settings", ""); |
|---|
| 2719 | |
|---|
| 2720 | // Range |
|---|
| 2721 | |
|---|
| 2722 | aws->label_length(10); |
|---|
| 2723 | aws->at("range"); |
|---|
| 2724 | aws->create_toggle_field(FA_AWAR_RANGE, "Range"); |
|---|
| 2725 | aws->insert_default_toggle("Whole sequence", "", FA_WHOLE_SEQUENCE); |
|---|
| 2726 | aws->insert_toggle ("Positions around cursor: ", "", FA_AROUND_CURSOR); |
|---|
| 2727 | aws->insert_toggle ("Selected range", "", FA_SELECTED_RANGE); |
|---|
| 2728 | aws->insert_toggle ("Multi-Range by SAI", "", FA_SAI_MULTI_RANGE); |
|---|
| 2729 | aws->update_toggle_field(); |
|---|
| 2730 | |
|---|
| 2731 | aws->at("around"); |
|---|
| 2732 | aws->create_input_field(FA_AWAR_AROUND, 2); |
|---|
| 2733 | |
|---|
| 2734 | aws->at("sai"); |
|---|
| 2735 | awt_create_SAI_selection_button(data_access->gb_main, aws, FA_AWAR_SAI_RANGE_NAME); |
|---|
| 2736 | |
|---|
| 2737 | aws->at("rchars"); |
|---|
| 2738 | aws->create_input_field(FA_AWAR_SAI_RANGE_CHARS, 2); |
|---|
| 2739 | |
|---|
| 2740 | // Protection |
|---|
| 2741 | |
|---|
| 2742 | aws->at("protection"); |
|---|
| 2743 | aws->label("Protection"); |
|---|
| 2744 | aws->create_option_menu(FA_AWAR_PROTECTION); |
|---|
| 2745 | aws->insert_default_option("0", NULp, 0); |
|---|
| 2746 | aws->insert_option ("1", NULp, 1); |
|---|
| 2747 | aws->insert_option ("2", NULp, 2); |
|---|
| 2748 | aws->insert_option ("3", NULp, 3); |
|---|
| 2749 | aws->insert_option ("4", NULp, 4); |
|---|
| 2750 | aws->insert_option ("5", NULp, 5); |
|---|
| 2751 | aws->insert_option ("6", NULp, 6); |
|---|
| 2752 | aws->update_option_menu(); |
|---|
| 2753 | |
|---|
| 2754 | // MirrorCheck |
|---|
| 2755 | |
|---|
| 2756 | aws->at("mirror"); |
|---|
| 2757 | aws->label("Turn check"); |
|---|
| 2758 | aws->create_option_menu(FA_AWAR_MIRROR); |
|---|
| 2759 | aws->insert_option ("Never turn sequence", "", FA_TURN_NEVER); |
|---|
| 2760 | aws->insert_default_option("User acknowledgment ", "", FA_TURN_INTERACTIVE); |
|---|
| 2761 | aws->insert_option ("Automatically turn sequence", "", FA_TURN_ALWAYS); |
|---|
| 2762 | aws->update_option_menu(); |
|---|
| 2763 | |
|---|
| 2764 | // Report |
|---|
| 2765 | |
|---|
| 2766 | aws->at("insert"); |
|---|
| 2767 | aws->label("Report"); |
|---|
| 2768 | aws->create_option_menu(FA_AWAR_REPORT); |
|---|
| 2769 | aws->insert_option ("No report", "", FA_NO_REPORT); |
|---|
| 2770 | aws->sens_mask(AWM_EXP); |
|---|
| 2771 | aws->insert_default_option("Report to temporary entries", "", FA_TEMP_REPORT); |
|---|
| 2772 | aws->insert_option ("Report to resident entries", "", FA_REPORT); |
|---|
| 2773 | aws->sens_mask(AWM_ALL); |
|---|
| 2774 | aws->update_option_menu(); |
|---|
| 2775 | |
|---|
| 2776 | aws->at("gaps"); |
|---|
| 2777 | aws->create_toggle(FA_AWAR_SHOW_GAPS_MESSAGES); |
|---|
| 2778 | |
|---|
| 2779 | aws->at("continue"); |
|---|
| 2780 | aws->create_toggle(FA_AWAR_CONTINUE_ON_ERROR); |
|---|
| 2781 | |
|---|
| 2782 | aws->at("on_failure"); |
|---|
| 2783 | aws->label("On failure"); |
|---|
| 2784 | aws->create_option_menu(FA_AWAR_ACTION_ON_ERROR); |
|---|
| 2785 | aws->insert_default_option("do nothing", "", FA_NO_ACTION); |
|---|
| 2786 | aws->insert_option ("mark failed", "", FA_MARK_FAILED); |
|---|
| 2787 | aws->insert_option ("mark aligned", "", FA_MARK_ALIGNED); |
|---|
| 2788 | aws->update_option_menu(); |
|---|
| 2789 | |
|---|
| 2790 | aws->at("align"); |
|---|
| 2791 | aws->callback(makeWindowCallback(FastAligner_start, data_access)); |
|---|
| 2792 | aws->highlight(); |
|---|
| 2793 | aws->create_button("GO", "GO", "G"); |
|---|
| 2794 | |
|---|
| 2795 | aws->at("config"); |
|---|
| 2796 | AWT_insert_config_manager(aws, AW_ROOT_DEFAULT, "aligner", aligner_config_mapping); |
|---|
| 2797 | |
|---|
| 2798 | return aws; |
|---|
| 2799 | } |
|---|
| 2800 | |
|---|
| 2801 | // -------------------------------------------------------------------------------- |
|---|
| 2802 | |
|---|
| 2803 | #ifdef UNIT_TESTS |
|---|
| 2804 | |
|---|
| 2805 | #include <test_unit.h> |
|---|
| 2806 | |
|---|
| 2807 | // --------------------- |
|---|
| 2808 | // OligoCounter |
|---|
| 2809 | |
|---|
| 2810 | #include <map> |
|---|
| 2811 | #include <string> |
|---|
| 2812 | |
|---|
| 2813 | using std::map; |
|---|
| 2814 | using std::string; |
|---|
| 2815 | |
|---|
| 2816 | typedef map<string, size_t> OligoCount; |
|---|
| 2817 | |
|---|
| 2818 | class OligoCounter { |
|---|
| 2819 | size_t oligo_len; |
|---|
| 2820 | size_t datasize; |
|---|
| 2821 | |
|---|
| 2822 | mutable OligoCount occurrence; |
|---|
| 2823 | |
|---|
| 2824 | static string removeGaps(const char *seq) { |
|---|
| 2825 | size_t len = strlen(seq); |
|---|
| 2826 | string nogaps; |
|---|
| 2827 | nogaps.reserve(len); |
|---|
| 2828 | |
|---|
| 2829 | for (size_t p = 0; p<len; ++p) { |
|---|
| 2830 | char c = seq[p]; |
|---|
| 2831 | if (!is_gap(c)) nogaps.append(1, c); |
|---|
| 2832 | } |
|---|
| 2833 | return nogaps; |
|---|
| 2834 | } |
|---|
| 2835 | |
|---|
| 2836 | void count_oligos(const string& seq) { |
|---|
| 2837 | occurrence.clear(); |
|---|
| 2838 | size_t max_pos = seq.length()-oligo_len; |
|---|
| 2839 | for (size_t p = 0; p <= max_pos; ++p) { |
|---|
| 2840 | string oligo(seq, p, oligo_len); |
|---|
| 2841 | occurrence[oligo]++; |
|---|
| 2842 | } |
|---|
| 2843 | } |
|---|
| 2844 | |
|---|
| 2845 | public: |
|---|
| 2846 | OligoCounter() |
|---|
| 2847 | : oligo_len(0), |
|---|
| 2848 | datasize(0) |
|---|
| 2849 | {} |
|---|
| 2850 | OligoCounter(const char *seq, size_t oligo_len_) |
|---|
| 2851 | : oligo_len(oligo_len_) |
|---|
| 2852 | { |
|---|
| 2853 | string seq_nogaps = removeGaps(seq); |
|---|
| 2854 | datasize = seq_nogaps.length(); |
|---|
| 2855 | count_oligos(seq_nogaps); |
|---|
| 2856 | } |
|---|
| 2857 | |
|---|
| 2858 | size_t oligo_count(const char *oligo) { |
|---|
| 2859 | fa_assert(strlen(oligo) == oligo_len); |
|---|
| 2860 | return occurrence[oligo]; |
|---|
| 2861 | } |
|---|
| 2862 | |
|---|
| 2863 | size_t similarity_score(const OligoCounter& other) const { |
|---|
| 2864 | size_t score = 0; |
|---|
| 2865 | if (oligo_len == other.oligo_len) { |
|---|
| 2866 | for (OligoCount::const_iterator o = occurrence.begin(); o != occurrence.end(); ++o) { |
|---|
| 2867 | const string& oligo = o->first; |
|---|
| 2868 | size_t count = o->second; |
|---|
| 2869 | |
|---|
| 2870 | score += min(count, other.occurrence[oligo]); |
|---|
| 2871 | } |
|---|
| 2872 | } |
|---|
| 2873 | return score; |
|---|
| 2874 | } |
|---|
| 2875 | |
|---|
| 2876 | size_t getDataSize() const { return datasize; } |
|---|
| 2877 | }; |
|---|
| 2878 | |
|---|
| 2879 | void TEST_OligoCounter() { |
|---|
| 2880 | OligoCounter oc1("CCAGGT", 3); |
|---|
| 2881 | OligoCounter oc2("GGTCCA", 3); |
|---|
| 2882 | OligoCounter oc2_gaps("..GGT--CCA..", 3); |
|---|
| 2883 | OligoCounter oc3("AGGTCC", 3); |
|---|
| 2884 | OligoCounter oc4("AGGTCCAGG", 3); |
|---|
| 2885 | |
|---|
| 2886 | TEST_EXPECT_EQUAL(oc1.oligo_count("CCA"), 1); |
|---|
| 2887 | TEST_EXPECT_ZERO(oc1.oligo_count("CCG")); |
|---|
| 2888 | TEST_EXPECT_EQUAL(oc4.oligo_count("AGG"), 2); |
|---|
| 2889 | |
|---|
| 2890 | int sc1_2 = oc1.similarity_score(oc2); |
|---|
| 2891 | int sc2_1 = oc2.similarity_score(oc1); |
|---|
| 2892 | TEST_EXPECT_EQUAL(sc1_2, sc2_1); |
|---|
| 2893 | |
|---|
| 2894 | int sc1_2gaps = oc1.similarity_score(oc2_gaps); |
|---|
| 2895 | TEST_EXPECT_EQUAL(sc1_2, sc1_2gaps); |
|---|
| 2896 | |
|---|
| 2897 | int sc1_3 = oc1.similarity_score(oc3); |
|---|
| 2898 | int sc2_3 = oc2.similarity_score(oc3); |
|---|
| 2899 | int sc3_4 = oc3.similarity_score(oc4); |
|---|
| 2900 | |
|---|
| 2901 | TEST_EXPECT_EQUAL(sc1_2, 2); // common oligos (CCA GGT) |
|---|
| 2902 | TEST_EXPECT_EQUAL(sc1_3, 2); // common oligos (AGG GGT) |
|---|
| 2903 | TEST_EXPECT_EQUAL(sc2_3, 3); // common oligos (GGT GTC TCC) |
|---|
| 2904 | |
|---|
| 2905 | TEST_EXPECT_EQUAL(sc3_4, 4); |
|---|
| 2906 | } |
|---|
| 2907 | |
|---|
| 2908 | // ------------------------- |
|---|
| 2909 | // FakeFamilyFinder |
|---|
| 2910 | |
|---|
| 2911 | class FakeFamilyFinder: public FamilyFinder { // derived from a Noncopyable |
|---|
| 2912 | // used by unit tests to detect next relatives instead of asking the pt-server |
|---|
| 2913 | |
|---|
| 2914 | GBDATA *gb_main; |
|---|
| 2915 | string ali_name; |
|---|
| 2916 | map<string, OligoCounter> oligos_counted; // key = species name |
|---|
| 2917 | PosRange counted_for_range; |
|---|
| 2918 | size_t oligo_len; |
|---|
| 2919 | |
|---|
| 2920 | public: |
|---|
| 2921 | FakeFamilyFinder(GBDATA *gb_main_, string ali_name_, bool rel_matches_, size_t oligo_len_) |
|---|
| 2922 | : FamilyFinder(rel_matches_, RSS_BOTH_MIN), |
|---|
| 2923 | gb_main(gb_main_), |
|---|
| 2924 | ali_name(ali_name_), |
|---|
| 2925 | counted_for_range(PosRange::whole()), |
|---|
| 2926 | oligo_len(oligo_len_) |
|---|
| 2927 | {} |
|---|
| 2928 | |
|---|
| 2929 | GB_ERROR searchFamily(const char *sequence, FF_complement compl_mode, int max_results, double min_score) OVERRIDE { |
|---|
| 2930 | // 'sequence' has to contain full sequence or part corresponding to 'range' |
|---|
| 2931 | |
|---|
| 2932 | TEST_EXPECT_EQUAL(compl_mode, FF_FORWARD); // not fit for other modes |
|---|
| 2933 | |
|---|
| 2934 | delete_family_list(); |
|---|
| 2935 | |
|---|
| 2936 | OligoCounter seq_oligo_count(sequence, oligo_len); |
|---|
| 2937 | |
|---|
| 2938 | if (range != counted_for_range) { |
|---|
| 2939 | oligos_counted.clear(); // forget results for different range |
|---|
| 2940 | counted_for_range = range; |
|---|
| 2941 | } |
|---|
| 2942 | |
|---|
| 2943 | char *buffer = NULp; |
|---|
| 2944 | int buffersize = 0; |
|---|
| 2945 | |
|---|
| 2946 | bool partial_match = range.is_part(); |
|---|
| 2947 | |
|---|
| 2948 | GB_transaction ta(gb_main); |
|---|
| 2949 | int results = 0; |
|---|
| 2950 | |
|---|
| 2951 | for (GBDATA *gb_species = GBT_first_species(gb_main); |
|---|
| 2952 | gb_species && results<max_results; |
|---|
| 2953 | gb_species = GBT_next_species(gb_species)) |
|---|
| 2954 | { |
|---|
| 2955 | string name = GBT_get_name_or_description(gb_species); |
|---|
| 2956 | if (oligos_counted.find(name) == oligos_counted.end()) { |
|---|
| 2957 | GBDATA *gb_data = GBT_find_sequence(gb_species, ali_name.c_str()); |
|---|
| 2958 | const char *spec_seq = GB_read_char_pntr(gb_data); |
|---|
| 2959 | |
|---|
| 2960 | if (partial_match) { |
|---|
| 2961 | int spec_seq_len = GB_read_count(gb_data); |
|---|
| 2962 | int range_len = ExplicitRange(range, spec_seq_len).size(); |
|---|
| 2963 | |
|---|
| 2964 | if (buffersize<range_len) { |
|---|
| 2965 | delete [] buffer; |
|---|
| 2966 | buffersize = range_len; |
|---|
| 2967 | buffer = new char[buffersize+1]; |
|---|
| 2968 | } |
|---|
| 2969 | |
|---|
| 2970 | range.copy_corresponding_part(buffer, spec_seq, spec_seq_len); |
|---|
| 2971 | oligos_counted[name] = OligoCounter(buffer, oligo_len); |
|---|
| 2972 | } |
|---|
| 2973 | else { |
|---|
| 2974 | oligos_counted[name] = OligoCounter(spec_seq, oligo_len); |
|---|
| 2975 | } |
|---|
| 2976 | } |
|---|
| 2977 | |
|---|
| 2978 | const OligoCounter& spec_oligo_count = oligos_counted[name]; |
|---|
| 2979 | size_t score = seq_oligo_count.similarity_score(spec_oligo_count); |
|---|
| 2980 | |
|---|
| 2981 | if (score>=min_score) { |
|---|
| 2982 | FamilyList *newMember = new FamilyList; |
|---|
| 2983 | |
|---|
| 2984 | newMember->name = strdup(name.c_str()); |
|---|
| 2985 | newMember->matches = score; |
|---|
| 2986 | newMember->rel_matches = score/spec_oligo_count.getDataSize(); |
|---|
| 2987 | newMember->next = NULp; |
|---|
| 2988 | |
|---|
| 2989 | family_list = newMember->insertSortedBy_matches(family_list); |
|---|
| 2990 | results++; |
|---|
| 2991 | } |
|---|
| 2992 | } |
|---|
| 2993 | |
|---|
| 2994 | delete [] buffer; |
|---|
| 2995 | |
|---|
| 2996 | return NULp; |
|---|
| 2997 | } |
|---|
| 2998 | }; |
|---|
| 2999 | |
|---|
| 3000 | // ---------------------------- |
|---|
| 3001 | // test_alignment_data |
|---|
| 3002 | |
|---|
| 3003 | #include <arb_unit_test.h> |
|---|
| 3004 | |
|---|
| 3005 | static const char *test_aliname = "ali_test"; |
|---|
| 3006 | |
|---|
| 3007 | static const char *get_aligned_data_of(GBDATA *gb_main, const char *species_name) { |
|---|
| 3008 | GB_transaction ta(gb_main); |
|---|
| 3009 | ARB_ERROR error; |
|---|
| 3010 | const char *data = NULp; |
|---|
| 3011 | |
|---|
| 3012 | GBDATA *gb_species = GBT_find_species(gb_main, species_name); |
|---|
| 3013 | if (!gb_species) error = GB_await_error(); |
|---|
| 3014 | else { |
|---|
| 3015 | GBDATA *gb_data = GBT_find_sequence(gb_species, test_aliname); |
|---|
| 3016 | if (!gb_data) error = GB_await_error(); |
|---|
| 3017 | else { |
|---|
| 3018 | data = GB_read_char_pntr(gb_data); |
|---|
| 3019 | if (!data) error = GB_await_error(); |
|---|
| 3020 | } |
|---|
| 3021 | } |
|---|
| 3022 | |
|---|
| 3023 | TEST_EXPECT_NULL(error.deliver()); |
|---|
| 3024 | |
|---|
| 3025 | return data; |
|---|
| 3026 | } |
|---|
| 3027 | |
|---|
| 3028 | static const char *get_used_rels_for(GBDATA *gb_main, const char *species_name) { |
|---|
| 3029 | GB_transaction ta(gb_main); |
|---|
| 3030 | const char *result = NULp; |
|---|
| 3031 | GBDATA *gb_species = GBT_find_species(gb_main, species_name); |
|---|
| 3032 | if (!gb_species) result = GBS_global_string("<No such species '%s'>", species_name); |
|---|
| 3033 | else { |
|---|
| 3034 | GBDATA *gb_used_rels = GB_search(gb_species, "used_rels", GB_FIND); |
|---|
| 3035 | if (!gb_used_rels) result = "<No such field 'used_rels'>"; |
|---|
| 3036 | else result = GB_read_char_pntr(gb_used_rels); |
|---|
| 3037 | } |
|---|
| 3038 | return result; |
|---|
| 3039 | } |
|---|
| 3040 | |
|---|
| 3041 | static GB_ERROR forget_used_relatives(GBDATA *gb_main) { |
|---|
| 3042 | GB_ERROR error = NULp; |
|---|
| 3043 | GB_transaction ta(gb_main); |
|---|
| 3044 | for (GBDATA *gb_species = GBT_first_species(gb_main); |
|---|
| 3045 | gb_species && !error; |
|---|
| 3046 | gb_species = GBT_next_species(gb_species)) |
|---|
| 3047 | { |
|---|
| 3048 | GBDATA *gb_used_rels = GB_search(gb_species, "used_rels", GB_FIND); |
|---|
| 3049 | if (gb_used_rels) { |
|---|
| 3050 | error = GB_delete(gb_used_rels); |
|---|
| 3051 | } |
|---|
| 3052 | } |
|---|
| 3053 | return error; |
|---|
| 3054 | } |
|---|
| 3055 | |
|---|
| 3056 | |
|---|
| 3057 | #define ALIGNED_DATA_OF(name) get_aligned_data_of(gb_main, name) |
|---|
| 3058 | #define USED_RELS_FOR(name) get_used_rels_for(gb_main, name) |
|---|
| 3059 | |
|---|
| 3060 | // ---------------------------------------- |
|---|
| 3061 | |
|---|
| 3062 | static GBDATA *selection_fake_gb_main = NULp; |
|---|
| 3063 | static GBDATA *selection_fake_gb_last = NULp; |
|---|
| 3064 | |
|---|
| 3065 | static GBDATA *fake_first_selected(int *count) { |
|---|
| 3066 | selection_fake_gb_last = NULp; |
|---|
| 3067 | *count = 2; // we fake two species as selected ("c1" and "c2") |
|---|
| 3068 | return GBT_find_species(selection_fake_gb_main, "c1"); |
|---|
| 3069 | } |
|---|
| 3070 | static GBDATA *fake_next_selected() { |
|---|
| 3071 | if (!selection_fake_gb_last) { |
|---|
| 3072 | selection_fake_gb_last = GBT_find_species(selection_fake_gb_main, "c2"); |
|---|
| 3073 | } |
|---|
| 3074 | else { |
|---|
| 3075 | selection_fake_gb_last = NULp; |
|---|
| 3076 | } |
|---|
| 3077 | return selection_fake_gb_last; |
|---|
| 3078 | } |
|---|
| 3079 | |
|---|
| 3080 | static char *fake_get_consensus(const char*, PosRange range) { |
|---|
| 3081 | const char *data = get_aligned_data_of(selection_fake_gb_main, "s1"); |
|---|
| 3082 | if (range.is_whole()) return strdup(data); |
|---|
| 3083 | return ARB_strpartdup(data+range.start(), data+range.end()); |
|---|
| 3084 | } |
|---|
| 3085 | |
|---|
| 3086 | static void test_install_fakes(GBDATA *gb_main) { |
|---|
| 3087 | selection_fake_gb_main = gb_main; |
|---|
| 3088 | } |
|---|
| 3089 | |
|---|
| 3090 | |
|---|
| 3091 | // ---------------------------------------- |
|---|
| 3092 | |
|---|
| 3093 | static AlignParams test_ali_params = { |
|---|
| 3094 | FA_NO_REPORT, |
|---|
| 3095 | false, // showGapsMessages |
|---|
| 3096 | PosRange::whole() |
|---|
| 3097 | }; |
|---|
| 3098 | |
|---|
| 3099 | static AlignParams test_ali_params_partial = { |
|---|
| 3100 | FA_NO_REPORT, |
|---|
| 3101 | false, // showGapsMessages |
|---|
| 3102 | PosRange(25, 60) |
|---|
| 3103 | }; |
|---|
| 3104 | |
|---|
| 3105 | // ---------------------------------------- |
|---|
| 3106 | |
|---|
| 3107 | static struct arb_unit_test::test_alignment_data TestAlignmentData_TargetAndReferenceHandling[] = { |
|---|
| 3108 | { 0, "s1", ".........A--UCU-C------C-U-AAACC-CA-A-C-C-G-UAG-UUC--------GAA-U-UGAGG-AC--U-GUAA-CU-C..........." }, // reference |
|---|
| 3109 | { 0, "s2", "AUCUCCUAAACCCAACCGUAGUUCGAAUUGAGGACUGUAACUC......................................................" }, // align single sequence (same data as reference) |
|---|
| 3110 | { 1, "m1", "UAGAGGAUUUGGGUUGGCAUCAAGCUUAACUCCUGACAUUGAG......................................................" }, // align marked sequences.. (complement of reference) |
|---|
| 3111 | { 1, "m2", "...UCCUAAACCAACCCGUAGUUCGAAUUGAGGACUGUAA........................................................." }, |
|---|
| 3112 | { 1, "m3", "AUC---UAAACCAACCCGUAGUUCGAAUUGAGGACUG---CUC......................................................" }, |
|---|
| 3113 | { 0, "c1", "AUCUCCUAAACCCAACC--------AAUUGAGGACUGUAACUC......................................................" }, // align selected sequences.. |
|---|
| 3114 | { 0, "c2", "AUCUCCU------AACCGUAGUUCCCCGAA------ACUGUAACUC..................................................." }, |
|---|
| 3115 | { 0, "r1", "GAGUUACAGUCCUCAAUUCGGGGAACUACGGUUGGGUUUAGGAGAU..................................................." }, // align by faked pt_server |
|---|
| 3116 | }; |
|---|
| 3117 | |
|---|
| 3118 | void TEST_Aligner_TargetAndReferenceHandling() { |
|---|
| 3119 | // performs some alignments to check logic of target and reference handling |
|---|
| 3120 | |
|---|
| 3121 | GB_shell shell; |
|---|
| 3122 | ARB_ERROR error; |
|---|
| 3123 | GBDATA *gb_main = TEST_CREATE_DB(error, test_aliname, TestAlignmentData_TargetAndReferenceHandling, false); |
|---|
| 3124 | |
|---|
| 3125 | TEST_EXPECT_NULL(error.deliver()); |
|---|
| 3126 | |
|---|
| 3127 | SearchRelativeParams search_relative_params(new FakeFamilyFinder(gb_main, test_aliname, false, 8), |
|---|
| 3128 | test_aliname, |
|---|
| 3129 | 2); |
|---|
| 3130 | |
|---|
| 3131 | test_install_fakes(gb_main); |
|---|
| 3132 | arb_suppress_progress silence; |
|---|
| 3133 | |
|---|
| 3134 | // bool cont_on_err = true; |
|---|
| 3135 | bool cont_on_err = false; |
|---|
| 3136 | |
|---|
| 3137 | TEST_EXPECT_EQUAL(GBT_count_marked_species(gb_main), 3); // we got 3 marked species |
|---|
| 3138 | { |
|---|
| 3139 | Aligner aligner(gb_main, |
|---|
| 3140 | FA_CURRENT, |
|---|
| 3141 | test_aliname, |
|---|
| 3142 | "s2", // toalign |
|---|
| 3143 | NULp, // get_first_selected_species |
|---|
| 3144 | NULp, // get_next_selected_species |
|---|
| 3145 | "s1", // reference species |
|---|
| 3146 | NULp, // get_consensus |
|---|
| 3147 | search_relative_params, // relative search |
|---|
| 3148 | FA_TURN_ALWAYS, |
|---|
| 3149 | test_ali_params, |
|---|
| 3150 | 0, |
|---|
| 3151 | cont_on_err, |
|---|
| 3152 | FA_NO_ACTION); |
|---|
| 3153 | error = aligner.run(); |
|---|
| 3154 | TEST_EXPECT_NULL(error.deliver()); |
|---|
| 3155 | } |
|---|
| 3156 | TEST_EXPECT_EQUAL(GBT_count_marked_species(gb_main), 3); // we still got 3 marked species |
|---|
| 3157 | { |
|---|
| 3158 | Aligner aligner(gb_main, |
|---|
| 3159 | FA_MARKED, |
|---|
| 3160 | test_aliname, |
|---|
| 3161 | NULp, // toalign |
|---|
| 3162 | NULp, // get_first_selected_species |
|---|
| 3163 | NULp, // get_next_selected_species |
|---|
| 3164 | "s1", // reference species |
|---|
| 3165 | NULp, // get_consensus |
|---|
| 3166 | search_relative_params, // relative search |
|---|
| 3167 | FA_TURN_ALWAYS, |
|---|
| 3168 | test_ali_params, |
|---|
| 3169 | 0, |
|---|
| 3170 | cont_on_err, |
|---|
| 3171 | FA_MARK_FAILED); |
|---|
| 3172 | error = aligner.run(); |
|---|
| 3173 | TEST_EXPECT_NULL(error.deliver()); |
|---|
| 3174 | |
|---|
| 3175 | TEST_EXPECT(!cont_on_err || GBT_count_marked_species(gb_main) == 0); // FA_MARK_FAILED (none failed -> none marked) |
|---|
| 3176 | } |
|---|
| 3177 | { |
|---|
| 3178 | Aligner aligner(gb_main, |
|---|
| 3179 | FA_SELECTED, |
|---|
| 3180 | test_aliname, |
|---|
| 3181 | NULp, // toalign |
|---|
| 3182 | fake_first_selected, // get_first_selected_species |
|---|
| 3183 | fake_next_selected, // get_next_selected_species |
|---|
| 3184 | NULp, // reference species |
|---|
| 3185 | fake_get_consensus, // get_consensus |
|---|
| 3186 | search_relative_params, // relative search |
|---|
| 3187 | FA_TURN_ALWAYS, |
|---|
| 3188 | test_ali_params, |
|---|
| 3189 | 0, |
|---|
| 3190 | cont_on_err, |
|---|
| 3191 | FA_MARK_ALIGNED); |
|---|
| 3192 | error = aligner.run(); |
|---|
| 3193 | TEST_EXPECT_NULL(error.deliver()); |
|---|
| 3194 | |
|---|
| 3195 | TEST_EXPECT(!cont_on_err || GBT_count_marked_species(gb_main) == 2); // FA_MARK_ALIGNED (2 selected were aligned) |
|---|
| 3196 | } |
|---|
| 3197 | { |
|---|
| 3198 | Aligner aligner(gb_main, |
|---|
| 3199 | FA_CURRENT, |
|---|
| 3200 | test_aliname, |
|---|
| 3201 | "r1", // toalign |
|---|
| 3202 | NULp, // get_first_selected_species |
|---|
| 3203 | NULp, // get_next_selected_species |
|---|
| 3204 | NULp, // reference species |
|---|
| 3205 | NULp, // get_consensus |
|---|
| 3206 | search_relative_params, // relative search |
|---|
| 3207 | FA_TURN_ALWAYS, |
|---|
| 3208 | test_ali_params, |
|---|
| 3209 | 0, |
|---|
| 3210 | cont_on_err, |
|---|
| 3211 | FA_MARK_ALIGNED); |
|---|
| 3212 | |
|---|
| 3213 | error = aligner.run(); |
|---|
| 3214 | TEST_EXPECT_NULL(error.deliver()); |
|---|
| 3215 | |
|---|
| 3216 | TEST_EXPECT(!cont_on_err || GBT_count_marked_species(gb_main) == 1); |
|---|
| 3217 | } |
|---|
| 3218 | |
|---|
| 3219 | TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("s2"), ".........A--UCU-C------C-U-AAACC-CA-A-C-C-G-UAG-UUC--------GAA-U-UGAGG-AC--U-GUAA-CU-C..........."); |
|---|
| 3220 | |
|---|
| 3221 | TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("m1"), ".......UAG--AGG-A------U-U-UGGGU-UG-G-C-A-U-CAA-GCU--------UAA-C-UCCUG-AC--A-UUGAG..............."); |
|---|
| 3222 | TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("m2"), "..............U-C------C-U-AAACC-AA-C-C-C-G-UAG-UUC--------GAA-U-UGAGG-AC--U-GUAA................"); |
|---|
| 3223 | TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("m3"), ".........A--U----------C-U-AAACC-AA-C-C-C-G-UAG-UUC--------GAA-U-UGAGG-AC--U-G----CU-C..........."); |
|---|
| 3224 | |
|---|
| 3225 | TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("c1"), ".........A--UCU-C------C-U-AAACC-CA-A-C-C-------------------AA-U-UGAGG-AC--U-GUAA-CU-C..........."); |
|---|
| 3226 | TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("c2"), ".........A--UCU-C------C-U-AA---------C-C-G-UAG-UUC------------C-CCGAA-AC--U-GUAA-CU-C..........."); |
|---|
| 3227 | |
|---|
| 3228 | TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("r1"), ".........A--UCU-C------C-U-AAACC-CA-A-C-C-G-UAG-UUCCCC-----GAA-U-UGAGG-AC--U-GUAA-CU-C..........."); // here sequence shall be turned! |
|---|
| 3229 | |
|---|
| 3230 | |
|---|
| 3231 | TEST_EXPECT_EQUAL(USED_RELS_FOR("r1"), "s2:43, s1:1"); |
|---|
| 3232 | |
|---|
| 3233 | // ---------------------------------------------- |
|---|
| 3234 | // now align all others vs next relative |
|---|
| 3235 | |
|---|
| 3236 | search_relative_params.maxRelatives = 5; |
|---|
| 3237 | TEST_EXPECT_NO_ERROR(forget_used_relatives(gb_main)); |
|---|
| 3238 | |
|---|
| 3239 | int species_count = ARRAY_ELEMS(TestAlignmentData_TargetAndReferenceHandling); |
|---|
| 3240 | for (int sp = 0; sp<species_count; ++sp) { |
|---|
| 3241 | const char *name = TestAlignmentData_TargetAndReferenceHandling[sp].name; |
|---|
| 3242 | if (strcmp(name, "r1") != 0) { // skip "r1" (already done above) |
|---|
| 3243 | Aligner aligner(gb_main, |
|---|
| 3244 | FA_CURRENT, |
|---|
| 3245 | test_aliname, |
|---|
| 3246 | name, // toalign |
|---|
| 3247 | NULp, // get_first_selected_species |
|---|
| 3248 | NULp, // get_next_selected_species |
|---|
| 3249 | NULp, // reference species |
|---|
| 3250 | NULp, // get_consensus |
|---|
| 3251 | search_relative_params, // relative search |
|---|
| 3252 | FA_TURN_ALWAYS, |
|---|
| 3253 | test_ali_params, |
|---|
| 3254 | 0, |
|---|
| 3255 | cont_on_err, |
|---|
| 3256 | FA_MARK_ALIGNED); |
|---|
| 3257 | |
|---|
| 3258 | error = aligner.run(); |
|---|
| 3259 | TEST_EXPECT_NULL(error.deliver()); |
|---|
| 3260 | |
|---|
| 3261 | TEST_EXPECT(!cont_on_err || GBT_count_marked_species(gb_main) == 1); |
|---|
| 3262 | } |
|---|
| 3263 | } |
|---|
| 3264 | |
|---|
| 3265 | TEST_EXPECT_EQUAL(USED_RELS_FOR("s1"), "s2"); |
|---|
| 3266 | TEST_EXPECT_EQUAL(USED_RELS_FOR("s2"), "s1"); // same as done manually |
|---|
| 3267 | TEST_EXPECT_EQUAL(USED_RELS_FOR("m1"), "r1:42"); |
|---|
| 3268 | TEST_EXPECT_EQUAL(USED_RELS_FOR("m2"), "m3"); |
|---|
| 3269 | TEST_EXPECT_EQUAL(USED_RELS_FOR("m3"), "m2"); |
|---|
| 3270 | TEST_EXPECT_EQUAL(USED_RELS_FOR("c1"), "r1"); |
|---|
| 3271 | TEST_EXPECT_EQUAL(USED_RELS_FOR("c2"), "r1"); |
|---|
| 3272 | |
|---|
| 3273 | // range aligned below (see test_ali_params_partial) |
|---|
| 3274 | // "-------------------------XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX------------------------------------" |
|---|
| 3275 | TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("s1"), ".........A--UCU-C------C-U-AAACC-CA-A-C-C-G-UAG-UUC--------GAA-U-UGAGG-AC--U-GUAA-CU-C..........."); // 1st aligning of 's1' |
|---|
| 3276 | TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("s2"), ".........A--UCU-C------C-U-AAACC-CA-A-C-C-G-UAG-UUC--------GAA-U-UGAGG-AC--U-GUAA-CU-C..........."); // same_as_above (again aligned vs 's1') |
|---|
| 3277 | |
|---|
| 3278 | TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("m1"), ".........U--AGA-G------G---AUUUG-GG-U-U-G-G-CAU-CAAGCU-----UAA-C-UCCUG-AC--A-UUGAG---------------"); // changed; @@@ bug: no dots at end |
|---|
| 3279 | |
|---|
| 3280 | TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("m2"), ".........U--C----------C-U-AAACC-AA-C-C-C-G-UAG-UUC--------GAA-U-UGAGG-AC--U-G----UA-A..........."); // changed (1st align vs 's1', this align vs 'm3') |
|---|
| 3281 | TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("m3"), ".........A--U----------C-U-AAACC-AA-C-C-C-G-UAG-UUC--------GAA-U-UGAGG-AC--U-G----CU-C..........."); // same_as_above (1st align vs 's1', this align vs 'm2') |
|---|
| 3282 | TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("c1"), ".........A--UCU-C------C-U-AAACC-CA-A-C-C-------------------AA-U-UGAGG-AC--U-GUAA-CU-C..........."); // same_as_above |
|---|
| 3283 | TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("c2"), ".........A--UCU-C------C-U--------A-A-C-C-G-UAG-UUCCCC-----GA--------A-AC--U-GUAA-CU-C..........."); // changed |
|---|
| 3284 | |
|---|
| 3285 | // -------------------------------------- |
|---|
| 3286 | // test partial relative search |
|---|
| 3287 | |
|---|
| 3288 | search_relative_params.getFamilyFinder()->restrict_2_region(test_ali_params_partial.range); |
|---|
| 3289 | TEST_EXPECT_NO_ERROR(forget_used_relatives(gb_main)); |
|---|
| 3290 | |
|---|
| 3291 | for (int sp = 0; sp<species_count; ++sp) { |
|---|
| 3292 | const char *name = TestAlignmentData_TargetAndReferenceHandling[sp].name; |
|---|
| 3293 | Aligner aligner(gb_main, |
|---|
| 3294 | FA_CURRENT, |
|---|
| 3295 | test_aliname, |
|---|
| 3296 | name, // toalign |
|---|
| 3297 | NULp, // get_first_selected_species |
|---|
| 3298 | NULp, // get_next_selected_species |
|---|
| 3299 | NULp, // reference species |
|---|
| 3300 | NULp, // get_consensus |
|---|
| 3301 | search_relative_params, // relative search |
|---|
| 3302 | FA_TURN_NEVER, |
|---|
| 3303 | test_ali_params_partial, |
|---|
| 3304 | 0, |
|---|
| 3305 | cont_on_err, |
|---|
| 3306 | FA_MARK_ALIGNED); |
|---|
| 3307 | |
|---|
| 3308 | error = aligner.run(); |
|---|
| 3309 | TEST_EXPECT_NULL(error.deliver()); |
|---|
| 3310 | |
|---|
| 3311 | TEST_EXPECT(!cont_on_err || GBT_count_marked_species(gb_main) == 1); |
|---|
| 3312 | } |
|---|
| 3313 | |
|---|
| 3314 | TEST_EXPECT_EQUAL(USED_RELS_FOR("s1"), "s2"); |
|---|
| 3315 | TEST_EXPECT_EQUAL(USED_RELS_FOR("s2"), "s1"); |
|---|
| 3316 | TEST_EXPECT_EQUAL(USED_RELS_FOR("m1"), "r1"); // (not really differs) |
|---|
| 3317 | TEST_EXPECT_EQUAL(USED_RELS_FOR("m2"), "m3"); |
|---|
| 3318 | TEST_EXPECT_EQUAL(USED_RELS_FOR("m3"), "m2"); |
|---|
| 3319 | TEST_EXPECT_EQUAL(USED_RELS_FOR("c1"), "r1"); |
|---|
| 3320 | TEST_EXPECT_EQUAL(USED_RELS_FOR("c2"), "r1"); |
|---|
| 3321 | TEST_EXPECT_EQUAL(USED_RELS_FOR("r1"), "s2:20, c2:3"); |
|---|
| 3322 | |
|---|
| 3323 | // aligned range (see test_ali_params_partial) |
|---|
| 3324 | // "-------------------------XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX------------------------------------" |
|---|
| 3325 | TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("s1"), ".........A--UCU-C------C-U-AAACC-CA-A-C-C-G-UAG-UUC--------GAA-U-UGAGG-AC--U-GUAA-CU-C..........."); // same_as_above |
|---|
| 3326 | TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("s2"), ".........A--UCU-C------C-U-AAACC-CA-A-C-C-G-UAG-UUC--------GAA-U-UGAGG-AC--U-GUAA-CU-C..........."); // same_as_above |
|---|
| 3327 | |
|---|
| 3328 | TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("m1"), ".........U--AGA-G------G-A-UU-UG-GG-U-U-G-G-CAU-CAAGCU-----UAA-C-UCCUG-AC--A-UUGAG---------------"); // changed |
|---|
| 3329 | TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("m2"), ".........U--C----------C-U-AAACC-AA-C-C-C-G-UAG-UUC--------GAA-U-UGAGG-AC--U-G----UA-A..........."); // same_as_above |
|---|
| 3330 | TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("m3"), ".........A--U----------C-U-AAACC-AA-C-C-C-G-UAG-UUC--------GAA-U-UGAGG-AC--U-G----CU-C..........."); // same_as_above |
|---|
| 3331 | |
|---|
| 3332 | TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("c1"), ".........A--UCU-C------C-U-AAACC-CA-A-C-C-------------------AA-U-UGAGG-AC--U-GUAA-CU-C..........."); // same_as_above |
|---|
| 3333 | TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("c2"), ".........A--UCU-C------C---------UA-A-C-C-G-UAG-UUCCCC-----GA--------A-AC--U-GUAA-CU-C..........."); // changed |
|---|
| 3334 | |
|---|
| 3335 | TEST_EXPECT_EQUAL(ALIGNED_DATA_OF("r1"), ".........A--UCU-C------C-U-AAACC-CA-A-C-C-G-UAG-UUCCCC-----GAA-U-UGAGG-AC--U-GUAA-CU-C..........."); // same_as_above |
|---|
| 3336 | |
|---|
| 3337 | GB_close(gb_main); |
|---|
| 3338 | } |
|---|
| 3339 | |
|---|
| 3340 | // ---------------------------------------- |
|---|
| 3341 | |
|---|
| 3342 | static struct arb_unit_test::test_alignment_data TestAlignmentData_checksumError[] = { |
|---|
| 3343 | { 0, "MtnK1722", "...G-GGC-C-G............CCC-GG--------CAAUGGGGGCGGCCCGGCGGAC----GG--C-UCAGU-A---AAG-UCGUAACAA-GG-UAG-CCGU-AGGGGAA-CCUG-CGGC-UGGAUCACCUCC....." }, // gets aligned |
|---|
| 3344 | { 0, "MhnFormi", "...A-CGA-U-C------------CUUCGG--------GGUCG-U-GG-C-GU-A--C------GG--C-UCAGU-A---AAG-UCGUAACAA-GG-UAG-CCGU-AGGGGAA-CCUG-CGGC-UGGAUCACCUCCU...." }, // 1st relative |
|---|
| 3345 | { 0, "MhnT1916", "...A-CGA-A-C------------CUU-GU--------GUUCG-U-GG-C-GA-A--C------GG--C-UCAGU-A---AAG-UCGUAACAA-GG-UAG-CCGU-AGGGGAA-CCUG-CGGC-UGGAUCACCUCCU...." }, // next relative |
|---|
| 3346 | { 0, "MthVanni", "...U-GGU-U-U------------C-------------GGCCA-U-GG-C-GG-A--C------GG--C-UCAUU-A---AAG-UCGUAACAA-GG-UAG-CCGU-AGGGGAA-CCUG-CGGC-UGGAUCACCUCC....." }, // next relative |
|---|
| 3347 | { 0, "ThcCeler", "...G-GGG-C-G...CC-U---U--------GC--G--CGCAC-C-GG-C-GG-A--C------GG--C-UCAGU-A---AAG-UCGUAACAA-GG-UAG-CCGU-AGGGGAA-CCUA-CGGC-UCGAUCACCUCCU...." }, // next relative |
|---|
| 3348 | }; |
|---|
| 3349 | |
|---|
| 3350 | void TEST_SLOW_Aligner_checksumError() { |
|---|
| 3351 | // @@@ SLOW cause this often gets terminated in nightly builds |
|---|
| 3352 | // no idea why. normally it runs a few ms |
|---|
| 3353 | |
|---|
| 3354 | // this produces an internal aligner error |
|---|
| 3355 | |
|---|
| 3356 | GB_shell shell; |
|---|
| 3357 | ARB_ERROR error; |
|---|
| 3358 | GBDATA *gb_main = TEST_CREATE_DB(error, test_aliname, TestAlignmentData_checksumError, false); |
|---|
| 3359 | |
|---|
| 3360 | SearchRelativeParams search_relative_params(new FakeFamilyFinder(gb_main, test_aliname, false, 8), |
|---|
| 3361 | test_aliname, |
|---|
| 3362 | 10); // use up to 10 relatives |
|---|
| 3363 | |
|---|
| 3364 | test_install_fakes(gb_main); |
|---|
| 3365 | arb_suppress_progress silence; |
|---|
| 3366 | |
|---|
| 3367 | bool cont_on_err = true; |
|---|
| 3368 | if (!error) { |
|---|
| 3369 | Aligner aligner(gb_main, |
|---|
| 3370 | FA_CURRENT, |
|---|
| 3371 | test_aliname, |
|---|
| 3372 | "MtnK1722", // toalign |
|---|
| 3373 | NULp, // get_first_selected_species |
|---|
| 3374 | NULp, // get_next_selected_species |
|---|
| 3375 | NULp, // reference species |
|---|
| 3376 | NULp, // get_consensus |
|---|
| 3377 | search_relative_params, // relative search |
|---|
| 3378 | FA_TURN_ALWAYS, |
|---|
| 3379 | test_ali_params, |
|---|
| 3380 | 0, |
|---|
| 3381 | cont_on_err, |
|---|
| 3382 | FA_MARK_ALIGNED); |
|---|
| 3383 | |
|---|
| 3384 | error = aligner.run(); |
|---|
| 3385 | } |
|---|
| 3386 | { |
|---|
| 3387 | GB_ERROR err = error.deliver(); |
|---|
| 3388 | TEST_EXPECT_NULL__BROKEN(err, "Aligner produced 1 error"); |
|---|
| 3389 | } |
|---|
| 3390 | TEST_EXPECT_EQUAL__BROKEN(USED_RELS_FOR("MtnK1722"), "???", "<No such field 'used_rels'>"); // subsequent failure |
|---|
| 3391 | |
|---|
| 3392 | GB_close(gb_main); |
|---|
| 3393 | } |
|---|
| 3394 | |
|---|
| 3395 | static const char *asstr(LooseBases& ub) { |
|---|
| 3396 | LooseBases tmp; |
|---|
| 3397 | while (!ub.is_empty()) tmp.memorize(ub.recall()); |
|---|
| 3398 | |
|---|
| 3399 | const char *result = ""; |
|---|
| 3400 | while (!tmp.is_empty()) { |
|---|
| 3401 | ExplicitRange r = tmp.recall(); |
|---|
| 3402 | result = GBS_global_string("%s %i/%i", result, r.start(), r.end()); |
|---|
| 3403 | ub.memorize(r); |
|---|
| 3404 | } |
|---|
| 3405 | return result; |
|---|
| 3406 | } |
|---|
| 3407 | |
|---|
| 3408 | void TEST_BASIC_UnalignedBases() { |
|---|
| 3409 | LooseBases ub; |
|---|
| 3410 | TEST_EXPECT(ub.is_empty()); |
|---|
| 3411 | TEST_EXPECT_EQUAL(asstr(ub), ""); |
|---|
| 3412 | |
|---|
| 3413 | // test add+remove |
|---|
| 3414 | ub.memorize(ExplicitRange(5, 7)); |
|---|
| 3415 | TEST_REJECT(ub.is_empty()); |
|---|
| 3416 | TEST_EXPECT_EQUAL(asstr(ub), " 5/7"); |
|---|
| 3417 | |
|---|
| 3418 | TEST_EXPECT(ub.recall() == ExplicitRange(5, 7)); |
|---|
| 3419 | TEST_EXPECT(ub.is_empty()); |
|---|
| 3420 | |
|---|
| 3421 | ub.memorize(ExplicitRange(2, 4)); |
|---|
| 3422 | TEST_EXPECT_EQUAL(asstr(ub), " 2/4"); |
|---|
| 3423 | |
|---|
| 3424 | ub.memorize(ExplicitRange(4, 9)); |
|---|
| 3425 | TEST_EXPECT_EQUAL(asstr(ub), " 2/4 4/9"); |
|---|
| 3426 | |
|---|
| 3427 | ub.memorize(ExplicitRange(8, 10)); |
|---|
| 3428 | ub.memorize(ExplicitRange(11, 14)); |
|---|
| 3429 | ub.memorize(ExplicitRange(12, 17)); |
|---|
| 3430 | TEST_EXPECT_EQUAL(asstr(ub), " 2/4 4/9 8/10 11/14 12/17"); |
|---|
| 3431 | TEST_EXPECT_EQUAL(asstr(ub), " 2/4 4/9 8/10 11/14 12/17"); // check asstr has no side-effect |
|---|
| 3432 | |
|---|
| 3433 | { |
|---|
| 3434 | LooseBases toaddNrecalc; |
|---|
| 3435 | |
|---|
| 3436 | CompactedSubSequence Old("ACGTACGT", 8, "name1"); |
|---|
| 3437 | CompactedSubSequence New("--A-C--G-T--A-C-G-T", 19, "name2"); |
|---|
| 3438 | // ---------------------- 0123456789012345678 |
|---|
| 3439 | |
|---|
| 3440 | toaddNrecalc.memorize(ExplicitRange(1, 7)); |
|---|
| 3441 | toaddNrecalc.memorize(ExplicitRange(3, 5)); |
|---|
| 3442 | TEST_EXPECT_EQUAL(asstr(toaddNrecalc), " 1/7 3/5"); |
|---|
| 3443 | |
|---|
| 3444 | ub.follow_ali_change_and_append(toaddNrecalc, AliChange(Old, New)); |
|---|
| 3445 | |
|---|
| 3446 | TEST_EXPECT_EQUAL(asstr(ub), " 3/18 8/15 2/4 4/9 8/10 11/14 12/17"); |
|---|
| 3447 | TEST_EXPECT(toaddNrecalc.is_empty()); |
|---|
| 3448 | |
|---|
| 3449 | LooseBases selfRecalc; |
|---|
| 3450 | selfRecalc.follow_ali_change_and_append(ub, AliChange(New, New)); |
|---|
| 3451 | TEST_EXPECT_EQUAL__BROKEN(asstr(selfRecalc), |
|---|
| 3452 | " 3/18 8/15 0/6 3/11 8/11 10/15 10/17", // wanted behavior? |
|---|
| 3453 | " 3/18 8/17 0/6 3/11 8/13 10/15 10/18"); // doc wrong behavior @@@ "8/17", "8/13", "10/18" are wrong |
|---|
| 3454 | |
|---|
| 3455 | ub.follow_ali_change_and_append(selfRecalc, AliChange(New, Old)); |
|---|
| 3456 | TEST_EXPECT_EQUAL__BROKEN(asstr(ub), |
|---|
| 3457 | " 1/7 3/5 0/1 1/3 3/3 4/5 4/6", // wanted behavior? (from wanted behavior above) |
|---|
| 3458 | " 1/7 3/7 0/2 1/4 3/5 4/6 4/7"); // document wrong behavior |
|---|
| 3459 | TEST_EXPECT_EQUAL__BROKEN(asstr(ub), |
|---|
| 3460 | " 1/7 3/6 0/1 1/3 3/4 4/5 4/7", // wanted behavior? (from wrong result above) |
|---|
| 3461 | " 1/7 3/7 0/2 1/4 3/5 4/6 4/7"); // document wrong behavior |
|---|
| 3462 | } |
|---|
| 3463 | } |
|---|
| 3464 | |
|---|
| 3465 | |
|---|
| 3466 | #endif // UNIT_TESTS |
|---|
| 3467 | |
|---|
| 3468 | |
|---|