| 1 | // ======================================================================================= |
|---|
| 2 | /* */ |
|---|
| 3 | // File : NT_concatenate.cxx |
|---|
| 4 | // Purpose : 1.Concatenation of sequences or alignments |
|---|
| 5 | // 2.Merging the fields of similar species and creating a new species |
|---|
| 6 | // Author : Yadhu Kumar (yadhu@mikro.biologie.tu-muenchen.de) |
|---|
| 7 | // web site : http://www.arb-home.de/ |
|---|
| 8 | /* */ |
|---|
| 9 | // Copyright Department of Microbiology (Technical University Munich) |
|---|
| 10 | /* */ |
|---|
| 11 | // ======================================================================================= |
|---|
| 12 | |
|---|
| 13 | #include "NT_local.h" |
|---|
| 14 | |
|---|
| 15 | #include <items.h> |
|---|
| 16 | #include <item_sel_list.h> |
|---|
| 17 | #include <awt_sel_boxes.hxx> |
|---|
| 18 | #include <AW_rename.hxx> |
|---|
| 19 | #include <aw_question.hxx> |
|---|
| 20 | #include <aw_awar.hxx> |
|---|
| 21 | #include <aw_msg.hxx> |
|---|
| 22 | #include <aw_root.hxx> |
|---|
| 23 | #include <arb_progress.h> |
|---|
| 24 | #include <arb_strbuf.h> |
|---|
| 25 | #include <arb_strarray.h> |
|---|
| 26 | #include <awt_modules.hxx> |
|---|
| 27 | |
|---|
| 28 | using namespace std; |
|---|
| 29 | |
|---|
// names of the (temporary) awars used by the concatenation and merge windows
#define AWAR_CON_SEQUENCE_TYPE       "tmp/concat/sequence_type"
#define AWAR_CON_NEW_ALIGNMENT_NAME  "tmp/concat/new_alignment_name"
#define AWAR_CON_ALIGNMENT_SEPARATOR "tmp/concat/alignment_separator"
#define AWAR_CON_DB_ALIGNS           "tmp/concat/database_alignments"
#define AWAR_CON_MERGE_FIELD         "tmp/concat/merge_field"
#define AWAR_CON_STORE_SIM_SP_NO     "tmp/concat/store_sim_sp_no"

// option passed to the merge window: also concatenate alignments after merging
#define MERGE_SIMILAR_CONCATENATE_ALIGNMENTS 1

#define MOVE_DOWN 0
#define MOVE_UP   1
|---|
| 40 | |
|---|
| 41 | struct SpeciesConcatenateList { |
|---|
| 42 | GBDATA *species; |
|---|
| 43 | char *species_name; |
|---|
| 44 | |
|---|
| 45 | SpeciesConcatenateList *next; |
|---|
| 46 | }; |
|---|
| 47 | |
|---|
| 48 | // --------------------------creating and initializing AWARS---------------------------------------- |
|---|
| 49 | void NT_createConcatenationAwars(AW_root *aw_root, AW_default aw_def) { |
|---|
| 50 | aw_root->awar_string(AWAR_CON_SEQUENCE_TYPE, "ami", aw_def); |
|---|
| 51 | aw_root->awar_string(AWAR_CON_NEW_ALIGNMENT_NAME, "ali_concat", aw_def); |
|---|
| 52 | aw_root->awar_string(AWAR_CON_ALIGNMENT_SEPARATOR, "XXX", aw_def); |
|---|
| 53 | aw_root->awar_string(AWAR_CON_MERGE_FIELD, "full_name", aw_def); |
|---|
| 54 | aw_root->awar_string(AWAR_CON_STORE_SIM_SP_NO, "merged_species", aw_def); |
|---|
| 55 | aw_root->awar_string(AWAR_CON_DB_ALIGNS, "", aw_def); |
|---|
| 56 | } |
|---|
| 57 | |
|---|
| 58 | // ------------------------Selecting alignments from the database for concatenation---------------------- |
|---|
| 59 | |
|---|
| 60 | inline char *get_alitype_eval(AW_root *aw_root) { |
|---|
| 61 | return GBS_global_string_copy("%s=", aw_root->awar(AWAR_CON_SEQUENCE_TYPE)->read_char_pntr()); |
|---|
| 62 | } |
|---|
| 63 | |
|---|
| 64 | static void alitype_changed_cb(AW_root *aw_root, AW_CL cl_db_sel) { |
|---|
| 65 | AW_DB_selection *db_sel = (AW_DB_selection*)cl_db_sel; |
|---|
| 66 | char *ali_type = get_alitype_eval(aw_root); |
|---|
| 67 | awt_reconfigure_ALI_selection_list(db_sel, ali_type); |
|---|
| 68 | free(ali_type); |
|---|
| 69 | } |
|---|
| 70 | |
|---|
| 71 | static AW_DB_selection* createSelectionList(GBDATA *gb_main, AW_window *aws, const char *awarName) { |
|---|
| 72 | |
|---|
| 73 | #ifdef DEBUG |
|---|
| 74 | static bool ran=false; |
|---|
| 75 | nt_assert(!ran); |
|---|
| 76 | ran=true; // prevents calling this function for the second time |
|---|
| 77 | #endif |
|---|
| 78 | |
|---|
| 79 | AW_root *aw_root = aws->get_root(); |
|---|
| 80 | char *ali_type = get_alitype_eval(aw_root); |
|---|
| 81 | AW_DB_selection *db_sel = awt_create_ALI_selection_list(gb_main, aws, awarName, ali_type); |
|---|
| 82 | |
|---|
| 83 | free(ali_type); |
|---|
| 84 | return db_sel; |
|---|
| 85 | } |
|---|
| 86 | |
|---|
| 87 | // ---------- Create SAI to display alignments that were concatenated -------------- |
|---|
| 88 | |
|---|
| 89 | static GB_ERROR create_concatInfo_SAI(GBDATA *gb_main, const char *new_ali_name, const char *ali_separator, const StrArray& ali_names) { |
|---|
| 90 | GB_ERROR error = NULL; |
|---|
| 91 | GBDATA *gb_extended = GBT_find_or_create_SAI(gb_main, "ConcatInfo"); |
|---|
| 92 | |
|---|
| 93 | if (!gb_extended) error = GB_await_error(); |
|---|
| 94 | else { |
|---|
| 95 | GBDATA *gb_data = GBT_add_data(gb_extended, new_ali_name, "data", GB_STRING); |
|---|
| 96 | |
|---|
| 97 | if (!gb_data) { |
|---|
| 98 | error = GB_await_error(); |
|---|
| 99 | } |
|---|
| 100 | else { |
|---|
| 101 | int new_ali_length = GBT_get_alignment_len(gb_main, new_ali_name); |
|---|
| 102 | int sep_len = strlen(ali_separator); |
|---|
| 103 | |
|---|
| 104 | char *info = (char*)malloc(new_ali_length+1); |
|---|
| 105 | memset(info, '=', new_ali_length); |
|---|
| 106 | |
|---|
| 107 | int offset = 0; |
|---|
| 108 | int last_ali_idx = ali_names.size()-1; |
|---|
| 109 | |
|---|
| 110 | for (int a = 0; a <= last_ali_idx; ++a) { |
|---|
| 111 | const char *ali = ali_names[a]; |
|---|
| 112 | int ali_len = GBT_get_alignment_len(gb_main, ali); |
|---|
| 113 | int ali_str_len = strlen(ali); |
|---|
| 114 | |
|---|
| 115 | char *my_info = info+offset; |
|---|
| 116 | |
|---|
| 117 | int half_ali_len = ali_len/2; |
|---|
| 118 | for (int i = 0; i<5; ++i) { |
|---|
| 119 | if (i<half_ali_len) { |
|---|
| 120 | my_info[i] = '<'; |
|---|
| 121 | my_info[ali_len-i-1] = '>'; |
|---|
| 122 | } |
|---|
| 123 | } |
|---|
| 124 | |
|---|
| 125 | if (ali_str_len<ali_len) { |
|---|
| 126 | int namepos = half_ali_len - ali_str_len/2; |
|---|
| 127 | memcpy(my_info+namepos, ali, ali_str_len); |
|---|
| 128 | } |
|---|
| 129 | |
|---|
| 130 | offset += ali_len; |
|---|
| 131 | if (a != last_ali_idx) { |
|---|
| 132 | memcpy(info+offset, ali_separator, sep_len); |
|---|
| 133 | offset += sep_len; |
|---|
| 134 | } |
|---|
| 135 | } |
|---|
| 136 | |
|---|
| 137 | nt_assert(offset == new_ali_length); // wrong alignment length! |
|---|
| 138 | info[new_ali_length] = 0; |
|---|
| 139 | |
|---|
| 140 | if (!error) error = GB_write_string(gb_data, info); |
|---|
| 141 | free(info); |
|---|
| 142 | } |
|---|
| 143 | } |
|---|
| 144 | return error; |
|---|
| 145 | } |
|---|
| 146 | |
|---|
| 147 | // ---------------------------------------- Concatenation function ---------------------------------- |
|---|
| 148 | static void concatenateAlignments(AW_window *aws, AW_CL cl_selected_alis) { |
|---|
| 149 | nt_assert(cl_selected_alis); |
|---|
| 150 | AW_selection *selected_alis = (AW_selection*)cl_selected_alis; |
|---|
| 151 | |
|---|
| 152 | GB_push_transaction(GLOBAL.gb_main); |
|---|
| 153 | long marked_species = GBT_count_marked_species(GLOBAL.gb_main); |
|---|
| 154 | arb_progress progress("Concatenating alignments", marked_species); |
|---|
| 155 | AW_root *aw_root = aws->get_root(); |
|---|
| 156 | |
|---|
| 157 | char *new_ali_name = aw_root->awar(AWAR_CON_NEW_ALIGNMENT_NAME)->read_string(); |
|---|
| 158 | GB_ERROR error = GBT_check_alignment_name(new_ali_name); |
|---|
| 159 | |
|---|
| 160 | StrArray ali_names; |
|---|
| 161 | selected_alis->get_values(ali_names); |
|---|
| 162 | |
|---|
| 163 | size_t ali_count = ali_names.size(); |
|---|
| 164 | if (!error && ali_count<2) error = "Not enough alignments selected for concatenation (need at least 2)"; |
|---|
| 165 | |
|---|
| 166 | if (!error) { |
|---|
| 167 | int found[ali_count], missing[ali_count]; |
|---|
| 168 | for (size_t j = 0; j<ali_count; j++) { found[j] = 0; missing[j] = 0; } // initializing found and missing alis |
|---|
| 169 | |
|---|
| 170 | char *ali_separator = aw_root->awar(AWAR_CON_ALIGNMENT_SEPARATOR)->read_string(); |
|---|
| 171 | int sep_len = strlen(ali_separator); |
|---|
| 172 | |
|---|
| 173 | long new_alignment_len = 0; |
|---|
| 174 | for (size_t a = 0; a<ali_count; ++a) { |
|---|
| 175 | new_alignment_len += GBT_get_alignment_len(GLOBAL.gb_main, ali_names[a]) + (a ? sep_len : 0); |
|---|
| 176 | } |
|---|
| 177 | |
|---|
| 178 | GBDATA *gb_presets = GBT_get_presets(GLOBAL.gb_main); |
|---|
| 179 | GBDATA *gb_alignment_exists = GB_find_string(gb_presets, "alignment_name", new_ali_name, GB_IGNORE_CASE, SEARCH_GRANDCHILD); |
|---|
| 180 | GBDATA *gb_new_alignment = 0; |
|---|
| 181 | char *seq_type = aw_root->awar(AWAR_CON_SEQUENCE_TYPE)->read_string(); |
|---|
| 182 | |
|---|
| 183 | if (gb_alignment_exists) { // check wheather new alignment exists or not, if yes prompt user to overwrite the existing alignment; if no create an empty alignment |
|---|
| 184 | bool overwrite = aw_ask_sure("concat_ali_overwrite", GBS_global_string("Existing data in alignment \"%s\" may be overwritten. Do you want to continue?", new_ali_name)); |
|---|
| 185 | if (!overwrite) { |
|---|
| 186 | error = "Alignment exists - aborted"; |
|---|
| 187 | } |
|---|
| 188 | else { |
|---|
| 189 | gb_new_alignment = GBT_get_alignment(GLOBAL.gb_main, new_ali_name); |
|---|
| 190 | if (!gb_new_alignment) error = GB_await_error(); |
|---|
| 191 | } |
|---|
| 192 | } |
|---|
| 193 | else { |
|---|
| 194 | gb_new_alignment = GBT_create_alignment(GLOBAL.gb_main, new_ali_name, new_alignment_len, 0, 0, seq_type); |
|---|
| 195 | if (!gb_new_alignment) error = GB_await_error(); |
|---|
| 196 | } |
|---|
| 197 | |
|---|
| 198 | if (!error) { |
|---|
| 199 | AW_repeated_question ask_about_missing_alignment; |
|---|
| 200 | |
|---|
| 201 | for (GBDATA *gb_species = GBT_first_marked_species(GLOBAL.gb_main); |
|---|
| 202 | gb_species && !error; |
|---|
| 203 | gb_species = GBT_next_marked_species(gb_species)) |
|---|
| 204 | { |
|---|
| 205 | GBS_strstruct *str_seq = GBS_stropen(new_alignment_len+1); // create output stream |
|---|
| 206 | int ali_len = 0; |
|---|
| 207 | int ali_ctr = 0; |
|---|
| 208 | |
|---|
| 209 | for (size_t a = 0; a<ali_count; ++a) { |
|---|
| 210 | if (a) GBS_strcat(str_seq, ali_separator); |
|---|
| 211 | GBDATA *gb_seq_data = GBT_find_sequence(gb_species, ali_names[a]); |
|---|
| 212 | if (gb_seq_data) { |
|---|
| 213 | const char *str_data = GB_read_char_pntr(gb_seq_data); |
|---|
| 214 | GBS_strcat(str_seq, str_data); |
|---|
| 215 | ++found[ali_ctr]; |
|---|
| 216 | } |
|---|
| 217 | else { |
|---|
| 218 | char *speciesName = GB_read_string(GB_entry(gb_species, "full_name")); |
|---|
| 219 | char *question = GBS_global_string_copy("\"%s\" alignment doesn't exist in \"%s\"!", ali_names[a], speciesName); |
|---|
| 220 | int skip_ali = ask_about_missing_alignment.get_answer("insert_gaps_for_missing_ali", question, "Insert Gaps for Missing Alignment,Skip Missing Alignment", "all", true); |
|---|
| 221 | if (!skip_ali) { |
|---|
| 222 | ali_len = GBT_get_alignment_len(GLOBAL.gb_main, ali_names[a]); |
|---|
| 223 | GBS_chrncat(str_seq, '.', ali_len); |
|---|
| 224 | } |
|---|
| 225 | ++missing[ali_ctr]; |
|---|
| 226 | free(question); |
|---|
| 227 | free(speciesName); |
|---|
| 228 | } |
|---|
| 229 | } |
|---|
| 230 | |
|---|
| 231 | { |
|---|
| 232 | char *concatenated_ali_seq_data = GBS_strclose(str_seq); |
|---|
| 233 | GBDATA *gb_data = GBT_add_data(gb_species, new_ali_name, "data", GB_STRING); |
|---|
| 234 | GB_write_string(gb_data, concatenated_ali_seq_data); |
|---|
| 235 | free(concatenated_ali_seq_data); |
|---|
| 236 | } |
|---|
| 237 | progress.inc_and_check_user_abort(error); |
|---|
| 238 | } |
|---|
| 239 | |
|---|
| 240 | if (!error) { |
|---|
| 241 | // ............. print missing alignments........... |
|---|
| 242 | aw_message(GBS_global_string("Concatenation of Alignments was performed for %ld species.", marked_species)); |
|---|
| 243 | for (size_t a = 0; a<ali_count; ++a) { |
|---|
| 244 | aw_message(GBS_global_string("%s : Found in %d species & Missing in %d species.", ali_names[a], found[a], missing[a])); |
|---|
| 245 | } |
|---|
| 246 | } |
|---|
| 247 | |
|---|
| 248 | if (!error) error = create_concatInfo_SAI(GLOBAL.gb_main, new_ali_name, ali_separator, ali_names); |
|---|
| 249 | } |
|---|
| 250 | |
|---|
| 251 | free(seq_type); |
|---|
| 252 | free(ali_separator); |
|---|
| 253 | } |
|---|
| 254 | |
|---|
| 255 | if (!error) { |
|---|
| 256 | char *nfield = GBS_global_string_copy("%s/data", new_ali_name); |
|---|
| 257 | error = GBT_add_new_changekey(GLOBAL.gb_main, nfield, GB_STRING); |
|---|
| 258 | free(nfield); |
|---|
| 259 | } |
|---|
| 260 | else { |
|---|
| 261 | progress.done(); |
|---|
| 262 | } |
|---|
| 263 | GB_end_transaction_show_error(GLOBAL.gb_main, error, aw_message); |
|---|
| 264 | free(new_ali_name); |
|---|
| 265 | } |
|---|
| 266 | |
|---|
| 267 | static void addSpeciesToConcatenateList(SpeciesConcatenateList **sclp, GB_CSTR species_name) { |
|---|
| 268 | |
|---|
| 269 | GBDATA *gb_species_data = GBT_get_species_data(GLOBAL.gb_main); |
|---|
| 270 | GBDATA *gb_species = GBT_find_species_rel_species_data(gb_species_data, species_name); |
|---|
| 271 | |
|---|
| 272 | if (gb_species) { |
|---|
| 273 | SpeciesConcatenateList *scl = new SpeciesConcatenateList; |
|---|
| 274 | |
|---|
| 275 | scl->species = gb_species; |
|---|
| 276 | scl->species_name = strdup(species_name); |
|---|
| 277 | scl->next = *sclp; |
|---|
| 278 | *sclp = scl; |
|---|
| 279 | } |
|---|
| 280 | } |
|---|
| 281 | |
|---|
| 282 | static void freeSpeciesConcatenateList(SpeciesConcatenateList *scl) { |
|---|
| 283 | while (scl) { |
|---|
| 284 | SpeciesConcatenateList *next = scl->next; |
|---|
| 285 | free(scl->species_name); |
|---|
| 286 | delete scl; |
|---|
| 287 | scl = next; |
|---|
| 288 | } |
|---|
| 289 | } |
|---|
| 290 | |
|---|
| 291 | static GB_ERROR checkAndMergeFields(GBDATA *gb_new_species, GB_ERROR error, SpeciesConcatenateList *scl) { |
|---|
| 292 | |
|---|
| 293 | char *doneFields = strdup(";name;"); // all fields which are already merged |
|---|
| 294 | int doneLen = strlen(doneFields); |
|---|
| 295 | SpeciesConcatenateList *sl = scl; |
|---|
| 296 | int sl_length = 0; while (scl) { sl_length++; scl=scl->next; } // counting no. of similar species stored in the list |
|---|
| 297 | int *fieldStat = new int[sl_length]; // 0 = not used yet ; -1 = doesn't have field ; 1..n = field content (same number means same content) |
|---|
| 298 | |
|---|
| 299 | while (sl && !error) { // with all species do.. |
|---|
| 300 | char *newFields = GB_get_subfields(sl->species); |
|---|
| 301 | char *fieldStart = newFields; // points to ; before next field |
|---|
| 302 | |
|---|
| 303 | while (fieldStart[1] && !error) { // with all subfields of the species do.. |
|---|
| 304 | char *fieldEnd = strchr(fieldStart+1, ';'); |
|---|
| 305 | nt_assert(fieldEnd); |
|---|
| 306 | char behind = fieldEnd[1]; fieldEnd[1] = 0; |
|---|
| 307 | |
|---|
| 308 | if (strstr(doneFields, fieldStart)==0) { // field is not merged yet |
|---|
| 309 | char *fieldName = fieldStart+1; |
|---|
| 310 | int fieldLen = int(fieldEnd-fieldName); |
|---|
| 311 | |
|---|
| 312 | nt_assert(fieldEnd[0]==';'); |
|---|
| 313 | fieldEnd[0] = 0; |
|---|
| 314 | |
|---|
| 315 | GBDATA *gb_field = GB_search(sl->species, fieldName, GB_FIND); // field does to exist (it was found before) |
|---|
| 316 | GB_TYPES type = GB_read_type(gb_field); |
|---|
| 317 | |
|---|
| 318 | if (type==GB_STRING) { // we only merge string fields |
|---|
| 319 | int i; int doneSpecies = 0; int nextStat = 1; |
|---|
| 320 | |
|---|
| 321 | for (i=0; i<sl_length; i++) { fieldStat[i] = 0; } // clear field status |
|---|
| 322 | |
|---|
| 323 | while (doneSpecies<sl_length) { // since all species in list were handled |
|---|
| 324 | SpeciesConcatenateList *sl2 = sl; |
|---|
| 325 | i = 0; |
|---|
| 326 | |
|---|
| 327 | while (sl2) { |
|---|
| 328 | if (fieldStat[i]==0) { |
|---|
| 329 | gb_field = GB_search(sl2->species, fieldName, GB_FIND); |
|---|
| 330 | if (gb_field) { |
|---|
| 331 | char *content = GB_read_as_string(gb_field); |
|---|
| 332 | SpeciesConcatenateList *sl3 = sl2->next; |
|---|
| 333 | fieldStat[i] = nextStat; |
|---|
| 334 | int j = i+1; doneSpecies++; |
|---|
| 335 | |
|---|
| 336 | while (sl3) { |
|---|
| 337 | if (fieldStat[j]==0) { |
|---|
| 338 | gb_field = GB_search(sl3->species, fieldName, GB_FIND); |
|---|
| 339 | if (gb_field) { |
|---|
| 340 | char *content2 = GB_read_as_string(gb_field); |
|---|
| 341 | if (strcmp(content, content2)==0) { // if contents are the same, they get the same status |
|---|
| 342 | fieldStat[j] = nextStat; |
|---|
| 343 | doneSpecies++; |
|---|
| 344 | } |
|---|
| 345 | free(content2); |
|---|
| 346 | } |
|---|
| 347 | else { |
|---|
| 348 | fieldStat[j] = -1; |
|---|
| 349 | doneSpecies++; |
|---|
| 350 | } |
|---|
| 351 | } |
|---|
| 352 | sl3 = sl3->next; j++; |
|---|
| 353 | } |
|---|
| 354 | free(content); nextStat++; |
|---|
| 355 | } |
|---|
| 356 | else { |
|---|
| 357 | fieldStat[i] = -1; // field does not exist here |
|---|
| 358 | doneSpecies++; |
|---|
| 359 | } |
|---|
| 360 | } |
|---|
| 361 | sl2 = sl2->next; i++; |
|---|
| 362 | } |
|---|
| 363 | if (!sl2) break; |
|---|
| 364 | } |
|---|
| 365 | nt_assert(nextStat!=1); // this would mean that none of the species contained the field |
|---|
| 366 | { |
|---|
| 367 | char *new_content = 0; |
|---|
| 368 | int new_content_len = 0; // @@@ useless (0 where used; unused otherwise) |
|---|
| 369 | |
|---|
| 370 | if (nextStat==2) { // all species contain same field content or do not have the field |
|---|
| 371 | SpeciesConcatenateList *sl2 = sl; |
|---|
| 372 | while (sl2) { |
|---|
| 373 | gb_field = GB_search(sl2->species, fieldName, GB_FIND); |
|---|
| 374 | if (gb_field) { |
|---|
| 375 | new_content = GB_read_as_string(gb_field); |
|---|
| 376 | new_content_len = strlen(new_content); |
|---|
| 377 | break; |
|---|
| 378 | } |
|---|
| 379 | sl2 = sl2->next; |
|---|
| 380 | } |
|---|
| 381 | } |
|---|
| 382 | else { // different field contents |
|---|
| 383 | int actualStat; |
|---|
| 384 | for (actualStat=1; actualStat<nextStat; actualStat++) { |
|---|
| 385 | int names_len = 1; // open bracket |
|---|
| 386 | SpeciesConcatenateList *sl2 = sl; |
|---|
| 387 | char *content = 0; i = 0; |
|---|
| 388 | |
|---|
| 389 | while (sl2) { |
|---|
| 390 | if (fieldStat[i]==actualStat) { |
|---|
| 391 | names_len += strlen(sl2->species_name)+1; |
|---|
| 392 | if (!content) { |
|---|
| 393 | gb_field = GB_search(sl2->species, fieldName, GB_FIND); |
|---|
| 394 | nt_assert(gb_field); |
|---|
| 395 | content = GB_read_as_string(gb_field); |
|---|
| 396 | } |
|---|
| 397 | } |
|---|
| 398 | sl2 = sl2->next; i++; |
|---|
| 399 | } |
|---|
| 400 | nt_assert(content); |
|---|
| 401 | int add_len = names_len+1+strlen(content); |
|---|
| 402 | char *whole = (char*)malloc(new_content_len+1+add_len+1); |
|---|
| 403 | nt_assert(whole); |
|---|
| 404 | char *add = new_content ? whole+sprintf(whole, "%s ", new_content) : whole; |
|---|
| 405 | sl2 = sl; i = 0; |
|---|
| 406 | int first = 1; |
|---|
| 407 | while (sl2) { |
|---|
| 408 | if (fieldStat[i]==actualStat) { |
|---|
| 409 | add += sprintf(add, "%c%s", first ? '{' : ';', sl2->species_name); |
|---|
| 410 | first = 0; |
|---|
| 411 | } |
|---|
| 412 | sl2 = sl2->next; i++; |
|---|
| 413 | } |
|---|
| 414 | add += sprintf(add, "} %s", content); |
|---|
| 415 | |
|---|
| 416 | free(content); |
|---|
| 417 | freeset(new_content, whole); |
|---|
| 418 | new_content_len = strlen(new_content); // cppcheck-suppress deallocuse |
|---|
| 419 | } |
|---|
| 420 | } |
|---|
| 421 | |
|---|
| 422 | if (new_content) { |
|---|
| 423 | error = GBT_write_string(gb_new_species, fieldName, new_content); |
|---|
| 424 | free(new_content); |
|---|
| 425 | } |
|---|
| 426 | } |
|---|
| 427 | } |
|---|
| 428 | |
|---|
| 429 | // mark field as done: |
|---|
| 430 | char *new_doneFields = (char*)malloc(doneLen+fieldLen+1+1); |
|---|
| 431 | sprintf(new_doneFields, "%s%s;", doneFields, fieldName); |
|---|
| 432 | doneLen += fieldLen+1; |
|---|
| 433 | freeset(doneFields, new_doneFields); |
|---|
| 434 | fieldEnd[0] = ';'; |
|---|
| 435 | } |
|---|
| 436 | fieldEnd[1] = behind; |
|---|
| 437 | fieldStart = fieldEnd; |
|---|
| 438 | } |
|---|
| 439 | free(newFields); |
|---|
| 440 | sl = sl->next; |
|---|
| 441 | } |
|---|
| 442 | free(doneFields); |
|---|
| 443 | delete [] fieldStat; |
|---|
| 444 | |
|---|
| 445 | return error; |
|---|
| 446 | } |
|---|
| 447 | |
|---|
| 448 | static GBDATA *concatenateFieldsCreateNewSpecies(AW_window *, GBDATA *gb_species, SpeciesConcatenateList *scl) { |
|---|
| 449 | GB_push_transaction(GLOBAL.gb_main); |
|---|
| 450 | |
|---|
| 451 | GB_ERROR error = 0; |
|---|
| 452 | GBDATA *gb_species_data = GBT_get_species_data(GLOBAL.gb_main); |
|---|
| 453 | |
|---|
| 454 | // data needed for name generation |
|---|
| 455 | char *full_name = 0; |
|---|
| 456 | char *acc = 0; |
|---|
| 457 | |
|---|
| 458 | // --------------------getting the species related data -------------------- |
|---|
| 459 | |
|---|
| 460 | GBDATA *gb_new_species = 0; |
|---|
| 461 | |
|---|
| 462 | if (!error) { |
|---|
| 463 | // copy species to create a new species |
|---|
| 464 | gb_new_species = GB_create_container(gb_species_data, "species"); |
|---|
| 465 | error = gb_new_species ? GB_copy(gb_new_species, gb_species) : GB_await_error(); |
|---|
| 466 | |
|---|
| 467 | if (!error) { // write dummy-name (real name written below) |
|---|
| 468 | error = GBT_write_string(gb_new_species, "name", "$currcat$"); |
|---|
| 469 | } |
|---|
| 470 | } |
|---|
| 471 | |
|---|
| 472 | if (!error) { // copy full name |
|---|
| 473 | full_name = GBT_read_string(gb_species, "full_name"); |
|---|
| 474 | if (!full_name) error = GB_await_error(); |
|---|
| 475 | else error = GBT_write_string(gb_new_species, "full_name", full_name); |
|---|
| 476 | } |
|---|
| 477 | |
|---|
| 478 | if (!error) { |
|---|
| 479 | ConstStrArray ali_names; |
|---|
| 480 | GBT_get_alignment_names(ali_names, GLOBAL.gb_main); |
|---|
| 481 | |
|---|
| 482 | long id = 0; |
|---|
| 483 | for (SpeciesConcatenateList *speciesList = scl; speciesList; speciesList = speciesList->next) { |
|---|
| 484 | for (int no_of_alignments = 0; ali_names[no_of_alignments]!=0; no_of_alignments++) { |
|---|
| 485 | GBDATA *gb_seq_data = GBT_find_sequence(speciesList->species, ali_names[no_of_alignments]); |
|---|
| 486 | if (gb_seq_data) { |
|---|
| 487 | const char *seq_data = GB_read_char_pntr(gb_seq_data); |
|---|
| 488 | GBDATA *gb_data = GBT_add_data(gb_new_species, ali_names[no_of_alignments], "data", GB_STRING); |
|---|
| 489 | error = GB_write_string(gb_data, seq_data); |
|---|
| 490 | if (!error) id += GBS_checksum(seq_data, 1, ".-"); // creating checksum of the each aligned sequence to generate new accession number |
|---|
| 491 | } |
|---|
| 492 | if (error) error = GB_export_errorf("Can't create alignment '%s'", ali_names[no_of_alignments]); |
|---|
| 493 | } |
|---|
| 494 | } |
|---|
| 495 | |
|---|
| 496 | if (!error) { |
|---|
| 497 | acc = GBS_global_string_copy("ARB_%lX", id); // create new accession number |
|---|
| 498 | error = GBT_write_string(gb_new_species, "acc", acc); |
|---|
| 499 | } |
|---|
| 500 | } |
|---|
| 501 | |
|---|
| 502 | if (!error) error = checkAndMergeFields(gb_new_species, error, scl); |
|---|
| 503 | |
|---|
| 504 | // now generate new name |
|---|
| 505 | if (!error) { |
|---|
| 506 | char *new_species_name = 0; |
|---|
| 507 | |
|---|
| 508 | const char *add_field = AW_get_nameserver_addid(GLOBAL.gb_main); |
|---|
| 509 | GBDATA *gb_addid = add_field[0] ? GB_entry(gb_new_species, add_field) : 0; |
|---|
| 510 | char *addid = 0; |
|---|
| 511 | if (gb_addid) addid = GB_read_as_string(gb_addid); |
|---|
| 512 | |
|---|
| 513 | error = AWTC_generate_one_name(GLOBAL.gb_main, full_name, acc, addid, new_species_name); |
|---|
| 514 | if (!error) { // name was created |
|---|
| 515 | if (GBT_find_species_rel_species_data(gb_species_data, new_species_name) != 0) { |
|---|
| 516 | // if the name is not unique -> create unique name |
|---|
| 517 | UniqueNameDetector und(gb_species_data); |
|---|
| 518 | freeset(new_species_name, AWTC_makeUniqueShortName(new_species_name, und)); |
|---|
| 519 | if (!new_species_name) error = GB_await_error(); |
|---|
| 520 | } |
|---|
| 521 | } |
|---|
| 522 | |
|---|
| 523 | if (!error) error = GBT_write_string(gb_new_species, "name", new_species_name); // insert new 'name' |
|---|
| 524 | |
|---|
| 525 | free(new_species_name); |
|---|
| 526 | free(addid); |
|---|
| 527 | } |
|---|
| 528 | |
|---|
| 529 | error = GB_end_transaction(GLOBAL.gb_main, error); |
|---|
| 530 | if (error) { |
|---|
| 531 | gb_new_species = 0; |
|---|
| 532 | aw_message(error); |
|---|
| 533 | } |
|---|
| 534 | |
|---|
| 535 | free(acc); |
|---|
| 536 | free(full_name); |
|---|
| 537 | |
|---|
| 538 | return gb_new_species; |
|---|
| 539 | } |
|---|
| 540 | |
|---|
| 541 | static GB_ERROR checkAndCreateNewField(GBDATA *gb_main, char *new_field_name) { |
|---|
| 542 | GB_ERROR error = GB_check_key(new_field_name); |
|---|
| 543 | |
|---|
| 544 | if (error) return error; |
|---|
| 545 | else { |
|---|
| 546 | error = GBT_add_new_changekey(gb_main, new_field_name, GB_STRING); |
|---|
| 547 | if (error) { |
|---|
| 548 | bool overwrite = aw_ask_sure("merge_similar_overwrite_field", |
|---|
| 549 | GBS_global_string("\"%s\" field exists! Do you want to overwrite the existing field?", new_field_name)); |
|---|
| 550 | if (!overwrite) return error; |
|---|
| 551 | } |
|---|
| 552 | } |
|---|
| 553 | return 0; |
|---|
| 554 | } |
|---|
| 555 | |
|---|
| 556 | static void mergeSimilarSpecies(AW_window *aws, AW_CL cl_mergeSimilarConcatenateAlignments, AW_CL cl_selected_alis) { |
|---|
| 557 | GB_ERROR error = NULL; |
|---|
| 558 | arb_progress wrapper; |
|---|
| 559 | { |
|---|
| 560 | AW_root *aw_root = aws->get_root(); |
|---|
| 561 | char *merge_field_name = aw_root->awar(AWAR_CON_MERGE_FIELD)->read_string(); |
|---|
| 562 | char *new_field_name = aw_root->awar(AWAR_CON_STORE_SIM_SP_NO)->read_string(); |
|---|
| 563 | |
|---|
| 564 | SpeciesConcatenateList *scl = 0; // to build list of similar species |
|---|
| 565 | SpeciesConcatenateList *newSpeciesList = 0; // new SpeciesConcatenateList |
|---|
| 566 | |
|---|
| 567 | GB_begin_transaction(GLOBAL.gb_main); // open database for transaction |
|---|
| 568 | |
|---|
| 569 | error = checkAndCreateNewField(GLOBAL.gb_main, new_field_name); |
|---|
| 570 | |
|---|
| 571 | PersistentNameServerConnection stayAlive; |
|---|
| 572 | arb_progress progress("Merging similar species", GBT_count_marked_species(GLOBAL.gb_main)); |
|---|
| 573 | progress.auto_subtitles("Species"); |
|---|
| 574 | |
|---|
| 575 | for (GBDATA * gb_species = GBT_first_marked_species(GLOBAL.gb_main); |
|---|
| 576 | gb_species && !error; |
|---|
| 577 | gb_species = GBT_next_marked_species(gb_species)) |
|---|
| 578 | { |
|---|
| 579 | GBDATA *gb_species_field = GB_entry(gb_species, merge_field_name); |
|---|
| 580 | const char *name = GBT_read_name(gb_species); |
|---|
| 581 | |
|---|
| 582 | if (!gb_species_field) { |
|---|
| 583 | // exit if species doesn't have any data in the selected field |
|---|
| 584 | error = GBS_global_string("Species '%s' does not contain data in selected field '%s'", name, merge_field_name); |
|---|
| 585 | } |
|---|
| 586 | else { |
|---|
| 587 | char *gb_species_field_content = GB_read_string(gb_species_field); |
|---|
| 588 | int similar_species = 0; |
|---|
| 589 | |
|---|
| 590 | for (GBDATA * gb_species_next = GBT_next_marked_species(gb_species); |
|---|
| 591 | gb_species_next && !error; |
|---|
| 592 | gb_species_next = GBT_next_marked_species(gb_species_next)) |
|---|
| 593 | { |
|---|
| 594 | GBDATA *gb_next_species_field = GB_entry(gb_species_next, merge_field_name); |
|---|
| 595 | const char *next_name = GBT_read_name(gb_species_next); |
|---|
| 596 | |
|---|
| 597 | if (!gb_next_species_field) { |
|---|
| 598 | // exit if species doesn't have any data in the selected field |
|---|
| 599 | error = GBS_global_string("Species '%s' does not contain data in selected field '%s'", next_name, merge_field_name); |
|---|
| 600 | } |
|---|
| 601 | else { |
|---|
| 602 | char *gb_next_species_field_content = GB_read_string(gb_next_species_field); |
|---|
| 603 | |
|---|
| 604 | if (strcmp(gb_species_field_content, gb_next_species_field_content) == 0) { |
|---|
| 605 | addSpeciesToConcatenateList(&scl, next_name); |
|---|
| 606 | GB_write_flag(gb_species_next, 0); |
|---|
| 607 | ++similar_species; |
|---|
| 608 | ++progress; |
|---|
| 609 | } |
|---|
| 610 | free(gb_next_species_field_content); |
|---|
| 611 | } |
|---|
| 612 | } |
|---|
| 613 | |
|---|
| 614 | if (similar_species > 0 && !error) { |
|---|
| 615 | addSpeciesToConcatenateList(&scl, name); |
|---|
| 616 | GB_write_flag(gb_species, 0); |
|---|
| 617 | |
|---|
| 618 | GBDATA *new_species_created = concatenateFieldsCreateNewSpecies(aws, gb_species, scl); |
|---|
| 619 | |
|---|
| 620 | nt_assert(new_species_created); |
|---|
| 621 | if (new_species_created) { // create a list of newly created species |
|---|
| 622 | addSpeciesToConcatenateList(&newSpeciesList, GBT_read_name(new_species_created)); |
|---|
| 623 | } |
|---|
| 624 | |
|---|
| 625 | error = GBT_write_int(new_species_created, new_field_name, ++similar_species); |
|---|
| 626 | } |
|---|
| 627 | |
|---|
| 628 | freeSpeciesConcatenateList(scl); scl = 0; |
|---|
| 629 | free(gb_species_field_content); |
|---|
| 630 | } |
|---|
| 631 | |
|---|
| 632 | progress.inc_and_check_user_abort(error); |
|---|
| 633 | } |
|---|
| 634 | |
|---|
| 635 | if (!error) { |
|---|
| 636 | GBT_mark_all(GLOBAL.gb_main, 0); // unmark all species in the database |
|---|
| 637 | int newSpeciesCount = 0; |
|---|
| 638 | |
|---|
| 639 | for (; newSpeciesList; newSpeciesList = newSpeciesList->next) { // mark only newly created species |
|---|
| 640 | GB_write_flag(newSpeciesList->species, 1); |
|---|
| 641 | newSpeciesCount++; |
|---|
| 642 | } |
|---|
| 643 | aw_message(GBS_global_string("%i new species were created by taking \"%s\" as a criterion!", newSpeciesCount, merge_field_name)); |
|---|
| 644 | freeSpeciesConcatenateList(newSpeciesList); |
|---|
| 645 | } |
|---|
| 646 | |
|---|
| 647 | free(merge_field_name); |
|---|
| 648 | free(new_field_name); |
|---|
| 649 | |
|---|
| 650 | GB_end_transaction_show_error(GLOBAL.gb_main, error, aw_message); |
|---|
| 651 | } |
|---|
| 652 | // Concatenate alignments of the merged species if cl_mergeSimilarConcatenateAlignments = MERGE_SIMILAR_CONCATENATE_ALIGNMENTS |
|---|
| 653 | if (cl_mergeSimilarConcatenateAlignments && !error) concatenateAlignments(aws, cl_selected_alis); |
|---|
| 654 | } |
|---|
| 655 | |
|---|
| 656 | |
|---|
| 657 | |
|---|
| 658 | static AW_window *createMergeSimilarSpeciesWindow(AW_root *aw_root, AW_CL option, AW_CL cl_subsel) { |
|---|
| 659 | AW_window_simple *aws = new AW_window_simple; |
|---|
| 660 | |
|---|
| 661 | { |
|---|
| 662 | char *window_id = GBS_global_string_copy("MERGE_SPECIES_%i", int(option)); |
|---|
| 663 | aws->init(aw_root, window_id, "MERGE SPECIES WINDOW"); |
|---|
| 664 | free(window_id); |
|---|
| 665 | } |
|---|
| 666 | aws->load_xfig("merge_species.fig"); |
|---|
| 667 | |
|---|
| 668 | aws->callback(makeHelpCallback("merge_species.hlp")); |
|---|
| 669 | aws->at("help"); |
|---|
| 670 | aws->create_button("HELP", "HELP", "H"); |
|---|
| 671 | |
|---|
| 672 | aws->at("field_select"); |
|---|
| 673 | aws->auto_space(0, 0); |
|---|
| 674 | aws->callback(AW_POPDOWN); |
|---|
| 675 | create_selection_list_on_itemfields(GLOBAL.gb_main, aws, AWAR_CON_MERGE_FIELD, true, FIELD_FILTER_NDS, "field_select", 0, SPECIES_get_selector(), 20, 30, SelectedFields(SF_PSEUDO|SF_HIDDEN), "sel_merge_field"); |
|---|
| 676 | |
|---|
| 677 | aws->at("store_sp_no"); |
|---|
| 678 | aws->label_length(20); |
|---|
| 679 | aws->create_input_field(AWAR_CON_STORE_SIM_SP_NO, 20); |
|---|
| 680 | |
|---|
| 681 | aws->at("merge"); |
|---|
| 682 | aws->callback(mergeSimilarSpecies, option, cl_subsel); |
|---|
| 683 | aws->create_button("MERGE_SIMILAR_SPECIES", "MERGE SIMILAR SPECIES", "M"); |
|---|
| 684 | |
|---|
| 685 | aws->at("close"); |
|---|
| 686 | aws->callback(AW_POPDOWN); |
|---|
| 687 | aws->create_button("CLOSE", "CLOSE", "C"); |
|---|
| 688 | |
|---|
| 689 | return (AW_window *)aws; |
|---|
| 690 | } |
|---|
| 691 | |
|---|
| 692 | AW_window *NT_createMergeSimilarSpeciesWindow(AW_root *aw_root) { |
|---|
| 693 | static AW_window *aw = 0; |
|---|
| 694 | if (!aw) aw = createMergeSimilarSpeciesWindow(aw_root, 0, 0); |
|---|
| 695 | return aw; |
|---|
| 696 | } |
|---|
| 697 | |
|---|
| 698 | static AW_window *NT_createMergeSimilarSpeciesAndConcatenateWindow(AW_root *aw_root, AW_CL cl_subsel) { |
|---|
| 699 | static AW_window *aw = 0; |
|---|
| 700 | if (!aw) aw = createMergeSimilarSpeciesWindow(aw_root, MERGE_SIMILAR_CONCATENATE_ALIGNMENTS, cl_subsel); |
|---|
| 701 | return aw; |
|---|
| 702 | } |
|---|
| 703 | |
|---|
| 704 | // ----------------------------Creating concatenation window----------------------------------------- |
|---|
| 705 | AW_window *NT_createConcatenationWindow(AW_root *aw_root) { |
|---|
| 706 | AW_window_simple *aws = new AW_window_simple; |
|---|
| 707 | |
|---|
| 708 | aws->init(aw_root, "CONCAT_ALIGNMENTS", "Concatenate Alignments"); |
|---|
| 709 | aws->load_xfig("concatenate.fig"); |
|---|
| 710 | |
|---|
| 711 | aws->button_length(8); |
|---|
| 712 | |
|---|
| 713 | aws->callback(makeHelpCallback("concatenate.hlp")); |
|---|
| 714 | aws->at("help"); |
|---|
| 715 | aws->create_button("HELP", "HELP", "H"); |
|---|
| 716 | |
|---|
| 717 | aws->at("close"); |
|---|
| 718 | aws->callback((AW_CB0)AW_POPDOWN); |
|---|
| 719 | aws->create_button("CLOSE", "CLOSE", "C"); |
|---|
| 720 | |
|---|
| 721 | aws->at("dbAligns"); |
|---|
| 722 | AW_DB_selection *all_alis = createSelectionList(GLOBAL.gb_main, aws, AWAR_CON_DB_ALIGNS); |
|---|
| 723 | AW_selection *sel_alis = awt_create_subset_selection_list(aws, all_alis->get_sellist(), "concatAligns", "collect", "sort"); |
|---|
| 724 | |
|---|
| 725 | aws->at("type"); |
|---|
| 726 | aws->create_option_menu(AWAR_CON_SEQUENCE_TYPE, true); |
|---|
| 727 | aws->insert_option("DNA", "d", "dna"); |
|---|
| 728 | aws->insert_option("RNA", "r", "rna"); |
|---|
| 729 | aws->insert_default_option("PROTEIN", "p", "ami"); |
|---|
| 730 | aws->update_option_menu(); |
|---|
| 731 | aw_root->awar(AWAR_CON_SEQUENCE_TYPE)->add_callback(alitype_changed_cb, (AW_CL)all_alis); |
|---|
| 732 | |
|---|
| 733 | aws->button_length(0); |
|---|
| 734 | |
|---|
| 735 | aws->at("aliName"); |
|---|
| 736 | aws->label_length(15); |
|---|
| 737 | aws->create_input_field(AWAR_CON_NEW_ALIGNMENT_NAME, 25); |
|---|
| 738 | |
|---|
| 739 | aws->at("aliSeparator"); |
|---|
| 740 | aws->label_length(5); |
|---|
| 741 | aws->create_input_field(AWAR_CON_ALIGNMENT_SEPARATOR, 10); |
|---|
| 742 | |
|---|
| 743 | aws->button_length(22); |
|---|
| 744 | aws->auto_space(5, 5); |
|---|
| 745 | aws->at("go"); |
|---|
| 746 | |
|---|
| 747 | aws->callback(concatenateAlignments, (AW_CL)sel_alis); |
|---|
| 748 | aws->create_button("CONCATENATE", "CONCATENATE", "A"); |
|---|
| 749 | |
|---|
| 750 | aws->callback(NT_createMergeSimilarSpeciesWindow); |
|---|
| 751 | aws->create_button("MERGE_SPECIES", "MERGE SIMILAR SPECIES", "M"); |
|---|
| 752 | |
|---|
| 753 | aws->callback(AW_POPUP, (AW_CL)NT_createMergeSimilarSpeciesAndConcatenateWindow, (AW_CL)sel_alis); |
|---|
| 754 | aws->create_button("MERGE_CONCATENATE", "MERGE & CONCATENATE", "S"); |
|---|
| 755 | |
|---|
| 756 | aws->show(); |
|---|
| 757 | return aws; |
|---|
| 758 | } |
|---|
| 759 | // ------------------------------------------------------------------------------------------------------- |
|---|