1 | // ======================================================================================= |
---|
2 | // |
---|
3 | // File : NT_concatenate.cxx |
---|
4 | // Purpose : 1.Concatenation of sequences or alignments |
---|
5 | // 2.Merging the fields of similar species and creating a new species |
---|
6 | // Author : Yadhu Kumar |
---|
7 | // web site : http://www.arb-home.de/ |
---|
8 | // |
---|
9 | // Copyright Department of Microbiology (Technical University Munich) |
---|
10 | // |
---|
11 | // ======================================================================================= |
---|
12 | |
---|
13 | #include "NT_local.h" |
---|
14 | |
---|
15 | #include <items.h> |
---|
16 | #include <item_sel_list.h> |
---|
17 | #include <awt_sel_boxes.hxx> |
---|
18 | #include <AW_rename.hxx> |
---|
19 | #include <aw_question.hxx> |
---|
20 | #include <aw_awar.hxx> |
---|
21 | #include <aw_msg.hxx> |
---|
22 | #include <aw_root.hxx> |
---|
23 | #include <arb_progress.h> |
---|
24 | #include <arb_strbuf.h> |
---|
25 | #include <arb_strarray.h> |
---|
26 | #include <awt_modules.hxx> |
---|
27 | #include <arb_global_defs.h> |
---|
28 | |
---|
29 | using namespace std; |
---|
30 | |
---|
// AWAR names (string AWARs, see NT_createConcatenationAwars)
#define AWAR_CON_SEQUENCE_TYPE       "tmp/concat/sequence_type"
#define AWAR_CON_NEW_ALIGNMENT_NAME  "tmp/concat/new_alignment_name"
#define AWAR_CON_ALIGNMENT_SEPARATOR "tmp/concat/alignment_separator"
#define AWAR_CON_SELECTED_ALI        "tmp/concat/database_alignments"
#define AWAR_CON_MERGE_FIELD         "tmp/concat/merge_field"
#define AWAR_CON_STORE_SIM_SP_NO     "tmp/concat/store_sim_sp_no"

// AWAR names (int AWARs used as toggles)
#define AWAR_CON_ALLOW_OVERWRITE_ALI   "tmp/concat/overwrite"
#define AWAR_CON_INSGAPS_FOR_MISS_ALIS "tmp/concat/insgaps"

// direction codes (NOTE(review): not referenced in the visible part of this file)
#define MOVE_DOWN 0
#define MOVE_UP   1
---|
43 | |
---|
44 | struct SpeciesConcatenateList { |
---|
45 | GBDATA *species; |
---|
46 | char *species_name; |
---|
47 | |
---|
48 | SpeciesConcatenateList *next; |
---|
49 | }; |
---|
50 | |
---|
51 | // --------------------------creating and initializing AWARS---------------------------------------- |
---|
52 | void NT_createConcatenationAwars(AW_root *aw_root, AW_default aw_def, GBDATA *gb_main) { |
---|
53 | GB_transaction ta(gb_main); |
---|
54 | |
---|
55 | char *ali_type = NULp; |
---|
56 | { |
---|
57 | char *ali_default = GBT_get_default_alignment(gb_main); |
---|
58 | if (ali_default) { |
---|
59 | ali_type = GBT_get_alignment_type_string(gb_main, ali_default); |
---|
60 | if (!ali_type) { |
---|
61 | // Note: this message will appear during startup (i.e. stick to general statement here!) |
---|
62 | aw_message(GBS_global_string("Failed to detect type of default alignment (%s)\n" |
---|
63 | "(Reason: %s)", ali_default, GB_await_error())); |
---|
64 | } |
---|
65 | free(ali_default); |
---|
66 | } |
---|
67 | else { |
---|
68 | GB_clear_error(); |
---|
69 | } |
---|
70 | } |
---|
71 | if (!ali_type) ali_type = ARB_strdup("rna"); |
---|
72 | |
---|
73 | aw_root->awar_string(AWAR_CON_SEQUENCE_TYPE, ali_type, aw_def); |
---|
74 | aw_root->awar_string(AWAR_CON_NEW_ALIGNMENT_NAME, "ali_concat", aw_def)->set_srt(SRT_AUTOCORRECT_ALINAME); |
---|
75 | aw_root->awar_string(AWAR_CON_ALIGNMENT_SEPARATOR, "XXX", aw_def); |
---|
76 | aw_root->awar_string(AWAR_CON_SELECTED_ALI, "", aw_def); |
---|
77 | aw_root->awar_string(AWAR_CON_MERGE_FIELD, "full_name", aw_def); |
---|
78 | aw_root->awar_string(AWAR_CON_STORE_SIM_SP_NO, "merged_species", aw_def); |
---|
79 | |
---|
80 | aw_root->awar_int(AWAR_CON_ALLOW_OVERWRITE_ALI, 0, aw_def); |
---|
81 | aw_root->awar_int(AWAR_CON_INSGAPS_FOR_MISS_ALIS, 1, aw_def); |
---|
82 | |
---|
83 | free(ali_type); |
---|
84 | } |
---|
85 | |
---|
86 | // ------------------------Selecting alignments from the database for concatenation---------------------- |
---|
87 | |
---|
88 | inline char *get_alitype_eval(AW_root *aw_root) { |
---|
89 | return GBS_global_string_copy("%s=", aw_root->awar(AWAR_CON_SEQUENCE_TYPE)->read_char_pntr()); |
---|
90 | } |
---|
91 | |
---|
92 | static void alitype_changed_cb(AW_root *aw_root, AW_DB_selection *db_sel) { |
---|
93 | char *ali_type = get_alitype_eval(aw_root); |
---|
94 | awt_reconfigure_ALI_selection_list(db_sel, ali_type); |
---|
95 | free(ali_type); |
---|
96 | } |
---|
97 | |
---|
98 | static AW_DB_selection* createSelectionList(GBDATA *gb_main, AW_window *aws, const char *awarName) { |
---|
99 | |
---|
100 | #ifdef DEBUG |
---|
101 | static bool ran=false; |
---|
102 | nt_assert(!ran); |
---|
103 | ran=true; // prevents calling this function for the second time |
---|
104 | #endif |
---|
105 | |
---|
106 | AW_root *aw_root = aws->get_root(); |
---|
107 | char *ali_type = get_alitype_eval(aw_root); |
---|
108 | AW_DB_selection *db_sel = awt_create_ALI_selection_list(gb_main, aws, awarName, ali_type); |
---|
109 | |
---|
110 | free(ali_type); |
---|
111 | return db_sel; |
---|
112 | } |
---|
113 | |
---|
114 | // ---------- Create SAI to display alignments that were concatenated -------------- |
---|
115 | |
---|
116 | static GB_ERROR create_concatInfo_SAI(GBDATA *gb_main, const char *new_ali_name, const char *ali_separator, const StrArray& ali_names) { |
---|
117 | GB_ERROR error = NULp; |
---|
118 | GBDATA *gb_extended = GBT_find_or_create_SAI(gb_main, "ConcatInfo"); |
---|
119 | |
---|
120 | if (!gb_extended) error = GB_await_error(); |
---|
121 | else { |
---|
122 | GBDATA *gb_data = GBT_add_data(gb_extended, new_ali_name, "data", GB_STRING); |
---|
123 | |
---|
124 | if (!gb_data) { |
---|
125 | error = GB_await_error(); |
---|
126 | } |
---|
127 | else { |
---|
128 | int new_ali_length = GBT_get_alignment_len(gb_main, new_ali_name); |
---|
129 | nt_assert(new_ali_length>0); |
---|
130 | |
---|
131 | int sep_len = strlen(ali_separator); |
---|
132 | |
---|
133 | char *info = ARB_alloc<char>(new_ali_length+1); |
---|
134 | memset(info, '=', new_ali_length); |
---|
135 | |
---|
136 | int offset = 0; |
---|
137 | int last_ali_idx = ali_names.size()-1; |
---|
138 | |
---|
139 | for (int a = 0; a <= last_ali_idx; ++a) { |
---|
140 | const char *ali = ali_names[a]; |
---|
141 | |
---|
142 | int ali_len = GBT_get_alignment_len(gb_main, ali); |
---|
143 | int ali_str_len = strlen(ali); |
---|
144 | nt_assert(ali_len>0); |
---|
145 | |
---|
146 | char *my_info = info+offset; |
---|
147 | |
---|
148 | int half_ali_len = ali_len/2; |
---|
149 | for (int i = 0; i<5; ++i) { |
---|
150 | if (i<half_ali_len) { |
---|
151 | my_info[i] = '<'; |
---|
152 | my_info[ali_len-i-1] = '>'; |
---|
153 | } |
---|
154 | } |
---|
155 | |
---|
156 | if (ali_str_len<ali_len) { |
---|
157 | int namepos = half_ali_len - ali_str_len/2; |
---|
158 | memcpy(my_info+namepos, ali, ali_str_len); |
---|
159 | } |
---|
160 | |
---|
161 | offset += ali_len; |
---|
162 | if (a != last_ali_idx) { |
---|
163 | memcpy(info+offset, ali_separator, sep_len); |
---|
164 | offset += sep_len; |
---|
165 | } |
---|
166 | } |
---|
167 | |
---|
168 | nt_assert(offset == new_ali_length); // wrong alignment length! |
---|
169 | info[new_ali_length] = 0; |
---|
170 | |
---|
171 | if (!error) error = GB_write_string(gb_data, info); |
---|
172 | free(info); |
---|
173 | } |
---|
174 | } |
---|
175 | return error; |
---|
176 | } |
---|
177 | |
---|
178 | // ---------------------------------------- Concatenation function ---------------------------------- |
---|
179 | static void concatenateAlignments(AW_window *aws, AW_selection *selected_alis) { |
---|
180 | nt_assert(selected_alis); |
---|
181 | |
---|
182 | GB_push_transaction(GLOBAL.gb_main); |
---|
183 | |
---|
184 | long marked_species = GBT_count_marked_species(GLOBAL.gb_main); |
---|
185 | AW_root *aw_root = aws->get_root(); |
---|
186 | char *new_ali_name = aw_root->awar(AWAR_CON_NEW_ALIGNMENT_NAME)->read_string(); |
---|
187 | GB_ERROR error = GBT_check_alignment_name(new_ali_name); |
---|
188 | |
---|
189 | StrArray ali_names; |
---|
190 | selected_alis->get_values(ali_names); |
---|
191 | |
---|
192 | arb_progress progress("Concatenating alignments", marked_species); |
---|
193 | size_t ali_count = ali_names.size(); |
---|
194 | |
---|
195 | if (!error && ali_count<2) { |
---|
196 | error = "Not enough alignments selected for concatenation (need at least 2)"; |
---|
197 | } |
---|
198 | if (!error) { |
---|
199 | int found[ali_count], missing[ali_count], ali_length[ali_count]; |
---|
200 | |
---|
201 | for (size_t a = 0; a<ali_count; a++) { |
---|
202 | found[a] = 0; |
---|
203 | missing[a] = 0; |
---|
204 | ali_length[a] = GBT_get_alignment_len(GLOBAL.gb_main, ali_names[a]); |
---|
205 | |
---|
206 | if (ali_length[a]<=0) { |
---|
207 | error = GB_await_error(); |
---|
208 | } |
---|
209 | else if (strcmp(ali_names[a], new_ali_name) == 0) { |
---|
210 | error = "Target alignment may not be one of the source alignments"; |
---|
211 | } |
---|
212 | } |
---|
213 | |
---|
214 | if (!error) { |
---|
215 | char *ali_separator = aw_root->awar(AWAR_CON_ALIGNMENT_SEPARATOR)->read_string(); |
---|
216 | const int sep_len = strlen(ali_separator); |
---|
217 | |
---|
218 | long new_alignment_len = (ali_count-1)*sep_len; |
---|
219 | for (size_t a = 0; a<ali_count; ++a) { // LOOP_VECTORIZED |
---|
220 | new_alignment_len += ali_length[a]; |
---|
221 | } |
---|
222 | |
---|
223 | GBDATA *gb_presets = GBT_get_presets(GLOBAL.gb_main); |
---|
224 | GBDATA *gb_alignment_exists = GB_find_string(gb_presets, "alignment_name", new_ali_name, GB_IGNORE_CASE, SEARCH_GRANDCHILD); |
---|
225 | GBDATA *gb_new_alignment = NULp; |
---|
226 | char *seq_type = aw_root->awar(AWAR_CON_SEQUENCE_TYPE)->read_string(); |
---|
227 | |
---|
228 | if (gb_alignment_exists) { |
---|
229 | // target alignment exists |
---|
230 | if (aw_root->awar(AWAR_CON_ALLOW_OVERWRITE_ALI)->read_int()) { // allow overwrite |
---|
231 | gb_new_alignment = GBT_get_alignment(GLOBAL.gb_main, new_ali_name); |
---|
232 | if (!gb_new_alignment) error = GB_await_error(); |
---|
233 | } |
---|
234 | else { |
---|
235 | error = GBS_global_string("Target alignment '%s' already exists\n(check overwrite-toggle if you really want to overwrite)", new_ali_name); |
---|
236 | } |
---|
237 | } |
---|
238 | else { |
---|
239 | // create new target alignment |
---|
240 | char *source_alignments = GBT_join_strings(ali_names, ' '); |
---|
241 | char *why_created = GBS_global_string_copy("while concatenating %s", source_alignments); |
---|
242 | |
---|
243 | gb_new_alignment = GBT_create_alignment(GLOBAL.gb_main, new_ali_name, new_alignment_len, 0, 0, seq_type, why_created); |
---|
244 | if (!gb_new_alignment) error = GB_await_error(); |
---|
245 | |
---|
246 | free(why_created); |
---|
247 | free(source_alignments); |
---|
248 | } |
---|
249 | |
---|
250 | if (!error) { |
---|
251 | AW_repeated_question ask_about_missing_alignment; |
---|
252 | bool insertGaps = aw_root->awar(AWAR_CON_INSGAPS_FOR_MISS_ALIS)->read_int(); |
---|
253 | |
---|
254 | for (GBDATA *gb_species = GBT_first_marked_species(GLOBAL.gb_main); |
---|
255 | gb_species && !error; |
---|
256 | gb_species = GBT_next_marked_species(gb_species)) |
---|
257 | { |
---|
258 | GBS_strstruct concat_seq(new_alignment_len+1); |
---|
259 | int data_inserted = 0; |
---|
260 | |
---|
261 | for (size_t a = 0; a<ali_count; ++a) { |
---|
262 | if (a) concat_seq.cat(ali_separator); |
---|
263 | |
---|
264 | GBDATA *gb_seq_data = GBT_find_sequence(gb_species, ali_names[a]); |
---|
265 | if (gb_seq_data) { // found data |
---|
266 | const char *seq_data = GB_read_char_pntr(gb_seq_data); |
---|
267 | concat_seq.cat(seq_data); |
---|
268 | ++found[a]; |
---|
269 | ++data_inserted; |
---|
270 | } |
---|
271 | else { // missing data |
---|
272 | if (insertGaps) concat_seq.nput('.', ali_length[a]); |
---|
273 | ++missing[a]; |
---|
274 | } |
---|
275 | } |
---|
276 | |
---|
277 | if (!data_inserted) { |
---|
278 | error = GBS_global_string("None of the source alignments had data for species '%s'", GBT_get_name_or_description(gb_species)); |
---|
279 | } |
---|
280 | else { |
---|
281 | GBDATA *gb_data = GBT_add_data(gb_species, new_ali_name, "data", GB_STRING); |
---|
282 | GB_write_string(gb_data, concat_seq.get_data()); |
---|
283 | } |
---|
284 | progress.inc_and_check_user_abort(error); |
---|
285 | } |
---|
286 | |
---|
287 | if (!error) { |
---|
288 | // ............. print missing alignments........... |
---|
289 | aw_message(GBS_global_string("Concatenation of alignments was performed for %ld species.", marked_species)); |
---|
290 | for (size_t a = 0; a<ali_count; ++a) { |
---|
291 | aw_message(GBS_global_string("%s: was found in %d species and missing in %d species.", ali_names[a], found[a], missing[a])); |
---|
292 | } |
---|
293 | } |
---|
294 | |
---|
295 | if (!error) error = GBT_check_data(GLOBAL.gb_main, new_ali_name); // update alignment info (otherwise create_concatInfo_SAI fails when overwriting an alignment) |
---|
296 | if (!error) error = create_concatInfo_SAI(GLOBAL.gb_main, new_ali_name, ali_separator, ali_names); |
---|
297 | } |
---|
298 | |
---|
299 | free(seq_type); |
---|
300 | free(ali_separator); |
---|
301 | } |
---|
302 | } |
---|
303 | |
---|
304 | if (!error) { |
---|
305 | error = GBT_add_alignment_changekeys(GLOBAL.gb_main, new_ali_name); |
---|
306 | } |
---|
307 | else { |
---|
308 | progress.done(); |
---|
309 | } |
---|
310 | GB_end_transaction_show_error(GLOBAL.gb_main, error, aw_message); |
---|
311 | free(new_ali_name); |
---|
312 | } |
---|
313 | |
---|
314 | static void addSpeciesToConcatenateList(SpeciesConcatenateList **sclp, GB_CSTR species_name) { |
---|
315 | |
---|
316 | GBDATA *gb_species_data = GBT_get_species_data(GLOBAL.gb_main); |
---|
317 | GBDATA *gb_species = GBT_find_species_rel_species_data(gb_species_data, species_name); |
---|
318 | |
---|
319 | if (gb_species) { |
---|
320 | SpeciesConcatenateList *scl = new SpeciesConcatenateList; |
---|
321 | |
---|
322 | scl->species = gb_species; |
---|
323 | scl->species_name = ARB_strdup(species_name); |
---|
324 | scl->next = *sclp; |
---|
325 | *sclp = scl; |
---|
326 | } |
---|
327 | } |
---|
328 | |
---|
329 | static void freeSpeciesConcatenateList(SpeciesConcatenateList *scl) { |
---|
330 | while (scl) { |
---|
331 | SpeciesConcatenateList *next = scl->next; |
---|
332 | free(scl->species_name); |
---|
333 | delete scl; |
---|
334 | scl = next; |
---|
335 | } |
---|
336 | } |
---|
337 | |
---|
338 | static GB_ERROR checkAndMergeFields(GBDATA *gb_new_species, GB_ERROR error, SpeciesConcatenateList *scl) { |
---|
339 | |
---|
340 | char *doneFields = ARB_strdup(";name;"); // all fields which are already merged |
---|
341 | int doneLen = strlen(doneFields); |
---|
342 | SpeciesConcatenateList *sl = scl; |
---|
343 | int sl_length = 0; while (scl) { sl_length++; scl=scl->next; } // counting no. of similar species stored in the list |
---|
344 | int *fieldStat = new int[sl_length]; // 0 = not used yet ; -1 = doesn't have field ; 1..n = field content (same number means same content) |
---|
345 | |
---|
346 | while (sl && !error) { // with all species do.. |
---|
347 | char *newFields = GB_get_subfields(sl->species); |
---|
348 | char *fieldStart = newFields; // points to ; before next field |
---|
349 | |
---|
350 | while (fieldStart[1] && !error) { // with all subfields of the species do.. |
---|
351 | char *fieldEnd = strchr(fieldStart+1, ';'); |
---|
352 | nt_assert(fieldEnd); |
---|
353 | char behind = fieldEnd[1]; fieldEnd[1] = 0; |
---|
354 | |
---|
355 | if (!strstr(doneFields, fieldStart)) { // field is not merged yet |
---|
356 | char *fieldName = fieldStart+1; |
---|
357 | int fieldLen = int(fieldEnd-fieldName); |
---|
358 | |
---|
359 | nt_assert(fieldEnd[0]==';'); |
---|
360 | fieldEnd[0] = 0; |
---|
361 | |
---|
362 | GBDATA *gb_field = GB_search(sl->species, fieldName, GB_FIND); // field does to exist (it was found before) |
---|
363 | GB_TYPES type = GB_read_type(gb_field); |
---|
364 | |
---|
365 | if (type==GB_STRING) { // we only merge string fields |
---|
366 | int i; int doneSpecies = 0; int nextStat = 1; |
---|
367 | |
---|
368 | for (i=0; i<sl_length; i++) { fieldStat[i] = 0; } // clear field status |
---|
369 | |
---|
370 | while (doneSpecies<sl_length) { // since all species in list were handled |
---|
371 | SpeciesConcatenateList *sl2 = sl; |
---|
372 | i = 0; |
---|
373 | |
---|
374 | while (sl2) { |
---|
375 | if (fieldStat[i]==0) { |
---|
376 | gb_field = GB_search(sl2->species, fieldName, GB_FIND); |
---|
377 | if (gb_field) { |
---|
378 | char *content = GB_read_as_string(gb_field); |
---|
379 | SpeciesConcatenateList *sl3 = sl2->next; |
---|
380 | fieldStat[i] = nextStat; |
---|
381 | int j = i+1; doneSpecies++; |
---|
382 | |
---|
383 | while (sl3) { |
---|
384 | if (fieldStat[j]==0) { |
---|
385 | gb_field = GB_search(sl3->species, fieldName, GB_FIND); |
---|
386 | if (gb_field) { |
---|
387 | char *content2 = GB_read_as_string(gb_field); |
---|
388 | if (strcmp(content, content2)==0) { // if contents are the same, they get the same status |
---|
389 | fieldStat[j] = nextStat; |
---|
390 | doneSpecies++; |
---|
391 | } |
---|
392 | free(content2); |
---|
393 | } |
---|
394 | else { |
---|
395 | fieldStat[j] = -1; |
---|
396 | doneSpecies++; |
---|
397 | } |
---|
398 | } |
---|
399 | sl3 = sl3->next; j++; |
---|
400 | } |
---|
401 | free(content); nextStat++; |
---|
402 | } |
---|
403 | else { |
---|
404 | fieldStat[i] = -1; // field does not exist here |
---|
405 | doneSpecies++; |
---|
406 | } |
---|
407 | } |
---|
408 | sl2 = sl2->next; i++; |
---|
409 | } |
---|
410 | if (!sl2) break; |
---|
411 | } |
---|
412 | nt_assert(nextStat!=1); // this would mean that none of the species contained the field |
---|
413 | { |
---|
414 | char *new_content = NULp; |
---|
415 | int new_content_len = 0; // @@@ useless (0 where used; unused otherwise) |
---|
416 | |
---|
417 | if (nextStat==2) { // all species contain same field content or do not have the field |
---|
418 | SpeciesConcatenateList *sl2 = sl; |
---|
419 | while (sl2) { |
---|
420 | gb_field = GB_search(sl2->species, fieldName, GB_FIND); |
---|
421 | if (gb_field) { |
---|
422 | new_content = GB_read_as_string(gb_field); |
---|
423 | new_content_len = strlen(new_content); |
---|
424 | break; |
---|
425 | } |
---|
426 | sl2 = sl2->next; |
---|
427 | } |
---|
428 | } |
---|
429 | else { // different field contents |
---|
430 | int actualStat; |
---|
431 | for (actualStat=1; actualStat<nextStat; actualStat++) { |
---|
432 | SpeciesConcatenateList *sl2 = sl; |
---|
433 | |
---|
434 | int names_len = 1; // open bracket |
---|
435 | char *content = NULp; |
---|
436 | i = 0; |
---|
437 | |
---|
438 | while (sl2) { |
---|
439 | if (fieldStat[i]==actualStat) { |
---|
440 | names_len += strlen(sl2->species_name)+1; |
---|
441 | if (!content) { |
---|
442 | gb_field = GB_search(sl2->species, fieldName, GB_FIND); |
---|
443 | nt_assert(gb_field); |
---|
444 | content = GB_read_as_string(gb_field); |
---|
445 | } |
---|
446 | } |
---|
447 | sl2 = sl2->next; i++; |
---|
448 | } |
---|
449 | nt_assert(content); |
---|
450 | int add_len = names_len+1+strlen(content); |
---|
451 | char *whole = ARB_alloc<char>(new_content_len+1+add_len+1); |
---|
452 | nt_assert(whole); |
---|
453 | char *add = new_content ? whole+sprintf(whole, "%s ", new_content) : whole; |
---|
454 | sl2 = sl; i = 0; |
---|
455 | int first = 1; |
---|
456 | while (sl2) { |
---|
457 | if (fieldStat[i]==actualStat) { |
---|
458 | add += sprintf(add, "%c%s", first ? '{' : ';', sl2->species_name); |
---|
459 | first = 0; |
---|
460 | } |
---|
461 | sl2 = sl2->next; i++; |
---|
462 | } |
---|
463 | add += sprintf(add, "} %s", content); |
---|
464 | |
---|
465 | free(content); |
---|
466 | freeset(new_content, whole); |
---|
467 | new_content_len = strlen(new_content); // cppcheck-suppress deallocuse |
---|
468 | } |
---|
469 | } |
---|
470 | |
---|
471 | if (new_content) { |
---|
472 | error = GBT_write_string(gb_new_species, fieldName, new_content); |
---|
473 | free(new_content); |
---|
474 | } |
---|
475 | } |
---|
476 | } |
---|
477 | |
---|
478 | // mark field as done: |
---|
479 | char *new_doneFields = ARB_alloc<char>(doneLen+fieldLen+1+1); |
---|
480 | sprintf(new_doneFields, "%s%s;", doneFields, fieldName); |
---|
481 | doneLen += fieldLen+1; |
---|
482 | freeset(doneFields, new_doneFields); |
---|
483 | fieldEnd[0] = ';'; |
---|
484 | } |
---|
485 | fieldEnd[1] = behind; |
---|
486 | fieldStart = fieldEnd; |
---|
487 | } |
---|
488 | free(newFields); |
---|
489 | sl = sl->next; |
---|
490 | } |
---|
491 | free(doneFields); |
---|
492 | delete [] fieldStat; |
---|
493 | |
---|
494 | return error; |
---|
495 | } |
---|
496 | |
---|
497 | static GBDATA *concatenateFieldsCreateNewSpecies(AW_window *, GBDATA *gb_species, SpeciesConcatenateList *scl) { |
---|
498 | GB_push_transaction(GLOBAL.gb_main); |
---|
499 | |
---|
500 | GB_ERROR error = NULp; |
---|
501 | GBDATA *gb_species_data = GBT_get_species_data(GLOBAL.gb_main); |
---|
502 | |
---|
503 | // data needed for name generation |
---|
504 | char *full_name = NULp; |
---|
505 | char *acc = NULp; |
---|
506 | |
---|
507 | // --------------------getting the species related data -------------------- |
---|
508 | |
---|
509 | GBDATA *gb_new_species = NULp; |
---|
510 | |
---|
511 | if (!error) { |
---|
512 | // copy species to create a new species |
---|
513 | gb_new_species = GB_create_container(gb_species_data, "species"); |
---|
514 | error = gb_new_species ? GB_copy_dropProtectMarksAndTempstate(gb_new_species, gb_species) : GB_await_error(); |
---|
515 | |
---|
516 | if (!error) { // write dummy-name (real name written below) |
---|
517 | error = GBT_write_string(gb_new_species, "name", "$currcat$"); |
---|
518 | } |
---|
519 | } |
---|
520 | |
---|
521 | if (!error) { // copy full name |
---|
522 | full_name = GBT_read_string(gb_species, "full_name"); |
---|
523 | if (!full_name) error = GB_await_error(); |
---|
524 | else error = GBT_write_string(gb_new_species, "full_name", full_name); |
---|
525 | } |
---|
526 | |
---|
527 | if (!error) { |
---|
528 | ConstStrArray ali_names; |
---|
529 | GBT_get_alignment_names(ali_names, GLOBAL.gb_main); |
---|
530 | |
---|
531 | long id = 0; |
---|
532 | for (SpeciesConcatenateList *speciesList = scl; speciesList; speciesList = speciesList->next) { |
---|
533 | for (int no_of_alignments = 0; ali_names[no_of_alignments]; no_of_alignments++) { |
---|
534 | GBDATA *gb_seq_data = GBT_find_sequence(speciesList->species, ali_names[no_of_alignments]); |
---|
535 | if (gb_seq_data) { |
---|
536 | const char *seq_data = GB_read_char_pntr(gb_seq_data); |
---|
537 | GBDATA *gb_data = GBT_add_data(gb_new_species, ali_names[no_of_alignments], "data", GB_STRING); |
---|
538 | error = GB_write_string(gb_data, seq_data); |
---|
539 | if (!error) id += GBS_checksum(seq_data, 1, ".-"); // creating checksum of the each aligned sequence to generate new accession number |
---|
540 | } |
---|
541 | if (error) error = GB_export_errorf("Can't create alignment '%s'", ali_names[no_of_alignments]); |
---|
542 | } |
---|
543 | } |
---|
544 | |
---|
545 | if (!error) { |
---|
546 | acc = GBS_global_string_copy("ARB_%lX", id); // create new accession number |
---|
547 | error = GBT_write_string(gb_new_species, "acc", acc); |
---|
548 | } |
---|
549 | } |
---|
550 | |
---|
551 | if (!error) error = checkAndMergeFields(gb_new_species, error, scl); |
---|
552 | |
---|
553 | // now generate new name |
---|
554 | if (!error) { |
---|
555 | char *new_species_name = NULp; |
---|
556 | |
---|
557 | const char *add_field = AW_get_nameserver_addid(GLOBAL.gb_main); |
---|
558 | GBDATA *gb_addid = add_field[0] ? GB_entry(gb_new_species, add_field) : NULp; |
---|
559 | char *addid = NULp; |
---|
560 | if (gb_addid) addid = GB_read_as_string(gb_addid); |
---|
561 | |
---|
562 | error = AWTC_generate_one_name(GLOBAL.gb_main, full_name, acc, addid, new_species_name); |
---|
563 | if (!error) { // name was created |
---|
564 | if (GBT_find_species_rel_species_data(gb_species_data, new_species_name)) { |
---|
565 | // if the name is not unique -> create unique name |
---|
566 | UniqueNameDetector und(gb_species_data); |
---|
567 | freeset(new_species_name, AWTC_makeUniqueShortName(new_species_name, und)); |
---|
568 | if (!new_species_name) error = GB_await_error(); |
---|
569 | } |
---|
570 | } |
---|
571 | |
---|
572 | if (!error) error = GBT_write_string(gb_new_species, "name", new_species_name); // insert new 'name' |
---|
573 | |
---|
574 | free(new_species_name); |
---|
575 | free(addid); |
---|
576 | } |
---|
577 | |
---|
578 | error = GB_end_transaction(GLOBAL.gb_main, error); |
---|
579 | if (error) { |
---|
580 | gb_new_species = NULp; |
---|
581 | aw_message(error); |
---|
582 | } |
---|
583 | |
---|
584 | free(acc); |
---|
585 | free(full_name); |
---|
586 | |
---|
587 | return gb_new_species; |
---|
588 | } |
---|
589 | |
---|
// Operation performed by the merge window.
// The numeric value is part of the window id ("MERGE_SPECIES_<value>").
enum MergeSpeciesType {
    MERGE_SPECIES_SIMPLE         = 0, // only merge similar species
    MERGE_SPECIES_AND_CONCAT_ALI = 1, // merge similar species, then concatenate alignments
};
---|
594 | |
---|
595 | static void mergeSimilarSpecies(AW_window *aws, MergeSpeciesType mergeType, AW_selection *selected_alis) { |
---|
596 | nt_assert(correlated(selected_alis, mergeType == MERGE_SPECIES_AND_CONCAT_ALI)); |
---|
597 | |
---|
598 | GB_ERROR error = NULp; |
---|
599 | arb_progress wrapper; |
---|
600 | { |
---|
601 | AW_root *aw_root = aws->get_root(); |
---|
602 | char *merge_field_name = aw_root->awar(AWAR_CON_MERGE_FIELD)->read_string(); |
---|
603 | |
---|
604 | SpeciesConcatenateList *scl = NULp; // to build list of similar species |
---|
605 | SpeciesConcatenateList *newSpeciesList = NULp; // new SpeciesConcatenateList |
---|
606 | |
---|
607 | GB_begin_transaction(GLOBAL.gb_main); // open database for transaction |
---|
608 | |
---|
609 | const char *report_field_name = prepare_and_get_selected_itemfield(aw_root, AWAR_CON_STORE_SIM_SP_NO, GLOBAL.gb_main, SPECIES_get_selector(), FIF_NAME_SELECTED); |
---|
610 | error = GB_incur_error_if(!report_field_name); |
---|
611 | |
---|
612 | if (!error && strcmp(merge_field_name, NO_FIELD_SELECTED) == 0) { |
---|
613 | error = "Please select database field for similarity detection"; |
---|
614 | } |
---|
615 | |
---|
616 | if (!error) { |
---|
617 | PersistentNameServerConnection stayAlive; |
---|
618 | arb_progress progress("Merging similar species", GBT_count_marked_species(GLOBAL.gb_main)); |
---|
619 | progress.auto_subtitles("Species"); |
---|
620 | |
---|
621 | for (GBDATA * gb_species = GBT_first_marked_species(GLOBAL.gb_main); |
---|
622 | gb_species && !error; |
---|
623 | gb_species = GBT_next_marked_species(gb_species)) |
---|
624 | { |
---|
625 | GBDATA *gb_species_field = GB_entry(gb_species, merge_field_name); |
---|
626 | |
---|
627 | if (!gb_species_field) { |
---|
628 | // exit if species doesn't have any data in the selected field |
---|
629 | error = GBS_global_string("Species '%s' does not contain data in selected field '%s'", |
---|
630 | GBT_get_name_or_description(gb_species), |
---|
631 | merge_field_name); |
---|
632 | } |
---|
633 | else { |
---|
634 | char *gb_species_field_content = GB_read_as_string(gb_species_field); |
---|
635 | int similar_species = 0; |
---|
636 | |
---|
637 | for (GBDATA * gb_species_next = GBT_next_marked_species(gb_species); |
---|
638 | gb_species_next && !error; |
---|
639 | gb_species_next = GBT_next_marked_species(gb_species_next)) |
---|
640 | { |
---|
641 | GBDATA *gb_next_species_field = GB_entry(gb_species_next, merge_field_name); |
---|
642 | if (!gb_next_species_field) { |
---|
643 | // exit if species doesn't have any data in the selected field |
---|
644 | error = GBS_global_string("Species '%s' does not contain data in selected field '%s'", |
---|
645 | GBT_get_name_or_description(gb_species_next), |
---|
646 | merge_field_name); |
---|
647 | } |
---|
648 | else { |
---|
649 | char *gb_next_species_field_content = GB_read_as_string(gb_next_species_field); |
---|
650 | |
---|
651 | if (strcmp(gb_species_field_content, gb_next_species_field_content) == 0) { |
---|
652 | addSpeciesToConcatenateList(&scl, GBT_get_name_or_description(gb_species_next)); |
---|
653 | GB_write_flag(gb_species_next, 0); |
---|
654 | ++similar_species; |
---|
655 | ++progress; |
---|
656 | } |
---|
657 | free(gb_next_species_field_content); |
---|
658 | } |
---|
659 | } |
---|
660 | |
---|
661 | if (similar_species > 0 && !error) { |
---|
662 | ++similar_species; // correct merge counter |
---|
663 | addSpeciesToConcatenateList(&scl, GBT_get_name_or_description(gb_species)); |
---|
664 | GB_write_flag(gb_species, 0); |
---|
665 | |
---|
666 | GBDATA *new_species_created = concatenateFieldsCreateNewSpecies(aws, gb_species, scl); |
---|
667 | |
---|
668 | nt_assert(new_species_created); |
---|
669 | if (new_species_created) { // create a list of newly created species |
---|
670 | addSpeciesToConcatenateList(&newSpeciesList, GBT_get_name_or_description(new_species_created)); |
---|
671 | } |
---|
672 | |
---|
673 | if (report_field_name) { |
---|
674 | GBDATA *gb_report = GBT_searchOrCreate_itemfield_according_to_changekey(new_species_created, report_field_name, SPECIES_get_selector().change_key_path); |
---|
675 | if (!gb_report) error = GB_await_error(); |
---|
676 | else error = GB_write_lossless_int(gb_report, similar_species); |
---|
677 | } |
---|
678 | } |
---|
679 | |
---|
680 | freeSpeciesConcatenateList(scl); scl = NULp; |
---|
681 | free(gb_species_field_content); |
---|
682 | } |
---|
683 | |
---|
684 | progress.inc_and_check_user_abort(error); |
---|
685 | } |
---|
686 | } |
---|
687 | |
---|
688 | if (!error) { |
---|
689 | GBT_mark_all(GLOBAL.gb_main, 0); // unmark all species in the database |
---|
690 | int newSpeciesCount = 0; |
---|
691 | |
---|
692 | for (; newSpeciesList; newSpeciesList = newSpeciesList->next) { // mark only newly created species |
---|
693 | GB_write_flag(newSpeciesList->species, 1); |
---|
694 | newSpeciesCount++; |
---|
695 | } |
---|
696 | aw_message(GBS_global_string("%i new species were created by taking \"%s\" as a criterion!", newSpeciesCount, merge_field_name)); |
---|
697 | freeSpeciesConcatenateList(newSpeciesList); |
---|
698 | } |
---|
699 | |
---|
700 | free(merge_field_name); |
---|
701 | |
---|
702 | GB_end_transaction_show_error(GLOBAL.gb_main, error, aw_message); |
---|
703 | } |
---|
704 | |
---|
705 | if (mergeType == MERGE_SPECIES_AND_CONCAT_ALI && !error) { |
---|
706 | // @@@ what happens if merge-process above succeeds and concatenateAlignments below fails? |
---|
707 | // @@@ i think both steps should be put into ONE transaction! |
---|
708 | concatenateAlignments(aws, selected_alis); |
---|
709 | } |
---|
710 | } |
---|
711 | |
---|
712 | static AW_window *createMergeSimilarSpeciesWindow(AW_root *aw_root, MergeSpeciesType mergeType, AW_selection *selected_alis) { |
---|
713 | AW_window_simple *aws = new AW_window_simple; |
---|
714 | |
---|
715 | { |
---|
716 | char *window_id = GBS_global_string_copy("MERGE_SPECIES_%i", mergeType); |
---|
717 | const char *window_title = NULp; |
---|
718 | switch (mergeType) { |
---|
719 | case MERGE_SPECIES_SIMPLE: window_title = "Merge species"; break; |
---|
720 | case MERGE_SPECIES_AND_CONCAT_ALI: window_title = "Merge and concatenate"; break; |
---|
721 | } |
---|
722 | aws->init(aw_root, window_id, window_title); |
---|
723 | free(window_id); |
---|
724 | } |
---|
725 | aws->load_xfig("merge_species.fig"); |
---|
726 | |
---|
727 | aws->callback(makeHelpCallback("merge_species.hlp")); |
---|
728 | aws->at("help"); |
---|
729 | aws->create_button("HELP", "HELP", "H"); |
---|
730 | |
---|
731 | create_itemfield_selection_button(aws, FieldSelDef(AWAR_CON_MERGE_FIELD, GLOBAL.gb_main, SPECIES_get_selector(), FIELD_FILTER_STRING_READABLE, "field to compare"), "field_select"); |
---|
732 | create_itemfield_selection_button(aws, FieldSelDef(AWAR_CON_STORE_SIM_SP_NO, GLOBAL.gb_main, SPECIES_get_selector(), FIELD_FILTER_INT_WRITEABLE, "report-field", SF_ALLOW_NEW), "store_sp_no"); |
---|
733 | |
---|
734 | { |
---|
735 | const char *buttonText = NULp; |
---|
736 | switch (mergeType) { |
---|
737 | case MERGE_SPECIES_SIMPLE: buttonText = "Merge similar species"; break; |
---|
738 | case MERGE_SPECIES_AND_CONCAT_ALI: buttonText = "Merge similar species and concat alignments"; break; |
---|
739 | } |
---|
740 | |
---|
741 | aws->at("merge"); |
---|
742 | aws->callback(makeWindowCallback(mergeSimilarSpecies, mergeType, selected_alis)); |
---|
743 | aws->create_autosize_button("MERGE_SIMILAR_SPECIES", buttonText, "M"); |
---|
744 | } |
---|
745 | |
---|
746 | aws->at("close"); |
---|
747 | aws->callback(AW_POPDOWN); |
---|
748 | aws->create_button("CLOSE", "CLOSE", "C"); |
---|
749 | |
---|
750 | return aws; |
---|
751 | } |
---|
752 | |
---|
753 | AW_window *NT_createMergeSimilarSpeciesWindow(AW_root *aw_root) { |
---|
754 | static AW_window *aw = NULp; |
---|
755 | if (!aw) aw = createMergeSimilarSpeciesWindow(aw_root, MERGE_SPECIES_SIMPLE, NULp); |
---|
756 | return aw; |
---|
757 | } |
---|
758 | |
---|
759 | static AW_window *NT_createMergeSimilarSpeciesAndConcatenateWindow(AW_root *aw_root, AW_selection *selected_alis) { |
---|
760 | static AW_window *aw = NULp; |
---|
761 | #if defined(ASSERTION_USED) |
---|
762 | static AW_selection *prev_selected_alis = NULp; |
---|
763 | #endif |
---|
764 | |
---|
765 | if (!aw) { |
---|
766 | aw = createMergeSimilarSpeciesWindow(aw_root, MERGE_SPECIES_AND_CONCAT_ALI, selected_alis); |
---|
767 | #if defined(ASSERTION_USED) |
---|
768 | prev_selected_alis = selected_alis; |
---|
769 | #endif |
---|
770 | } |
---|
771 | #if defined(ASSERTION_USED) |
---|
772 | nt_assert(selected_alis == prev_selected_alis); // would need multiple windows in that case |
---|
773 | #endif |
---|
774 | return aw; |
---|
775 | } |
---|
776 | |
---|
777 | static void useSelectedAlignment(AW_window *aww) { |
---|
778 | AW_root *root = aww->get_root(); |
---|
779 | const char *selali = root->awar(AWAR_CON_SELECTED_ALI)->read_char_pntr(); |
---|
780 | if (selali && strcmp(selali, NO_ALI_SELECTED) != 0) { |
---|
781 | root->awar(AWAR_CON_NEW_ALIGNMENT_NAME)->write_string(selali); |
---|
782 | } |
---|
783 | else { |
---|
784 | aw_message("Select alignment to use in the left alignment list"); |
---|
785 | } |
---|
786 | } |
---|
787 | |
---|
788 | // ----------------------------Creating concatenation window----------------------------------------- |
---|
789 | AW_window *NT_createConcatenationWindow(AW_root *aw_root) { |
---|
790 | AW_window_simple *aws = new AW_window_simple; |
---|
791 | |
---|
792 | aws->init(aw_root, "CONCAT_ALIGNMENTS", "Concatenate Alignments"); |
---|
793 | aws->load_xfig("concatenate.fig"); |
---|
794 | |
---|
795 | aws->auto_space(5, 5); |
---|
796 | aws->button_length(8); |
---|
797 | |
---|
798 | aws->callback(makeHelpCallback("concatenate.hlp")); |
---|
799 | aws->at("help"); |
---|
800 | aws->create_button("HELP", "HELP", "H"); |
---|
801 | |
---|
802 | aws->at("close"); |
---|
803 | aws->callback(AW_POPDOWN); |
---|
804 | aws->create_button("CLOSE", "CLOSE", "C"); |
---|
805 | |
---|
806 | aws->at("dbAligns"); |
---|
807 | AW_DB_selection *all_alis = createSelectionList(GLOBAL.gb_main, aws, AWAR_CON_SELECTED_ALI); |
---|
808 | AW_selection *sel_alis = awt_create_subset_selection_list(aws, all_alis->get_sellist(), "concatAligns", "collect", "sort"); |
---|
809 | |
---|
810 | aws->at("type"); |
---|
811 | aws->create_option_menu(AWAR_CON_SEQUENCE_TYPE); |
---|
812 | aws->insert_option("DNA", "d", "dna"); |
---|
813 | aws->insert_option("RNA", "r", "rna"); |
---|
814 | aws->insert_default_option("PROTEIN", "p", "ami"); |
---|
815 | aws->update_option_menu(); |
---|
816 | aw_root->awar(AWAR_CON_SEQUENCE_TYPE)->add_callback(makeRootCallback(alitype_changed_cb, all_alis)); |
---|
817 | |
---|
818 | aws->at("aliSeparator"); |
---|
819 | aws->create_input_field(AWAR_CON_ALIGNMENT_SEPARATOR, 10); |
---|
820 | |
---|
821 | aws->at("aliName"); |
---|
822 | aws->create_input_field(AWAR_CON_NEW_ALIGNMENT_NAME, 25); |
---|
823 | aws->button_length(5); |
---|
824 | aws->callback(useSelectedAlignment); |
---|
825 | aws->create_button("USE", "Use"); |
---|
826 | |
---|
827 | aws->at("overwrite"); |
---|
828 | aws->label("Allow to overwrite an existing alignment?"); |
---|
829 | aws->create_toggle(AWAR_CON_ALLOW_OVERWRITE_ALI); |
---|
830 | |
---|
831 | aws->at("insgaps"); |
---|
832 | aws->label("Insert gaps for missing alignment data?"); |
---|
833 | aws->create_toggle(AWAR_CON_INSGAPS_FOR_MISS_ALIS); |
---|
834 | |
---|
835 | aws->button_length(22); |
---|
836 | aws->at("go"); |
---|
837 | |
---|
838 | aws->callback(makeWindowCallback(concatenateAlignments, sel_alis)); |
---|
839 | aws->create_button("CONCATENATE", "CONCATENATE", "A"); |
---|
840 | |
---|
841 | aws->callback(NT_createMergeSimilarSpeciesWindow); |
---|
842 | aws->create_button("MERGE_SPECIES", "MERGE SIMILAR SPECIES", "M"); |
---|
843 | |
---|
844 | aws->callback(makeCreateWindowCallback(NT_createMergeSimilarSpeciesAndConcatenateWindow, sel_alis)); |
---|
845 | aws->create_button("MERGE_CONCATENATE", "MERGE & CONCATENATE", "S"); |
---|
846 | |
---|
847 | return aws; |
---|
848 | } |
---|
849 | // ------------------------------------------------------------------------------------------------------- |
---|