1 | #include "GDE_proto.h" |
---|
2 | |
---|
3 | #include <aw_msg.hxx> |
---|
4 | #include <AW_rename.hxx> |
---|
5 | #include <AP_filter.hxx> |
---|
6 | #include <aw_awar_defs.hxx> |
---|
7 | #include <arb_progress.h> |
---|
8 | #include <arb_global_defs.h> |
---|
9 | |
---|
10 | #include <algorithm> |
---|
11 | |
---|
12 | // AISC_MKPT_PROMOTE:#ifndef GDE_EXTGLOB_H |
---|
13 | // AISC_MKPT_PROMOTE:#include "GDE_extglob.h" |
---|
14 | // AISC_MKPT_PROMOTE:#endif |
---|
15 | |
---|
16 | typedef unsigned int UINT; |
---|
17 | |
---|
18 | int Arbdb_get_curelem(NA_Alignment& dataset) { |
---|
19 | int curelem = dataset.numelements++; |
---|
20 | if (curelem == 0) { |
---|
21 | dataset.maxnumelements = 5; |
---|
22 | ARB_alloc(dataset.element, dataset.maxnumelements); |
---|
23 | } |
---|
24 | else if (curelem == dataset.maxnumelements) { |
---|
25 | dataset.maxnumelements *= 2; |
---|
26 | ARB_realloc(dataset.element, dataset.maxnumelements); |
---|
27 | } |
---|
28 | return curelem; |
---|
29 | } |
---|
30 | |
---|
31 | static void set_constant_fields(NA_Sequence *this_elem) { |
---|
32 | this_elem->attr = DEFAULT_X_ATTR; |
---|
33 | this_elem->comments = ARB_strdup("no comments"); |
---|
34 | this_elem->comments_maxlen = 1 + (this_elem->comments_len = strlen(this_elem->comments)); |
---|
35 | this_elem->rmatrix = NULp; |
---|
36 | this_elem->tmatrix = NULp; |
---|
37 | this_elem->col_lut = Default_PROColor_LKUP; |
---|
38 | } |
---|
39 | |
---|
40 | static void AppendNA_and_free(NA_Sequence *this_elem, uchar *& sequfilt) { |
---|
41 | AppendNA((NA_Base *)sequfilt, strlen((const char *)sequfilt), this_elem); |
---|
42 | freenull(sequfilt); |
---|
43 | } |
---|
44 | |
---|
45 | __ATTR__USERESULT static int InsertDatainGDE(NA_Alignment& dataset, |
---|
46 | GBDATA **the_species, |
---|
47 | unsigned char **the_names, |
---|
48 | unsigned char **the_sequences, |
---|
49 | unsigned long numberspecies, |
---|
50 | unsigned long maxalignlen, |
---|
51 | const AP_filter *filter, |
---|
52 | GapCompression compress, |
---|
53 | bool cutoff_stop_codon, |
---|
54 | TypeInfo typeinfo) |
---|
55 | { |
---|
56 | GBDATA *gb_species; |
---|
57 | NA_Sequence *this_elem; |
---|
58 | AP_filter *allocatedFilter = NULp; |
---|
59 | |
---|
60 | gde_assert(contradicted(the_species, the_names)); |
---|
61 | |
---|
62 | if (!filter) { |
---|
63 | allocatedFilter = new AP_filter(maxalignlen); |
---|
64 | filter = allocatedFilter; |
---|
65 | } |
---|
66 | else { |
---|
67 | size_t fl = filter->get_length(); |
---|
68 | if (fl < maxalignlen) { |
---|
69 | aw_message("Warning: Your filter is shorter than the alignment len"); |
---|
70 | maxalignlen = fl; |
---|
71 | } |
---|
72 | } |
---|
73 | |
---|
74 | GB_ERROR error = filter->is_invalid(); |
---|
75 | if (!error) { |
---|
76 | size_t *seqlen = ARB_calloc<size_t>(numberspecies); |
---|
77 | // sequences may have different length |
---|
78 | { |
---|
79 | unsigned long i; |
---|
80 | for (i=0; i<numberspecies; i++) { |
---|
81 | seqlen[i] = strlen((char *)the_sequences[i]); |
---|
82 | } |
---|
83 | } |
---|
84 | |
---|
85 | if (cutoff_stop_codon) { |
---|
86 | unsigned long i; |
---|
87 | fputs("[CUTTING STOP CODONS]\n", stdout); |
---|
88 | for (i=0; i<numberspecies; i++) { |
---|
89 | uchar *seq = the_sequences[i]; |
---|
90 | uchar *stop_codon = (uchar*)strchr((char*)seq, '*'); |
---|
91 | if (stop_codon) { |
---|
92 | long pos = stop_codon-seq; |
---|
93 | long restlen = maxalignlen-pos; |
---|
94 | memset(stop_codon, '.', restlen); |
---|
95 | } |
---|
96 | } |
---|
97 | } |
---|
98 | |
---|
99 | // store (compressed) sequence data in array: |
---|
100 | uchar **sequfilt = ARB_calloc<uchar*>(numberspecies+1); |
---|
101 | GB_alignment_type alitype = GBT_get_alignment_type(dataset.gb_main, dataset.alignment_name); |
---|
102 | |
---|
103 | if (compress==COMPRESS_ALL) { // compress all gaps and filter positions |
---|
104 | long len = filter->get_filtered_length(); |
---|
105 | unsigned long i; |
---|
106 | |
---|
107 | for (i=0; i<numberspecies; i++) { |
---|
108 | ARB_calloc(sequfilt[i], len+1); |
---|
109 | long newcount = 0; |
---|
110 | for (unsigned long col=0; (col<maxalignlen); col++) { |
---|
111 | char c = the_sequences[i][col]; |
---|
112 | if (!c) break; |
---|
113 | if (filter->use_position(col) && !GAP::is_std_gap(c)) { |
---|
114 | sequfilt[i][newcount++] = c; |
---|
115 | } |
---|
116 | } |
---|
117 | } |
---|
118 | } |
---|
119 | else { |
---|
120 | if (compress==COMPRESS_VERTICAL_GAPS || // compress vertical gaps (and '.') |
---|
121 | compress == COMPRESS_NONINFO_COLUMNS) // and additionally all columns containing no info (only N or X) |
---|
122 | { |
---|
123 | size_t i; |
---|
124 | bool isInfo[256]; |
---|
125 | |
---|
126 | for (i=0; i<256; i++) isInfo[i] = true; |
---|
127 | isInfo[UINT('-')] = false; |
---|
128 | isInfo[UINT('.')] = false; |
---|
129 | if (compress == COMPRESS_NONINFO_COLUMNS) { |
---|
130 | switch (alitype) { |
---|
131 | case GB_AT_RNA: |
---|
132 | case GB_AT_DNA: |
---|
133 | isInfo[UINT('N')] = false; |
---|
134 | isInfo[UINT('n')] = false; |
---|
135 | break; |
---|
136 | case GB_AT_AA: |
---|
137 | isInfo[UINT('X')] = false; |
---|
138 | isInfo[UINT('x')] = false; |
---|
139 | break; |
---|
140 | default: |
---|
141 | gde_assert(0); |
---|
142 | break; |
---|
143 | } |
---|
144 | } |
---|
145 | |
---|
146 | bool modified = false; |
---|
147 | char *filterString = filter->to_string(); |
---|
148 | |
---|
149 | for (i=0; i<maxalignlen; i++) { |
---|
150 | if (filter->use_position(i)) { |
---|
151 | bool wantColumn = false; |
---|
152 | |
---|
153 | for (size_t n=0; n<numberspecies; n++) { |
---|
154 | if (i < seqlen[n]) { |
---|
155 | if (isInfo[UINT(the_sequences[n][i])]) { |
---|
156 | wantColumn = true; // have info -> take column |
---|
157 | break; |
---|
158 | } |
---|
159 | } |
---|
160 | } |
---|
161 | if (!wantColumn) { |
---|
162 | filterString[i] = '0'; |
---|
163 | modified = true; |
---|
164 | } |
---|
165 | } |
---|
166 | } |
---|
167 | |
---|
168 | if (modified) { |
---|
169 | size_t len = filter->get_length(); |
---|
170 | |
---|
171 | delete allocatedFilter; |
---|
172 | filter = allocatedFilter = new AP_filter(filterString, NULp, len); |
---|
173 | } |
---|
174 | |
---|
175 | free(filterString); |
---|
176 | } |
---|
177 | |
---|
178 | if (!error) error = filter->is_invalid(); |
---|
179 | |
---|
180 | if (!error) { |
---|
181 | long len = filter->get_filtered_length(); |
---|
182 | size_t i; |
---|
183 | |
---|
184 | for (i=0; i<numberspecies; i++) { |
---|
185 | int c; |
---|
186 | long newcount = 0; |
---|
187 | |
---|
188 | ARB_alloc(sequfilt[i], len+1); |
---|
189 | sequfilt[i][len] = 0; |
---|
190 | memset(sequfilt[i], '.', len); // Generate empty sequences |
---|
191 | |
---|
192 | const uchar *simplify = filter->get_simplify_table(); |
---|
193 | for (size_t col=0; (col<maxalignlen) && (c=the_sequences[i][col]); col++) { |
---|
194 | if (filter->use_position(col)) { |
---|
195 | sequfilt[i][newcount++] = simplify[c]; |
---|
196 | } |
---|
197 | } |
---|
198 | } |
---|
199 | } |
---|
200 | } |
---|
201 | free(seqlen); |
---|
202 | |
---|
203 | if (!error) { |
---|
204 | GB_transaction ta(db_access.gb_main); |
---|
205 | |
---|
206 | char *str = filter->to_string(); |
---|
207 | error = GBT_write_string(db_access.gb_main, AWAR_GDE_EXPORT_FILTER, str); |
---|
208 | free(str); |
---|
209 | } |
---|
210 | |
---|
211 | if (!error) { |
---|
212 | long number = 0; |
---|
213 | int curelem; |
---|
214 | int bad_names = 0; |
---|
215 | |
---|
216 | int elementtype = TEXT; |
---|
217 | int elementtype_init = RNA; |
---|
218 | switch (typeinfo) { |
---|
219 | case UNKNOWN_TYPEINFO: gde_assert(0); |
---|
220 | case BASIC_TYPEINFO: break; |
---|
221 | |
---|
222 | case DETAILED_TYPEINFO: |
---|
223 | switch (alitype) { |
---|
224 | case GB_AT_RNA: elementtype = RNA; break; |
---|
225 | case GB_AT_DNA: elementtype = DNA; break; |
---|
226 | case GB_AT_AA: elementtype = PROTEIN; break; |
---|
227 | default : gde_assert(0); break; |
---|
228 | } |
---|
229 | |
---|
230 | gde_assert(elementtype != TEXT); |
---|
231 | elementtype_init = elementtype; |
---|
232 | break; |
---|
233 | } |
---|
234 | |
---|
235 | if (!error) { |
---|
236 | arb_progress progress("Read data from DB", numberspecies); |
---|
237 | if (the_species) { |
---|
238 | for (gb_species = the_species[number]; gb_species && !error; gb_species = the_species[++number]) { |
---|
239 | curelem = Arbdb_get_curelem(dataset); |
---|
240 | this_elem = &(dataset.element[curelem]); |
---|
241 | |
---|
242 | InitNASeq(this_elem, elementtype_init); |
---|
243 | this_elem->gb_species = gb_species; |
---|
244 | |
---|
245 | #define GET_FIELD_CONTENT(fieldname,buffer,bufsize) do { \ |
---|
246 | gbd = GB_entry(gb_species, fieldname); \ |
---|
247 | if (gbd) { \ |
---|
248 | const char *val = GB_read_char_pntr(gbd); \ |
---|
249 | strcpy_truncate(buffer, val, bufsize); \ |
---|
250 | } \ |
---|
251 | else buffer[0] = 0; \ |
---|
252 | } while(0) |
---|
253 | |
---|
254 | GBDATA *gbd; |
---|
255 | GET_FIELD_CONTENT("name", this_elem->short_name, SIZE_SHORT_NAME); |
---|
256 | GET_FIELD_CONTENT("author", this_elem->authority, SIZE_AUTHORITY); |
---|
257 | GET_FIELD_CONTENT("full_name", this_elem->seq_name, SIZE_SEQ_NAME); |
---|
258 | GET_FIELD_CONTENT("acc", this_elem->id, SIZE_ID); |
---|
259 | |
---|
260 | this_elem->elementtype = elementtype; |
---|
261 | |
---|
262 | if (AWTC_name_quality(this_elem->short_name) != 0) bad_names++; |
---|
263 | AppendNA_and_free(this_elem, sequfilt[number]); |
---|
264 | set_constant_fields(this_elem); |
---|
265 | progress.inc_and_check_user_abort(error); |
---|
266 | } |
---|
267 | } |
---|
268 | else { // use the_names |
---|
269 | unsigned char *species_name; |
---|
270 | |
---|
271 | for (species_name=the_names[number]; species_name && !error; species_name=the_names[++number]) { |
---|
272 | curelem = Arbdb_get_curelem(dataset); |
---|
273 | this_elem = &(dataset.element[curelem]); |
---|
274 | |
---|
275 | InitNASeq(this_elem, elementtype_init); |
---|
276 | this_elem->gb_species = NULp; |
---|
277 | |
---|
278 | strcpy_truncate(this_elem->short_name, (char*)species_name, SIZE_SHORT_NAME); |
---|
279 | this_elem->authority[0] = 0; |
---|
280 | this_elem->seq_name[0] = 0; |
---|
281 | this_elem->id[0] = 0; |
---|
282 | this_elem->elementtype = elementtype; |
---|
283 | |
---|
284 | if (AWTC_name_quality(this_elem->short_name) != 0) bad_names++; |
---|
285 | AppendNA_and_free(this_elem, sequfilt[number]); |
---|
286 | set_constant_fields(this_elem); |
---|
287 | progress.inc_and_check_user_abort(error); |
---|
288 | } |
---|
289 | } |
---|
290 | } |
---|
291 | |
---|
292 | if (!error) { |
---|
293 | if (bad_names) { |
---|
294 | aw_message(GBS_global_string("Problematic names found: %i\n" |
---|
295 | "External program call may fail or produce invalid results.\n" |
---|
296 | "You might want to use 'Species/Synchronize IDs' and read the associated help.", |
---|
297 | bad_names)); |
---|
298 | } |
---|
299 | |
---|
300 | { |
---|
301 | unsigned long i; |
---|
302 | for (i=0; i<dataset.numelements; i++) { |
---|
303 | dataset.maxlen = std::max(dataset.maxlen, |
---|
304 | dataset.element[i].seqlen+dataset.element[i].offset); |
---|
305 | } |
---|
306 | for (i=0; i<numberspecies; i++) { |
---|
307 | delete sequfilt[i]; |
---|
308 | } |
---|
309 | free(sequfilt); |
---|
310 | } |
---|
311 | } |
---|
312 | } |
---|
313 | } |
---|
314 | |
---|
315 | delete allocatedFilter; |
---|
316 | if (error) { |
---|
317 | aw_message(error); |
---|
318 | return 1; // = aborted |
---|
319 | } |
---|
320 | return 0; |
---|
321 | |
---|
322 | } |
---|
323 | |
---|
324 | int ReadArbdb2(NA_Alignment& dataset, AP_filter *filter, GapCompression compress, bool cutoff_stop_codon, TypeInfo typeinfo) { |
---|
325 | // goes to header: __ATTR__USERESULT |
---|
326 | GBDATA **the_species; |
---|
327 | uchar **the_names; |
---|
328 | uchar **the_sequences; |
---|
329 | long maxalignlen; |
---|
330 | long numberspecies = 0; |
---|
331 | |
---|
332 | char *error = db_access.get_sequences(the_species, the_names, the_sequences, |
---|
333 | numberspecies, maxalignlen); |
---|
334 | |
---|
335 | gde_assert(contradicted(the_species, the_names)); |
---|
336 | |
---|
337 | if (error) { |
---|
338 | aw_message(error); |
---|
339 | return 1; |
---|
340 | } |
---|
341 | |
---|
342 | int res = InsertDatainGDE(dataset, NULp, the_names, (unsigned char **)the_sequences, numberspecies, maxalignlen, filter, compress, cutoff_stop_codon, typeinfo); |
---|
343 | for (long i=0; i<numberspecies; i++) { |
---|
344 | delete the_sequences[i]; |
---|
345 | } |
---|
346 | delete the_sequences; |
---|
347 | if (the_species) delete the_species; |
---|
348 | else delete the_names; |
---|
349 | |
---|
350 | return res; |
---|
351 | } |
---|
352 | |
---|
353 | int ReadArbdb(NA_Alignment& dataset, bool marked, AP_filter *filter, GapCompression compress, bool cutoff_stop_codon, TypeInfo typeinfo) { |
---|
354 | // goes to header: __ATTR__USERESULT |
---|
355 | |
---|
356 | GBDATA *gb_species_data = GBT_get_species_data(dataset.gb_main); |
---|
357 | GBDATA **the_species; |
---|
358 | long numberspecies = 0; |
---|
359 | long missingdata = 0; |
---|
360 | |
---|
361 | GBDATA *gb_species; |
---|
362 | if (marked) gb_species = GBT_first_marked_species_rel_species_data(gb_species_data); |
---|
363 | else gb_species = GBT_first_species_rel_species_data(gb_species_data); |
---|
364 | |
---|
365 | while (gb_species) { |
---|
366 | if (GBT_find_sequence(gb_species, dataset.alignment_name)) numberspecies++; |
---|
367 | else missingdata++; |
---|
368 | |
---|
369 | if (marked) gb_species = GBT_next_marked_species(gb_species); |
---|
370 | else gb_species = GBT_next_species(gb_species); |
---|
371 | } |
---|
372 | |
---|
373 | if (missingdata) { |
---|
374 | aw_message(GBS_global_string("Skipped %li species which did not contain data in '%s'", missingdata, dataset.alignment_name)); |
---|
375 | } |
---|
376 | |
---|
377 | ARB_calloc(the_species, numberspecies+1); |
---|
378 | numberspecies = 0; |
---|
379 | |
---|
380 | if (marked) gb_species = GBT_first_marked_species_rel_species_data(gb_species_data); |
---|
381 | else gb_species = GBT_first_species_rel_species_data(gb_species_data); |
---|
382 | |
---|
383 | while (gb_species) { |
---|
384 | if (GBT_find_sequence(gb_species, dataset.alignment_name)) { |
---|
385 | the_species[numberspecies]=gb_species; |
---|
386 | numberspecies++; |
---|
387 | } |
---|
388 | |
---|
389 | if (marked) gb_species = GBT_next_marked_species(gb_species); |
---|
390 | else gb_species = GBT_next_species(gb_species); |
---|
391 | } |
---|
392 | |
---|
393 | long maxalignlen = GBT_get_alignment_len(db_access.gb_main, dataset.alignment_name); |
---|
394 | char **the_sequences; ARB_calloc(the_sequences, numberspecies+1); |
---|
395 | |
---|
396 | for (long i=0; the_species[i]; i++) { |
---|
397 | ARB_alloc(the_sequences[i], maxalignlen+1); |
---|
398 | the_sequences[i][maxalignlen] = 0; |
---|
399 | memset(the_sequences[i], '.', (size_t)maxalignlen); |
---|
400 | const char *data = GB_read_char_pntr(GBT_find_sequence(the_species[i], dataset.alignment_name)); |
---|
401 | int size = strlen(data); |
---|
402 | if (size > maxalignlen) size = (int)maxalignlen; |
---|
403 | strcpy_truncate(the_sequences[i], data, size+1); |
---|
404 | } |
---|
405 | |
---|
406 | int res = InsertDatainGDE(dataset, the_species, NULp, (unsigned char **)the_sequences, numberspecies, maxalignlen, filter, compress, cutoff_stop_codon, typeinfo); |
---|
407 | for (long i=0; i<numberspecies; i++) { |
---|
408 | free(the_sequences[i]); |
---|
409 | } |
---|
410 | free(the_sequences); |
---|
411 | free(the_species); |
---|
412 | |
---|
413 | return res; |
---|
414 | } |
---|
415 | |
---|
416 | int getelem(NA_Sequence *a, int b) { |
---|
417 | if (a->seqlen == 0) return -1; |
---|
418 | |
---|
419 | if (b<a->offset || (b>a->offset+a->seqlen)) { |
---|
420 | switch (a->elementtype) { |
---|
421 | case DNA: |
---|
422 | case RNA: return 0; |
---|
423 | case PROTEIN: |
---|
424 | case TEXT: return '~'; |
---|
425 | case MASK: return '0'; |
---|
426 | default: return '-'; |
---|
427 | } |
---|
428 | } |
---|
429 | |
---|
430 | return a->sequence[b-a->offset]; |
---|
431 | } |
---|
432 | |
---|
433 | void putelem(NA_Sequence *a, int b, NA_Base c) { |
---|
434 | if (b>=(a->offset+a->seqmaxlen)) { |
---|
435 | Warning("Putelem:insert beyond end of sequence space ignored"); |
---|
436 | } |
---|
437 | else if (b >= (a->offset)) { |
---|
438 | a->sequence[b-(a->offset)] = c; |
---|
439 | } |
---|
440 | else { |
---|
441 | NA_Base *temp = ARB_calloc<NA_Base>(a->seqmaxlen + a->offset - b); |
---|
442 | switch (a->elementtype) { |
---|
443 | // Pad out with gap characters fron the point of insertion to the offset |
---|
444 | case MASK: |
---|
445 | for (int j=b; j<a->offset; j++) temp[j-b] = '0'; |
---|
446 | break; |
---|
447 | |
---|
448 | case DNA: |
---|
449 | case RNA: |
---|
450 | for (int j=b; j<a->offset; j++) temp[j-b] = '\0'; |
---|
451 | break; |
---|
452 | |
---|
453 | case PROTEIN: |
---|
454 | for (int j=b; j<a->offset; j++) temp[j-b] = '-'; |
---|
455 | break; |
---|
456 | |
---|
457 | case TEXT: |
---|
458 | default: |
---|
459 | for (int j=b; j<a->offset; j++) temp[j-b] = ' '; |
---|
460 | break; |
---|
461 | } |
---|
462 | |
---|
463 | for (int j=0; j<a->seqmaxlen; j++) temp[j+a->offset-b] = a->sequence[j]; |
---|
464 | free(a->sequence); |
---|
465 | |
---|
466 | a->sequence = temp; |
---|
467 | a->seqlen += (a->offset - b); |
---|
468 | a->seqmaxlen += (a->offset - b); |
---|
469 | a->offset = b; |
---|
470 | a->sequence[0] = c; |
---|
471 | } |
---|
472 | } |
---|
473 | |
---|