Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

GDE_arbdb_io.cxx

Visit:

Last change on this file was 18126, checked in by westram, 6 years ago
full update from 'trunk' into 'alilink' adds: log:branches/cleanup@17627:17650,18105:18112 log:branches/clone@17813:17835 log:branches/config@17733:17737 log:branches/dbui@17881:17889 log:branches/fix@17380:17395,17480:17592,17597:17617 log:branches/fts@17653:18088,18090:18094 log:branches/gcc@17850:17876 log:branches/group@17162:17341 log:branches/gui@17754:18092 log:branches/mix@17631:17633,17637:17640 log:branches/optional@17481:17568 log:branches/progress@17195:17259 log:branches/svalues@17357:17477 log:trunk@17179:18123
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 17.3 KB

Line
1	#include "GDE_proto.h"
2
3	#include <aw_msg.hxx>
4	#include <AW_rename.hxx>
5	#include <AP_filter.hxx>
6	#include <aw_awar_defs.hxx>
7	#include <arb_progress.h>
8	#include <arb_global_defs.h>
9
10	#include <algorithm>
11
12	// AISC_MKPT_PROMOTE:#ifndef GDE_EXTGLOB_H
13	// AISC_MKPT_PROMOTE:#include "GDE_extglob.h"
14	// AISC_MKPT_PROMOTE:#endif
15
16	typedef unsigned int UINT;
17
18	int Arbdb_get_curelem(NA_Alignment& dataset) {
19	int curelem = dataset.numelements++;
20	if (curelem == 0) {
21	dataset.maxnumelements = 5;
22	ARB_alloc(dataset.element, dataset.maxnumelements);
23	}
24	else if (curelem == dataset.maxnumelements) {
25	dataset.maxnumelements *= 2;
26	ARB_realloc(dataset.element, dataset.maxnumelements);
27	}
28	return curelem;
29	}
30
31	static void set_constant_fields(NA_Sequence *this_elem) {
32	this_elem->attr = DEFAULT_X_ATTR;
33	this_elem->comments = ARB_strdup("no comments");
34	this_elem->comments_maxlen = 1 + (this_elem->comments_len = strlen(this_elem->comments));
35	this_elem->rmatrix = NULp;
36	this_elem->tmatrix = NULp;
37	this_elem->col_lut = Default_PROColor_LKUP;
38	}
39
40	static void AppendNA_and_free(NA_Sequence this_elem, uchar & sequfilt) {
41	AppendNA((NA_Base )sequfilt, strlen((const char )sequfilt), this_elem);
42	freenull(sequfilt);
43	}
44
45	__ATTR__USERESULT static int InsertDatainGDE(NA_Alignment& dataset,
46	GBDATA **the_species,
47	unsigned char **the_names,
48	unsigned char **the_sequences,
49	unsigned long numberspecies,
50	unsigned long maxalignlen,
51	const AP_filter *filter,
52	GapCompression compress,
53	bool cutoff_stop_codon,
54	TypeInfo typeinfo)
55	{
56	GBDATA *gb_species;
57	NA_Sequence *this_elem;
58	AP_filter *allocatedFilter = NULp;
59
60	gde_assert(contradicted(the_species, the_names));
61
62	if (!filter) {
63	allocatedFilter = new AP_filter(maxalignlen);
64	filter = allocatedFilter;
65	}
66	else {
67	size_t fl = filter->get_length();
68	if (fl < maxalignlen) {
69	aw_message("Warning: Your filter is shorter than the alignment len");
70	maxalignlen = fl;
71	}
72	}
73
74	GB_ERROR error = filter->is_invalid();
75	if (!error) {
76	size_t *seqlen = ARB_calloc<size_t>(numberspecies);
77	// sequences may have different length
78	{
79	unsigned long i;
80	for (i=0; i<numberspecies; i++) {
81	seqlen[i] = strlen((char *)the_sequences[i]);
82	}
83	}
84
85	if (cutoff_stop_codon) {
86	unsigned long i;
87	fputs("[CUTTING STOP CODONS]\n", stdout);
88	for (i=0; i<numberspecies; i++) {
89	uchar *seq = the_sequences[i];
90	uchar stop_codon = (uchar)strchr((char)seq, '');
91	if (stop_codon) {
92	long pos = stop_codon-seq;
93	long restlen = maxalignlen-pos;
94	memset(stop_codon, '.', restlen);
95	}
96	}
97	}
98
99	// store (compressed) sequence data in array:
100	uchar *sequfilt = ARB_calloc<uchar>(numberspecies+1);
101	GB_alignment_type alitype = GBT_get_alignment_type(dataset.gb_main, dataset.alignment_name);
102
103	if (compress==COMPRESS_ALL) { // compress all gaps and filter positions
104	long len = filter->get_filtered_length();
105	unsigned long i;
106
107	for (i=0; i<numberspecies; i++) {
108	ARB_calloc(sequfilt[i], len+1);
109	long newcount = 0;
110	for (unsigned long col=0; (col<maxalignlen); col++) {
111	char c = the_sequences[i][col];
112	if (!c) break;
113	if (filter->use_position(col) && !GAP::is_std_gap(c)) {
114	sequfilt[i][newcount++] = c;
115	}
116	}
117	}
118	}
119	else {
120	if (compress==COMPRESS_VERTICAL_GAPS \|\| // compress vertical gaps (and '.')
121	compress == COMPRESS_NONINFO_COLUMNS) // and additionally all columns containing no info (only N or X)
122	{
123	size_t i;
124	bool isInfo[256];
125
126	for (i=0; i<256; i++) isInfo[i] = true;
127	isInfo[UINT('-')] = false;
128	isInfo[UINT('.')] = false;
129	if (compress == COMPRESS_NONINFO_COLUMNS) {
130	switch (alitype) {
131	case GB_AT_RNA:
132	case GB_AT_DNA:
133	isInfo[UINT('N')] = false;
134	isInfo[UINT('n')] = false;
135	break;
136	case GB_AT_AA:
137	isInfo[UINT('X')] = false;
138	isInfo[UINT('x')] = false;
139	break;
140	default:
141	gde_assert(0);
142	break;
143	}
144	}
145
146	bool modified = false;
147	char *filterString = filter->to_string();
148
149	for (i=0; i<maxalignlen; i++) {
150	if (filter->use_position(i)) {
151	bool wantColumn = false;
152
153	for (size_t n=0; n<numberspecies; n++) {
154	if (i < seqlen[n]) {
155	if (isInfo[UINT(the_sequences[n][i])]) {
156	wantColumn = true; // have info -> take column
157	break;
158	}
159	}
160	}
161	if (!wantColumn) {
162	filterString[i] = '0';
163	modified = true;
164	}
165	}
166	}
167
168	if (modified) {
169	size_t len = filter->get_length();
170
171	delete allocatedFilter;
172	filter = allocatedFilter = new AP_filter(filterString, NULp, len);
173	}
174
175	free(filterString);
176	}
177
178	if (!error) error = filter->is_invalid();
179
180	if (!error) {
181	long len = filter->get_filtered_length();
182	size_t i;
183
184	for (i=0; i<numberspecies; i++) {
185	int c;
186	long newcount = 0;
187
188	ARB_alloc(sequfilt[i], len+1);
189	sequfilt[i][len] = 0;
190	memset(sequfilt[i], '.', len); // Generate empty sequences
191
192	const uchar *simplify = filter->get_simplify_table();
193	for (size_t col=0; (col<maxalignlen) && (c=the_sequences[i][col]); col++) {
194	if (filter->use_position(col)) {
195	sequfilt[i][newcount++] = simplify[c];
196	}
197	}
198	}
199	}
200	}
201	free(seqlen);
202
203	if (!error) {
204	GB_transaction ta(db_access.gb_main);
205
206	char *str = filter->to_string();
207	error = GBT_write_string(db_access.gb_main, AWAR_GDE_EXPORT_FILTER, str);
208	free(str);
209	}
210
211	if (!error) {
212	long number = 0;
213	int curelem;
214	int bad_names = 0;
215
216	int elementtype = TEXT;
217	int elementtype_init = RNA;
218	switch (typeinfo) {
219	case UNKNOWN_TYPEINFO: gde_assert(0);
220	case BASIC_TYPEINFO: break;
221
222	case DETAILED_TYPEINFO:
223	switch (alitype) {
224	case GB_AT_RNA: elementtype = RNA; break;
225	case GB_AT_DNA: elementtype = DNA; break;
226	case GB_AT_AA: elementtype = PROTEIN; break;
227	default : gde_assert(0); break;
228	}
229
230	gde_assert(elementtype != TEXT);
231	elementtype_init = elementtype;
232	break;
233	}
234
235	if (!error) {
236	arb_progress progress("Read data from DB", numberspecies);
237	if (the_species) {
238	for (gb_species = the_species[number]; gb_species && !error; gb_species = the_species[++number]) {
239	curelem = Arbdb_get_curelem(dataset);
240	this_elem = &(dataset.element[curelem]);
241
242	InitNASeq(this_elem, elementtype_init);
243	this_elem->gb_species = gb_species;
244
245	#define GET_FIELD_CONTENT(fieldname,buffer,bufsize) do { \
246	gbd = GB_entry(gb_species, fieldname); \
247	if (gbd) { \
248	const char *val = GB_read_char_pntr(gbd); \
249	strcpy_truncate(buffer, val, bufsize); \
250	} \
251	else buffer[0] = 0; \
252	} while(0)
253
254	GBDATA *gbd;
255	GET_FIELD_CONTENT("name", this_elem->short_name, SIZE_SHORT_NAME);
256	GET_FIELD_CONTENT("author", this_elem->authority, SIZE_AUTHORITY);
257	GET_FIELD_CONTENT("full_name", this_elem->seq_name, SIZE_SEQ_NAME);
258	GET_FIELD_CONTENT("acc", this_elem->id, SIZE_ID);
259
260	this_elem->elementtype = elementtype;
261
262	if (AWTC_name_quality(this_elem->short_name) != 0) bad_names++;
263	AppendNA_and_free(this_elem, sequfilt[number]);
264	set_constant_fields(this_elem);
265	progress.inc_and_check_user_abort(error);
266	}
267	}
268	else { // use the_names
269	unsigned char *species_name;
270
271	for (species_name=the_names[number]; species_name && !error; species_name=the_names[++number]) {
272	curelem = Arbdb_get_curelem(dataset);
273	this_elem = &(dataset.element[curelem]);
274
275	InitNASeq(this_elem, elementtype_init);
276	this_elem->gb_species = NULp;
277
278	strcpy_truncate(this_elem->short_name, (char*)species_name, SIZE_SHORT_NAME);
279	this_elem->authority[0] = 0;
280	this_elem->seq_name[0] = 0;
281	this_elem->id[0] = 0;
282	this_elem->elementtype = elementtype;
283
284	if (AWTC_name_quality(this_elem->short_name) != 0) bad_names++;
285	AppendNA_and_free(this_elem, sequfilt[number]);
286	set_constant_fields(this_elem);
287	progress.inc_and_check_user_abort(error);
288	}
289	}
290	}
291
292	if (!error) {
293	if (bad_names) {
294	aw_message(GBS_global_string("Problematic names found: %i\n"
295	"External program call may fail or produce invalid results.\n"
296	"You might want to use 'Species/Synchronize IDs' and read the associated help.",
297	bad_names));
298	}
299
300	{
301	unsigned long i;
302	for (i=0; i<dataset.numelements; i++) {
303	dataset.maxlen = std::max(dataset.maxlen,
304	dataset.element[i].seqlen+dataset.element[i].offset);
305	}
306	for (i=0; i<numberspecies; i++) {
307	delete sequfilt[i];
308	}
309	free(sequfilt);
310	}
311	}
312	}
313	}
314
315	delete allocatedFilter;
316	if (error) {
317	aw_message(error);
318	return 1; // = aborted
319	}
320	return 0;
321
322	}
323
324	int ReadArbdb2(NA_Alignment& dataset, AP_filter *filter, GapCompression compress, bool cutoff_stop_codon, TypeInfo typeinfo) {
325	// goes to header: __ATTR__USERESULT
326	GBDATA **the_species;
327	uchar **the_names;
328	uchar **the_sequences;
329	long maxalignlen;
330	long numberspecies = 0;
331
332	char *error = db_access.get_sequences(the_species, the_names, the_sequences,
333	numberspecies, maxalignlen);
334
335	gde_assert(contradicted(the_species, the_names));
336
337	if (error) {
338	aw_message(error);
339	return 1;
340	}
341
342	int res = InsertDatainGDE(dataset, NULp, the_names, (unsigned char **)the_sequences, numberspecies, maxalignlen, filter, compress, cutoff_stop_codon, typeinfo);
343	for (long i=0; i<numberspecies; i++) {
344	delete the_sequences[i];
345	}
346	delete the_sequences;
347	if (the_species) delete the_species;
348	else delete the_names;
349
350	return res;
351	}
352
353	int ReadArbdb(NA_Alignment& dataset, bool marked, AP_filter *filter, GapCompression compress, bool cutoff_stop_codon, TypeInfo typeinfo) {
354	// goes to header: __ATTR__USERESULT
355
356	GBDATA *gb_species_data = GBT_get_species_data(dataset.gb_main);
357	GBDATA **the_species;
358	long numberspecies = 0;
359	long missingdata = 0;
360
361	GBDATA *gb_species;
362	if (marked) gb_species = GBT_first_marked_species_rel_species_data(gb_species_data);
363	else gb_species = GBT_first_species_rel_species_data(gb_species_data);
364
365	while (gb_species) {
366	if (GBT_find_sequence(gb_species, dataset.alignment_name)) numberspecies++;
367	else missingdata++;
368
369	if (marked) gb_species = GBT_next_marked_species(gb_species);
370	else gb_species = GBT_next_species(gb_species);
371	}
372
373	if (missingdata) {
374	aw_message(GBS_global_string("Skipped %li species which did not contain data in '%s'", missingdata, dataset.alignment_name));
375	}
376
377	ARB_calloc(the_species, numberspecies+1);
378	numberspecies = 0;
379
380	if (marked) gb_species = GBT_first_marked_species_rel_species_data(gb_species_data);
381	else gb_species = GBT_first_species_rel_species_data(gb_species_data);
382
383	while (gb_species) {
384	if (GBT_find_sequence(gb_species, dataset.alignment_name)) {
385	the_species[numberspecies]=gb_species;
386	numberspecies++;
387	}
388
389	if (marked) gb_species = GBT_next_marked_species(gb_species);
390	else gb_species = GBT_next_species(gb_species);
391	}
392
393	long maxalignlen = GBT_get_alignment_len(db_access.gb_main, dataset.alignment_name);
394	char **the_sequences; ARB_calloc(the_sequences, numberspecies+1);
395
396	for (long i=0; the_species[i]; i++) {
397	ARB_alloc(the_sequences[i], maxalignlen+1);
398	the_sequences[i][maxalignlen] = 0;
399	memset(the_sequences[i], '.', (size_t)maxalignlen);
400	const char *data = GB_read_char_pntr(GBT_find_sequence(the_species[i], dataset.alignment_name));
401	int size = strlen(data);
402	if (size > maxalignlen) size = (int)maxalignlen;
403	strcpy_truncate(the_sequences[i], data, size+1);
404	}
405
406	int res = InsertDatainGDE(dataset, the_species, NULp, (unsigned char **)the_sequences, numberspecies, maxalignlen, filter, compress, cutoff_stop_codon, typeinfo);
407	for (long i=0; i<numberspecies; i++) {
408	free(the_sequences[i]);
409	}
410	free(the_sequences);
411	free(the_species);
412
413	return res;
414	}
415
416	int getelem(NA_Sequence *a, int b) {
417	if (a->seqlen == 0) return -1;
418
419	if (b<a->offset \|\| (b>a->offset+a->seqlen)) {
420	switch (a->elementtype) {
421	case DNA:
422	case RNA: return 0;
423	case PROTEIN:
424	case TEXT: return '~';
425	case MASK: return '0';
426	default: return '-';
427	}
428	}
429
430	return a->sequence[b-a->offset];
431	}
432
433	void putelem(NA_Sequence *a, int b, NA_Base c) {
434	if (b>=(a->offset+a->seqmaxlen)) {
435	Warning("Putelem:insert beyond end of sequence space ignored");
436	}
437	else if (b >= (a->offset)) {
438	a->sequence[b-(a->offset)] = c;
439	}
440	else {
441	NA_Base *temp = ARB_calloc<NA_Base>(a->seqmaxlen + a->offset - b);
442	switch (a->elementtype) {
443	// Pad out with gap characters fron the point of insertion to the offset
444	case MASK:
445	for (int j=b; j<a->offset; j++) temp[j-b] = '0';
446	break;
447
448	case DNA:
449	case RNA:
450	for (int j=b; j<a->offset; j++) temp[j-b] = '\0';
451	break;
452
453	case PROTEIN:
454	for (int j=b; j<a->offset; j++) temp[j-b] = '-';
455	break;
456
457	case TEXT:
458	default:
459	for (int j=b; j<a->offset; j++) temp[j-b] = ' ';
460	break;
461	}
462
463	for (int j=0; j<a->seqmaxlen; j++) temp[j+a->offset-b] = a->sequence[j];
464	free(a->sequence);
465
466	a->sequence = temp;
467	a->seqlen += (a->offset - b);
468	a->seqmaxlen += (a->offset - b);
469	a->offset = b;
470	a->sequence[0] = c;
471	}
472	}
473

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: branches/alilink/ARB_GDE/GDE_arbdb_io.cxx

Download in other formats: