Context Navigation

source: tags/arb-6.0/ARB_GDE/GDE_arbdb_io.cxx

Visit:

Last change on this file was 12267, checked in by westram, 11 years ago
full update from 'trunk' into 'rc' because i was (nearly) only fixing bugs in the last 5 weeks adds: log:branches/fix@11985,11990:11997,12008:12013,12023:12042,12049:12053,12058:12137,12147:12156 log:branches/gcc@12076:12079 log:branches/tree@12004:12019,12116:12120 log:trunk@11971:11972,11974:11980,11982:11989,11991:12260
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 17.9 KB

Line
1	#include "GDE_def.h"
2	#include "GDE_proto.h"
3
4	#include <aw_msg.hxx>
5	#include <arb_progress.h>
6	#include <AW_rename.hxx>
7	#include <AP_filter.hxx>
8	#include <aw_awar_defs.hxx>
9
10	#include <algorithm>
11
12	// AISC_MKPT_PROMOTE:#ifndef GDE_EXTGLOB_H
13	// AISC_MKPT_PROMOTE:#include "GDE_extglob.h"
14	// AISC_MKPT_PROMOTE:#endif
15
16	typedef unsigned int UINT;
17
18	static int Arbdb_get_curelem(NA_Alignment *dataset)
19	{
20	int curelem;
21	curelem = dataset->numelements++;
22	if (curelem == 0) {
23	dataset->element = (NA_Sequence *) Calloc(5, sizeof(NA_Sequence));
24	dataset->maxnumelements = 5;
25	}
26	else if (curelem == dataset->maxnumelements) {
27	(dataset->maxnumelements) *= 2;
28	dataset->element = (NA_Sequence ) Realloc((char )dataset->element,
29	dataset->maxnumelements * sizeof(NA_Sequence));
30	}
31	return curelem;
32	}
33
34	extern int Default_PROColor_LKUP[], Default_NAColor_LKUP[];
35
36	static void set_constant_fields(NA_Sequence *this_elem) {
37	this_elem->attr = DEFAULT_X_ATTR;
38	this_elem->comments = strdup("no comments");
39	this_elem->comments_maxlen = 1 + (this_elem->comments_len = strlen(this_elem->comments));
40	this_elem->rmatrix = NULL;
41	this_elem->tmatrix = NULL;
42	this_elem->col_lut = Default_PROColor_LKUP;
43	}
44
45	static void AppendNA_and_free(NA_Sequence this_elem, uchar & sequfilt) {
46	AppendNA((NA_Base )sequfilt, strlen((const char )sequfilt), this_elem);
47	freenull(sequfilt);
48	}
49
50	__ATTR__USERESULT static int InsertDatainGDE(NA_Alignment *dataset,
51	GBDATA **the_species,
52	unsigned char **the_names,
53	unsigned char **the_sequences,
54	unsigned long numberspecies,
55	unsigned long maxalignlen,
56	const AP_filter *filter,
57	GapCompression compress,
58	bool cutoff_stop_codon,
59	TypeInfo typeinfo)
60	{
61	GBDATA *gb_species;
62	NA_Sequence *this_elem;
63	AP_filter *allocatedFilter = 0;
64
65	gde_assert(contradicted(the_species, the_names));
66
67	if (filter==0) {
68	allocatedFilter = new AP_filter(maxalignlen);
69	filter = allocatedFilter;
70	}
71	else {
72	size_t fl = filter->get_length();
73	if (fl < maxalignlen) {
74	aw_message("Warning: Your filter is shorter than the alignment len");
75	maxalignlen = fl;
76	}
77	}
78
79	GB_ERROR error = filter->is_invalid();
80	if (!error) {
81	size_t seqlen=(size_t )calloc((unsigned int)numberspecies, sizeof(size_t));
82	// sequences may have different length
83	{
84	unsigned long i;
85	for (i=0; i<numberspecies; i++) {
86	seqlen[i] = strlen((char *)the_sequences[i]);
87	}
88	}
89
90	if (cutoff_stop_codon) {
91	unsigned long i;
92	fputs("[CUTTING STOP CODONS]\n", stdout);
93	for (i=0; i<numberspecies; i++) {
94	uchar *seq = the_sequences[i];
95	uchar stop_codon = (uchar)strchr((char)seq, '');
96	if (stop_codon) {
97	long pos = stop_codon-seq;
98	long restlen = maxalignlen-pos;
99	memset(stop_codon, '.', restlen);
100	}
101	}
102	}
103
104	// store (compressed) sequence data in array:
105	uchar sequfilt = (uchar)calloc((unsigned int)numberspecies+1, sizeof(uchar*));
106	GB_alignment_type alitype = GBT_get_alignment_type(dataset->gb_main, dataset->alignment_name);
107
108	if (compress==COMPRESS_ALL) { // compress all gaps and filter positions
109	long len = filter->get_filtered_length();
110	unsigned long i;
111
112	for (i=0; i<numberspecies; i++) {
113	sequfilt[i] = (uchar*)calloc((unsigned int)len+1, sizeof(uchar));
114	long newcount = 0;
115	for (unsigned long col=0; (col<maxalignlen); col++) {
116	char c = the_sequences[i][col];
117	if (!c) break;
118	if ((filter->use_position(col)) && (c!='-') && (c!='.')) {
119	sequfilt[i][newcount++] = c;
120	}
121	}
122	}
123	}
124	else {
125	if (compress==COMPRESS_VERTICAL_GAPS \|\| // compress vertical gaps (and '.')
126	compress == COMPRESS_NONINFO_COLUMNS) // and additionally all columns containing no info (only N or X)
127	{
128	size_t i;
129	bool isInfo[256];
130
131	for (i=0; i<256; i++) isInfo[i] = true;
132	isInfo[UINT('-')] = false;
133	isInfo[UINT('.')] = false;
134	if (compress == COMPRESS_NONINFO_COLUMNS) {
135	switch (alitype) {
136	case GB_AT_RNA:
137	case GB_AT_DNA:
138	isInfo[UINT('N')] = false;
139	isInfo[UINT('n')] = false;
140	break;
141	case GB_AT_AA:
142	isInfo[UINT('X')] = false;
143	isInfo[UINT('x')] = false;
144	break;
145	default:
146	gde_assert(0);
147	break;
148	}
149	}
150
151	bool modified = false;
152	char *filterString = filter->to_string();
153
154	for (i=0; i<maxalignlen; i++) {
155	if (filter->use_position(i)) {
156	bool wantColumn = false;
157
158	for (size_t n=0; n<numberspecies; n++) {
159	if (i < seqlen[n]) {
160	if (isInfo[UINT(the_sequences[n][i])]) {
161	wantColumn = true; // have info -> take column
162	break;
163	}
164	}
165	}
166	if (!wantColumn) {
167	filterString[i] = '0';
168	modified = true;
169	}
170	}
171	}
172
173	if (modified) {
174	size_t len = filter->get_length();
175
176	delete allocatedFilter;
177	filter = allocatedFilter = new AP_filter(filterString, NULL, len);
178	}
179
180	free(filterString);
181	}
182
183	if (!error) error = filter->is_invalid();
184
185	if (!error) {
186	long len = filter->get_filtered_length();
187	size_t i;
188
189	for (i=0; i<numberspecies; i++) {
190	int c;
191	long newcount = 0;
192
193	sequfilt[i] = (uchar*)malloc((unsigned int)len+1);
194	sequfilt[i][len] = 0;
195	memset(sequfilt[i], '.', len); // Generate empty sequences
196
197	const uchar *simplify = filter->get_simplify_table();
198	for (size_t col=0; (col<maxalignlen) && (c=the_sequences[i][col]); col++) {
199	if (filter->use_position(col)) {
200	sequfilt[i][newcount++] = simplify[c];
201	}
202	}
203	}
204	}
205	}
206	free(seqlen);
207
208	if (!error) {
209	GB_transaction ta(db_access.gb_main);
210
211	char *str = filter->to_string();
212	error = GBT_write_string(db_access.gb_main, AWAR_GDE_EXPORT_FILTER, str);
213	free(str);
214	}
215
216	if (!error) {
217	long number = 0;
218	int curelem;
219	int bad_names = 0;
220
221	int elementtype = TEXT;
222	int elementtype_init = RNA;
223	switch (typeinfo) {
224	case UNKNOWN_TYPEINFO: gde_assert(0);
225	case BASIC_TYPEINFO: break;
226
227	case DETAILED_TYPEINFO:
228	switch (alitype) {
229	case GB_AT_RNA: elementtype = RNA; break;
230	case GB_AT_DNA: elementtype = DNA; break;
231	case GB_AT_AA: elementtype = PROTEIN; break;
232	default : gde_assert(0); break;
233	}
234
235	gde_assert(elementtype != TEXT);
236	elementtype_init = elementtype;
237	break;
238	}
239
240	if (!error) {
241	arb_progress progress("Read data from DB", numberspecies);
242	if (the_species) {
243	for (gb_species = the_species[number]; gb_species && !error; gb_species = the_species[++number]) {
244	curelem = Arbdb_get_curelem(dataset);
245	this_elem = &(dataset->element[curelem]);
246
247	InitNASeq(this_elem, elementtype_init);
248	this_elem->gb_species = gb_species;
249
250	#define GET_FIELD_CONTENT(fieldname,buffer,bufsize) do { \
251	gbd = GB_entry(gb_species, fieldname); \
252	if (gbd) { \
253	const char *val = GB_read_char_pntr(gbd); \
254	strncpy_terminate(buffer, val, bufsize); \
255	} \
256	else buffer[0] = 0; \
257	} while(0)
258
259	GBDATA *gbd;
260	GET_FIELD_CONTENT("name", this_elem->short_name, SIZE_SHORT_NAME);
261	GET_FIELD_CONTENT("author", this_elem->authority, SIZE_AUTHORITY);
262	GET_FIELD_CONTENT("full_name", this_elem->seq_name, SIZE_SEQ_NAME);
263	GET_FIELD_CONTENT("acc", this_elem->id, SIZE_ID);
264
265	this_elem->elementtype = elementtype;
266
267	if (AWTC_name_quality(this_elem->short_name) != 0) bad_names++;
268	AppendNA_and_free(this_elem, sequfilt[number]);
269	set_constant_fields(this_elem);
270	progress.inc_and_check_user_abort(error);
271	}
272	}
273	else { // use the_names
274	unsigned char *species_name;
275
276	for (species_name=the_names[number]; species_name && !error; species_name=the_names[++number]) {
277	curelem = Arbdb_get_curelem(dataset);
278	this_elem = &(dataset->element[curelem]);
279
280	InitNASeq(this_elem, elementtype_init);
281	this_elem->gb_species = 0;
282
283	strncpy(this_elem->short_name, (char*)species_name, SIZE_SHORT_NAME);
284	this_elem->authority[0] = 0;
285	this_elem->seq_name[0] = 0;
286	this_elem->id[0] = 0;
287	this_elem->elementtype = elementtype;
288
289	if (AWTC_name_quality(this_elem->short_name) != 0) bad_names++;
290	AppendNA_and_free(this_elem, sequfilt[number]);
291	set_constant_fields(this_elem);
292	progress.inc_and_check_user_abort(error);
293	}
294	}
295	}
296
297	if (!error) {
298	if (bad_names) {
299	aw_message(GBS_global_string("Problematic names found: %i\n"
300	"External program call may fail or produce invalid results.\n"
301	"You might want to use 'Species/Synchronize IDs' and read the associated help.",
302	bad_names));
303	}
304
305	{
306	unsigned long i;
307	for (i=0; i<dataset->numelements; i++) {
308	dataset->maxlen = std::max(dataset->maxlen,
309	dataset->element[i].seqlen+dataset->element[i].offset);
310	}
311	for (i=0; i<numberspecies; i++)
312	{
313	delete sequfilt[i];
314	}
315	free(sequfilt);
316	}
317	}
318	}
319	}
320
321	delete allocatedFilter;
322	if (error) {
323	aw_message(error);
324	return 1; // = aborted
325	}
326	return 0;
327
328	}
329
330	int ReadArbdb2(NA_Alignment dataset, AP_filter filter, GapCompression compress, bool cutoff_stop_codon, TypeInfo typeinfo) {
331	// goes to header: __ATTR__USERESULT
332	dataset->gb_main = db_access.gb_main;
333
334	GBDATA **the_species;
335	uchar **the_names;
336	uchar **the_sequences;
337	long maxalignlen;
338	long numberspecies = 0;
339
340	char *error = db_access.get_sequences(db_access.client_data,
341	the_species, the_names, the_sequences,
342	numberspecies, maxalignlen);
343
344	gde_assert(contradicted(the_species, the_names));
345
346	if (error) {
347	aw_message(error);
348	return 1;
349	}
350
351	int res = InsertDatainGDE(dataset, 0, the_names, (unsigned char **)the_sequences, numberspecies, maxalignlen, filter, compress, cutoff_stop_codon, typeinfo);
352	for (long i=0; i<numberspecies; i++) {
353	delete the_sequences[i];
354	}
355	delete the_sequences;
356	if (the_species) delete the_species;
357	else delete the_names;
358
359	return res;
360	}
361
362	int ReadArbdb(NA_Alignment dataset, bool marked, AP_filter filter, GapCompression compress, bool cutoff_stop_codon, TypeInfo typeinfo) {
363	// goes to header: __ATTR__USERESULT
364	dataset->gb_main = db_access.gb_main;
365
366	// Alignment choosen ?
367
368	GBDATA *gb_species_data = GBT_get_species_data(dataset->gb_main);
369	GBDATA **the_species;
370	long numberspecies = 0;
371	long missingdata = 0;
372
373	GBDATA *gb_species;
374	if (marked) gb_species = GBT_first_marked_species_rel_species_data(gb_species_data);
375	else gb_species = GBT_first_species_rel_species_data(gb_species_data);
376
377	while (gb_species) {
378	if (GBT_read_sequence(gb_species, dataset->alignment_name)) numberspecies++;
379	else missingdata++;
380
381	if (marked) gb_species = GBT_next_marked_species(gb_species);
382	else gb_species = GBT_next_species(gb_species);
383	}
384
385	if (missingdata) {
386	aw_message(GBS_global_string("Skipped %li species which did not contain data in '%s'", missingdata, dataset->alignment_name));
387	}
388
389	the_species = (GBDATA*)calloc((unsigned int)numberspecies+1, sizeof(GBDATA));
390	numberspecies = 0;
391
392	if (marked) gb_species = GBT_first_marked_species_rel_species_data(gb_species_data);
393	else gb_species = GBT_first_species_rel_species_data(gb_species_data);
394
395	while (gb_species) {
396	if (GBT_read_sequence(gb_species, dataset->alignment_name)) {
397	the_species[numberspecies]=gb_species;
398	numberspecies++;
399	}
400
401	if (marked) gb_species = GBT_next_marked_species(gb_species);
402	else gb_species = GBT_next_species(gb_species);
403	}
404
405	long maxalignlen = GBT_get_alignment_len(db_access.gb_main, dataset->alignment_name);
406	char the_sequences = (char)calloc((unsigned int)numberspecies+1, sizeof(char*));
407
408	for (long i=0; the_species[i]; i++) {
409	the_sequences[i] = (char *)malloc((size_t)maxalignlen+1);
410	the_sequences[i][maxalignlen] = 0;
411	memset(the_sequences[i], '.', (size_t)maxalignlen);
412	const char *data = GB_read_char_pntr(GBT_read_sequence(the_species[i], dataset->alignment_name));
413	int size = strlen(data);
414	if (size > maxalignlen) size = (int)maxalignlen;
415	strncpy_terminate(the_sequences[i], data, size+1);
416	}
417
418	int res = InsertDatainGDE(dataset, the_species, 0, (unsigned char **)the_sequences, numberspecies, maxalignlen, filter, compress, cutoff_stop_codon, typeinfo);
419	for (long i=0; i<numberspecies; i++) {
420	free(the_sequences[i]);
421	}
422	free(the_sequences);
423	free(the_species);
424
425	return res;
426	}
427
428	int getelem(NA_Sequence *a, int b) {
429	if (a->seqlen == 0) return -1;
430
431	if (b<a->offset \|\| (b>a->offset+a->seqlen)) {
432	switch (a->elementtype) {
433	case DNA:
434	case RNA: return 0;
435	case PROTEIN:
436	case TEXT: return '~';
437	case MASK: return '0';
438	default: return '-';
439	}
440	}
441
442	return a->sequence[b-a->offset];
443	}
444
445	void putelem(NA_Sequence *a, int b, NA_Base c) {
446	if (b>=(a->offset+a->seqmaxlen)) {
447	Warning("Putelem:insert beyond end of sequence space ignored");
448	}
449	else if (b >= (a->offset)) {
450	a->sequence[b-(a->offset)] = c;
451	}
452	else {
453	NA_Base temp = (NA_Base)Calloc(a->seqmaxlen+a->offset-b, sizeof(NA_Base));
454	switch (a->elementtype) {
455	// Pad out with gap characters fron the point of insertion to the offset
456	case MASK:
457	for (int j=b; j<a->offset; j++) temp[j-b] = '0';
458	break;
459
460	case DNA:
461	case RNA:
462	for (int j=b; j<a->offset; j++) temp[j-b] = '\0';
463	break;
464
465	case PROTEIN:
466	for (int j=b; j<a->offset; j++) temp[j-b] = '-';
467	break;
468
469	case TEXT:
470	default:
471	for (int j=b; j<a->offset; j++) temp[j-b] = ' ';
472	break;
473	}
474
475	for (int j=0; j<a->seqmaxlen; j++) temp[j+a->offset-b] = a->sequence[j];
476	Cfree((char*)a->sequence);
477
478	a->sequence = temp;
479	a->seqlen += (a->offset - b);
480	a->seqmaxlen += (a->offset - b);
481	a->offset = b;
482	a->sequence[0] = c;
483	}
484	}
485

Note: See TracBrowser for help on using the repository browser.

Download in other formats: