Context Navigation

source: branches/port5/ARB_GDE/GDE_arbdb_io.cxx

Visit:

Last change on this file was 6143, checked in by westram, 16 years ago
backport [6141] (parts changing code, but only strings and comments)
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 15.7 KB

Line
1	#include <stdio.h>
2	#include <stdlib.h>
3	#include <string.h>
4	#include <memory.h>
5
6	#include <arbdb.h>
7	#include <arbdbt.h>
8	#include <aw_root.hxx>
9	#include <aw_device.hxx>
10	#include <aw_window.hxx>
11	#include <awt.hxx>
12	#include <aw_awars.hxx>
13	#include <awt_tree.hxx>
14
15	#include "gde.hxx"
16	#include "GDE_def.h"
17	#include "GDE_menu.h"
18	#include "GDE_extglob.h"
19	#include "AW_rename.hxx"
20
21	typedef unsigned int UINT;
22
23	static int Arbdb_get_curelem(NA_Alignment *dataset)
24	{
25	int curelem;
26	curelem = dataset->numelements++;
27	if (curelem == 0) {
28	dataset->element = (NA_Sequence *) Calloc(5, sizeof(NA_Sequence));
29	dataset->maxnumelements = 5;
30	}
31	else if (curelem == dataset->maxnumelements) {
32	(dataset->maxnumelements) *= 2;
33	dataset->element = (NA_Sequence ) Realloc((char )dataset->element,
34	dataset->maxnumelements * sizeof(NA_Sequence));
35	}
36	return curelem;
37	}
38
39	extern int Default_PROColor_LKUP[],Default_NAColor_LKUP[];
40
41	static int InsertDatainGDE(NA_Alignment dataset,GBDATA the_species,unsigned char *the_names,
42	unsigned char **the_sequences, unsigned long numberspecies,
43	unsigned long maxalignlen, AP_filter *filter, GapCompression compress,
44	bool cutoff_stop_codon)
45	{
46	GBDATA *gb_name;
47	GBDATA *gb_species;
48	GBDATA *gbd;
49	int newfiltercreated = 0;
50	NA_Sequence *this_elem;
51
52	gde_assert((the_species==0) != (the_names==0));
53
54	if (filter==0) {
55	filter = new AP_filter;
56	filter->init(maxalignlen);
57	newfiltercreated=1;
58	}
59	else {
60	size_t fl = filter->filter_len;
61	if (fl < maxalignlen) {
62	aw_message("Warning: Your filter is shorter than the alignment len");
63	maxalignlen = fl;
64	}
65	}
66
67	size_t seqlen=(size_t )calloc((unsigned int)numberspecies,sizeof(size_t));
68	// sequences may have different length
69	{
70	unsigned long i;
71	for (i=0;i<numberspecies;i++) {
72	seqlen[i] = strlen((char *)the_sequences[i]);
73	}
74	}
75
76	if (cutoff_stop_codon) {
77	unsigned long i;
78	for (i=0;i<numberspecies;i++) {
79	uchar *seq = the_sequences[i];
80	uchar stop_codon = (uchar)strchr((char)seq, '');
81	if (stop_codon) {
82	long pos = stop_codon-seq;
83	long restlen = maxalignlen-pos;
84	memset(stop_codon, '.', restlen);
85	}
86	}
87	}
88
89	// store (compressed) sequence data in array:
90	uchar sequfilt = (uchar)calloc((unsigned int)numberspecies+1,sizeof(uchar*));
91
92	if (compress==COMPRESS_ALL) { // compress all gaps and filter positions
93	long len = filter->real_len;
94	unsigned long i;
95
96	for (i=0;i<numberspecies;i++) {
97	sequfilt[i] = (uchar*)calloc((unsigned int)len+1,sizeof(uchar));
98	long newcount = 0;
99	for (unsigned long col=0;(col<maxalignlen);col++) {
100	char c = the_sequences[i][col];
101	if (!c) break;
102	if ((filter->filter_mask[col]) && (c!='-') && (c!='.')) {
103	sequfilt[i][newcount++] = c;
104	}
105	}
106	}
107	}
108	else {
109	if (compress==COMPRESS_VERTICAL_GAPS \|\| // compress vertical gaps (and '.')
110	compress == COMPRESS_NONINFO_COLUMNS) // and additionally all columns containing no info (only N or X)
111	{
112	size_t i;
113	bool isInfo[256];
114
115	for (i=0; i<256; i++) isInfo[i] = true;
116	isInfo[UINT('-')] = false;
117	isInfo[UINT('.')] = false;
118	if (compress == COMPRESS_NONINFO_COLUMNS) {
119	GB_alignment_type type = GBT_get_alignment_type(dataset->gb_main, dataset->alignment_name);
120	switch (type) {
121	case GB_AT_RNA:
122	case GB_AT_DNA:
123	isInfo[UINT('N')] = false;
124	isInfo[UINT('n')] = false;
125	break;
126	case GB_AT_AA:
127	isInfo[UINT('X')] = false;
128	isInfo[UINT('x')] = false;
129	break;
130	default:
131	gde_assert(0);
132	break;
133	}
134	}
135
136	for (i=0; i<maxalignlen; i++) {
137	if (filter->filter_mask[i]) {
138	bool wantColumn = false;
139
140	for (size_t n=0; n<numberspecies; n++) {
141	if (i < seqlen[n]) {
142	if (isInfo[UINT(the_sequences[n][i])]) {
143	wantColumn = true; // have info -> take column
144	break;
145	}
146	}
147	}
148	if (!wantColumn) {
149	filter->filter_mask[i] = 0;
150	filter->real_len--;
151	}
152	}
153	}
154	}
155
156	long len = filter->real_len;
157	size_t i;
158
159	for (i=0;i<numberspecies;i++) {
160	int c;
161	long newcount = 0;
162
163	sequfilt[i] = (uchar*)malloc((unsigned int)len+1);
164	sequfilt[i][len] = 0;
165	memset(sequfilt[i],'.',len); // Generate empty sequences
166
167	size_t col;
168	for (col=0; (col<maxalignlen) && (c=the_sequences[i][col]); col++) {
169	if (filter->filter_mask[col]) {
170	sequfilt[i][newcount++] = filter->simplify[c];
171	}
172	}
173	}
174	}
175
176	{
177	GB_transaction dummy(GLOBAL_gb_main);
178	char *str = filter->to_string();
179	GB_ERROR error = GBT_write_string(GLOBAL_gb_main, AWAR_GDE_EXPORT_FILTER, str);
180	delete [] str;
181
182	if (error) aw_message(error);
183	}
184
185	free(seqlen);
186
187	long number = 0;
188	int curelem;
189	int bad_names = 0;
190
191	if (the_species) {
192	for (gb_species = the_species[number]; gb_species; gb_species = the_species[++number] ) {
193	if ((number/10)*10==number) {
194	if (aw_status((double)number/(double)numberspecies)) {
195	return 1;
196	}
197	}
198	gb_name = GB_entry(gb_species,"name");
199
200	curelem = Arbdb_get_curelem(dataset);
201	this_elem = &(dataset->element[curelem]);
202	InitNASeq(this_elem,RNA);
203	this_elem->attr = DEFAULT_X_ATTR;
204	this_elem->gb_species = gb_species;
205
206	strncpy(this_elem->short_name,GB_read_char_pntr(gb_name),31);
207
208	if (AWTC_name_quality(this_elem->short_name) != 0) bad_names++;
209
210	gbd = GB_entry(gb_species,"author");
211	if (gbd) strncpy(this_elem->authority,GB_read_char_pntr(gbd),79);
212	gbd = GB_entry(gb_species,"full_name");
213	if (gbd) strncpy(this_elem->seq_name,GB_read_char_pntr(gbd),79);
214	gbd = GB_entry(gb_species,"acc");
215	if (gbd) {
216	strncpy(this_elem->id,GB_read_char_pntr(gbd),79);
217	}
218	{
219	AppendNA((NA_Base )sequfilt[number],strlen((const char )sequfilt[number]),this_elem);
220	freeset(sequfilt[number], 0);
221	}
222
223	this_elem->comments = strdup("no comments");
224	this_elem->comments_maxlen = 1 + (this_elem->comments_len = strlen(this_elem->comments));
225
226	// if (this_elem->rmatrix &&
227	// IS_REALLY_AA == false) {
228	// if (IS_REALLY_AA == false)
229	// Ascii2NA((char *)this_elem->sequence,
230	// this_elem->seqlen,
231	// this_elem->rmatrix);
232	// else
233	/*
234	* Force the sequence to be AA
235	*/
236	{
237	this_elem->elementtype = TEXT;
238	this_elem->rmatrix = NULL;
239	this_elem->tmatrix = NULL;
240	this_elem->col_lut = Default_PROColor_LKUP;
241	}
242	// }
243	}
244
245	}
246	else { // use the_names
247	unsigned char *species_name;
248
249	for (species_name=the_names[number]; species_name; species_name=the_names[++number]) {
250	if ((number/10)*10==number) {
251	if (aw_status((double)number/(double)numberspecies)) {
252	return 1;
253	}
254	}
255
256	curelem = Arbdb_get_curelem(dataset);
257	this_elem = &(dataset->element[curelem]);
258	InitNASeq(this_elem, RNA);
259	this_elem->attr = DEFAULT_X_ATTR;
260	this_elem->gb_species = 0;
261
262	strncpy((char)this_elem->short_name, (char)species_name, 31);
263	if (AWTC_name_quality(this_elem->short_name) != 0) bad_names++;
264
265	this_elem->authority[0] = 0;
266	this_elem->seq_name[0] = 0;
267	this_elem->id[0] = 0;
268
269	{
270	AppendNA((NA_Base )sequfilt[number],strlen((const char )sequfilt[number]),this_elem);
271	delete sequfilt[number]; sequfilt[number] = 0;
272	}
273
274	this_elem->comments = strdup("no comments");
275	this_elem->comments_maxlen = 1 + (this_elem->comments_len = strlen(this_elem->comments));
276
277	{
278	this_elem->elementtype = TEXT;
279	this_elem->rmatrix = NULL;
280	this_elem->tmatrix = NULL;
281	this_elem->col_lut = Default_PROColor_LKUP;
282	}
283	}
284	}
285
286	if (bad_names) {
287	aw_message(GBS_global_string("Problematic names found: %i\n"
288	"External program call may fail or produce invalid results.\n"
289	"You might want to use 'Generate new names' and read the associated help.",
290	bad_names));
291	}
292
293	{
294	unsigned long i;
295	for (i=0;i<dataset->numelements;i++) {
296	dataset->maxlen = MAX(dataset->maxlen,
297	dataset->element[i].seqlen+dataset->element[i].offset);
298	}
299	for (i=0;i<numberspecies;i++)
300	{
301	delete sequfilt[i];
302	}
303	free(sequfilt);
304	}
305	if (newfiltercreated) delete filter;
306	return 0;
307	}
308
309	void ReadArbdb_plain(char filename,NA_Alignment dataset,int type) {
310	AWUSE(filename); AWUSE(type);
311	ReadArbdb(dataset, true, NULL, COMPRESS_NONE, false);
312	}
313
314	int ReadArbdb2(NA_Alignment dataset, AP_filter filter, GapCompression compress, bool cutoff_stop_codon) {
315	dataset->gb_main = GLOBAL_gb_main;
316
317	GBDATA **the_species;
318	long maxalignlen;
319	long numberspecies = 0;
320	uchar **the_sequences;
321	uchar **the_names;
322	char *error = gde_cgss.get_sequences(gde_cgss.THIS,
323	the_species, the_names, the_sequences,
324	numberspecies, maxalignlen);
325
326	gde_assert((the_species==0) != (the_names==0));
327
328	if (error) {
329	aw_message(error);
330	return 1;
331	}
332
333	InsertDatainGDE(dataset,0,the_names,(unsigned char **)the_sequences,numberspecies,maxalignlen,filter,compress, cutoff_stop_codon);
334	long i;
335	for (i=0;i<numberspecies;i++) {
336	delete the_sequences[i];
337	}
338	delete the_sequences;
339	if (the_species) delete the_species;
340	else delete the_names;
341
342	return 0;
343	}
344
345
346
347	int ReadArbdb(NA_Alignment dataset, bool marked, AP_filter filter, GapCompression compress, bool cutoff_stop_codon) {
348	GBDATA *gb_species_data;
349	GBDATA *gb_species;
350
351	/ARB_NT END/
352	dataset->gb_main = GLOBAL_gb_main;
353
354	/* Alignment choosen ? */
355
356	gb_species_data = GB_entry(dataset->gb_main,"species_data");
357	ErrorOut5(gb_species_data!=0,"species_data not found");
358
359	long maxalignlen = GBT_get_alignment_len(GLOBAL_gb_main,dataset->alignment_name);
360	GBDATA **the_species;
361	long numberspecies = 0;
362	long missingdata = 0;
363
364	if (marked) gb_species = GBT_first_marked_species_rel_species_data(gb_species_data);
365	else gb_species = GBT_first_species_rel_species_data(gb_species_data);
366
367	while(gb_species) {
368	if (GBT_read_sequence(gb_species,dataset->alignment_name)) numberspecies++;
369	else missingdata++;
370
371	if (marked) gb_species = GBT_next_marked_species(gb_species);
372	else gb_species = GBT_next_species(gb_species);
373	}
374
375	if (missingdata) {
376	aw_message(GBS_global_string("Skipped %li species which did not contain data in '%s'", missingdata, dataset->alignment_name));
377	}
378
379	the_species = (GBDATA*)calloc((unsigned int)numberspecies+1,sizeof(GBDATA));
380	numberspecies = 0;
381
382	if (marked) gb_species = GBT_first_marked_species_rel_species_data(gb_species_data);
383	else gb_species = GBT_first_species_rel_species_data(gb_species_data);
384
385	while(gb_species) {
386	if (GBT_read_sequence(gb_species,dataset->alignment_name)) {
387	the_species[numberspecies]=gb_species;
388	numberspecies++;
389	}
390
391	if (marked) gb_species = GBT_next_marked_species(gb_species);
392	else gb_species = GBT_next_species(gb_species);
393	}
394
395	maxalignlen = GBT_get_alignment_len(GLOBAL_gb_main,dataset->alignment_name);
396
397	char the_sequences = (char)calloc((unsigned int)numberspecies+1, sizeof(char*));
398
399	long i;
400	for (i=0;the_species[i];i++) {
401	the_sequences[i]= (char *)malloc((size_t)maxalignlen+1);
402	the_sequences[i][maxalignlen] = 0;
403	memset(the_sequences[i],'.',(size_t)maxalignlen);
404	const char *data = GB_read_char_pntr( GBT_read_sequence(the_species[i],dataset->alignment_name));
405	int size = strlen(data);
406	if (size > maxalignlen) size = (int)maxalignlen;
407	strncpy(the_sequences[i],data,size);
408	//printf("%s \n",the_sequences[i]);
409	}
410	InsertDatainGDE(dataset,the_species,0,(unsigned char **)the_sequences, numberspecies,maxalignlen,filter,compress, cutoff_stop_codon);
411	for (i=0;i<numberspecies;i++) {
412	free(the_sequences[i]);
413	}
414	free(the_sequences);
415	free(the_species);
416
417	return 0;
418	}
419
420	int getelem(NA_Sequence *a,int b) {
421	if (a->seqlen == 0) return -1;
422
423	if (b<a->offset \|\| (b>a->offset+a->seqlen)) {
424	switch(a->elementtype) {
425	case DNA:
426	case RNA: return 0;
427	case PROTEIN:
428	case TEXT: return '~';
429	case MASK: return '0';
430	default: return '-';
431	}
432	}
433
434	return a->sequence[b-a->offset];
435	}
436
437	void putelem(NA_Sequence *a,int b,NA_Base c) {
438	int j;
439	NA_Base *temp;
440
441	if (b>=(a->offset+a->seqmaxlen)) {
442	Warning("Putelem:insert beyond end of sequence space ignored");
443	}
444	else if (b >= (a->offset)) {
445	a->sequence[b-(a->offset)] = c;
446	}
447	else {
448	temp =(NA_Base*)Calloc(a->seqmaxlen+a->offset-b, sizeof(NA_Base));
449	switch (a->elementtype) {
450	/*
451	* Pad out with gap characters fron the point of insertion to the offset
452	*/
453	case MASK:
454	for (j=b;j<a->offset;j++) temp[j-b]='0';
455	break;
456	case DNA:
457	case RNA:
458	for (j=b;j<a->offset;j++) temp[j-b]='\0';
459	break;
460	case PROTEIN:
461	for (j=b;j<a->offset;j++) temp[j-b]='-';
462	break;
463	case TEXT:
464	default:
465	for (j=b;j<a->offset;j++) temp[j-b]=' ';
466	break;
467	}
468
469	for (j=0;j<a->seqmaxlen;j++) temp[j+a->offset-b] = a->sequence[j];
470	Cfree((char*)a->sequence);
471
472	a->sequence = temp;
473	a->seqlen += (a->offset - b);
474	a->seqmaxlen += (a->offset - b);
475	a->offset = b;
476	a->sequence[0] = c;
477	}
478	}
479

Note: See TracBrowser for help on using the repository browser.

Download in other formats: