Context Navigation

← Previous Revision
Next Revision →
Blame
Revision Log

GDE_FileIO.cxx

Visit:

Last change on this file was 17396, checked in by westram, 7 years ago
reintegrates 'fix' into 'trunk' elim dead code use const hash loop adds: log:branches/fix@17380,17388:17395
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 11.5 KB

Line
1	#include "GDE_proto.h"
2
3	#include <aw_msg.hxx>
4
5	#include <climits>
6	#include <algorithm>
7
8	static void Regroup(NA_Alignment& alignment) {
9	size_t j;
10	size_t group;
11	int last;
12
13	for (j=0; j<alignment.numelements; j++) {
14	alignment.element[j].groupf = NULp;
15	alignment.element[j].groupb = NULp;
16	}
17
18	for (group = 1; group <= alignment.numgroups; group++) {
19	last = -1;
20	for (j=0; j<alignment.numelements; j++)
21	if (alignment.element[j].groupid == group) {
22	if (last != -1) {
23	alignment.element[j].groupb = &(alignment.element[last]);
24	alignment.element[last].groupf = &(alignment.element[j]);
25	}
26	last = j;
27	}
28	}
29	}
30
31	static void NormalizeOffset(NA_Alignment& aln) {
32	size_t j;
33	int offset = INT_MAX;
34
35	for (j=0; j<aln.numelements; j++)
36	offset = std::min(offset, aln.element[j].offset);
37
38	for (j=0; j<aln.numelements; j++)
39	aln.element[j].offset -= offset;
40
41	aln.maxlen = INT_MIN;
42	for (j=0; j<aln.numelements; j++)
43	aln.maxlen = std::max(aln.element[j].seqlen+aln.element[j].offset, aln.maxlen);
44
45	gde_assert(aln.maxlen >= 0);
46
47	aln.rel_offset += offset;
48
49	if (aln.numelements == 0) aln.rel_offset = 0;
50	}
51
52	static void ReadNA_Flat(char *filename, NA_Alignment& data) {
53	size_t j;
54	int jj, curelem=0, offset;
55	char buffer[GBUFSIZ];
56	char in_line[GBUFSIZ];
57
58	NA_Sequence *this_elem;
59
60	FILE *file = fopen(filename, "r");
61	if (!file) {
62	fprintf(stderr, "Cannot open %s.\n", filename);
63	}
64	else {
65	for (; fgets(in_line, GBUFSIZ, file);) {
66	if (in_line[0] == '#' \|\|
67	in_line[0] == '%' \|\|
68	in_line[0] == '"' \|\|
69	in_line[0] == '@')
70	{
71	offset = 0;
72	for (j=0; j<strlen(in_line); j++) {
73	if (in_line[j] == '(') {
74	sscanf((char*)&(in_line[j+1]), "%d", &offset);
75	in_line[j] = '\0';
76	}
77	}
78
79	curelem = Arbdb_get_curelem(data);
80
81	InitNASeq(&(data.element[curelem]),
82	in_line[0] == '#' ? DNA :
83	in_line[0] == '%' ? PROTEIN :
84	in_line[0] == '"' ? TEXT :
85	in_line[0] == '@' ? MASK : TEXT);
86	this_elem = &(data.element[curelem]);
87	if (in_line[strlen(in_line)-1] == '\n')
88	in_line[strlen(in_line)-1] = '\0';
89	strcpy_truncate(this_elem->short_name, in_line+1, SIZE_SHORT_NAME);
90	this_elem->offset = offset;
91	}
92	else if (in_line[0] != '\n') {
93	size_t strl = strlen(in_line);
94	for (j=0, jj=0; j<strl; j++) {
95	if (in_line[j] != ' ' && in_line[j] != '\n' && in_line[j] != '\t') {
96	buffer[jj++] = in_line[j];
97	}
98	}
99
100	if (data.element[curelem].rmatrix) {
101	Ascii2NA(buffer, jj, data.element[curelem].rmatrix);
102	}
103	AppendNA((NA_Base*)buffer, jj, &(data.element[curelem]));
104	}
105	}
106	fclose(file);
107
108	for (j=0; j<data.numelements; j++) {
109	data.maxlen = std::max(data.maxlen, data.element[j].seqlen + data.element[j].offset);
110	}
111
112	NormalizeOffset(data);
113	Regroup(data);
114	}
115	}
116
117	/*
118	LoadFile():
119	Load the given filename into the given dataset. Handle any
120	type conversion needed to get the data into the specified data type.
121	This routine is used in situations where the format and datatype is known.
122
123	Copyright (c) 1989-1990, University of Illinois board of trustees. All
124	rights reserved. Written by Steven Smith at the Center for Prokaryote Genome
125	Analysis. Design and implementation guidance by Dr. Gary Olsen and Dr.
126	Carl Woese.
127
128	Copyright (c) 1990,1991,1992 Steven Smith at the Harvard Genome Laboratory.
129	All rights reserved.
130	*/
131
132	static GB_ERROR LoadFile(char *filename, NA_Alignment& dataset, int type, int format) {
133	GB_ERROR error = NULp;
134	if (DataType != type)
135	fprintf(stderr, "Warning, datatypes do not match.\n");
136	/*
137	Handle the overwrite/create/merge dialog here.
138	*/
139	switch (format) {
140	case NA_FLAT:
141	ReadNA_Flat(filename, dataset);
142	dataset.format = GDE;
143	break;
144
145	case GENBANK:
146	error = ReadGen(filename, dataset);
147	dataset.format = GENBANK;
148	break;
149
150	case GDE:
151	gde_assert(0); // should no longer occur
152	break;
153
154	default:
155	break;
156	}
157	return error;
158	}
159
160	static int FindType(char name, int dtype, int *ftype) {
161	FILE *file = fopen(name, "r");
162
163	*dtype = 0;
164	*ftype = 0;
165
166	int result = 1;
167	if (file) {
168	/* Is this a flat file?
169	* Get the first non blank line, see if a type marker shows up.
170	*/
171	char in_line[GBUFSIZ];
172	if (fgets(in_line, GBUFSIZ, file)) {
173	for (; strlen(in_line)<2 && fgets(in_line, GBUFSIZ, file);) ;
174
175	if (in_line[0] == '#' \|\| in_line[0] == '%' \|\|
176	in_line[0] == '"' \|\| in_line[0] == '@')
177	{
178	*dtype=NASEQ_ALIGN;
179	*ftype=NA_FLAT;
180	result = 0;
181	}
182	else { // try genbank
183	fclose(file);
184	file = fopen(name, "r");
185	*dtype=0;
186	*ftype=0;
187
188	if (file) {
189	for (; fgets(in_line, GBUFSIZ, file);) {
190	if (Find(in_line, "LOCUS")) {
191	*dtype = NASEQ_ALIGN;
192	*ftype = GENBANK;
193	break;
194	}
195	else if (Find(in_line, "sequence")) { // try GDE
196	*dtype = NASEQ_ALIGN;
197	*ftype = GDE;
198	break;
199	}
200	}
201	result = 0;
202	}
203	}
204	}
205	fclose(file);
206	}
207
208	return result;
209	}
210
211	void LoadData(char *filen, NA_Alignment& dataset) {
212	/* LoadData():
213	* Load a data set from the command line argument.
214	*
215	* Copyright (c) 1989, University of Illinois board of trustees. All rights
216	* reserved. Written by Steven Smith at the Center for Prokaryote Genome
217	* Analysis. Design and implementation guidance by Dr. Gary Olsen and Dr.
218	* Carl Woese.
219	* Copyright (c) 1990,1991,1992 Steven Smith at the Harvard Genome Laboratory.
220	* All rights reserved.
221	*/
222
223	// Get file name, determine the file type, and away we go..
224	if (Find2(filen, "gde") != 0)
225	strcpy(FileName, filen);
226
227	FILE *file = fopen(filen, "r");
228	if (file) {
229	FindType(filen, &DataType, &FileFormat);
230	switch (DataType) {
231	case NASEQ_ALIGN: {
232	GB_ERROR error = LoadFile(filen, dataset, DataType, FileFormat);
233	if (error) aw_message(error);
234	break;
235	}
236	default:
237	aw_message(GBS_global_string("Internal error: unknown file type of file %s", filen));
238	break;
239	}
240	fclose(file);
241	}
242	}
243
244	void AppendNA(NA_Base buffer, int len, NA_Sequence seq) {
245	int curlen=0, j;
246	if (seq->seqlen+len >= seq->seqmaxlen) {
247	seq->seqmaxlen = seq->seqlen + len + GBUFSIZ;
248	ARB_realloc(seq->sequence, seq->seqmaxlen);
249	}
250	// seqlen is the length, and the index of the next free base
251	curlen = seq->seqlen + seq->offset;
252	for (j=0; j<len; j++)
253	putelem(seq, j+curlen, buffer[j]);
254
255	seq->seqlen += len;
256	return;
257	}
258
259	void Ascii2NA(char *buffer, int len, int matrix[16]) {
260	// if the translation matrix exists, use it to encode the buffer.
261	if (matrix) {
262	for (int i=0; i<len; i++) {
263	buffer[i] = matrix[(unsigned char)buffer[i]];
264	}
265	}
266	}
267
268	int WriteNA_Flat(NA_Alignment& aln, char *filename, int method) {
269	if (aln.numelements == 0) return 1;
270
271	size_t j;
272	int kk;
273	int k, offset;
274	char offset_str[100], buf[100];
275	FILE *file;
276
277	NA_Sequence *seqs = aln.element;
278
279	file = fopen(filename, "w");
280	if (!file) {
281	Warning("Cannot open file for output");
282	return 1;
283	}
284
285	for (j=0; j<aln.numelements; j++) {
286	offset = seqs[j].offset;
287
288	if (offset+aln.rel_offset != 0)
289	sprintf(offset_str, "(%d)", offset+aln.rel_offset);
290	else
291	offset_str[0] = '\0';
292
293	if (method == ALL) {
294	fprintf(file, "%c%s%s\n",
295	seqs[j].elementtype == DNA ? '#' :
296	seqs[j].elementtype == RNA ? '#' :
297	seqs[j].elementtype == PROTEIN ? '%' :
298	seqs[j].elementtype == TEXT ? '"' :
299	seqs[j].elementtype == MASK ? '@' : '"',
300	seqs[j].short_name,
301	(offset+aln.rel_offset == 0) ? "" : offset_str);
302
303	if (seqs[j].tmatrix) {
304	for (k=0, kk=0; kk<seqs[j].seqlen; kk++) {
305	if ((k)%60 == 0 && k>0) {
306	buf[60] = '\0';
307	fputs(buf, file);
308	putc('\n', file);
309	}
310	buf[k%60] = ((char)seqs[j].tmatrix[(int)getelem(&(seqs[j]), kk+offset)]);
311	k++;
312	}
313	}
314	else {
315	for (k=0, kk=0; kk<seqs[j].seqlen; kk++) {
316	if ((k)%60 == 0 && k>0) {
317	buf[60] = '\0';
318	fputs(buf, file);
319	putc('\n', file);
320	}
321	buf[k%60] = (getelem(&(seqs[j]), kk+offset));
322	k++;
323	}
324	}
325	buf[(k%60)>0 ? (k%60) : 60] = '\0';
326	fputs(buf, file);
327	putc('\n', file);
328	}
329	}
330	fclose(file);
331	return 0;
332	}
333
334	void Warning(const char *s) {
335	aw_message(s);
336	}
337
338
339	void InitNASeq(NA_Sequence *seq, int type) {
340	SetTime(&(seq->t_stamp.origin));
341	SetTime(&(seq->t_stamp.modify));
342	strcpy_truncate(seq->id, uniqueID(), SIZE_ID);
343	seq->seq_name[0] = '\0';
344	seq->barcode[0] = '\0';
345	seq->contig[0] = '\0';
346	seq->membrane[0] = '\0';
347	seq->authority[0] = '\0';
348	seq->short_name[0] = '\0';
349	seq->sequence = NULp;
350	seq->offset = 0;
351	seq->baggage = NULp;
352	seq->baggage_len = 0;
353	seq->baggage_maxlen = 0;
354	seq->comments = NULp;
355	seq->comments_len = 0;
356	seq->comments_maxlen = 0;
357	seq->description[0] = '\0';
358	seq->seqlen = 0;
359	seq->seqmaxlen = 0;
360	seq->attr = IS_5_TO_3 + IS_PRIMARY;
361	seq->elementtype = type;
362	seq->groupid = 0;
363	seq->groupb = NULp;
364	seq->groupf = NULp;
365
366	switch (type) {
367	case DNA:
368	seq->tmatrix = Default_DNA_Trans;
369	seq->rmatrix = Default_NA_RTrans;
370	seq->col_lut = Default_NAColor_LKUP;
371	break;
372	case RNA:
373	seq->tmatrix = Default_RNA_Trans;
374	seq->rmatrix = Default_NA_RTrans;
375	seq->col_lut = Default_NAColor_LKUP;
376	break;
377	case PROTEIN:
378	seq->tmatrix = NULp;
379	seq->rmatrix = NULp;
380	seq->col_lut = Default_PROColor_LKUP;
381	break;
382	case MASK:
383	case TEXT:
384	default:
385	seq->tmatrix = NULp;
386	seq->rmatrix = NULp;
387	seq->col_lut = NULp;
388	break;
389	}
390	return;
391	}
392

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/ARB_GDE/GDE_FileIO.cxx

Download in other formats: