1 | #include <stdlib.h> |
---|
2 | #include <sys/types.h> |
---|
3 | #include <sys/stat.h> |
---|
4 | #include <stdio.h> |
---|
5 | #include <string.h> |
---|
6 | #include <ctype.h> |
---|
7 | #include <memory.h> |
---|
8 | |
---|
9 | extern void bcopy (); |
---|
10 | extern int bcmp (); |
---|
11 | |
---|
12 | #include "xmalloc.h" |
---|
13 | #include "lenstring.h" |
---|
14 | #include "hash.h" |
---|
15 | #include "careful.h" |
---|
16 | |
---|
17 | char *progname; |
---|
18 | |
---|
19 | typedef struct |
---|
20 | { |
---|
21 | lenstring locus_name; |
---|
22 | lenstring annotation; |
---|
23 | lenstring sequence; |
---|
24 | } |
---|
25 | gb_entry; |
---|
26 | |
---|
27 | |
---|
28 | /* Utilities. */ |
---|
29 | |
---|
30 | |
---|
31 | /* Printing usage messages. */ |
---|
32 | |
---|
/* Write the authorship and usage summary to stderr, then terminate
   the process with exit status 0.  This function does not return.  */
void
get_help ()
{
  /* Adjacent string literals are concatenated by the compiler, so
     this is a single string, emitted with a single fputs call.  */
  static const char usage_text[] =
    "Written by Pavel Slavin, pavel@darwin.life.uiuc.edu\n"
    "and Jim Blandy, jimb@gnu.ai.mit.edu\n"
    "$Id: gb-tabl.c 5458 2008-07-16 15:24:20Z westram $ \n"
    "\n"
    "gb-tabl transforms one or more concatenated GenBank entries into\n"
    "locus/value pairs. Calling sequence, with [defaults]:\n"
    "[ --gb-file FILE ] ; read GenBank entries from FILE [stdin]\n"
    "[ --include-loci FILE ] ; include only loci in FILE [all]\n"
    "[ --exclude-loci FILE ] ; exclude all loci in FILE [none]\n"
    "[ --locus-name-out FILE ] ; write locus names to FILE [stdout]\n"
    "[ --annotation-out FILE ] ; write entire annotation to FILE [stdout]\n"
    "[ --sequence-out FILE ] ; write sequence data to FILE [stdout]\n"
    "[ -h | --help ] ; Displays this text\n";

  fputs (usage_text, stderr);

  exit (0);
}
---|
55 | |
---|
56 | |
---|
57 | |
---|
58 | /* A hash table for use in selecting subsets, and functions to use it. */ |
---|
59 | |
---|
60 | struct string_hash *index_table; |
---|
61 | |
---|
62 | /* Read the contents of a file full of loci (one per line) into the |
---|
63 | hash table. read_GenBank can then check each locus field against the |
---|
64 | hash table to decide whether to exclude/include the locus's entry. */ |
---|
65 | void |
---|
66 | read_index_file (char *index_file_name) |
---|
67 | { |
---|
68 | FILE *index_file; |
---|
69 | lenstring buf; |
---|
70 | |
---|
71 | index_file = careful_open (index_file_name, "r", 0); |
---|
72 | index_table = new_hash_table (); |
---|
73 | |
---|
74 | while (read_delimited_lenstring (&buf, "\n", index_file) != EOF) |
---|
75 | { |
---|
76 | lookup_hash_table (index_table, buf.text, buf.len); |
---|
77 | free (buf.text); |
---|
78 | } |
---|
79 | |
---|
80 | careful_close (index_file, index_file_name); |
---|
81 | } |
---|
82 | |
---|
83 | /* Return non-zero iff INDEX is in the hash table. */ |
---|
84 | int |
---|
85 | present_p (lenstring *index) |
---|
86 | { |
---|
87 | return lookup_hash_table_soft (index_table, index->text, index->len) != 0; |
---|
88 | } |
---|
89 | |
---|
90 | |
---|
91 | /* Dealing with GenBank entries. */ |
---|
92 | |
---|
93 | /* Find a line in BUFFER[BUFFER_LEN] that starts with HEADER. |
---|
94 | Return its starting address. */ |
---|
95 | char * |
---|
96 | find_header (lenstring *buffer, const char *header) |
---|
97 | { |
---|
98 | int pos = 0; |
---|
99 | |
---|
100 | for (;;) |
---|
101 | { |
---|
102 | pos = search_lenstring (buffer, header, pos); |
---|
103 | if (pos <= 0 || buffer->text[pos - 1] == '\n') |
---|
104 | break; |
---|
105 | pos++; |
---|
106 | } |
---|
107 | |
---|
108 | if (pos == -1) |
---|
109 | return NULL; |
---|
110 | else |
---|
111 | return buffer->text + pos; |
---|
112 | } |
---|
113 | |
---|
114 | |
---|
115 | /* Convert sequence data from GenBank format to tabl format. SEQ |
---|
116 | contains the sequence data, in GenBank format. The conversion is |
---|
117 | done in-place, since tabl is always smaller than GenBank. */ |
---|
118 | void |
---|
119 | gb_to_tabl_sequence (lenstring *seq) |
---|
120 | { |
---|
121 | char *source = seq->text; |
---|
122 | char *source_end = seq->text + seq->len; |
---|
123 | char *dest = source; |
---|
124 | |
---|
125 | do |
---|
126 | { |
---|
127 | char c; |
---|
128 | |
---|
129 | /* Skip zero or more blanks, zero or more digits, and then zero |
---|
130 | or more blanks. Consume the largest such prefix possible. */ |
---|
131 | while (source < source_end |
---|
132 | && (*source == ' ' || *source == '\t')) |
---|
133 | source++; |
---|
134 | while (source < source_end |
---|
135 | && isascii (*source) |
---|
136 | && isdigit (*source)) |
---|
137 | source++; |
---|
138 | while (source < source_end |
---|
139 | && (*source == ' ' || *source == '\t')) |
---|
140 | source++; |
---|
141 | |
---|
142 | /* Since we skip sections of text, we might not notice |
---|
143 | terminator characters in odd places, so we check against |
---|
144 | the ending address instead. */ |
---|
145 | while (source < source_end && (c = *source++) != '\n') |
---|
146 | { |
---|
147 | if (c != ' ') |
---|
148 | *dest++ = c; |
---|
149 | } |
---|
150 | } |
---|
151 | while (source < source_end); |
---|
152 | |
---|
153 | /* Make seq point to the area we've re-formatted. */ |
---|
154 | seq->len = dest - seq->text; |
---|
155 | } |
---|
156 | |
---|
157 | |
---|
158 | /* This function reads GenBank file ( entry-by-entry ). */ |
---|
159 | void |
---|
160 | read_GenBank (char *GenBank_filename, |
---|
161 | char *locus_name_filename, |
---|
162 | char *annotation_filename, |
---|
163 | char *sequence_filename, |
---|
164 | int include) |
---|
165 | { |
---|
166 | gb_entry entry; /* entry is a var of type gb_entry (see above) */ |
---|
167 | FILE *GenBank_file; |
---|
168 | FILE *locus_name_file; |
---|
169 | FILE *annotation_file; |
---|
170 | FILE *sequence_file; |
---|
171 | |
---|
172 | /* Buffer containing GenBank entry. */ |
---|
173 | lenstring buffer; |
---|
174 | lenstring unstripped_buffer; |
---|
175 | |
---|
176 | GenBank_file = careful_open (GenBank_filename, "r", stdin); |
---|
177 | locus_name_file = careful_open (locus_name_filename, "w+", stdout); |
---|
178 | annotation_file = careful_open (annotation_filename, "w+", stdout); |
---|
179 | sequence_file = careful_open (sequence_filename, "w+", stdout); |
---|
180 | |
---|
181 | while (read_delimited_lenstring (&unstripped_buffer, "//", GenBank_file) |
---|
182 | != EOF) |
---|
183 | { |
---|
184 | /* start of line after ORIGIN line. */ |
---|
185 | char *sequence_start; |
---|
186 | |
---|
187 | /* First address after the buffer. */ |
---|
188 | char *buffer_end; |
---|
189 | |
---|
190 | strip_newlines (&buffer, &unstripped_buffer); |
---|
191 | |
---|
192 | /* Ignore newlines before EOF. */ |
---|
193 | if (buffer.len == 0 && feof (GenBank_file)) |
---|
194 | { |
---|
195 | free (unstripped_buffer.text); |
---|
196 | break; |
---|
197 | } |
---|
198 | |
---|
199 | buffer_end = buffer.text + buffer.len; |
---|
200 | |
---|
201 | /* sequence_start is the first line after the ORIGIN record. */ |
---|
202 | sequence_start = find_header (&buffer, "ORIGIN"); |
---|
203 | if (sequence_start) |
---|
204 | sequence_start = (char *) memchr (sequence_start, '\n', |
---|
205 | buffer_end - sequence_start); |
---|
206 | if (! sequence_start) |
---|
207 | { |
---|
208 | fprintf (stderr, "%s: entry lacks a correct ORIGIN line\n", |
---|
209 | GenBank_filename ? GenBank_filename : "stdin"); |
---|
210 | exit (1); |
---|
211 | } |
---|
212 | |
---|
213 | /* sequence_start should really sit *after* the newline. */ |
---|
214 | sequence_start ++; |
---|
215 | |
---|
216 | /* Make entry.annotation point at the annotation section of the |
---|
217 | buffer. */ |
---|
218 | entry.annotation.text = buffer.text; |
---|
219 | entry.annotation.len = sequence_start - buffer.text; |
---|
220 | |
---|
221 | /* Find the locus field. */ |
---|
222 | { |
---|
223 | char *p; |
---|
224 | char *start; |
---|
225 | |
---|
226 | if (! (p = find_header (&entry.annotation, "LOCUS"))) |
---|
227 | { |
---|
228 | fprintf (stderr, |
---|
229 | "%s: entry lacks a correct LOCUS line\n", |
---|
230 | GenBank_filename ? GenBank_filename : "stdin"); |
---|
231 | exit (1); |
---|
232 | } |
---|
233 | |
---|
234 | /* Find the name on the LOCUS line. We assume it's the |
---|
235 | first string of non-spaces after the "LOCUS" string. */ |
---|
236 | p += 12; |
---|
237 | while (p < buffer_end && *p != '\n' && isspace (*p)) |
---|
238 | p++; |
---|
239 | start = p; |
---|
240 | while (p < buffer_end && ! isspace (*p)) |
---|
241 | p++; |
---|
242 | entry.locus_name.text = start; |
---|
243 | entry.locus_name.len = p - start; |
---|
244 | } |
---|
245 | |
---|
246 | |
---|
247 | /* Convert sequence to GenBank format. */ |
---|
248 | entry.sequence.text = sequence_start; |
---|
249 | entry.sequence.len = buffer_end - sequence_start; |
---|
250 | gb_to_tabl_sequence (&entry.sequence); |
---|
251 | |
---|
252 | |
---|
253 | /* Write this entry's data. */ |
---|
254 | { |
---|
255 | lenstring *locus = &entry.locus_name; |
---|
256 | |
---|
257 | /* If we're including or excluding, only write the appropriate |
---|
258 | stuff. */ |
---|
259 | if (include == 0 |
---|
260 | || (include == -1 && ! present_p (locus)) |
---|
261 | || (include == 1 && present_p (locus))) |
---|
262 | { |
---|
263 | /* what to print out and where */ |
---|
264 | write_lenstring (&entry.locus_name, locus_name_file); |
---|
265 | putc ('\n', locus_name_file); |
---|
266 | check_file (locus_name_file, locus_name_filename, |
---|
267 | "writing GenBank locus names"); |
---|
268 | |
---|
269 | write_lenstring (&entry.annotation, annotation_file); |
---|
270 | putc ('\f', annotation_file); |
---|
271 | check_file (annotation_file, annotation_filename, |
---|
272 | "writing GenBank annotations"); |
---|
273 | |
---|
274 | write_lenstring (&entry.sequence, sequence_file); |
---|
275 | putc ('\n', sequence_file); |
---|
276 | check_file (sequence_file, sequence_filename, |
---|
277 | "writing GenBank sequence data"); |
---|
278 | } |
---|
279 | } |
---|
280 | |
---|
281 | free (unstripped_buffer.text); |
---|
282 | } |
---|
283 | |
---|
284 | careful_close (GenBank_file, GenBank_filename); |
---|
285 | careful_close (locus_name_file, locus_name_filename); |
---|
286 | careful_close (annotation_file, annotation_filename); |
---|
287 | careful_close (sequence_file, sequence_filename); |
---|
288 | } |
---|
289 | |
---|
290 | |
---|
291 | /* Parsing command-line arguments. */ |
---|
292 | |
---|
293 | int |
---|
294 | main (int argc, char *argv[]) |
---|
295 | { |
---|
296 | char *GenBank_file = NULL; /* Name of gen_bank file */ |
---|
297 | char *index_file = NULL; /* file of indices to extract */ |
---|
298 | char include = 0; /* should exclude named indices, or include */ |
---|
299 | char *locus_name_file = NULL; /* Name of output file for loci */ |
---|
300 | char *annotation_file = NULL; /* name of output file for annotations */ |
---|
301 | char *sequence_file = NULL; /* name of output file for sequences */ |
---|
302 | int i = 0; /* counter for first for loop */ |
---|
303 | |
---|
304 | progname = careful_prog_name (argv[0]); |
---|
305 | |
---|
306 | for (i = 1; i < argc; i++) |
---|
307 | { |
---|
308 | if (!strcmp (argv[i], "--include-loci")) |
---|
309 | { |
---|
310 | if (include != 0) |
---|
311 | { |
---|
312 | fputs ("gb-tabl: " |
---|
313 | "`--include-loci' and `--exclude-loci' may not be\n" |
---|
314 | "gb-tabl: combined or repeated\n", |
---|
315 | stderr); |
---|
316 | exit (1); |
---|
317 | } |
---|
318 | i++; |
---|
319 | index_file = argv[i]; |
---|
320 | include = 1; |
---|
321 | } |
---|
322 | else if (!strcmp (argv[i], "--exclude-loci")) |
---|
323 | { |
---|
324 | if (include != 0) |
---|
325 | { |
---|
326 | fputs ("gb-tabl: " |
---|
327 | "`--include-loci' and `--exclude-loci' may not be\n" |
---|
328 | "gb-tabl: combined or repeated\n", |
---|
329 | stderr); |
---|
330 | exit (1); |
---|
331 | } |
---|
332 | i++; |
---|
333 | index_file = argv[i]; |
---|
334 | include = -1; |
---|
335 | } |
---|
336 | else if (!strcmp (argv[i], "--gb-file")) |
---|
337 | { |
---|
338 | i++; |
---|
339 | GenBank_file = argv[i]; |
---|
340 | } |
---|
341 | else if (!strcmp (argv[i], "--locus-name-out")) |
---|
342 | { |
---|
343 | i++; |
---|
344 | locus_name_file = argv[i]; |
---|
345 | } |
---|
346 | else if (!strcmp (argv[i], "--annotation-out")) |
---|
347 | { |
---|
348 | i++; |
---|
349 | annotation_file = argv[i]; |
---|
350 | } |
---|
351 | else if (!strcmp (argv[i], "--sequence-out")) |
---|
352 | { |
---|
353 | i++; |
---|
354 | sequence_file = argv[i]; |
---|
355 | } |
---|
356 | else if (!strcmp (argv[i], "-h") || !strcmp (argv[i], "--help")) |
---|
357 | { |
---|
358 | get_help (); |
---|
359 | return 1; |
---|
360 | } |
---|
361 | else |
---|
362 | { |
---|
363 | fputs ("\nYour calling sequence is incorrect. Try gb-tabl --help\n", |
---|
364 | stderr); |
---|
365 | return 1; |
---|
366 | } |
---|
367 | } |
---|
368 | |
---|
369 | if (include != 0) |
---|
370 | read_index_file (index_file); |
---|
371 | |
---|
372 | read_GenBank (GenBank_file, locus_name_file, annotation_file, sequence_file, |
---|
373 | include); |
---|
374 | |
---|
375 | return 0; |
---|
376 | } |
---|