source: tags/initial/ALEIO/gb-tabl.c

Last change on this file was 5458, checked in by baderk, 16 years ago

Removed .cvsignore files from repository. Hopefully this time all svn:ignore flags were set right.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 9.7 KB
Line 
1#include <stdlib.h>
2#include <sys/types.h>
3#include <sys/stat.h>
4#include <stdio.h>
5#include <string.h>
6#include <ctype.h>
7#include <memory.h>
8
9extern void bcopy ();
10extern int  bcmp ();
11
12#include "xmalloc.h"
13#include "lenstring.h"
14#include "hash.h"
15#include "careful.h"
16
/* Set by main to the result of careful_prog_name (argv[0]).  Nothing
   in this file reads it; presumably the "careful" routines use it
   when reporting errors -- confirm in careful.h.  */
char *progname;
18
19typedef struct
20  {
21    lenstring locus_name;
22    lenstring annotation;
23    lenstring sequence;
24  }
25gb_entry;
26
27
28/* Utilities.  */
29
30
31/* Printing usage messages.  */
32
/* Write the credits and the option summary to stderr, then terminate
   the program with exit status 0.  This function does not return.  */
void
get_help ()
{
  /* Kept as one literal per output line so the text is easy to scan;
     adjacent literals are concatenated by the compiler.  */
  static const char usage_text[] =
    "Written by Pavel Slavin, pavel@darwin.life.uiuc.edu\n"
    "       and Jim Blandy, jimb@gnu.ai.mit.edu\n"
    "$Id: gb-tabl.c 5458 2008-07-16 15:24:20Z westram $ \n"
    "\n"
    "gb-tabl transforms one or more concatenated GenBank entries into\n"
    "locus/value pairs.  Calling sequence, with [defaults]:\n"
    "[ --gb-file FILE ]               ; read GenBank entries from FILE [stdin]\n"
    "[ --include-loci FILE ]          ; include only loci in FILE [all]\n"
    "[ --exclude-loci FILE ]          ; exclude all loci in FILE [none]\n"
    "[ --locus-name-out FILE ]        ; write locus names to FILE [stdout]\n"
    "[ --annotation-out FILE ]        ; write entire annotation to FILE [stdout]\n"
    "[ --sequence-out FILE ]          ; write sequence data to FILE [stdout]\n"
    "[ -h | --help ]                  ; Displays this text\n";

  fputs (usage_text, stderr);
  exit (0);
}
55
56
57
/* Hash table of locus names used to select a subset of the entries.
   read_index_file creates and fills it; present_p queries it.  It is
   only touched when --include-loci or --exclude-loci was given.  */

struct string_hash *index_table;
61
62/* Read the contents of a file full of loci (one per line) into the
63   hash table.  read_GenBank can then check each locus field against the
64   hash table to decide whether to exclude/include the locus's entry.  */
65void
66read_index_file (char *index_file_name)
67{
68  FILE *index_file;
69  lenstring buf;
70
71  index_file = careful_open (index_file_name, "r", 0);
72  index_table = new_hash_table ();
73
74  while (read_delimited_lenstring (&buf, "\n", index_file) != EOF)
75    {
76      lookup_hash_table (index_table, buf.text, buf.len);
77      free (buf.text);
78    }
79
80  careful_close (index_file, index_file_name);
81}
82
83/* Return non-zero iff INDEX is in the hash table.  */
84int
85present_p (lenstring *index)
86{
87  return lookup_hash_table_soft (index_table, index->text, index->len) != 0;
88}
89
90
91/* Dealing with GenBank entries.  */
92
93/* Find a line in BUFFER[BUFFER_LEN] that starts with HEADER.
94   Return its starting address.  */
95char *
96find_header (lenstring *buffer, const char *header)
97{
98  int pos = 0;
99
100  for (;;)
101    {
102      pos = search_lenstring (buffer, header, pos);
103      if (pos <= 0 || buffer->text[pos - 1] == '\n')
104        break;
105      pos++;
106    }
107
108  if (pos == -1)
109    return NULL;
110  else
111    return buffer->text + pos;
112}
113
114
115/* Convert sequence data from GenBank format to tabl format.  SEQ
116   contains the sequence data, in GenBank format.  The conversion is
117   done in-place, since tabl is always smaller than GenBank.  */
118void
119gb_to_tabl_sequence (lenstring *seq)
120{
121  char *source     = seq->text;
122  char *source_end = seq->text + seq->len;
123  char *dest = source;
124
125  do
126    {
127      char c;
128     
129      /* Skip zero or more blanks, zero or more digits, and then zero
130         or more blanks.  Consume the largest such prefix possible.  */
131      while (source < source_end
132             && (*source == ' ' || *source == '\t'))
133        source++;
134      while (source < source_end
135             && isascii (*source)
136             && isdigit (*source))
137        source++;
138      while (source < source_end
139             && (*source == ' ' || *source == '\t'))
140        source++;
141
142      /* Since we skip sections of text, we might not notice
143         terminator characters in odd places, so we check against
144         the ending address instead.  */
145      while (source < source_end && (c = *source++) != '\n')
146        {
147          if (c != ' ')
148            *dest++ = c;
149        }
150    }
151  while (source < source_end);
152
153  /* Make seq point to the area we've re-formatted.  */
154  seq->len = dest - seq->text;
155}
156
157
158/*  This function reads GenBank file ( entry-by-entry ). */
159void 
160read_GenBank (char *GenBank_filename,
161              char *locus_name_filename,
162              char *annotation_filename,
163              char *sequence_filename,
164              int include)
165{
166  gb_entry entry;         /* entry is a var of type gb_entry (see above) */
167  FILE *GenBank_file;
168  FILE *locus_name_file;
169  FILE *annotation_file;
170  FILE *sequence_file;
171
172  /* Buffer containing GenBank entry.  */
173  lenstring buffer;
174  lenstring unstripped_buffer;
175
176  GenBank_file    = careful_open (GenBank_filename,    "r", stdin);
177  locus_name_file = careful_open (locus_name_filename, "w+", stdout);
178  annotation_file = careful_open (annotation_filename, "w+", stdout);
179  sequence_file   = careful_open (sequence_filename,   "w+", stdout);
180
181  while (read_delimited_lenstring (&unstripped_buffer, "//", GenBank_file)
182         != EOF)
183    {
184      /* start of line after ORIGIN line.  */
185      char *sequence_start;
186
187      /* First address after the buffer.  */
188      char *buffer_end;
189
190      strip_newlines (&buffer, &unstripped_buffer);
191
192      /* Ignore newlines before EOF.  */
193      if (buffer.len == 0 && feof (GenBank_file))
194        {
195          free (unstripped_buffer.text);
196          break;
197        }
198
199      buffer_end = buffer.text + buffer.len;
200
201      /* sequence_start is the first line after the ORIGIN record.  */
202      sequence_start = find_header (&buffer, "ORIGIN");
203      if (sequence_start)
204        sequence_start = (char *) memchr (sequence_start, '\n',
205                                          buffer_end - sequence_start);
206      if (! sequence_start)
207        {
208          fprintf (stderr, "%s: entry lacks a correct ORIGIN line\n",
209                   GenBank_filename ? GenBank_filename : "stdin");
210          exit (1);
211        }
212
213      /* sequence_start should really sit *after* the newline.  */
214      sequence_start ++;
215
216      /* Make entry.annotation point at the annotation section of the
217         buffer.  */
218      entry.annotation.text = buffer.text;
219      entry.annotation.len  = sequence_start - buffer.text;
220
221      /* Find the locus field.  */
222      {
223        char *p;
224        char *start;
225
226        if (! (p = find_header (&entry.annotation, "LOCUS")))
227          {
228            fprintf (stderr,
229                     "%s: entry lacks a correct LOCUS line\n",
230                     GenBank_filename ? GenBank_filename : "stdin");
231            exit (1);
232          }
233
234        /* Find the name on the LOCUS line.  We assume it's the
235           first string of non-spaces after the "LOCUS" string.  */
236        p += 12;
237        while (p < buffer_end && *p != '\n' && isspace (*p))
238          p++;
239        start = p;
240        while (p < buffer_end && ! isspace (*p))
241          p++;
242        entry.locus_name.text = start;
243        entry.locus_name.len  = p - start;
244      }
245
246
247      /* Convert sequence to GenBank format.  */
248      entry.sequence.text = sequence_start;
249      entry.sequence.len  = buffer_end - sequence_start;
250      gb_to_tabl_sequence (&entry.sequence);
251
252
253      /* Write this entry's data.  */
254      {
255        lenstring *locus = &entry.locus_name;
256
257        /* If we're including or excluding, only write the appropriate
258           stuff.  */
259        if (include == 0
260            || (include == -1 && ! present_p (locus))
261            || (include ==  1 &&   present_p (locus)))
262          {
263            /* what to print out and where */
264            write_lenstring (&entry.locus_name, locus_name_file);
265            putc ('\n', locus_name_file);
266            check_file (locus_name_file, locus_name_filename,
267                        "writing GenBank locus names");
268
269            write_lenstring (&entry.annotation, annotation_file);
270            putc ('\f', annotation_file);
271            check_file (annotation_file, annotation_filename,
272                        "writing GenBank annotations");
273
274            write_lenstring (&entry.sequence, sequence_file);
275            putc ('\n', sequence_file);
276            check_file (sequence_file, sequence_filename,
277                        "writing GenBank sequence data");
278          }
279      }
280
281      free (unstripped_buffer.text);
282    }
283
284  careful_close (GenBank_file, GenBank_filename);
285  careful_close (locus_name_file, locus_name_filename);
286  careful_close (annotation_file, annotation_filename);
287  careful_close (sequence_file, sequence_filename);
288}
289
290
291/* Parsing command-line arguments.  */
292
293int
294main (int argc, char *argv[])
295{
296  char *GenBank_file = NULL;    /* Name of gen_bank file */
297  char *index_file = NULL; /* file of indices to extract */
298  char include = 0;             /* should exclude named indices, or include */
299  char *locus_name_file = NULL; /* Name of output file for loci */
300  char *annotation_file = NULL; /* name of output file for annotations */
301  char *sequence_file = NULL;   /* name of output file for sequences */
302  int i = 0;                    /* counter for first for loop */
303
304  progname = careful_prog_name (argv[0]);
305
306  for (i = 1; i < argc; i++)
307    {
308      if (!strcmp (argv[i], "--include-loci"))
309        {
310          if (include != 0)
311            {
312              fputs ("gb-tabl: "
313                     "`--include-loci' and `--exclude-loci' may not be\n"
314                     "gb-tabl: combined or repeated\n",
315                     stderr);
316              exit (1);
317            }
318          i++;
319          index_file = argv[i];
320          include = 1;
321        }
322      else if (!strcmp (argv[i], "--exclude-loci"))
323        {
324          if (include != 0)
325            {
326              fputs ("gb-tabl: "
327                     "`--include-loci' and `--exclude-loci' may not be\n"
328                     "gb-tabl: combined or repeated\n",
329                     stderr);
330              exit (1);
331            }
332          i++;
333          index_file = argv[i];
334          include = -1;
335        }
336      else if (!strcmp (argv[i], "--gb-file"))
337        {
338          i++;
339          GenBank_file = argv[i];
340        }
341      else if (!strcmp (argv[i], "--locus-name-out"))
342        {
343          i++;
344          locus_name_file = argv[i];
345        }
346      else if (!strcmp (argv[i], "--annotation-out"))
347        {
348          i++;
349          annotation_file = argv[i];
350        }
351      else if (!strcmp (argv[i], "--sequence-out"))
352        {
353          i++;
354          sequence_file = argv[i];
355        }
356      else if (!strcmp (argv[i], "-h") || !strcmp (argv[i], "--help"))
357        {
358          get_help ();
359          return 1;
360        }
361      else
362        {
363          fputs ("\nYour calling sequence is incorrect.  Try gb-tabl --help\n",
364                 stderr);
365          return 1;
366        }
367    }
368
369  if (include != 0)
370    read_index_file (index_file);
371
372  read_GenBank (GenBank_file, locus_name_file, annotation_file, sequence_file, 
373                include);
374
375  return 0;
376}
Note: See TracBrowser for help on using the repository browser.