source: tags/initial/ALEIO/fasta-tabl.c

Last change on this file was 5458, checked in by baderk, 16 years ago

Removed .cvsignore files from repository. Hopefully this time all svn:ignore flags were set right.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 8.5 KB
Line 
1/* fasta-tabl.c --- Converting FASTA files to tabl format
2   Jim Blandy <jimb@gnu.ai.mit.edu> --- September 1994 */
3
4#include <stdio.h>
5#include <stdlib.h>
6#include <string.h>
7#include <ctype.h>
8
9#include "xmalloc.h"
10#include "lenstring.h"
11#include "hash.h"
12#include "careful.h"
13
/* Name of this program for use in diagnostics.  main sets it from
   argv[0] via careful_prog_name before anything else runs.  */
char *progname;
15
16
17/* Utilities.  */
18
19
20/* Usage messages.  */
21
/* Print the author line, version id and calling sequence on stderr,
   then terminate the program with exit status 0.  Never returns.  */
void
get_help ()
{
  /* Kept as one string per output line so the layout of the help
     text is visible in the source.  */
  static const char usage_text[] =
    "Written by Jim Blandy <jimb@gnu.ai.mit.edu>\n"
    "$Id: fasta-tabl.c 5458 2008-07-16 15:24:20Z westram $\n"
    "\n"
    "fasta-tabl transforms one or more concatenated FASTA entries into\n"
    "key/value pairs.  Calling sequence, with [defaults]:\n"
    "[ --fasta-file FILE ]            ; read FASTA entries from FILE [stdin]\n"
    "[ --include-keys FILE ]          ; include only keys in FILE [all]\n"
    "[ --exclude-keys FILE ]          ; exclude all keys in FILE [none]\n"
    "[ --entry-name-out FILE ]        ; write entry names to FILE [stdout]\n"
    "[ --annotation-out FILE ]        ; write entire annotation to FILE [stdout]\n"
    "[ --sequence-out FILE ]          ; write sequence data to FILE [stdout]\n"
    "[ -h | --help ]                  ; Displays this text\n";

  fputs (usage_text, stderr);

  exit (0);
}
43
44
/* A hash table for use in selecting subsets, and functions to use it.  */

/* The set of entry names to include or exclude.  read_index_file
   creates and fills it; present_p queries it.  It stays a null
   pointer unless read_index_file has been called, which main only
   does when --include-keys or --exclude-keys was given.  */
struct string_hash *index_table;
48
49/* Read the contents of a file full of loci (one per line) into the
50   hash table.  read_FASTA can then check each entry name field against the
51   hash table to decide whether to exclude/include the entry.  */
52void
53read_index_file (char *index_file_name)
54{
55  FILE *index_file;
56  lenstring buf;
57
58  index_file = careful_open (index_file_name, "r", 0);
59  index_table = new_hash_table ();
60
61  while (read_delimited_lenstring (&buf, "\n", index_file) != EOF)
62    {
63      lookup_hash_table (index_table, buf.text, buf.len);
64      free (buf.text);
65    }
66
67  careful_close (index_file, index_file_name);
68}
69
70/* Return non-zero iff INDEX is in the hash table.  */
71int
72present_p (lenstring *index)
73{
74  return lookup_hash_table_soft (index_table, index->text, index->len) != 0;
75}
76
77
78/* Dealing with FASTA files.  */
79
80void 
81read_FASTA (char *FASTA_filename,
82           char *entry_name_filename,
83           char *annotation_filename,
84           char *sequence_filename,
85           int include)
86{
87  FILE *FASTA_file;
88  FILE *entry_name_file;
89  FILE *annotation_file;
90  FILE *sequence_file;
91
92  /* The state of the input.  Be careful to notice:
93     sequence lines before any header lines
94     consecutive header lines, with no intervening sequence  */
95  enum input_state {
96    top_of_file,
97    after_header,
98    after_some_sequence
99  };
100  enum input_state input_state = top_of_file;
101
102  /* The state of the output --- do we have an unterminated sequence
103     to finish?  */
104  int unterminated_sequence = 0;
105
106  /* True if the current sequence is to be included.  */
107  int include_entry;
108
109  lenstring buffer;
110
111  FASTA_file      = careful_open (FASTA_filename,       "r",  stdin);
112  entry_name_file = careful_open (entry_name_filename, "w+", stdout);
113  annotation_file = careful_open (annotation_filename, "w+", stdout);
114  sequence_file   = careful_open (sequence_filename,   "w+", stdout);
115
116  while (read_delimited_lenstring (&buffer, "\n", FASTA_file) != EOF)
117    {
118      /* Is this a header line or a sequence line?  */
119      if (buffer.len >= 1 && buffer.text[0] == '>')
120        /* Process a header line.  */
121        {
122          char *buffer_end = buffer.text + buffer.len;
123          char *p = buffer.text + 1;
124
125          /* A guess at a decent entry name.  */
126          lenstring entry_name;
127
128          if (input_state == after_header)
129            {
130              /* We just had a null sequence (i.e. the line
131                 immediately before this was a header line too.  */
132              fprintf (stderr,
133                       "%s: %s: FASTA file has two consecutive header lines\n"
134                       "%s: %s: with no sequence between them\n",
135                       progname, FASTA_filename,
136                       progname, FASTA_filename);
137              exit (2);
138            }         
139          input_state = after_header;
140
141          if (unterminated_sequence)
142            {
143              /* End any sequence line that came before this.  */
144              putc ('\n', sequence_file);
145              unterminated_sequence = 0; /* doesn't matter */
146            }
147
148          /* Skip blanks after the >.  */
149          while (p < buffer_end && isspace (*p))
150            p++;
151
152          /* Guess that an entry name is a string of up to ten
153             characters containing no spaces (or colons, because Gary
154             Olsen says he likes to separate the entry name from other
155             data with a colon, or commas, because ReadSeq writes
156             FASTA files with commas).  */
157          entry_name.text = p;
158          while (p < buffer_end
159                 && p - entry_name.text < 10
160                 && ! isspace (*p)
161                 && *p != ':'
162                 && *p != ',')
163            p++;
164          entry_name.len = p - entry_name.text;
165
166          /* Should we include this entry in the output?  */
167          include_entry =
168            (include == 0
169             || (include == -1 && ! present_p (&entry_name))
170             || (include ==  1 &&   present_p (&entry_name)));
171
172          if (include_entry)
173            {
174              write_lenstring (&entry_name, entry_name_file);
175              putc ('\n', entry_name_file);
176
177              /* Treat the entire line as the annotation.  */
178              write_lenstring (&buffer, annotation_file);
179              putc ('\n', annotation_file);
180              putc ('\f', annotation_file);
181
182              check_file (entry_name_file, entry_name_filename,
183                          "writing FASTA entry names");
184              check_file (annotation_file, annotation_filename,
185                          "writing FASTA annotations");
186            }
187        }
188      else
189        /* Process a sequence line.  */
190        {
191          if (input_state == top_of_file)
192            {
193              /* This is a headerless sequence.  */
194              fprintf (stderr,
195                       "%s: %s: FASTA file doesn't start with a header line\n",
196                       progname, FASTA_filename);
197              exit (1);
198            }
199          input_state = after_some_sequence;
200             
201          if (include_entry)
202            {
203              char *source = buffer.text;
204              char *dest   = buffer.text;
205              char *source_end = buffer.text + buffer.len;
206
207              for (; source < source_end; source++)
208                if (! isspace (*source))
209                  *dest++ = *source;
210
211              buffer.len = dest - buffer.text;
212              write_lenstring (&buffer, sequence_file);
213              check_file (sequence_file, sequence_filename,
214                          "writing FASTA sequence data");
215              unterminated_sequence = 1;
216            }
217        }
218
219      free (buffer.text);
220    }
221
222  /* Finish off any sequence line we were in the midst of.  */
223  if (unterminated_sequence)
224    putc ('\n', sequence_file);
225
226  careful_close (FASTA_file, FASTA_filename);
227  careful_close (entry_name_file, entry_name_filename);
228  careful_close (annotation_file, annotation_filename);
229  careful_close (sequence_file, sequence_filename);
230}
231
232
233
234/* Processing command-line arguments.  */
235
236int
237main (int argc, char *argv[])
238{
239  char *FASTA_file = NULL;      /* Name of FASTA file */
240  char *index_file = NULL; /* file of indices to extract */
241  char include = 0;             /* should exclude named indices, or include */
242  char *entry_name_file = NULL; /* Name of output file for keys */
243  char *annotation_file = NULL; /* name of output file for annotations */
244  char *sequence_file = NULL;   /* name of output file for sequences */
245  int i = 0;                    /* counter for first for loop */
246
247  progname = careful_prog_name (argv[0]);
248
249  for (i = 1; i < argc; i++)
250    {
251      if (!strcmp (argv[i], "--include-keys"))
252        {
253          if (include != 0)
254            {
255              fputs ("fasta-tabl: "
256                     "`--include-keys' and `--exclude-keys' may not be\n"
257                     "fasta-tabl: combined or repeated\n",
258                     stderr);
259              exit (1);
260            }
261          i++;
262          index_file = argv[i];
263          include = 1;
264        }
265      else if (!strcmp (argv[i], "--exclude-keys"))
266        {
267          if (include != 0)
268            {
269              fputs ("fasta-tabl: "
270                     "`--include-keys' and `--exclude-keys' may not be\n"
271                     "fasta-tabl: combined or repeated\n",
272                     stderr);
273              exit (1);
274            }
275          i++;
276          index_file = argv[i];
277          include = -1;
278        }
279      else if (!strcmp (argv[i], "--fasta-file"))
280        {
281          i++;
282          FASTA_file = argv[i];
283        }
284      else if (!strcmp (argv[i], "--entry-name-out"))
285        {
286          i++;
287          entry_name_file = argv[i];
288        }
289      else if (!strcmp (argv[i], "--annotation-out"))
290        {
291          i++;
292          annotation_file = argv[i];
293        }
294      else if (!strcmp (argv[i], "--sequence-out"))
295        {
296          i++;
297          sequence_file = argv[i];
298        }
299      else if (!strcmp (argv[i], "-h") || !strcmp (argv[i], "--help"))
300        {
301          get_help ();
302          return 1;
303        }
304      else
305        {
306          fprintf (stderr, 
307                   "\nYour calling sequence is incorrect.  Try %s --help\n",
308                   progname);
309          return 1;
310        }
311    }
312
313  if (include != 0)
314    read_index_file (index_file);
315
316  read_FASTA (FASTA_file, entry_name_file, annotation_file, sequence_file, 
317             include);
318
319  return 0;
320}
Note: See TracBrowser for help on using the repository browser.