| 1 | /* fasta-tabl.c --- Converting FASTA files to tabl format |
|---|
| 2 | Jim Blandy <jimb@gnu.ai.mit.edu> --- September 1994 */ |
|---|
| 3 | |
|---|
| 4 | #include <stdio.h> |
|---|
| 5 | #include <stdlib.h> |
|---|
| 6 | #include <string.h> |
|---|
| 7 | #include <ctype.h> |
|---|
| 8 | |
|---|
| 9 | #include "xmalloc.h" |
|---|
| 10 | #include "lenstring.h" |
|---|
| 11 | #include "hash.h" |
|---|
| 12 | #include "careful.h" |
|---|
| 13 | |
|---|
/* Name this program was invoked under.  main sets it from argv[0] via
   careful_prog_name; read_FASTA and main prefix diagnostics with it. */
char *progname;
|---|
| 15 | |
|---|
| 16 | |
|---|
| 17 | /* Utilities. */ |
|---|
| 18 | |
|---|
| 19 | |
|---|
| 20 | /* Usage messages. */ |
|---|
| 21 | |
|---|
/* Print the author line, version id and option summary on stderr,
   then terminate the process with exit status 0.  Does not return. */
void
get_help ()
{
  static const char help_text[] =
    "Written by Jim Blandy <jimb@gnu.ai.mit.edu>\n"
    "$Id: fasta-tabl.c 5458 2008-07-16 15:24:20Z westram $\n"
    "\n"
    "fasta-tabl transforms one or more concatenated FASTA entries into\n"
    "key/value pairs. Calling sequence, with [defaults]:\n"
    "[ --fasta-file FILE ] ; read FASTA entries from FILE [stdin]\n"
    "[ --include-keys FILE ] ; include only keys in FILE [all]\n"
    "[ --exclude-keys FILE ] ; exclude all keys in FILE [none]\n"
    "[ --entry-name-out FILE ] ; write entry names to FILE [stdout]\n"
    "[ --annotation-out FILE ] ; write entire annotation to FILE [stdout]\n"
    "[ --sequence-out FILE ] ; write sequence data to FILE [stdout]\n"
    "[ -h | --help ] ; Displays this text\n";

  fputs (help_text, stderr);

  exit (0);
}
|---|
| 43 | |
|---|
| 44 | |
|---|
/* A hash table for use in selecting subsets, and functions to use it. */

/* The set of keys named by --include-keys / --exclude-keys.
   read_index_file allocates and fills it; present_p queries it.  It
   has no initializer here, and main only calls read_index_file when
   one of those two options was given. */
struct string_hash *index_table;
|---|
| 48 | |
|---|
| 49 | /* Read the contents of a file full of loci (one per line) into the |
|---|
| 50 | hash table. read_FASTA can then check each entry name field against the |
|---|
| 51 | hash table to decide whether to exclude/include the entry. */ |
|---|
| 52 | void |
|---|
| 53 | read_index_file (char *index_file_name) |
|---|
| 54 | { |
|---|
| 55 | FILE *index_file; |
|---|
| 56 | lenstring buf; |
|---|
| 57 | |
|---|
| 58 | index_file = careful_open (index_file_name, "r", 0); |
|---|
| 59 | index_table = new_hash_table (); |
|---|
| 60 | |
|---|
| 61 | while (read_delimited_lenstring (&buf, "\n", index_file) != EOF) |
|---|
| 62 | { |
|---|
| 63 | lookup_hash_table (index_table, buf.text, buf.len); |
|---|
| 64 | free (buf.text); |
|---|
| 65 | } |
|---|
| 66 | |
|---|
| 67 | careful_close (index_file, index_file_name); |
|---|
| 68 | } |
|---|
| 69 | |
|---|
| 70 | /* Return non-zero iff INDEX is in the hash table. */ |
|---|
| 71 | int |
|---|
| 72 | present_p (lenstring *index) |
|---|
| 73 | { |
|---|
| 74 | return lookup_hash_table_soft (index_table, index->text, index->len) != 0; |
|---|
| 75 | } |
|---|
| 76 | |
|---|
| 77 | |
|---|
| 78 | /* Dealing with FASTA files. */ |
|---|
| 79 | |
|---|
| 80 | void |
|---|
| 81 | read_FASTA (char *FASTA_filename, |
|---|
| 82 | char *entry_name_filename, |
|---|
| 83 | char *annotation_filename, |
|---|
| 84 | char *sequence_filename, |
|---|
| 85 | int include) |
|---|
| 86 | { |
|---|
| 87 | FILE *FASTA_file; |
|---|
| 88 | FILE *entry_name_file; |
|---|
| 89 | FILE *annotation_file; |
|---|
| 90 | FILE *sequence_file; |
|---|
| 91 | |
|---|
| 92 | /* The state of the input. Be careful to notice: |
|---|
| 93 | sequence lines before any header lines |
|---|
| 94 | consecutive header lines, with no intervening sequence */ |
|---|
| 95 | enum input_state { |
|---|
| 96 | top_of_file, |
|---|
| 97 | after_header, |
|---|
| 98 | after_some_sequence |
|---|
| 99 | }; |
|---|
| 100 | enum input_state input_state = top_of_file; |
|---|
| 101 | |
|---|
| 102 | /* The state of the output --- do we have an unterminated sequence |
|---|
| 103 | to finish? */ |
|---|
| 104 | int unterminated_sequence = 0; |
|---|
| 105 | |
|---|
| 106 | /* True if the current sequence is to be included. */ |
|---|
| 107 | int include_entry; |
|---|
| 108 | |
|---|
| 109 | lenstring buffer; |
|---|
| 110 | |
|---|
| 111 | FASTA_file = careful_open (FASTA_filename, "r", stdin); |
|---|
| 112 | entry_name_file = careful_open (entry_name_filename, "w+", stdout); |
|---|
| 113 | annotation_file = careful_open (annotation_filename, "w+", stdout); |
|---|
| 114 | sequence_file = careful_open (sequence_filename, "w+", stdout); |
|---|
| 115 | |
|---|
| 116 | while (read_delimited_lenstring (&buffer, "\n", FASTA_file) != EOF) |
|---|
| 117 | { |
|---|
| 118 | /* Is this a header line or a sequence line? */ |
|---|
| 119 | if (buffer.len >= 1 && buffer.text[0] == '>') |
|---|
| 120 | /* Process a header line. */ |
|---|
| 121 | { |
|---|
| 122 | char *buffer_end = buffer.text + buffer.len; |
|---|
| 123 | char *p = buffer.text + 1; |
|---|
| 124 | |
|---|
| 125 | /* A guess at a decent entry name. */ |
|---|
| 126 | lenstring entry_name; |
|---|
| 127 | |
|---|
| 128 | if (input_state == after_header) |
|---|
| 129 | { |
|---|
| 130 | /* We just had a null sequence (i.e. the line |
|---|
| 131 | immediately before this was a header line too. */ |
|---|
| 132 | fprintf (stderr, |
|---|
| 133 | "%s: %s: FASTA file has two consecutive header lines\n" |
|---|
| 134 | "%s: %s: with no sequence between them\n", |
|---|
| 135 | progname, FASTA_filename, |
|---|
| 136 | progname, FASTA_filename); |
|---|
| 137 | exit (2); |
|---|
| 138 | } |
|---|
| 139 | input_state = after_header; |
|---|
| 140 | |
|---|
| 141 | if (unterminated_sequence) |
|---|
| 142 | { |
|---|
| 143 | /* End any sequence line that came before this. */ |
|---|
| 144 | putc ('\n', sequence_file); |
|---|
| 145 | unterminated_sequence = 0; /* doesn't matter */ |
|---|
| 146 | } |
|---|
| 147 | |
|---|
| 148 | /* Skip blanks after the >. */ |
|---|
| 149 | while (p < buffer_end && isspace (*p)) |
|---|
| 150 | p++; |
|---|
| 151 | |
|---|
| 152 | /* Guess that an entry name is a string of up to ten |
|---|
| 153 | characters containing no spaces (or colons, because Gary |
|---|
| 154 | Olsen says he likes to separate the entry name from other |
|---|
| 155 | data with a colon, or commas, because ReadSeq writes |
|---|
| 156 | FASTA files with commas). */ |
|---|
| 157 | entry_name.text = p; |
|---|
| 158 | while (p < buffer_end |
|---|
| 159 | && p - entry_name.text < 10 |
|---|
| 160 | && ! isspace (*p) |
|---|
| 161 | && *p != ':' |
|---|
| 162 | && *p != ',') |
|---|
| 163 | p++; |
|---|
| 164 | entry_name.len = p - entry_name.text; |
|---|
| 165 | |
|---|
| 166 | /* Should we include this entry in the output? */ |
|---|
| 167 | include_entry = |
|---|
| 168 | (include == 0 |
|---|
| 169 | || (include == -1 && ! present_p (&entry_name)) |
|---|
| 170 | || (include == 1 && present_p (&entry_name))); |
|---|
| 171 | |
|---|
| 172 | if (include_entry) |
|---|
| 173 | { |
|---|
| 174 | write_lenstring (&entry_name, entry_name_file); |
|---|
| 175 | putc ('\n', entry_name_file); |
|---|
| 176 | |
|---|
| 177 | /* Treat the entire line as the annotation. */ |
|---|
| 178 | write_lenstring (&buffer, annotation_file); |
|---|
| 179 | putc ('\n', annotation_file); |
|---|
| 180 | putc ('\f', annotation_file); |
|---|
| 181 | |
|---|
| 182 | check_file (entry_name_file, entry_name_filename, |
|---|
| 183 | "writing FASTA entry names"); |
|---|
| 184 | check_file (annotation_file, annotation_filename, |
|---|
| 185 | "writing FASTA annotations"); |
|---|
| 186 | } |
|---|
| 187 | } |
|---|
| 188 | else |
|---|
| 189 | /* Process a sequence line. */ |
|---|
| 190 | { |
|---|
| 191 | if (input_state == top_of_file) |
|---|
| 192 | { |
|---|
| 193 | /* This is a headerless sequence. */ |
|---|
| 194 | fprintf (stderr, |
|---|
| 195 | "%s: %s: FASTA file doesn't start with a header line\n", |
|---|
| 196 | progname, FASTA_filename); |
|---|
| 197 | exit (1); |
|---|
| 198 | } |
|---|
| 199 | input_state = after_some_sequence; |
|---|
| 200 | |
|---|
| 201 | if (include_entry) |
|---|
| 202 | { |
|---|
| 203 | char *source = buffer.text; |
|---|
| 204 | char *dest = buffer.text; |
|---|
| 205 | char *source_end = buffer.text + buffer.len; |
|---|
| 206 | |
|---|
| 207 | for (; source < source_end; source++) |
|---|
| 208 | if (! isspace (*source)) |
|---|
| 209 | *dest++ = *source; |
|---|
| 210 | |
|---|
| 211 | buffer.len = dest - buffer.text; |
|---|
| 212 | write_lenstring (&buffer, sequence_file); |
|---|
| 213 | check_file (sequence_file, sequence_filename, |
|---|
| 214 | "writing FASTA sequence data"); |
|---|
| 215 | unterminated_sequence = 1; |
|---|
| 216 | } |
|---|
| 217 | } |
|---|
| 218 | |
|---|
| 219 | free (buffer.text); |
|---|
| 220 | } |
|---|
| 221 | |
|---|
| 222 | /* Finish off any sequence line we were in the midst of. */ |
|---|
| 223 | if (unterminated_sequence) |
|---|
| 224 | putc ('\n', sequence_file); |
|---|
| 225 | |
|---|
| 226 | careful_close (FASTA_file, FASTA_filename); |
|---|
| 227 | careful_close (entry_name_file, entry_name_filename); |
|---|
| 228 | careful_close (annotation_file, annotation_filename); |
|---|
| 229 | careful_close (sequence_file, sequence_filename); |
|---|
| 230 | } |
|---|
| 231 | |
|---|
| 232 | |
|---|
| 233 | |
|---|
| 234 | /* Processing command-line arguments. */ |
|---|
| 235 | |
|---|
| 236 | int |
|---|
| 237 | main (int argc, char *argv[]) |
|---|
| 238 | { |
|---|
| 239 | char *FASTA_file = NULL; /* Name of FASTA file */ |
|---|
| 240 | char *index_file = NULL; /* file of indices to extract */ |
|---|
| 241 | char include = 0; /* should exclude named indices, or include */ |
|---|
| 242 | char *entry_name_file = NULL; /* Name of output file for keys */ |
|---|
| 243 | char *annotation_file = NULL; /* name of output file for annotations */ |
|---|
| 244 | char *sequence_file = NULL; /* name of output file for sequences */ |
|---|
| 245 | int i = 0; /* counter for first for loop */ |
|---|
| 246 | |
|---|
| 247 | progname = careful_prog_name (argv[0]); |
|---|
| 248 | |
|---|
| 249 | for (i = 1; i < argc; i++) |
|---|
| 250 | { |
|---|
| 251 | if (!strcmp (argv[i], "--include-keys")) |
|---|
| 252 | { |
|---|
| 253 | if (include != 0) |
|---|
| 254 | { |
|---|
| 255 | fputs ("fasta-tabl: " |
|---|
| 256 | "`--include-keys' and `--exclude-keys' may not be\n" |
|---|
| 257 | "fasta-tabl: combined or repeated\n", |
|---|
| 258 | stderr); |
|---|
| 259 | exit (1); |
|---|
| 260 | } |
|---|
| 261 | i++; |
|---|
| 262 | index_file = argv[i]; |
|---|
| 263 | include = 1; |
|---|
| 264 | } |
|---|
| 265 | else if (!strcmp (argv[i], "--exclude-keys")) |
|---|
| 266 | { |
|---|
| 267 | if (include != 0) |
|---|
| 268 | { |
|---|
| 269 | fputs ("fasta-tabl: " |
|---|
| 270 | "`--include-keys' and `--exclude-keys' may not be\n" |
|---|
| 271 | "fasta-tabl: combined or repeated\n", |
|---|
| 272 | stderr); |
|---|
| 273 | exit (1); |
|---|
| 274 | } |
|---|
| 275 | i++; |
|---|
| 276 | index_file = argv[i]; |
|---|
| 277 | include = -1; |
|---|
| 278 | } |
|---|
| 279 | else if (!strcmp (argv[i], "--fasta-file")) |
|---|
| 280 | { |
|---|
| 281 | i++; |
|---|
| 282 | FASTA_file = argv[i]; |
|---|
| 283 | } |
|---|
| 284 | else if (!strcmp (argv[i], "--entry-name-out")) |
|---|
| 285 | { |
|---|
| 286 | i++; |
|---|
| 287 | entry_name_file = argv[i]; |
|---|
| 288 | } |
|---|
| 289 | else if (!strcmp (argv[i], "--annotation-out")) |
|---|
| 290 | { |
|---|
| 291 | i++; |
|---|
| 292 | annotation_file = argv[i]; |
|---|
| 293 | } |
|---|
| 294 | else if (!strcmp (argv[i], "--sequence-out")) |
|---|
| 295 | { |
|---|
| 296 | i++; |
|---|
| 297 | sequence_file = argv[i]; |
|---|
| 298 | } |
|---|
| 299 | else if (!strcmp (argv[i], "-h") || !strcmp (argv[i], "--help")) |
|---|
| 300 | { |
|---|
| 301 | get_help (); |
|---|
| 302 | return 1; |
|---|
| 303 | } |
|---|
| 304 | else |
|---|
| 305 | { |
|---|
| 306 | fprintf (stderr, |
|---|
| 307 | "\nYour calling sequence is incorrect. Try %s --help\n", |
|---|
| 308 | progname); |
|---|
| 309 | return 1; |
|---|
| 310 | } |
|---|
| 311 | } |
|---|
| 312 | |
|---|
| 313 | if (include != 0) |
|---|
| 314 | read_index_file (index_file); |
|---|
| 315 | |
|---|
| 316 | read_FASTA (FASTA_file, entry_name_file, annotation_file, sequence_file, |
|---|
| 317 | include); |
|---|
| 318 | |
|---|
| 319 | return 0; |
|---|
| 320 | } |
|---|