| 1 | #include <stdio.h> |
|---|
| 2 | #include "convert.h" |
|---|
| 3 | #include "global.h" |
|---|
| 4 | |
|---|
| 5 | #include <assert.h> |
|---|
| 6 | #include <errno.h> |
|---|
| 7 | |
|---|
| 8 | |
|---|
/* ---------------------------------------------------------------
 *   Function init_phylip().
 *       Initialize PHYLIP entry.
 *       PHYLIP output currently keeps no per-entry state, so there is
 *       nothing to reset; the function exists so that the PHYLIP
 *       converter can be initialized like the other formats.
 */
void init_phylip(void) {
    /* intentionally empty */
}
|---|
| 16 | /* --------------------------------------------------------------- |
|---|
| 17 | * Function to_phylip() |
|---|
| 18 | * Convert from some format to PHYLIP format. |
|---|
| 19 | */ |
|---|
| 20 | void to_phylip(inf, outf, informat,readstdin) |
|---|
| 21 | char *inf, *outf; |
|---|
| 22 | int informat; |
|---|
| 23 | int readstdin; |
|---|
| 24 | { |
|---|
| 25 | FILE *IFP, *ofp; |
|---|
| 26 | FILE_BUFFER ifp; |
|---|
| 27 | int maxsize, current, total_seq; |
|---|
| 28 | int out_of_memory, indi; |
|---|
| 29 | char temp[TOKENNUM], eof; |
|---|
| 30 | char *name; |
|---|
| 31 | |
|---|
| 32 | if((IFP=fopen(inf, "r"))==NULL) { |
|---|
| 33 | sprintf(temp, "Cannot open input file %s, exit\n", inf); |
|---|
| 34 | error(64, temp); |
|---|
| 35 | } |
|---|
| 36 | ifp = create_FILE_BUFFER(inf, IFP); |
|---|
| 37 | if(Lenstr(outf) <= 0) { |
|---|
| 38 | ofp = stdout; |
|---|
| 39 | assert(0); // can't use stdout (because rewind is used below) |
|---|
| 40 | error(140, "Cannot write to standard output, EXIT\n"); |
|---|
| 41 | } |
|---|
| 42 | else if((ofp=fopen(outf, "w"))==NULL) { |
|---|
| 43 | sprintf(temp, "Cannot open output file %s, exit\n", outf); |
|---|
| 44 | error(117, temp); |
|---|
| 45 | } |
|---|
| 46 | maxsize = 1; |
|---|
| 47 | out_of_memory = 0; |
|---|
| 48 | name = NULL; |
|---|
| 49 | init(); |
|---|
| 50 | init_phylip(); |
|---|
| 51 | total_seq = 0; |
|---|
| 52 | do { |
|---|
| 53 | if(informat==ALMA) { |
|---|
| 54 | init_alma(); |
|---|
| 55 | eof=alma_in(ifp); |
|---|
| 56 | } else if(informat==GENBANK) { |
|---|
| 57 | init_genbank(); |
|---|
| 58 | eof=genbank_in_locus(ifp); |
|---|
| 59 | } else if(informat==EMBL||informat==PROTEIN) { |
|---|
| 60 | init_embl(); |
|---|
| 61 | eof=embl_in_id(ifp); |
|---|
| 62 | } else if(informat==MACKE) { |
|---|
| 63 | init_macke(); |
|---|
| 64 | eof=macke_in_name(ifp); |
|---|
| 65 | } else error(34, "UNKNOW input format when converting to PHYLIP format."); |
|---|
| 66 | if(eof==EOF) break; |
|---|
| 67 | if(informat==ALMA) { |
|---|
| 68 | alma_key_word(data.alma.id, 0, temp, TOKENNUM); |
|---|
| 69 | } else if(informat==GENBANK) { |
|---|
| 70 | genbank_key_word(data.gbk.locus, 0, temp, TOKENNUM); |
|---|
| 71 | } else if(informat==EMBL||informat==PROTEIN) { |
|---|
| 72 | embl_key_word(data.embl.id, 0, temp, TOKENNUM); |
|---|
| 73 | } else if(informat==MACKE) { |
|---|
| 74 | Cpystr(temp, data.macke.seqabbr); |
|---|
| 75 | } else error(119, "UNKNOW input format when converting to PHYLIP format."); |
|---|
| 76 | total_seq++; |
|---|
| 77 | |
|---|
| 78 | if((name = Dupstr(temp))==NULL&&temp!=NULL) { out_of_memory=1; break; } |
|---|
| 79 | if(data.seq_length>maxsize) maxsize = data.seq_length; |
|---|
| 80 | |
|---|
| 81 | if (!realloc_sequence_data(total_seq)) { out_of_memory = 1; break; } |
|---|
| 82 | |
|---|
| 83 | data.ids[total_seq-1] = name; |
|---|
| 84 | data.seqs[total_seq-1] = (char*)Dupstr(data.sequence); |
|---|
| 85 | data.lengths[total_seq-1] = Lenstr(data.sequence); |
|---|
| 86 | } while(!out_of_memory); |
|---|
| 87 | |
|---|
| 88 | if(out_of_memory) { /* cannot hold all seqs into mem. */ |
|---|
| 89 | fprintf(stderr, |
|---|
| 90 | "Rerun the conversion throught one seq. by one seq. base.\n"); |
|---|
| 91 | destroy_FILE_BUFFER(ifp); fclose(ofp); |
|---|
| 92 | to_phylip_1x1(inf, outf, informat); |
|---|
| 93 | return; |
|---|
| 94 | } |
|---|
| 95 | current = 0; |
|---|
| 96 | int headersize1 = fprintf(ofp, "%8d %8d", maxsize, current); |
|---|
| 97 | if (readstdin){ |
|---|
| 98 | int c; |
|---|
| 99 | int spaced = 0; |
|---|
| 100 | while (1) { |
|---|
| 101 | c = getchar(); |
|---|
| 102 | if (c == EOF) break; /* read all from stdin now (not only one line)*/ |
|---|
| 103 | /* if (c == EOF||c=='\n') break; */ |
|---|
| 104 | if (!spaced) { |
|---|
| 105 | fputc(' ', ofp); |
|---|
| 106 | spaced = 1; |
|---|
| 107 | } |
|---|
| 108 | fputc(c,ofp); |
|---|
| 109 | } |
|---|
| 110 | |
|---|
| 111 | } |
|---|
| 112 | fprintf(ofp,"\n"); |
|---|
| 113 | |
|---|
| 114 | while(maxsize>current) { |
|---|
| 115 | for(indi=0; indi<total_seq; indi++) { |
|---|
| 116 | phylip_print_line(data.ids[indi], data.seqs[indi], data.lengths[indi], current, ofp); |
|---|
| 117 | } |
|---|
| 118 | if(current==0) current +=(SEQLINE-10); |
|---|
| 119 | else current += SEQLINE; |
|---|
| 120 | if(maxsize>current) fprintf(ofp, "\n"); |
|---|
| 121 | } |
|---|
| 122 | /* rewrite output header */ |
|---|
| 123 | errno = 0; |
|---|
| 124 | rewind(ofp); |
|---|
| 125 | assert(errno == 0); |
|---|
| 126 | if (errno) { |
|---|
| 127 | perror("rewind error"); |
|---|
| 128 | sprintf(temp, "Failed to rewind file (errno=%i), EXIT\n", errno); |
|---|
| 129 | error(141, temp); |
|---|
| 130 | } |
|---|
| 131 | |
|---|
| 132 | int headersize2 = fprintf(ofp, "%8d %8d", total_seq, maxsize); |
|---|
| 133 | |
|---|
| 134 | destroy_FILE_BUFFER(ifp); fclose(ofp); |
|---|
| 135 | |
|---|
| 136 | if (headersize1 != headersize2) { |
|---|
| 137 | sprintf(temp, "Failed to rewrite header (headersize differs: %i != %i), EXIT\n", headersize1, headersize2); |
|---|
| 138 | assert(0); |
|---|
| 139 | error(142, temp); |
|---|
| 140 | } |
|---|
| 141 | |
|---|
| 142 | #ifdef log |
|---|
| 143 | fprintf(stderr, "Total %d sequences have been processed\n", total_seq); |
|---|
| 144 | #endif |
|---|
| 145 | } |
|---|
| 146 | /* --------------------------------------------------------------- |
|---|
| 147 | * Function to_phylip_1x1() |
|---|
| 148 | * Convert from one format to PHYLIP format, one seq by one seq. |
|---|
| 149 | */ |
|---|
| 150 | void |
|---|
| 151 | to_phylip_1x1(inf, outf, informat) |
|---|
| 152 | char *inf, *outf; |
|---|
| 153 | int informat; |
|---|
| 154 | { |
|---|
| 155 | FILE *IFP, *ofp; |
|---|
| 156 | FILE_BUFFER ifp; |
|---|
| 157 | int maxsize, current, total_seq; |
|---|
| 158 | char temp[TOKENNUM], eof; |
|---|
| 159 | char *name; |
|---|
| 160 | |
|---|
| 161 | if((IFP=fopen(inf, "r"))==NULL) { |
|---|
| 162 | sprintf(temp, "Cannot open input file %s, exit\n", inf); |
|---|
| 163 | error(123, temp); |
|---|
| 164 | } |
|---|
| 165 | ifp = create_FILE_BUFFER(inf, IFP); |
|---|
| 166 | if(Lenstr(outf) <= 0) ofp = stdout; |
|---|
| 167 | else if((ofp=fopen(outf, "w"))==NULL) { |
|---|
| 168 | sprintf(temp, "Cannot open output file %s, exit\n", outf); |
|---|
| 169 | error(124, temp); |
|---|
| 170 | } |
|---|
| 171 | maxsize = 1; current = 0; |
|---|
| 172 | name = NULL; |
|---|
| 173 | fprintf(ofp, "%4d %4d\n", maxsize, current); |
|---|
| 174 | while(maxsize>current) { |
|---|
| 175 | init(); |
|---|
| 176 | FILE_BUFFER_rewind(ifp); |
|---|
| 177 | total_seq = 0; |
|---|
| 178 | do { /* read in one sequence */ |
|---|
| 179 | init_phylip(); |
|---|
| 180 | if(informat==ALMA) { |
|---|
| 181 | init_alma(); |
|---|
| 182 | eof=alma_in(ifp); |
|---|
| 183 | } else if(informat==GENBANK) { |
|---|
| 184 | init_genbank(); |
|---|
| 185 | eof=genbank_in_locus(ifp); |
|---|
| 186 | } else if(informat==EMBL||informat==PROTEIN) { |
|---|
| 187 | init_embl(); |
|---|
| 188 | eof=embl_in_id(ifp); |
|---|
| 189 | } else if(informat==MACKE) { |
|---|
| 190 | init_macke(); |
|---|
| 191 | eof=macke_in_name(ifp); |
|---|
| 192 | } else error(128, "UNKNOWN input format when converting to PHYLIP format."); |
|---|
| 193 | if(eof==EOF) break; |
|---|
| 194 | if(informat==ALMA) { |
|---|
| 195 | alma_key_word(data.alma.id, 0, temp, TOKENNUM); |
|---|
| 196 | } else if(informat==GENBANK) { |
|---|
| 197 | genbank_key_word(data.gbk.locus, 0, temp, TOKENNUM); |
|---|
| 198 | } else if(informat==EMBL||informat==PROTEIN) { |
|---|
| 199 | embl_key_word(data.embl.id, 0, temp, TOKENNUM); |
|---|
| 200 | } else if(informat==MACKE) { |
|---|
| 201 | macke_key_word(data.macke.name, 0, temp, TOKENNUM); |
|---|
| 202 | } else error(130, "UNKNOWN input format when converting to PHYLIP format."); |
|---|
| 203 | Freespace(&name); |
|---|
| 204 | name = Dupstr(temp); |
|---|
| 205 | if(data.seq_length>maxsize) maxsize = data.seq_length; |
|---|
| 206 | phylip_print_line(name, data.sequence, 0, current, ofp); |
|---|
| 207 | total_seq++; |
|---|
| 208 | } while(1); |
|---|
| 209 | if(current==0) current += (SEQLINE-10); |
|---|
| 210 | else current += SEQLINE; |
|---|
| 211 | if(maxsize>current) fprintf(ofp, "\n"); |
|---|
| 212 | } /* print block by block */ |
|---|
| 213 | |
|---|
| 214 | rewind(ofp); |
|---|
| 215 | fprintf(ofp, "%4d %4d", total_seq, maxsize); |
|---|
| 216 | |
|---|
| 217 | destroy_FILE_BUFFER(ifp); fclose(ofp); |
|---|
| 218 | |
|---|
| 219 | #ifdef log |
|---|
| 220 | fprintf(stderr, "Total %d sequences have been processed\n", total_seq); |
|---|
| 221 | #endif |
|---|
| 222 | |
|---|
| 223 | } |
|---|
| 224 | /* -------------------------------------------------------------- |
|---|
| 225 | * Function phylip_print_line(). |
|---|
| 226 | * Print phylip line. |
|---|
| 227 | */ |
|---|
| 228 | void |
|---|
| 229 | phylip_print_line(name, sequence, seq_length, index, fp) |
|---|
| 230 | char *name, *sequence; |
|---|
| 231 | int seq_length; |
|---|
| 232 | int index; |
|---|
| 233 | FILE *fp; |
|---|
| 234 | { |
|---|
| 235 | int indi, indj, length, bnum; |
|---|
| 236 | |
|---|
| 237 | if(index==0) { |
|---|
| 238 | if(Lenstr(name)>10) { |
|---|
| 239 | /* truncate id length of seq ID is greater than 10 */ |
|---|
| 240 | for(indi=0; indi<10; indi++) fputc(name[indi], fp); |
|---|
| 241 | bnum = 1; |
|---|
| 242 | } else { |
|---|
| 243 | fprintf(fp, "%s", name); |
|---|
| 244 | bnum = 10 - Lenstr(name)+1; |
|---|
| 245 | } |
|---|
| 246 | /* fill in blanks to make up 10 chars for ID. */ |
|---|
| 247 | for(indi=0; indi<bnum; indi++) fputc(' ', fp); |
|---|
| 248 | length = SEQLINE - 10; |
|---|
| 249 | } else if(index>=data.seq_length) length = 0; |
|---|
| 250 | else length = SEQLINE; |
|---|
| 251 | |
|---|
| 252 | if (seq_length == 0) seq_length = Lenstr(sequence); |
|---|
| 253 | for(indi=indj=0; indi<length; indi++) { |
|---|
| 254 | if((index+indi)<seq_length) { |
|---|
| 255 | char c= sequence[index+indi]; |
|---|
| 256 | if (c=='.') c= '?'; |
|---|
| 257 | fputc(c,fp); |
|---|
| 258 | indj++; |
|---|
| 259 | if(indj==10 && (index+indi)<(seq_length-1) && indi<(length-1)) { |
|---|
| 260 | fputc(' ', fp); |
|---|
| 261 | indj=0; |
|---|
| 262 | } |
|---|
| 263 | } else break; |
|---|
| 264 | } |
|---|
| 265 | fprintf(fp, "\n"); |
|---|
| 266 | } |
|---|