1 | |
---|
2 | #include <stdio.h> |
---|
3 | #include "convert.h" |
---|
4 | #include "global.h" |
---|
5 | |
---|
/* --------------------------------------------------------------- */
/*  Function init_phylip().
 *      Per-conversion initialization hook for the PHYLIP writer.
 *      There is currently nothing to reset, so the body is empty;
 *      the function exists so the converters in this file can call
 *      it alongside the other init routines.
 */
void
init_phylip()
{
    /* intentionally a no-op */
    return;
}
14 | /* --------------------------------------------------------------- */ |
---|
15 | /* Function to_phylip() |
---|
16 | /* Convert from some format to PHYLIP format. |
---|
17 | */ |
---|
18 | void |
---|
19 | to_phylip(inf, outf, informat,readstdin) |
---|
20 | char *inf, *outf; |
---|
21 | int informat; |
---|
22 | int readstdin; |
---|
23 | { |
---|
24 | FILE *ifp, *ofp, *fopen(); |
---|
25 | int maxsize, current, total_seq, Lenstr(); |
---|
26 | int out_of_memory, indi; |
---|
27 | int alma_key_word(); |
---|
28 | char alma_in(), genbank_in_locus(); |
---|
29 | char macke_in_name(), embl_in_id(); |
---|
30 | char temp[TOKENNUM], eof; |
---|
31 | char *Dupstr(), *today_date(), *today, *name; |
---|
32 | void init(), init_phylip(), init_seq_data(); |
---|
33 | void init_alma(), init_genbank(), init_macke(), init_embl(); |
---|
34 | void phylip_print_line(), to_phylip_1x1(); |
---|
35 | void Freespace(), error(), Cpystr(); |
---|
36 | void genbank_key_word(), embl_key_word(); |
---|
37 | |
---|
38 | if((ifp=fopen(inf, "r"))==NULL) { |
---|
39 | sprintf(temp, "Cannot open input file %s, exit\n", inf); |
---|
40 | error(64, temp); |
---|
41 | } |
---|
42 | if(Lenstr(outf)<=0) ofp = stdout; |
---|
43 | else if((ofp=fopen(outf, "w"))==NULL) { |
---|
44 | sprintf(temp, "Cannot open output file %s, exit\n", outf); |
---|
45 | error(117, temp); |
---|
46 | } |
---|
47 | maxsize = 1; |
---|
48 | out_of_memory = 0; |
---|
49 | name = NULL; |
---|
50 | init(); |
---|
51 | init_phylip(); |
---|
52 | total_seq = 0; |
---|
53 | do { |
---|
54 | if(informat==ALMA) { |
---|
55 | init_alma(); |
---|
56 | eof=alma_in(ifp); |
---|
57 | } else if(informat==GENBANK) { |
---|
58 | init_genbank(); |
---|
59 | eof=genbank_in_locus(ifp); |
---|
60 | } else if(informat==EMBL||informat==PROTEIN) { |
---|
61 | init_embl(); |
---|
62 | eof=embl_in_id(ifp); |
---|
63 | } else if(informat==MACKE) { |
---|
64 | init_macke(); |
---|
65 | eof=macke_in_name(ifp); |
---|
66 | } else error(34, "UNKNOW input format when converting to PHYLIP format."); |
---|
67 | if(eof==EOF) break; |
---|
68 | if(informat==ALMA) { |
---|
69 | alma_key_word(data.alma.id, 0, temp, TOKENNUM); |
---|
70 | } else if(informat==GENBANK) { |
---|
71 | genbank_key_word(data.gbk.locus, 0, temp, TOKENNUM); |
---|
72 | } else if(informat==EMBL||informat==PROTEIN) { |
---|
73 | embl_key_word(data.embl.id, 0, temp, TOKENNUM); |
---|
74 | } else if(informat==MACKE) { |
---|
75 | Cpystr(temp, data.macke.seqabbr); |
---|
76 | } else error(119, "UNKNOW input format when converting to PHYLIP format."); |
---|
77 | total_seq++; |
---|
78 | if((name = Dupstr(temp))==NULL&&temp!=NULL) |
---|
79 | { out_of_memory=1; break; } |
---|
80 | if(data.seq_length>maxsize) |
---|
81 | maxsize = data.seq_length; |
---|
82 | data.ids=(char**)Reallocspace((char *)data.ids, |
---|
83 | sizeof(char*)*total_seq); |
---|
84 | if(data.ids==NULL) { out_of_memory=1; break; } |
---|
85 | data.seqs=(char**)Reallocspace((char *)data.seqs, |
---|
86 | sizeof(char*)*total_seq); |
---|
87 | if(data.seqs==NULL) { out_of_memory=1; break; } |
---|
88 | data.ids[total_seq-1] = name; |
---|
89 | data.seqs[total_seq-1]=(char*)Dupstr(data.sequence); |
---|
90 | } while(!out_of_memory); |
---|
91 | |
---|
92 | if(out_of_memory) { /* cannot hold all seqs into mem. */ |
---|
93 | fprintf(stderr, |
---|
94 | "Rerun the conversion throught one seq. by one seq. base.\n"); |
---|
95 | fclose(ifp); fclose(ofp); |
---|
96 | to_phylip_1x1(inf, outf, informat); |
---|
97 | return; |
---|
98 | } |
---|
99 | current = 0; |
---|
100 | fprintf(ofp, "%4d %4d ", maxsize, current); |
---|
101 | if (readstdin){ |
---|
102 | int c; |
---|
103 | while (1){ |
---|
104 | c = getchar(); |
---|
105 | if (c == EOF) break; |
---|
106 | fputc(c,ofp); |
---|
107 | } |
---|
108 | |
---|
109 | }else{ |
---|
110 | fprintf(ofp,"\n"); |
---|
111 | } |
---|
112 | while(maxsize>current) { |
---|
113 | for(indi=0; indi<total_seq; indi++) { |
---|
114 | phylip_print_line(data.ids[indi], |
---|
115 | data.seqs[indi], current, ofp); |
---|
116 | } |
---|
117 | if(current==0) current +=(SEQLINE-10); |
---|
118 | else current += SEQLINE; |
---|
119 | if(maxsize>current) fprintf(ofp, "\n"); |
---|
120 | } |
---|
121 | /* rewrite output header */ |
---|
122 | rewind(ofp); |
---|
123 | fprintf(ofp, "%4d %4d", total_seq, maxsize); |
---|
124 | |
---|
125 | fclose(ifp); fclose(ofp); |
---|
126 | |
---|
127 | #ifdef log |
---|
128 | fprintf(stderr, "Total %d sequences have been processed\n", total_seq); |
---|
129 | #endif |
---|
130 | |
---|
131 | } |
---|
132 | /* --------------------------------------------------------------- */ |
---|
133 | /* Function to_phylip_1x1() |
---|
134 | /* Convert from one format to PHYLIP format, one seq by one seq. |
---|
135 | */ |
---|
136 | void |
---|
137 | to_phylip_1x1(inf, outf, informat) |
---|
138 | char *inf, *outf; |
---|
139 | int informat; |
---|
140 | { |
---|
141 | FILE *ifp, *ofp, *fopen(); |
---|
142 | int maxsize, current, total_seq, Lenstr(); |
---|
143 | int alma_key_word(); |
---|
144 | char temp[TOKENNUM], eof; |
---|
145 | char *Dupstr(), *name, *today_date(), *today; |
---|
146 | |
---|
147 | if((ifp=fopen(inf, "r"))==NULL) { |
---|
148 | sprintf(temp, "Cannot open input file %s, exit\n", inf); |
---|
149 | error(123, temp); |
---|
150 | } |
---|
151 | if(Lenstr(outf)<=0) ofp = stdout; |
---|
152 | else if((ofp=fopen(outf, "w"))==NULL) { |
---|
153 | sprintf(temp, "Cannot open output file %s, exit\n", outf); |
---|
154 | error(124, temp); |
---|
155 | } |
---|
156 | maxsize = 1; current = 0; |
---|
157 | name = NULL; |
---|
158 | fprintf(ofp, "%4d %4d\n", maxsize, current); |
---|
159 | while(maxsize>current) { |
---|
160 | init(); |
---|
161 | rewind(ifp); |
---|
162 | total_seq = 0; |
---|
163 | do { /* read in one sequence */ |
---|
164 | init_phylip(); |
---|
165 | if(informat==ALMA) { |
---|
166 | init_alma(); |
---|
167 | eof=alma_in(ifp); |
---|
168 | } else if(informat==GENBANK) { |
---|
169 | init_genbank(); |
---|
170 | eof=genbank_in_locus(ifp); |
---|
171 | } else if(informat==EMBL||informat==PROTEIN) { |
---|
172 | init_embl(); |
---|
173 | eof=embl_in_id(ifp); |
---|
174 | } else if(informat==MACKE) { |
---|
175 | init_macke(); |
---|
176 | eof=macke_in_name(ifp); |
---|
177 | } else error(128, "UNKNOW input format when converting to PHYLIP format."); |
---|
178 | if(eof==EOF) break; |
---|
179 | if(informat==ALMA) { |
---|
180 | alma_key_word(data.alma.id, 0, temp, TOKENNUM); |
---|
181 | } else if(informat==GENBANK) { |
---|
182 | genbank_key_word(data.gbk.locus, 0, temp, TOKENNUM); |
---|
183 | } else if(informat==EMBL||informat==PROTEIN) { |
---|
184 | embl_key_word(data.embl.id, 0, temp, TOKENNUM); |
---|
185 | } else if(informat==MACKE) { |
---|
186 | macke_key_word(data.macke.name, 0, temp, TOKENNUM); |
---|
187 | } else error(130, |
---|
188 | "UNKNOW input format when converting to PHYLIP format."); |
---|
189 | Freespace(&name); |
---|
190 | name = Dupstr(temp); |
---|
191 | if(data.seq_length>maxsize) |
---|
192 | maxsize = data.seq_length; |
---|
193 | phylip_print_line(name, data.sequence, current, ofp); |
---|
194 | total_seq++; |
---|
195 | } while(1); |
---|
196 | if(current==0) current += (SEQLINE-10); |
---|
197 | else current += SEQLINE; |
---|
198 | if(maxsize>current) fprintf(ofp, "\n"); |
---|
199 | } /* print block by block */ |
---|
200 | |
---|
201 | rewind(ofp); |
---|
202 | fprintf(ofp, "%4d %4d", total_seq, maxsize); |
---|
203 | |
---|
204 | fclose(ifp); fclose(ofp); |
---|
205 | |
---|
206 | #ifdef log |
---|
207 | fprintf(stderr, "Total %d sequences have been processed\n", total_seq); |
---|
208 | #endif |
---|
209 | |
---|
210 | } |
---|
211 | /* -------------------------------------------------------------- */ |
---|
212 | /* Function phylip_print_line(). |
---|
213 | /* Print phylip line. |
---|
214 | */ |
---|
215 | void |
---|
216 | phylip_print_line(name, sequence, index, fp) |
---|
217 | char *name, *sequence; |
---|
218 | int index; |
---|
219 | FILE *fp; |
---|
220 | { |
---|
221 | int indi, indj, length, bnum, Lenstr(), seq_length; |
---|
222 | |
---|
223 | if(index==0) { |
---|
224 | if(Lenstr(name)>10) { |
---|
225 | /* truncate id length of seq ID is greater than 10 */ |
---|
226 | for(indi=0; indi<10; indi++) |
---|
227 | fprintf(fp, "%c", name[indi]); |
---|
228 | bnum = 1; |
---|
229 | } else { |
---|
230 | fprintf(fp, "%s", name); |
---|
231 | bnum = 10 - Lenstr(name)+1; |
---|
232 | } |
---|
233 | /* fill in blanks to make up 10 chars for ID. */ |
---|
234 | for(indi=0; indi<bnum; indi++) |
---|
235 | fprintf(fp, " "); |
---|
236 | length = SEQLINE - 10; |
---|
237 | } else if(index>=data.seq_length) length = 0; |
---|
238 | else length = SEQLINE; |
---|
239 | seq_length = Lenstr(sequence); |
---|
240 | for(indi=indj=0; indi<length; indi++) { |
---|
241 | if((index+indi)<seq_length) { |
---|
242 | char c= sequence[index+indi]; |
---|
243 | if (c=='.') c= '?'; |
---|
244 | fputc(c,fp); |
---|
245 | indj++; |
---|
246 | if(indj==10&&(index+indi)<(seq_length-1) |
---|
247 | &&indi<(length-1)) { |
---|
248 | fprintf(fp, " "); |
---|
249 | indj=0; |
---|
250 | } |
---|
251 | } else break; |
---|
252 | } |
---|
253 | fprintf(fp, "\n"); |
---|
254 | } |
---|