1 | |
---|
2 | #include <stdio.h> |
---|
3 | #include "convert.h" |
---|
4 | #include "global.h" |
---|
5 | |
---|
6 | /* ------------------------------------------------------------ */ |
---|
7 | /* Function init_paup(). |
---|
8 | /* Init. paup data. |
---|
9 | */ |
---|
10 | void |
---|
11 | init_paup() { |
---|
12 | |
---|
13 | int indi; |
---|
14 | void Freespace(); |
---|
15 | |
---|
16 | for(indi=0; indi<data.paup.ntax; indi++) { |
---|
17 | Freespace(&(data.ids[indi])); |
---|
18 | Freespace(&(data.seqs[indi])); |
---|
19 | } |
---|
20 | Freespace(&(data.ids)); |
---|
21 | Freespace(&(data.seqs)); |
---|
22 | data.paup.ntax = 0; |
---|
23 | data.paup.nchar = 0; |
---|
24 | } |
---|
25 | /* ------------------------------------------------------------- */ |
---|
26 | /* Function to_paup() |
---|
27 | /* Convert from some format to PAUP format. |
---|
28 | */ |
---|
29 | void |
---|
30 | to_paup(inf, outf, informat) |
---|
31 | char *inf, *outf; |
---|
32 | int informat; |
---|
33 | { |
---|
34 | FILE *ifp, *ofp, *fopen(); |
---|
35 | int maxsize, current, total_seq, Lenstr(), first_line; |
---|
36 | int out_of_memory, indi; |
---|
37 | int alma_key_word(); |
---|
38 | char alma_in(), genbank_in_locus(); |
---|
39 | char macke_in_name(), embl_in_id(); |
---|
40 | char temp[TOKENNUM], eof; |
---|
41 | char *Dupstr(), *name, *today_date(), *today; |
---|
42 | void init(), init_paup(), init_seq_data(); |
---|
43 | void init_alma(), init_genbank(); |
---|
44 | void init_macke(), init_embl(); |
---|
45 | void paup_print_line(), to_paup_1x1(); |
---|
46 | void error(), Cpystr(); |
---|
47 | void genbank_key_word(), embl_key_word(); |
---|
48 | void paup_verify_name(), paup_print_header(); |
---|
49 | |
---|
50 | if((ifp=fopen(inf, "r"))==NULL) { |
---|
51 | sprintf(temp, |
---|
52 | "Cannot open input file %s, exit\n", inf); |
---|
53 | error(64, temp); |
---|
54 | } |
---|
55 | if(Lenstr(outf)<=0) ofp = stdout; |
---|
56 | else if((ofp=fopen(outf, "w"))==NULL) { |
---|
57 | sprintf(temp, |
---|
58 | "Cannot open output file %s, exit\n", outf); |
---|
59 | error(65, temp); |
---|
60 | } |
---|
61 | maxsize = 1; |
---|
62 | out_of_memory = 0; |
---|
63 | name = NULL; |
---|
64 | init(); |
---|
65 | init_paup(); |
---|
66 | paup_print_header(ofp); |
---|
67 | total_seq = 0; |
---|
68 | do { |
---|
69 | if(informat==ALMA) { |
---|
70 | init_alma(); |
---|
71 | eof=alma_in(ifp); |
---|
72 | } else if(informat==GENBANK) { |
---|
73 | init_genbank(); |
---|
74 | eof=genbank_in_locus(ifp); |
---|
75 | } else if(informat==EMBL||informat==PROTEIN) { |
---|
76 | init_embl(); |
---|
77 | eof=embl_in_id(ifp); |
---|
78 | } else if(informat==MACKE) { |
---|
79 | init_macke(); |
---|
80 | eof=macke_in_name(ifp); |
---|
81 | } else error(63, |
---|
82 | "UNKNOW input format when converting to PAUP format."); |
---|
83 | if(eof==EOF) break; |
---|
84 | if(informat==ALMA) { |
---|
85 | alma_key_word |
---|
86 | (data.alma.id, 0, temp, TOKENNUM); |
---|
87 | } else if(informat==GENBANK) { |
---|
88 | genbank_key_word |
---|
89 | (data.gbk.locus, 0, temp, TOKENNUM); |
---|
90 | } else if(informat==EMBL||informat==PROTEIN) { |
---|
91 | embl_key_word |
---|
92 | (data.embl.id, 0, temp, TOKENNUM); |
---|
93 | } else if(informat==MACKE) { |
---|
94 | Cpystr(temp, data.macke.seqabbr); |
---|
95 | } else error(118, |
---|
96 | "UNKNOW input format when converting to PAUP format."); |
---|
97 | |
---|
98 | total_seq++; |
---|
99 | |
---|
100 | if((name = Dupstr(temp))==NULL&&temp!=NULL) |
---|
101 | { out_of_memory=1; break; } |
---|
102 | paup_verify_name(&name); |
---|
103 | |
---|
104 | if(data.seq_length>maxsize) |
---|
105 | maxsize = data.seq_length; |
---|
106 | data.ids=(char**)Reallocspace((char *)data.ids, |
---|
107 | sizeof(char*)*total_seq); |
---|
108 | |
---|
109 | if(data.ids==NULL) { out_of_memory=1; break; } |
---|
110 | data.seqs=(char**)Reallocspace((char *)data.seqs, |
---|
111 | sizeof(char*)*total_seq); |
---|
112 | |
---|
113 | if(data.seqs==NULL) { out_of_memory=1; break; } |
---|
114 | |
---|
115 | data.ids[total_seq-1]=name; |
---|
116 | data.seqs[total_seq-1]=(char*)Dupstr(data.sequence); |
---|
117 | |
---|
118 | } while(!out_of_memory); |
---|
119 | |
---|
120 | if(out_of_memory) { |
---|
121 | /* cannot hold all seqs into mem. */ |
---|
122 | fprintf(stderr, |
---|
123 | "Rerun the conversion throught one seq. by one seq. base.\n"); |
---|
124 | |
---|
125 | fclose(ifp); fclose(ofp); |
---|
126 | to_paup_1x1(inf, outf, informat); |
---|
127 | return; |
---|
128 | } |
---|
129 | current = 0; |
---|
130 | while(maxsize>current) { |
---|
131 | first_line = 0; |
---|
132 | for(indi=0; indi<total_seq; indi++) { |
---|
133 | if(current<Lenstr(data.seqs[indi])) |
---|
134 | first_line++; |
---|
135 | paup_print_line(data.ids[indi], |
---|
136 | data.seqs[indi], current, |
---|
137 | (first_line==1), ofp); |
---|
138 | |
---|
139 | /* Avoid repeating */ |
---|
140 | if(first_line==1) first_line++; |
---|
141 | } |
---|
142 | current +=(SEQLINE-10); |
---|
143 | if(maxsize>current) fprintf(ofp, "\n"); |
---|
144 | } |
---|
145 | |
---|
146 | #ifdef log |
---|
147 | fprintf(stderr, "Total %d sequences have been processed\n", total_seq); |
---|
148 | #endif |
---|
149 | |
---|
150 | fprintf(ofp, " ;\nENDBLOCK;\n"); |
---|
151 | /* rewrite output header */ |
---|
152 | rewind(ofp); |
---|
153 | fprintf(ofp, "#NEXUS\n"); |
---|
154 | today = today_date(); |
---|
155 | if(today[Lenstr(today)-1]=='\n') |
---|
156 | today[Lenstr(today)-1] = '\0'; |
---|
157 | |
---|
158 | fprintf(ofp, |
---|
159 | "[! RDP - the Ribsomal Database Project, (%s).]\n", today); |
---|
160 | |
---|
161 | fprintf(ofp, |
---|
162 | "[! To get started, send HELP to rdp@info.mcs.anl.gov ]\n"); |
---|
163 | |
---|
164 | fprintf(ofp, "BEGIN DATA;\n DIMENSIONS\n"); |
---|
165 | fprintf(ofp, |
---|
166 | " NTAX = %6d\n NCHAR = %6d\n ;\n", |
---|
167 | total_seq, maxsize); |
---|
168 | |
---|
169 | fclose(ifp); fclose(ofp); |
---|
170 | } |
---|
171 | /* --------------------------------------------------------------- */ |
---|
172 | /* Function to_paup_1x1() |
---|
173 | /* Convert from ALMA format to PAUP format, |
---|
174 | /* one seq by one seq. |
---|
175 | */ |
---|
176 | void |
---|
177 | to_paup_1x1(inf, outf, informat) |
---|
178 | char *inf, *outf; |
---|
179 | int informat; |
---|
180 | { |
---|
181 | FILE *ifp, *ofp, *fopen(); |
---|
182 | int maxsize, current, total_seq, Lenstr(), first_line; |
---|
183 | int alma_key_word(); |
---|
184 | char temp[TOKENNUM], eof; |
---|
185 | char *Dupstr(), *name, *today_date(), *today; |
---|
186 | |
---|
187 | if((ifp=fopen(inf, "r"))==NULL) { |
---|
188 | sprintf(temp, |
---|
189 | "Cannot open input file %s, exit\n", inf); |
---|
190 | error(121, temp); |
---|
191 | } |
---|
192 | if(Lenstr(outf)<=0) ofp = stdout; |
---|
193 | else if((ofp=fopen(outf, "w"))==NULL) { |
---|
194 | sprintf(temp, |
---|
195 | "Cannot open output file %s, exit\n", outf); |
---|
196 | error(122, temp); |
---|
197 | } |
---|
198 | maxsize = 1; current = 0; |
---|
199 | name = NULL; |
---|
200 | paup_print_header(ofp); |
---|
201 | while(maxsize>current) { |
---|
202 | init(); |
---|
203 | rewind(ifp); |
---|
204 | total_seq = 0; |
---|
205 | first_line = 0; |
---|
206 | do { /* read in one sequence */ |
---|
207 | init_paup(); |
---|
208 | if(informat==ALMA) { |
---|
209 | init_alma(); |
---|
210 | eof=alma_in(ifp); |
---|
211 | } else if(informat==GENBANK) { |
---|
212 | init_genbank(); |
---|
213 | eof=genbank_in_locus(ifp); |
---|
214 | } else if(informat==EMBL||informat==PROTEIN) { |
---|
215 | init_embl(); |
---|
216 | eof=embl_in_id(ifp); |
---|
217 | } else if(informat==MACKE) { |
---|
218 | init_macke(); |
---|
219 | eof=macke_in_name(ifp); |
---|
220 | } else error(127, |
---|
221 | "UNKNOW input format when converting to PAUP format."); |
---|
222 | |
---|
223 | if(eof==EOF) break; |
---|
224 | if(informat==ALMA) { |
---|
225 | alma_key_word |
---|
226 | (data.alma.id, 0, temp, TOKENNUM); |
---|
227 | } else if(informat==GENBANK) { |
---|
228 | genbank_key_word |
---|
229 | (data.gbk.locus, 0, temp, TOKENNUM); |
---|
230 | } else if(informat==EMBL||informat==PROTEIN) { |
---|
231 | embl_key_word |
---|
232 | (data.embl.id, 0, temp, TOKENNUM); |
---|
233 | } else if(informat==MACKE) { |
---|
234 | macke_key_word |
---|
235 | (data.macke.name, 0, temp, TOKENNUM); |
---|
236 | } else error(70, |
---|
237 | "UNKNOW input format when converting to PAUP format."); |
---|
238 | |
---|
239 | Freespace(&name); |
---|
240 | name = Dupstr(temp); |
---|
241 | paup_verify_name(&name); |
---|
242 | |
---|
243 | if(data.seq_length>maxsize) |
---|
244 | maxsize = data.seq_length; |
---|
245 | |
---|
246 | if(current<data.seq_length) first_line++; |
---|
247 | |
---|
248 | paup_print_line(name, data.sequence, current, |
---|
249 | (first_line==1), ofp); |
---|
250 | |
---|
251 | /* Avoid repeating */ |
---|
252 | if(first_line==1) first_line++; |
---|
253 | |
---|
254 | total_seq++; |
---|
255 | |
---|
256 | } while(1); |
---|
257 | current += (SEQLINE-10); |
---|
258 | if(maxsize>current) fprintf(ofp, "\n"); |
---|
259 | } /* print block by block */ |
---|
260 | |
---|
261 | #ifdef log |
---|
262 | fprintf(stderr, "Total %d sequences have been processed\n", total_seq); |
---|
263 | #endif |
---|
264 | |
---|
265 | fprintf(ofp, " ;\nENDBLOCK;\n"); |
---|
266 | /* rewrite output header */ |
---|
267 | rewind(ofp); |
---|
268 | fprintf(ofp, "#NEXUS\n"); |
---|
269 | today = today_date(); |
---|
270 | if(today[Lenstr(today)-1]=='\n') today[Lenstr(today)-1] = '\0'; |
---|
271 | |
---|
272 | fprintf(ofp, |
---|
273 | "[! RDP - the Ribsomal Database Project, (%s).]\n", today); |
---|
274 | |
---|
275 | fprintf(ofp, |
---|
276 | "[! To get started, send HELP to rdp@info.mcs.anl.gov ]\n"); |
---|
277 | |
---|
278 | fprintf(ofp, "BEGIN DATA;\n DIMENSIONS\n"); |
---|
279 | fprintf(ofp, |
---|
280 | " NTAX = %6d\n NCHAR = %6d\n ;\n", |
---|
281 | total_seq, maxsize); |
---|
282 | |
---|
283 | fclose(ifp); fclose(ofp); |
---|
284 | } |
---|
285 | /* ----------------------------------------------------------- */ |
---|
286 | /* Function paup_verify_name(). |
---|
287 | /* Verify short_id in PUAP format. |
---|
288 | */ |
---|
289 | void |
---|
290 | paup_verify_name(string) |
---|
291 | char **string; |
---|
292 | { |
---|
293 | int indi, len, Lenstr(), index; |
---|
294 | char temp[TOKENNUM], *Dupstr(); |
---|
295 | void Freespace(); |
---|
296 | |
---|
297 | for(indi=index=0,len=Lenstr((*string)); indi<len&&index==0; |
---|
298 | indi++) |
---|
299 | if((*string)[indi]=='*'||(*string)[indi]=='(' |
---|
300 | ||(*string)[indi]==')'||(*string)[indi]=='{' |
---|
301 | ||(*string)[indi]=='/'||(*string)[indi]==',' |
---|
302 | ||(*string)[indi]==';'||(*string)[indi]=='_' |
---|
303 | ||(*string)[indi]=='='||(*string)[indi]==':' |
---|
304 | ||(*string)[indi]=='\\'||(*string)[indi]=='\'') |
---|
305 | index=1; |
---|
306 | |
---|
307 | if(index==0) return; |
---|
308 | else { |
---|
309 | temp[0]='\''; |
---|
310 | for(indi=0, index=1; indi<len; indi++, index++) { |
---|
311 | temp[index]=(*string)[indi]; |
---|
312 | if((*string)[indi]=='\'') |
---|
313 | temp[++index]='\''; |
---|
314 | } |
---|
315 | temp[index++]='\''; |
---|
316 | temp[index]='\0'; |
---|
317 | Freespace(string); |
---|
318 | (*string)=(char*)Dupstr(temp); |
---|
319 | } |
---|
320 | } |
---|
321 | /* -------------------------------------------------------------- */ |
---|
322 | /* Function paup_print_line(). |
---|
323 | /* print paup file. |
---|
324 | */ |
---|
325 | void |
---|
326 | paup_print_line(string, sequence, index, first_line, fp) |
---|
327 | char *string, *sequence; |
---|
328 | int index, first_line; |
---|
329 | FILE *fp; |
---|
330 | { |
---|
331 | int indi, indj, bnum, length, Lenstr(), seq_length; |
---|
332 | |
---|
333 | length = SEQLINE-10; |
---|
334 | fprintf(fp, " "); |
---|
335 | if(Lenstr(string)>10) { |
---|
336 | /* truncate if length of seq ID is greater than 10 */ |
---|
337 | for(indi=0; indi<10; indi++) |
---|
338 | fprintf(fp, "%c", string[indi]); |
---|
339 | bnum = 1; |
---|
340 | } else { |
---|
341 | fprintf(fp, "%s", string); |
---|
342 | bnum = 10 - Lenstr(string)+1; |
---|
343 | } |
---|
344 | /* Print out blanks after sequence ID to make up 10 chars. */ |
---|
345 | seq_length = Lenstr(sequence); |
---|
346 | |
---|
347 | if(index<seq_length) |
---|
348 | for(indi=0; indi<bnum; indi++) |
---|
349 | fprintf(fp, " "); |
---|
350 | |
---|
351 | for(indi=indj=0; indi<length; indi++) { |
---|
352 | if((index+indi)<seq_length) { |
---|
353 | |
---|
354 | fprintf(fp, "%c", sequence[index+indi]); |
---|
355 | indj++; |
---|
356 | if(indj==10&&indi<(length-1) |
---|
357 | &&(indi+index)<(seq_length-1)) { |
---|
358 | fprintf(fp, " "); |
---|
359 | indj=0; |
---|
360 | } |
---|
361 | } else break; |
---|
362 | } |
---|
363 | |
---|
364 | if(first_line) |
---|
365 | fprintf(fp, " [%d - %d]", index+1, (index+indi)); |
---|
366 | |
---|
367 | fprintf(fp, "\n"); |
---|
368 | } |
---|
369 | /* ---------------------------------------------------------- */ |
---|
370 | /* Function paup_print_header(). |
---|
371 | /* Print out the header of each paup format. |
---|
372 | */ |
---|
373 | void |
---|
374 | paup_print_header(ofp) |
---|
375 | FILE *ofp; |
---|
376 | { |
---|
377 | char *today_date(), *today; |
---|
378 | |
---|
379 | fprintf(ofp, "#NEXUS\n"); |
---|
380 | today = today_date(); |
---|
381 | if(today[Lenstr(today)-1]=='\n') |
---|
382 | today[Lenstr(today)-1] = '\0'; |
---|
383 | |
---|
384 | fprintf(ofp, |
---|
385 | "[! RDP - the Ribsomal Database Project, (%s).]\n", |
---|
386 | today); |
---|
387 | |
---|
388 | fprintf(ofp, |
---|
389 | "[! To get started, send HELP to rdp@info.mcs.anl.gov ]\n"); |
---|
390 | |
---|
391 | fprintf(ofp, "BEGIN DATA;\n DIMENSIONS\n"); |
---|
392 | |
---|
393 | fprintf(ofp, |
---|
394 | " NTAX = \n NCHAR = \n ;\n"); |
---|
395 | |
---|
396 | fprintf(ofp, " FORMAT\n LABELPOS = LEFT\n"); |
---|
397 | |
---|
398 | fprintf(ofp, |
---|
399 | " MISSING = .\n EQUATE = \"%s\"\n", |
---|
400 | data.paup.equate); |
---|
401 | |
---|
402 | fprintf(ofp, |
---|
403 | " INTERLEAVE\n DATATYPE = RNA\n GAP = %c\n ;\n", |
---|
404 | data.paup.gap); |
---|
405 | |
---|
406 | fprintf(ofp, |
---|
407 | " OPTIONS\n GAPMODE = MISSING\n ;\n MATRIX\n"); |
---|
408 | } |
---|