1 | #include <stdio.h> |
---|
2 | #include "convert.h" |
---|
3 | #include "global.h" |
---|
4 | /* ---------------------------------------------------------- */ |
---|
/*
 *   Function init_pm_data().
 *       Reset the conversion records used for protein -> Macke work by
 *       calling the Macke initialiser followed by the protein one.
 */
void
init_pm_data()
{
    void init_protein();
    void init_macke();

    init_macke();
    init_protein();
}
---|
15 | /* ------------------------------------------------------------ */ |
---|
16 | /* Function init_protein(). |
---|
17 | /* Initialize protein entry. |
---|
18 | */ |
---|
19 | void |
---|
20 | init_protein() { |
---|
21 | |
---|
22 | int indi; |
---|
23 | void Freespace(); |
---|
24 | char *Dupstr(); |
---|
25 | |
---|
26 | /* initialize protein format */ |
---|
27 | Freespace(&(data.protein.id)); |
---|
28 | Freespace(&(data.protein.date)); |
---|
29 | Freespace(&(data.protein.definition)); |
---|
30 | Freespace(&(data.protein.formname)); |
---|
31 | Freespace(&(data.protein.accession)); |
---|
32 | Freespace(&(data.protein.keywords)); |
---|
33 | for(indi=0; indi<data.protein.numofref; indi++) { |
---|
34 | Freespace(&(data.protein.reference[indi].author)); |
---|
35 | Freespace(&(data.protein.reference[indi].title)); |
---|
36 | Freespace(&(data.protein.reference[indi].journal)); |
---|
37 | Freespace(&(data.protein.reference[indi].processing)); |
---|
38 | } |
---|
39 | Freespace(&(data.protein.reference)); |
---|
40 | Freespace(&(data.protein.comments)); |
---|
41 | data.protein.id=Dupstr("\n"); |
---|
42 | data.protein.date=Dupstr("\n"); |
---|
43 | data.protein.definition=Dupstr("\n"); |
---|
44 | data.protein.formname=Dupstr("\n"); |
---|
45 | data.protein.accession=Dupstr("\n"); |
---|
46 | data.protein.keywords=Dupstr("\n"); |
---|
47 | data.protein.numofref=0; |
---|
48 | data.protein.reference=NULL; |
---|
49 | data.protein.comments=Dupstr(""); |
---|
50 | } |
---|
51 | /* ---------------------------------------------------------- */ |
---|
52 | /* Function protein_to_macke(). |
---|
53 | /* Convert from Protein format to Macke format. |
---|
54 | */ |
---|
55 | void |
---|
56 | protein_to_macke(inf, outf) |
---|
57 | char *inf, *outf; |
---|
58 | { |
---|
59 | FILE *ifp, *ofp, *fopen(); |
---|
60 | char temp[TOKENNUM], protein_in(); |
---|
61 | void init(), init_pm_data(), init_seq_data(); |
---|
62 | void macke_out_header(), macke_out0(), macke_out1(), macke_out2(); |
---|
63 | void error(); |
---|
64 | int indi, ptom(), total_num; |
---|
65 | |
---|
66 | if((ifp=fopen(inf, "r"))==NULL) { |
---|
67 | sprintf(temp, "Cannot open input file %s, exit\n", inf); |
---|
68 | error(93, temp); |
---|
69 | } |
---|
70 | if(Lenstr(outf)<=0) ofp = stdout; |
---|
71 | else if((ofp=fopen(outf, "w"))==NULL) { |
---|
72 | sprintf(temp, "Cannot open output file %s, exit\n", outf); |
---|
73 | error(95, temp); |
---|
74 | } |
---|
75 | |
---|
76 | /* seq irelenvant header */ |
---|
77 | init(); |
---|
78 | macke_out_header(ofp); |
---|
79 | for(indi=0; indi<3; indi++) { |
---|
80 | FILE_BUFFER_rewind(ifp); |
---|
81 | init_seq_data(); |
---|
82 | init_pm_data(); |
---|
83 | while(protein_in(ifp)!=EOF) { |
---|
84 | data.numofseq++; |
---|
85 | if(ptom()) { |
---|
86 | /* convert from protein form to macke form */ |
---|
87 | switch(indi) { |
---|
88 | case 0: |
---|
89 | /* output seq display format */ |
---|
90 | macke_out0(ofp, PROTEIN); |
---|
91 | break; |
---|
92 | case 1: |
---|
93 | /* output seq information */ |
---|
94 | macke_out1(ofp); |
---|
95 | break; |
---|
96 | case 2: |
---|
97 | /* output seq data */ |
---|
98 | macke_out2(ofp); |
---|
99 | break; |
---|
100 | default: ; |
---|
101 | } |
---|
102 | } else error(82, |
---|
103 | "Conversion from protein to macke fails, Exit"); |
---|
104 | init_pm_data(); |
---|
105 | } |
---|
106 | total_num = data.numofseq; |
---|
107 | if(indi==0) fprintf(ofp, "#-\n"); |
---|
108 | } /* for each seq; loop */ |
---|
109 | |
---|
110 | #ifdef log |
---|
111 | fprintf(stderr, "Total %d sequences have been processed\n", total_num); |
---|
112 | #endif |
---|
113 | |
---|
114 | } |
---|
115 | /* --------------------------------------------------------------- */ |
---|
116 | /* Function protein_in(). |
---|
117 | /* Read in one protein entry. |
---|
118 | */ |
---|
119 | char |
---|
120 | protein_in(fp) |
---|
121 | FILE *fp; |
---|
122 | { |
---|
123 | char line[LINENUM], key[TOKENNUM], temp[LINENUM]; |
---|
124 | char *Fgetline(), *eof, eoen; |
---|
125 | char *protein_id(), *protein_definition(); |
---|
126 | char *protein_accession(), *protein_date(), *protein_source(); |
---|
127 | char *protein_keywords(), *protein_reference(); |
---|
128 | char *protein_author(), *protein_title(), *protein_version(); |
---|
129 | char *protein_processing(); |
---|
130 | char *protein_comments(), *protein_origin(); |
---|
131 | char *protein_skip_unidentified(); |
---|
132 | void protein_key_word(), warning(), error(); |
---|
133 | int Lenstr(); |
---|
134 | |
---|
135 | eoen=' '; /* end-of-entry, set to be 'y' after '//' is read */ |
---|
136 | for(eof=Fgetline(line, LINENUM, fp); eof!=NULL&&eoen!='y'; ) { |
---|
137 | if(Lenstr(line)<=1) { |
---|
138 | eof=Fgetline(line, LINENUM, fp); |
---|
139 | continue; /* empty line, skip */ |
---|
140 | } |
---|
141 | protein_key_word(line, 0, key, TOKENNUM); |
---|
142 | eoen='n'; |
---|
143 | if((Cmpstr(key, "ID"))==EQ) { |
---|
144 | eof = protein_id(line, fp); |
---|
145 | } else if((Cmpstr(key, "DT"))==EQ) { |
---|
146 | eof = protein_date(line, fp); |
---|
147 | } else if((Cmpstr(key, "DE"))==EQ) { |
---|
148 | eof = protein_definition(line, fp); |
---|
149 | } else if((Cmpstr(key, "OS"))==EQ) { |
---|
150 | eof = protein_source(line, fp); |
---|
151 | } else if((Cmpstr(key, "AC"))==EQ) { |
---|
152 | eof = protein_accession(line, fp); |
---|
153 | } else if((Cmpstr(key, "KW"))==EQ) { |
---|
154 | eof = protein_keywords(line, fp); |
---|
155 | } else if((Cmpstr(key, "RA"))==EQ) { |
---|
156 | eof = protein_author(line, fp); |
---|
157 | } else if((Cmpstr(key, "RP"))==EQ) { |
---|
158 | eof = protein_processing(line, fp); |
---|
159 | } else if((Cmpstr(key, "RT"))==EQ) { |
---|
160 | eof = protein_title(line, fp); |
---|
161 | } else if((Cmpstr(key, "RL"))==EQ) { |
---|
162 | eof = protein_reference(line, fp); |
---|
163 | } else if((Cmpstr(key, "RN"))==EQ) { |
---|
164 | eof = protein_version(line, fp); |
---|
165 | } else if((Cmpstr(key, "CC"))==EQ) { |
---|
166 | eof = protein_comments(line, fp); |
---|
167 | } else if((Cmpstr(key, "SQ"))==EQ) { |
---|
168 | eof = protein_origin(line, fp); |
---|
169 | eoen = 'y'; |
---|
170 | } else { /* unidentified key word */ |
---|
171 | eof = protein_skip_unidentified(key, line, fp); |
---|
172 | } |
---|
173 | /* except "SQ", at the end of all the other cases, a |
---|
174 | /* new line has already read in, so no further read is |
---|
175 | /* necessary*/ |
---|
176 | } /* for loop to read an entry line by line */ |
---|
177 | |
---|
178 | if(eoen=='n') |
---|
179 | error(42, "Reach EOF before one entry is read, Exit"); |
---|
180 | |
---|
181 | if(eof==NULL) return(EOF); |
---|
182 | else return(EOF+1); |
---|
183 | |
---|
184 | } |
---|
185 | /* --------------------------------------------------------------- */ |
---|
186 | /* Function protein_in_id(). |
---|
187 | /* Read in one protein entry with id and seq only. |
---|
188 | */ |
---|
189 | char |
---|
190 | protein_in_id(fp) |
---|
191 | FILE *fp; |
---|
192 | { |
---|
193 | char line[LINENUM], key[TOKENNUM], temp[LINENUM]; |
---|
194 | char *Fgetline(), *eof, eoen; |
---|
195 | char *protein_id(), *protein_origin(); |
---|
196 | char *protein_skip_unidentified(); |
---|
197 | void protein_key_word(), warning(), error(); |
---|
198 | int Lenstr(); |
---|
199 | |
---|
200 | eoen=' '; /* end-of-entry, set to be 'y' after '//' is read */ |
---|
201 | for(eof=Fgetline(line, LINENUM, fp); eof!=NULL&&eoen!='y'; ) { |
---|
202 | if(Lenstr(line)<=1) { |
---|
203 | eof=Fgetline(line, LINENUM, fp); |
---|
204 | continue; /* empty line, skip */ |
---|
205 | } |
---|
206 | protein_key_word(line, 0, key, TOKENNUM); |
---|
207 | eoen='n'; |
---|
208 | if((Cmpstr(key, "ID"))==EQ) { |
---|
209 | eof = protein_id(line, fp); |
---|
210 | } else if((Cmpstr(key, "SQ"))==EQ) { |
---|
211 | eof = protein_origin(line, fp); |
---|
212 | eoen = 'y'; |
---|
213 | } else { /* unidentified key word */ |
---|
214 | eof = protein_skip_unidentified(key, line, fp); |
---|
215 | } |
---|
216 | /* except "SQ", at the end of all the other cases, a |
---|
217 | /* new line has already read in, so no further read is |
---|
218 | /* necessary*/ |
---|
219 | } /* for loop to read an entry line by line */ |
---|
220 | |
---|
221 | if(eoen=='n') |
---|
222 | error(87, "Reach EOF before one entry is read, Exit"); |
---|
223 | |
---|
224 | if(eof==NULL) return(EOF); |
---|
225 | else return(EOF+1); |
---|
226 | |
---|
227 | } |
---|
228 | /* ---------------------------------------------------------------- */ |
---|
/*
 *   Function protein_key_word().
 *       Get the key word from line beginning at index.
 *
 *       line    text to scan; NULL yields an empty key.
 *       index   position in line where the key word starts.
 *       key     receives the key word, always NUL terminated.
 *       length  capacity of key in bytes, terminator included.
 *
 *       Copying stops at the first blank, tab, newline or end of
 *       string, or once key holds length-1 characters.  The limit is
 *       applied to the destination, so the terminator always lands
 *       inside key regardless of the starting index.
 */
void
protein_key_word(line, index, key, length)
char *line;
int   index;
char *key;
int   length;
{
    int src, dst;

    /* no room even for the terminator: leave key untouched */
    if (key == NULL || length <= 0) return;

    if (line == NULL) { key[0] = '\0'; return; }

    src = index;
    dst = 0;
    while (dst < length - 1
           && line[src] != ' '  && line[src] != '\t'
           && line[src] != '\n' && line[src] != '\0')
        key[dst++] = line[src++];

    key[dst] = '\0';
}
---|
248 | /* ------------------------------------------------------------ */ |
---|
/*
 *   Function protein_check_blanks().
 *       Check if there are (numb) blanks at the beginning of line.
 *
 *       A space fills one column; a tab advances to the next multiple
 *       of eight columns.  The string position and the display column
 *       are tracked separately, so a tab never causes characters to be
 *       skipped unread and the scan cannot run past the terminator.
 *
 *       Returns 1 when the first numb columns are blank, 0 otherwise
 *       (a line that ends before column numb is not blank).
 */
int
protein_check_blanks(line, numb)
char *line;
int   numb;
{
    enum { TAB_WIDTH = 8 };
    int pos, column;

    pos = 0;
    column = 0;
    while (column < numb) {
        if (line[pos] == ' ')
            column++;
        else if (line[pos] == '\t')
            column = (column / TAB_WIDTH + 1) * TAB_WIDTH;
        else
            return (0);     /* text, or end of string, before column numb */
        pos++;
    }
    return (1);
}
---|
268 | /* ---------------------------------------------------------------- */ |
---|
269 | /* Function protein_continue_line(). |
---|
270 | /* if there are (numb) blanks at the beginning of line, |
---|
271 | /* it is a continue line of the current command. |
---|
272 | */ |
---|
273 | char |
---|
274 | *protein_continue_line(pattern, string, line, fp) |
---|
275 | char *pattern, **string, *line; |
---|
276 | FILE *fp; |
---|
277 | { |
---|
278 | int Lenstr(), Cmpstr(), len, ind; |
---|
279 | int protein_check_blanks(), Skip_white_space(); |
---|
280 | char key[TOKENNUM], *eof, temp[LINENUM], *Catstr(); |
---|
281 | char *Fgetline(); |
---|
282 | void Cpystr(), protein_key_word(), Append_rp_eoln(); |
---|
283 | |
---|
284 | /* check continue lines */ |
---|
285 | for(eof=Fgetline(line, LINENUM, fp); |
---|
286 | eof!=NULL; eof=Fgetline(line, LINENUM, fp)) { |
---|
287 | if(Lenstr(line)<=1) continue; |
---|
288 | protein_key_word(line, 0, key, TOKENNUM); |
---|
289 | if(Cmpstr(pattern, key)!=EQ) break; |
---|
290 | ind=Skip_white_space(line, p_nonkey_start); |
---|
291 | Cpystr(temp, (line+ind)); |
---|
292 | Append_rp_eoln(string, temp); |
---|
293 | } /* end of continue line checking */ |
---|
294 | return(eof); |
---|
295 | } |
---|
296 | /* -------------------------------------------------------------- */ |
---|
297 | /* Function protein_id(). |
---|
298 | /* Read in protein ID lines. |
---|
299 | */ |
---|
300 | char |
---|
301 | *protein_id(line, fp) |
---|
302 | char *line; |
---|
303 | FILE *fp; |
---|
304 | { |
---|
305 | int index, Skip_white_space(), Lenstr(); |
---|
306 | char *eof, *protein_continue_line(), *Dupstr(); |
---|
307 | void error(), Freespace(); |
---|
308 | |
---|
309 | index = Skip_white_space(line, p_nonkey_start); |
---|
310 | Freespace(&(data.protein.id)); |
---|
311 | data.protein.id = Dupstr(line+index); |
---|
312 | eof = (char*)protein_continue_line("ID", &(data.protein.id), line, fp); |
---|
313 | |
---|
314 | return(eof); |
---|
315 | } |
---|
316 | /* -------------------------------------------------------------- */ |
---|
317 | /* Function protein_date(). |
---|
318 | /* Read in protein DATE lines. |
---|
319 | */ |
---|
320 | char |
---|
321 | *protein_date(line, fp) |
---|
322 | char *line; |
---|
323 | FILE *fp; |
---|
324 | { |
---|
325 | int index, Skip_white_space(), Lenstr(); |
---|
326 | char *eof, *protein_continue_line(), *Dupstr(), *dummy; |
---|
327 | void error(), Freespace(); |
---|
328 | |
---|
329 | index = Skip_white_space(line, p_nonkey_start); |
---|
330 | Freespace(&(data.protein.date)); |
---|
331 | data.protein.date = Dupstr(line+index); |
---|
332 | dummy = Dupstr(" "); |
---|
333 | eof = (char*)protein_continue_line("DT", &(dummy), line, fp); |
---|
334 | Freespace(&dummy); |
---|
335 | |
---|
336 | return(eof); |
---|
337 | } |
---|
338 | /* -------------------------------------------------------------- */ |
---|
339 | /* Function protein_source(). |
---|
340 | /* Read in protein DE lines. |
---|
341 | */ |
---|
342 | char |
---|
343 | *protein_source(line, fp) |
---|
344 | char *line; |
---|
345 | FILE *fp; |
---|
346 | { |
---|
347 | int index, Skip_white_space(); |
---|
348 | char *eof, *protein_continue_line(), *Dupstr(); |
---|
349 | void Freespace(); |
---|
350 | |
---|
351 | index = Skip_white_space(line, p_nonkey_start); |
---|
352 | Freespace(&(data.protein.formname)); |
---|
353 | data.protein.formname = Dupstr(line+index); |
---|
354 | eof = protein_continue_line("OS", &(data.protein.formname), |
---|
355 | line, fp); |
---|
356 | |
---|
357 | return(eof); |
---|
358 | } |
---|
359 | /* -------------------------------------------------------------- */ |
---|
360 | /* Function protein_definition(). |
---|
361 | /* Read in protein DE lines. |
---|
362 | */ |
---|
363 | char |
---|
364 | *protein_definition(line, fp) |
---|
365 | char *line; |
---|
366 | FILE *fp; |
---|
367 | { |
---|
368 | int index, Skip_white_space(); |
---|
369 | char *eof, *protein_continue_line(), *Dupstr(); |
---|
370 | void Freespace(); |
---|
371 | |
---|
372 | index = Skip_white_space(line, p_nonkey_start); |
---|
373 | Freespace(&(data.protein.definition)); |
---|
374 | data.protein.definition = Dupstr(line+index); |
---|
375 | eof = protein_continue_line("DE", &(data.protein.definition), line, fp); |
---|
376 | |
---|
377 | return(eof); |
---|
378 | } |
---|
379 | /* -------------------------------------------------------------- */ |
---|
380 | /* Function protein_accession(). |
---|
381 | /* Read in protein ACCESSION lines. |
---|
382 | */ |
---|
383 | char |
---|
384 | *protein_accession(line, fp) |
---|
385 | char *line; |
---|
386 | FILE *fp; |
---|
387 | { |
---|
388 | int index, Skip_white_space(); |
---|
389 | char *eof, *protein_continue_line(), *Dupstr(); |
---|
390 | void Freespace(); |
---|
391 | |
---|
392 | index = Skip_white_space(line, p_nonkey_start); |
---|
393 | Freespace(&(data.protein.accession)); |
---|
394 | data.protein.accession = Dupstr(line+index); |
---|
395 | eof = protein_continue_line("AC", &(data.protein.accession), line, fp); |
---|
396 | |
---|
397 | return(eof); |
---|
398 | } |
---|
399 | /* -------------------------------------------------------------- */ |
---|
400 | /* Function protein_processing(). |
---|
401 | /* Read in protein RP lines. |
---|
402 | */ |
---|
403 | char |
---|
404 | *protein_processing(line, fp) |
---|
405 | char *line; |
---|
406 | FILE *fp; |
---|
407 | { |
---|
408 | int index, Skip_white_space(); |
---|
409 | char *eof, *protein_continue_line(), *Dupstr(); |
---|
410 | void Freespace(); |
---|
411 | |
---|
412 | index = Skip_white_space(line, p_nonkey_start); |
---|
413 | Freespace(&(data.protein.reference[data.protein.numofref-1].processing)); |
---|
414 | data.protein.reference[data.protein.numofref-1].processing = Dupstr(line+index); |
---|
415 | eof = (char*)protein_continue_line("RP", |
---|
416 | &(data.protein.reference[data.protein.numofref-1].processing), line, fp); |
---|
417 | |
---|
418 | return(eof); |
---|
419 | } |
---|
420 | /* -------------------------------------------------------------- */ |
---|
421 | /* Function protein_keywords(). |
---|
422 | /* Read in protein KEYWORDS lines. |
---|
423 | */ |
---|
424 | char |
---|
425 | *protein_keywords(line, fp) |
---|
426 | char *line; |
---|
427 | FILE *fp; |
---|
428 | { |
---|
429 | int index, Skip_white_space(); |
---|
430 | char *eof, *protein_continue_line(), *Dupstr(); |
---|
431 | void Freespace(); |
---|
432 | |
---|
433 | index = Skip_white_space(line, p_nonkey_start); |
---|
434 | Freespace(&(data.protein.keywords)); |
---|
435 | data.protein.keywords = Dupstr(line+index); |
---|
436 | eof = (char*)protein_continue_line("KW", &(data.protein.keywords), |
---|
437 | line, fp); |
---|
438 | |
---|
439 | return(eof); |
---|
440 | } |
---|
441 | /* -------------------------------------------------------------- */ |
---|
442 | /* Function protein_author(). |
---|
443 | /* Read in protein RL lines. |
---|
444 | */ |
---|
445 | char |
---|
446 | *protein_author(line, fp) |
---|
447 | char *line; |
---|
448 | FILE *fp; |
---|
449 | { |
---|
450 | int index, Skip_white_space(); |
---|
451 | char *eof, *protein_continue_line(), *Dupstr(); |
---|
452 | void Freespace(); |
---|
453 | |
---|
454 | index = Skip_white_space(line, p_nonkey_start); |
---|
455 | Freespace(&(data.protein.reference[data.protein.numofref-1].author)); |
---|
456 | data.protein.reference[data.protein.numofref-1].author = Dupstr(line+index); |
---|
457 | eof = (char*)protein_continue_line("RA", |
---|
458 | &(data.protein.reference[data.protein.numofref-1].author), line, fp); |
---|
459 | |
---|
460 | return(eof); |
---|
461 | } |
---|
462 | /* -------------------------------------------------------------- */ |
---|
463 | /* Function protein_title(). |
---|
464 | /* Read in protein RT lines. |
---|
465 | */ |
---|
466 | char |
---|
467 | *protein_title(line, fp) |
---|
468 | char *line; |
---|
469 | FILE *fp; |
---|
470 | { |
---|
471 | int index, Skip_white_space(); |
---|
472 | char *eof, *protein_continue_line(), *Dupstr(); |
---|
473 | void Freespace(); |
---|
474 | |
---|
475 | index = Skip_white_space(line, p_nonkey_start); |
---|
476 | Freespace(&(data.protein.reference[data.protein.numofref-1].title)); |
---|
477 | data.protein.reference[data.protein.numofref-1].title = Dupstr(line+index); |
---|
478 | eof = (char*)protein_continue_line("RT", |
---|
479 | &(data.protein.reference[data.protein.numofref-1].title), line, fp); |
---|
480 | |
---|
481 | return(eof); |
---|
482 | } |
---|
483 | /* -------------------------------------------------------------- */ |
---|
484 | /* Function protein_reference(). |
---|
485 | /* Read in protein RL lines. |
---|
486 | */ |
---|
487 | char |
---|
488 | *protein_reference(line, fp) |
---|
489 | char *line; |
---|
490 | FILE *fp; |
---|
491 | { |
---|
492 | int index, Skip_white_space(); |
---|
493 | char *eof, *protein_continue_line(), *Dupstr(); |
---|
494 | void Freespace(); |
---|
495 | |
---|
496 | index = Skip_white_space(line, p_nonkey_start); |
---|
497 | Freespace(&(data.protein.reference[data.protein.numofref-1].journal)); |
---|
498 | data.protein.reference[data.protein.numofref-1].journal = Dupstr(line+index); |
---|
499 | eof = (char*)protein_continue_line("RL", |
---|
500 | &(data.protein.reference[data.protein.numofref-1].journal), line, fp); |
---|
501 | |
---|
502 | return(eof); |
---|
503 | } |
---|
504 | /* -------------------------------------------------------------- */ |
---|
505 | /* Function protein_version(). |
---|
506 | /* Read in protein RN lines. |
---|
507 | */ |
---|
508 | char |
---|
509 | *protein_version(line, fp) |
---|
510 | char *line; |
---|
511 | FILE *fp; |
---|
512 | { |
---|
513 | int index, Skip_white_space(); |
---|
514 | char *eof, *protein_continue_line(), *Dupstr(); |
---|
515 | char *Reallocspace(), *Fgetline(); |
---|
516 | void Freespace(); |
---|
517 | |
---|
518 | index = Skip_white_space(line, p_nonkey_start); |
---|
519 | if(data.protein.numofref==0) { |
---|
520 | data.protein.numofref++; |
---|
521 | data.protein.reference = (Emblref*)calloc(1,sizeof(Emblref)*1); |
---|
522 | data.protein.reference[0].author = Dupstr(""); |
---|
523 | data.protein.reference[0].title = Dupstr(""); |
---|
524 | data.protein.reference[0].journal = Dupstr(""); |
---|
525 | data.protein.reference[0].processing = Dupstr(""); |
---|
526 | } else { |
---|
527 | data.protein.numofref++; |
---|
528 | data.protein.reference = (Emblref*)Reallocspace(data.protein.reference, |
---|
529 | sizeof(Emblref)*(data.protein.numofref)); |
---|
530 | data.protein.reference[data.protein.numofref-1].author = Dupstr(""); |
---|
531 | data.protein.reference[data.protein.numofref-1].title = Dupstr(""); |
---|
532 | data.protein.reference[data.protein.numofref-1].journal = Dupstr(""); |
---|
533 | data.protein.reference[data.protein.numofref-1].processing = Dupstr(""); |
---|
534 | } |
---|
535 | eof=Fgetline(line, LINENUM, fp); |
---|
536 | return(eof); |
---|
537 | } |
---|
538 | /* -------------------------------------------------------------- */ |
---|
539 | /* Function protein_comments(). |
---|
540 | /* Read in protein comment lines. |
---|
541 | */ |
---|
542 | char |
---|
543 | *protein_comments(line, fp) |
---|
544 | char *line; |
---|
545 | FILE *fp; |
---|
546 | { |
---|
547 | int index, Skip_white_space(), len, Lenstr(); |
---|
548 | char *eof, *Fgetline(), *protein_continue_line(), *Dupstr(); |
---|
549 | void Freespace(), Append(); |
---|
550 | |
---|
551 | if(Lenstr(data.protein.comments)<=1) |
---|
552 | Freespace(&(data.protein.comments)); |
---|
553 | for(; line[0]='C'&&line[1]=='C'; eof=Fgetline(line, LINENUM, fp)) |
---|
554 | Append(&(data.protein.comments), line+5); |
---|
555 | return(eof); |
---|
556 | } |
---|
557 | /* ---------------------------------------------------------------- */ |
---|
558 | /* Function protein_skip_unidentified(). |
---|
559 | /* if there are (numb) blanks at the beginning of line, |
---|
560 | /* it is a continue line of the current command. |
---|
561 | */ |
---|
562 | char |
---|
563 | *protein_skip_unidentified(pattern, line, fp) |
---|
564 | char *pattern, *line; |
---|
565 | FILE *fp; |
---|
566 | { |
---|
567 | int Lenstr(), Cmpstr(); |
---|
568 | char *Fgetline(), *eof; |
---|
569 | char key[TOKENNUM]; |
---|
570 | void protein_key_word(); |
---|
571 | |
---|
572 | /* check continue lines */ |
---|
573 | for(eof=Fgetline(line, LINENUM, fp); |
---|
574 | eof!=NULL; eof=Fgetline(line, LINENUM, fp)) { |
---|
575 | protein_key_word(line, 0, key, TOKENNUM); |
---|
576 | if(Cmpstr(key, pattern)!=EQ) break; |
---|
577 | } /* end of continue line checking */ |
---|
578 | return(eof); |
---|
579 | } |
---|
580 | /* -------------------------------------------------------------- */ |
---|
581 | /* Function protein_origin(). |
---|
582 | /* Read in protein sequence data. |
---|
583 | */ |
---|
584 | char |
---|
585 | *protein_origin(line, fp) |
---|
586 | char *line; |
---|
587 | FILE *fp; |
---|
588 | { |
---|
589 | char *Fgetline(), *eof, *Reallocspace(); |
---|
590 | int index; |
---|
591 | |
---|
592 | data.seq_length = 0; |
---|
593 | /* read in whole sequence data */ |
---|
594 | for(eof=Fgetline(line, LINENUM, fp); |
---|
595 | eof!=NULL&&line[0]!='/'&&line[1]!='/'; |
---|
596 | eof=Fgetline(line, LINENUM, fp)) |
---|
597 | { |
---|
598 | for(index=5; line[index]!='\n'&&line[index]!='\0'; |
---|
599 | index++) { |
---|
600 | if(line[index]!=' '&&data.seq_length>=data.max) { |
---|
601 | data.max += 100; |
---|
602 | data.sequence = (char*)Reallocspace( |
---|
603 | data.sequence, |
---|
604 | (unsigned)(sizeof(char)*data.max)); |
---|
605 | } |
---|
606 | if(line[index]!=' ') |
---|
607 | data.sequence[data.seq_length++] |
---|
608 | = line[index]; |
---|
609 | } |
---|
610 | data.sequence[data.seq_length] = '\0'; |
---|
611 | } |
---|
612 | return(eof); |
---|
613 | } |
---|
614 | /* -------------------------------------------------------------- */ |
---|
615 | /* Function ptom(). |
---|
616 | /* Convert from Protein format to Macke format. |
---|
617 | */ |
---|
618 | int |
---|
619 | ptom() { |
---|
620 | void protein_key_word(), error(); |
---|
621 | int Lenstr(), indj, indk, remnum; |
---|
622 | char temp[LONGTEXT], *Dupstr(), *Reallocspace(); |
---|
623 | void Freespace(); |
---|
624 | |
---|
625 | /* copy seq abbr, assume every entry in protein must end with \n\0 */ |
---|
626 | /* no '\n' at the end of the string */ |
---|
627 | protein_key_word(data.protein.id, 0, temp, TOKENNUM); |
---|
628 | Freespace(&(data.macke.seqabbr)); |
---|
629 | data.macke.seqabbr = Dupstr(temp); |
---|
630 | /* copy name */ |
---|
631 | Freespace(&(data.macke.name)); |
---|
632 | data.macke.name = Dupstr(data.protein.formname); |
---|
633 | /* copy date---DD-MMM-YYYY\n\0 */ |
---|
634 | Freespace(&(data.macke.date)); |
---|
635 | data.macke.date = Dupstr(data.protein.date); |
---|
636 | /* copy protein entry (accession has higher priority) */ |
---|
637 | if(Lenstr(data.protein.accession)>1) { |
---|
638 | Freespace(&(data.macke.acs)); |
---|
639 | data.macke.acs = Dupstr(data.protein.accession); |
---|
640 | } |
---|
641 | if(data.protein.numofref>0) { |
---|
642 | if(Lenstr(data.protein.reference[0].journal)>1) { |
---|
643 | Freespace(&(data.macke.journal)); |
---|
644 | data.macke.journal = Dupstr(data.protein.reference[0].journal); |
---|
645 | } |
---|
646 | if(Lenstr(data.protein.reference[0].title)>1) { |
---|
647 | Freespace(&(data.macke.title)); |
---|
648 | data.macke.title = Dupstr(data.protein.reference[0].title); |
---|
649 | } |
---|
650 | if(Lenstr(data.protein.reference[0].author)>1) { |
---|
651 | Freespace(&(data.macke.author)); |
---|
652 | data.macke.author = Dupstr(data.protein.reference[0].author); |
---|
653 | } |
---|
654 | } |
---|
655 | /* the rest of data are put into remarks, rem:..... */ |
---|
656 | remnum=0; |
---|
657 | for(indj=1; indj<data.protein.numofref; indj++) { |
---|
658 | if(Lenstr(data.protein.reference[indj].journal)>1) { |
---|
659 | sprintf(temp, "jour:%s", data.protein.reference[indj].journal); |
---|
660 | data.macke.remarks = (char**)Reallocspace(data.macke.remarks, |
---|
661 | sizeof(char*)*(remnum+1)); |
---|
662 | data.macke.remarks[remnum++] = Dupstr(temp); |
---|
663 | } /* not empty */ |
---|
664 | if(Lenstr(data.protein.reference[indj].author)>1) { |
---|
665 | sprintf(temp, "auth:%s", data.protein.reference[indj].author); |
---|
666 | data.macke.remarks = (char**)Reallocspace(data.macke.remarks, |
---|
667 | sizeof(char*)*(remnum+1)); |
---|
668 | data.macke.remarks[remnum++] = Dupstr(temp); |
---|
669 | } /* not empty author field */ |
---|
670 | if(Lenstr(data.protein.reference[indj].title)>1) { |
---|
671 | sprintf(temp, "title:%s", data.protein.reference[indj].title); |
---|
672 | data.macke.remarks = (char**)Reallocspace(data.macke.remarks, |
---|
673 | sizeof(char*)*(remnum+1)); |
---|
674 | data.macke.remarks[remnum++] = Dupstr(temp); |
---|
675 | } /* not empty title field */ |
---|
676 | if(Lenstr(data.protein.reference[indj].processing)>1) { |
---|
677 | sprintf(temp, "processing:%s", data.protein.reference[indj].processing); |
---|
678 | data.macke.remarks = (char**)Reallocspace(data.macke.remarks, |
---|
679 | sizeof(char*)*(remnum+1)); |
---|
680 | data.macke.remarks[remnum++] = Dupstr(temp); |
---|
681 | } /* not empty processing field */ |
---|
682 | } /* loop for copying other reference */ |
---|
683 | /* copy keywords as remark */ |
---|
684 | if(Lenstr(data.protein.keywords)>1) { |
---|
685 | sprintf(temp, "KEYWORDS:%s", data.protein.keywords); |
---|
686 | data.macke.remarks = (char**)Reallocspace(data.macke.remarks, |
---|
687 | sizeof(char*)*(remnum+1)); |
---|
688 | data.macke.remarks[remnum++] = Dupstr(temp); |
---|
689 | } |
---|
690 | /* Maybe redudantly */ |
---|
691 | if(Lenstr(data.protein.comments)>1) { |
---|
692 | for(indj=0, indk=0; data.protein.comments[indj]!='\0'; indj++) |
---|
693 | { |
---|
694 | temp[indk++] = data.protein.comments[indj]; |
---|
695 | if(data.protein.comments[indj]=='\n') { |
---|
696 | temp[indk] = '\0'; |
---|
697 | data.macke.remarks = (char**)Reallocspace |
---|
698 | (data.macke.remarks, |
---|
699 | sizeof(char*)*(remnum+1)); |
---|
700 | data.macke.remarks[remnum++] |
---|
701 | = Dupstr(temp); |
---|
702 | indk=0; |
---|
703 | } /* new remark line */ |
---|
704 | } /* for loop to find other remarks */ |
---|
705 | } /* other comments */ |
---|
706 | data.macke.numofrem = remnum; |
---|
707 | return(1); |
---|
708 | } |
---|
709 | /* ---------------------------------------------------------- */ |
---|
710 | /* Function protein_to_genbank(). |
---|
711 | /* Convert from Protein format to genbank format. |
---|
712 | */ |
---|
713 | void |
---|
714 | protein_to_genbank(inf, outf) |
---|
715 | char *inf, *outf; |
---|
716 | { |
---|
717 | FILE *ifp, *ofp, *fopen(); |
---|
718 | char temp[TOKENNUM], protein_in(); |
---|
719 | void init(), init_genbank(), init_macke(), init_protein(); |
---|
720 | void init_seq_data(); |
---|
721 | void genbank_out(); |
---|
722 | void error(); |
---|
723 | int indi, ptom(), mtog(), total_num; |
---|
724 | |
---|
725 | if((ifp=fopen(inf, "r"))==NULL) { |
---|
726 | sprintf(temp, "Cannot open input file %s, exit\n", inf); |
---|
727 | error(94, temp); |
---|
728 | } |
---|
729 | if(Lenstr(outf)<=0) ofp = stdout; |
---|
730 | else if((ofp=fopen(outf, "w"))==NULL) { |
---|
731 | sprintf(temp, "Cannot open output file %s, exit\n", outf); |
---|
732 | error(96, temp); |
---|
733 | } |
---|
734 | |
---|
735 | /* seq irelenvant header */ |
---|
736 | init(); |
---|
737 | /* rewind(ifp); */ |
---|
738 | init_genbank(); |
---|
739 | init_macke(); |
---|
740 | init_protein(); |
---|
741 | while(protein_in(ifp)!=EOF) { |
---|
742 | data.numofseq++; |
---|
743 | if(ptom()&&mtog()) genbank_out(ofp); |
---|
744 | init_genbank(); |
---|
745 | init_macke(); |
---|
746 | init_protein(); |
---|
747 | } |
---|
748 | |
---|
749 | #ifdef log |
---|
750 | fprintf(stderr, "Total %d sequences have been processed\n", total_num); |
---|
751 | #endif |
---|
752 | |
---|
753 | } |
---|
754 | /* ---------------------------------------------------------------- */ |
---|
755 | /* Function protein_to_paup(). |
---|
756 | /* Convert from Swissprot file to paup file. |
---|
757 | */ |
---|
758 | void |
---|
759 | protein_to_paup(inf, outf) |
---|
760 | char *inf, *outf; |
---|
761 | { |
---|
762 | FILE *ifp, *ofp, *fopen(); |
---|
763 | int Lenstr(), maxsize, current, total_seq, first_line; |
---|
764 | char protein_in_id(), temp[TOKENNUM], *name; |
---|
765 | char *Dupstr(), *today_date(), *today; |
---|
766 | void init(), init_paup(), init_seq_data(), paup_print_line(); |
---|
767 | void error(), init_protein(), protein_key_word(), Freespace(); |
---|
768 | |
---|
769 | if((ifp=fopen(inf, "r"))==NULL) { |
---|
770 | sprintf(temp, "Cannot open input file %s, exit\n", inf); |
---|
771 | error(80, temp); |
---|
772 | } |
---|
773 | if(Lenstr(outf)<=0) ofp = stdout; |
---|
774 | else if((ofp=fopen(outf, "w"))==NULL) { |
---|
775 | sprintf(temp, "Cannot open output file %s, exit\n", outf); |
---|
776 | error(81, temp); |
---|
777 | } |
---|
778 | maxsize = 1; current = 0; |
---|
779 | name = NULL; |
---|
780 | init_paup(); |
---|
781 | paup_print_header(ofp); |
---|
782 | while(maxsize>current) { |
---|
783 | init(); |
---|
784 | FILE_BUFFER_rewind(ifp); |
---|
785 | total_seq = 0; |
---|
786 | /* first time read input file */ |
---|
787 | first_line = 0; |
---|
788 | while(protein_in_id(ifp)!=EOF) { |
---|
789 | Freespace(&name); |
---|
790 | protein_key_word(data.protein.id, 0, temp, TOKENNUM); |
---|
791 | name = Dupstr(temp); |
---|
792 | if(data.seq_length>maxsize) |
---|
793 | maxsize = data.seq_length; |
---|
794 | if(current<data.seq_length) first_line++; |
---|
795 | paup_print_line(name, data.sequence, current, |
---|
796 | (first_line==1), ofp); |
---|
797 | if(first_line==1) first_line++; /* avoid repeating */ |
---|
798 | init_paup(); |
---|
799 | init_protein(); |
---|
800 | total_seq++; |
---|
801 | } |
---|
802 | current += (SEQLINE - 10); |
---|
803 | if(maxsize>current) fprintf(ofp, "\n"); |
---|
804 | } /* print block by block */ |
---|
805 | fprintf(ofp, " ;\nENDBLOCK;\n"); |
---|
806 | rewind(ofp); |
---|
807 | fprintf(ofp, "#NEXUS\n"); |
---|
808 | today = today_date(); |
---|
809 | if(today[Lenstr(today)-1]=='\n') today[Lenstr(today)-1] = '\0'; |
---|
810 | fprintf(ofp, "[! RDP - the Ribsomal Database Project, (%s).]\n", today); |
---|
811 | fprintf(ofp, "[! To get started, send HELP to rdp@info.mcs.anl.gov ]\n"); |
---|
812 | fprintf(ofp, "BEGIN DATA;\n DIMENSIONS\n"); |
---|
813 | fprintf(ofp, " NTAX = %6d\n NCHAR = %6d\n ;\n", total_seq, maxsize); |
---|
814 | |
---|
815 | #ifdef log |
---|
816 | fprintf(stderr, "Total %d sequences have been processed\n", total_seq); |
---|
817 | #endif |
---|
818 | |
---|
819 | fclose(ifp); fclose(ofp); |
---|
820 | } |
---|