1 | #include <stdio.h> |
---|
2 | #include "convert.h" |
---|
3 | #include "global.h" |
---|
4 | |
---|
5 | #define PRTLENGTH 62 |
---|
6 | |
---|
7 | /* --------------------------------------------------------------- |
---|
8 | * Function to_printable() |
---|
9 | * Convert from some format to PRINTABLE format. |
---|
10 | */ |
---|
11 | void |
---|
12 | to_printable(inf, outf, informat) |
---|
13 | char *inf, *outf; |
---|
14 | int informat; |
---|
15 | { |
---|
16 | FILE *IFP, *ofp; |
---|
17 | FILE_BUFFER ifp; |
---|
18 | int maxsize, current, total_seq, length; |
---|
19 | int out_of_memory, indi, index, *base_nums, base_count, start; |
---|
20 | char temp[TOKENNUM], eof; |
---|
21 | char *name; |
---|
22 | |
---|
23 | if((IFP=fopen(inf, "r"))==NULL) { |
---|
24 | sprintf(temp, "Cannot open input file %s, exit\n", inf); |
---|
25 | error(64, temp); |
---|
26 | } |
---|
27 | ifp = create_FILE_BUFFER(inf, IFP); |
---|
28 | if(Lenstr(outf) <= 0) ofp = stdout; |
---|
29 | else if((ofp=fopen(outf, "w"))==NULL) { |
---|
30 | sprintf(temp, "Cannot open output file %s, exit\n", outf); |
---|
31 | error(117, temp); |
---|
32 | } |
---|
33 | maxsize = 1; |
---|
34 | out_of_memory = 0; |
---|
35 | name = NULL; |
---|
36 | init(); |
---|
37 | total_seq = 0; |
---|
38 | base_nums=NULL; |
---|
39 | do { |
---|
40 | if(informat==ALMA) { |
---|
41 | init_alma(); |
---|
42 | eof=alma_in(ifp); |
---|
43 | } else if(informat==GENBANK) { |
---|
44 | init_genbank(); |
---|
45 | eof=genbank_in_locus(ifp); |
---|
46 | } else if(informat==EMBL||informat==PROTEIN) { |
---|
47 | init_embl(); |
---|
48 | eof=embl_in_id(ifp); |
---|
49 | } else if(informat==MACKE) { |
---|
50 | init_macke(); |
---|
51 | eof=macke_in_name(ifp); |
---|
52 | } else error(48, |
---|
53 | "UNKNOW input format when converting to PRINABLE format."); |
---|
54 | if(eof==EOF) break; |
---|
55 | if(informat==ALMA) { |
---|
56 | alma_key_word(data.alma.id, 0, temp, TOKENNUM); |
---|
57 | } else if(informat==GENBANK) { |
---|
58 | genbank_key_word(data.gbk.locus, 0, temp, TOKENNUM); |
---|
59 | } else if(informat==EMBL||informat==PROTEIN) { |
---|
60 | embl_key_word(data.embl.id, 0, temp, TOKENNUM); |
---|
61 | } else if(informat==MACKE) { |
---|
62 | Cpystr(temp, data.macke.seqabbr); |
---|
63 | } else error(120, |
---|
64 | "UNKNOW input format when converting to PRINABLE format."); |
---|
65 | total_seq++; |
---|
66 | |
---|
67 | if((name = Dupstr(temp))==NULL&&temp!=NULL) { out_of_memory=1; break; } |
---|
68 | if(data.seq_length>maxsize) maxsize = data.seq_length; |
---|
69 | |
---|
70 | if (!realloc_sequence_data(total_seq)) { out_of_memory = 1; break; } |
---|
71 | |
---|
72 | base_nums=(int*)Reallocspace((char *)base_nums, sizeof(int)*total_seq); |
---|
73 | if(base_nums==NULL) { out_of_memory=1; break; } |
---|
74 | |
---|
75 | data.ids[total_seq-1] = name; |
---|
76 | data.seqs[total_seq-1] = (char*)Dupstr(data.sequence); |
---|
77 | data.lengths[total_seq-1] = Lenstr(data.sequence); |
---|
78 | base_nums[total_seq-1] = 0; |
---|
79 | |
---|
80 | } while(!out_of_memory); |
---|
81 | |
---|
82 | if(out_of_memory) { /* cannot hold all seqs into mem. */ |
---|
83 | fprintf(stderr, |
---|
84 | "Rerun the conversion throught one seq. by one seq. base.\n"); |
---|
85 | destroy_FILE_BUFFER(ifp); fclose(ofp); |
---|
86 | to_printable_1x1(inf, outf, informat); |
---|
87 | return; |
---|
88 | } |
---|
89 | current = 0; |
---|
90 | while(maxsize>current) { |
---|
91 | for(indi=0; indi<total_seq; indi++) { |
---|
92 | length = Lenstr(data.seqs[indi]); |
---|
93 | for(index=base_count=0; |
---|
94 | index<PRTLENGTH&&(current+index)<length; index++) |
---|
95 | if(data.seqs[indi][index+current]!='~'&& |
---|
96 | data.seqs[indi][index+current]!='-'&& |
---|
97 | data.seqs[indi][index+current]!='.') |
---|
98 | base_count++; |
---|
99 | |
---|
100 | /* check if the first char is base or not */ |
---|
101 | if(current<length&&data.seqs[indi][current]!='~'&& |
---|
102 | data.seqs[indi][current]!='-'&& |
---|
103 | data.seqs[indi][current]!='.') |
---|
104 | start = base_nums[indi]+1; |
---|
105 | else start = base_nums[indi]; |
---|
106 | |
---|
107 | printable_print_line(data.ids[indi], |
---|
108 | data.seqs[indi], current, start, ofp); |
---|
109 | base_nums[indi] += base_count; |
---|
110 | } |
---|
111 | current += PRTLENGTH; |
---|
112 | if(maxsize>current) fprintf(ofp, "\n\n"); |
---|
113 | } |
---|
114 | |
---|
115 | Freespace((char **)&(base_nums)); |
---|
116 | |
---|
117 | #ifdef log |
---|
118 | fprintf(stderr, "Total %d sequences have been processed\n", total_seq); |
---|
119 | #endif |
---|
120 | |
---|
121 | destroy_FILE_BUFFER(ifp); fclose(ofp); |
---|
122 | } |
---|
123 | /* --------------------------------------------------------------- |
---|
124 | * Function to_printable_1x1() |
---|
125 | * Convert from one foramt to PRINTABLE format, one seq by one seq. |
---|
126 | */ |
---|
127 | void |
---|
128 | to_printable_1x1(inf, outf, informat) |
---|
129 | char *inf, *outf; |
---|
130 | int informat; |
---|
131 | { |
---|
132 | FILE *IFP, *ofp; |
---|
133 | FILE_BUFFER ifp; |
---|
134 | int maxsize, current, total_seq; |
---|
135 | int base_count, index; |
---|
136 | char temp[TOKENNUM], eof; |
---|
137 | char *name; |
---|
138 | |
---|
139 | if((IFP=fopen(inf, "r"))==NULL) { |
---|
140 | sprintf(temp, "Cannot open input file %s, exit\n", inf); |
---|
141 | error(125, temp); |
---|
142 | } |
---|
143 | ifp = create_FILE_BUFFER(inf, IFP); |
---|
144 | if(Lenstr(outf) <= 0) ofp = stdout; |
---|
145 | else if((ofp=fopen(outf, "w"))==NULL) { |
---|
146 | sprintf(temp, "Cannot open output file %s, exit\n", outf); |
---|
147 | error(126, temp); |
---|
148 | } |
---|
149 | maxsize = 1; current = 0; |
---|
150 | name = NULL; |
---|
151 | while(maxsize>current) { |
---|
152 | init(); |
---|
153 | FILE_BUFFER_rewind(ifp); |
---|
154 | total_seq = 0; |
---|
155 | do { /* read in one sequence */ |
---|
156 | if(informat==ALMA) { |
---|
157 | init_alma(); |
---|
158 | eof=alma_in(ifp); |
---|
159 | } else if(informat==GENBANK) { |
---|
160 | init_genbank(); |
---|
161 | eof=genbank_in_locus(ifp); |
---|
162 | } else if(informat==EMBL||informat==PROTEIN) { |
---|
163 | init_embl(); |
---|
164 | eof=embl_in_id(ifp); |
---|
165 | } else if(informat==MACKE) { |
---|
166 | init_macke(); |
---|
167 | eof=macke_in_name(ifp); |
---|
168 | } else error(129, |
---|
169 | "UNKNOW input format when converting to PRINTABLE format."); |
---|
170 | if(eof==EOF) break; |
---|
171 | if(informat==ALMA) { |
---|
172 | alma_key_word(data.alma.id, 0, temp, TOKENNUM); |
---|
173 | } else if(informat==GENBANK) { |
---|
174 | genbank_key_word(data.gbk.locus, 0, temp, TOKENNUM); |
---|
175 | } else if(informat==EMBL||informat==PROTEIN) { |
---|
176 | embl_key_word(data.embl.id, 0, temp, TOKENNUM); |
---|
177 | } else if(informat==MACKE) { |
---|
178 | macke_key_word(data.macke.name, 0, temp, TOKENNUM); |
---|
179 | } else error(131, |
---|
180 | "UNKNOW input format when converting to PRINTABLE format."); |
---|
181 | Freespace(&name); |
---|
182 | name = Dupstr(temp); |
---|
183 | if(data.seq_length>maxsize) |
---|
184 | maxsize = data.seq_length; |
---|
185 | for(index=base_count=0; |
---|
186 | index<current&&index<data.seq_length; index++) { |
---|
187 | if(data.sequence[index]!='~' |
---|
188 | &&data.sequence[index]!='.' |
---|
189 | &&data.sequence[index]!='-') base_count++; |
---|
190 | } |
---|
191 | /* check if the first char is a base or not */ |
---|
192 | if(current<data.seq_length&&data.sequence[current]!='~' |
---|
193 | &&data.sequence[current]!='.' |
---|
194 | &&data.sequence[current]!='-') base_count++; |
---|
195 | |
---|
196 | /* find if there any non-gap char in the next 62 |
---|
197 | * char of the seq. data |
---|
198 | * #### count no need to be the first base num |
---|
199 | for(index=current, count=0; |
---|
200 | count==0&&index<data.seq_length |
---|
201 | &&index<(current+PRTLENGTH); index++) |
---|
202 | if(data.sequence[index]!='~' |
---|
203 | &&data.sequence[index]!='.' |
---|
204 | &&data.sequence[index]!='-') |
---|
205 | { base_count++; count++; } |
---|
206 | */ |
---|
207 | |
---|
208 | printable_print_line(name, data.sequence, current, |
---|
209 | base_count, ofp); |
---|
210 | total_seq++; |
---|
211 | } while(1); |
---|
212 | current += PRTLENGTH; |
---|
213 | if(maxsize>current) fprintf(ofp, "\n\n"); |
---|
214 | } /* print block by block */ |
---|
215 | |
---|
216 | destroy_FILE_BUFFER(ifp); fclose(ofp); |
---|
217 | |
---|
218 | #ifdef log |
---|
219 | fprintf(stderr, "Total %d sequences have been processed\n", total_seq); |
---|
220 | #endif |
---|
221 | |
---|
222 | } |
---|
223 | /* ------------------------------------------------------------ |
---|
224 | * Function printable_print_line(). |
---|
225 | * print one printable line. |
---|
226 | */ |
---|
227 | void |
---|
228 | printable_print_line(id, sequence, start, base_count, fp) |
---|
229 | char *id, *sequence; |
---|
230 | int start; |
---|
231 | int base_count; |
---|
232 | FILE *fp; |
---|
233 | { |
---|
234 | int indi, index, count, bnum, /*length,*/ seq_length; |
---|
235 | |
---|
236 | fprintf(fp, " "); |
---|
237 | if((bnum=Lenstr(id))>10) { |
---|
238 | /* truncate if length of id is greater than 10 */ |
---|
239 | for(indi=0; indi<10; indi++) |
---|
240 | fprintf(fp, "%c", id[indi]); |
---|
241 | bnum = 1; |
---|
242 | } else { |
---|
243 | fprintf(fp, "%s", id); |
---|
244 | bnum = 10 - bnum + 1; |
---|
245 | } |
---|
246 | /* fill in the blanks to make up 10 chars id spaces */ |
---|
247 | seq_length = Lenstr(sequence); |
---|
248 | if(start<seq_length) |
---|
249 | for(indi=0; indi<bnum; indi++) |
---|
250 | fprintf(fp, " "); |
---|
251 | else { fprintf(fp, "\n"); return; } |
---|
252 | fprintf(fp, "%4d ", base_count); |
---|
253 | for(index=start, count=0; count<PRTLENGTH&&index<seq_length; index++) { |
---|
254 | fprintf(fp, "%c", sequence[index]); |
---|
255 | count++; |
---|
256 | } /* printout sequence data */ |
---|
257 | fprintf(fp, "\n"); |
---|
258 | } |
---|