| 1 | #include <stdio.h> |
|---|
| 2 | #include "convert.h" |
|---|
| 3 | #include "global.h" |
|---|
| 4 | |
|---|
| 5 | #define PRTLENGTH 62 |
|---|
| 6 | |
|---|
| 7 | /* --------------------------------------------------------------- */ |
|---|
| 8 | /* Function to_printable() |
|---|
| 9 | /* Convert from some format to PRINTABLE format. |
|---|
| 10 | */ |
|---|
| 11 | void |
|---|
| 12 | to_printable(inf, outf, informat) |
|---|
| 13 | char *inf, *outf; |
|---|
| 14 | int informat; |
|---|
| 15 | { |
|---|
| 16 | FILE *ifp, *ofp, *fopen(); |
|---|
| 17 | int maxsize, current, total_seq, Lenstr(), length; |
|---|
| 18 | int out_of_memory, indi, index, *base_nums, base_count, start; |
|---|
| 19 | int alma_key_word(); |
|---|
| 20 | char alma_in(), genbank_in_locus(); |
|---|
| 21 | char macke_in_name(), embl_in_id(); |
|---|
| 22 | char temp[TOKENNUM], eof; |
|---|
| 23 | char *Dupstr(), *Reallocspace(), *name, *today_date(), *today; |
|---|
| 24 | void init(), init_seq_data(); |
|---|
| 25 | void init_alma(), init_genbank(), init_macke(), init_embl(); |
|---|
| 26 | void printable_print_line(), to_printable_1x1(); |
|---|
| 27 | void Freespace(), error(), Cpystr(); |
|---|
| 28 | void genbank_key_word(), embl_key_word(); |
|---|
| 29 | |
|---|
| 30 | if((ifp=fopen(inf, "r"))==NULL) { |
|---|
| 31 | sprintf(temp, "Cannot open input file %s, exit\n", inf); |
|---|
| 32 | error(64, temp); |
|---|
| 33 | } |
|---|
| 34 | if(Lenstr(outf)<=0) ofp = stdout; |
|---|
| 35 | else if((ofp=fopen(outf, "w"))==NULL) { |
|---|
| 36 | sprintf(temp, "Cannot open output file %s, exit\n", outf); |
|---|
| 37 | error(117, temp); |
|---|
| 38 | } |
|---|
| 39 | maxsize = 1; |
|---|
| 40 | out_of_memory = 0; |
|---|
| 41 | name = NULL; |
|---|
| 42 | init(); |
|---|
| 43 | total_seq = 0; |
|---|
| 44 | base_nums=NULL; |
|---|
| 45 | do { |
|---|
| 46 | if(informat==ALMA) { |
|---|
| 47 | init_alma(); |
|---|
| 48 | eof=alma_in(ifp); |
|---|
| 49 | } else if(informat==GENBANK) { |
|---|
| 50 | init_genbank(); |
|---|
| 51 | eof=genbank_in_locus(ifp); |
|---|
| 52 | } else if(informat==EMBL||informat==PROTEIN) { |
|---|
| 53 | init_embl(); |
|---|
| 54 | eof=embl_in_id(ifp); |
|---|
| 55 | } else if(informat==MACKE) { |
|---|
| 56 | init_macke(); |
|---|
| 57 | eof=macke_in_name(ifp); |
|---|
| 58 | } else error(48, |
|---|
| 59 | "UNKNOW input format when converting to PRINABLE format."); |
|---|
| 60 | if(eof==EOF) break; |
|---|
| 61 | if(informat==ALMA) { |
|---|
| 62 | alma_key_word(data.alma.id, 0, temp, TOKENNUM); |
|---|
| 63 | } else if(informat==GENBANK) { |
|---|
| 64 | genbank_key_word(data.gbk.locus, 0, temp, TOKENNUM); |
|---|
| 65 | } else if(informat==EMBL||informat==PROTEIN) { |
|---|
| 66 | embl_key_word(data.embl.id, 0, temp, TOKENNUM); |
|---|
| 67 | } else if(informat==MACKE) { |
|---|
| 68 | Cpystr(temp, data.macke.seqabbr); |
|---|
| 69 | } else error(120, |
|---|
| 70 | "UNKNOW input format when converting to PRINABLE format."); |
|---|
| 71 | total_seq++; |
|---|
| 72 | if((name = Dupstr(temp))==NULL&&temp!=NULL) |
|---|
| 73 | { out_of_memory=1; break; } |
|---|
| 74 | if(data.seq_length>maxsize) |
|---|
| 75 | maxsize = data.seq_length; |
|---|
| 76 | data.ids=(char**)Reallocspace((char *)data.ids, |
|---|
| 77 | sizeof(char*)*total_seq); |
|---|
| 78 | if(data.ids==NULL) { out_of_memory=1; break; } |
|---|
| 79 | data.seqs=(char**)Reallocspace((char *)data.seqs, |
|---|
| 80 | sizeof(char*)*total_seq); |
|---|
| 81 | if(data.seqs==NULL) { out_of_memory=1; break; } |
|---|
| 82 | base_nums=(int*)Reallocspace((char *)base_nums, sizeof(int)*total_seq); |
|---|
| 83 | if(base_nums==NULL) { out_of_memory=1; break; } |
|---|
| 84 | data.ids[total_seq-1]=name; |
|---|
| 85 | data.seqs[total_seq-1]=(char*)Dupstr(data.sequence); |
|---|
| 86 | base_nums[total_seq-1]=0; |
|---|
| 87 | } while(!out_of_memory); |
|---|
| 88 | |
|---|
| 89 | if(out_of_memory) { /* cannot hold all seqs into mem. */ |
|---|
| 90 | fprintf(stderr, |
|---|
| 91 | "Rerun the conversion throught one seq. by one seq. base.\n"); |
|---|
| 92 | fclose(ifp); fclose(ofp); |
|---|
| 93 | to_printable_1x1(inf, outf, informat); |
|---|
| 94 | return; |
|---|
| 95 | } |
|---|
| 96 | current = 0; |
|---|
| 97 | while(maxsize>current) { |
|---|
| 98 | for(indi=0; indi<total_seq; indi++) { |
|---|
| 99 | length = Lenstr(data.seqs[indi]); |
|---|
| 100 | for(index=base_count=0; |
|---|
| 101 | index<PRTLENGTH&&(current+index)<length; index++) |
|---|
| 102 | if(data.seqs[indi][index+current]!='~'&& |
|---|
| 103 | data.seqs[indi][index+current]!='-'&& |
|---|
| 104 | data.seqs[indi][index+current]!='.') |
|---|
| 105 | base_count++; |
|---|
| 106 | |
|---|
| 107 | /* check if the first char is base or not */ |
|---|
| 108 | if(current<length&&data.seqs[indi][current]!='~'&& |
|---|
| 109 | data.seqs[indi][current]!='-'&& |
|---|
| 110 | data.seqs[indi][current]!='.') |
|---|
| 111 | start = base_nums[indi]+1; |
|---|
| 112 | else start = base_nums[indi]; |
|---|
| 113 | |
|---|
| 114 | printable_print_line(data.ids[indi], |
|---|
| 115 | data.seqs[indi], current, start, ofp); |
|---|
| 116 | base_nums[indi] += base_count; |
|---|
| 117 | } |
|---|
| 118 | current += PRTLENGTH; |
|---|
| 119 | if(maxsize>current) fprintf(ofp, "\n\n"); |
|---|
| 120 | } |
|---|
| 121 | |
|---|
| 122 | Freespace((char **)&(base_nums)); |
|---|
| 123 | |
|---|
| 124 | #ifdef log |
|---|
| 125 | fprintf(stderr, "Total %d sequences have been processed\n", total_seq); |
|---|
| 126 | #endif |
|---|
| 127 | |
|---|
| 128 | fclose(ifp); fclose(ofp); |
|---|
| 129 | } |
|---|
| 130 | /* --------------------------------------------------------------- */ |
|---|
| 131 | /* Function to_printable_1x1() |
|---|
| 132 | /* Convert from one foramt to PRINTABLE format, one seq by one seq. |
|---|
| 133 | */ |
|---|
| 134 | void |
|---|
| 135 | to_printable_1x1(inf, outf, informat) |
|---|
| 136 | char *inf, *outf; |
|---|
| 137 | int informat; |
|---|
| 138 | { |
|---|
| 139 | FILE *ifp, *ofp, *fopen(); |
|---|
| 140 | int maxsize, current, total_seq, Lenstr(); |
|---|
| 141 | int base_count, count, index; |
|---|
| 142 | int alma_key_word(); |
|---|
| 143 | char temp[TOKENNUM], eof, alma_in(); |
|---|
| 144 | char *Dupstr(), *name, *today_date(), *today; |
|---|
| 145 | |
|---|
| 146 | if((ifp=fopen(inf, "r"))==NULL) { |
|---|
| 147 | sprintf(temp, "Cannot open input file %s, exit\n", inf); |
|---|
| 148 | error(125, temp); |
|---|
| 149 | } |
|---|
| 150 | if(Lenstr(outf)<=0) ofp = stdout; |
|---|
| 151 | else if((ofp=fopen(outf, "w"))==NULL) { |
|---|
| 152 | sprintf(temp, "Cannot open output file %s, exit\n", outf); |
|---|
| 153 | error(126, temp); |
|---|
| 154 | } |
|---|
| 155 | maxsize = 1; current = 0; |
|---|
| 156 | name = NULL; |
|---|
| 157 | while(maxsize>current) { |
|---|
| 158 | init(); |
|---|
| 159 | rewind(ifp); |
|---|
| 160 | total_seq = 0; |
|---|
| 161 | do { /* read in one sequence */ |
|---|
| 162 | if(informat==ALMA) { |
|---|
| 163 | init_alma(); |
|---|
| 164 | eof=alma_in(ifp); |
|---|
| 165 | } else if(informat==GENBANK) { |
|---|
| 166 | init_genbank(); |
|---|
| 167 | eof=genbank_in_locus(ifp); |
|---|
| 168 | } else if(informat==EMBL||informat==PROTEIN) { |
|---|
| 169 | init_embl(); |
|---|
| 170 | eof=embl_in_id(ifp); |
|---|
| 171 | } else if(informat==MACKE) { |
|---|
| 172 | init_macke(); |
|---|
| 173 | eof=macke_in_name(ifp); |
|---|
| 174 | } else error(129, |
|---|
| 175 | "UNKNOW input format when converting to PRINTABLE format."); |
|---|
| 176 | if(eof==EOF) break; |
|---|
| 177 | if(informat==ALMA) { |
|---|
| 178 | alma_key_word(data.alma.id, 0, temp, TOKENNUM); |
|---|
| 179 | } else if(informat==GENBANK) { |
|---|
| 180 | genbank_key_word(data.gbk.locus, 0, temp, TOKENNUM); |
|---|
| 181 | } else if(informat==EMBL||informat==PROTEIN) { |
|---|
| 182 | embl_key_word(data.embl.id, 0, temp, TOKENNUM); |
|---|
| 183 | } else if(informat==MACKE) { |
|---|
| 184 | macke_key_word(data.macke.name, 0, temp, TOKENNUM); |
|---|
| 185 | } else error(131, |
|---|
| 186 | "UNKNOW input format when converting to PRINTABLE format."); |
|---|
| 187 | Freespace(&name); |
|---|
| 188 | name = Dupstr(temp); |
|---|
| 189 | if(data.seq_length>maxsize) |
|---|
| 190 | maxsize = data.seq_length; |
|---|
| 191 | for(index=base_count=0; |
|---|
| 192 | index<current&&index<data.seq_length; index++) { |
|---|
| 193 | if(data.sequence[index]!='~' |
|---|
| 194 | &&data.sequence[index]!='.' |
|---|
| 195 | &&data.sequence[index]!='-') base_count++; |
|---|
| 196 | } |
|---|
| 197 | /* check if the first char is a base or not */ |
|---|
| 198 | if(current<data.seq_length&&data.sequence[current]!='~' |
|---|
| 199 | &&data.sequence[current]!='.' |
|---|
| 200 | &&data.sequence[current]!='-') base_count++; |
|---|
| 201 | |
|---|
| 202 | /* find if there any non-gap char in the next 62 |
|---|
| 203 | /* char of the seq. data */ |
|---|
| 204 | /* #### count no need to be the first base num |
|---|
| 205 | for(index=current, count=0; |
|---|
| 206 | count==0&&index<data.seq_length |
|---|
| 207 | &&index<(current+PRTLENGTH); index++) |
|---|
| 208 | if(data.sequence[index]!='~' |
|---|
| 209 | &&data.sequence[index]!='.' |
|---|
| 210 | &&data.sequence[index]!='-') |
|---|
| 211 | { base_count++; count++; } |
|---|
| 212 | */ |
|---|
| 213 | printable_print_line(name, data.sequence, current, |
|---|
| 214 | base_count, ofp); |
|---|
| 215 | total_seq++; |
|---|
| 216 | } while(1); |
|---|
| 217 | current += PRTLENGTH; |
|---|
| 218 | if(maxsize>current) fprintf(ofp, "\n\n"); |
|---|
| 219 | } /* print block by block */ |
|---|
| 220 | |
|---|
| 221 | fclose(ifp); fclose(ofp); |
|---|
| 222 | |
|---|
| 223 | #ifdef log |
|---|
| 224 | fprintf(stderr, "Total %d sequences have been processed\n", total_seq); |
|---|
| 225 | #endif |
|---|
| 226 | |
|---|
| 227 | } |
|---|
| 228 | /* ------------------------------------------------------------ */ |
|---|
| 229 | /* Function printable_print_line(). |
|---|
| 230 | /* print one printable line. |
|---|
| 231 | */ |
|---|
| 232 | void |
|---|
| 233 | printable_print_line(id, sequence, start, base_count, fp) |
|---|
| 234 | char *id, *sequence; |
|---|
| 235 | int start; |
|---|
| 236 | int base_count; |
|---|
| 237 | FILE *fp; |
|---|
| 238 | { |
|---|
| 239 | int indi, index, count, bnum, length, Lenstr(), seq_length; |
|---|
| 240 | |
|---|
| 241 | fprintf(fp, " "); |
|---|
| 242 | if((bnum=Lenstr(id))>10) { |
|---|
| 243 | /* truncate if length of id is greater than 10 */ |
|---|
| 244 | for(indi=0; indi<10; indi++) |
|---|
| 245 | fprintf(fp, "%c", id[indi]); |
|---|
| 246 | bnum = 1; |
|---|
| 247 | } else { |
|---|
| 248 | fprintf(fp, "%s", id); |
|---|
| 249 | bnum = 10 - bnum + 1; |
|---|
| 250 | } |
|---|
| 251 | /* fill in the blanks to make up 10 chars id spaces */ |
|---|
| 252 | seq_length = Lenstr(sequence); |
|---|
| 253 | if(start<seq_length) |
|---|
| 254 | for(indi=0; indi<bnum; indi++) |
|---|
| 255 | fprintf(fp, " "); |
|---|
| 256 | else { fprintf(fp, "\n"); return; } |
|---|
| 257 | fprintf(fp, "%4d ", base_count); |
|---|
| 258 | for(index=start, count=0; count<PRTLENGTH&&index<seq_length; index++) { |
|---|
| 259 | fprintf(fp, "%c", sequence[index]); |
|---|
| 260 | count++; |
|---|
| 261 | } /* printout sequence data */ |
|---|
| 262 | fprintf(fp, "\n"); |
|---|
| 263 | } |
|---|