| 1 | #include <stdio.h> |
|---|
| 2 | #include "convert.h" |
|---|
| 3 | #include "global.h" |
|---|
| 4 | |
|---|
| 5 | #define PRTLENGTH 62 |
|---|
| 6 | |
|---|
| 7 | /* --------------------------------------------------------------- |
|---|
| 8 | * Function to_printable() |
|---|
| 9 | * Convert from some format to PRINTABLE format. |
|---|
| 10 | */ |
|---|
| 11 | void |
|---|
| 12 | to_printable(inf, outf, informat) |
|---|
| 13 | char *inf, *outf; |
|---|
| 14 | int informat; |
|---|
| 15 | { |
|---|
| 16 | FILE *IFP, *ofp; |
|---|
| 17 | FILE_BUFFER ifp; |
|---|
| 18 | int maxsize, current, total_seq, length; |
|---|
| 19 | int out_of_memory, indi, index, *base_nums, base_count, start; |
|---|
| 20 | char temp[TOKENNUM], eof; |
|---|
| 21 | char *name; |
|---|
| 22 | |
|---|
| 23 | if((IFP=fopen(inf, "r"))==NULL) { |
|---|
| 24 | sprintf(temp, "Cannot open input file %s, exit\n", inf); |
|---|
| 25 | error(64, temp); |
|---|
| 26 | } |
|---|
| 27 | ifp = create_FILE_BUFFER(inf, IFP); |
|---|
| 28 | if(Lenstr(outf) <= 0) ofp = stdout; |
|---|
| 29 | else if((ofp=fopen(outf, "w"))==NULL) { |
|---|
| 30 | sprintf(temp, "Cannot open output file %s, exit\n", outf); |
|---|
| 31 | error(117, temp); |
|---|
| 32 | } |
|---|
| 33 | maxsize = 1; |
|---|
| 34 | out_of_memory = 0; |
|---|
| 35 | name = NULL; |
|---|
| 36 | init(); |
|---|
| 37 | total_seq = 0; |
|---|
| 38 | base_nums=NULL; |
|---|
| 39 | do { |
|---|
| 40 | if(informat==ALMA) { |
|---|
| 41 | init_alma(); |
|---|
| 42 | eof=alma_in(ifp); |
|---|
| 43 | } else if(informat==GENBANK) { |
|---|
| 44 | init_genbank(); |
|---|
| 45 | eof=genbank_in_locus(ifp); |
|---|
| 46 | } else if(informat==EMBL||informat==PROTEIN) { |
|---|
| 47 | init_embl(); |
|---|
| 48 | eof=embl_in_id(ifp); |
|---|
| 49 | } else if(informat==MACKE) { |
|---|
| 50 | init_macke(); |
|---|
| 51 | eof=macke_in_name(ifp); |
|---|
| 52 | } else error(48, |
|---|
| 53 | "UNKNOW input format when converting to PRINABLE format."); |
|---|
| 54 | if(eof==EOF) break; |
|---|
| 55 | if(informat==ALMA) { |
|---|
| 56 | alma_key_word(data.alma.id, 0, temp, TOKENNUM); |
|---|
| 57 | } else if(informat==GENBANK) { |
|---|
| 58 | genbank_key_word(data.gbk.locus, 0, temp, TOKENNUM); |
|---|
| 59 | } else if(informat==EMBL||informat==PROTEIN) { |
|---|
| 60 | embl_key_word(data.embl.id, 0, temp, TOKENNUM); |
|---|
| 61 | } else if(informat==MACKE) { |
|---|
| 62 | Cpystr(temp, data.macke.seqabbr); |
|---|
| 63 | } else error(120, |
|---|
| 64 | "UNKNOW input format when converting to PRINABLE format."); |
|---|
| 65 | total_seq++; |
|---|
| 66 | |
|---|
| 67 | if((name = Dupstr(temp))==NULL&&temp!=NULL) { out_of_memory=1; break; } |
|---|
| 68 | if(data.seq_length>maxsize) maxsize = data.seq_length; |
|---|
| 69 | |
|---|
| 70 | if (!realloc_sequence_data(total_seq)) { out_of_memory = 1; break; } |
|---|
| 71 | |
|---|
| 72 | base_nums=(int*)Reallocspace((char *)base_nums, sizeof(int)*total_seq); |
|---|
| 73 | if(base_nums==NULL) { out_of_memory=1; break; } |
|---|
| 74 | |
|---|
| 75 | data.ids[total_seq-1] = name; |
|---|
| 76 | data.seqs[total_seq-1] = (char*)Dupstr(data.sequence); |
|---|
| 77 | data.lengths[total_seq-1] = Lenstr(data.sequence); |
|---|
| 78 | base_nums[total_seq-1] = 0; |
|---|
| 79 | |
|---|
| 80 | } while(!out_of_memory); |
|---|
| 81 | |
|---|
| 82 | if(out_of_memory) { /* cannot hold all seqs into mem. */ |
|---|
| 83 | fprintf(stderr, |
|---|
| 84 | "Rerun the conversion throught one seq. by one seq. base.\n"); |
|---|
| 85 | destroy_FILE_BUFFER(ifp); fclose(ofp); |
|---|
| 86 | to_printable_1x1(inf, outf, informat); |
|---|
| 87 | return; |
|---|
| 88 | } |
|---|
| 89 | current = 0; |
|---|
| 90 | while(maxsize>current) { |
|---|
| 91 | for(indi=0; indi<total_seq; indi++) { |
|---|
| 92 | length = Lenstr(data.seqs[indi]); |
|---|
| 93 | for(index=base_count=0; |
|---|
| 94 | index<PRTLENGTH&&(current+index)<length; index++) |
|---|
| 95 | if(data.seqs[indi][index+current]!='~'&& |
|---|
| 96 | data.seqs[indi][index+current]!='-'&& |
|---|
| 97 | data.seqs[indi][index+current]!='.') |
|---|
| 98 | base_count++; |
|---|
| 99 | |
|---|
| 100 | /* check if the first char is base or not */ |
|---|
| 101 | if(current<length&&data.seqs[indi][current]!='~'&& |
|---|
| 102 | data.seqs[indi][current]!='-'&& |
|---|
| 103 | data.seqs[indi][current]!='.') |
|---|
| 104 | start = base_nums[indi]+1; |
|---|
| 105 | else start = base_nums[indi]; |
|---|
| 106 | |
|---|
| 107 | printable_print_line(data.ids[indi], |
|---|
| 108 | data.seqs[indi], current, start, ofp); |
|---|
| 109 | base_nums[indi] += base_count; |
|---|
| 110 | } |
|---|
| 111 | current += PRTLENGTH; |
|---|
| 112 | if(maxsize>current) fprintf(ofp, "\n\n"); |
|---|
| 113 | } |
|---|
| 114 | |
|---|
| 115 | Freespace((char **)&(base_nums)); |
|---|
| 116 | |
|---|
| 117 | #ifdef log |
|---|
| 118 | fprintf(stderr, "Total %d sequences have been processed\n", total_seq); |
|---|
| 119 | #endif |
|---|
| 120 | |
|---|
| 121 | destroy_FILE_BUFFER(ifp); fclose(ofp); |
|---|
| 122 | } |
|---|
| 123 | /* --------------------------------------------------------------- |
|---|
| 124 | * Function to_printable_1x1() |
|---|
| 125 | * Convert from one foramt to PRINTABLE format, one seq by one seq. |
|---|
| 126 | */ |
|---|
| 127 | void |
|---|
| 128 | to_printable_1x1(inf, outf, informat) |
|---|
| 129 | char *inf, *outf; |
|---|
| 130 | int informat; |
|---|
| 131 | { |
|---|
| 132 | FILE *IFP, *ofp; |
|---|
| 133 | FILE_BUFFER ifp; |
|---|
| 134 | int maxsize, current, total_seq; |
|---|
| 135 | int base_count, index; |
|---|
| 136 | char temp[TOKENNUM], eof; |
|---|
| 137 | char *name; |
|---|
| 138 | |
|---|
| 139 | if((IFP=fopen(inf, "r"))==NULL) { |
|---|
| 140 | sprintf(temp, "Cannot open input file %s, exit\n", inf); |
|---|
| 141 | error(125, temp); |
|---|
| 142 | } |
|---|
| 143 | ifp = create_FILE_BUFFER(inf, IFP); |
|---|
| 144 | if(Lenstr(outf) <= 0) ofp = stdout; |
|---|
| 145 | else if((ofp=fopen(outf, "w"))==NULL) { |
|---|
| 146 | sprintf(temp, "Cannot open output file %s, exit\n", outf); |
|---|
| 147 | error(126, temp); |
|---|
| 148 | } |
|---|
| 149 | maxsize = 1; current = 0; |
|---|
| 150 | name = NULL; |
|---|
| 151 | while(maxsize>current) { |
|---|
| 152 | init(); |
|---|
| 153 | FILE_BUFFER_rewind(ifp); |
|---|
| 154 | total_seq = 0; |
|---|
| 155 | do { /* read in one sequence */ |
|---|
| 156 | if(informat==ALMA) { |
|---|
| 157 | init_alma(); |
|---|
| 158 | eof=alma_in(ifp); |
|---|
| 159 | } else if(informat==GENBANK) { |
|---|
| 160 | init_genbank(); |
|---|
| 161 | eof=genbank_in_locus(ifp); |
|---|
| 162 | } else if(informat==EMBL||informat==PROTEIN) { |
|---|
| 163 | init_embl(); |
|---|
| 164 | eof=embl_in_id(ifp); |
|---|
| 165 | } else if(informat==MACKE) { |
|---|
| 166 | init_macke(); |
|---|
| 167 | eof=macke_in_name(ifp); |
|---|
| 168 | } else error(129, |
|---|
| 169 | "UNKNOW input format when converting to PRINTABLE format."); |
|---|
| 170 | if(eof==EOF) break; |
|---|
| 171 | if(informat==ALMA) { |
|---|
| 172 | alma_key_word(data.alma.id, 0, temp, TOKENNUM); |
|---|
| 173 | } else if(informat==GENBANK) { |
|---|
| 174 | genbank_key_word(data.gbk.locus, 0, temp, TOKENNUM); |
|---|
| 175 | } else if(informat==EMBL||informat==PROTEIN) { |
|---|
| 176 | embl_key_word(data.embl.id, 0, temp, TOKENNUM); |
|---|
| 177 | } else if(informat==MACKE) { |
|---|
| 178 | macke_key_word(data.macke.name, 0, temp, TOKENNUM); |
|---|
| 179 | } else error(131, |
|---|
| 180 | "UNKNOW input format when converting to PRINTABLE format."); |
|---|
| 181 | Freespace(&name); |
|---|
| 182 | name = Dupstr(temp); |
|---|
| 183 | if(data.seq_length>maxsize) |
|---|
| 184 | maxsize = data.seq_length; |
|---|
| 185 | for(index=base_count=0; |
|---|
| 186 | index<current&&index<data.seq_length; index++) { |
|---|
| 187 | if(data.sequence[index]!='~' |
|---|
| 188 | &&data.sequence[index]!='.' |
|---|
| 189 | &&data.sequence[index]!='-') base_count++; |
|---|
| 190 | } |
|---|
| 191 | /* check if the first char is a base or not */ |
|---|
| 192 | if(current<data.seq_length&&data.sequence[current]!='~' |
|---|
| 193 | &&data.sequence[current]!='.' |
|---|
| 194 | &&data.sequence[current]!='-') base_count++; |
|---|
| 195 | |
|---|
| 196 | /* find if there any non-gap char in the next 62 |
|---|
| 197 | * char of the seq. data |
|---|
| 198 | * #### count no need to be the first base num |
|---|
| 199 | for(index=current, count=0; |
|---|
| 200 | count==0&&index<data.seq_length |
|---|
| 201 | &&index<(current+PRTLENGTH); index++) |
|---|
| 202 | if(data.sequence[index]!='~' |
|---|
| 203 | &&data.sequence[index]!='.' |
|---|
| 204 | &&data.sequence[index]!='-') |
|---|
| 205 | { base_count++; count++; } |
|---|
| 206 | */ |
|---|
| 207 | |
|---|
| 208 | printable_print_line(name, data.sequence, current, |
|---|
| 209 | base_count, ofp); |
|---|
| 210 | total_seq++; |
|---|
| 211 | } while(1); |
|---|
| 212 | current += PRTLENGTH; |
|---|
| 213 | if(maxsize>current) fprintf(ofp, "\n\n"); |
|---|
| 214 | } /* print block by block */ |
|---|
| 215 | |
|---|
| 216 | destroy_FILE_BUFFER(ifp); fclose(ofp); |
|---|
| 217 | |
|---|
| 218 | #ifdef log |
|---|
| 219 | fprintf(stderr, "Total %d sequences have been processed\n", total_seq); |
|---|
| 220 | #endif |
|---|
| 221 | |
|---|
| 222 | } |
|---|
| 223 | /* ------------------------------------------------------------ |
|---|
| 224 | * Function printable_print_line(). |
|---|
| 225 | * print one printable line. |
|---|
| 226 | */ |
|---|
| 227 | void |
|---|
| 228 | printable_print_line(id, sequence, start, base_count, fp) |
|---|
| 229 | char *id, *sequence; |
|---|
| 230 | int start; |
|---|
| 231 | int base_count; |
|---|
| 232 | FILE *fp; |
|---|
| 233 | { |
|---|
| 234 | int indi, index, count, bnum, /*length,*/ seq_length; |
|---|
| 235 | |
|---|
| 236 | fprintf(fp, " "); |
|---|
| 237 | if((bnum=Lenstr(id))>10) { |
|---|
| 238 | /* truncate if length of id is greater than 10 */ |
|---|
| 239 | for(indi=0; indi<10; indi++) |
|---|
| 240 | fprintf(fp, "%c", id[indi]); |
|---|
| 241 | bnum = 1; |
|---|
| 242 | } else { |
|---|
| 243 | fprintf(fp, "%s", id); |
|---|
| 244 | bnum = 10 - bnum + 1; |
|---|
| 245 | } |
|---|
| 246 | /* fill in the blanks to make up 10 chars id spaces */ |
|---|
| 247 | seq_length = Lenstr(sequence); |
|---|
| 248 | if(start<seq_length) |
|---|
| 249 | for(indi=0; indi<bnum; indi++) |
|---|
| 250 | fprintf(fp, " "); |
|---|
| 251 | else { fprintf(fp, "\n"); return; } |
|---|
| 252 | fprintf(fp, "%4d ", base_count); |
|---|
| 253 | for(index=start, count=0; count<PRTLENGTH&&index<seq_length; index++) { |
|---|
| 254 | fprintf(fp, "%c", sequence[index]); |
|---|
| 255 | count++; |
|---|
| 256 | } /* printout sequence data */ |
|---|
| 257 | fprintf(fp, "\n"); |
|---|
| 258 | } |
|---|