#include <stdio.h>
#include <stdlib.h>
#include "convert.h"
#include "global.h"
|---|
| 4 | /* ---------------------------------------------------------- */ |
|---|
/*  Function init_pm_data().
 *      Reset the Macke record and then the protein record by
 *      calling init_macke() followed by init_protein().
 */
void
init_pm_data()
{
    void    init_macke();
    void    init_protein();

    init_macke();
    init_protein();
}
|---|
| 15 | /* ------------------------------------------------------------ */ |
|---|
| 16 | /* Function init_protein(). |
|---|
| 17 | /* Initialize protein entry. |
|---|
| 18 | */ |
|---|
| 19 | void |
|---|
| 20 | init_protein() { |
|---|
| 21 | |
|---|
| 22 | int indi; |
|---|
| 23 | void Freespace(); |
|---|
| 24 | char *Dupstr(); |
|---|
| 25 | |
|---|
| 26 | /* initialize protein format */ |
|---|
| 27 | Freespace(&(data.protein.id)); |
|---|
| 28 | Freespace(&(data.protein.date)); |
|---|
| 29 | Freespace(&(data.protein.definition)); |
|---|
| 30 | Freespace(&(data.protein.formname)); |
|---|
| 31 | Freespace(&(data.protein.accession)); |
|---|
| 32 | Freespace(&(data.protein.keywords)); |
|---|
| 33 | for(indi=0; indi<data.protein.numofref; indi++) { |
|---|
| 34 | Freespace(&(data.protein.reference[indi].author)); |
|---|
| 35 | Freespace(&(data.protein.reference[indi].title)); |
|---|
| 36 | Freespace(&(data.protein.reference[indi].journal)); |
|---|
| 37 | Freespace(&(data.protein.reference[indi].processing)); |
|---|
| 38 | } |
|---|
| 39 | Freespace(&(data.protein.reference)); |
|---|
| 40 | Freespace(&(data.protein.comments)); |
|---|
| 41 | data.protein.id=Dupstr("\n"); |
|---|
| 42 | data.protein.date=Dupstr("\n"); |
|---|
| 43 | data.protein.definition=Dupstr("\n"); |
|---|
| 44 | data.protein.formname=Dupstr("\n"); |
|---|
| 45 | data.protein.accession=Dupstr("\n"); |
|---|
| 46 | data.protein.keywords=Dupstr("\n"); |
|---|
| 47 | data.protein.numofref=0; |
|---|
| 48 | data.protein.reference=NULL; |
|---|
| 49 | data.protein.comments=Dupstr(""); |
|---|
| 50 | } |
|---|
| 51 | /* ---------------------------------------------------------- */ |
|---|
| 52 | /* Function protein_to_macke(). |
|---|
| 53 | /* Convert from Protein format to Macke format. |
|---|
| 54 | */ |
|---|
| 55 | void |
|---|
| 56 | protein_to_macke(inf, outf) |
|---|
| 57 | char *inf, *outf; |
|---|
| 58 | { |
|---|
| 59 | FILE *ifp, *ofp, *fopen(); |
|---|
| 60 | char temp[TOKENNUM], protein_in(); |
|---|
| 61 | void init(), init_pm_data(), init_seq_data(); |
|---|
| 62 | void macke_out_header(), macke_out0(), macke_out1(), macke_out2(); |
|---|
| 63 | void error(); |
|---|
| 64 | int indi, ptom(), total_num; |
|---|
| 65 | |
|---|
| 66 | if((ifp=fopen(inf, "r"))==NULL) { |
|---|
| 67 | sprintf(temp, "Cannot open input file %s, exit\n", inf); |
|---|
| 68 | error(93, temp); |
|---|
| 69 | } |
|---|
| 70 | if(Lenstr(outf)<=0) ofp = stdout; |
|---|
| 71 | else if((ofp=fopen(outf, "w"))==NULL) { |
|---|
| 72 | sprintf(temp, "Cannot open output file %s, exit\n", outf); |
|---|
| 73 | error(95, temp); |
|---|
| 74 | } |
|---|
| 75 | |
|---|
| 76 | /* seq irelenvant header */ |
|---|
| 77 | init(); |
|---|
| 78 | macke_out_header(ofp); |
|---|
| 79 | for(indi=0; indi<3; indi++) { |
|---|
| 80 | FILE_BUFFER_rewind(ifp); |
|---|
| 81 | init_seq_data(); |
|---|
| 82 | init_pm_data(); |
|---|
| 83 | while(protein_in(ifp)!=EOF) { |
|---|
| 84 | data.numofseq++; |
|---|
| 85 | if(ptom()) { |
|---|
| 86 | /* convert from protein form to macke form */ |
|---|
| 87 | switch(indi) { |
|---|
| 88 | case 0: |
|---|
| 89 | /* output seq display format */ |
|---|
| 90 | macke_out0(ofp, PROTEIN); |
|---|
| 91 | break; |
|---|
| 92 | case 1: |
|---|
| 93 | /* output seq information */ |
|---|
| 94 | macke_out1(ofp); |
|---|
| 95 | break; |
|---|
| 96 | case 2: |
|---|
| 97 | /* output seq data */ |
|---|
| 98 | macke_out2(ofp); |
|---|
| 99 | break; |
|---|
| 100 | default: ; |
|---|
| 101 | } |
|---|
| 102 | } else error(82, |
|---|
| 103 | "Conversion from protein to macke fails, Exit"); |
|---|
| 104 | init_pm_data(); |
|---|
| 105 | } |
|---|
| 106 | total_num = data.numofseq; |
|---|
| 107 | if(indi==0) fprintf(ofp, "#-\n"); |
|---|
| 108 | } /* for each seq; loop */ |
|---|
| 109 | |
|---|
| 110 | #ifdef log |
|---|
| 111 | fprintf(stderr, "Total %d sequences have been processed\n", total_num); |
|---|
| 112 | #endif |
|---|
| 113 | |
|---|
| 114 | } |
|---|
| 115 | /* --------------------------------------------------------------- */ |
|---|
| 116 | /* Function protein_in(). |
|---|
| 117 | /* Read in one protein entry. |
|---|
| 118 | */ |
|---|
| 119 | char |
|---|
| 120 | protein_in(fp) |
|---|
| 121 | FILE *fp; |
|---|
| 122 | { |
|---|
| 123 | char line[LINENUM], key[TOKENNUM], temp[LINENUM]; |
|---|
| 124 | char *Fgetline(), *eof, eoen; |
|---|
| 125 | char *protein_id(), *protein_definition(); |
|---|
| 126 | char *protein_accession(), *protein_date(), *protein_source(); |
|---|
| 127 | char *protein_keywords(), *protein_reference(); |
|---|
| 128 | char *protein_author(), *protein_title(), *protein_version(); |
|---|
| 129 | char *protein_processing(); |
|---|
| 130 | char *protein_comments(), *protein_origin(); |
|---|
| 131 | char *protein_skip_unidentified(); |
|---|
| 132 | void protein_key_word(), warning(), error(); |
|---|
| 133 | int Lenstr(); |
|---|
| 134 | |
|---|
| 135 | eoen=' '; /* end-of-entry, set to be 'y' after '//' is read */ |
|---|
| 136 | for(eof=Fgetline(line, LINENUM, fp); eof!=NULL&&eoen!='y'; ) { |
|---|
| 137 | if(Lenstr(line)<=1) { |
|---|
| 138 | eof=Fgetline(line, LINENUM, fp); |
|---|
| 139 | continue; /* empty line, skip */ |
|---|
| 140 | } |
|---|
| 141 | protein_key_word(line, 0, key, TOKENNUM); |
|---|
| 142 | eoen='n'; |
|---|
| 143 | if((Cmpstr(key, "ID"))==EQ) { |
|---|
| 144 | eof = protein_id(line, fp); |
|---|
| 145 | } else if((Cmpstr(key, "DT"))==EQ) { |
|---|
| 146 | eof = protein_date(line, fp); |
|---|
| 147 | } else if((Cmpstr(key, "DE"))==EQ) { |
|---|
| 148 | eof = protein_definition(line, fp); |
|---|
| 149 | } else if((Cmpstr(key, "OS"))==EQ) { |
|---|
| 150 | eof = protein_source(line, fp); |
|---|
| 151 | } else if((Cmpstr(key, "AC"))==EQ) { |
|---|
| 152 | eof = protein_accession(line, fp); |
|---|
| 153 | } else if((Cmpstr(key, "KW"))==EQ) { |
|---|
| 154 | eof = protein_keywords(line, fp); |
|---|
| 155 | } else if((Cmpstr(key, "RA"))==EQ) { |
|---|
| 156 | eof = protein_author(line, fp); |
|---|
| 157 | } else if((Cmpstr(key, "RP"))==EQ) { |
|---|
| 158 | eof = protein_processing(line, fp); |
|---|
| 159 | } else if((Cmpstr(key, "RT"))==EQ) { |
|---|
| 160 | eof = protein_title(line, fp); |
|---|
| 161 | } else if((Cmpstr(key, "RL"))==EQ) { |
|---|
| 162 | eof = protein_reference(line, fp); |
|---|
| 163 | } else if((Cmpstr(key, "RN"))==EQ) { |
|---|
| 164 | eof = protein_version(line, fp); |
|---|
| 165 | } else if((Cmpstr(key, "CC"))==EQ) { |
|---|
| 166 | eof = protein_comments(line, fp); |
|---|
| 167 | } else if((Cmpstr(key, "SQ"))==EQ) { |
|---|
| 168 | eof = protein_origin(line, fp); |
|---|
| 169 | eoen = 'y'; |
|---|
| 170 | } else { /* unidentified key word */ |
|---|
| 171 | eof = protein_skip_unidentified(key, line, fp); |
|---|
| 172 | } |
|---|
| 173 | /* except "SQ", at the end of all the other cases, a |
|---|
| 174 | /* new line has already read in, so no further read is |
|---|
| 175 | /* necessary*/ |
|---|
| 176 | } /* for loop to read an entry line by line */ |
|---|
| 177 | |
|---|
| 178 | if(eoen=='n') |
|---|
| 179 | error(42, "Reach EOF before one entry is read, Exit"); |
|---|
| 180 | |
|---|
| 181 | if(eof==NULL) return(EOF); |
|---|
| 182 | else return(EOF+1); |
|---|
| 183 | |
|---|
| 184 | } |
|---|
| 185 | /* --------------------------------------------------------------- */ |
|---|
| 186 | /* Function protein_in_id(). |
|---|
| 187 | /* Read in one protein entry with id and seq only. |
|---|
| 188 | */ |
|---|
| 189 | char |
|---|
| 190 | protein_in_id(fp) |
|---|
| 191 | FILE *fp; |
|---|
| 192 | { |
|---|
| 193 | char line[LINENUM], key[TOKENNUM], temp[LINENUM]; |
|---|
| 194 | char *Fgetline(), *eof, eoen; |
|---|
| 195 | char *protein_id(), *protein_origin(); |
|---|
| 196 | char *protein_skip_unidentified(); |
|---|
| 197 | void protein_key_word(), warning(), error(); |
|---|
| 198 | int Lenstr(); |
|---|
| 199 | |
|---|
| 200 | eoen=' '; /* end-of-entry, set to be 'y' after '//' is read */ |
|---|
| 201 | for(eof=Fgetline(line, LINENUM, fp); eof!=NULL&&eoen!='y'; ) { |
|---|
| 202 | if(Lenstr(line)<=1) { |
|---|
| 203 | eof=Fgetline(line, LINENUM, fp); |
|---|
| 204 | continue; /* empty line, skip */ |
|---|
| 205 | } |
|---|
| 206 | protein_key_word(line, 0, key, TOKENNUM); |
|---|
| 207 | eoen='n'; |
|---|
| 208 | if((Cmpstr(key, "ID"))==EQ) { |
|---|
| 209 | eof = protein_id(line, fp); |
|---|
| 210 | } else if((Cmpstr(key, "SQ"))==EQ) { |
|---|
| 211 | eof = protein_origin(line, fp); |
|---|
| 212 | eoen = 'y'; |
|---|
| 213 | } else { /* unidentified key word */ |
|---|
| 214 | eof = protein_skip_unidentified(key, line, fp); |
|---|
| 215 | } |
|---|
| 216 | /* except "SQ", at the end of all the other cases, a |
|---|
| 217 | /* new line has already read in, so no further read is |
|---|
| 218 | /* necessary*/ |
|---|
| 219 | } /* for loop to read an entry line by line */ |
|---|
| 220 | |
|---|
| 221 | if(eoen=='n') |
|---|
| 222 | error(87, "Reach EOF before one entry is read, Exit"); |
|---|
| 223 | |
|---|
| 224 | if(eof==NULL) return(EOF); |
|---|
| 225 | else return(EOF+1); |
|---|
| 226 | |
|---|
| 227 | } |
|---|
| 228 | /* ---------------------------------------------------------------- */ |
|---|
/*  Function protein_key_word().
 *      Copy the token that starts at line[index] into key.  The
 *      token ends at the first blank, tab, newline or end of
 *      string.
 *
 *      line    text to scan; NULL yields an empty key.
 *      index   offset in line where the token starts.
 *      key     output buffer, always NUL terminated on return.
 *      length  size of key in bytes; at most length-1 characters
 *              are copied so the terminator always fits.
 *
 *      Nothing is written when key is NULL or length is not
 *      positive.  An index that is negative or beyond the end of
 *      line yields an empty key.
 */
void
protein_key_word(line, index, key, length)
char    *line;
int     index;
char    *key;
int     length;
{
    int     src, dst;
    char    ch;

    if(key==NULL||length<=0) return;
    key[0] = '\0';
    if(line==NULL||index<0) return;

    /* never start reading behind the terminator */
    for(src=0; src<index; src++)
        if(line[src]=='\0') return;

    /* the limit applies to the destination, not to the source  */
    /* offset, and leaves one byte for the terminator            */
    for(src=index, dst=0; dst<length-1; src++, dst++) {
        ch = line[src];
        if(ch==' '||ch=='\t'||ch=='\n'||ch=='\0') break;
        key[dst] = ch;
    }
    key[dst] = '\0';
}
|---|
| 248 | /* ------------------------------------------------------------ */ |
|---|
/*  Function protein_check_blanks().
 *      Check whether the first numb columns of line are blank.
 *      A blank advances one column; a tab advances to the next
 *      multiple of 8.
 *
 *      Returns 1 when the first numb columns hold only blanks and
 *      tabs (also when numb is not positive), 0 otherwise.  A NULL
 *      line or a line that ends before numb columns counts as not
 *      blank.
 */
int
protein_check_blanks(line, numb)
char    *line;
int     numb;
{
    int     pos, column;

    if(numb<=0) return(1);
    if(line==NULL) return(0);

    /* pos walks the characters, column the display position;   */
    /* keeping them apart stops a tab from skipping characters   */
    /* or reading behind the terminator                          */
    for(pos=0, column=0; column<numb; pos++) {
        if(line[pos]==' ') column++;
        else if(line[pos]=='\t') column = (column/8+1)*8;
        else return(0);
    }
    return(1);
}
|---|
| 268 | /* ---------------------------------------------------------------- */ |
|---|
| 269 | /* Function protein_continue_line(). |
|---|
| 270 | /* if there are (numb) blanks at the beginning of line, |
|---|
| 271 | /* it is a continue line of the current command. |
|---|
| 272 | */ |
|---|
| 273 | char |
|---|
| 274 | *protein_continue_line(pattern, string, line, fp) |
|---|
| 275 | char *pattern, **string, *line; |
|---|
| 276 | FILE *fp; |
|---|
| 277 | { |
|---|
| 278 | int Lenstr(), Cmpstr(), len, ind; |
|---|
| 279 | int protein_check_blanks(), Skip_white_space(); |
|---|
| 280 | char key[TOKENNUM], *eof, temp[LINENUM], *Catstr(); |
|---|
| 281 | char *Fgetline(); |
|---|
| 282 | void Cpystr(), protein_key_word(), Append_rp_eoln(); |
|---|
| 283 | |
|---|
| 284 | /* check continue lines */ |
|---|
| 285 | for(eof=Fgetline(line, LINENUM, fp); |
|---|
| 286 | eof!=NULL; eof=Fgetline(line, LINENUM, fp)) { |
|---|
| 287 | if(Lenstr(line)<=1) continue; |
|---|
| 288 | protein_key_word(line, 0, key, TOKENNUM); |
|---|
| 289 | if(Cmpstr(pattern, key)!=EQ) break; |
|---|
| 290 | ind=Skip_white_space(line, p_nonkey_start); |
|---|
| 291 | Cpystr(temp, (line+ind)); |
|---|
| 292 | Append_rp_eoln(string, temp); |
|---|
| 293 | } /* end of continue line checking */ |
|---|
| 294 | return(eof); |
|---|
| 295 | } |
|---|
| 296 | /* -------------------------------------------------------------- */ |
|---|
| 297 | /* Function protein_id(). |
|---|
| 298 | /* Read in protein ID lines. |
|---|
| 299 | */ |
|---|
| 300 | char |
|---|
| 301 | *protein_id(line, fp) |
|---|
| 302 | char *line; |
|---|
| 303 | FILE *fp; |
|---|
| 304 | { |
|---|
| 305 | int index, Skip_white_space(), Lenstr(); |
|---|
| 306 | char *eof, *protein_continue_line(), *Dupstr(); |
|---|
| 307 | void error(), Freespace(); |
|---|
| 308 | |
|---|
| 309 | index = Skip_white_space(line, p_nonkey_start); |
|---|
| 310 | Freespace(&(data.protein.id)); |
|---|
| 311 | data.protein.id = Dupstr(line+index); |
|---|
| 312 | eof = (char*)protein_continue_line("ID", &(data.protein.id), line, fp); |
|---|
| 313 | |
|---|
| 314 | return(eof); |
|---|
| 315 | } |
|---|
| 316 | /* -------------------------------------------------------------- */ |
|---|
| 317 | /* Function protein_date(). |
|---|
| 318 | /* Read in protein DATE lines. |
|---|
| 319 | */ |
|---|
| 320 | char |
|---|
| 321 | *protein_date(line, fp) |
|---|
| 322 | char *line; |
|---|
| 323 | FILE *fp; |
|---|
| 324 | { |
|---|
| 325 | int index, Skip_white_space(), Lenstr(); |
|---|
| 326 | char *eof, *protein_continue_line(), *Dupstr(), *dummy; |
|---|
| 327 | void error(), Freespace(); |
|---|
| 328 | |
|---|
| 329 | index = Skip_white_space(line, p_nonkey_start); |
|---|
| 330 | Freespace(&(data.protein.date)); |
|---|
| 331 | data.protein.date = Dupstr(line+index); |
|---|
| 332 | dummy = Dupstr(" "); |
|---|
| 333 | eof = (char*)protein_continue_line("DT", &(dummy), line, fp); |
|---|
| 334 | Freespace(&dummy); |
|---|
| 335 | |
|---|
| 336 | return(eof); |
|---|
| 337 | } |
|---|
| 338 | /* -------------------------------------------------------------- */ |
|---|
| 339 | /* Function protein_source(). |
|---|
| 340 | /* Read in protein DE lines. |
|---|
| 341 | */ |
|---|
| 342 | char |
|---|
| 343 | *protein_source(line, fp) |
|---|
| 344 | char *line; |
|---|
| 345 | FILE *fp; |
|---|
| 346 | { |
|---|
| 347 | int index, Skip_white_space(); |
|---|
| 348 | char *eof, *protein_continue_line(), *Dupstr(); |
|---|
| 349 | void Freespace(); |
|---|
| 350 | |
|---|
| 351 | index = Skip_white_space(line, p_nonkey_start); |
|---|
| 352 | Freespace(&(data.protein.formname)); |
|---|
| 353 | data.protein.formname = Dupstr(line+index); |
|---|
| 354 | eof = protein_continue_line("OS", &(data.protein.formname), |
|---|
| 355 | line, fp); |
|---|
| 356 | |
|---|
| 357 | return(eof); |
|---|
| 358 | } |
|---|
| 359 | /* -------------------------------------------------------------- */ |
|---|
| 360 | /* Function protein_definition(). |
|---|
| 361 | /* Read in protein DE lines. |
|---|
| 362 | */ |
|---|
| 363 | char |
|---|
| 364 | *protein_definition(line, fp) |
|---|
| 365 | char *line; |
|---|
| 366 | FILE *fp; |
|---|
| 367 | { |
|---|
| 368 | int index, Skip_white_space(); |
|---|
| 369 | char *eof, *protein_continue_line(), *Dupstr(); |
|---|
| 370 | void Freespace(); |
|---|
| 371 | |
|---|
| 372 | index = Skip_white_space(line, p_nonkey_start); |
|---|
| 373 | Freespace(&(data.protein.definition)); |
|---|
| 374 | data.protein.definition = Dupstr(line+index); |
|---|
| 375 | eof = protein_continue_line("DE", &(data.protein.definition), line, fp); |
|---|
| 376 | |
|---|
| 377 | return(eof); |
|---|
| 378 | } |
|---|
| 379 | /* -------------------------------------------------------------- */ |
|---|
| 380 | /* Function protein_accession(). |
|---|
| 381 | /* Read in protein ACCESSION lines. |
|---|
| 382 | */ |
|---|
| 383 | char |
|---|
| 384 | *protein_accession(line, fp) |
|---|
| 385 | char *line; |
|---|
| 386 | FILE *fp; |
|---|
| 387 | { |
|---|
| 388 | int index, Skip_white_space(); |
|---|
| 389 | char *eof, *protein_continue_line(), *Dupstr(); |
|---|
| 390 | void Freespace(); |
|---|
| 391 | |
|---|
| 392 | index = Skip_white_space(line, p_nonkey_start); |
|---|
| 393 | Freespace(&(data.protein.accession)); |
|---|
| 394 | data.protein.accession = Dupstr(line+index); |
|---|
| 395 | eof = protein_continue_line("AC", &(data.protein.accession), line, fp); |
|---|
| 396 | |
|---|
| 397 | return(eof); |
|---|
| 398 | } |
|---|
| 399 | /* -------------------------------------------------------------- */ |
|---|
| 400 | /* Function protein_processing(). |
|---|
| 401 | /* Read in protein RP lines. |
|---|
| 402 | */ |
|---|
| 403 | char |
|---|
| 404 | *protein_processing(line, fp) |
|---|
| 405 | char *line; |
|---|
| 406 | FILE *fp; |
|---|
| 407 | { |
|---|
| 408 | int index, Skip_white_space(); |
|---|
| 409 | char *eof, *protein_continue_line(), *Dupstr(); |
|---|
| 410 | void Freespace(); |
|---|
| 411 | |
|---|
| 412 | index = Skip_white_space(line, p_nonkey_start); |
|---|
| 413 | Freespace(&(data.protein.reference[data.protein.numofref-1].processing)); |
|---|
| 414 | data.protein.reference[data.protein.numofref-1].processing = Dupstr(line+index); |
|---|
| 415 | eof = (char*)protein_continue_line("RP", |
|---|
| 416 | &(data.protein.reference[data.protein.numofref-1].processing), line, fp); |
|---|
| 417 | |
|---|
| 418 | return(eof); |
|---|
| 419 | } |
|---|
| 420 | /* -------------------------------------------------------------- */ |
|---|
| 421 | /* Function protein_keywords(). |
|---|
| 422 | /* Read in protein KEYWORDS lines. |
|---|
| 423 | */ |
|---|
| 424 | char |
|---|
| 425 | *protein_keywords(line, fp) |
|---|
| 426 | char *line; |
|---|
| 427 | FILE *fp; |
|---|
| 428 | { |
|---|
| 429 | int index, Skip_white_space(); |
|---|
| 430 | char *eof, *protein_continue_line(), *Dupstr(); |
|---|
| 431 | void Freespace(); |
|---|
| 432 | |
|---|
| 433 | index = Skip_white_space(line, p_nonkey_start); |
|---|
| 434 | Freespace(&(data.protein.keywords)); |
|---|
| 435 | data.protein.keywords = Dupstr(line+index); |
|---|
| 436 | eof = (char*)protein_continue_line("KW", &(data.protein.keywords), |
|---|
| 437 | line, fp); |
|---|
| 438 | |
|---|
| 439 | return(eof); |
|---|
| 440 | } |
|---|
| 441 | /* -------------------------------------------------------------- */ |
|---|
| 442 | /* Function protein_author(). |
|---|
| 443 | /* Read in protein RL lines. |
|---|
| 444 | */ |
|---|
| 445 | char |
|---|
| 446 | *protein_author(line, fp) |
|---|
| 447 | char *line; |
|---|
| 448 | FILE *fp; |
|---|
| 449 | { |
|---|
| 450 | int index, Skip_white_space(); |
|---|
| 451 | char *eof, *protein_continue_line(), *Dupstr(); |
|---|
| 452 | void Freespace(); |
|---|
| 453 | |
|---|
| 454 | index = Skip_white_space(line, p_nonkey_start); |
|---|
| 455 | Freespace(&(data.protein.reference[data.protein.numofref-1].author)); |
|---|
| 456 | data.protein.reference[data.protein.numofref-1].author = Dupstr(line+index); |
|---|
| 457 | eof = (char*)protein_continue_line("RA", |
|---|
| 458 | &(data.protein.reference[data.protein.numofref-1].author), line, fp); |
|---|
| 459 | |
|---|
| 460 | return(eof); |
|---|
| 461 | } |
|---|
| 462 | /* -------------------------------------------------------------- */ |
|---|
| 463 | /* Function protein_title(). |
|---|
| 464 | /* Read in protein RT lines. |
|---|
| 465 | */ |
|---|
| 466 | char |
|---|
| 467 | *protein_title(line, fp) |
|---|
| 468 | char *line; |
|---|
| 469 | FILE *fp; |
|---|
| 470 | { |
|---|
| 471 | int index, Skip_white_space(); |
|---|
| 472 | char *eof, *protein_continue_line(), *Dupstr(); |
|---|
| 473 | void Freespace(); |
|---|
| 474 | |
|---|
| 475 | index = Skip_white_space(line, p_nonkey_start); |
|---|
| 476 | Freespace(&(data.protein.reference[data.protein.numofref-1].title)); |
|---|
| 477 | data.protein.reference[data.protein.numofref-1].title = Dupstr(line+index); |
|---|
| 478 | eof = (char*)protein_continue_line("RT", |
|---|
| 479 | &(data.protein.reference[data.protein.numofref-1].title), line, fp); |
|---|
| 480 | |
|---|
| 481 | return(eof); |
|---|
| 482 | } |
|---|
| 483 | /* -------------------------------------------------------------- */ |
|---|
| 484 | /* Function protein_reference(). |
|---|
| 485 | /* Read in protein RL lines. |
|---|
| 486 | */ |
|---|
| 487 | char |
|---|
| 488 | *protein_reference(line, fp) |
|---|
| 489 | char *line; |
|---|
| 490 | FILE *fp; |
|---|
| 491 | { |
|---|
| 492 | int index, Skip_white_space(); |
|---|
| 493 | char *eof, *protein_continue_line(), *Dupstr(); |
|---|
| 494 | void Freespace(); |
|---|
| 495 | |
|---|
| 496 | index = Skip_white_space(line, p_nonkey_start); |
|---|
| 497 | Freespace(&(data.protein.reference[data.protein.numofref-1].journal)); |
|---|
| 498 | data.protein.reference[data.protein.numofref-1].journal = Dupstr(line+index); |
|---|
| 499 | eof = (char*)protein_continue_line("RL", |
|---|
| 500 | &(data.protein.reference[data.protein.numofref-1].journal), line, fp); |
|---|
| 501 | |
|---|
| 502 | return(eof); |
|---|
| 503 | } |
|---|
| 504 | /* -------------------------------------------------------------- */ |
|---|
| 505 | /* Function protein_version(). |
|---|
| 506 | /* Read in protein RN lines. |
|---|
| 507 | */ |
|---|
| 508 | char |
|---|
| 509 | *protein_version(line, fp) |
|---|
| 510 | char *line; |
|---|
| 511 | FILE *fp; |
|---|
| 512 | { |
|---|
| 513 | int index, Skip_white_space(); |
|---|
| 514 | char *eof, *protein_continue_line(), *Dupstr(); |
|---|
| 515 | char *Reallocspace(), *Fgetline(); |
|---|
| 516 | void Freespace(); |
|---|
| 517 | |
|---|
| 518 | index = Skip_white_space(line, p_nonkey_start); |
|---|
| 519 | if(data.protein.numofref==0) { |
|---|
| 520 | data.protein.numofref++; |
|---|
| 521 | data.protein.reference = (Emblref*)calloc(1,sizeof(Emblref)*1); |
|---|
| 522 | data.protein.reference[0].author = Dupstr(""); |
|---|
| 523 | data.protein.reference[0].title = Dupstr(""); |
|---|
| 524 | data.protein.reference[0].journal = Dupstr(""); |
|---|
| 525 | data.protein.reference[0].processing = Dupstr(""); |
|---|
| 526 | } else { |
|---|
| 527 | data.protein.numofref++; |
|---|
| 528 | data.protein.reference = (Emblref*)Reallocspace(data.protein.reference, |
|---|
| 529 | sizeof(Emblref)*(data.protein.numofref)); |
|---|
| 530 | data.protein.reference[data.protein.numofref-1].author = Dupstr(""); |
|---|
| 531 | data.protein.reference[data.protein.numofref-1].title = Dupstr(""); |
|---|
| 532 | data.protein.reference[data.protein.numofref-1].journal = Dupstr(""); |
|---|
| 533 | data.protein.reference[data.protein.numofref-1].processing = Dupstr(""); |
|---|
| 534 | } |
|---|
| 535 | eof=Fgetline(line, LINENUM, fp); |
|---|
| 536 | return(eof); |
|---|
| 537 | } |
|---|
| 538 | /* -------------------------------------------------------------- */ |
|---|
| 539 | /* Function protein_comments(). |
|---|
| 540 | /* Read in protein comment lines. |
|---|
| 541 | */ |
|---|
| 542 | char |
|---|
| 543 | *protein_comments(line, fp) |
|---|
| 544 | char *line; |
|---|
| 545 | FILE *fp; |
|---|
| 546 | { |
|---|
| 547 | int index, Skip_white_space(), len, Lenstr(); |
|---|
| 548 | char *eof, *Fgetline(), *protein_continue_line(), *Dupstr(); |
|---|
| 549 | void Freespace(), Append(); |
|---|
| 550 | |
|---|
| 551 | if(Lenstr(data.protein.comments)<=1) |
|---|
| 552 | Freespace(&(data.protein.comments)); |
|---|
| 553 | for(; line[0]='C'&&line[1]=='C'; eof=Fgetline(line, LINENUM, fp)) |
|---|
| 554 | Append(&(data.protein.comments), line+5); |
|---|
| 555 | return(eof); |
|---|
| 556 | } |
|---|
| 557 | /* ---------------------------------------------------------------- */ |
|---|
| 558 | /* Function protein_skip_unidentified(). |
|---|
| 559 | /* if there are (numb) blanks at the beginning of line, |
|---|
| 560 | /* it is a continue line of the current command. |
|---|
| 561 | */ |
|---|
| 562 | char |
|---|
| 563 | *protein_skip_unidentified(pattern, line, fp) |
|---|
| 564 | char *pattern, *line; |
|---|
| 565 | FILE *fp; |
|---|
| 566 | { |
|---|
| 567 | int Lenstr(), Cmpstr(); |
|---|
| 568 | char *Fgetline(), *eof; |
|---|
| 569 | char key[TOKENNUM]; |
|---|
| 570 | void protein_key_word(); |
|---|
| 571 | |
|---|
| 572 | /* check continue lines */ |
|---|
| 573 | for(eof=Fgetline(line, LINENUM, fp); |
|---|
| 574 | eof!=NULL; eof=Fgetline(line, LINENUM, fp)) { |
|---|
| 575 | protein_key_word(line, 0, key, TOKENNUM); |
|---|
| 576 | if(Cmpstr(key, pattern)!=EQ) break; |
|---|
| 577 | } /* end of continue line checking */ |
|---|
| 578 | return(eof); |
|---|
| 579 | } |
|---|
| 580 | /* -------------------------------------------------------------- */ |
|---|
| 581 | /* Function protein_origin(). |
|---|
| 582 | /* Read in protein sequence data. |
|---|
| 583 | */ |
|---|
| 584 | char |
|---|
| 585 | *protein_origin(line, fp) |
|---|
| 586 | char *line; |
|---|
| 587 | FILE *fp; |
|---|
| 588 | { |
|---|
| 589 | char *Fgetline(), *eof, *Reallocspace(); |
|---|
| 590 | int index; |
|---|
| 591 | |
|---|
| 592 | data.seq_length = 0; |
|---|
| 593 | /* read in whole sequence data */ |
|---|
| 594 | for(eof=Fgetline(line, LINENUM, fp); |
|---|
| 595 | eof!=NULL&&line[0]!='/'&&line[1]!='/'; |
|---|
| 596 | eof=Fgetline(line, LINENUM, fp)) |
|---|
| 597 | { |
|---|
| 598 | for(index=5; line[index]!='\n'&&line[index]!='\0'; |
|---|
| 599 | index++) { |
|---|
| 600 | if(line[index]!=' '&&data.seq_length>=data.max) { |
|---|
| 601 | data.max += 100; |
|---|
| 602 | data.sequence = (char*)Reallocspace( |
|---|
| 603 | data.sequence, |
|---|
| 604 | (unsigned)(sizeof(char)*data.max)); |
|---|
| 605 | } |
|---|
| 606 | if(line[index]!=' ') |
|---|
| 607 | data.sequence[data.seq_length++] |
|---|
| 608 | = line[index]; |
|---|
| 609 | } |
|---|
| 610 | data.sequence[data.seq_length] = '\0'; |
|---|
| 611 | } |
|---|
| 612 | return(eof); |
|---|
| 613 | } |
|---|
| 614 | /* -------------------------------------------------------------- */ |
|---|
| 615 | /* Function ptom(). |
|---|
| 616 | /* Convert from Protein format to Macke format. |
|---|
| 617 | */ |
|---|
| 618 | int |
|---|
| 619 | ptom() { |
|---|
| 620 | void protein_key_word(), error(); |
|---|
| 621 | int Lenstr(), indj, indk, remnum; |
|---|
| 622 | char temp[LONGTEXT], *Dupstr(), *Reallocspace(); |
|---|
| 623 | void Freespace(); |
|---|
| 624 | |
|---|
| 625 | /* copy seq abbr, assume every entry in protein must end with \n\0 */ |
|---|
| 626 | /* no '\n' at the end of the string */ |
|---|
| 627 | protein_key_word(data.protein.id, 0, temp, TOKENNUM); |
|---|
| 628 | Freespace(&(data.macke.seqabbr)); |
|---|
| 629 | data.macke.seqabbr = Dupstr(temp); |
|---|
| 630 | /* copy name */ |
|---|
| 631 | Freespace(&(data.macke.name)); |
|---|
| 632 | data.macke.name = Dupstr(data.protein.formname); |
|---|
| 633 | /* copy date---DD-MMM-YYYY\n\0 */ |
|---|
| 634 | Freespace(&(data.macke.date)); |
|---|
| 635 | data.macke.date = Dupstr(data.protein.date); |
|---|
| 636 | /* copy protein entry (accession has higher priority) */ |
|---|
| 637 | if(Lenstr(data.protein.accession)>1) { |
|---|
| 638 | Freespace(&(data.macke.acs)); |
|---|
| 639 | data.macke.acs = Dupstr(data.protein.accession); |
|---|
| 640 | } |
|---|
| 641 | if(data.protein.numofref>0) { |
|---|
| 642 | if(Lenstr(data.protein.reference[0].journal)>1) { |
|---|
| 643 | Freespace(&(data.macke.journal)); |
|---|
| 644 | data.macke.journal = Dupstr(data.protein.reference[0].journal); |
|---|
| 645 | } |
|---|
| 646 | if(Lenstr(data.protein.reference[0].title)>1) { |
|---|
| 647 | Freespace(&(data.macke.title)); |
|---|
| 648 | data.macke.title = Dupstr(data.protein.reference[0].title); |
|---|
| 649 | } |
|---|
| 650 | if(Lenstr(data.protein.reference[0].author)>1) { |
|---|
| 651 | Freespace(&(data.macke.author)); |
|---|
| 652 | data.macke.author = Dupstr(data.protein.reference[0].author); |
|---|
| 653 | } |
|---|
| 654 | } |
|---|
| 655 | /* the rest of data are put into remarks, rem:..... */ |
|---|
| 656 | remnum=0; |
|---|
| 657 | for(indj=1; indj<data.protein.numofref; indj++) { |
|---|
| 658 | if(Lenstr(data.protein.reference[indj].journal)>1) { |
|---|
| 659 | sprintf(temp, "jour:%s", data.protein.reference[indj].journal); |
|---|
| 660 | data.macke.remarks = (char**)Reallocspace(data.macke.remarks, |
|---|
| 661 | sizeof(char*)*(remnum+1)); |
|---|
| 662 | data.macke.remarks[remnum++] = Dupstr(temp); |
|---|
| 663 | } /* not empty */ |
|---|
| 664 | if(Lenstr(data.protein.reference[indj].author)>1) { |
|---|
| 665 | sprintf(temp, "auth:%s", data.protein.reference[indj].author); |
|---|
| 666 | data.macke.remarks = (char**)Reallocspace(data.macke.remarks, |
|---|
| 667 | sizeof(char*)*(remnum+1)); |
|---|
| 668 | data.macke.remarks[remnum++] = Dupstr(temp); |
|---|
| 669 | } /* not empty author field */ |
|---|
| 670 | if(Lenstr(data.protein.reference[indj].title)>1) { |
|---|
| 671 | sprintf(temp, "title:%s", data.protein.reference[indj].title); |
|---|
| 672 | data.macke.remarks = (char**)Reallocspace(data.macke.remarks, |
|---|
| 673 | sizeof(char*)*(remnum+1)); |
|---|
| 674 | data.macke.remarks[remnum++] = Dupstr(temp); |
|---|
| 675 | } /* not empty title field */ |
|---|
| 676 | if(Lenstr(data.protein.reference[indj].processing)>1) { |
|---|
| 677 | sprintf(temp, "processing:%s", data.protein.reference[indj].processing); |
|---|
| 678 | data.macke.remarks = (char**)Reallocspace(data.macke.remarks, |
|---|
| 679 | sizeof(char*)*(remnum+1)); |
|---|
| 680 | data.macke.remarks[remnum++] = Dupstr(temp); |
|---|
| 681 | } /* not empty processing field */ |
|---|
| 682 | } /* loop for copying other reference */ |
|---|
| 683 | /* copy keywords as remark */ |
|---|
| 684 | if(Lenstr(data.protein.keywords)>1) { |
|---|
| 685 | sprintf(temp, "KEYWORDS:%s", data.protein.keywords); |
|---|
| 686 | data.macke.remarks = (char**)Reallocspace(data.macke.remarks, |
|---|
| 687 | sizeof(char*)*(remnum+1)); |
|---|
| 688 | data.macke.remarks[remnum++] = Dupstr(temp); |
|---|
| 689 | } |
|---|
| 690 | /* Maybe redudantly */ |
|---|
| 691 | if(Lenstr(data.protein.comments)>1) { |
|---|
| 692 | for(indj=0, indk=0; data.protein.comments[indj]!='\0'; indj++) |
|---|
| 693 | { |
|---|
| 694 | temp[indk++] = data.protein.comments[indj]; |
|---|
| 695 | if(data.protein.comments[indj]=='\n') { |
|---|
| 696 | temp[indk] = '\0'; |
|---|
| 697 | data.macke.remarks = (char**)Reallocspace |
|---|
| 698 | (data.macke.remarks, |
|---|
| 699 | sizeof(char*)*(remnum+1)); |
|---|
| 700 | data.macke.remarks[remnum++] |
|---|
| 701 | = Dupstr(temp); |
|---|
| 702 | indk=0; |
|---|
| 703 | } /* new remark line */ |
|---|
| 704 | } /* for loop to find other remarks */ |
|---|
| 705 | } /* other comments */ |
|---|
| 706 | data.macke.numofrem = remnum; |
|---|
| 707 | return(1); |
|---|
| 708 | } |
|---|
| 709 | /* ---------------------------------------------------------- */ |
|---|
| 710 | /* Function protein_to_genbank(). |
|---|
| 711 | /* Convert from Protein format to genbank format. |
|---|
| 712 | */ |
|---|
| 713 | void |
|---|
| 714 | protein_to_genbank(inf, outf) |
|---|
| 715 | char *inf, *outf; |
|---|
| 716 | { |
|---|
| 717 | FILE *ifp, *ofp, *fopen(); |
|---|
| 718 | char temp[TOKENNUM], protein_in(); |
|---|
| 719 | void init(), init_genbank(), init_macke(), init_protein(); |
|---|
| 720 | void init_seq_data(); |
|---|
| 721 | void genbank_out(); |
|---|
| 722 | void error(); |
|---|
| 723 | int indi, ptom(), mtog(), total_num; |
|---|
| 724 | |
|---|
| 725 | if((ifp=fopen(inf, "r"))==NULL) { |
|---|
| 726 | sprintf(temp, "Cannot open input file %s, exit\n", inf); |
|---|
| 727 | error(94, temp); |
|---|
| 728 | } |
|---|
| 729 | if(Lenstr(outf)<=0) ofp = stdout; |
|---|
| 730 | else if((ofp=fopen(outf, "w"))==NULL) { |
|---|
| 731 | sprintf(temp, "Cannot open output file %s, exit\n", outf); |
|---|
| 732 | error(96, temp); |
|---|
| 733 | } |
|---|
| 734 | |
|---|
| 735 | /* seq irelenvant header */ |
|---|
| 736 | init(); |
|---|
| 737 | /* rewind(ifp); */ |
|---|
| 738 | init_genbank(); |
|---|
| 739 | init_macke(); |
|---|
| 740 | init_protein(); |
|---|
| 741 | while(protein_in(ifp)!=EOF) { |
|---|
| 742 | data.numofseq++; |
|---|
| 743 | if(ptom()&&mtog()) genbank_out(ofp); |
|---|
| 744 | init_genbank(); |
|---|
| 745 | init_macke(); |
|---|
| 746 | init_protein(); |
|---|
| 747 | } |
|---|
| 748 | |
|---|
| 749 | #ifdef log |
|---|
| 750 | fprintf(stderr, "Total %d sequences have been processed\n", total_num); |
|---|
| 751 | #endif |
|---|
| 752 | |
|---|
| 753 | } |
|---|
| 754 | /* ---------------------------------------------------------------- */ |
|---|
| 755 | /* Function protein_to_paup(). |
|---|
| 756 | /* Convert from Swissprot file to paup file. |
|---|
| 757 | */ |
|---|
| 758 | void |
|---|
| 759 | protein_to_paup(inf, outf) |
|---|
| 760 | char *inf, *outf; |
|---|
| 761 | { |
|---|
| 762 | FILE *ifp, *ofp, *fopen(); |
|---|
| 763 | int Lenstr(), maxsize, current, total_seq, first_line; |
|---|
| 764 | char protein_in_id(), temp[TOKENNUM], *name; |
|---|
| 765 | char *Dupstr(), *today_date(), *today; |
|---|
| 766 | void init(), init_paup(), init_seq_data(), paup_print_line(); |
|---|
| 767 | void error(), init_protein(), protein_key_word(), Freespace(); |
|---|
| 768 | |
|---|
| 769 | if((ifp=fopen(inf, "r"))==NULL) { |
|---|
| 770 | sprintf(temp, "Cannot open input file %s, exit\n", inf); |
|---|
| 771 | error(80, temp); |
|---|
| 772 | } |
|---|
| 773 | if(Lenstr(outf)<=0) ofp = stdout; |
|---|
| 774 | else if((ofp=fopen(outf, "w"))==NULL) { |
|---|
| 775 | sprintf(temp, "Cannot open output file %s, exit\n", outf); |
|---|
| 776 | error(81, temp); |
|---|
| 777 | } |
|---|
| 778 | maxsize = 1; current = 0; |
|---|
| 779 | name = NULL; |
|---|
| 780 | init_paup(); |
|---|
| 781 | paup_print_header(ofp); |
|---|
| 782 | while(maxsize>current) { |
|---|
| 783 | init(); |
|---|
| 784 | FILE_BUFFER_rewind(ifp); |
|---|
| 785 | total_seq = 0; |
|---|
| 786 | /* first time read input file */ |
|---|
| 787 | first_line = 0; |
|---|
| 788 | while(protein_in_id(ifp)!=EOF) { |
|---|
| 789 | Freespace(&name); |
|---|
| 790 | protein_key_word(data.protein.id, 0, temp, TOKENNUM); |
|---|
| 791 | name = Dupstr(temp); |
|---|
| 792 | if(data.seq_length>maxsize) |
|---|
| 793 | maxsize = data.seq_length; |
|---|
| 794 | if(current<data.seq_length) first_line++; |
|---|
| 795 | paup_print_line(name, data.sequence, current, |
|---|
| 796 | (first_line==1), ofp); |
|---|
| 797 | if(first_line==1) first_line++; /* avoid repeating */ |
|---|
| 798 | init_paup(); |
|---|
| 799 | init_protein(); |
|---|
| 800 | total_seq++; |
|---|
| 801 | } |
|---|
| 802 | current += (SEQLINE - 10); |
|---|
| 803 | if(maxsize>current) fprintf(ofp, "\n"); |
|---|
| 804 | } /* print block by block */ |
|---|
| 805 | fprintf(ofp, " ;\nENDBLOCK;\n"); |
|---|
| 806 | rewind(ofp); |
|---|
| 807 | fprintf(ofp, "#NEXUS\n"); |
|---|
| 808 | today = today_date(); |
|---|
| 809 | if(today[Lenstr(today)-1]=='\n') today[Lenstr(today)-1] = '\0'; |
|---|
| 810 | fprintf(ofp, "[! RDP - the Ribsomal Database Project, (%s).]\n", today); |
|---|
| 811 | fprintf(ofp, "[! To get started, send HELP to rdp@info.mcs.anl.gov ]\n"); |
|---|
| 812 | fprintf(ofp, "BEGIN DATA;\n DIMENSIONS\n"); |
|---|
| 813 | fprintf(ofp, " NTAX = %6d\n NCHAR = %6d\n ;\n", total_seq, maxsize); |
|---|
| 814 | |
|---|
| 815 | #ifdef log |
|---|
| 816 | fprintf(stderr, "Total %d sequences have been processed\n", total_seq); |
|---|
| 817 | #endif |
|---|
| 818 | |
|---|
| 819 | fclose(ifp); fclose(ofp); |
|---|
| 820 | } |
|---|