| 1 | /* GenBank and Macke converting program */ |
|---|
| 2 | |
|---|
| 3 | #include <stdio.h> |
|---|
| 4 | #include <stdlib.h> |
|---|
| 5 | #include "convert.h" |
|---|
| 6 | #include "global.h" |
|---|
| 7 | |
|---|
| 8 | extern int warning_out; |
|---|
| 9 | |
|---|
/* --------------------------------------------------------------
 *   Function init_gm_data().
 *       Initialize the data structures of both formats handled by
 *       this file: the Macke one first (init_macke()), then the
 *       GenBank one (init_genbank()).
 */
void init_gm_data()
{
    init_macke();
    init_genbank();
}
|---|
| 21 | /* ---------------------------------------------------------- |
|---|
| 22 | * Function genbank_to_macke(). |
|---|
| 23 | * Convert from Genbank format to Macke format. |
|---|
| 24 | */ |
|---|
| 25 | void |
|---|
| 26 | genbank_to_macke(inf, outf) |
|---|
| 27 | char *inf, *outf; |
|---|
| 28 | { |
|---|
| 29 | FILE *IFP, *ofp; |
|---|
| 30 | FILE_BUFFER ifp; |
|---|
| 31 | char temp[TOKENNUM]; |
|---|
| 32 | int indi, total_num; |
|---|
| 33 | |
|---|
| 34 | if((IFP=fopen(inf, "r"))==NULL) { |
|---|
| 35 | sprintf(temp, "CANNOT open input file %s, exit.\n", inf); |
|---|
| 36 | error(0, temp); |
|---|
| 37 | } |
|---|
| 38 | ifp = create_FILE_BUFFER(inf, IFP); |
|---|
| 39 | if(Lenstr(outf) <= 0) ofp = stdout; |
|---|
| 40 | else if((ofp=fopen(outf, "w"))==NULL) { |
|---|
| 41 | sprintf(temp, "CANNOT open output file %s, exit.\n", outf); |
|---|
| 42 | error(1, temp); |
|---|
| 43 | } |
|---|
| 44 | /* seq irelenvant header */ |
|---|
| 45 | init(); |
|---|
| 46 | init_gm_data(); |
|---|
| 47 | macke_out_header(ofp); |
|---|
| 48 | |
|---|
| 49 | #ifdef log |
|---|
| 50 | fprintf(stderr, "Start converting...\n"); |
|---|
| 51 | #endif |
|---|
| 52 | |
|---|
| 53 | for(indi=0; indi<3; indi++) { |
|---|
| 54 | FILE_BUFFER_rewind(ifp); |
|---|
| 55 | init_seq_data(); |
|---|
| 56 | while(genbank_in(ifp)!=EOF) { |
|---|
| 57 | data.numofseq++; |
|---|
| 58 | if(gtom()) { |
|---|
| 59 | /* convert from genbank form to macke form */ |
|---|
| 60 | switch(indi) { |
|---|
| 61 | case 0: |
|---|
| 62 | /* output seq display format */ |
|---|
| 63 | macke_out0(ofp, GENBANK); |
|---|
| 64 | break; |
|---|
| 65 | case 1: |
|---|
| 66 | /* output seq information */ |
|---|
| 67 | macke_out1(ofp); |
|---|
| 68 | break; |
|---|
| 69 | case 2: |
|---|
| 70 | /* output seq data */ |
|---|
| 71 | macke_out2(ofp); |
|---|
| 72 | break; |
|---|
| 73 | default: ; |
|---|
| 74 | } |
|---|
| 75 | } else error(7, |
|---|
| 76 | "Conversion from genbank to macke fails, Exit"); |
|---|
| 77 | init_gm_data(); |
|---|
| 78 | #ifdef log |
|---|
| 79 | if((data.numofseq % 100)==0) |
|---|
| 80 | fprintf(stderr, "%d sequences have been processed\n", |
|---|
| 81 | data.numofseq); |
|---|
| 82 | #endif |
|---|
| 83 | } |
|---|
| 84 | total_num = data.numofseq; |
|---|
| 85 | if(indi==0) { |
|---|
| 86 | fprintf(ofp, "#-\n"); |
|---|
| 87 | /* no warning message for next loop */ |
|---|
| 88 | warning_out = 0; |
|---|
| 89 | } |
|---|
| 90 | } /* for each seq; loop */ |
|---|
| 91 | |
|---|
| 92 | warning_out = 1; /* resume warning messages */ |
|---|
| 93 | |
|---|
| 94 | #ifdef log |
|---|
| 95 | fprintf(stderr, |
|---|
| 96 | "Total %d sequences have been processed\n", total_num); |
|---|
| 97 | #endif |
|---|
| 98 | |
|---|
| 99 | } |
|---|
| 100 | /* -------------------------------------------------------------- |
|---|
| 101 | * Function gtom(). |
|---|
| 102 | * Convert from Genbank format to Macke format. |
|---|
| 103 | */ |
|---|
| 104 | int |
|---|
| 105 | gtom() { |
|---|
| 106 | /* void genbank_key_word(), error(), Freespace(), Cpystr(); */ |
|---|
| 107 | /* void Append(), gtom_remarks(), replace_entry(); */ |
|---|
| 108 | /* char *Catstr(); */ |
|---|
| 109 | char temp[LONGTEXT], buffer[TOKENNUM]; |
|---|
| 110 | char genus[TOKENNUM], species[TOKENNUM]; |
|---|
| 111 | /* char *genbank_date(), *today_date(); */ |
|---|
| 112 | /* char *genbank_get_strain(), *genbank_get_subspecies(); */ |
|---|
| 113 | /* char *genbank_get_atcc(); */ |
|---|
| 114 | /* int Lenstr(), num_of_remark(), Cmpstr(); */ |
|---|
| 115 | /* int len; */ |
|---|
| 116 | /* int indj, indk, remnum; */ |
|---|
| 117 | |
|---|
| 118 | /* copy seq abbr, assume every entry in gbk must end with \n\0 */ |
|---|
| 119 | /* no '\n' at the end of the string */ |
|---|
| 120 | genbank_key_word(data.gbk.locus, 0, temp, TOKENNUM); |
|---|
| 121 | replace_entry(&(data.macke.seqabbr), temp); |
|---|
| 122 | |
|---|
| 123 | /* copy name and definition*/ |
|---|
| 124 | if(Lenstr(data.gbk.organism)>1) |
|---|
| 125 | replace_entry(&(data.macke.name),data.gbk.organism); |
|---|
| 126 | else if(Lenstr(data.gbk.definition)>1) { |
|---|
| 127 | sscanf(data.gbk.definition, "%s %s", genus, species); |
|---|
| 128 | if(species[Lenstr(species)-1]==';') |
|---|
| 129 | species[Lenstr(species)-1] = '\0'; |
|---|
| 130 | sprintf(temp, "%s %s\n", genus, species); |
|---|
| 131 | replace_entry(&(data.macke.name), temp); |
|---|
| 132 | } |
|---|
| 133 | |
|---|
| 134 | /* copy cc name and number */ |
|---|
| 135 | if(Lenstr(data.gbk.comments.orginf.cc)>1) |
|---|
| 136 | replace_entry(&(data.macke.atcc), |
|---|
| 137 | data.gbk.comments.orginf.cc); |
|---|
| 138 | |
|---|
| 139 | /* copy rna(methods) */ |
|---|
| 140 | if(Lenstr(data.gbk.comments.seqinf.methods)>1) |
|---|
| 141 | replace_entry(&(data.macke.rna), |
|---|
| 142 | data.gbk.comments.seqinf.methods); |
|---|
| 143 | |
|---|
| 144 | /* copy date---DD-MMM-YYYY\n\0 */ |
|---|
| 145 | if(Lenstr(data.gbk.locus)<61) { |
|---|
| 146 | data.macke.date = genbank_date(today_date()); |
|---|
| 147 | Append(&(data.macke.date), "\n"); |
|---|
| 148 | } else |
|---|
| 149 | replace_entry(&(data.macke.date), data.gbk.locus+50); |
|---|
| 150 | |
|---|
| 151 | /* copy genbank entry (gbkentry has higher priority than gbk.accession)*/ |
|---|
| 152 | if(Lenstr(data.gbk.comments.seqinf.gbkentry)>1) |
|---|
| 153 | replace_entry(&(data.macke.acs), |
|---|
| 154 | data.gbk.comments.seqinf.gbkentry); |
|---|
| 155 | else { |
|---|
| 156 | if(Lenstr(data.gbk.accession)>1 |
|---|
| 157 | &&Cmpstr(data.gbk.accession, "No information\n")!=EQ) { |
|---|
| 158 | sscanf(data.gbk.accession, "%s", buffer); |
|---|
| 159 | Catstr(buffer, "\n"); |
|---|
| 160 | } else Cpystr(buffer, "\n"); |
|---|
| 161 | replace_entry(&(data.macke.acs), buffer); |
|---|
| 162 | } |
|---|
| 163 | |
|---|
| 164 | /* copy the first reference from GenBank to Macke */ |
|---|
| 165 | if(data.gbk.numofref>0) { |
|---|
| 166 | if(Lenstr(data.gbk.reference[0].author)>1) |
|---|
| 167 | replace_entry(&(data.macke.author), |
|---|
| 168 | data.gbk.reference[0].author); |
|---|
| 169 | |
|---|
| 170 | if(Lenstr(data.gbk.reference[0].journal)>1) |
|---|
| 171 | replace_entry(&(data.macke.journal), |
|---|
| 172 | data.gbk.reference[0].journal); |
|---|
| 173 | |
|---|
| 174 | if(Lenstr(data.gbk.reference[0].title)>1) |
|---|
| 175 | replace_entry(&(data.macke.title), |
|---|
| 176 | data.gbk.reference[0].title); |
|---|
| 177 | } /* the rest of references are put into remarks, rem:..... */ |
|---|
| 178 | |
|---|
| 179 | gtom_remarks(); |
|---|
| 180 | |
|---|
| 181 | /* adjust the strain, subspecies, and atcc information */ |
|---|
| 182 | Freespace(&(data.macke.strain)); |
|---|
| 183 | data.macke.strain = (char*)genbank_get_strain(); |
|---|
| 184 | Freespace(&(data.macke.subspecies)); |
|---|
| 185 | data.macke.subspecies = (char*)genbank_get_subspecies(); |
|---|
| 186 | if(Lenstr(data.macke.atcc)<=1) { |
|---|
| 187 | Freespace(&(data.macke.atcc)); |
|---|
| 188 | data.macke.atcc = (char*)genbank_get_atcc(); |
|---|
| 189 | } |
|---|
| 190 | |
|---|
| 191 | return(1); |
|---|
| 192 | } |
|---|
| 193 | /* -------------------------------------------------------------- |
|---|
| 194 | * Function gtom_remarks(). |
|---|
| 195 | * Create Macke remarks. |
|---|
| 196 | */ |
|---|
| 197 | void |
|---|
| 198 | gtom_remarks() { |
|---|
| 199 | |
|---|
| 200 | int remnum, len; |
|---|
| 201 | int indi, indj; |
|---|
| 202 | char temp[LONGTEXT]; |
|---|
| 203 | /* void gtom_copy_remark(); */ |
|---|
| 204 | |
|---|
| 205 | /* remarks in Macke format */ |
|---|
| 206 | remnum = num_of_remark(); |
|---|
| 207 | data.macke.remarks = (char**)calloc(1,(unsigned)(sizeof(char*)*remnum)); |
|---|
| 208 | remnum=0; |
|---|
| 209 | |
|---|
| 210 | /* REFERENCE the first reference */ |
|---|
| 211 | if(data.gbk.numofref>0) |
|---|
| 212 | gtom_copy_remark(data.gbk.reference[0].ref, "ref:", &remnum); |
|---|
| 213 | |
|---|
| 214 | /* The rest of the REFERENCES */ |
|---|
| 215 | for(indi=1; indi<data.gbk.numofref; indi++) { |
|---|
| 216 | gtom_copy_remark(data.gbk.reference[indi].ref, "ref:", &remnum); |
|---|
| 217 | gtom_copy_remark(data.gbk.reference[indi].author, "auth:", &remnum); |
|---|
| 218 | gtom_copy_remark(data.gbk.reference[indi].journal, "jour:", &remnum); |
|---|
| 219 | gtom_copy_remark(data.gbk.reference[indi].title, "title:", &remnum); |
|---|
| 220 | gtom_copy_remark(data.gbk.reference[indi].standard, "standard:", &remnum); |
|---|
| 221 | |
|---|
| 222 | } /* loop for copying other reference */ |
|---|
| 223 | |
|---|
| 224 | /* copy keywords as remark */ |
|---|
| 225 | gtom_copy_remark(data.gbk.keywords, "KEYWORDS:", &remnum); |
|---|
| 226 | |
|---|
| 227 | /* copy accession as remark when genbank entry also exists. */ |
|---|
| 228 | gtom_copy_remark(data.gbk.accession, "GenBank ACCESSION:", &remnum); |
|---|
| 229 | |
|---|
| 230 | /* copy source of strain */ |
|---|
| 231 | gtom_copy_remark(data.gbk.comments.orginf.source, "Source of strain:", &remnum); |
|---|
| 232 | |
|---|
| 233 | /* copy former name */ |
|---|
| 234 | gtom_copy_remark(data.gbk.comments.orginf.formname, "Former name:", &remnum); |
|---|
| 235 | |
|---|
| 236 | /* copy alternate name */ |
|---|
| 237 | gtom_copy_remark(data.gbk.comments.orginf.nickname, "Alternate name:", &remnum); |
|---|
| 238 | |
|---|
| 239 | /* copy common name */ |
|---|
| 240 | gtom_copy_remark(data.gbk.comments.orginf.commname, "Common name:", &remnum); |
|---|
| 241 | |
|---|
| 242 | /* copy host organism */ |
|---|
| 243 | gtom_copy_remark(data.gbk.comments.orginf.hostorg, "Host organism:", &remnum); |
|---|
| 244 | |
|---|
| 245 | /* copy RDP ID */ |
|---|
| 246 | gtom_copy_remark(data.gbk.comments.seqinf.RDPid, "RDP ID:", &remnum); |
|---|
| 247 | |
|---|
| 248 | /* copy methods */ |
|---|
| 249 | gtom_copy_remark(data.gbk.comments.seqinf.methods, "Sequencing methods:", &remnum); |
|---|
| 250 | |
|---|
| 251 | /* copy 3' end complete */ |
|---|
| 252 | if(data.gbk.comments.seqinf.comp3!=' ') { |
|---|
| 253 | if(data.gbk.comments.seqinf.comp3=='y') data.macke.remarks[remnum++] = Dupstr("3' end complete: Yes\n"); |
|---|
| 254 | else data.macke.remarks[remnum++]= Dupstr("3' end complete: No\n"); |
|---|
| 255 | } |
|---|
| 256 | |
|---|
| 257 | /* copy 5' end complete */ |
|---|
| 258 | if(data.gbk.comments.seqinf.comp5!=' ') { |
|---|
| 259 | if(data.gbk.comments.seqinf.comp5=='y') data.macke.remarks[remnum++]= Dupstr("5' end complete: Yes\n"); |
|---|
| 260 | else data.macke.remarks[remnum++]= Dupstr("5' end complete: No\n"); |
|---|
| 261 | } |
|---|
| 262 | |
|---|
| 263 | /* other comments, not RDP DataBase specially defined */ |
|---|
| 264 | if(Lenstr(data.gbk.comments.others)>0) { |
|---|
| 265 | len = Lenstr(data.gbk.comments.others); |
|---|
| 266 | for(indi=0, indj=0; indi<len; indi++) |
|---|
| 267 | { |
|---|
| 268 | temp[indj++] = data.gbk.comments.others[indi]; |
|---|
| 269 | if(data.gbk.comments.others[indi]=='\n' |
|---|
| 270 | || data.gbk.comments.others[indi]=='\0') { |
|---|
| 271 | temp[indj] = '\0'; |
|---|
| 272 | data.macke.remarks[remnum++] = |
|---|
| 273 | (char*)Dupstr(temp); |
|---|
| 274 | |
|---|
| 275 | indj=0; |
|---|
| 276 | } /* new remark line */ |
|---|
| 277 | } /* for loop to find other remarks */ |
|---|
| 278 | } /* other comments */ |
|---|
| 279 | |
|---|
| 280 | /* done with the remarks copying */ |
|---|
| 281 | |
|---|
| 282 | data.macke.numofrem = remnum; |
|---|
| 283 | } |
|---|
| 284 | /* -------------------------------------------------------------------- |
|---|
| 285 | * Function gtom_copy_remark(). |
|---|
| 286 | * If string length > 1 then copy string with key to remark. |
|---|
| 287 | */ |
|---|
| 288 | void gtom_copy_remark(string, key, remnum) |
|---|
| 289 | char *string; |
|---|
| 290 | const char *key; |
|---|
| 291 | int *remnum; |
|---|
| 292 | { |
|---|
| 293 | /* int Lenstr(); */ |
|---|
| 294 | /* char *Dupstr(); */ |
|---|
| 295 | /* void Append(); */ |
|---|
| 296 | |
|---|
| 297 | /* copy host organism */ |
|---|
| 298 | if(Lenstr(string)>1) { |
|---|
| 299 | data.macke.remarks[(*remnum)] |
|---|
| 300 | =(char*)Dupstr(key); |
|---|
| 301 | Append(&(data.macke.remarks[(*remnum)]), string); |
|---|
| 302 | (*remnum)++; |
|---|
| 303 | } |
|---|
| 304 | } |
|---|
| 305 | /* -------------------------------------------------------------------- |
|---|
| 306 | * Function genbank_get_strain(). |
|---|
| 307 | * Get strain from DEFINITION, COMMENT or SOURCE line in |
|---|
| 308 | * Genbank data file. |
|---|
| 309 | */ |
|---|
| 310 | char *genbank_get_strain() { |
|---|
| 311 | |
|---|
| 312 | int indj, indk; |
|---|
| 313 | /* int find_pattern(), Lenstr(); */ |
|---|
| 314 | /* int Skip_white_space(), Reach_white_space(); */ |
|---|
| 315 | char strain[LONGTEXT], temp[LONGTEXT], buffer[LONGTEXT]; |
|---|
| 316 | /* void get_string(), warning(), Cpystr(); */ |
|---|
| 317 | |
|---|
| 318 | strain[0]='\0'; |
|---|
| 319 | /* get strain */ |
|---|
| 320 | if(Lenstr(data.gbk.comments.others)>1) { |
|---|
| 321 | if((indj=find_pattern(data.gbk.comments.others, |
|---|
| 322 | "*source:"))>=0) { |
|---|
| 323 | if((indk=find_pattern( |
|---|
| 324 | (data.gbk.comments.others+indj), |
|---|
| 325 | "strain="))>=0) { |
|---|
| 326 | /* skip blank spaces */ |
|---|
| 327 | indj=Skip_white_space( |
|---|
| 328 | data.gbk.comments.others, |
|---|
| 329 | (indj+indk+7)); |
|---|
| 330 | /* get strain */ |
|---|
| 331 | get_string(data.gbk.comments.others, |
|---|
| 332 | temp, indj); |
|---|
| 333 | Cpystr(strain, temp); /* copy new strain */ |
|---|
| 334 | } /* get strain */ |
|---|
| 335 | } /* find source: line in comment */ |
|---|
| 336 | } /* look for strain on comments */ |
|---|
| 337 | |
|---|
| 338 | if(Lenstr(data.gbk.definition)>1) { |
|---|
| 339 | if((indj=find_pattern(data.gbk.definition, "str. "))>=0 |
|---|
| 340 | ||(indj=find_pattern(data.gbk.definition, "strain "))>=0) { |
|---|
| 341 | /* skip the key word */ |
|---|
| 342 | indj=Reach_white_space(data.gbk.definition, indj); |
|---|
| 343 | /* skip blank spaces */ |
|---|
| 344 | indj=Skip_white_space(data.gbk.definition, indj); |
|---|
| 345 | /* get strain */ |
|---|
| 346 | get_string(data.gbk.definition, temp, indj); |
|---|
| 347 | if(Lenstr(strain)>1) { |
|---|
| 348 | if(Cmpstr(temp, strain)!=EQ){ |
|---|
| 349 | sprintf(buffer, |
|---|
| 350 | "Inconsistent strain definition in DEFINITION: %s and %s", |
|---|
| 351 | temp, strain); |
|---|
| 352 | warning(91, buffer); |
|---|
| 353 | } /* check consistency of duplicated def */ |
|---|
| 354 | } else Cpystr(strain, temp); /* get strain */ |
|---|
| 355 | } /* find strain in definition */ |
|---|
| 356 | } /* if there is definition line */ |
|---|
| 357 | |
|---|
| 358 | if(Lenstr(data.gbk.source)>1) { |
|---|
| 359 | if((indj=find_pattern(data.gbk.source, "str. "))>=0 |
|---|
| 360 | ||(indj=find_pattern(data.gbk.source, "strain "))>=0){ |
|---|
| 361 | /* skip the key word */ |
|---|
| 362 | indj=Reach_white_space(data.gbk.source, indj); |
|---|
| 363 | /* skip blank spaces */ |
|---|
| 364 | indj=Skip_white_space(data.gbk.source, indj); |
|---|
| 365 | /* get strain */ |
|---|
| 366 | get_string(data.gbk.source, temp, indj); |
|---|
| 367 | if(Lenstr(strain)>1) { |
|---|
| 368 | if(Cmpstr(temp, strain)!=EQ) { |
|---|
| 369 | sprintf(buffer, |
|---|
| 370 | "Inconsistent strain definition in SOURCE: %s and %s", |
|---|
| 371 | temp, strain); |
|---|
| 372 | warning(92, buffer); |
|---|
| 373 | } |
|---|
| 374 | } else Cpystr(strain, temp); |
|---|
| 375 | /* check consistency of duplicated def */ |
|---|
| 376 | } /* find strain */ |
|---|
| 377 | } /* look for strain in SOURCE line */ |
|---|
| 378 | |
|---|
| 379 | return(Dupstr(strain)); |
|---|
| 380 | } |
|---|
| 381 | /* -------------------------------------------------------------------- |
|---|
| 382 | * Function genbank_get_subspecies(). |
|---|
| 383 | * Get subspecies information from SOURCE, DEFENITION, or |
|---|
| 384 | * COMMENT line of Genabnk data file. |
|---|
| 385 | */ |
|---|
| 386 | char |
|---|
| 387 | *genbank_get_subspecies() { |
|---|
| 388 | |
|---|
| 389 | int indj, indk; |
|---|
| 390 | /* int find_pattern(), Lenstr(); */ |
|---|
| 391 | /* int Skip_white_space(), Reach_white_space(); */ |
|---|
| 392 | char subspecies[LONGTEXT], temp[LONGTEXT], buffer[LONGTEXT]; |
|---|
| 393 | /* char *Dupstr(); */ |
|---|
| 394 | /* void get_string(), warning(), Cpystr(); */ |
|---|
| 395 | /* void correct_subspecies(); */ |
|---|
| 396 | |
|---|
| 397 | subspecies[0]='\0'; |
|---|
| 398 | /* get subspecies */ |
|---|
| 399 | if(Lenstr(data.gbk.definition)>1) { |
|---|
| 400 | if((indj=find_pattern(data.gbk.definition, "subsp. "))>=0) { |
|---|
| 401 | /* skip the key word */ |
|---|
| 402 | indj=Reach_white_space(data.gbk.definition, indj); |
|---|
| 403 | /* skip blank spaces */ |
|---|
| 404 | indj=Skip_white_space(data.gbk.definition, indj); |
|---|
| 405 | /* get subspecies */ |
|---|
| 406 | get_string(data.gbk.definition, temp, indj); |
|---|
| 407 | correct_subspecies(temp); |
|---|
| 408 | Cpystr(subspecies, temp); |
|---|
| 409 | } |
|---|
| 410 | } |
|---|
| 411 | if(Lenstr(data.gbk.comments.others)>1) { |
|---|
| 412 | if((indj=find_pattern(data.gbk.comments.others, |
|---|
| 413 | "*source:"))>=0) { |
|---|
| 414 | if((indk=find_pattern((data.gbk.comments.others+indj), |
|---|
| 415 | "sub-species="))>=0 |
|---|
| 416 | ||(indk=find_pattern((data.gbk.comments.others+indj), |
|---|
| 417 | "subspecies="))>=0 |
|---|
| 418 | ||(indk=find_pattern((data.gbk.comments.others+indj), |
|---|
| 419 | "subsp.="))>=0) { |
|---|
| 420 | /* skip the key word */ |
|---|
| 421 | for(indj+=indk; |
|---|
| 422 | data.gbk.comments.others[indj]!='='; indj++); |
|---|
| 423 | indj++; |
|---|
| 424 | /* skip blank spaces */ |
|---|
| 425 | indj=Skip_white_space(data.gbk.comments.others, indj); |
|---|
| 426 | /* get subspecies */ |
|---|
| 427 | get_string(data.gbk.comments.others, temp, |
|---|
| 428 | indj); |
|---|
| 429 | if(Lenstr(subspecies)>1){ |
|---|
| 430 | if(Cmpstr(temp, subspecies)!=EQ){ |
|---|
| 431 | sprintf(buffer, |
|---|
| 432 | "Inconsistent subspecies definition in COMMENTS *source: %s and %s", |
|---|
| 433 | temp, subspecies); |
|---|
| 434 | warning(20, buffer); |
|---|
| 435 | } |
|---|
| 436 | } else Cpystr(subspecies, temp); |
|---|
| 437 | } /* get subspecies */ |
|---|
| 438 | } /* find *source: line in comment */ |
|---|
| 439 | } /* look for subspecies on comments */ |
|---|
| 440 | |
|---|
| 441 | if(Lenstr(data.gbk.source)>1) { |
|---|
| 442 | if((indj=find_pattern(data.gbk.source, "subsp. "))>=0 |
|---|
| 443 | ||(indj=find_pattern(data.gbk.source, "subspecies "))>=0 |
|---|
| 444 | ||(indj=find_pattern(data.gbk.source, "sub-species "))>=0) { |
|---|
| 445 | /* skip the key word */ |
|---|
| 446 | indj=Reach_white_space(data.gbk.source, indj); |
|---|
| 447 | /* skip blank spaces */ |
|---|
| 448 | indj=Skip_white_space(data.gbk.source, indj); |
|---|
| 449 | /* get subspecies */ |
|---|
| 450 | get_string(data.gbk.source, temp, indj); |
|---|
| 451 | correct_subspecies(temp); |
|---|
| 452 | if(Lenstr(subspecies)>1) { |
|---|
| 453 | if(Cmpstr(temp, subspecies)!=EQ){ |
|---|
| 454 | sprintf(buffer, |
|---|
| 455 | "Inconsistent subspecies definition in SOURCE: %s and %s", |
|---|
| 456 | temp, subspecies); |
|---|
| 457 | warning(21, buffer); |
|---|
| 458 | } |
|---|
| 459 | } else Cpystr(subspecies, temp); |
|---|
| 460 | /* check consistency of duplicated def */ |
|---|
| 461 | } /* find subspecies */ |
|---|
| 462 | } /* look for subspecies in SOURCE line */ |
|---|
| 463 | |
|---|
| 464 | return(Dupstr(subspecies)); |
|---|
| 465 | } |
|---|
/* ---------------------------------------------------------------
 *   Function correct_subspecies().
 *       Remove the strain information in subspecies which is
 *       sometimes mistakenly written into it.
 *
 *       The string is cut at the first of "str\n", "str." or
 *       "strain" (tried in that order); the character in front of
 *       the match is replaced by '\n' so the entry keeps its
 *       trailing newline.  When the match is at the very start the
 *       entry becomes the empty entry "\n".
 *
 *   subspecies  string to correct, modified in place.
 */
void
correct_subspecies(subspecies)
char *subspecies;
{
    int indj;

    if ((indj = find_pattern(subspecies, "str\n")) >= 0
        || (indj = find_pattern(subspecies, "str.")) >= 0
        || (indj = find_pattern(subspecies, "strain")) >= 0) {
        if (indj == 0) {
            /* nothing precedes the strain: do not write in front of
             * the buffer; the matched text guarantees room for "\n" */
            subspecies[0] = '\n';
            subspecies[1] = '\0';
        }
        else {
            subspecies[indj - 1] = '\n';
            subspecies[indj]     = '\0';
        }
    }
}
|---|
| 484 | /* -------------------------------------------------------------------- |
|---|
| 485 | * Function genbank_get_atcc(). |
|---|
| 486 | * Get atcc from SOURCE line in Genbank data file. |
|---|
| 487 | */ |
|---|
| 488 | char |
|---|
| 489 | *genbank_get_atcc() { |
|---|
| 490 | |
|---|
| 491 | /* int Lenstr(); */ |
|---|
| 492 | char temp[LONGTEXT]; |
|---|
| 493 | char *atcc; |
|---|
| 494 | |
|---|
| 495 | |
|---|
| 496 | atcc = NULL; |
|---|
| 497 | /* get culture collection # */ |
|---|
| 498 | if(Lenstr(data.gbk.source)>1) |
|---|
| 499 | atcc = get_atcc(data.gbk.source); |
|---|
| 500 | if(Lenstr(atcc)<=1&&Lenstr(data.macke.strain)>1) { |
|---|
| 501 | /* add () to macke strain to be processed correctly */ |
|---|
| 502 | sprintf(temp, "(%s)", data.macke.strain); |
|---|
| 503 | atcc = get_atcc(temp); |
|---|
| 504 | } |
|---|
| 505 | return(atcc); |
|---|
| 506 | } |
|---|
| 507 | /* ------------------------------------------------------------------- */ |
|---|
| 508 | /* Function get_atcc(). |
|---|
| 509 | */ |
|---|
| 510 | char |
|---|
| 511 | *get_atcc(source) |
|---|
| 512 | char *source; |
|---|
| 513 | { |
|---|
| 514 | |
|---|
| 515 | static int cc_num=16; |
|---|
| 516 | static const char *CC[16] = {"ATCC", "CCM", "CDC", "CIP", "CNCTC", |
|---|
| 517 | "DSM", "EPA", "JCM", "NADC", "NCDO", "NCTC", "NRCC", |
|---|
| 518 | "NRRL", "PCC", "USDA", "VPI"}; |
|---|
| 519 | /* int indk; */ |
|---|
| 520 | int indi, indj, index; |
|---|
| 521 | int length; |
|---|
| 522 | /* int find_pattern(), Lenstr(); */ |
|---|
| 523 | /* int paren_string(), Skip_white_space(), Reach_white_space(); */ |
|---|
| 524 | char buffer[LONGTEXT], temp[LONGTEXT], pstring[LONGTEXT]; |
|---|
| 525 | char atcc[LONGTEXT]; |
|---|
| 526 | /* char *Catstr(), *Dupstr(); */ |
|---|
| 527 | /* void get_atcc_string(); */ |
|---|
| 528 | |
|---|
| 529 | atcc[0]='\0'; |
|---|
| 530 | for(indi=0; indi<cc_num; indi++) { |
|---|
| 531 | index=0; |
|---|
| 532 | while((index=paren_string(source, pstring, index))>0) { |
|---|
| 533 | if((indj=find_pattern(pstring, CC[indi]))>=0){ |
|---|
| 534 | /* skip the key word */ |
|---|
| 535 | indj += Lenstr(CC[indi]); |
|---|
| 536 | /* skip blank spaces */ |
|---|
| 537 | indj=Skip_white_space(pstring, indj); |
|---|
| 538 | /* get strain */ |
|---|
| 539 | get_atcc_string(pstring, buffer, indj); |
|---|
| 540 | sprintf(temp, "%s %s", CC[indi], buffer); |
|---|
| 541 | length=Lenstr(atcc); |
|---|
| 542 | if(length>0) { |
|---|
| 543 | atcc[length]= '\0'; |
|---|
| 544 | Catstr(atcc, ", "); |
|---|
| 545 | } |
|---|
| 546 | Catstr(atcc, temp); |
|---|
| 547 | } /* find atcc */ |
|---|
| 548 | } /* while loop */ |
|---|
| 549 | } /* for loop */ |
|---|
| 550 | /* append eoln to the atcc string */ |
|---|
| 551 | length = Lenstr(atcc); |
|---|
| 552 | if (data.macke.atcc) { |
|---|
| 553 | data.macke.atcc[length] = '\0'; |
|---|
| 554 | } |
|---|
| 555 | Catstr(atcc, "\n"); |
|---|
| 556 | return(Dupstr(atcc)); |
|---|
| 557 | } |
|---|
/* ----------------------------------------------------------------- */
/*   Function paren_string()
 *       Copy the parenthesised text of string, starting at
 *       position index, into pstring.
 *
 *       The scan always runs to the end of string: every character
 *       seen while at least one '(' is open is copied (nested '('
 *       and each closing ')' included), so several groups end up
 *       concatenated.  The last copied character is then
 *       overwritten by the terminating NUL.
 *
 *   string   text to scan.
 *   pstring  receives the copied text; the caller provides the room.
 *   index    position in string where scanning starts.
 *
 *   Returns -1 when nothing was copied, otherwise the position
 *   where scanning stopped.
 */
int
paren_string(string, pstring, index)
char *string, *pstring;
int index;
{
    int  depth  = 0;            /* currently open parentheses */
    int  copied = 0;            /* characters stored in pstring */
    int  total  = Lenstr(string);
    char ch;

    while (index < total) {
        ch = string[index];

        if (depth >= 1)
            pstring[copied++] = ch;

        if (ch == '(')
            depth++;
        else if (ch == ')')
            depth--;

        index++;
    }

    if (copied == 0)
        return (-1);

    copied--;
    pstring[copied] = '\0';
    return (index);
}
|---|
| 577 | /* ---------------------------------------------------------------- |
|---|
| 578 | * Function num_of_remark(). |
|---|
| 579 | * Count num of remarks needed in order to alloc spaces. |
|---|
| 580 | */ |
|---|
| 581 | int |
|---|
| 582 | num_of_remark() { |
|---|
| 583 | |
|---|
| 584 | int remnum, /*indi, */indj, length; |
|---|
| 585 | |
|---|
| 586 | remnum = 0; |
|---|
| 587 | /* count references to be put into remarks */ |
|---|
| 588 | if(data.gbk.numofref>0&&Lenstr(data.gbk.reference[0].ref)>1) |
|---|
| 589 | remnum++; |
|---|
| 590 | for(indj=1; indj<data.gbk.numofref; indj++) { |
|---|
| 591 | if(Lenstr(data.gbk.reference[indj].ref)>1) |
|---|
| 592 | remnum++; |
|---|
| 593 | if(Lenstr(data.gbk.reference[indj].journal)>1) |
|---|
| 594 | remnum++; |
|---|
| 595 | if(Lenstr(data.gbk.reference[indj].author)>1) |
|---|
| 596 | remnum++; |
|---|
| 597 | if(Lenstr(data.gbk.reference[indj].title)>1) |
|---|
| 598 | remnum++; |
|---|
| 599 | if(Lenstr(data.gbk.reference[indj].standard)>1) |
|---|
| 600 | remnum++; |
|---|
| 601 | } /* loop for copying other reference */ |
|---|
| 602 | /* count the other keyword in GenBank format to be put into remarks */ |
|---|
| 603 | if(Lenstr(data.gbk.keywords)>1) |
|---|
| 604 | remnum++; |
|---|
| 605 | if(Lenstr(data.gbk.accession)>1) |
|---|
| 606 | remnum++; |
|---|
| 607 | if(Lenstr(data.gbk.comments.orginf.source)>1) /* Source of strain */ |
|---|
| 608 | remnum++; |
|---|
| 609 | if(Lenstr(data.gbk.comments.orginf.formname)>1) |
|---|
| 610 | remnum++; |
|---|
| 611 | if(Lenstr(data.gbk.comments.orginf.nickname)>1) /* Alternate name */ |
|---|
| 612 | remnum++; |
|---|
| 613 | if(Lenstr(data.gbk.comments.orginf.commname)>1) |
|---|
| 614 | remnum++; |
|---|
| 615 | if(Lenstr(data.gbk.comments.orginf.hostorg)>1) /* host organism */ |
|---|
| 616 | remnum++; |
|---|
| 617 | if(Lenstr(data.gbk.comments.seqinf.RDPid)>1) |
|---|
| 618 | remnum++; |
|---|
| 619 | if(Lenstr(data.gbk.comments.seqinf.methods)>1) |
|---|
| 620 | remnum++; |
|---|
| 621 | if(data.gbk.comments.seqinf.comp3!=' ') |
|---|
| 622 | remnum++; |
|---|
| 623 | if(data.gbk.comments.seqinf.comp5!=' ') |
|---|
| 624 | remnum++; |
|---|
| 625 | /* counting other than specific keyword comments */ |
|---|
| 626 | if(Lenstr(data.gbk.comments.others)>0) { |
|---|
| 627 | length = Lenstr(data.gbk.comments.others); |
|---|
| 628 | for(indj=0; indj<length; indj++) |
|---|
| 629 | { |
|---|
| 630 | if(data.gbk.comments.others[indj]=='\n' |
|---|
| 631 | || data.gbk.comments.others[indj]=='\0') { |
|---|
| 632 | remnum++; |
|---|
| 633 | } /* new remark line */ |
|---|
| 634 | } /* for loop to find other remarks */ |
|---|
| 635 | } /* other comments */ |
|---|
| 636 | return(remnum); |
|---|
| 637 | } |
|---|
| 638 | /* ----------------------------------------------------------------- |
|---|
| 639 | * Function macke_to_genbank(). |
|---|
| 640 | * Convert from macke format to genbank format. |
|---|
| 641 | */ |
|---|
| 642 | void |
|---|
| 643 | macke_to_genbank(inf, outf) |
|---|
| 644 | char *inf, *outf; |
|---|
| 645 | { |
|---|
| 646 | FILE *IFP1, *IFP2, *IFP3, *ofp; |
|---|
| 647 | FILE_BUFFER ifp1, ifp2, ifp3; |
|---|
| 648 | char temp[TOKENNUM]; |
|---|
| 649 | |
|---|
| 650 | if((IFP1=fopen(inf, "r"))==NULL|| |
|---|
| 651 | (IFP2=fopen(inf, "r"))==NULL|| |
|---|
| 652 | (IFP3=fopen(inf, "r"))==NULL) { |
|---|
| 653 | sprintf(temp, "Cannot open input file %s\n", inf); |
|---|
| 654 | error(19, temp); |
|---|
| 655 | } |
|---|
| 656 | |
|---|
| 657 | ifp1 = create_FILE_BUFFER(inf, IFP1); |
|---|
| 658 | ifp2 = create_FILE_BUFFER(inf, IFP2); |
|---|
| 659 | ifp3 = create_FILE_BUFFER(inf, IFP3); |
|---|
| 660 | |
|---|
| 661 | if(Lenstr(outf)<=0) ofp = stdout; |
|---|
| 662 | else if((ofp=fopen(outf, "w"))==NULL) { |
|---|
| 663 | sprintf(temp, "Cannot open output file %s\n", outf); |
|---|
| 664 | error(44, temp); |
|---|
| 665 | } |
|---|
| 666 | init(); |
|---|
| 667 | init_seq_data(); |
|---|
| 668 | init_gm_data(); |
|---|
| 669 | |
|---|
| 670 | #ifdef log |
|---|
| 671 | fprintf(stderr, "Start converting...\n"); |
|---|
| 672 | #endif |
|---|
| 673 | |
|---|
| 674 | while(macke_in(ifp1, ifp2, ifp3)!=EOF) { |
|---|
| 675 | data.numofseq++; |
|---|
| 676 | if(mtog()) genbank_out(ofp); |
|---|
| 677 | else error(15, "Conversion from macke to genbank fails, Exit"); |
|---|
| 678 | init_gm_data(); |
|---|
| 679 | |
|---|
| 680 | #ifdef log |
|---|
| 681 | if((data.numofseq % 50)==0) |
|---|
| 682 | fprintf(stderr, |
|---|
| 683 | "%d sequences are converted...\n", |
|---|
| 684 | data.numofseq); |
|---|
| 685 | #endif |
|---|
| 686 | |
|---|
| 687 | } |
|---|
| 688 | |
|---|
| 689 | #ifdef log |
|---|
| 690 | fprintf(stderr, |
|---|
| 691 | "Total %d sequences have been processed\n", data.numofseq); |
|---|
| 692 | #endif |
|---|
| 693 | |
|---|
| 694 | } |
|---|
| 695 | /* ---------------------------------------------------------------- |
|---|
| 696 | * Function mtog(). |
|---|
| 697 | * Convert Macke format to Genbank format. |
|---|
| 698 | */ |
|---|
| 699 | int |
|---|
| 700 | mtog() { |
|---|
| 701 | int indi; |
|---|
| 702 | /* int len, Lenstr(), Cmpstr(); */ |
|---|
| 703 | char temp[LONGTEXT]; |
|---|
| 704 | /* char *today_date(); */ |
|---|
| 705 | /* char *Dupstr(), *Reallocspace(), *macke_copyrem(); */ |
|---|
| 706 | /* char *genbank_date(); */ |
|---|
| 707 | /* void Freespace(), Cpystr(), Append(), Append_char(); */ |
|---|
| 708 | /* void mtog_genbank_def_and_source(); */ |
|---|
| 709 | /* void mtog_decode_ref_and_remarks(); */ |
|---|
| 710 | /* void init_reference(), replace_entry(), warning(); */ |
|---|
| 711 | |
|---|
| 712 | Cpystr(temp, data.macke.seqabbr); |
|---|
| 713 | |
|---|
| 714 | for(indi=Lenstr(temp); indi<13; temp[indi++] = ' ') ; |
|---|
| 715 | |
|---|
| 716 | if(Lenstr(data.macke.date)>1) |
|---|
| 717 | |
|---|
| 718 | sprintf((temp+10), |
|---|
| 719 | "%7d bp RNA RNA %s\n", |
|---|
| 720 | data.seq_length, genbank_date(data.macke.date)); |
|---|
| 721 | |
|---|
| 722 | else sprintf((temp+10), |
|---|
| 723 | "%7d bp RNA RNA %s\n", |
|---|
| 724 | data.seq_length, genbank_date(today_date())); |
|---|
| 725 | |
|---|
| 726 | replace_entry(&(data.gbk.locus), temp); |
|---|
| 727 | |
|---|
| 728 | /* GenBank ORGANISM */ |
|---|
| 729 | if(Lenstr(data.macke.name)>1) { |
|---|
| 730 | replace_entry(&(data.gbk.organism), data.macke.name); |
|---|
| 731 | |
|---|
| 732 | /* append a '.' at the end */ |
|---|
| 733 | Append_char(&(data.gbk.organism), '.'); |
|---|
| 734 | } |
|---|
| 735 | |
|---|
| 736 | if(Lenstr(data.macke.rna)>1) { |
|---|
| 737 | data.gbk.comments.seqinf.exist = 1; |
|---|
| 738 | replace_entry(&(data.gbk.comments.seqinf.methods), |
|---|
| 739 | data.macke.rna); |
|---|
| 740 | } |
|---|
| 741 | if(Lenstr(data.macke.acs)>1) { |
|---|
| 742 | /* #### not converted to accession but to comment gbkentry only, temporarily |
|---|
| 743 | Freespace(&(data.gbk.accession)); |
|---|
| 744 | data.gbk.accession = Dupstr(data.macke.acs); |
|---|
| 745 | */ |
|---|
| 746 | data.gbk.comments.seqinf.exist = 1; |
|---|
| 747 | replace_entry(&(data.gbk.comments.seqinf.gbkentry), |
|---|
| 748 | data.macke.acs); |
|---|
| 749 | } else if(Lenstr(data.macke.nbk)>1) { |
|---|
| 750 | /* #### not converted to accession but to comment gbkentry only, temp |
|---|
| 751 | Freespace(&(data.gbk.accession)); |
|---|
| 752 | data.gbk.accession = Dupstr(data.macke.nbk); |
|---|
| 753 | */ |
|---|
| 754 | data.gbk.comments.seqinf.exist = 1; |
|---|
| 755 | replace_entry(&(data.gbk.comments.seqinf.gbkentry), |
|---|
| 756 | data.macke.nbk); |
|---|
| 757 | } |
|---|
| 758 | if(Lenstr(data.macke.atcc)>1) { |
|---|
| 759 | data.gbk.comments.orginf.exist = 1; |
|---|
| 760 | replace_entry(&(data.gbk.comments.orginf.cc), |
|---|
| 761 | data.macke.atcc); |
|---|
| 762 | } |
|---|
| 763 | mtog_decode_ref_and_remarks(); |
|---|
| 764 | /* final conversion of cc */ |
|---|
| 765 | if(Lenstr(data.gbk.comments.orginf.cc)<=1&&Lenstr(data.macke.atcc)>1){ |
|---|
| 766 | replace_entry(&(data.gbk.comments.orginf.cc), |
|---|
| 767 | data.macke.atcc); |
|---|
| 768 | } |
|---|
| 769 | |
|---|
| 770 | /* define GenBank DEFINITION, after GenBank KEYWORD is defined. */ |
|---|
| 771 | mtog_genbank_def_and_source(); |
|---|
| 772 | |
|---|
| 773 | return(1); |
|---|
| 774 | } |
|---|
| 775 | /* --------------------------------------------------------------- |
|---|
| 776 | * Function mtog_decode_remarks(). |
|---|
| 777 | * Decode remarks of Macke to GenBank format. |
|---|
| 778 | */ |
|---|
| 779 | void |
|---|
| 780 | mtog_decode_ref_and_remarks() { |
|---|
| 781 | |
|---|
| 782 | int indi, indj; |
|---|
| 783 | int acount, tcount, jcount, rcount, scount; |
|---|
| 784 | char key[TOKENNUM], temp[LONGTEXT]; |
|---|
| 785 | /* char *macke_copyrem(), *Reallocspace(), *Dupstr(); */ |
|---|
| 786 | /* void Append_char(), Append(), Cpystr(); */ |
|---|
| 787 | /* void mtog_copy_remark(), init_reference(); */ |
|---|
| 788 | |
|---|
| 789 | data.gbk.numofref=acount=tcount=jcount=rcount=scount=0; |
|---|
| 790 | |
|---|
| 791 | if(Lenstr(data.macke.author)>1) { |
|---|
| 792 | if((acount+1)>data.gbk.numofref) { |
|---|
| 793 | /* new reference */ |
|---|
| 794 | data.gbk.reference = (Reference*)Reallocspace(data.gbk.reference, sizeof(Reference)*(acount+1)); |
|---|
| 795 | data.gbk.numofref = acount+1; |
|---|
| 796 | init_reference(&(data.gbk.reference[acount]), AUTHOR); |
|---|
| 797 | } else acount = data.gbk.numofref - 1; |
|---|
| 798 | data.gbk.reference[acount++].author |
|---|
| 799 | = (char*)Dupstr(data.macke.author); |
|---|
| 800 | } |
|---|
| 801 | if(Lenstr(data.macke.journal)>1) { |
|---|
| 802 | if((jcount+1)>data.gbk.numofref) { |
|---|
| 803 | data.gbk.reference = |
|---|
| 804 | (Reference*)Reallocspace(data.gbk.reference, |
|---|
| 805 | sizeof(Reference)*(jcount+1)); |
|---|
| 806 | data.gbk.numofref = jcount+1; |
|---|
| 807 | init_reference(&(data.gbk.reference[jcount]), JOURNAL); |
|---|
| 808 | } else jcount = data.gbk.numofref - 1; |
|---|
| 809 | data.gbk.reference[jcount++].journal |
|---|
| 810 | = (char*)Dupstr(data.macke.journal); |
|---|
| 811 | } |
|---|
| 812 | if(Lenstr(data.macke.title)>1) { |
|---|
| 813 | if((tcount+1)>data.gbk.numofref) { |
|---|
| 814 | data.gbk.reference = |
|---|
| 815 | (Reference*)Reallocspace(data.gbk.reference, |
|---|
| 816 | sizeof(Reference)*(tcount+1)); |
|---|
| 817 | data.gbk.numofref = tcount+1; |
|---|
| 818 | init_reference(&(data.gbk.reference[tcount]), TITLE); |
|---|
| 819 | } else tcount = data.gbk.numofref - 1; |
|---|
| 820 | data.gbk.reference[tcount++].title |
|---|
| 821 | = (char*)Dupstr(data.macke.title); |
|---|
| 822 | } |
|---|
| 823 | for(indi=0; indi<data.macke.numofrem; indi++) { |
|---|
| 824 | indj = macke_key_word(data.macke.remarks[indi], |
|---|
| 825 | 0, key, TOKENNUM); |
|---|
| 826 | if(Cmpstr(key, "KEYWORDS")==EQ) { |
|---|
| 827 | mtog_copy_remark(&(data.gbk.keywords), |
|---|
| 828 | &indi, indj); |
|---|
| 829 | |
|---|
| 830 | /* append a '.' at the end */ |
|---|
| 831 | Append_char(&(data.gbk.keywords), '.'); |
|---|
| 832 | |
|---|
| 833 | } else if(Cmpstr(key, "GenBank ACCESSION")==EQ) { |
|---|
| 834 | mtog_copy_remark(&(data.gbk.accession), |
|---|
| 835 | &indi, indj); |
|---|
| 836 | |
|---|
| 837 | } else if(Cmpstr(key, "ref")==EQ) { |
|---|
| 838 | if((rcount+1)>data.gbk.numofref) { |
|---|
| 839 | /* new reference */ |
|---|
| 840 | data.gbk.reference = (Reference*) |
|---|
| 841 | Reallocspace(data.gbk.reference, |
|---|
| 842 | sizeof(Reference)*(rcount+1)); |
|---|
| 843 | data.gbk.numofref = rcount+1; |
|---|
| 844 | init_reference(&(data.gbk.reference[rcount]), |
|---|
| 845 | REF); |
|---|
| 846 | } else rcount = data.gbk.numofref - 1; |
|---|
| 847 | data.gbk.reference[rcount++].ref |
|---|
| 848 | = macke_copyrem(data.macke.remarks, &indi, |
|---|
| 849 | data.macke.numofrem, indj); |
|---|
| 850 | } else if(Cmpstr(key, "auth")==EQ) { |
|---|
| 851 | if((acount+1)>data.gbk.numofref) { |
|---|
| 852 | /* new reference */ |
|---|
| 853 | data.gbk.reference = (Reference*) |
|---|
| 854 | Reallocspace(data.gbk.reference, |
|---|
| 855 | sizeof(Reference)*(acount+1)); |
|---|
| 856 | data.gbk.numofref = acount+1; |
|---|
| 857 | init_reference(&(data.gbk.reference[acount]), |
|---|
| 858 | AUTHOR); |
|---|
| 859 | } else acount = data.gbk.numofref - 1; |
|---|
| 860 | data.gbk.reference[acount++].author |
|---|
| 861 | = macke_copyrem(data.macke.remarks, &indi, |
|---|
| 862 | data.macke.numofrem, indj); |
|---|
| 863 | } else if(Cmpstr(key, "title")==EQ) { |
|---|
| 864 | if((tcount+1)>data.gbk.numofref) { |
|---|
| 865 | data.gbk.reference = (Reference*) |
|---|
| 866 | Reallocspace(data.gbk.reference, |
|---|
| 867 | sizeof(Reference)*(tcount+1)); |
|---|
| 868 | data.gbk.numofref = tcount+1; |
|---|
| 869 | init_reference(&(data.gbk.reference[tcount]), |
|---|
| 870 | TITLE); |
|---|
| 871 | } else tcount = data.gbk.numofref - 1; |
|---|
| 872 | data.gbk.reference[tcount++].title |
|---|
| 873 | = macke_copyrem(data.macke.remarks, &indi, |
|---|
| 874 | data.macke.numofrem, indj); |
|---|
| 875 | } else if(Cmpstr(key, "jour")==EQ) { |
|---|
| 876 | if((jcount+1)>data.gbk.numofref) { |
|---|
| 877 | data.gbk.reference = (Reference*) |
|---|
| 878 | Reallocspace(data.gbk.reference, |
|---|
| 879 | sizeof(Reference)*(jcount+1)); |
|---|
| 880 | data.gbk.numofref = jcount+1; |
|---|
| 881 | init_reference(&(data.gbk.reference[jcount]), |
|---|
| 882 | JOURNAL); |
|---|
| 883 | } else jcount = data.gbk.numofref - 1; |
|---|
| 884 | data.gbk.reference[jcount++].journal |
|---|
| 885 | = macke_copyrem(data.macke.remarks, &indi, |
|---|
| 886 | data.macke.numofrem, indj); |
|---|
| 887 | } else if(Cmpstr(key, "standard")==EQ) { |
|---|
| 888 | if((scount+1)>data.gbk.numofref) { |
|---|
| 889 | data.gbk.reference = (Reference*) |
|---|
| 890 | Reallocspace(data.gbk.reference, |
|---|
| 891 | sizeof(Reference)*(scount+1)); |
|---|
| 892 | data.gbk.numofref = scount+1; |
|---|
| 893 | init_reference(&(data.gbk.reference[scount]), |
|---|
| 894 | STANDARD); |
|---|
| 895 | } else scount = data.gbk.numofref - 1; |
|---|
| 896 | data.gbk.reference[scount++].standard |
|---|
| 897 | = macke_copyrem(data.macke.remarks, &indi, |
|---|
| 898 | data.macke.numofrem, indj); |
|---|
| 899 | |
|---|
| 900 | } else if(Cmpstr(key, "Source of strain")==EQ) { |
|---|
| 901 | |
|---|
| 902 | data.gbk.comments.orginf.exist = 1; |
|---|
| 903 | mtog_copy_remark( |
|---|
| 904 | &(data.gbk.comments.orginf.source), |
|---|
| 905 | &indi, indj); |
|---|
| 906 | |
|---|
| 907 | } else if(Cmpstr(key, "Former name")==EQ) { |
|---|
| 908 | |
|---|
| 909 | data.gbk.comments.orginf.exist = 1; |
|---|
| 910 | mtog_copy_remark( |
|---|
| 911 | &(data.gbk.comments.orginf.formname), |
|---|
| 912 | &indi, indj); |
|---|
| 913 | |
|---|
| 914 | } else if(Cmpstr(key, "Alternate name")==EQ) { |
|---|
| 915 | |
|---|
| 916 | data.gbk.comments.orginf.exist = 1; |
|---|
| 917 | mtog_copy_remark( |
|---|
| 918 | &(data.gbk.comments.orginf.nickname), |
|---|
| 919 | &indi, indj); |
|---|
| 920 | |
|---|
| 921 | } else if(Cmpstr(key, "Common name")==EQ) { |
|---|
| 922 | |
|---|
| 923 | data.gbk.comments.orginf.exist = 1; |
|---|
| 924 | mtog_copy_remark( |
|---|
| 925 | &(data.gbk.comments.orginf.commname), |
|---|
| 926 | &indi, indj); |
|---|
| 927 | |
|---|
| 928 | } else if(Cmpstr(key, "Host organism")==EQ) { |
|---|
| 929 | |
|---|
| 930 | data.gbk.comments.orginf.exist = 1; |
|---|
| 931 | mtog_copy_remark( |
|---|
| 932 | &(data.gbk.comments.orginf.hostorg), |
|---|
| 933 | &indi, indj); |
|---|
| 934 | |
|---|
| 935 | } else if(Cmpstr(key, "RDP ID")==EQ) { |
|---|
| 936 | |
|---|
| 937 | data.gbk.comments.seqinf.exist = 1; |
|---|
| 938 | mtog_copy_remark( |
|---|
| 939 | &(data.gbk.comments.seqinf.RDPid), |
|---|
| 940 | &indi, indj); |
|---|
| 941 | |
|---|
| 942 | } else if(Cmpstr(key, "Sequencing methods")==EQ) { |
|---|
| 943 | |
|---|
| 944 | data.gbk.comments.seqinf.exist = 1; |
|---|
| 945 | mtog_copy_remark( |
|---|
| 946 | &(data.gbk.comments.seqinf.methods), |
|---|
| 947 | &indi, indj); |
|---|
| 948 | |
|---|
| 949 | } else if(Cmpstr(key, "3' end complete")==EQ) { |
|---|
| 950 | |
|---|
| 951 | data.gbk.comments.seqinf.exist = 1; |
|---|
| 952 | sscanf(data.macke.remarks[indi]+indj, "%s", key); |
|---|
| 953 | if(Cmpstr(key, "Yes")==EQ) |
|---|
| 954 | data.gbk.comments.seqinf.comp3 = 'y'; |
|---|
| 955 | else data.gbk.comments.seqinf.comp3 = 'n'; |
|---|
| 956 | |
|---|
| 957 | } else if(Cmpstr(key, "5' end complete")==EQ) { |
|---|
| 958 | |
|---|
| 959 | data.gbk.comments.seqinf.exist = 1; |
|---|
| 960 | sscanf(data.macke.remarks[indi]+indj, "%s", key); |
|---|
| 961 | |
|---|
| 962 | if(Cmpstr(key, "Yes")==EQ) |
|---|
| 963 | data.gbk.comments.seqinf.comp5 |
|---|
| 964 | = 'y'; |
|---|
| 965 | else data.gbk.comments.seqinf.comp5 |
|---|
| 966 | = 'n'; |
|---|
| 967 | |
|---|
| 968 | } else { /* other comments */ |
|---|
| 969 | Cpystr(temp, data.macke.remarks[indi]); |
|---|
| 970 | if(data.gbk.comments.others==NULL) |
|---|
| 971 | data.gbk.comments.others |
|---|
| 972 | = (char*)Dupstr(temp); |
|---|
| 973 | else Append(&(data.gbk.comments.others), temp); |
|---|
| 974 | } |
|---|
| 975 | } /* for each rem */ |
|---|
| 976 | } |
|---|
| 977 | /* --------------------------------------------------------------- |
|---|
| 978 | * Function mtog_copy_remark(). |
|---|
| 979 | * Convert one remark back to GenBank format. |
|---|
| 980 | */ |
|---|
| 981 | void |
|---|
| 982 | mtog_copy_remark(string, indi, indj) |
|---|
| 983 | char **string; |
|---|
| 984 | int *indi, indj; |
|---|
| 985 | { |
|---|
| 986 | /* void Freespace(); */ |
|---|
| 987 | /* char *macke_copyrem(); */ |
|---|
| 988 | |
|---|
| 989 | Freespace(string); |
|---|
| 990 | (*string) = (char*)macke_copyrem(data.macke.remarks, indi, |
|---|
| 991 | data.macke.numofrem, indj); |
|---|
| 992 | } |
|---|
/* ------------------------------------------------------------------
 *   Function macke_copyrem().
 *       Decode one remark: duplicate the text of line *index
 *       starting at offset `pointer', then append every directly
 *       following line that macke_rem_continue_line() accepts.
 *
 *       strings  array of remark lines.
 *       index    in: first line of the remark;
 *                out: last line that was consumed.
 *       maxline  number of entries in strings.
 *       pointer  offset of the text inside the first line.
 *
 *       Returns the string built by Dupstr()/Append_rp_eoln().
 */
char *macke_copyrem(strings, index, maxline, pointer)
char **strings;
int   *index, maxline, pointer;
{
    char *result;
    int   last;     /* last line merged into result so far */

    last = (*index);
    result = (char*)Dupstr(strings[last]+pointer);

    while((last+1)<maxline
          && macke_rem_continue_line(strings, last+1)) {
        last++;
        /* the first 3 characters of a continuation line are skipped */
        Append_rp_eoln(&result, strings[last]+3);
    }

    (*index) = last;
    return(result);
}
| 1014 | /* ------------------------------------------------------------------ |
|---|
| 1015 | * Function mtog_genbank_def_and_source(). |
|---|
| 1016 | * Define GenBank DEFINITION and SOURCE lines the way RDP |
|---|
| 1017 | * group likes. |
|---|
| 1018 | */ |
|---|
| 1019 | void |
|---|
| 1020 | mtog_genbank_def_and_source() { |
|---|
| 1021 | |
|---|
| 1022 | /* int Lenstr(); */ |
|---|
| 1023 | /* void Freespace(), Append(), Append_rm_eoln(), Append_char(); */ |
|---|
| 1024 | /* void replace_entry(), warning(); */ |
|---|
| 1025 | |
|---|
| 1026 | if(Lenstr(data.macke.name)>1) |
|---|
| 1027 | replace_entry(&(data.gbk.definition), |
|---|
| 1028 | data.macke.name); |
|---|
| 1029 | |
|---|
| 1030 | if(Lenstr(data.macke.subspecies)>1) { |
|---|
| 1031 | |
|---|
| 1032 | if(Lenstr(data.gbk.definition)<=1) { |
|---|
| 1033 | warning(22, "Genus and Species not defined"); |
|---|
| 1034 | Append_rm_eoln(&(data.gbk.definition), "subsp. "); |
|---|
| 1035 | } else Append_rm_eoln(&(data.gbk.definition), " subsp. "); |
|---|
| 1036 | |
|---|
| 1037 | Append(&(data.gbk.definition), data.macke.subspecies); |
|---|
| 1038 | } |
|---|
| 1039 | |
|---|
| 1040 | if(Lenstr(data.macke.strain)>1) { |
|---|
| 1041 | |
|---|
| 1042 | if(Lenstr(data.gbk.definition)<=1) { |
|---|
| 1043 | warning(23, |
|---|
| 1044 | "Genus and Species and Subspecies not defined"); |
|---|
| 1045 | Append_rm_eoln(&(data.gbk.definition), "str. "); |
|---|
| 1046 | } else Append_rm_eoln(&(data.gbk.definition), " str. "); |
|---|
| 1047 | |
|---|
| 1048 | Append(&(data.gbk.definition), data.macke.strain); |
|---|
| 1049 | } |
|---|
| 1050 | |
|---|
| 1051 | /* create SOURCE line, temp. */ |
|---|
| 1052 | if(Lenstr(data.gbk.definition)>1) { |
|---|
| 1053 | replace_entry(&(data.gbk.source), data.gbk.definition); |
|---|
| 1054 | Append_char(&(data.gbk.source), '.'); |
|---|
| 1055 | } |
|---|
| 1056 | |
|---|
| 1057 | /* append keyword to definition, if there is keyword. */ |
|---|
| 1058 | if(Lenstr(data.gbk.keywords)>1) { |
|---|
| 1059 | |
|---|
| 1060 | if(Lenstr(data.gbk.definition)>1) |
|---|
| 1061 | Append_rm_eoln(&(data.gbk.definition), |
|---|
| 1062 | "; \n"); |
|---|
| 1063 | |
|---|
| 1064 | /* Here keywords must be ended by a '.' already */ |
|---|
| 1065 | Append_rm_eoln(&(data.gbk.definition), |
|---|
| 1066 | data.gbk.keywords); |
|---|
| 1067 | |
|---|
| 1068 | } else Append_rm_eoln((&data.gbk.definition), ".\n"); |
|---|
| 1069 | |
|---|
| 1070 | } |
|---|
/* -------------------------------------------------------------------
 *   Function get_string().
 *       Copy line[index...] into temp until a terminator is met:
 *       a newline, a ';' outside parentheses, a ')' without a
 *       matching '(' in the copied part, or the end of line.
 *       The terminator itself is not kept.  If more than one
 *       character was copied and not_ending_mark() is false for
 *       the last one, that character is dropped as well.
 *       The result always ends with "\n".
 *
 *       temp is supplied by the caller and is not bounds checked.
 */
void
get_string(line, temp, index)
char *line, *temp;
int   index;
{
    int  out, depth, total;
    char ch;

    total = Lenstr(line);
    depth = 0;      /* number of currently open '(' */
    out = 0;

    while(index<total) {
        ch = line[index];
        temp[out] = ch;

        if(ch=='(') {
            depth++;
        } else if(ch==')') {
            if(depth==0) break;
            depth--;
        } else if(ch=='\n'||(depth==0&&ch==';')) {
            break;
        }

        index++;
        out++;
    }

    if(out>1 && !(not_ending_mark(temp[out-1]))) out--;

    temp[out++]='\n';
    temp[out]='\0';
}
/* -------------------------------------------------------------------
 *   Function get_atcc_string().
 *       Copy line[index...] into temp until a terminator is met:
 *       one of ';', '.', ',', '/' or newline outside parentheses,
 *       a ')' without a matching '(' in the copied part, or the
 *       end of line.  The terminator itself is not kept and,
 *       unlike get_string(), no "\n" is appended.
 *
 *       temp is supplied by the caller and is not bounds checked.
 */
void
get_atcc_string(line, temp, index)
char *line, *temp;
int   index;
{
    int  out, depth, total;
    char ch;

    total = Lenstr(line);
    depth = 0;      /* number of currently open '(' */
    out = 0;

    while(index<total) {
        ch = line[index];
        temp[out] = ch;

        if(ch=='(') {
            depth++;
        } else if(ch==')') {
            if(depth==0) break;
            depth--;
        } else if(depth==0&&(ch==';'||ch=='.'
                  ||ch==','||ch=='/'||ch=='\n')) {
            break;
        }

        index++;
        out++;
    }

    temp[out]='\0';
}