| 1 | // genbank and Macke converting program |
|---|
| 2 | |
|---|
| 3 | #include "genbank.h" |
|---|
| 4 | #include "macke.h" |
|---|
| 5 | |
|---|
| 6 | static int paren_string(char *line, char *pstring, int index) { |
|---|
| 7 | int len = str0len(line); |
|---|
| 8 | int paren_num = 0; |
|---|
| 9 | int indk; |
|---|
| 10 | |
|---|
| 11 | for (indk = 0; index < len; index++) { |
|---|
| 12 | if (paren_num >= 1) |
|---|
| 13 | pstring[indk++] = line[index]; |
|---|
| 14 | if (line[index] == '(') |
|---|
| 15 | paren_num++; |
|---|
| 16 | if (line[index] == ')') |
|---|
| 17 | paren_num--; |
|---|
| 18 | } |
|---|
| 19 | if (indk == 0) |
|---|
| 20 | return -1; |
|---|
| 21 | pstring[--indk] = '\0'; |
|---|
| 22 | return index; |
|---|
| 23 | } |
|---|
| 24 | |
|---|
| 25 | static void get_atcc_string(const char *line, char *temp, int index) { |
|---|
| 26 | // Get the rest of the line until reaching certain terminators, such as ';', ',', '.',... |
|---|
| 27 | |
|---|
| 28 | int len = str0len(line); |
|---|
| 29 | int paren_num = 0; |
|---|
| 30 | int indk; |
|---|
| 31 | |
|---|
| 32 | for (indk = 0; index < len; index++, indk++) { |
|---|
| 33 | temp[indk] = line[index]; |
|---|
| 34 | if (temp[indk] == '(') |
|---|
| 35 | paren_num++; |
|---|
| 36 | if (temp[indk] == ')') |
|---|
| 37 | if (paren_num == 0) |
|---|
| 38 | break; |
|---|
| 39 | else |
|---|
| 40 | paren_num--; |
|---|
| 41 | else if (paren_num == 0 && (temp[indk] == ';' || temp[indk] == '.' || temp[indk] == ',' || temp[indk] == '/' || temp[indk] == '\n')) |
|---|
| 42 | break; |
|---|
| 43 | } |
|---|
| 44 | temp[indk] = '\0'; |
|---|
| 45 | } |
|---|
| 46 | |
|---|
| 47 | static char *get_atcc(const Macke& macke, char *source) { |
|---|
| 48 | static int cc_num = 16; |
|---|
| 49 | static const char *CC[16] = { |
|---|
| 50 | "ATCC", "CCM", "CDC", "CIP", "CNCTC", |
|---|
| 51 | "DSM", "EPA", "JCM", "NADC", "NCDO", "NCTC", "NRCC", |
|---|
| 52 | "NRRL", "PCC", "USDA", "VPI" |
|---|
| 53 | }; |
|---|
| 54 | |
|---|
| 55 | int indi, indj, index; |
|---|
| 56 | int length; |
|---|
| 57 | char buffer[LONGTEXT], temp[LONGTEXT+5], pstring[LONGTEXT]; |
|---|
| 58 | char atcc[LONGTEXT]; |
|---|
| 59 | |
|---|
| 60 | atcc[0] = '\0'; |
|---|
| 61 | for (indi = 0; indi < cc_num; indi++) { |
|---|
| 62 | index = 0; |
|---|
| 63 | while ((index = paren_string(source, pstring, index)) > 0) { |
|---|
| 64 | if ((indj = find_pattern(pstring, CC[indi])) >= 0) { |
|---|
| 65 | // skip the key word |
|---|
| 66 | indj += str0len(CC[indi]); |
|---|
| 67 | // skip blank spaces |
|---|
| 68 | indj = Skip_white_space(pstring, indj); |
|---|
| 69 | // get strain |
|---|
| 70 | get_atcc_string(pstring, buffer, indj); |
|---|
| 71 | sprintf(temp, "%s %s", CC[indi], buffer); |
|---|
| 72 | length = str0len(atcc); |
|---|
| 73 | if (length > 0) { |
|---|
| 74 | atcc[length] = '\0'; |
|---|
| 75 | strcat(atcc, ", "); |
|---|
| 76 | } |
|---|
| 77 | strcat(atcc, temp); |
|---|
| 78 | } |
|---|
| 79 | } |
|---|
| 80 | } |
|---|
| 81 | // append eoln to the atcc string |
|---|
| 82 | length = str0len(atcc); |
|---|
| 83 | if (macke.atcc) { |
|---|
| 84 | macke.atcc[length] = '\0'; |
|---|
| 85 | } |
|---|
| 86 | strcat(atcc, "\n"); |
|---|
| 87 | return nulldup(atcc); |
|---|
| 88 | } |
|---|
| 89 | |
|---|
| 90 | static char *genbank_get_atcc(const GenBank& gbk, const Macke& macke) { |
|---|
| 91 | // Get atcc from SOURCE line in Genbank data file. |
|---|
| 92 | char *atcc = NULp; |
|---|
| 93 | |
|---|
| 94 | // get culture collection # |
|---|
| 95 | if (has_content(gbk.source)) { |
|---|
| 96 | atcc = get_atcc(macke, gbk.source); |
|---|
| 97 | } |
|---|
| 98 | if (!has_content(atcc) && has_content(macke.strain)) { |
|---|
| 99 | // add () to macke strain to be processed correctly |
|---|
| 100 | char temp[LONGTEXT]; |
|---|
| 101 | sprintf(temp, "(%s)", macke.strain); |
|---|
| 102 | atcc = get_atcc(macke, temp); |
|---|
| 103 | } |
|---|
| 104 | return atcc; |
|---|
| 105 | } |
|---|
| 106 | |
|---|
| 107 | void Macke::add_35end_remark(char end35, char yn) { |
|---|
| 108 | if (yn == ' ') return; |
|---|
| 109 | |
|---|
| 110 | char *content = strf("%c' end complete: %s\n", end35, yn == 'y' ? "Yes" : "No"); |
|---|
| 111 | add_remark(content); |
|---|
| 112 | free(content); |
|---|
| 113 | } |
|---|
| 114 | |
|---|
| 115 | void Macke::add_remarks_from(const GenbankRef& ref) { |
|---|
| 116 | add_remark_if_content("ref:", ref.ref); |
|---|
| 117 | add_remark_if_content("auth:", ref.author); |
|---|
| 118 | add_remark_if_content("jour:", ref.journal); |
|---|
| 119 | add_remark_if_content("title:", ref.title); |
|---|
| 120 | add_remark_if_content("standard:", ref.standard); |
|---|
| 121 | } |
|---|
| 122 | |
|---|
| 123 | void Macke::add_remarks_from(const OrgInfo& orginf) { |
|---|
| 124 | add_remark_if_content("Source of strain:", orginf.source); // copy source of strain |
|---|
| 125 | add_remark_if_content("Former name:", orginf.formname); // copy former name |
|---|
| 126 | add_remark_if_content("Alternate name:", orginf.nickname); // copy alternate name |
|---|
| 127 | add_remark_if_content("Common name:", orginf.commname); // copy common name |
|---|
| 128 | add_remark_if_content("Host organism:", orginf.hostorg); // copy host organism |
|---|
| 129 | } |
|---|
| 130 | |
|---|
| 131 | void Macke::add_remarks_from(const RDP_comments& comments) { |
|---|
| 132 | add_remarks_from(comments.orginf); |
|---|
| 133 | add_remarks_from(comments.seqinf); |
|---|
| 134 | |
|---|
| 135 | // other comments, not RDP DataBase specially defined |
|---|
| 136 | int len = str0len(comments.others); |
|---|
| 137 | if (len > 0) { |
|---|
| 138 | for (int indi = 0, indj = 0; indi < len; indi++) { |
|---|
| 139 | char temp[LONGTEXT]; |
|---|
| 140 | temp[indj++] = comments.others[indi]; |
|---|
| 141 | if (comments.others[indi] == '\n' || comments.others[indi] == '\0') { |
|---|
| 142 | temp[indj] = '\0'; |
|---|
| 143 | add_remark(temp); |
|---|
| 144 | indj = 0; |
|---|
| 145 | } |
|---|
| 146 | } |
|---|
| 147 | } |
|---|
| 148 | } |
|---|
| 149 | |
|---|
| 150 | void Macke::add_remarks_from(const SeqInfo& seqinf) { |
|---|
| 151 | add_remark_if_content("RDP ID:", seqinf.RDPid); // copy RDP ID |
|---|
| 152 | add_remark_if_content("Sequencing methods:", seqinf.methods); // copy methods |
|---|
| 153 | |
|---|
| 154 | add_35end_remark('3', seqinf.comp3); |
|---|
| 155 | add_35end_remark('5', seqinf.comp5); |
|---|
| 156 | } |
|---|
| 157 | |
|---|
| 158 | void Macke::add_remarks_from(const GenBank& gbk) { |
|---|
| 159 | // Create Macke remarks. |
|---|
| 160 | |
|---|
| 161 | // REFERENCE the first reference |
|---|
| 162 | if (gbk.has_refs()) |
|---|
| 163 | add_remark_if_content("ref:", gbk.get_ref(0).ref); |
|---|
| 164 | |
|---|
| 165 | // The rest of the REFERENCES |
|---|
| 166 | for (int indi = 1; indi < gbk.get_refcount(); indi++) { |
|---|
| 167 | add_remarks_from(gbk.get_ref(indi)); |
|---|
| 168 | } |
|---|
| 169 | |
|---|
| 170 | add_remark_if_content("KEYWORDS:", gbk.keywords); // copy keywords as remark |
|---|
| 171 | add_remark_if_content("GenBank ACCESSION:", gbk.accession); // copy accession as remark when genbank entry also exists. |
|---|
| 172 | add_remarks_from(gbk.comments); |
|---|
| 173 | } |
|---|
| 174 | |
|---|
| 175 | static void correct_subspecies(char *subspecies) { |
|---|
| 176 | // Remove the strain information in subspecies which is sometime mistakenly written into it. |
|---|
| 177 | int indj; |
|---|
| 178 | |
|---|
| 179 | if ((indj = find_pattern(subspecies, "str\n")) >= 0 || (indj = find_strain(subspecies, 0)) >= 0) { |
|---|
| 180 | ca_assert(subspecies[indj-1] == ' '); // assume to overwrite a space |
|---|
| 181 | subspecies[indj - 1] = '\n'; |
|---|
| 182 | subspecies[indj] = '\0'; |
|---|
| 183 | } |
|---|
| 184 | } |
|---|
| 185 | |
|---|
| 186 | static void check_consistency(const char *what, char* const& var, const char *New) { |
|---|
| 187 | if (has_content(var)) { |
|---|
| 188 | if (!str_equal(var, New)) { |
|---|
| 189 | warningf(20, "Inconsistent %s definitions detected:\n" |
|---|
| 190 | " %s" |
|---|
| 191 | "and %s", what, var, New); |
|---|
| 192 | } |
|---|
| 193 | } |
|---|
| 194 | else { |
|---|
| 195 | strcpy(var, New); |
|---|
| 196 | } |
|---|
| 197 | } |
|---|
| 198 | |
|---|
| 199 | static void get_string(char *temp, const char *line, int index) { |
|---|
| 200 | // Get the rest of the line until reaching certain terminators, |
|---|
| 201 | // such as ';', ',', '.',... |
|---|
| 202 | // Always append "\n" at the end of the result. |
|---|
| 203 | |
|---|
| 204 | index = Skip_white_space(line, index); |
|---|
| 205 | |
|---|
| 206 | int len = str0len(line); |
|---|
| 207 | int paren_num = 0; |
|---|
| 208 | int indk; |
|---|
| 209 | |
|---|
| 210 | for (indk = 0; index < len; index++, indk++) { |
|---|
| 211 | temp[indk] = line[index]; |
|---|
| 212 | if (temp[indk] == '(') |
|---|
| 213 | paren_num++; |
|---|
| 214 | if (temp[indk] == ')') |
|---|
| 215 | if (paren_num == 0) |
|---|
| 216 | break; |
|---|
| 217 | else |
|---|
| 218 | paren_num--; |
|---|
| 219 | else if (temp[indk] == '\n' || (paren_num == 0 && temp[indk] == ';')) |
|---|
| 220 | break; |
|---|
| 221 | } |
|---|
| 222 | if (indk > 1 && is_end_mark(temp[indk - 1])) |
|---|
| 223 | indk--; |
|---|
| 224 | temp[indk++] = '\n'; |
|---|
| 225 | temp[indk] = '\0'; |
|---|
| 226 | } |
|---|
| 227 | |
|---|
| 228 | static void copy_subspecies_and_check_consistency(char* const& subspecies, const char *from, int indj) { |
|---|
| 229 | char temp[LONGTEXT]; |
|---|
| 230 | get_string(temp, from, indj); |
|---|
| 231 | correct_subspecies(temp); |
|---|
| 232 | check_consistency("subspecies", subspecies, temp); |
|---|
| 233 | } |
|---|
| 234 | static void copy_strain_and_check_consistency(char* const& strain, const char *from, int indj) { |
|---|
| 235 | char temp[LONGTEXT]; |
|---|
| 236 | get_string(temp, from, indj); |
|---|
| 237 | check_consistency("strain", strain, temp); |
|---|
| 238 | } |
|---|
| 239 | |
|---|
| 240 | static void check_strain_from(char* const& strain, const char *from) { |
|---|
| 241 | if (has_content(from)) { |
|---|
| 242 | int indj = skip_strain(from, ' '); |
|---|
| 243 | if (indj >= 0) copy_strain_and_check_consistency(strain, from, indj); |
|---|
| 244 | } |
|---|
| 245 | } |
|---|
| 246 | |
|---|
| 247 | static char *genbank_get_strain(const GenBank& gbk) { |
|---|
| 248 | // Get strain from DEFINITION, COMMENT or SOURCE line in Genbank data file. |
|---|
| 249 | char strain[LONGTEXT]; |
|---|
| 250 | |
|---|
| 251 | strain[0] = '\0'; |
|---|
| 252 | |
|---|
| 253 | if (has_content(gbk.comments.others)) { |
|---|
| 254 | int indj = find_pattern(gbk.comments.others, "*source:"); |
|---|
| 255 | if (indj >= 0) { |
|---|
| 256 | int indk = skip_pattern(gbk.comments.others + indj, "strain="); |
|---|
| 257 | if (indk >= 0) copy_strain_and_check_consistency(strain, gbk.comments.others, indj+indk); |
|---|
| 258 | } |
|---|
| 259 | } |
|---|
| 260 | |
|---|
| 261 | check_strain_from(strain, gbk.definition); |
|---|
| 262 | check_strain_from(strain, gbk.source); |
|---|
| 263 | |
|---|
| 264 | return nulldup(strain); |
|---|
| 265 | } |
|---|
| 266 | |
|---|
| 267 | static char *genbank_get_subspecies(const GenBank& gbk) { |
|---|
| 268 | // Get subspecies information from SOURCE, DEFINITION, or COMMENT line of Genbank data file. |
|---|
| 269 | int indj; |
|---|
| 270 | char subspecies[LONGTEXT]; |
|---|
| 271 | |
|---|
| 272 | subspecies[0] = '\0'; |
|---|
| 273 | |
|---|
| 274 | if (has_content(gbk.definition)) { |
|---|
| 275 | if ((indj = skip_pattern(gbk.definition, "subsp. ")) >= 0) { |
|---|
| 276 | copy_subspecies_and_check_consistency(subspecies, gbk.definition, indj); |
|---|
| 277 | } |
|---|
| 278 | } |
|---|
| 279 | if (has_content(gbk.comments.others)) { |
|---|
| 280 | if ((indj = find_pattern(gbk.comments.others, "*source:")) >= 0) { |
|---|
| 281 | int indk = skip_subspecies(gbk.comments.others + indj, '='); |
|---|
| 282 | if (indk >= 0) { |
|---|
| 283 | copy_subspecies_and_check_consistency(subspecies, gbk.comments.others, indj+indk); |
|---|
| 284 | } |
|---|
| 285 | } |
|---|
| 286 | } |
|---|
| 287 | |
|---|
| 288 | if (has_content(gbk.source)) { |
|---|
| 289 | if ((indj = skip_subspecies(gbk.source, ' ')) >= 0) { |
|---|
| 290 | copy_subspecies_and_check_consistency(subspecies, gbk.source, indj); |
|---|
| 291 | } |
|---|
| 292 | } |
|---|
| 293 | |
|---|
| 294 | return nulldup(subspecies); |
|---|
| 295 | } |
|---|
| 296 | |
|---|
| 297 | static void mtog_decode_ref_and_remarks(const Macke& macke, GenBank& gbk) { |
|---|
| 298 | // Decode remarks of Macke to GenBank format. |
|---|
| 299 | ca_assert(gbk.get_refcount() == 0); |
|---|
| 300 | |
|---|
| 301 | if (has_content(macke.author)) freedup(gbk.get_new_ref().author, macke.author); |
|---|
| 302 | if (has_content(macke.journal)) freedup(gbk.get_latest_ref().journal, macke.journal); |
|---|
| 303 | if (has_content(macke.title)) freedup(gbk.get_latest_ref().title, macke.title); |
|---|
| 304 | |
|---|
| 305 | bool first_ref = true; |
|---|
| 306 | |
|---|
| 307 | RDP_comments& comments = gbk.comments; |
|---|
| 308 | OrgInfo& orginf = comments.orginf; |
|---|
| 309 | SeqInfo& seqinf = comments.seqinf; |
|---|
| 310 | |
|---|
| 311 | for (int ridx = 0; ridx < macke.get_rem_count(); ridx++) { |
|---|
| 312 | char key[TOKENSIZE]; |
|---|
| 313 | int offset = macke_key_word(macke.get_rem(ridx), 0, key); |
|---|
| 314 | |
|---|
| 315 | if (str_equal(key, "ref")) { |
|---|
| 316 | GenbankRef& ref = first_ref ? gbk.get_latest_ref() : gbk.get_new_ref(); |
|---|
| 317 | freeset(ref.ref, macke.copy_multi_rem(ridx, offset)); |
|---|
| 318 | first_ref = false; |
|---|
| 319 | } |
|---|
| 320 | else if (str_equal(key, "auth")) { |
|---|
| 321 | freeset(gbk.get_latest_ref().author, macke.copy_multi_rem(ridx, offset)); |
|---|
| 322 | } |
|---|
| 323 | else if (str_equal(key, "title")) { |
|---|
| 324 | freeset(gbk.get_latest_ref().title, macke.copy_multi_rem(ridx, offset)); |
|---|
| 325 | } |
|---|
| 326 | else if (str_equal(key, "jour")) { |
|---|
| 327 | freeset(gbk.get_latest_ref().journal, macke.copy_multi_rem(ridx, offset)); |
|---|
| 328 | } |
|---|
| 329 | else if (str_equal(key, "standard")) { |
|---|
| 330 | freeset(gbk.get_latest_ref().standard, macke.copy_multi_rem(ridx, offset)); |
|---|
| 331 | } |
|---|
| 332 | else if (str_equal(key, "KEYWORDS")) { |
|---|
| 333 | freeset(gbk.keywords, macke.copy_multi_rem(ridx, offset)); |
|---|
| 334 | terminate_with(gbk.keywords, '.'); |
|---|
| 335 | } |
|---|
| 336 | else if (str_equal(key, "GenBank ACCESSION")) { |
|---|
| 337 | freeset(gbk.accession, macke.copy_multi_rem(ridx, offset)); |
|---|
| 338 | } |
|---|
| 339 | else if (str_equal(key, "Source of strain")) { |
|---|
| 340 | freeset(orginf.source, macke.copy_multi_rem(ridx, offset)); |
|---|
| 341 | } |
|---|
| 342 | else if (str_equal(key, "Former name")) { |
|---|
| 343 | freeset(orginf.formname, macke.copy_multi_rem(ridx, offset)); |
|---|
| 344 | } |
|---|
| 345 | else if (str_equal(key, "Alternate name")) { |
|---|
| 346 | freeset(orginf.nickname, macke.copy_multi_rem(ridx, offset)); |
|---|
| 347 | } |
|---|
| 348 | else if (str_equal(key, "Common name")) { |
|---|
| 349 | freeset(orginf.commname, macke.copy_multi_rem(ridx, offset)); |
|---|
| 350 | } |
|---|
| 351 | else if (str_equal(key, "Host organism")) { |
|---|
| 352 | freeset(orginf.hostorg, macke.copy_multi_rem(ridx, offset)); |
|---|
| 353 | } |
|---|
| 354 | else if (str_equal(key, "RDP ID")) { |
|---|
| 355 | freeset(seqinf.RDPid, macke.copy_multi_rem(ridx, offset)); |
|---|
| 356 | } |
|---|
| 357 | else if (str_equal(key, "Sequencing methods")) { |
|---|
| 358 | freeset(seqinf.methods, macke.copy_multi_rem(ridx, offset)); |
|---|
| 359 | } |
|---|
| 360 | else if (str_equal(key, "3' end complete")) { |
|---|
| 361 | scan_token_or_die(key, macke.get_rem(ridx) + offset); |
|---|
| 362 | seqinf.comp3 = str_equal(key, "Yes") ? 'y' : 'n'; |
|---|
| 363 | } |
|---|
| 364 | else if (str_equal(key, "5' end complete")) { |
|---|
| 365 | scan_token_or_die(key, macke.get_rem(ridx) + offset); |
|---|
| 366 | seqinf.comp5 = str_equal(key, "Yes") ? 'y' : 'n'; |
|---|
| 367 | } |
|---|
| 368 | else { // other (non-interpreted) comments |
|---|
| 369 | Append(comments.others, macke.get_rem(ridx)); |
|---|
| 370 | } |
|---|
| 371 | } |
|---|
| 372 | } |
|---|
| 373 | |
|---|
| 374 | static void mtog_genbank_def_and_source(const Macke& macke, GenBank& gbk) { |
|---|
| 375 | // Define GenBank DEFINITION and SOURCE lines the way RDP group likes. |
|---|
| 376 | copy_content(gbk.definition, macke.name); |
|---|
| 377 | if (has_content(macke.subspecies)) { |
|---|
| 378 | if (!has_content(gbk.definition)) { |
|---|
| 379 | warning(22, "Genus and Species not defined"); |
|---|
| 380 | skip_eolnl_and_append(gbk.definition, "subsp. "); |
|---|
| 381 | } |
|---|
| 382 | else |
|---|
| 383 | skip_eolnl_and_append(gbk.definition, " subsp. "); |
|---|
| 384 | |
|---|
| 385 | Append(gbk.definition, macke.subspecies); |
|---|
| 386 | } |
|---|
| 387 | |
|---|
| 388 | if (has_content(macke.strain)) { |
|---|
| 389 | if (!has_content(gbk.definition)) { |
|---|
| 390 | warning(23, "Genus and Species and Subspecies not defined"); |
|---|
| 391 | skip_eolnl_and_append(gbk.definition, "str. "); |
|---|
| 392 | } |
|---|
| 393 | else |
|---|
| 394 | skip_eolnl_and_append(gbk.definition, " str. "); |
|---|
| 395 | |
|---|
| 396 | Append(gbk.definition, macke.strain); |
|---|
| 397 | } |
|---|
| 398 | |
|---|
| 399 | // create SOURCE line, temp. |
|---|
| 400 | if (copy_content(gbk.source, gbk.definition)) terminate_with(gbk.source, '.'); |
|---|
| 401 | |
|---|
| 402 | // append keyword to definition, if there is keyword. |
|---|
| 403 | if (has_content(gbk.keywords)) { |
|---|
| 404 | if (has_content(gbk.definition)) |
|---|
| 405 | skip_eolnl_and_append(gbk.definition, "; \n"); |
|---|
| 406 | |
|---|
| 407 | // Here keywords must be ended by a '.' already |
|---|
| 408 | skip_eolnl_and_append(gbk.definition, gbk.keywords); |
|---|
| 409 | } |
|---|
| 410 | else |
|---|
| 411 | skip_eolnl_and_append(gbk.definition, ".\n"); |
|---|
| 412 | } |
|---|
| 413 | |
|---|
| 414 | int mtog(const Macke& macke, GenBank& gbk, const Seq& seq) { // __ATTR__USERESULT |
|---|
| 415 | // Convert Macke format to Genbank format. |
|---|
| 416 | int indi; |
|---|
| 417 | char temp[LONGTEXT]; |
|---|
| 418 | |
|---|
| 419 | strcpy(temp, macke.seqabbr); |
|---|
| 420 | |
|---|
| 421 | for (indi = str0len(temp); indi < 13; temp[indi++] = ' ') {} |
|---|
| 422 | |
|---|
| 423 | if (has_content(macke.date)) |
|---|
| 424 | sprintf((temp + 10), "%7d bp RNA RNA %s\n", seq.get_len(), genbank_date(macke.date)); |
|---|
| 425 | else |
|---|
| 426 | sprintf((temp + 10), "%7d bp RNA RNA %s\n", seq.get_len(), genbank_date(today_date())); |
|---|
| 427 | |
|---|
| 428 | freedup(gbk.locus, temp); |
|---|
| 429 | |
|---|
| 430 | // GenBank ORGANISM |
|---|
| 431 | if (copy_content(gbk.organism, macke.name)) terminate_with(gbk.organism, '.'); |
|---|
| 432 | |
|---|
| 433 | RDP_comments& comments = gbk.comments; |
|---|
| 434 | OrgInfo& orginf = comments.orginf; |
|---|
| 435 | SeqInfo& seqinf = comments.seqinf; |
|---|
| 436 | |
|---|
| 437 | copy_content(seqinf.methods, macke.rna); |
|---|
| 438 | |
|---|
| 439 | if (!copy_content(seqinf.gbkentry, macke.acs)) |
|---|
| 440 | copy_content(seqinf.gbkentry, macke.nbk); |
|---|
| 441 | |
|---|
| 442 | copy_content(orginf.cultcoll, macke.atcc); |
|---|
| 443 | mtog_decode_ref_and_remarks(macke, gbk); |
|---|
| 444 | |
|---|
| 445 | // final conversion of cultcoll |
|---|
| 446 | if (!has_content(orginf.cultcoll)) copy_content(orginf.cultcoll, macke.atcc); |
|---|
| 447 | |
|---|
| 448 | // define GenBank DEFINITION, after GenBank KEYWORD is defined. |
|---|
| 449 | mtog_genbank_def_and_source(macke, gbk); |
|---|
| 450 | |
|---|
| 451 | return 1; |
|---|
| 452 | } |
|---|
| 453 | |
|---|
| 454 | int gtom(const GenBank& gbk, Macke& macke) { // __ATTR__USERESULT |
|---|
| 455 | // Convert from Genbank format to Macke format. |
|---|
| 456 | |
|---|
| 457 | // copy sequence abbr, assume every entry in gbk must end with \n\0 |
|---|
| 458 | // no '\n' at the end of the string |
|---|
| 459 | { |
|---|
| 460 | char temp[LONGTEXT]; |
|---|
| 461 | genbank_key_word(gbk.locus, 0, temp); |
|---|
| 462 | freedup(macke.seqabbr, temp); |
|---|
| 463 | } |
|---|
| 464 | |
|---|
| 465 | // copy name and definition |
|---|
| 466 | if (!copy_content(macke.name, gbk.organism) && has_content(gbk.definition)) { |
|---|
| 467 | char genus[TOKENSIZE]; |
|---|
| 468 | char species[TOKENSIZE]; |
|---|
| 469 | |
|---|
| 470 | ASSERT_RESULT(int, 2, sscanf(gbk.definition, "%s %s", genus, species)); |
|---|
| 471 | |
|---|
| 472 | int last = str0len(species)-1; |
|---|
| 473 | if (species[last] == ';') species[last] = '\0'; |
|---|
| 474 | |
|---|
| 475 | freeset(macke.name, strf("%s %s\n", genus, species)); |
|---|
| 476 | } |
|---|
| 477 | |
|---|
| 478 | const OrgInfo& orginf = gbk.comments.orginf; |
|---|
| 479 | const SeqInfo& seqinf = gbk.comments.seqinf; |
|---|
| 480 | |
|---|
| 481 | copy_content(macke.atcc, orginf.cultcoll); // copy cultcoll name and number |
|---|
| 482 | copy_content(macke.rna, seqinf.methods); // copy rna(methods) |
|---|
| 483 | |
|---|
| 484 | freeset(macke.date, gbk.get_date()); Append(macke.date, "\n"); |
|---|
| 485 | |
|---|
| 486 | // copy genbank entry (gbkentry has higher priority than gbk.accession) |
|---|
| 487 | if (!copy_content(macke.acs, seqinf.gbkentry)) { |
|---|
| 488 | char buffer[TOKENSIZE]; |
|---|
| 489 | if (has_content(gbk.accession) && !str_equal(gbk.accession, "No information\n")) { |
|---|
| 490 | scan_token_or_die(buffer, gbk.accession); |
|---|
| 491 | strcat(buffer, "\n"); |
|---|
| 492 | } |
|---|
| 493 | else { |
|---|
| 494 | strcpy(buffer, "\n"); |
|---|
| 495 | } |
|---|
| 496 | freedup(macke.acs, buffer); |
|---|
| 497 | } |
|---|
| 498 | |
|---|
| 499 | // copy the first reference from GenBank to Macke |
|---|
| 500 | if (gbk.has_refs()) { |
|---|
| 501 | copy_content(macke.author, gbk.get_ref(0).author); |
|---|
| 502 | copy_content(macke.journal, gbk.get_ref(0).journal); |
|---|
| 503 | copy_content(macke.title, gbk.get_ref(0).title); |
|---|
| 504 | } |
|---|
| 505 | // the rest of references are put into remarks, rem:..... |
|---|
| 506 | macke.add_remarks_from(gbk); |
|---|
| 507 | |
|---|
| 508 | // adjust the strain, subspecies, and atcc information |
|---|
| 509 | freeset(macke.strain, genbank_get_strain(gbk)); |
|---|
| 510 | freeset(macke.subspecies, genbank_get_subspecies(gbk)); |
|---|
| 511 | if (!has_content(macke.atcc)) { |
|---|
| 512 | freeset(macke.atcc, genbank_get_atcc(gbk, macke)); |
|---|
| 513 | } |
|---|
| 514 | |
|---|
| 515 | return 1; |
|---|
| 516 | } |
|---|