| 1 | #include "embl.h" |
|---|
| 2 | #include "genbank.h" |
|---|
| 3 | #include "macke.h" |
|---|
| 4 | #include "wrap.h" |
|---|
| 5 | |
|---|
| 6 | static void embl_continue_line(const char *pattern, char*& Str, Reader& reader) { |
|---|
| 7 | // if there are (numb) blanks at the beginning of line, |
|---|
| 8 | // it is a continue line of the current command. |
|---|
| 9 | int ind; |
|---|
| 10 | char key[TOKENSIZE], temp[LINESIZE]; |
|---|
| 11 | |
|---|
| 12 | // check continue lines |
|---|
| 13 | for (++reader; reader.line(); ++reader) { |
|---|
| 14 | if (has_content(reader.line())) { |
|---|
| 15 | embl_key_word(reader.line(), 0, key); |
|---|
| 16 | if (!str_equal(pattern, key)) break; |
|---|
| 17 | |
|---|
| 18 | // remove end-of-line, if there is any |
|---|
| 19 | ind = Skip_white_space(reader.line(), p_nonkey_start); |
|---|
| 20 | strcpy(temp, reader.line() + ind); |
|---|
| 21 | skip_eolnl_and_append_spaced(Str, temp); |
|---|
| 22 | } |
|---|
| 23 | } |
|---|
| 24 | } |
|---|
| 25 | |
|---|
| 26 | static void embl_one_entry(Reader& reader, char*& entry, const char *key) { |
|---|
| 27 | // Read in one embl entry lines. |
|---|
| 28 | int index = Skip_white_space(reader.line(), p_nonkey_start); |
|---|
| 29 | freedup(entry, reader.line() + index); |
|---|
| 30 | embl_continue_line(key, entry, reader); |
|---|
| 31 | } |
|---|
| 32 | |
|---|
| 33 | static void embl_date(Embl& embl, Reader& reader) { |
|---|
| 34 | // Read in embl DATE lines. |
|---|
| 35 | int index = Skip_white_space(reader.line(), p_nonkey_start); |
|---|
| 36 | freedup(embl.dateu, reader.line() + index); |
|---|
| 37 | |
|---|
| 38 | ++reader; |
|---|
| 39 | |
|---|
| 40 | char key[TOKENSIZE]; |
|---|
| 41 | embl_key_word(reader.line(), 0, key); |
|---|
| 42 | if (str_equal(key, "DT")) { |
|---|
| 43 | index = Skip_white_space(reader.line(), p_nonkey_start); |
|---|
| 44 | freedup(embl.datec, reader.line() + index); |
|---|
| 45 | // skip the rest of DT lines |
|---|
| 46 | do { |
|---|
| 47 | ++reader; |
|---|
| 48 | if (!reader.line()) break; |
|---|
| 49 | embl_key_word(reader.line(), 0, key); |
|---|
| 50 | } |
|---|
| 51 | while (str_equal(key, "DT")); |
|---|
| 52 | } |
|---|
| 53 | else { |
|---|
| 54 | // always expect more than two DT lines |
|---|
| 55 | warning(33, "one DT line is missing"); |
|---|
| 56 | } |
|---|
| 57 | } |
|---|
| 58 | |
|---|
| 59 | static void embl_correct_title(Emblref& ref) { |
|---|
| 60 | // Check missing '"' at the both ends |
|---|
| 61 | |
|---|
| 62 | terminate_with(ref.title, ';'); |
|---|
| 63 | |
|---|
| 64 | int len = str0len(ref.title); |
|---|
| 65 | if (len > 2 && (ref.title[0] != '"' || ref.title[len - 3] != '"')) { |
|---|
| 66 | char *temp = NULp; |
|---|
| 67 | if (ref.title[0] != '"') |
|---|
| 68 | temp = ARB_strdup("\""); |
|---|
| 69 | else |
|---|
| 70 | temp = ARB_strdup(""); |
|---|
| 71 | Append(temp, ref.title); |
|---|
| 72 | if ((len > 2 && ref.title[len - 3] |
|---|
| 73 | != '"')) { |
|---|
| 74 | len = str0len(temp); |
|---|
| 75 | temp[len - 2] = '"'; |
|---|
| 76 | terminate_with(temp, ';'); |
|---|
| 77 | } |
|---|
| 78 | freedup(ref.title, temp); |
|---|
| 79 | free(temp); |
|---|
| 80 | } |
|---|
| 81 | } |
|---|
| 82 | |
|---|
| 83 | int comment_subkey(const char *line, char *key) { |
|---|
| 84 | // Get the subkey-word (including delimiting ':') from a comment line |
|---|
| 85 | int len = parse_key_word(line, key, ":\t\n("); |
|---|
| 86 | if (!len) return 0; |
|---|
| 87 | |
|---|
| 88 | if (line[len] == ':') { |
|---|
| 89 | key[len] = ':'; |
|---|
| 90 | key[len+1] = 0; |
|---|
| 91 | } |
|---|
| 92 | return len+1; |
|---|
| 93 | } |
|---|
| 94 | |
|---|
inline bool is_embl_comment(const char *line) {
    // true if 'line' is not NULL and starts with the comment line code "CC"
    if (!line) return false;
    return line[0] == 'C' && line[1] == 'C';
}
|---|
| 96 | |
|---|
| 97 | static void embl_one_comment_entry(char*& datastring, int start_index, Reader& reader) { |
|---|
| 98 | // Read in one embl sub-entry in comments lines. |
|---|
| 99 | // If it's not a RDP defined comment, you should not call this function. |
|---|
| 100 | |
|---|
| 101 | int index = Skip_white_space(reader.line(), start_index); |
|---|
| 102 | freedup(datastring, reader.line() + index); |
|---|
| 103 | |
|---|
| 104 | const int expectedIndent = RDP_CONTINUED_INDENT+RDP_SUBKEY_INDENT; |
|---|
| 105 | |
|---|
| 106 | for (++reader; |
|---|
| 107 | is_embl_comment(reader.line()) && count_spaces(reader.line() + 2) >= expectedIndent; |
|---|
| 108 | ++reader) |
|---|
| 109 | { |
|---|
| 110 | index = Skip_white_space(reader.line(), p_nonkey_start + expectedIndent); |
|---|
| 111 | |
|---|
| 112 | char temp[LINESIZE]; |
|---|
| 113 | strcpy(temp, reader.line() + index); |
|---|
| 114 | skip_eolnl_and_append_spaced(datastring, temp); |
|---|
| 115 | } |
|---|
| 116 | } |
|---|
| 117 | |
|---|
| 118 | static void embl_comments(Embl& embl, Reader& reader) { |
|---|
| 119 | // Read in embl comment lines. |
|---|
| 120 | |
|---|
| 121 | for (; is_embl_comment(reader.line());) { |
|---|
| 122 | char key[TOKENSIZE]; |
|---|
| 123 | int index = Skip_white_space(reader.line(), 5); |
|---|
| 124 | int offset = comment_subkey(reader.line() + index, key); |
|---|
| 125 | index = Skip_white_space(reader.line(), index + offset); |
|---|
| 126 | |
|---|
| 127 | RDP_comment_parser one_comment_entry = embl_one_comment_entry; |
|---|
| 128 | RDP_comments& comments = embl.comments; |
|---|
| 129 | |
|---|
| 130 | if (!parse_RDP_comment(comments, one_comment_entry, key, index, reader)) { |
|---|
| 131 | // other comments |
|---|
| 132 | Append(comments.others, reader.line() + 5); |
|---|
| 133 | ++reader; |
|---|
| 134 | } |
|---|
| 135 | } |
|---|
| 136 | } |
|---|
| 137 | |
|---|
| 138 | static void embl_skip_unidentified(const char *pattern, Reader& reader) { |
|---|
| 139 | // if there are (numb) blanks at the beginning of line, |
|---|
| 140 | // it is a continue line of the current command. |
|---|
| 141 | |
|---|
| 142 | for (++reader; reader.line(); ++reader) { |
|---|
| 143 | char key[TOKENSIZE]; |
|---|
| 144 | embl_key_word(reader.line(), 0, key); |
|---|
| 145 | if (!str_equal(key, pattern)) break; |
|---|
| 146 | } |
|---|
| 147 | } |
|---|
| 148 | |
|---|
| 149 | void EmblParser::parse_section() { |
|---|
| 150 | char key[TOKENSIZE]; |
|---|
| 151 | embl_key_word(reader.line(), 0, key); |
|---|
| 152 | state = ENTRY_STARTED; |
|---|
| 153 | parse_keyed_section(key); |
|---|
| 154 | } |
|---|
| 155 | |
|---|
| 156 | static void embl_origin(Seq& seq, Reader& reader) { |
|---|
| 157 | // Read in embl sequence data. |
|---|
| 158 | ca_assert(seq.is_empty()); |
|---|
| 159 | |
|---|
| 160 | // read in whole sequence data |
|---|
| 161 | for (++reader; |
|---|
| 162 | reader.line() && !is_sequence_terminator(reader.line()); |
|---|
| 163 | ++reader) |
|---|
| 164 | { |
|---|
| 165 | const char *line = reader.line(); |
|---|
| 166 | for (int idx = 5; line[idx]; ++idx) { |
|---|
| 167 | char ch = line[idx]; |
|---|
| 168 | if (ch == ' ' || ch == '\n') continue; |
|---|
| 169 | if (idx>70) continue; |
|---|
| 170 | seq.add(ch); |
|---|
| 171 | } |
|---|
| 172 | } |
|---|
| 173 | } |
|---|
| 174 | |
|---|
| 175 | void EmblParser::parse_keyed_section(const char *key) { |
|---|
| 176 | if (str_equal(key, "ID")) { |
|---|
| 177 | embl_one_entry(reader, embl.ID, key); |
|---|
| 178 | } |
|---|
| 179 | else if (str_equal(key, "DT")) { |
|---|
| 180 | embl_date(embl, reader); |
|---|
| 181 | } |
|---|
| 182 | else if (str_equal(key, "DE")) { |
|---|
| 183 | embl_one_entry(reader, embl.description, key); |
|---|
| 184 | } |
|---|
| 185 | else if (str_equal(key, "OS")) { |
|---|
| 186 | embl_one_entry(reader, embl.os, key); |
|---|
| 187 | } |
|---|
| 188 | else if (str_equal(key, "AC")) { |
|---|
| 189 | embl_one_entry(reader, embl.accession, key); |
|---|
| 190 | } |
|---|
| 191 | else if (str_equal(key, "KW")) { |
|---|
| 192 | embl_one_entry(reader, embl.keywords, key); |
|---|
| 193 | |
|---|
| 194 | // correct missing '.' |
|---|
| 195 | if (!has_content(embl.keywords)) freedup(embl.keywords, ".\n"); |
|---|
| 196 | else terminate_with(embl.keywords, '.'); |
|---|
| 197 | } |
|---|
| 198 | else if (str_equal(key, "DR")) { |
|---|
| 199 | embl_one_entry(reader, embl.dr, key); |
|---|
| 200 | } |
|---|
| 201 | else if (str_equal(key, "RA")) { |
|---|
| 202 | Emblref& ref = embl.get_latest_ref(); |
|---|
| 203 | embl_one_entry(reader, ref.author, key); |
|---|
| 204 | terminate_with(ref.author, ';'); |
|---|
| 205 | } |
|---|
| 206 | else if (str_equal(key, "RT")) { |
|---|
| 207 | Emblref& ref = embl.get_latest_ref(); |
|---|
| 208 | embl_one_entry(reader, ref.title, key); |
|---|
| 209 | embl_correct_title(ref); |
|---|
| 210 | } |
|---|
| 211 | else if (str_equal(key, "RL")) { |
|---|
| 212 | Emblref& ref = embl.get_latest_ref(); |
|---|
| 213 | embl_one_entry(reader, ref.journal, key); |
|---|
| 214 | terminate_with(ref.journal, '.'); |
|---|
| 215 | } |
|---|
| 216 | else if (str_equal(key, "RP")) { |
|---|
| 217 | Emblref& ref = embl.get_latest_ref(); |
|---|
| 218 | embl_one_entry(reader, ref.processing, key); |
|---|
| 219 | } |
|---|
| 220 | else if (str_equal(key, "RN")) { |
|---|
| 221 | embl.resize_refs(embl.get_refcount()+1); |
|---|
| 222 | ++reader; |
|---|
| 223 | } |
|---|
| 224 | else if (str_equal(key, "CC")) { |
|---|
| 225 | embl_comments(embl, reader); |
|---|
| 226 | } |
|---|
| 227 | else if (str_equal(key, "SQ")) { |
|---|
| 228 | embl_origin(seq, reader); |
|---|
| 229 | state = ENTRY_COMPLETED; |
|---|
| 230 | } |
|---|
| 231 | else { |
|---|
| 232 | embl_skip_unidentified(key, reader); |
|---|
| 233 | } |
|---|
| 234 | } |
|---|
| 235 | |
|---|
| 236 | void embl_key_word(const char *line, int index, char *key) { |
|---|
| 237 | parse_key_word(line+index, key, " \t\n"); |
|---|
| 238 | } |
|---|
| 239 | |
|---|
| 240 | static void embl_print_lines(Writer& write, const char *key, const char *content, const WrapMode& wrapMode) { |
|---|
| 241 | // Print EMBL entry and wrap around if line over EMBLMAXLINE. |
|---|
| 242 | ca_assert(strlen(key) == 2); |
|---|
| 243 | |
|---|
| 244 | char prefix[TOKENSIZE]; |
|---|
| 245 | sprintf(prefix, "%-*s", EMBLINDENT, key); |
|---|
| 246 | |
|---|
| 247 | wrapMode.print(write, prefix, prefix, content, EMBLMAXLINE); |
|---|
| 248 | } |
|---|
| 249 | |
|---|
| 250 | static bool embl_print_lines_if_content(Writer& write, const char *key, const char *content, const WrapMode& wrapMode, bool followed_by_spacer) { |
|---|
| 251 | if (has_content(content)) { |
|---|
| 252 | embl_print_lines(write, key, content, wrapMode); |
|---|
| 253 | if (followed_by_spacer) write.out("XX\n"); |
|---|
| 254 | return true; |
|---|
| 255 | } |
|---|
| 256 | return false; |
|---|
| 257 | } |
|---|
| 258 | |
|---|
| 259 | static void embl_print_comment_if_content(Writer& write, const char *key, const char *content) { |
|---|
| 260 | // Print one embl comment line, wrap around |
|---|
| 261 | |
|---|
| 262 | if (!has_content(content)) return; |
|---|
| 263 | |
|---|
| 264 | char first[LINESIZE]; sprintf(first, "CC%*s%s", (EMBLINDENT-2)+RDP_SUBKEY_INDENT, "", key); |
|---|
| 265 | char other[LINESIZE]; sprintf(other, "CC%*s", (EMBLINDENT-2)+RDP_SUBKEY_INDENT+RDP_CONTINUED_INDENT, ""); |
|---|
| 266 | WrapMode(true).print(write, first, other, content, EMBLMAXLINE); |
|---|
| 267 | } |
|---|
| 268 | |
|---|
| 269 | inline void embl_print_completeness(Writer& write, char compX, char X) { |
|---|
| 270 | if (compX == ' ') return; |
|---|
| 271 | ca_assert(compX == 'y' || compX == 'n'); |
|---|
| 272 | write.outf("CC %c' end complete: %s\n", X, compX == 'y' ? "Yes" : "No"); |
|---|
| 273 | } |
|---|
| 274 | |
|---|
| 275 | static void embl_out_comments(const Embl& embl, const Seq& seq, Writer& write) { |
|---|
| 276 | // Print out the comments part of EMBL format. |
|---|
| 277 | |
|---|
| 278 | const OrgInfo& orginf = embl.comments.orginf; |
|---|
| 279 | if (orginf.exists()) { |
|---|
| 280 | write.out("CC Organism information\n"); |
|---|
| 281 | |
|---|
| 282 | embl_print_comment_if_content(write, "Source of strain: ", orginf.source); |
|---|
| 283 | embl_print_comment_if_content(write, "Culture collection: ", orginf.cultcoll); |
|---|
| 284 | embl_print_comment_if_content(write, "Former name: ", orginf.formname); |
|---|
| 285 | embl_print_comment_if_content(write, "Alternate name: ", orginf.nickname); |
|---|
| 286 | embl_print_comment_if_content(write, "Common name: ", orginf.commname); |
|---|
| 287 | embl_print_comment_if_content(write, "Host organism: ", orginf.hostorg); |
|---|
| 288 | } |
|---|
| 289 | |
|---|
| 290 | const SeqInfo& seqinf = embl.comments.seqinf; |
|---|
| 291 | if (seqinf.exists()) { |
|---|
| 292 | write.outf("CC Sequence information (bases 1 to %d)\n", seq.get_len()); |
|---|
| 293 | |
|---|
| 294 | embl_print_comment_if_content(write, "RDP ID: ", seqinf.RDPid); |
|---|
| 295 | embl_print_comment_if_content(write, "Corresponding GenBank entry: ", seqinf.gbkentry); |
|---|
| 296 | embl_print_comment_if_content(write, "Sequencing methods: ", seqinf.methods); |
|---|
| 297 | |
|---|
| 298 | embl_print_completeness(write, seqinf.comp5, '5'); |
|---|
| 299 | embl_print_completeness(write, seqinf.comp3, '3'); |
|---|
| 300 | } |
|---|
| 301 | |
|---|
| 302 | embl_print_lines_if_content(write, "CC", embl.comments.others, WrapMode("\n"), true); |
|---|
| 303 | } |
|---|
| 304 | |
|---|
| 305 | static void embl_out_origin(const Seq& seq, Writer& write) { |
|---|
| 306 | // Print out the sequence data of EMBL format. |
|---|
| 307 | BaseCounts bases; |
|---|
| 308 | seq.count(bases); |
|---|
| 309 | write.outf("SQ Sequence %d BP; %d A; %d C; %d G; %d T; %d other;\n", |
|---|
| 310 | seq.get_len(), bases.a, bases.c, bases.g, bases.t, bases.other); |
|---|
| 311 | |
|---|
| 312 | seq.out(write, EMBL); |
|---|
| 313 | } |
|---|
| 314 | |
|---|
| 315 | void embl_out_header(const Embl& embl, const Seq& seq, Writer& write) { |
|---|
| 316 | WrapMode wrapWords(true); |
|---|
| 317 | WrapMode neverWrap(false); |
|---|
| 318 | |
|---|
| 319 | embl_print_lines_if_content(write, "ID", embl.ID, neverWrap, true); |
|---|
| 320 | embl_print_lines_if_content(write, "AC", embl.accession, wrapWords, true); |
|---|
| 321 | |
|---|
| 322 | { |
|---|
| 323 | bool dt1 = embl_print_lines_if_content(write, "DT", embl.dateu, neverWrap, false); |
|---|
| 324 | bool dt2 = embl_print_lines_if_content(write, "DT", embl.datec, neverWrap, false); |
|---|
| 325 | if (dt1 || dt2) write.out("XX\n"); |
|---|
| 326 | } |
|---|
| 327 | |
|---|
| 328 | embl_print_lines_if_content(write, "DE", embl.description, wrapWords, true); |
|---|
| 329 | embl_print_lines_if_content(write, "KW", embl.keywords, WrapMode(";"), true); |
|---|
| 330 | |
|---|
| 331 | if (has_content(embl.os)) { |
|---|
| 332 | embl_print_lines(write, "OS", embl.os, wrapWords); |
|---|
| 333 | write.out("OC No information.\n"); |
|---|
| 334 | write.out("XX\n"); |
|---|
| 335 | } |
|---|
| 336 | |
|---|
| 337 | // GenbankRef |
|---|
| 338 | for (int indi = 0; indi < embl.get_refcount(); indi++) { |
|---|
| 339 | const Emblref& ref = embl.get_ref(indi); |
|---|
| 340 | |
|---|
| 341 | write.outf("RN [%d]\n", indi + 1); |
|---|
| 342 | embl_print_lines_if_content(write, "RP", ref.processing, neverWrap, false); |
|---|
| 343 | embl_print_lines_if_content(write, "RA", ref.author, WrapMode(","), false); |
|---|
| 344 | |
|---|
| 345 | if (has_content(ref.title)) embl_print_lines(write, "RT", ref.title, wrapWords); |
|---|
| 346 | else write.out("RT ;\n"); |
|---|
| 347 | |
|---|
| 348 | embl_print_lines_if_content(write, "RL", ref.journal, wrapWords, false); |
|---|
| 349 | write.out("XX\n"); |
|---|
| 350 | } |
|---|
| 351 | |
|---|
| 352 | if (has_content(embl.dr)) { |
|---|
| 353 | embl_print_lines(write, "DR", embl.dr, wrapWords); |
|---|
| 354 | write.out("XX\n"); |
|---|
| 355 | } |
|---|
| 356 | |
|---|
| 357 | embl_out_comments(embl, seq, write); |
|---|
| 358 | } |
|---|
| 359 | |
|---|
| 360 | void embl_out(const Embl& embl, const Seq& seq, Writer& write) { |
|---|
| 361 | // Output EMBL data. |
|---|
| 362 | embl_out_header(embl, seq, write); |
|---|
| 363 | embl_out_origin(seq, write); |
|---|
| 364 | } |
|---|
| 365 | |
|---|
| 366 | static char *etog_author(char *Str) { |
|---|
| 367 | // Convert EMBL author format to Genbank author format. |
|---|
| 368 | int indi, indk, len, index; |
|---|
| 369 | char token[TOKENSIZE], *author; |
|---|
| 370 | |
|---|
| 371 | author = ARB_strdup(""); |
|---|
| 372 | for (indi = index = 0, len = str0len(Str) - 1; indi < len; indi++, index++) { |
|---|
| 373 | if (Str[indi] == ',' || Str[indi] == ';') { |
|---|
| 374 | token[index--] = '\0'; |
|---|
| 375 | if (has_content(author)) { |
|---|
| 376 | Append(author, (Str[indi] == ',') ? "," : " and"); |
|---|
| 377 | } |
|---|
| 378 | // search backward to find the first blank and replace the blank by ',' |
|---|
| 379 | for (indk = 0; index > 0 && indk == 0; index--) |
|---|
| 380 | if (token[index] == ' ') { |
|---|
| 381 | token[index] = ','; |
|---|
| 382 | indk = 1; |
|---|
| 383 | } |
|---|
| 384 | Append(author, token); |
|---|
| 385 | index = (-1); |
|---|
| 386 | } |
|---|
| 387 | else |
|---|
| 388 | token[index] = Str[indi]; |
|---|
| 389 | } |
|---|
| 390 | Append(author, "\n"); |
|---|
| 391 | return author; |
|---|
| 392 | } |
|---|
| 393 | static char *etog_journal(const char *eJournal) { |
|---|
| 394 | // Convert journal part from EMBL to GenBank format. |
|---|
| 395 | char *new_journal = NULp; |
|---|
| 396 | char token[TOKENSIZE]; |
|---|
| 397 | |
|---|
| 398 | scan_token_or_die(token, eJournal); |
|---|
| 399 | if (str_equal(token, "(in)") == 1 || str_equal(token, "Submitted") || str_equal(token, "Unpublished")) { |
|---|
| 400 | // remove trailing '.' |
|---|
| 401 | int len = strlen(eJournal); |
|---|
| 402 | ca_assert(eJournal[len-2] == '.'); |
|---|
| 403 | new_journal = strndup(eJournal, len-2); |
|---|
| 404 | Append(new_journal, "\n"); |
|---|
| 405 | } |
|---|
| 406 | else { |
|---|
| 407 | const char *colon = strchr(eJournal, ':'); |
|---|
| 408 | |
|---|
| 409 | if (colon) { |
|---|
| 410 | const char *p1 = strchr(colon+1, '('); |
|---|
| 411 | if (p1) { |
|---|
| 412 | const char *p2 = strchr(p1+1, ')'); |
|---|
| 413 | if (p2 && strcmp(p2+1, ".\n") == 0) { |
|---|
| 414 | ARB_realloc(new_journal, str0len(eJournal)+1+1); |
|---|
| 415 | |
|---|
| 416 | int l1 = colon-eJournal; |
|---|
| 417 | int l2 = p1-colon-1; |
|---|
| 418 | int l3 = p2-p1+1; |
|---|
| 419 | |
|---|
| 420 | char *pos = new_journal; |
|---|
| 421 | |
|---|
| 422 | memcpy(pos, eJournal, l1); pos += l1; |
|---|
| 423 | memcpy(pos, ", ", 2); pos += 2; |
|---|
| 424 | memcpy(pos, colon+1, l2); pos += l2; |
|---|
| 425 | memcpy(pos, " ", 1); pos += 1; |
|---|
| 426 | memcpy(pos, p1, l3); pos += l3; |
|---|
| 427 | memcpy(pos, "\n", 2); |
|---|
| 428 | } |
|---|
| 429 | } |
|---|
| 430 | } |
|---|
| 431 | |
|---|
| 432 | if (!new_journal) { |
|---|
| 433 | warningf(148, "Removed unknown journal format: %s", eJournal); |
|---|
| 434 | new_journal = no_content(); |
|---|
| 435 | } |
|---|
| 436 | } |
|---|
| 437 | |
|---|
| 438 | return new_journal; |
|---|
| 439 | } |
|---|
| 440 | static void etog_convert_references(const Embl& embl, GenBank& gbk) { |
|---|
| 441 | // Convert reference from EMBL to GenBank format. |
|---|
| 442 | int indi, len, start, end; |
|---|
| 443 | char temp[LONGTEXT]; |
|---|
| 444 | |
|---|
| 445 | gbk.resize_refs(embl.get_refcount()); |
|---|
| 446 | |
|---|
| 447 | for (indi = 0; indi < embl.get_refcount(); indi++) { |
|---|
| 448 | const Emblref& ref = embl.get_ref(indi); |
|---|
| 449 | GenbankRef& gref = gbk.get_ref(indi); |
|---|
| 450 | |
|---|
| 451 | if (has_content(ref.processing) && |
|---|
| 452 | sscanf(ref.processing, "%d %d", &start, &end) == 2) |
|---|
| 453 | { |
|---|
| 454 | end *= -1; // will get negative from sscanf |
|---|
| 455 | sprintf(temp, "%d (bases %d to %d)\n", (indi + 1), start, end); |
|---|
| 456 | } |
|---|
| 457 | else { |
|---|
| 458 | sprintf(temp, "%d\n", (indi + 1)); |
|---|
| 459 | } |
|---|
| 460 | |
|---|
| 461 | freedup(gref.ref, temp); |
|---|
| 462 | |
|---|
| 463 | if (has_content(ref.title) && ref.title[0] != ';') { |
|---|
| 464 | // remove '"' and ';', if there is any |
|---|
| 465 | len = str0len(ref.title); |
|---|
| 466 | if (len > 2 && ref.title[0] == '"' && ref.title[len - 2] == ';' && ref.title[len - 3] == '"') { |
|---|
| 467 | ref.title[len - 3] = '\n'; |
|---|
| 468 | ref.title[len - 2] = '\0'; |
|---|
| 469 | freedup(gref.title, ref.title+1); |
|---|
| 470 | ref.title[len - 3] = '"'; |
|---|
| 471 | ref.title[len - 2] = ';'; |
|---|
| 472 | } |
|---|
| 473 | else { |
|---|
| 474 | freedup(gref.title, ref.title); |
|---|
| 475 | } |
|---|
| 476 | } |
|---|
| 477 | else { |
|---|
| 478 | freeset(gref.title, no_content()); |
|---|
| 479 | } |
|---|
| 480 | |
|---|
| 481 | freeset(gref.author, has_content(ref.author) ? etog_author(ref.author) : no_content()); |
|---|
| 482 | freeset(gref.journal, has_content(ref.journal) ? etog_journal(ref.journal) : no_content()); |
|---|
| 483 | |
|---|
| 484 | freeset(gref.standard, no_content()); |
|---|
| 485 | } |
|---|
| 486 | } |
|---|
| 487 | |
|---|
| 488 | int etog(const Embl& embl, GenBank& gbk, const Seq& seq) { // __ATTR__USERESULT |
|---|
| 489 | // Convert from embl to genbank format. |
|---|
| 490 | int indi; |
|---|
| 491 | char key[TOKENSIZE], temp[LONGTEXT]; |
|---|
| 492 | char t1[TOKENSIZE], t2[TOKENSIZE], t3[TOKENSIZE]; |
|---|
| 493 | |
|---|
| 494 | embl_key_word(embl.ID, 0, key); |
|---|
| 495 | if (has_content(embl.dr)) { |
|---|
| 496 | // get short_id from DR line if there is RDP def. |
|---|
| 497 | strcpy(t3, "dummy"); |
|---|
| 498 | ASSERT_RESULT(int, 3, sscanf(embl.dr, "%s %s %s", t1, t2, t3)); |
|---|
| 499 | if (str_equal(t1, "RDP;")) { |
|---|
| 500 | if (!str_equal(t3, "dummy")) { |
|---|
| 501 | strcpy(key, t3); |
|---|
| 502 | } |
|---|
| 503 | else |
|---|
| 504 | strcpy(key, t2); |
|---|
| 505 | key[str0len(key) - 1] = '\0'; // remove '.' |
|---|
| 506 | } |
|---|
| 507 | } |
|---|
| 508 | strcpy(temp, key); |
|---|
| 509 | |
|---|
| 510 | // LOCUS |
|---|
| 511 | for (indi = str0len(temp); indi < 13; temp[indi++] = ' ') {} |
|---|
| 512 | { |
|---|
| 513 | const char *date = has_content(embl.dateu) ? embl.dateu : today_date(); |
|---|
| 514 | sprintf((temp + 10), "%7d bp RNA RNA %s\n", |
|---|
| 515 | seq.get_len(), |
|---|
| 516 | genbank_date(date)); |
|---|
| 517 | } |
|---|
| 518 | freedup(gbk.locus, temp); |
|---|
| 519 | |
|---|
| 520 | // DEFINITION |
|---|
| 521 | if (copy_content(gbk.definition, embl.description)) terminate_with(gbk.definition, '.'); |
|---|
| 522 | |
|---|
| 523 | // SOURCE and DEFINITION if not yet defined |
|---|
| 524 | if (copy_content(gbk.source, embl.os)) { |
|---|
| 525 | freedup(gbk.organism, embl.os); |
|---|
| 526 | if (!has_content(embl.description)) { |
|---|
| 527 | freedup(gbk.definition, embl.os); |
|---|
| 528 | } |
|---|
| 529 | } |
|---|
| 530 | |
|---|
| 531 | // COMMENT GenBank entry |
|---|
| 532 | copy_content(gbk.accession, embl.accession); |
|---|
| 533 | if (has_content(embl.keywords) && embl.keywords[0] != '.') { |
|---|
| 534 | freedup(gbk.keywords, embl.keywords); |
|---|
| 535 | } |
|---|
| 536 | |
|---|
| 537 | etog_convert_references(embl, gbk); |
|---|
| 538 | gbk.comments.set_content_from(embl.comments); |
|---|
| 539 | |
|---|
| 540 | return 1; |
|---|
| 541 | } |
|---|
| 542 | |
|---|
| 543 | int etom(const Embl& embl, Macke& macke, const Seq& seq) { // __ATTR__USERESULT |
|---|
| 544 | // Convert from embl format to Macke format. |
|---|
| 545 | GenBank gbk; |
|---|
| 546 | return etog(embl, gbk, seq) && gtom(gbk, macke); |
|---|
| 547 | } |
|---|
| 548 | |
|---|
| 549 | // -------------------------------------------------------------------------------- |
|---|
| 550 | |
|---|
| 551 | #ifdef UNIT_TESTS |
|---|
| 552 | #include <test_unit.h> |
|---|
| 553 | |
|---|
// Convert EMBL journal 'i' using etog_journal() and expect GenBank journal 'o'.
// (the conversion is run on a heap copy of 'i')
#define TEST_EXPECT_ETOG_JOURNAL_PARSES(i,o)            \
    do {                                                \
        char *input = ARB_strdup(i);                    \
        char *res   = etog_journal(input);              \
        TEST_EXPECT_EQUAL(res, o);                      \
        free(res);                                      \
        free(input);                                    \
    } while (0)
|---|
| 562 | |
|---|
| 563 | void TEST_BASIC_etog_journal() { |
|---|
| 564 | // behavior documented in r6943: |
|---|
| 565 | TEST_EXPECT_ETOG_JOURNAL_PARSES("Gene 134:283-287(1993).\n", |
|---|
| 566 | "Gene 134, 283-287 (1993)\n"); |
|---|
| 567 | TEST_EXPECT_ETOG_JOURNAL_PARSES("J. Exp. Med. 179:1809-1821(1994).\n", |
|---|
| 568 | "J. Exp. Med. 179, 1809-1821 (1994)\n"); |
|---|
| 569 | TEST_EXPECT_ETOG_JOURNAL_PARSES("Unpublished whatever.\n", |
|---|
| 570 | "Unpublished whatever\n"); |
|---|
| 571 | TEST_EXPECT_ETOG_JOURNAL_PARSES("bla bla bla.\n", |
|---|
| 572 | "\n"); // skips if can't parse |
|---|
| 573 | TEST_EXPECT_ETOG_JOURNAL_PARSES("bla bla bla\n", |
|---|
| 574 | "\n"); |
|---|
| 575 | } |
|---|
| 576 | |
|---|
| 577 | #endif // UNIT_TESTS |
|---|
| 578 | |
|---|
| 579 | static char *gtoe_author(char *author) { |
|---|
| 580 | // Convert GenBank author to EMBL author. |
|---|
| 581 | int indi, len, index, odd; |
|---|
| 582 | char *auth, *Str; |
|---|
| 583 | |
|---|
| 584 | // replace " and " by ", " |
|---|
| 585 | auth = nulldup(author); |
|---|
| 586 | if ((index = find_pattern(auth, " and ")) > 0) { |
|---|
| 587 | auth[index] = '\0'; |
|---|
| 588 | Str = nulldup(auth); |
|---|
| 589 | auth[index] = ' '; // remove '\0' for free space later |
|---|
| 590 | Append(Str, ","); |
|---|
| 591 | Append(Str, auth + index + 4); |
|---|
| 592 | } |
|---|
| 593 | else |
|---|
| 594 | Str = nulldup(author); |
|---|
| 595 | |
|---|
| 596 | for (indi = 0, len = str0len(Str), odd = 1; indi < len; indi++) { |
|---|
| 597 | if (Str[indi] == ',') { |
|---|
| 598 | if (odd) { |
|---|
| 599 | Str[indi] = ' '; |
|---|
| 600 | odd = 0; |
|---|
| 601 | } |
|---|
| 602 | else { |
|---|
| 603 | odd = 1; |
|---|
| 604 | } |
|---|
| 605 | } |
|---|
| 606 | } |
|---|
| 607 | |
|---|
| 608 | freenull(auth); |
|---|
| 609 | return Str; |
|---|
| 610 | } |
|---|
| 611 | static char *gtoe_journal(char *Str) { |
|---|
| 612 | // Convert GenBank journal to EMBL journal. |
|---|
| 613 | char token[TOKENSIZE], *journal; |
|---|
| 614 | int indi, indj, index, len; |
|---|
| 615 | |
|---|
| 616 | if (scan_token(token, Str)) { |
|---|
| 617 | if (str_equal(token, "(in)") == 1 || str_equal(token, "Unpublished") || str_equal(token, "Submitted")) { |
|---|
| 618 | journal = nulldup(Str); |
|---|
| 619 | terminate_with(journal, '.'); |
|---|
| 620 | return journal; |
|---|
| 621 | } |
|---|
| 622 | } |
|---|
| 623 | |
|---|
| 624 | journal = nulldup(Str); |
|---|
| 625 | for (indi = indj = index = 0, len = str0len(journal); indi < len; indi++, indj++) { |
|---|
| 626 | if (journal[indi] == ',') { |
|---|
| 627 | journal[indi] = ':'; |
|---|
| 628 | indi++; // skip blank after ',' |
|---|
| 629 | index = 1; |
|---|
| 630 | } |
|---|
| 631 | else if (journal[indi] == ' ' && index) { |
|---|
| 632 | indj--; |
|---|
| 633 | } |
|---|
| 634 | else |
|---|
| 635 | journal[indj] = journal[indi]; |
|---|
| 636 | } |
|---|
| 637 | |
|---|
| 638 | journal[indj] = '\0'; |
|---|
| 639 | terminate_with(journal, '.'); |
|---|
| 640 | return journal; |
|---|
| 641 | } |
|---|
| 642 | static void gtoe_reference(const GenBank& gbk, Embl& embl) { |
|---|
| 643 | // Convert references from GenBank to EMBL. |
|---|
| 644 | if (gbk.has_refs()) { |
|---|
| 645 | embl.resize_refs(gbk.get_refcount()); |
|---|
| 646 | } |
|---|
| 647 | |
|---|
| 648 | for (int indi = 0; indi < gbk.get_refcount(); indi++) { |
|---|
| 649 | Emblref& ref = embl.get_ref(indi); |
|---|
| 650 | const GenbankRef& gref = gbk.get_ref(indi); |
|---|
| 651 | |
|---|
| 652 | freedup(ref.title, gref.title); |
|---|
| 653 | embl_correct_title(ref); |
|---|
| 654 | |
|---|
| 655 | freeset(ref.journal, gtoe_journal(gref.journal)); |
|---|
| 656 | terminate_with(ref.journal, '.'); |
|---|
| 657 | |
|---|
| 658 | freeset(ref.author, gtoe_author(gref.author)); |
|---|
| 659 | terminate_with(ref.author, ';'); |
|---|
| 660 | |
|---|
| 661 | // create processing information |
|---|
| 662 | int refnum, start = 0, end = 0; |
|---|
| 663 | char t1[TOKENSIZE], t2[TOKENSIZE], t3[TOKENSIZE]; |
|---|
| 664 | |
|---|
| 665 | if (!gref.ref || sscanf(gref.ref, "%d %s %d %s %d %s", &refnum, t1, &start, t2, &end, t3) != 6) { |
|---|
| 666 | start = 0; |
|---|
| 667 | end = 0; |
|---|
| 668 | } |
|---|
| 669 | |
|---|
| 670 | freenull(ref.processing); |
|---|
| 671 | if (start || end) ref.processing = strf("%d-%d\n", start, end); |
|---|
| 672 | else ref.processing = no_content(); |
|---|
| 673 | |
|---|
| 674 | } |
|---|
| 675 | } |
|---|
| 676 | |
|---|
| 677 | int gtoe(const GenBank& gbk, Embl& embl, const Seq& seq) { // __ATTR__USERESULT |
|---|
| 678 | // Genbank to EMBL. |
|---|
| 679 | { |
|---|
| 680 | char temp[LONGTEXT]; |
|---|
| 681 | strcpy(temp, gbk.get_id()); |
|---|
| 682 | |
|---|
| 683 | upcase(temp); // Adjust short-id, EMBL short_id always upper case |
|---|
| 684 | for (int indi = min(str0len(temp), 9); indi < 10; indi++) |
|---|
| 685 | temp[indi] = ' '; |
|---|
| 686 | |
|---|
| 687 | sprintf(temp + 10, "preliminary; RNA; UNA; %d BP.\n", seq.get_len()); |
|---|
| 688 | freedup(embl.ID, temp); |
|---|
| 689 | } |
|---|
| 690 | |
|---|
| 691 | // accession number |
|---|
| 692 | if (has_content(gbk.accession)) |
|---|
| 693 | // take just the accession num, no version num. |
|---|
| 694 | freedup(embl.accession, gbk.accession); |
|---|
| 695 | |
|---|
| 696 | // date |
|---|
| 697 | { |
|---|
| 698 | char *date = gbk.get_date(); |
|---|
| 699 | |
|---|
| 700 | freeset(embl.dateu, strf("%s (Rel. 1, Last updated, Version 1)\n", date)); |
|---|
| 701 | freeset(embl.datec, strf("%s (Rel. 1, Created)\n", date)); |
|---|
| 702 | |
|---|
| 703 | free(date); |
|---|
| 704 | } |
|---|
| 705 | |
|---|
| 706 | // description |
|---|
| 707 | copy_content(embl.description, gbk.definition); |
|---|
| 708 | // EMBL KW line |
|---|
| 709 | if (copy_content(embl.keywords, gbk.keywords)) { |
|---|
| 710 | terminate_with(embl.keywords, '.'); |
|---|
| 711 | } |
|---|
| 712 | else { |
|---|
| 713 | freedup(embl.keywords, ".\n"); |
|---|
| 714 | } |
|---|
| 715 | |
|---|
| 716 | copy_content(embl.os, gbk.organism); // EMBL OS line |
|---|
| 717 | // reference |
|---|
| 718 | gtoe_reference(gbk, embl); |
|---|
| 719 | |
|---|
| 720 | // EMBL DR line |
|---|
| 721 | { |
|---|
| 722 | char token[TOKENSIZE]; |
|---|
| 723 | char temp[LONGTEXT]; |
|---|
| 724 | |
|---|
| 725 | scan_token_or_die(token, gbk.locus); // short_id |
|---|
| 726 | if (has_content(gbk.comments.seqinf.RDPid)) { |
|---|
| 727 | char rdpid[TOKENSIZE]; |
|---|
| 728 | scan_token_or_die(rdpid, gbk.comments.seqinf.RDPid); |
|---|
| 729 | sprintf(temp, "RDP; %s; %s.\n", rdpid, token); |
|---|
| 730 | } |
|---|
| 731 | else { |
|---|
| 732 | sprintf(temp, "RDP; %s.\n", token); |
|---|
| 733 | } |
|---|
| 734 | freedup(embl.dr, temp); |
|---|
| 735 | } |
|---|
| 736 | embl.comments.set_content_from(gbk.comments); |
|---|
| 737 | |
|---|
| 738 | return 1; |
|---|
| 739 | } |
|---|
| 740 | |
|---|
| 741 | static int partial_mtoe(const Macke& macke, Embl& embl) { |
|---|
| 742 | // Handle subspecies information when converting from Macke to EMBL. |
|---|
| 743 | char*& others = embl.comments.others; |
|---|
| 744 | |
|---|
| 745 | if (has_content(macke.strain)) { |
|---|
| 746 | int ridx = skip_pattern(others, "*source:"); |
|---|
| 747 | bool have_strain = ridx >= 0 && stristr(others+ridx, "strain="); |
|---|
| 748 | |
|---|
| 749 | if (!have_strain) { |
|---|
| 750 | if (!has_content(others)) freenull(others); |
|---|
| 751 | Append(others, "*source: strain="); |
|---|
| 752 | Append(others, macke.strain); |
|---|
| 753 | if (!is_end_mark(others[str0len(others) - 2])) skip_eolnl_and_append(others, ";\n"); |
|---|
| 754 | } |
|---|
| 755 | } |
|---|
| 756 | |
|---|
| 757 | if (has_content(macke.subspecies)) { |
|---|
| 758 | int ridx = skip_pattern(others, "*source:"); |
|---|
| 759 | bool have_subsp = ridx >= 0 && find_subspecies(others+ridx, '=') >= 0; |
|---|
| 760 | |
|---|
| 761 | if (!have_subsp) { |
|---|
| 762 | if (!has_content(others)) freenull(others); |
|---|
| 763 | Append(others, "*source: subspecies="); |
|---|
| 764 | Append(others, macke.subspecies); |
|---|
| 765 | if (!is_end_mark(others[str0len(others) - 2])) skip_eolnl_and_append(others, ";\n"); |
|---|
| 766 | } |
|---|
| 767 | } |
|---|
| 768 | |
|---|
| 769 | return 1; |
|---|
| 770 | } |
|---|
| 771 | |
|---|
| 772 | int mtoe(const Macke& macke, Embl& embl, const Seq& seq) { // __ATTR__USERESULT |
|---|
| 773 | GenBank gbk; |
|---|
| 774 | return mtog(macke, gbk, seq) && gtoe(gbk, embl, seq) && partial_mtoe(macke, embl); |
|---|
| 775 | } |
|---|
| 776 | |
|---|
| 777 | bool EmblSwissprotReader::read_one_entry(Seq& seq) { |
|---|
| 778 | data.reinit(); |
|---|
| 779 | if (!EmblParser(data, seq, *this).parse_entry()) abort(); |
|---|
| 780 | return ok(); |
|---|
| 781 | } |
|---|