| 1 | #!/usr/bin/env python |
|---|
| 2 | |
|---|
| 3 | import sys |
|---|
| 4 | import os |
|---|
| 5 | import shutil |
|---|
| 6 | import datetime |
|---|
| 7 | import time |
|---|
| 8 | import logging |
|---|
| 9 | import multiprocessing |
|---|
| 10 | from string import maketrans |
|---|
| 11 | |
|---|
| 12 | from epac.ete2 import Tree, SeqGroup |
|---|
| 13 | from epac.argparse import ArgumentParser,RawTextHelpFormatter |
|---|
| 14 | from epac.config import EpacConfig,EpacTrainerConfig |
|---|
| 15 | from epac.raxml_util import RaxmlWrapper, FileUtils |
|---|
| 16 | from epac.taxonomy_util import Taxonomy, TaxTreeBuilder |
|---|
| 17 | from epac.json_util import RefJsonBuilder |
|---|
| 18 | from epac.erlang import tree_param |
|---|
| 19 | from epac.msa import hmmer |
|---|
| 20 | from epac.classify_util import TaxTreeHelper |
|---|
| 21 | |
|---|
| 22 | class InputValidator: |
|---|
| 23 | def __init__(self, config, input_tax, input_seqs, verbose=True): |
|---|
| 24 | self.cfg = config |
|---|
| 25 | self.taxonomy = input_tax |
|---|
| 26 | self.alignment = input_seqs |
|---|
| 27 | self.verbose = verbose |
|---|
| 28 | self.dupseq_sets = None |
|---|
| 29 | self.merged_ranks = None |
|---|
| 30 | self.corr_seqid = {} |
|---|
| 31 | self.corr_ranks = {} |
|---|
| 32 | self.gaps_trantab = maketrans("?N", "--") |
|---|
| 33 | |
|---|
| 34 | def validate(self): |
|---|
| 35 | # following two checks are obsolete and disabled by default |
|---|
| 36 | self.check_tax_disbalance() |
|---|
| 37 | self.check_tax_duplicates() |
|---|
| 38 | |
|---|
| 39 | self.check_seq_ids() |
|---|
| 40 | self.check_invalid_chars() |
|---|
| 41 | |
|---|
| 42 | self.check_identical_seqs() |
|---|
| 43 | self.check_identical_ranks() |
|---|
| 44 | |
|---|
| 45 | self.taxonomy.close_taxonomy_gaps() |
|---|
| 46 | |
|---|
| 47 | return self.corr_ranks, self.corr_seqid, self.merged_ranks |
|---|
| 48 | |
|---|
| 49 | def normalize_gaps(self, seq): |
|---|
| 50 | return seq.translate(self.gaps_trantab) |
|---|
| 51 | |
|---|
| 52 | def check_seq_ids(self): |
|---|
| 53 | # check that seq IDs in taxonomy and alignment correspond |
|---|
| 54 | self.mis_ids = [] |
|---|
| 55 | for sid in self.taxonomy.seq_ranks_map.iterkeys(): |
|---|
| 56 | unprefixed_sid = EpacConfig.strip_ref_prefix(sid) |
|---|
| 57 | if not self.alignment.has_seq(unprefixed_sid): |
|---|
| 58 | self.mis_ids.append(unprefixed_sid) |
|---|
| 59 | |
|---|
| 60 | if len(self.mis_ids) > 0 and self.verbose: |
|---|
| 61 | errmsg = "ERROR: Following %d sequence(s) are missing in your alignment file:\n%s\n\n" % (len(self.mis_ids), "\n".join(self.mis_ids)) |
|---|
| 62 | errmsg += "Please make sure sequence IDs in taxonomic annotation file and in alignment are identical!\n" |
|---|
| 63 | self.cfg.exit_user_error(errmsg) |
|---|
| 64 | |
|---|
| 65 | return self.mis_ids |
|---|
| 66 | |
|---|
| 67 | def check_invalid_chars(self): |
|---|
| 68 | # check for invalid characters in rank names |
|---|
| 69 | self.corr_ranks = self.taxonomy.normalize_rank_names() |
|---|
| 70 | |
|---|
| 71 | # check for invalid characters in sequence IDs |
|---|
| 72 | self.corr_seqid = self.taxonomy.normalize_seq_ids() |
|---|
| 73 | |
|---|
| 74 | if self.verbose: |
|---|
| 75 | for old_rank in sorted(self.corr_ranks.keys()): |
|---|
| 76 | self.cfg.log.debug("NOTE: Following rank name contains illegal symbols and was renamed: %s --> %s", old_rank, self.corr_ranks[old_rank]) |
|---|
| 77 | if len(self.corr_ranks) > 0: |
|---|
| 78 | self.cfg.log.debug("") |
|---|
| 79 | for old_sid in sorted(self.corr_seqid.keys()): |
|---|
| 80 | self.cfg.log.debug("NOTE: Following sequence ID contains illegal symbols and was renamed: %s --> %s" , old_sid, self.corr_seqid[old_sid]) |
|---|
| 81 | if len(self.corr_seqid) > 0: |
|---|
| 82 | self.cfg.log.debug("") |
|---|
| 83 | |
|---|
| 84 | return self.corr_ranks, self.corr_seqid |
|---|
| 85 | |
|---|
| 86 | def check_identical_seqs(self): |
|---|
| 87 | seq_hash_map = {} |
|---|
| 88 | for name, seq, comment, sid in self.alignment.iter_entries(): |
|---|
| 89 | ref_seq_name = EpacConfig.REF_SEQ_PREFIX + name |
|---|
| 90 | ref_seq_name = self.corr_seqid.get(ref_seq_name, ref_seq_name) |
|---|
| 91 | if ref_seq_name in self.taxonomy.seq_ranks_map: |
|---|
| 92 | seq_hash = hash(self.normalize_gaps(seq)) |
|---|
| 93 | if seq_hash in seq_hash_map: |
|---|
| 94 | seq_hash_map[seq_hash] += [name] |
|---|
| 95 | else: |
|---|
| 96 | seq_hash_map[seq_hash] = [name] |
|---|
| 97 | |
|---|
| 98 | self.dupseq_count = 0 |
|---|
| 99 | self.dupseq_sets = [] |
|---|
| 100 | for seq_hash, seq_ids in seq_hash_map.iteritems(): |
|---|
| 101 | check_ids = seq_ids[:] |
|---|
| 102 | while len(check_ids) > 1: |
|---|
| 103 | # compare actual sequence strings, to account for a possible hash collision |
|---|
| 104 | seq1 = self.normalize_gaps(self.alignment.get_seq(check_ids[0])) |
|---|
| 105 | coll_ids = [] |
|---|
| 106 | dup_ids = [check_ids[0]] |
|---|
| 107 | for i in range(1, len(check_ids)): |
|---|
| 108 | seq2 = self.normalize_gaps(self.alignment.get_seq(check_ids[i])) |
|---|
| 109 | if seq1 == seq2: |
|---|
| 110 | dup_ids += [check_ids[i]] |
|---|
| 111 | else: |
|---|
| 112 | # collision found, add put seq id on a list to be checked in the next iteration |
|---|
| 113 | coll_ids += [check_ids[i]] |
|---|
| 114 | |
|---|
| 115 | if len(dup_ids) > 1: |
|---|
| 116 | self.dupseq_sets += [dup_ids] |
|---|
| 117 | self.dupseq_count += len(dup_ids) - 1 |
|---|
| 118 | |
|---|
| 119 | check_ids = coll_ids |
|---|
| 120 | |
|---|
| 121 | if self.verbose: |
|---|
| 122 | for dup_ids in self.dupseq_sets: |
|---|
| 123 | self.cfg.log.debug("NOTE: Following sequences are identical: %s", ", ".join(dup_ids)) |
|---|
| 124 | if self.dupseq_count > 0: |
|---|
| 125 | self.cfg.log.debug("\nNOTE: Found %d sequence duplicates", self.dupseq_count) |
|---|
| 126 | |
|---|
| 127 | return self.dupseq_count, self.dupseq_sets |
|---|
| 128 | |
|---|
| 129 | def check_identical_ranks(self): |
|---|
| 130 | if not self.dupseq_sets: |
|---|
| 131 | self.check_identical_seqs() |
|---|
| 132 | self.merged_ranks = {} |
|---|
| 133 | for dup_ids in self.dupseq_sets: |
|---|
| 134 | if len(dup_ids) > 1: |
|---|
| 135 | duprank_map = {} |
|---|
| 136 | for seq_name in dup_ids: |
|---|
| 137 | rank_id = self.taxonomy.seq_rank_id(seq_name) |
|---|
| 138 | duprank_map[rank_id] = duprank_map.get(rank_id, 0) + 1 |
|---|
| 139 | if len(duprank_map) > 1 and self.cfg.debug: |
|---|
| 140 | self.cfg.log.debug("Ranks sharing duplicates: %s\n", str(duprank_map)) |
|---|
| 141 | dup_ranks = [] |
|---|
| 142 | for rank_id, count in duprank_map.iteritems(): |
|---|
| 143 | if count > self.cfg.taxa_ident_thres * self.taxonomy.get_rank_seq_count(rank_id): |
|---|
| 144 | dup_ranks += [rank_id] |
|---|
| 145 | if len(dup_ranks) > 1: |
|---|
| 146 | prefix = "__TAXCLUSTER%d__" % (len(self.merged_ranks) + 1) |
|---|
| 147 | merged_rank_id = self.taxonomy.merge_ranks(dup_ranks, prefix) |
|---|
| 148 | self.merged_ranks[merged_rank_id] = dup_ranks |
|---|
| 149 | |
|---|
| 150 | if self.verbose: |
|---|
| 151 | merged_count = 0 |
|---|
| 152 | for merged_rank_id, dup_ranks in self.merged_ranks.iteritems(): |
|---|
| 153 | dup_ranks_str = "\n".join([Taxonomy.rank_uid_to_lineage_str(rank_id) for rank_id in dup_ranks]) |
|---|
| 154 | self.cfg.log.warning("\nWARNING: Following taxa share >%.0f%% indentical sequences und thus considered indistinguishable:\n%s", self.cfg.taxa_ident_thres * 100, dup_ranks_str) |
|---|
| 155 | merged_rank_str = Taxonomy.rank_uid_to_lineage_str(merged_rank_id) |
|---|
| 156 | self.cfg.log.warning("For the purpose of mislabels identification, they were merged into one taxon:\n%s\n", merged_rank_str) |
|---|
| 157 | merged_count += len(dup_ranks) |
|---|
| 158 | |
|---|
| 159 | if merged_count > 0: |
|---|
| 160 | self.cfg.log.warning("WARNING: %d indistinguishable taxa have been merged into %d clusters.\n", merged_count, len(self.merged_ranks)) |
|---|
| 161 | |
|---|
| 162 | return self.merged_ranks |
|---|
| 163 | |
|---|
| 164 | def check_tax_disbalance(self): |
|---|
| 165 | # make sure we don't taxonomy "irregularities" (more than 7 ranks or missing ranks in the middle) |
|---|
| 166 | action = self.cfg.wrong_rank_count |
|---|
| 167 | if action != "ignore": |
|---|
| 168 | autofix = action == "autofix" |
|---|
| 169 | errs = self.taxonomy.check_for_disbalance(autofix) |
|---|
| 170 | if len(errs) > 0: |
|---|
| 171 | if action == "autofix": |
|---|
| 172 | print "WARNING: %d sequences with invalid annotation (missing/redundant ranks) found and were fixed as follows:\n" % len(errs) |
|---|
| 173 | for err in errs: |
|---|
| 174 | print "Original: %s\t%s" % (err[0], err[1]) |
|---|
| 175 | print "Fixed as: %s\t%s\n" % (err[0], err[2]) |
|---|
| 176 | elif action == "skip": |
|---|
| 177 | print "WARNING: Following %d sequences with invalid annotation (missing/redundant ranks) were skipped:\n" % len(errs) |
|---|
| 178 | for err in errs: |
|---|
| 179 | self.taxonomy.remove_seq(err[0]) |
|---|
| 180 | print "%s\t%s" % err |
|---|
| 181 | else: # abort |
|---|
| 182 | print "ERROR: %d sequences with invalid annotation (missing/redundant ranks) found:\n" % len(errs) |
|---|
| 183 | for err in errs: |
|---|
| 184 | print "%s\t%s" % err |
|---|
| 185 | print "\nPlease fix them manually (add/remove ranks) and run the pipeline again (or use -wrong-rank-count autofix option)" |
|---|
| 186 | print "NOTE: Only standard 7-level taxonomies are supported at the moment. Although missing trailing ranks (e.g. species) are allowed," |
|---|
| 187 | print "missing intermediate ranks (e.g. family) or sublevels (e.g. suborder) are not!\n" |
|---|
| 188 | self.cfg.exit_user_error() |
|---|
| 189 | |
|---|
| 190 | def check_tax_duplicates(self): |
|---|
| 191 | # check for duplicate rank names |
|---|
| 192 | action = self.cfg.dup_rank_names |
|---|
| 193 | if action != "ignore": |
|---|
| 194 | autofix = action == "autofix" |
|---|
| 195 | dups = self.taxonomy.check_for_duplicates(autofix) |
|---|
| 196 | if len(dups) > 0: |
|---|
| 197 | if action == "autofix": |
|---|
| 198 | print "WARNING: %d sequences with duplicate rank names found and were renamed as follows:\n" % len(dups) |
|---|
| 199 | for dup in dups: |
|---|
| 200 | print "Original: %s\t%s" % (dup[0], dup[1]) |
|---|
| 201 | print "Duplicate: %s\t%s" % (dup[2], dup[3]) |
|---|
| 202 | print "Renamed to: %s\t%s\n" % (dup[2], dup[4]) |
|---|
| 203 | elif action == "skip": |
|---|
| 204 | print "WARNING: Following %d sequences with duplicate rank names were skipped:\n" % len(dups) |
|---|
| 205 | for dup in dups: |
|---|
| 206 | self.taxonomy.remove_seq(dup[2]) |
|---|
| 207 | print "%s\t%s\n" % (dup[2], dup[3]) |
|---|
| 208 | else: # abort |
|---|
| 209 | print "ERROR: %d sequences with duplicate rank names found:\n" % len(dups) |
|---|
| 210 | for dup in dups: |
|---|
| 211 | print "%s\t%s\n%s\t%s\n" % dup |
|---|
| 212 | print "Please fix (rename) them and run the pipeline again (or use -dup-rank-names autofix option)" |
|---|
| 213 | self.cfg.exit_user_error() |
|---|
| 214 | |
|---|
class RefTreeBuilder:
    """Builds the EPA reference package from a taxonomy and an alignment.

    Pipeline (see build_ref_tree() for the canonical order): load taxonomy
    and alignment, validate them, build a multifurcating taxonomy tree,
    resolve multifurcations with constrained RAxML searches, label branches
    via a dummy EPA run, map branches to taxonomic ranks, and dump everything
    into a reference JSON file.

    The methods are strictly order-dependent: each step stores its results
    in instance attributes consumed by later steps.  Python 2 code
    (print statements, dict.iteritems).
    """

    def __init__(self, config):
        """Store the config and precompute RAxML job names and temp file names.

        config -- EpacConfig-like object; %NAME% placeholders are substituted
                  with the run name by subst_name()/tmp_fname().
        """
        self.cfg = config
        self.mfresolv_job_name = self.cfg.subst_name("mfresolv_%NAME%")
        self.epalbl_job_name = self.cfg.subst_name("epalbl_%NAME%")
        self.optmod_job_name = self.cfg.subst_name("optmod_%NAME%")
        self.raxml_wrapper = RaxmlWrapper(config)

        # temporary artifacts produced/consumed along the pipeline
        self.outgr_fname = self.cfg.tmp_fname("%NAME%_outgr.tre")          # outgroup subtree for re-rooting
        self.reftree_mfu_fname = self.cfg.tmp_fname("%NAME%_mfu.tre")      # multifurcating taxonomy tree
        self.reftree_bfu_fname = self.cfg.tmp_fname("%NAME%_bfu.tre")      # bifurcating ML tree
        self.optmod_fname = self.cfg.tmp_fname("%NAME%.opt")               # binary model parameters
        self.lblalign_fname = self.cfg.tmp_fname("%NAME%_lblq.fa")         # alignment + dummy query for EPA labeling
        self.reftree_lbl_fname = self.cfg.tmp_fname("%NAME%_lbl.tre")      # EPA branch-labeled tree (debug)
        self.reftree_tax_fname = self.cfg.tmp_fname("%NAME%_tax.tre")      # taxonomically annotated tree (debug)
        self.brmap_fname = self.cfg.tmp_fname("%NAME%_map.txt")            # branch-id -> rank map (debug)

    def load_alignment(self):
        """Load the input alignment, guessing its format by trial and error.

        Tries fasta and the phylip variants in turn; exits with a user error
        if none of them parses.  Sets self.input_seqs.
        """
        in_file = self.cfg.align_fname
        self.input_seqs = None
        formats = ["fasta", "phylip_relaxed", "iphylip_relaxed", "phylip", "iphylip"]
        for fmt in formats:
            try:
                self.input_seqs = SeqGroup(sequences=in_file, format = fmt)
                break
            except:
                # parse failure just means "not this format"; try the next one
                self.cfg.log.debug("Guessing input format: not " + fmt)
        if self.input_seqs == None:
            self.cfg.exit_user_error("Invalid input file format: %s\nThe supported input formats are fasta and phylip" % in_file)

    def validate_taxonomy(self):
        """Run InputValidator over the loaded taxonomy and alignment.

        Keeps the validator (with its rename/merge maps) in
        self.input_validator for later use by export/write steps.
        """
        self.input_validator = InputValidator(self.cfg, self.taxonomy, self.input_seqs)
        self.input_validator.validate()

    def build_multif_tree(self):
        """Build the multifurcating tree from the taxonomy.

        Sets self.reftree_multif, self.reftree_ids (frozenset of included
        sequence IDs) and self.reftree_size, and lets the config pick
        GAMMA vs. CAT based on the tree size.
        """
        c = self.cfg

        tb = TaxTreeBuilder(c, self.taxonomy)
        (t, ids) = tb.build(c.reftree_min_rank, c.reftree_max_seqs_per_leaf, c.reftree_clades_to_include, c.reftree_clades_to_ignore)
        self.reftree_ids = frozenset(ids)
        self.reftree_size = len(ids)
        self.reftree_multif = t

        # IMPORTANT: select GAMMA or CAT model based on tree size!
        self.cfg.resolve_auto_settings(self.reftree_size)

        if self.cfg.debug:
            refseq_fname = self.cfg.tmp_fname("%NAME%_seq_ids.txt")
            # list of sequence ids which comprise the reference tree
            with open(refseq_fname, "w") as f:
                for sid in ids:
                    f.write("%s\n" % sid)

            # original tree with taxonomic ranks as internal node labels
            reftax_fname = self.cfg.tmp_fname("%NAME%_mfu_tax.tre")
            t.write(outfile=reftax_fname, format=8)
            # t.show()

    def export_ref_alignment(self):
        """This function transforms the input alignment in the following way:
        1. Filter out sequences which are not part of the reference tree
        2. Add sequence name prefix (r_)"""

        self.refalign_fname = self.cfg.tmp_fname("%NAME%_matrix.afa")
        with open(self.refalign_fname, "w") as fout:
            for name, seq, comment, sid in self.input_seqs.iter_entries():
                seq_name = EpacConfig.REF_SEQ_PREFIX + name
                # apply seq-ID renames recorded by the validator
                if seq_name in self.input_validator.corr_seqid:
                    seq_name = self.input_validator.corr_seqid[seq_name]
                if seq_name in self.reftree_ids:
                    fout.write(">" + seq_name + "\n" + seq + "\n")

        # we do not need the original alignment anymore, so free its memory
        self.input_seqs = None

    def export_ref_taxonomy(self):
        """Restrict the taxonomy map to reference-tree sequences.

        Sets self.taxonomy_map {seq_id: ranks}; in debug mode also dumps it
        to a tab-separated text file.
        """
        self.taxonomy_map = {}

        for sid, ranks in self.taxonomy.iteritems():
            if sid in self.reftree_ids:
                self.taxonomy_map[sid] = ranks

        if self.cfg.debug:
            tax_fname = self.cfg.tmp_fname("%NAME%_tax.txt")
            with open(tax_fname, "w") as fout:
                for sid, ranks in self.taxonomy_map.iteritems():
                    ranks_str = self.taxonomy.seq_lineage_str(sid)
                    fout.write(sid + "\t" + ranks_str + "\n")

    def save_rooting(self):
        """Save the outgroup for later re-rooting and write the unrooted tree.

        Derives the outgroup from the taxonomy-rooted multifurcating tree
        (via TaxTreeHelper), saves it to self.outgr_fname, then writes the
        tree without root unifurcation / internal labels for RAxML input.
        """
        rt = self.reftree_multif

        tax_map = self.taxonomy.get_map()
        self.taxtree_helper = TaxTreeHelper(self.cfg, tax_map)
        self.taxtree_helper.set_mf_rooted_tree(rt)
        outgr = self.taxtree_helper.get_outgroup()
        outgr_size = len(outgr.get_leaves())
        outgr.write(outfile=self.outgr_fname, format=9)
        self.reftree_outgroup = outgr
        self.cfg.log.debug("Outgroup for rooting was saved to: %s, outgroup size: %d", self.outgr_fname, outgr_size)

        # remove unifurcation at the root
        if len(rt.children) == 1:
            rt = rt.children[0]

        # now we can safely unroot the tree and remove internal node labels to make it suitable for raxml
        rt.write(outfile=self.reftree_mfu_fname, format=9)

    # RAxML call to convert multifurcating tree to the strictly bifurcating one
    def resolve_multif(self):
        """Resolve multifurcations via constrained RAxML search and optimize the model.

        Produces self.reftree_bfu_fname (best bifurcating tree),
        self.optmod_fname (binary model file), self.reftree_loglh and the
        RAxML invocation strings.  Honors cfg.restart (reuse existing
        results), cfg.mfresolv_method (thorough/fast/ultrafast) and
        cfg.reopt_model.  Fatal-exits if a RAxML run fails.
        """
        self.cfg.log.debug("\nReducing the alignment: \n")
        self.reduced_refalign_fname = self.raxml_wrapper.reduce_alignment(self.refalign_fname)

        self.cfg.log.debug("\nConstrained ML inference: \n")
        # -g: use the multifurcating tree as a topological constraint
        raxml_params = ["-s", self.reduced_refalign_fname, "-g", self.reftree_mfu_fname, "--no-seq-check", "-N", str(self.cfg.rep_num)]
        if self.cfg.mfresolv_method == "fast":
            raxml_params += ["-D"]
        elif self.cfg.mfresolv_method == "ultrafast":
            # evaluate-only mode: no topology search, just model/brlen optimization
            raxml_params += ["-f", "e"]
        if self.cfg.restart and self.raxml_wrapper.result_exists(self.mfresolv_job_name):
            self.invocation_raxml_multif = self.raxml_wrapper.get_invocation_str(self.mfresolv_job_name)
            self.cfg.log.debug("\nUsing existing ML tree found in: %s\n", self.raxml_wrapper.result_fname(self.mfresolv_job_name))
        else:
            self.invocation_raxml_multif = self.raxml_wrapper.run(self.mfresolv_job_name, raxml_params)
#            self.invocation_raxml_multif = self.raxml_wrapper.run_multiple(self.mfresolv_job_name, raxml_params, self.cfg.rep_num)
            if self.cfg.mfresolv_method == "ultrafast":
                # "-f e" produces a result tree but no best tree; copy it over
                self.raxml_wrapper.copy_result_tree(self.mfresolv_job_name, self.raxml_wrapper.besttree_fname(self.mfresolv_job_name))

        if self.raxml_wrapper.besttree_exists(self.mfresolv_job_name):
            if not self.cfg.reopt_model:
                # reuse model parameters from the resolution run as-is
                self.raxml_wrapper.copy_best_tree(self.mfresolv_job_name, self.reftree_bfu_fname)
                self.raxml_wrapper.copy_optmod_params(self.mfresolv_job_name, self.optmod_fname)
                self.invocation_raxml_optmod = ""
                job_name = self.mfresolv_job_name
            else:
                bfu_fname = self.raxml_wrapper.besttree_fname(self.mfresolv_job_name)
                job_name = self.optmod_job_name

                # RAxML call to optimize model parameters and write them down to the binary model file
                self.cfg.log.debug("\nOptimizing model parameters: \n")
                raxml_params = ["-f", "e", "-s", self.reduced_refalign_fname, "-t", bfu_fname, "--no-seq-check"]
                if self.cfg.raxml_model.startswith("GTRCAT") and not self.cfg.compress_patterns:
                    raxml_params += ["-H"]
                if self.cfg.restart and self.raxml_wrapper.result_exists(self.optmod_job_name):
                    self.invocation_raxml_optmod = self.raxml_wrapper.get_invocation_str(self.optmod_job_name)
                    self.cfg.log.debug("\nUsing existing optimized tree and parameters found in: %s\n", self.raxml_wrapper.result_fname(self.optmod_job_name))
                else:
                    self.invocation_raxml_optmod = self.raxml_wrapper.run(self.optmod_job_name, raxml_params)
                if self.raxml_wrapper.result_exists(self.optmod_job_name):
                    self.raxml_wrapper.copy_result_tree(self.optmod_job_name, self.reftree_bfu_fname)
                    self.raxml_wrapper.copy_optmod_params(self.optmod_job_name, self.optmod_fname)
                else:
                    errmsg = "RAxML run failed (model optimization), please examine the log for details: %s" \
                            % self.raxml_wrapper.make_raxml_fname("output", self.optmod_job_name)
                    self.cfg.exit_fatal_error(errmsg)

            if self.cfg.raxml_model.startswith("GTRCAT"):
                mod_name = "CAT"
            else:
                mod_name = "GAMMA"
            self.reftree_loglh = self.raxml_wrapper.get_tree_lh(job_name, mod_name)
            self.cfg.log.debug("\n%s-based logLH of the reference tree: %f\n" % (mod_name, self.reftree_loglh))

        else:
            errmsg = "RAxML run failed (mutlifurcation resolution), please examine the log for details: %s" \
                    % self.raxml_wrapper.make_raxml_fname("output", self.mfresolv_job_name)
            self.cfg.exit_fatal_error(errmsg)

    def load_reduced_refalign(self):
        """Load the RAxML-reduced alignment (fasta or relaxed phylip).

        Sets self.reduced_refalign_seqs; fatal-exits if neither format parses.
        """
        formats = ["fasta", "phylip_relaxed"]
        for fmt in formats:
            try:
                self.reduced_refalign_seqs = SeqGroup(sequences=self.reduced_refalign_fname, format = fmt)
                break
            except:
                # wrong format guess; try the next one
                pass
        if self.reduced_refalign_seqs == None:
            errmsg = "FATAL ERROR: Invalid input file format in %s! (load_reduced_refalign)" % self.reduced_refalign_fname
            self.cfg.exit_fatal_error(errmsg)

    # dummy EPA run to label the branches of the reference tree, which we need to build a mapping to tax ranks
    def epa_branch_labeling(self):
        """Run EPA with a single dummy query to obtain branch labels.

        Sets self.reftree_lbl_str (labeled newick), self.raxml_version and
        the EPA invocation string; fatal-exits if the EPA run fails.
        """
        # create alignment with dummy query seq
        self.refalign_width = len(self.reduced_refalign_seqs.get_seqbyid(0))
        self.reduced_refalign_seqs.write(format="fasta", outfile=self.lblalign_fname)

        with open(self.lblalign_fname, "a") as fout:
            fout.write(">" + "DUMMY131313" + "\n")
            fout.write("A"*self.refalign_width + "\n")

        # TODO always load model regardless of the config file settings?
        epa_result = self.raxml_wrapper.run_epa(self.epalbl_job_name, self.lblalign_fname, self.reftree_bfu_fname, self.optmod_fname, mode="epa_mp")
        self.reftree_lbl_str = epa_result.get_std_newick_tree()
        self.raxml_version = epa_result.get_raxml_version()
        self.invocation_raxml_epalbl = epa_result.get_raxml_invocation()

        if not self.raxml_wrapper.epa_result_exists(self.epalbl_job_name):
            errmsg = "RAxML EPA run failed, please examine the log for details: %s" \
                    % self.raxml_wrapper.make_raxml_fname("output", self.epalbl_job_name)
            self.cfg.exit_fatal_error(errmsg)

    def epa_post_process(self):
        """Re-root and taxonomically annotate the EPA-labeled tree.

        Sets self.reftree_tax (annotated tree) and self.bid_ranks_map
        (branch id -> taxonomy record); in debug mode dumps all three
        artifacts to files.
        """
        lbl_tree = Tree(self.reftree_lbl_str)
        self.taxtree_helper.set_bf_unrooted_tree(lbl_tree)
        self.reftree_tax = self.taxtree_helper.get_tax_tree()
        self.bid_ranks_map = self.taxtree_helper.get_bid_taxonomy_map()

        if self.cfg.debug:
            self.reftree_tax.write(outfile=self.reftree_tax_fname, format=3)
            with open(self.reftree_lbl_fname, "w") as outf:
                outf.write(self.reftree_lbl_str)
            with open(self.brmap_fname, "w") as outf:
                for bid, br_rec in self.bid_ranks_map.iteritems():
                    outf.write("%s\t%s\t%d\t%f\n" % (bid, br_rec[0], br_rec[1], br_rec[2]))

    def calc_node_heights(self):
        """Calculate node heights on the reference tree (used to define branch-length cutoff during classification step)
        Algorithm is as follows:
        Tip node or node resolved to Species level: height = 1
        Inner node resolved to Genus or above: height = min(left_height, right_height) + 1
        """
        nh_map = {}
        dummy_added = False
        for node in self.reftree_tax.traverse("postorder"):
            if not node.is_root():
                if not hasattr(node, "B"):
                    # In a rooted tree, there is always one more node/branch than in unrooted one
                    # That's why one branch will be always not EPA-labelled after the rooting
                    if not dummy_added:
                        node.B = "DDD"
                        dummy_added = True
                        species_rank = Taxonomy.EMPTY_RANK
                    else:
                        errmsg = "FATAL ERROR: More than one tree branch without EPA label (calc_node_heights)"
                        self.cfg.exit_fatal_error(errmsg)
                else:
                    species_rank = self.bid_ranks_map[node.B][-1]
                bid = node.B
                if node.is_leaf() or species_rank != Taxonomy.EMPTY_RANK:
                    nh_map[bid] = 1
                else:
                    # postorder guarantees children were processed already
                    lchild = node.children[0]
                    rchild = node.children[1]
                    nh_map[bid] = min(nh_map[lchild.B], nh_map[rchild.B]) + 1

        # remove heights for dummy nodes, since there won't be any placements on them
        if dummy_added:
            del nh_map["DDD"]

        self.node_height_map = nh_map

    def __get_all_rank_names(self, root):
        """Collect every rank name found in the subtree below root."""
        rnames = set([])
        for node in root.traverse("postorder"):
            ranks = node.ranks
            for rk in ranks:
                rnames.add(rk)
        return rnames

    def mono_index(self):
        """This method will calculate monophyly index by looking at the left and right hand side of the tree"""
        children = self.reftree_tax.children
        if len(children) == 1:
            # skip unifurcations down to the first real split
            while len(children) == 1:
                children = children[0].children
        if len(children) == 2:
            left = children[0]
            right = children[1]
            lset = self.__get_all_rank_names(left)
            rset = self.__get_all_rank_names(right)
            # ranks appearing on both sides of the root are non-monophyletic
            iset = lset & rset
            return iset
        else:
            print("Error: input tree not birfurcating")
            return set([])

    def build_hmm_profile(self, json_builder):
        """Build a HMMER profile from the reduced alignment and register it.

        json_builder -- RefJsonBuilder that receives the profile file name.
        """
        print "Building the HMMER profile...\n"

        # this stupid workaround is needed because RAxML outputs the reduced
        # alignment in relaxed PHYLIP format, which is not supported by HMMER
        refalign_fasta = self.cfg.tmp_fname("%NAME%_ref_reduced.fa")
        self.reduced_refalign_seqs.write(outfile=refalign_fasta)

        hmm = hmmer(self.cfg, refalign_fasta)
        fprofile = hmm.build_hmm_profile()

        json_builder.set_hmm_profile(fprofile)

    def write_json(self):
        """Assemble and dump the reference JSON file (cfg.refjson_fname).

        Gathers tree, branch map, model, sequences, taxonomy, metadata,
        speciation rate and node heights from the preceding pipeline steps.
        """
        jw = RefJsonBuilder()

        jw.set_branch_tax_map(self.bid_ranks_map)
        jw.set_tree(self.reftree_lbl_str)
        jw.set_outgroup(self.reftree_outgroup)
        jw.set_ratehet_model(self.cfg.raxml_model)
        jw.set_tax_tree(self.reftree_multif)
        jw.set_pattern_compression(self.cfg.compress_patterns)
        jw.set_taxcode(self.cfg.taxcode_name)

        jw.set_merged_ranks_map(self.input_validator.merged_ranks)
        # store reverse maps so the classifier can recover the original names
        corr_ranks_reverse = dict((reversed(item) for item in self.input_validator.corr_ranks.items()))
        jw.set_corr_ranks_map(corr_ranks_reverse)
        corr_seqid_reverse = dict((reversed(item) for item in self.input_validator.corr_seqid.items()))
        jw.set_corr_seqid_map(corr_seqid_reverse)

        mdata = { "ref_tree_size": self.reftree_size,
                  "ref_alignment_width": self.refalign_width,
                  "raxml_version": self.raxml_version,
                  "timestamp": str(datetime.datetime.now()),
                  "invocation_epac": self.invocation_epac,
                  "invocation_raxml_multif": self.invocation_raxml_multif,
                  "invocation_raxml_optmod": self.invocation_raxml_optmod,
                  "invocation_raxml_epalbl": self.invocation_raxml_epalbl,
                  "reftree_loglh": self.reftree_loglh
                }
        jw.set_metadata(mdata)

        seqs = self.reduced_refalign_seqs.get_entries()
        jw.set_sequences(seqs)

        if not self.cfg.no_hmmer:
            self.build_hmm_profile(jw)

        orig_tax = self.taxonomy_map
        jw.set_origin_taxonomy(orig_tax)

        self.cfg.log.debug("Calculating the speciation rate...\n")
        tp = tree_param(tree = self.reftree_lbl_str, origin_taxonomy = orig_tax)
        jw.set_rate(tp.get_speciation_rate_fast())
        jw.set_nodes_height(self.node_height_map)

        jw.set_binary_model(self.optmod_fname)

        self.cfg.log.debug("Writing down the reference file...\n")
        jw.dump(self.cfg.refjson_fname)

    # top-level function to build a reference tree
    def build_ref_tree(self):
        """Run the whole training pipeline in order (see class docstring).

        NOTE(review): assumes self.invocation_epac is set by the caller
        before write_json() -- it is not assigned anywhere in this class.
        """
        self.cfg.log.info("=> Loading taxonomy from file: %s ...\n" , self.cfg.taxonomy_fname)
        self.taxonomy = Taxonomy(prefix=EpacConfig.REF_SEQ_PREFIX, tax_fname=self.cfg.taxonomy_fname)
        self.cfg.log.info("==> Loading reference alignment from file: %s ...\n" , self.cfg.align_fname)
        self.load_alignment()
        self.cfg.log.info("===> Validating taxonomy and alignment ...\n")
        self.validate_taxonomy()
        self.cfg.log.info("====> Building a multifurcating tree from taxonomy with %d seqs ...\n" , self.taxonomy.seq_count())
        self.build_multif_tree()
        self.cfg.log.info("=====> Building the reference alignment ...\n")
        self.export_ref_alignment()
        self.export_ref_taxonomy()
        self.cfg.log.info("======> Saving the outgroup for later re-rooting ...\n")
        self.save_rooting()
        self.cfg.log.info("=======> Resolving multifurcation: choosing the best topology from %d independent RAxML runs ...\n" % self.cfg.rep_num)
        self.resolve_multif()
        self.load_reduced_refalign()
        self.cfg.log.info("========> Calling RAxML-EPA to obtain branch labels ...\n")
        self.epa_branch_labeling()
        self.cfg.log.info("=========> Post-processing the EPA tree (re-rooting, taxonomic labeling etc.) ...\n")
        self.epa_post_process()
        self.calc_node_heights()

        self.cfg.log.debug("\n==========> Checking branch labels ...")
        self.cfg.log.debug("shared rank names before training: %s", repr(self.taxonomy.get_common_ranks()))
        self.cfg.log.debug("shared rank names after training: %s\n", repr(self.mono_index()))

        self.cfg.log.info("==========> Saving the reference JSON file: %s\n" % self.cfg.refjson_fname)
        self.write_json()
|---|
def parse_args():
    """Build the command-line parser and return the parsed options.

    Prints usage and exits when fewer than 4 argv tokens are given
    (i.e. the two mandatory options -t and -s cannot both be present).
    """
    argp = ArgumentParser(description="Build a reference tree for EPA taxonomic placement.",
        epilog="Example: ./epa_trainer.py -t example/training_tax.txt -s example/training_seq.fa -n myref",
        formatter_class=RawTextHelpFormatter)

    # mandatory inputs
    argp.add_argument("-t", dest="taxonomy_fname", required=True,
        help="""Reference taxonomy file.""")
    argp.add_argument("-s", dest="align_fname", required=True,
        help="""Reference alignment file. Sequences must be aligned, their IDs must correspond to those
in taxonomy file.""")

    # output locations and run identification
    argp.add_argument("-r", dest="ref_fname",
        help="""Reference output file. It will contain reference alignment, phylogenetic tree and other
information needed for taxonomic placement of query sequences.""")
    argp.add_argument("-T", dest="num_threads", type=int, default=None,
        help="""Specify the number of CPUs. Default: %d""" % multiprocessing.cpu_count())
    argp.add_argument("-c", dest="config_fname", default=None,
        help="""Config file name.""")
    argp.add_argument("-o", dest="output_dir", default=".",
        help="""Output directory""")
    argp.add_argument("-n", dest="output_name", default=None,
        help="""Run name.""")

    # RAxML search behaviour
    argp.add_argument("-p", dest="rand_seed", type=int, default=None,
        help="""Random seed to be used with RAxML. Default: current system time.""")
    argp.add_argument("-m", dest="mfresolv_method", choices=["thorough", "fast", "ultrafast"],
        default="thorough", help="""Method of multifurcation resolution:
thorough use stardard constrainted RAxML tree search (default)
fast use RF distance as search convergence criterion (RAxML -D option)
ultrafast optimize model+branch lengths only (RAxML -f e option)""")
    argp.add_argument("-N", dest="rep_num", type=int, default=1,
        help="""Number of RAxML tree searches (with distinct random seeds). Default: 1""")

    # taxonomy handling
    argp.add_argument("-x", dest="taxcode_name", choices=["bac", "bot", "zoo", "vir"], type=str.lower,
        help="""Taxonomic code: BAC(teriological), BOT(anical), ZOO(logical), VIR(ological)""")
    argp.add_argument("-R", dest="restart", action="store_true",
        help="""Resume execution after a premature termination (e.g., due to expired job time limit).
Run name of the previous (terminated) job must be specified via -n option.""")
    argp.add_argument("-v", dest="verbose", action="store_true",
        help="""Print additional info messages to the console.""")
    argp.add_argument("-debug", dest="debug", action="store_true",
        help="""Debug mode, intermediate files will not be cleaned up.""")
    argp.add_argument("-no-hmmer", dest="no_hmmer", action="store_true",
        help="""Do not build HMMER profile.""")
    argp.add_argument("-dup-rank-names", dest="dup_rank_names", choices=["ignore", "abort", "skip", "autofix"],
        default="ignore", help="""Action to be performed if different ranks with same name are found:
ignore do nothing
abort report duplicates and exit
skip skip the corresponding sequences (exlude from reference)
autofix make name unique by concatenating it with the parent rank's name""")
    argp.add_argument("-wrong-rank-count", dest="wrong_rank_count", choices=["ignore", "abort", "skip", "autofix"],
        default="ignore", help="""Action to be performed if lineage has less (more) than 7 ranks
ignore do nothing
abort report duplicates and exit
skip skip the corresponding sequences (exlude from reference)
autofix try to guess wich ranks should be added or removed (use with caution!)""")
    argp.add_argument("-tmpdir", dest="temp_dir", default=None,
        help="""Directory for temporary files.""")

    # too few arguments to possibly contain -t and -s: show help and bail out
    if len(sys.argv) < 4:
        argp.print_help()
        sys.exit()

    return argp.parse_args()
| 645 | |
|---|
| 646 | def check_args(args): |
|---|
| 647 | #check if taxonomy file exists |
|---|
| 648 | if not os.path.isfile(args.taxonomy_fname): |
|---|
| 649 | print "ERROR: Taxonomy file not found: %s" % args.taxonomy_fname |
|---|
| 650 | sys.exit() |
|---|
| 651 | |
|---|
| 652 | #check if alignment file exists |
|---|
| 653 | if not os.path.isfile(args.align_fname): |
|---|
| 654 | print "ERROR: Alignment file not found: %s" % args.align_fname |
|---|
| 655 | sys.exit() |
|---|
| 656 | |
|---|
| 657 | if not args.output_name: |
|---|
| 658 | args.output_name = args.align_fname |
|---|
| 659 | |
|---|
| 660 | if not args.ref_fname: |
|---|
| 661 | args.ref_fname = "%s.refjson" % args.output_name |
|---|
| 662 | |
|---|
| 663 | if args.output_dir and not os.path.dirname(args.ref_fname): |
|---|
| 664 | args.ref_fname = os.path.join(args.output_dir, args.ref_fname) |
|---|
| 665 | |
|---|
| 666 | #check if reference json file already exists |
|---|
| 667 | if os.path.isfile(args.ref_fname): |
|---|
| 668 | print "ERROR: Reference tree file already exists: %s" % args.ref_fname |
|---|
| 669 | print "Please delete it explicitely if you want to overwrite." |
|---|
| 670 | sys.exit() |
|---|
| 671 | |
|---|
| 672 | #check if reference file can be created |
|---|
| 673 | try: |
|---|
| 674 | f = open(args.ref_fname, "w") |
|---|
| 675 | f.close() |
|---|
| 676 | os.remove(args.ref_fname) |
|---|
| 677 | except: |
|---|
| 678 | print "ERROR: cannot create output file: %s" % args.ref_fname |
|---|
| 679 | print "Please check if directory %s exists and you have write permissions for it." % os.path.split(os.path.abspath(args.ref_fname))[0] |
|---|
| 680 | sys.exit() |
|---|
| 681 | |
|---|
| 682 | if args.rep_num < 1 or args.rep_num > 1000: |
|---|
| 683 | print "ERROR: Number of RAxML runs must be between 1 and 1000." |
|---|
| 684 | sys.exit() |
|---|
| 685 | |
|---|
def which(program, custom_path=None):
    """Locate an executable, mimicking the shell `which` command.

    :param program: executable name, or a path to an executable. If a path
        is given (contains a directory component), only that path is checked.
    :param custom_path: optional list of directories to search before $PATH.
        Default changed from a mutable [] to None: the original in-place
        `path_list += ...` permanently appended the $PATH entries to the
        shared default list (and to any caller-supplied list) on every call.
    :return: full path to the executable, or None if not found.
    """
    def is_exe(fpath):
        # must exist as a regular file and carry the execute permission
        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)

    fpath, fname = os.path.split(program)
    if fpath:
        # explicit path given: accept it only if it is executable
        if is_exe(program):
            return program
    else:
        # build a fresh search list so neither the default nor the
        # caller's list is ever mutated
        path_list = list(custom_path) if custom_path else []
        path_list += os.environ["PATH"].split(os.pathsep)
        for path in path_list:
            path = path.strip('"')
            exe_file = os.path.join(path, program)
            if is_exe(exe_file):
                return exe_file

    return None
| 704 | |
|---|
| 705 | def check_dep(config): |
|---|
| 706 | if not config.no_hmmer: |
|---|
| 707 | if not which("hmmalign", [config.hmmer_home]): |
|---|
| 708 | print "ERROR: HMMER not found!" |
|---|
| 709 | print "Please either specify path to HMMER executables in the config file" |
|---|
| 710 | print "or call this script with -no-hmmer option to skip building HMMER profile." |
|---|
| 711 | config.exit_user_error() |
|---|
| 712 | |
|---|
def run_trainer(config):
    """Check dependencies, then build the reference tree for the given config."""
    check_dep(config)
    ref_builder = RefTreeBuilder(config)
    # record the exact command line for provenance in the output JSON
    ref_builder.invocation_epac = " ".join(sys.argv)
    ref_builder.build_ref_tree()
| 718 | |
|---|
# -------
# MAIN
# -------
if __name__ == "__main__":
    # parse and validate CLI options before constructing the config object
    args = parse_args()
    check_args(args)
    config = EpacTrainerConfig(args)

    print ""
    config.print_version("SATIVA-trainer")

    # wall-clock timing for the final summary message
    start_time = time.time()

    # build the reference tree + JSON, then remove intermediate files
    run_trainer(config)
    config.clean_tempdir()

    config.log.info("Reference JSON was saved to: %s", os.path.abspath(config.refjson_fname))
    config.log.info("Execution log was saved to: %s\n", os.path.abspath(config.log_fname))

    elapsed_time = time.time() - start_time
    config.log.info("Training completed successfully, elapsed time: %.0f seconds\n", elapsed_time)