| 1 | #!/usr/bin/env python |
|---|
| 2 | |
|---|
| 3 | import os |
|---|
| 4 | import sys |
|---|
| 5 | import glob |
|---|
| 6 | import shutil |
|---|
| 7 | import datetime |
|---|
| 8 | import random |
|---|
| 9 | import re |
|---|
| 10 | from subprocess import call,STDOUT |
|---|
| 11 | from json_util import EpaJsonParser |
|---|
| 12 | |
|---|
| 13 | class FileUtils: |
|---|
| 14 | |
|---|
| 15 | @staticmethod |
|---|
| 16 | def normalize_dir(dir_str): |
|---|
| 17 | if dir_str and not dir_str.endswith("/"): |
|---|
| 18 | dir_str += "/" |
|---|
| 19 | return dir_str |
|---|
| 20 | |
|---|
| 21 | @staticmethod |
|---|
| 22 | def remove_if_exists(fname): |
|---|
| 23 | try: |
|---|
| 24 | os.remove(fname) |
|---|
| 25 | except: |
|---|
| 26 | pass |
|---|
| 27 | |
|---|
| 28 | @staticmethod |
|---|
| 29 | def rebase(fname, old_basedir, new_basedir): |
|---|
| 30 | return fname.replace(old_basedir, new_basedir) |
|---|
| 31 | |
|---|
| 32 | class RaxmlWrapper: |
|---|
| 33 | |
|---|
| 34 | def __init__(self, config): |
|---|
| 35 | self.cfg = config |
|---|
| 36 | if config.rand_seed: |
|---|
| 37 | random.seed(config.rand_seed) |
|---|
| 38 | |
|---|
| 39 | def make_raxml_fname(self, stem, job_name, absolute=True): |
|---|
| 40 | fname = "RAxML_" + stem + "." + job_name |
|---|
| 41 | if absolute: |
|---|
| 42 | return os.path.join(self.cfg.raxml_outdir, fname) |
|---|
| 43 | else: |
|---|
| 44 | return fname |
|---|
| 45 | |
|---|
| 46 | def make_raxml_wildcard(self, job_name): |
|---|
| 47 | return self.make_raxml_fname("*", job_name) |
|---|
| 48 | |
|---|
| 49 | def cleanup(self, job_name, remove_jplace=False): |
|---|
| 50 | raxml_out_mask = self.make_raxml_wildcard(job_name) |
|---|
| 51 | for fl in glob.glob(raxml_out_mask): |
|---|
| 52 | os.remove(fl) |
|---|
| 53 | if remove_jplace: |
|---|
| 54 | jplace_fname = self.make_raxml_fname("portableTree", job_name) + ".jplace" |
|---|
| 55 | FileUtils.remove_if_exists(jplace_fname) |
|---|
| 56 | |
|---|
| 57 | def reduce_alignment(self, align_fname, job_name="reduce"): |
|---|
| 58 | reduced_fname = align_fname + ".reduced" |
|---|
| 59 | # we don't have to do anything in restart mode |
|---|
| 60 | if self.cfg.restart and os.path.isfile(reduced_fname): |
|---|
| 61 | return reduced_fname |
|---|
| 62 | else: |
|---|
| 63 | FileUtils.remove_if_exists(reduced_fname) |
|---|
| 64 | raxml_params = ["-f", "c", "-s", align_fname] |
|---|
| 65 | raxml_params += ["--no-dup-check"] |
|---|
| 66 | self.run(job_name, raxml_params) |
|---|
| 67 | self.cleanup(job_name) |
|---|
| 68 | if os.path.isfile(reduced_fname): |
|---|
| 69 | return reduced_fname |
|---|
| 70 | else: |
|---|
| 71 | return align_fname |
|---|
| 72 | |
|---|
| 73 | def run_epa(self, job_name, align_fname, reftree_fname, optmod_fname="", silent=True, mode="epa", subtree_fname=None,\ |
|---|
| 74 | lhw_acc_threshold=0.999): |
|---|
| 75 | raxml_params = ["-s", align_fname, "-t", reftree_fname] |
|---|
| 76 | # assume that by the time we call EPA reference has been cleaned already (e.g. with previous reduce_alignment call) |
|---|
| 77 | raxml_params += ["--no-seq-check"] |
|---|
| 78 | if mode == "l1o_seq": |
|---|
| 79 | raxml_params += ["-f", "O"] |
|---|
| 80 | result_file_stem = "leaveOneOutResults" |
|---|
| 81 | elif mode == "l1o_subtree": |
|---|
| 82 | raxml_params += ["-f", "P", "-z", subtree_fname] |
|---|
| 83 | result_file_stem = "subtreePlacement" |
|---|
| 84 | elif mode == "epa": |
|---|
| 85 | raxml_params += ["-f", "v"] |
|---|
| 86 | result_file_stem = "portableTree" |
|---|
| 87 | elif mode == "epa_mp": |
|---|
| 88 | raxml_params += ["-f", "y"] |
|---|
| 89 | result_file_stem = "portableTree" |
|---|
| 90 | else: |
|---|
| 91 | print "ERROR: Invalid RAxML-EPA running mode: %s" % mode |
|---|
| 92 | sys.exit() |
|---|
| 93 | |
|---|
| 94 | raxml_params += ["--epa-accumulated-threshold", str(lhw_acc_threshold)] |
|---|
| 95 | |
|---|
| 96 | if self.cfg.epa_use_heuristic in ["TRUE", "YES", "1"]: |
|---|
| 97 | raxml_params += ["-G", str(self.cfg.epa_heur_rate)] |
|---|
| 98 | |
|---|
| 99 | if self.cfg.epa_load_optmod and optmod_fname: |
|---|
| 100 | if os.path.isfile(optmod_fname): |
|---|
| 101 | raxml_params += ["-R", optmod_fname] |
|---|
| 102 | if self.cfg.raxml_model.startswith("GTRCAT") and not self.cfg.compress_patterns: |
|---|
| 103 | raxml_params += ["-H"] |
|---|
| 104 | else: |
|---|
| 105 | print "WARNING: Binary model file not found: %s" % optmod_fname |
|---|
| 106 | print "WARNING: Model parameters will be estimated by RAxML" |
|---|
| 107 | |
|---|
| 108 | self.run(job_name, raxml_params, silent) |
|---|
| 109 | |
|---|
| 110 | jp = None |
|---|
| 111 | failed = False |
|---|
| 112 | if mode == "l1o_subtree": |
|---|
| 113 | jp = [] |
|---|
| 114 | i = 0 |
|---|
| 115 | while True: |
|---|
| 116 | jp_fname = self.make_raxml_fname(result_file_stem, job_name) + ".%d.jplace" % (i+1) |
|---|
| 117 | if not os.path.isfile(jp_fname): |
|---|
| 118 | break |
|---|
| 119 | jp.append(EpaJsonParser(jp_fname)) |
|---|
| 120 | i += 1 |
|---|
| 121 | failed = i == 0 |
|---|
| 122 | else: |
|---|
| 123 | jp_fname = self.make_raxml_fname(result_file_stem, job_name) + ".jplace" |
|---|
| 124 | if os.path.isfile(jp_fname): |
|---|
| 125 | jp = EpaJsonParser(jp_fname) |
|---|
| 126 | else: |
|---|
| 127 | failed = True |
|---|
| 128 | |
|---|
| 129 | if failed: |
|---|
| 130 | print "RAxML EPA run failed, please examine the log for details:\n %s" \ |
|---|
| 131 | % self.make_raxml_fname("output", job_name) |
|---|
| 132 | sys.exit() |
|---|
| 133 | else: |
|---|
| 134 | return jp |
|---|
| 135 | |
|---|
| 136 | def run(self, job_name, params, silent=True, chkpoint_fname=None): |
|---|
| 137 | if self.cfg.raxml_model == "AUTO": |
|---|
| 138 | print "ERROR: you should have called EpacConfig.resolve_auto_settings() in your script!\n" |
|---|
| 139 | sys.exit() |
|---|
| 140 | |
|---|
| 141 | self.cleanup(job_name) |
|---|
| 142 | |
|---|
| 143 | lparams = [] |
|---|
| 144 | lparams += params |
|---|
| 145 | lparams += ["-m", self.cfg.raxml_model, "-n", job_name] |
|---|
| 146 | |
|---|
| 147 | if not self.cfg.use_bfgs: |
|---|
| 148 | lparams += ["--no-bfgs"] |
|---|
| 149 | |
|---|
| 150 | if self.cfg.save_memory: |
|---|
| 151 | lparams += ["-U"] |
|---|
| 152 | |
|---|
| 153 | if self.cfg.verbose: |
|---|
| 154 | lparams += ["--verbose"] |
|---|
| 155 | |
|---|
| 156 | if not "-p" in lparams: |
|---|
| 157 | seed = random.randint(1, 32000) |
|---|
| 158 | lparams += ["-p", str(seed)] |
|---|
| 159 | |
|---|
| 160 | if chkpoint_fname: |
|---|
| 161 | lparams += ["-Z", chkpoint_fname] |
|---|
| 162 | |
|---|
| 163 | if self.cfg.run_on_cluster: |
|---|
| 164 | self.run_cluster(lparams) |
|---|
| 165 | return; |
|---|
| 166 | |
|---|
| 167 | if self.cfg.raxml_remote_call: |
|---|
| 168 | call_str = ["ssh", self.cfg.raxml_remote_host] |
|---|
| 169 | else: |
|---|
| 170 | call_str = [] |
|---|
| 171 | call_str += self.cfg.raxml_cmd + lparams |
|---|
| 172 | if silent: |
|---|
| 173 | self.cfg.log.debug(' '.join(call_str) + "\n") |
|---|
| 174 | out_fname = self.make_raxml_fname("output", job_name) |
|---|
| 175 | with open(out_fname, "w") as fout: |
|---|
| 176 | call(call_str, stdout=fout, stderr=STDOUT) |
|---|
| 177 | else: |
|---|
| 178 | call(call_str) |
|---|
| 179 | |
|---|
| 180 | return ' '.join(call_str) |
|---|
| 181 | |
|---|
| 182 | def run_multiple(self, job_name, params, repnum, silent=True): |
|---|
| 183 | best_lh = float("-inf") |
|---|
| 184 | best_jobname = None |
|---|
| 185 | check_old_jobs = self.cfg.restart |
|---|
| 186 | |
|---|
| 187 | for i in range(repnum): |
|---|
| 188 | call_raxml = True |
|---|
| 189 | chkpoint_fname = None |
|---|
| 190 | |
|---|
| 191 | rep_jobname = "%s.%d" % (job_name, i) |
|---|
| 192 | |
|---|
| 193 | if check_old_jobs: |
|---|
| 194 | # in resume mode, we have to check where we have stopped before |
|---|
| 195 | next_jobname = "%s.%d" % (job_name, i+1) |
|---|
| 196 | next_info = self.info_fname(next_jobname) |
|---|
| 197 | # if RAxML_info file for the next job exists, current job has had finished -> skip it |
|---|
| 198 | if os.path.isfile(next_info): |
|---|
| 199 | call_raxml = False |
|---|
| 200 | else: |
|---|
| 201 | # use RAxML checkpoints if there are any |
|---|
| 202 | old_chkpoint_fname = self.checkpoint_fname(rep_jobname) |
|---|
| 203 | if not os.path.isfile(old_chkpoint_fname): |
|---|
| 204 | old_chkpoint_fname = self.bkup_checkpoint_fname(rep_jobname) |
|---|
| 205 | if not os.path.isfile(old_chkpoint_fname): |
|---|
| 206 | old_chkpoint_fname = None |
|---|
| 207 | |
|---|
| 208 | FileUtils.remove_if_exists(next_info) |
|---|
| 209 | if old_chkpoint_fname: |
|---|
| 210 | chkpoint_fname = self.checkpoint_fname("last_chkpoint") |
|---|
| 211 | shutil.move(old_chkpoint_fname, chkpoint_fname) |
|---|
| 212 | |
|---|
| 213 | # this was the last RAxML run from previous SATIVA invocation -> proceed without checks from now on |
|---|
| 214 | check_old_jobs = False |
|---|
| 215 | |
|---|
| 216 | if call_raxml: |
|---|
| 217 | invoc_str = self.run(rep_jobname, params, silent, chkpoint_fname) |
|---|
| 218 | lh = self.get_tree_lh(rep_jobname, "GAMMA") |
|---|
| 219 | if lh > best_lh: |
|---|
| 220 | best_lh = lh |
|---|
| 221 | best_jobname = rep_jobname |
|---|
| 222 | self.cfg.log.debug("Tree %d GAMMA-based logLH: %s\n" % (i, str(lh))) |
|---|
| 223 | |
|---|
| 224 | best_fname = self.info_fname(best_jobname) |
|---|
| 225 | dst_fname = self.info_fname(job_name) |
|---|
| 226 | shutil.copy(best_fname, dst_fname) |
|---|
| 227 | |
|---|
| 228 | best_fname = self.result_fname(best_jobname) |
|---|
| 229 | dst_fname = self.result_fname(job_name) |
|---|
| 230 | shutil.copy(best_fname, dst_fname) |
|---|
| 231 | |
|---|
| 232 | return invoc_str |
|---|
| 233 | |
|---|
| 234 | def run_cluster(self, params): |
|---|
| 235 | if self.cfg.raxml_remote_call: |
|---|
| 236 | qsub_call_str = ["ssh", self.cfg.raxml_remote_host] |
|---|
| 237 | else: |
|---|
| 238 | qsub_call_str = [] |
|---|
| 239 | |
|---|
| 240 | raxml_call_cmd = self.cfg.raxml_cmd + params |
|---|
| 241 | for i in range(len(raxml_call_cmd)): |
|---|
| 242 | if isinstance(raxml_call_cmd[i], basestring): |
|---|
| 243 | raxml_call_cmd[i] = FileUtils.rebase(raxml_call_cmd[i], self.cfg.epac_home, self.cfg.cluster_epac_home) |
|---|
| 244 | raxml_call_str = ' '.join(raxml_call_cmd) |
|---|
| 245 | |
|---|
| 246 | script_fname = self.cfg.tmp_fname("%NAME%_sub.sh") |
|---|
| 247 | FileUtils.remove_if_exists(script_fname) |
|---|
| 248 | shutil.copy(self.cfg.cluster_qsub_script, script_fname) |
|---|
| 249 | qsub_job_name = "epa" |
|---|
| 250 | with open(script_fname, "a") as fout: |
|---|
| 251 | fout.write("#$ -N %s\n" % qsub_job_name) |
|---|
| 252 | fout.write("\n") |
|---|
| 253 | fout.write(raxml_call_str + "\n") |
|---|
| 254 | |
|---|
| 255 | cluster_script_fname = FileUtils.rebase(script_fname, self.cfg.epac_home, self.cfg.cluster_epac_home) |
|---|
| 256 | qsub_call_str += ["qsub", "-sync", "y", cluster_script_fname] |
|---|
| 257 | |
|---|
| 258 | print raxml_call_str + "\n" |
|---|
| 259 | print ' '.join(qsub_call_str) + "\n" |
|---|
| 260 | # sys.exit() |
|---|
| 261 | |
|---|
| 262 | call(qsub_call_str) |
|---|
| 263 | if not self.cfg.debug: |
|---|
| 264 | FileUtils.remove_if_exists(script_fname) |
|---|
| 265 | |
|---|
| 266 | def get_tree_lh(self, job_name, ratehet="GAMMA"): |
|---|
| 267 | info_fname = self.info_fname(job_name) |
|---|
| 268 | with open(info_fname, "r") as info_file: |
|---|
| 269 | info_str = info_file.read() |
|---|
| 270 | |
|---|
| 271 | lh_patterns = [ "Final %s-based Score of best tree " % ratehet, |
|---|
| 272 | "Final %s likelihood: " % ratehet, |
|---|
| 273 | "%s-based likelihood " % ratehet] |
|---|
| 274 | |
|---|
| 275 | m = None |
|---|
| 276 | for pat in lh_patterns: |
|---|
| 277 | m = re.search('(?<=%s)[0-9.\-]+' % pat, info_str) |
|---|
| 278 | if m: |
|---|
| 279 | lh = float(m.group(0)) |
|---|
| 280 | return lh |
|---|
| 281 | |
|---|
| 282 | return None |
|---|
| 283 | |
|---|
| 284 | def get_invocation_str(self, job_name): |
|---|
| 285 | info_fname = self.info_fname(job_name) |
|---|
| 286 | with open(info_fname, "r") as info_file: |
|---|
| 287 | info_str = info_file.read() |
|---|
| 288 | |
|---|
| 289 | pattern = "RAxML was called as follows:\n\n" |
|---|
| 290 | |
|---|
| 291 | m = re.search('(?<=%s).*' % pattern, info_str) |
|---|
| 292 | if m: |
|---|
| 293 | return m.group(0) |
|---|
| 294 | else: |
|---|
| 295 | return "" |
|---|
| 296 | |
|---|
| 297 | def result_fname(self, job_name): |
|---|
| 298 | return self.make_raxml_fname("result", job_name) |
|---|
| 299 | |
|---|
| 300 | def besttree_fname(self, job_name): |
|---|
| 301 | return self.make_raxml_fname("bestTree", job_name) |
|---|
| 302 | |
|---|
| 303 | def info_fname(self, job_name): |
|---|
| 304 | return self.make_raxml_fname("info", job_name) |
|---|
| 305 | |
|---|
| 306 | def checkpoint_fname(self, job_name): |
|---|
| 307 | return self.make_raxml_fname("binaryCheckpoint", job_name) |
|---|
| 308 | |
|---|
| 309 | def bkup_checkpoint_fname(self, job_name): |
|---|
| 310 | return self.make_raxml_fname("binaryCheckpointBackup", job_name) |
|---|
| 311 | |
|---|
| 312 | def result_exists(self, job_name): |
|---|
| 313 | if os.path.isfile(self.result_fname(job_name)): |
|---|
| 314 | return True |
|---|
| 315 | else: |
|---|
| 316 | return False |
|---|
| 317 | |
|---|
| 318 | def besttree_exists(self, job_name): |
|---|
| 319 | if os.path.isfile(self.besttree_fname(job_name)): |
|---|
| 320 | return True |
|---|
| 321 | else: |
|---|
| 322 | return False |
|---|
| 323 | |
|---|
| 324 | def epa_result_exists(self, job_name): |
|---|
| 325 | if os.path.isfile(self.make_raxml_fname("labelledTree", job_name)): |
|---|
| 326 | return True |
|---|
| 327 | else: |
|---|
| 328 | return False |
|---|
| 329 | |
|---|
| 330 | def copy_result_tree(self, job_name, dst_fname): |
|---|
| 331 | src_fname = self.result_fname(job_name) |
|---|
| 332 | shutil.copy(src_fname, dst_fname) |
|---|
| 333 | |
|---|
| 334 | def copy_best_tree(self, job_name, dst_fname): |
|---|
| 335 | src_fname = self.besttree_fname(job_name) |
|---|
| 336 | shutil.copy(src_fname, dst_fname) |
|---|
| 337 | |
|---|
| 338 | def copy_optmod_params(self, job_name, dst_fname): |
|---|
| 339 | src_fname = self.make_raxml_fname("binaryModelParameters", job_name) |
|---|
| 340 | shutil.copy(src_fname, dst_fname) |
|---|
| 341 | |
|---|
| 342 | def copy_epa_orig_tree(self, job_name, dst_fname): |
|---|
| 343 | src_fname = self.make_raxml_fname("originalLabelledTree", job_name) |
|---|
| 344 | shutil.copy(src_fname, dst_fname) |
|---|
| 345 | |
|---|
| 346 | def copy_epa_result_tree(self, job_name, dst_fname): |
|---|
| 347 | src_fname = self.make_raxml_fname("labelledTree", job_name) |
|---|
| 348 | shutil.copy(src_fname, dst_fname) |
|---|
| 349 | |
|---|
| 350 | def copy_epa_jplace(self, job_name, dst_fname, mode="epa", move=False): |
|---|
| 351 | if mode == "l1o_seq": |
|---|
| 352 | result_file_stem = "leaveOneOutResults" |
|---|
| 353 | elif mode == "l1o_subtree": |
|---|
| 354 | result_file_stem = "subtreePlacement" |
|---|
| 355 | elif mode == "epa": |
|---|
| 356 | result_file_stem = "portableTree" |
|---|
| 357 | else: |
|---|
| 358 | return |
|---|
| 359 | src_fname = self.make_raxml_fname(result_file_stem, job_name) + ".jplace" |
|---|
| 360 | if move: |
|---|
| 361 | shutil.move(src_fname, dst_fname) |
|---|
| 362 | else: |
|---|
| 363 | shutil.copy(src_fname, dst_fname) |
|---|