1 | #!/usr/bin/env python |
---|
2 | |
---|
3 | import os |
---|
4 | import sys |
---|
5 | import glob |
---|
6 | import shutil |
---|
7 | import datetime |
---|
8 | import time |
---|
9 | import logging |
---|
10 | import random |
---|
11 | import multiprocessing |
---|
12 | import ConfigParser |
---|
13 | |
---|
14 | from epac.version import SATIVA_BUILD,SATIVA_RELEASE_DATE,SATIVA_RAXML_VER |
---|
15 | |
---|
16 | class DefaultedConfigParser(ConfigParser.SafeConfigParser): |
---|
17 | def get_param(self, section, option, ctype=str, default=None): |
---|
18 | if default is None: |
---|
19 | ret = self.get(section, option) |
---|
20 | else: |
---|
21 | confdict = self.__dict__.get('_sections') |
---|
22 | sec = confdict.get(section) |
---|
23 | if sec: |
---|
24 | if ctype == bool: |
---|
25 | ret = self.getboolean(section, option) |
---|
26 | else: |
---|
27 | ret = sec.get(option, default) |
---|
28 | else: |
---|
29 | ret = default |
---|
30 | return ctype(ret) |
---|
31 | |
---|
32 | class EpacConfig: |
---|
33 | # this prefix will be added to every sequence name in reference to prevent |
---|
34 | # name clashes with query sequences, which are coded with numbers |
---|
35 | REF_SEQ_PREFIX = "r_"; |
---|
36 | QUERY_SEQ_PREFIX = "q_"; |
---|
37 | |
---|
38 | CAT_LOWER_THRES = 100 |
---|
39 | CAT_GAMMA_THRES = 500 |
---|
40 | GAMMA_UPPER_THRES = 10000 |
---|
41 | EPA_HEUR_THRES = 1000 |
---|
42 | |
---|
43 | SATIVA_INFO = \ |
---|
44 | """%s %s, released on %s. Last version: https://github.com/amkozlov/sativa |
---|
45 | By A.Kozlov and J.Zhang, the Exelixis Lab. Based on RAxML %s by A.Stamatakis.\n"""\ |
---|
46 | % ("%s", SATIVA_BUILD, SATIVA_RELEASE_DATE, SATIVA_RAXML_VER) |
---|
47 | |
---|
48 | |
---|
49 | @staticmethod |
---|
50 | def strip_prefix(seq_name, prefix): |
---|
51 | if seq_name.startswith(prefix): |
---|
52 | plen = len(prefix) |
---|
53 | return seq_name[plen:] |
---|
54 | else: |
---|
55 | return seq_name |
---|
56 | |
---|
57 | @staticmethod |
---|
58 | def strip_ref_prefix(seq_name): |
---|
59 | return EpacConfig.strip_prefix(seq_name, EpacConfig.REF_SEQ_PREFIX) |
---|
60 | |
---|
61 | @staticmethod |
---|
62 | def strip_query_prefix(seq_name): |
---|
63 | return EpacConfig.strip_prefix(seq_name, EpacConfig.QUERY_SEQ_PREFIX) |
---|
64 | |
---|
65 | def __init__(self, args=None): |
---|
66 | self.basepath = os.path.dirname(os.path.abspath(__file__)) |
---|
67 | self.epac_home = os.path.abspath(os.path.join(self.basepath, os.pardir)) + "/" |
---|
68 | |
---|
69 | self.set_defaults() |
---|
70 | |
---|
71 | if not args: |
---|
72 | return |
---|
73 | |
---|
74 | self.verbose = args.verbose |
---|
75 | self.debug = args.debug |
---|
76 | self.restart = args.restart |
---|
77 | self.refjson_fname = args.ref_fname |
---|
78 | |
---|
79 | self.rand_seed = args.rand_seed |
---|
80 | |
---|
81 | timestamp = "%d" % (time.time()*1000) #datetime.datetime.now().strftime("%Y%m%d_%H%M%S") |
---|
82 | if args.output_name: |
---|
83 | self.name = args.output_name |
---|
84 | else: |
---|
85 | self.name = str(random.randint(1, 99999)) |
---|
86 | |
---|
87 | self.output_dir = args.output_dir |
---|
88 | |
---|
89 | if args.temp_dir: |
---|
90 | self.temp_dir = args.temp_dir |
---|
91 | else: |
---|
92 | self.temp_dir = os.path.join(self.epac_home, "tmp") |
---|
93 | |
---|
94 | if self.restart: |
---|
95 | tmpdirs = glob.glob(os.path.join(self.temp_dir, self.name + "_*")) |
---|
96 | if len(tmpdirs) > 0: |
---|
97 | tmpdirs.sort(key=os.path.getmtime, reverse=True) |
---|
98 | self.temp_dir = tmpdirs[0] |
---|
99 | else: |
---|
100 | self.exit_user_error("ERROR: Cannot resume execution: temp directory for the previous run not found in %s" % self.temp_dir) |
---|
101 | else: |
---|
102 | temp_name = self.name + "_" + timestamp |
---|
103 | self.temp_dir = os.path.join(self.temp_dir, temp_name) |
---|
104 | |
---|
105 | self.raxml_outdir = self.temp_dir |
---|
106 | self.raxml_outdir_abs = os.path.abspath(self.raxml_outdir) |
---|
107 | |
---|
108 | self.init_logger() |
---|
109 | |
---|
110 | if not args.config_fname: |
---|
111 | args.config_fname = os.path.join(self.epac_home, "sativa.cfg") |
---|
112 | |
---|
113 | self.config_path = os.path.dirname(os.path.abspath(args.config_fname)) |
---|
114 | self.read_from_file(args.config_fname) |
---|
115 | |
---|
116 | # command line setting has preference over config file and default |
---|
117 | if args.num_threads: |
---|
118 | self.num_threads = args.num_threads |
---|
119 | self.check_raxml() |
---|
120 | |
---|
121 | if not self.restart: |
---|
122 | os.mkdir(self.temp_dir) |
---|
123 | |
---|
124 | def set_defaults(self): |
---|
125 | self.muscle_home = self.epac_home + "/epac/bin" + "/" |
---|
126 | self.hmmer_home = self.epac_home + "/epac/bin" + "/" |
---|
127 | self.raxml_home = self.epac_home + "/epac/bin" + "/" |
---|
128 | self.raxml_exec = "raxmlHPC-PTHREADS-SSE3" |
---|
129 | self.raxml_model = "AUTO" |
---|
130 | self.raxml_remote_host = "" |
---|
131 | self.raxml_remote_call = False |
---|
132 | self.run_on_cluster = False |
---|
133 | self.cluster_epac_home = self.epac_home |
---|
134 | self.cluster_qsub_script = "" |
---|
135 | self.epa_load_optmod = True |
---|
136 | self.epa_use_heuristic = "AUTO" |
---|
137 | self.epa_heur_rate = 0.01 |
---|
138 | self.min_confidence = 0.2 |
---|
139 | self.num_threads = multiprocessing.cpu_count() |
---|
140 | self.compress_patterns = False |
---|
141 | self.use_bfgs = False |
---|
142 | self.save_memory = False |
---|
143 | self.taxa_ident_thres = 0.6 |
---|
144 | self.debug = False |
---|
145 | self.restart = False |
---|
146 | self.verbose = False |
---|
147 | self.log = logging.getLogger('epac') |
---|
148 | |
---|
149 | def init_logger(self): |
---|
150 | self.log_fname = self.out_fname("%NAME%.log") |
---|
151 | if self.verbose or self.debug: |
---|
152 | log_lvl = logging.DEBUG |
---|
153 | else: |
---|
154 | log_lvl = logging.INFO |
---|
155 | |
---|
156 | # configure logger object |
---|
157 | self.log.setLevel(logging.DEBUG) |
---|
158 | formatter = logging.Formatter('%(message)s') |
---|
159 | |
---|
160 | # add console handler |
---|
161 | ch = logging.StreamHandler() |
---|
162 | ch.setLevel(log_lvl) |
---|
163 | ch.setFormatter(formatter) |
---|
164 | self.log.addHandler(ch) |
---|
165 | |
---|
166 | # add file handler |
---|
167 | if self.restart: |
---|
168 | logf_mode = "a" |
---|
169 | else: |
---|
170 | logf_mode = "w" |
---|
171 | fh = logging.FileHandler(self.log_fname, mode=logf_mode) |
---|
172 | fh.setLevel(log_lvl) |
---|
173 | fh.setFormatter(formatter) |
---|
174 | self.log.addHandler(fh) |
---|
175 | |
---|
176 | def resolve_auto_settings(self, tree_size): |
---|
177 | if self.raxml_model == "AUTO": |
---|
178 | if tree_size > EpacConfig.CAT_GAMMA_THRES: |
---|
179 | self.raxml_model = "GTRCAT" |
---|
180 | else: |
---|
181 | self.raxml_model = "GTRGAMMA" |
---|
182 | elif self.raxml_model == "GTRCAT" and tree_size < EpacConfig.CAT_LOWER_THRES: |
---|
183 | print "WARNING: You're using GTRCAT model on a very small dataset (%d taxa), which might lead to unreliable results!" % tree_size |
---|
184 | print "Please consider switching to GTRGAMMA model.\n" |
---|
185 | elif self.raxml_model == "GTRGAMMA" and tree_size > EpacConfig.GAMMA_UPPER_THRES: |
---|
186 | print "WARNING: You're using GTRGAMMA model on a very large dataset (%d taxa), which might lead to numerical issues!" % tree_size |
---|
187 | print "In case of problems, please consider switching to GTRCAT model.\n" |
---|
188 | |
---|
189 | if self.epa_use_heuristic == "AUTO": |
---|
190 | if tree_size > EpacConfig.EPA_HEUR_THRES: |
---|
191 | self.epa_use_heuristic = "TRUE" |
---|
192 | self.epa_heur_rate = 0.5 * float(EpacConfig.EPA_HEUR_THRES) / tree_size |
---|
193 | else: |
---|
194 | self.epa_use_heuristic = "FALSE" |
---|
195 | |
---|
196 | def resolve_relative_path(self, rpath): |
---|
197 | if rpath.startswith("/"): |
---|
198 | return rpath |
---|
199 | else: |
---|
200 | return os.path.join(self.config_path, rpath) |
---|
201 | |
---|
202 | def check_raxml(self): |
---|
203 | self.raxml_exec_full = self.raxml_home + self.raxml_exec |
---|
204 | if self.raxml_remote_host in ["", "localhost"]: |
---|
205 | self.raxml_remote_call = False |
---|
206 | # if raxml_home is empty, raxml binary must be on PATH; otherwise check if file exists |
---|
207 | if self.raxml_home: |
---|
208 | if not os.path.isdir(self.raxml_home): |
---|
209 | self.exit_user_error("RAxML home directory not found: %s" % self.raxml_home) |
---|
210 | elif not os.path.isfile(self.raxml_exec_full): |
---|
211 | self.exit_user_error("RAxML executable not found: %s" % self.raxml_exec_full) |
---|
212 | else: |
---|
213 | self.raxml_remote_call = True |
---|
214 | self.raxml_cmd = [self.raxml_exec_full, "-w", self.raxml_outdir_abs] |
---|
215 | if self.num_threads > 1: |
---|
216 | self.raxml_cmd += ["-T", str(self.num_threads)] |
---|
217 | |
---|
218 | def read_from_file(self, config_fname): |
---|
219 | if not os.path.exists(config_fname): |
---|
220 | self.exit_user_error("ERROR: Config file not found: %s" % config_fname) |
---|
221 | |
---|
222 | parser = DefaultedConfigParser() #ConfigParser.SafeConfigParser() |
---|
223 | parser.read(config_fname) |
---|
224 | |
---|
225 | self.raxml_home = parser.get_param("raxml", "raxml_home", str, self.raxml_home) |
---|
226 | if self.raxml_home: |
---|
227 | self.raxml_home = self.resolve_relative_path(self.raxml_home + "/") |
---|
228 | self.raxml_exec = parser.get_param("raxml", "raxml_exec", str, self.raxml_exec) |
---|
229 | self.raxml_remote_host = parser.get_param("raxml", "raxml_remote_host", str, self.raxml_remote_host) |
---|
230 | |
---|
231 | self.raxml_model = parser.get_param("raxml", "raxml_model", str, self.raxml_model).upper() |
---|
232 | self.num_threads = parser.get_param("raxml", "raxml_threads", int, self.num_threads) |
---|
233 | |
---|
234 | self.epa_use_heuristic = parser.get_param("raxml", "epa_use_heuristic", str, self.epa_use_heuristic).upper() |
---|
235 | self.epa_heur_rate = parser.get_param("raxml", "epa_heur_rate", float, self.epa_heur_rate) |
---|
236 | self.epa_load_optmod = parser.get_param("raxml", "epa_load_optmod", bool, self.epa_load_optmod) |
---|
237 | |
---|
238 | self.hmmer_home = self.resolve_relative_path(parser.get_param("hmmer", "hmmer_home", str, self.hmmer_home)) |
---|
239 | self.muscle_home = self.resolve_relative_path(parser.get_param("muscle", "muscle_home", str, self.muscle_home)) |
---|
240 | |
---|
241 | self.run_on_cluster = parser.get_param("cluster", "run_on_cluster", bool, self.run_on_cluster) |
---|
242 | self.cluster_epac_home = parser.get_param("cluster", "cluster_epac_home", str, self.cluster_epac_home) + "/" |
---|
243 | self.cluster_qsub_script = parser.get_param("cluster", "cluster_qsub_script", str, self.cluster_qsub_script) |
---|
244 | |
---|
245 | self.min_confidence = parser.get_param("assignment", "min_confidence", float, self.min_confidence) |
---|
246 | |
---|
247 | return parser |
---|
248 | |
---|
249 | def subst_name(self, in_str): |
---|
250 | """Replace %NAME% macros with an actual EPAC run name. Used to |
---|
251 | generate unique run-specific identifiers (filenames, RAxML job names etc)""" |
---|
252 | return in_str.replace("%NAME%", self.name) |
---|
253 | |
---|
254 | def tmp_fname(self, fname): |
---|
255 | return os.path.join(self.temp_dir, self.subst_name(fname)) |
---|
256 | |
---|
257 | def out_fname(self, fname): |
---|
258 | return os.path.join(self.output_dir, self.subst_name(fname)) |
---|
259 | |
---|
260 | def clean_tempdir(self): |
---|
261 | if not self.debug and os.path.isdir(self.temp_dir): |
---|
262 | shutil.rmtree(self.temp_dir) |
---|
263 | |
---|
264 | def exit_fatal_error(self, msg=None): |
---|
265 | if msg: |
---|
266 | self.log.error(msg) |
---|
267 | sys.exit(13) |
---|
268 | |
---|
269 | def exit_user_error(self, msg=None): |
---|
270 | if msg: |
---|
271 | self.log.error(msg) |
---|
272 | self.clean_tempdir() |
---|
273 | sys.exit(14) |
---|
274 | |
---|
275 | def print_version(self, progname): |
---|
276 | self.log.info(EpacConfig.SATIVA_INFO % progname) |
---|
277 | if self.restart: |
---|
278 | self.log.info("Resuming %s execution using files from previous run found in: %s\n" % (progname, self.temp_dir)) |
---|
279 | |
---|
280 | class EpacTrainerConfig(EpacConfig): |
---|
281 | |
---|
282 | def __init__(self, args=None): |
---|
283 | EpacConfig.__init__(self, args) |
---|
284 | if args: |
---|
285 | self.taxonomy_fname = args.taxonomy_fname |
---|
286 | self.align_fname = args.align_fname |
---|
287 | self.no_hmmer = args.no_hmmer |
---|
288 | self.dup_rank_names = args.dup_rank_names |
---|
289 | self.wrong_rank_count = args.wrong_rank_count |
---|
290 | self.mfresolv_method = args.mfresolv_method |
---|
291 | self.taxcode_name = args.taxcode_name |
---|
292 | self.rep_num = args.rep_num |
---|
293 | |
---|
294 | def set_defaults(self): |
---|
295 | EpacConfig.set_defaults(self) |
---|
296 | self.no_hmmer = False |
---|
297 | # whether model parameters should be re-optimized from scratch on the best topology using "-f e" |
---|
298 | self.reopt_model = False |
---|
299 | self.dup_rank_names = "ignore" |
---|
300 | self.wrong_rank_count = "ignore" |
---|
301 | self.taxassign_method = "1" |
---|
302 | self.mfresolv_method = "thorough" |
---|
303 | self.taxcode_name = "bac" |
---|
304 | self.rep_num = 1 |
---|
305 | # default settings below imply no taxonomy filtering, |
---|
306 | # i.e. all sequences from taxonomy file will be included into reference tree |
---|
307 | self.reftree_min_rank = 0 |
---|
308 | self.reftree_max_seqs_per_leaf = 1e6 |
---|
309 | self.reftree_clades_to_include=[] |
---|
310 | self.reftree_clades_to_ignore=[] |
---|
311 | |
---|
312 | |
---|
313 | def read_from_file(self, config_fname): |
---|
314 | parser = EpacConfig.read_from_file(self, config_fname) |
---|
315 | |
---|
316 | self.reftree_min_rank = parser.get_param("reftree", "min_rank", int, self.reftree_min_rank) |
---|
317 | self.reftree_max_seqs_per_leaf = parser.get_param("reftree", "max_seqs_per_leaf", int, self.reftree_max_seqs_per_leaf) |
---|
318 | clades_str = parser.get_param("reftree", "clades_to_include", str, "") |
---|
319 | self.reftree_clades_to_include = self.parse_clades(clades_str) |
---|
320 | clades_str = parser.get_param("reftree", "clades_to_ignore", str, "") |
---|
321 | self.reftree_clades_to_ignore = self.parse_clades(clades_str) |
---|
322 | |
---|
323 | def parse_clades(self, clades_str): |
---|
324 | clade_list = [] |
---|
325 | try: |
---|
326 | if clades_str: |
---|
327 | clades = clades_str.split(",") |
---|
328 | for clade in clades: |
---|
329 | toks = clade.split("|") |
---|
330 | clade_list += [(int(toks[0]), toks[1])] |
---|
331 | except: |
---|
332 | print "Invalid format in config parameter: clades_to_include" |
---|
333 | sys.exit() |
---|
334 | |
---|
335 | return clade_list |
---|
336 | |
---|
337 | class EpacClassifierConfig(EpacConfig): |
---|
338 | |
---|
339 | def __init__(self, args=None): |
---|
340 | EpacConfig.__init__(self, args) |
---|
341 | |
---|
342 | if args: |
---|
343 | self.taxassign_method = args.taxassign_method |
---|
344 | self.min_lhw = args.min_lhw |
---|
345 | self.brlen_pv = args.brlen_pv |
---|
346 | else: |
---|
347 | self.taxassign_method = "1" |
---|
348 | self.min_lhw = 0. |
---|
349 | self.brlen_pv = 0. |
---|
350 | |
---|
351 | class SativaConfig(EpacTrainerConfig): |
---|
352 | |
---|
353 | def __init__(self, args): |
---|
354 | args.no_hmmer = True |
---|
355 | args.dup_rank_names = "ignore" |
---|
356 | args.wrong_rank_count = "ignore" |
---|
357 | args.taxassign_method = "1" |
---|
358 | |
---|
359 | EpacTrainerConfig.__init__(self, args) |
---|
360 | |
---|
361 | self.taxassign_method = args.taxassign_method |
---|
362 | self.min_lhw = args.min_lhw |
---|
363 | self.brlen_pv = args.brlen_pv |
---|
364 | self.ranktest = args.ranktest |
---|
365 | self.conf_cutoff = args.conf_cutoff |
---|
366 | self.jplace_fname = args.jplace_fname |
---|
367 | |
---|
368 | self.output_interim_files = True |
---|
369 | self.compress_patterns = True |
---|
370 | self.use_bfgs = True |
---|
371 | self.save_memory = False |
---|
372 | |
---|
373 | if self.refjson_fname: |
---|
374 | self.load_refjson = True |
---|
375 | else: |
---|
376 | self.load_refjson = False |
---|
377 | if self.output_interim_files: |
---|
378 | self.refjson_fname = self.out_fname("%NAME%.refjson") |
---|
379 | else: |
---|
380 | self.refjson_fname = self.tmp_fname("%NAME%.refjson") |
---|
381 | |
---|
382 | if self.restart and os.path.isfile(self.refjson_fname): |
---|
383 | self.load_refjson = True |
---|
384 | |
---|
385 | def set_defaults(self): |
---|
386 | EpacTrainerConfig.set_defaults(self) |
---|
387 | |
---|
388 | def read_from_file(self, config_fname): |
---|
389 | parser = EpacTrainerConfig.read_from_file(self, config_fname) |
---|