1 | #!/usr/bin/env python |
---|
2 | |
---|
3 | import sys |
---|
4 | import re |
---|
5 | from string import maketrans |
---|
6 | from ete2 import Tree |
---|
7 | |
---|
8 | class TaxCode: |
---|
9 | UNI_TAX_RANKS = { |
---|
10 | 1: ("Domain", "d__"), |
---|
11 | 2: ("Kingdom", "k__"), |
---|
12 | 3: ("Subkingdom", "a__"), |
---|
13 | 4: ("Phylum", "p__"), |
---|
14 | 5: ("Subphylum", "a__"), |
---|
15 | 6: ("Class", "c__"), |
---|
16 | 7: ("Subclass", "x__"), |
---|
17 | 8: ("Superorder", "e__"), |
---|
18 | 9: ("Order", "o__"), |
---|
19 | 10: ("Suborder", "h__"), |
---|
20 | 11: ("Infraorder", "i__"), |
---|
21 | 12: ("Superfamily", "j__"), |
---|
22 | 13: ("Epifamily", "l__"), |
---|
23 | 14: ("Family", "f__"), |
---|
24 | 15: ("Subfamily", "m__"), |
---|
25 | 16: ("Infrafamily", "n__"), |
---|
26 | 17: ("Tribe", "t__"), |
---|
27 | 18: ("Subtribe", "u__"), |
---|
28 | 19: ("Infratribe", "v__"), |
---|
29 | 20: ("Genus", "g__"), |
---|
30 | 21: ("Species", "s__"), |
---|
31 | 22: ("Subspecies", "b__"), |
---|
32 | 23: ("Strain", "r__"), |
---|
33 | 24: ("Isolate", "q__") |
---|
34 | } |
---|
35 | UNI_TAX_LEVELS = len(UNI_TAX_RANKS) |
---|
36 | |
---|
37 | # ranks of the standard 7-levels taxonomy |
---|
38 | # kingdom, phylum, class, order, family, genus, species |
---|
39 | STD_RANKS = [2, 4, 6, 9, 14, 20, 21] |
---|
40 | |
---|
41 | BAC_TAX_CODE = { |
---|
42 | 2: ((), ("bacteria", "archaea")), # kingdom |
---|
43 | 4: ((), ()), # phylum |
---|
44 | 6: ((), ()), # class |
---|
45 | 7: (("idae"), ()), # subclass |
---|
46 | 9: (("ales"), ()), # order |
---|
47 | 10: (("ineae"), ()), # suborder |
---|
48 | 14: (("aceae"), ()), # family |
---|
49 | 15: (("oideae"), ()), # subfamily |
---|
50 | 20: ((), ()), # genus |
---|
51 | 21: ((), ()), # species |
---|
52 | 22: ((), ()), # subspecies |
---|
53 | 23: ((), ()), # strain |
---|
54 | 24: ((), ()) # isolate |
---|
55 | } |
---|
56 | |
---|
57 | BOT_TAX_CODE = { |
---|
58 | 1: ((), ("eukaryota")), # domain |
---|
59 | 2: ((), ("plantae", "algae", "fungi")), # kingdom |
---|
60 | 3: ((), ("dikarya")), # subkingdom |
---|
61 | 4: (("phyta", "phycota", "mycota"), ()), # phylum |
---|
62 | 5: (("phytina", "phycotina", "mycotina"), ()), # subphylum |
---|
63 | 6: (("opsida", "phyceae", "mycetes"), ()), # class |
---|
64 | 7: (("idae", "phycidae", "mycetidae"), ()), # subclass |
---|
65 | 8: (("anae"), ()), # superorder |
---|
66 | 9: (("ales"), ()), # order |
---|
67 | 10: (("ineae"), ()), # suborder |
---|
68 | 11: (("aria"), ()), # infraorder |
---|
69 | 12: (("acea"), ()), # superfamily |
---|
70 | 14: (("aceae"), ()), # family |
---|
71 | 15: (("oideae"), ()), # subfamily |
---|
72 | 17: (("eae"), ()), # tribe |
---|
73 | 18: (("inae"), ()), # subtribe |
---|
74 | 20: ((), ()), # genus |
---|
75 | 21: ((), ()), # species |
---|
76 | 22: ((), ()), # subspecies |
---|
77 | 23: ((), ()), # specimen |
---|
78 | 24: ((), ()) # isolate |
---|
79 | } |
---|
80 | |
---|
81 | ZOO_TAX_CODE = { |
---|
82 | 1: ((), ("eukaryota")), # domain |
---|
83 | 2: ((), ("animalia")), # kingdom |
---|
84 | 4: ((), ("chordata", "arthropoda", "mollusca", "nematoda")), # phylum |
---|
85 | 5: ((), ("vertebrata", "myriapoda", "crustacea", "hexapoda")), # subphylum |
---|
86 | 6: ((), ("mammalia", "aves", "reptilia", "amphibia", "insecta")), # class |
---|
87 | 7: ((), ()), # subclass |
---|
88 | 8: ((), ()), # superorder |
---|
89 | 9: ((), ()), # order |
---|
90 | 10: ((), ()), # suborder |
---|
91 | 11: ((), ()), # infraorder |
---|
92 | 12: (("oidea"), ()), # superfamily |
---|
93 | 13: (("oidae"), ()), # epifamily |
---|
94 | 14: (("idae"), ()), # family |
---|
95 | 15: (("inae"), ()), # subfamily |
---|
96 | 16: (("odd"), ()), # infrafamily |
---|
97 | 17: (("ini"), ()), # tribe |
---|
98 | 18: (("ina"), ()), # subtribe |
---|
99 | 19: (("ad", "iti"), ()), # infratribe |
---|
100 | 20: ((), ()), # genus |
---|
101 | 21: ((), ()), # species |
---|
102 | 22: ((), ()), # subspecies |
---|
103 | 23: ((), ()), # specimen |
---|
104 | 24: ((), ()) # isolate |
---|
105 | } |
---|
106 | |
---|
107 | VIR_TAX_CODE = { |
---|
108 | 2: ((), ("viruses")), # kingdom |
---|
109 | 6: ((), ()), # class |
---|
110 | 7: (("idae"), ()), # subclass |
---|
111 | 9: (("virales"), ()), # order |
---|
112 | 14: (("viridae"), ()), # family |
---|
113 | 15: (("virinae"), ()), # subfamily |
---|
114 | 20: (("virus"), ()), # genus |
---|
115 | 21: ((" virus"), ()), # species |
---|
116 | 23: ((), ()), # strain |
---|
117 | 24: ((), ()) # isolate |
---|
118 | } |
---|
119 | |
---|
120 | TAX_CODE_MAP = { |
---|
121 | "bac": BAC_TAX_CODE, |
---|
122 | "bot": BOT_TAX_CODE, |
---|
123 | "zoo": ZOO_TAX_CODE, |
---|
124 | "vir": VIR_TAX_CODE |
---|
125 | } |
---|
126 | |
---|
127 | def __init__(self, tax_code_name): |
---|
128 | self.tax_code = TaxCode.TAX_CODE_MAP.get(tax_code_name.lower(), None) |
---|
129 | if not self.tax_code: |
---|
130 | print "ERROR: Unknown taxonomic code: %s" % tax_code_name |
---|
131 | sys.exit() |
---|
132 | |
---|
133 | @staticmethod |
---|
134 | def rank_level_name(uni_rank_level): |
---|
135 | return TaxCode.UNI_TAX_RANKS.get(uni_rank_level, ("Unknown", "?__")) |
---|
136 | |
---|
137 | def guess_rank_level(self, ranks, rank_level): |
---|
138 | rank_name = re.sub("[\W_]+", "", ranks[rank_level].lower()) |
---|
139 | |
---|
140 | sorted_tax_levels = sorted(self.tax_code.keys()) |
---|
141 | |
---|
142 | real_level = 0 |
---|
143 | |
---|
144 | # first, try to guess rank level based on its name or name suffix |
---|
145 | for lvl in sorted_tax_levels: |
---|
146 | (suffix, exact_match) = self.tax_code[lvl] |
---|
147 | if rank_name.endswith(suffix) or rank_name in exact_match: |
---|
148 | real_level = lvl |
---|
149 | break |
---|
150 | |
---|
151 | # if name-based identification failed, try to derive rank level from its parent |
---|
152 | if real_level == 0: |
---|
153 | if rank_level == 0: # kingdom |
---|
154 | real_level = 2 |
---|
155 | else: |
---|
156 | parent_level = self.guess_rank_level(ranks, rank_level-1) |
---|
157 | if parent_level == 0: |
---|
158 | return 0 |
---|
159 | idx = sorted_tax_levels.index(parent_level) |
---|
160 | for i in range(idx+1, len(sorted_tax_levels)): |
---|
161 | lvl = sorted_tax_levels[i] |
---|
162 | suffix = self.tax_code[lvl][0] |
---|
163 | if lvl in TaxCode.STD_RANKS: |
---|
164 | real_level = lvl |
---|
165 | break |
---|
166 | |
---|
167 | return real_level |
---|
168 | |
---|
169 | def guess_rank_level_name(self, ranks, rank_level): |
---|
170 | real_level = self.guess_rank_level(ranks, rank_level) |
---|
171 | return TaxCode.rank_level_name(real_level) |
---|
172 | |
---|
173 | |
---|
174 | class Taxonomy: |
---|
175 | EMPTY_RANK = "-" |
---|
176 | RANK_UID_DELIM = "@@" |
---|
177 | |
---|
178 | RANK_PLACEHOLDERS = ["k__", "p__", "c__", "o__", "f__", "g__", "s__"] |
---|
179 | |
---|
180 | @staticmethod |
---|
181 | def lineage_str(ranks): |
---|
182 | return ";".join(ranks).strip(';') |
---|
183 | |
---|
184 | @staticmethod |
---|
185 | def lowest_assigned_rank_level(ranks): |
---|
186 | rank_level = len(ranks)-1 |
---|
187 | while rank_level >= 0 and ranks[rank_level] == Taxonomy.EMPTY_RANK: |
---|
188 | rank_level -= 1 |
---|
189 | return rank_level |
---|
190 | |
---|
191 | @staticmethod |
---|
192 | def lowest_assigned_rank(ranks): |
---|
193 | rank_level = Taxonomy.lowest_assigned_rank_level(ranks) |
---|
194 | if rank_level >= 0: |
---|
195 | return ranks[rank_level] |
---|
196 | else: |
---|
197 | return None |
---|
198 | |
---|
199 | @staticmethod |
---|
200 | def get_rank_uid(ranks, rank_level=-1): |
---|
201 | if rank_level == -1: |
---|
202 | rank_level = Taxonomy.lowest_assigned_rank_level(ranks) |
---|
203 | return Taxonomy.RANK_UID_DELIM.join(ranks[:rank_level+1]) |
---|
204 | |
---|
205 | @staticmethod |
---|
206 | def split_rank_uid(rank_uid, min_lvls=0): |
---|
207 | ranks = rank_uid.split(Taxonomy.RANK_UID_DELIM) |
---|
208 | if len(ranks) < min_lvls: |
---|
209 | return ranks + [Taxonomy.EMPTY_RANK] * (min_lvls - len(ranks)) |
---|
210 | else: |
---|
211 | return ranks |
---|
212 | |
---|
213 | @staticmethod |
---|
214 | def rank_uid_to_lineage_str(rank_uid, min_lvls=0): |
---|
215 | ranks = Taxonomy.split_rank_uid(rank_uid, min_lvls) |
---|
216 | return Taxonomy.lineage_str(ranks) |
---|
217 | |
---|
218 | def __init__(self, prefix="", tax_fname="", tax_map=None): |
---|
219 | self.prefix = prefix |
---|
220 | |
---|
221 | tree_nodes = [] |
---|
222 | self.common_ranks = set([]) |
---|
223 | self.seq_ranks_map = {} |
---|
224 | self.rank_seqs_map = {} |
---|
225 | self.corr_seq_ids = {} |
---|
226 | self.uncorr_rank_ids = {} |
---|
227 | if tax_map: |
---|
228 | for sid, ranks in tax_map.iteritems(): |
---|
229 | self.seq_ranks_map[sid] = ranks |
---|
230 | rank_id = Taxonomy.get_rank_uid(ranks) |
---|
231 | self.rank_seqs_map[rank_id] = self.rank_seqs_map.get(rank_id, []) + [sid] |
---|
232 | elif tax_fname: |
---|
233 | self.load_taxonomy(tax_fname) |
---|
234 | |
---|
235 | def get_common_ranks(self): |
---|
236 | allranks = list(self.seq_ranks_map.items()) |
---|
237 | numitems = len(allranks) |
---|
238 | if numitems > 0: |
---|
239 | self.common_ranks = set(allranks[0][1]) |
---|
240 | for i in range(1, numitems): |
---|
241 | curr_set = set(allranks[i][1]) |
---|
242 | inters = self.common_ranks & curr_set |
---|
243 | self.common_ranks = inters |
---|
244 | return self.common_ranks |
---|
245 | else: |
---|
246 | return set([]) |
---|
247 | |
---|
248 | def seq_count(self): |
---|
249 | return len(self.seq_ranks_map) |
---|
250 | |
---|
251 | def items(self): |
---|
252 | return self.seq_ranks_map.items() |
---|
253 | |
---|
254 | def iteritems(self): |
---|
255 | return self.seq_ranks_map.items() |
---|
256 | |
---|
257 | def get_map(self): |
---|
258 | return self.seq_ranks_map |
---|
259 | |
---|
260 | def remove_seq(self, seqid): |
---|
261 | rank_id = self.seq_rank_id(seqid) |
---|
262 | self.rank_seqs_map[rank_id].remove(seq_id) |
---|
263 | del self.seq_ranks_map[seqid] |
---|
264 | |
---|
265 | def rename_seq(self, old_seqid, new_seqid): |
---|
266 | rank_id = self.seq_rank_id(old_seqid) |
---|
267 | self.rank_seqs_map[rank_id].remove(old_seqid) |
---|
268 | self.rank_seqs_map[rank_id].append(new_seqid) |
---|
269 | self.seq_ranks_map[new_seqid] = self.seq_ranks_map[old_seqid] |
---|
270 | del self.seq_ranks_map[old_seqid] |
---|
271 | |
---|
272 | def get_seq_ranks(self, seq_id): |
---|
273 | if not seq_id.startswith(self.prefix): |
---|
274 | seq_id = self.prefix + seq_id |
---|
275 | seq_id = self.corr_seq_ids.get(seq_id, seq_id) |
---|
276 | return self.seq_ranks_map[seq_id] |
---|
277 | |
---|
278 | def seq_lineage_str(self, seq_id): |
---|
279 | ranks = list(self.get_seq_ranks(seq_id)) |
---|
280 | return Taxonomy.lineage_str(ranks) |
---|
281 | |
---|
282 | def seq_rank_id(self, seq_id): |
---|
283 | ranks = list(self.get_seq_ranks(seq_id)) |
---|
284 | return Taxonomy.get_rank_uid(ranks) |
---|
285 | |
---|
286 | def get_rank_seqs(self, rank_id): |
---|
287 | return self.rank_seqs_map.get(rank_id, []) |
---|
288 | |
---|
289 | def get_rank_seq_count(self, rank_id): |
---|
290 | return len(self.rank_seqs_map.get(rank_id, [])) |
---|
291 | |
---|
292 | def get_uncorr_rank_id(self, rank_id): |
---|
293 | return self.uncorr_rank_ids.get(rank_id, rank_id) |
---|
294 | |
---|
295 | def merge_ranks(self, rank_ids, name_prefix="__TAXCLUSTER__"): |
---|
296 | if len(rank_ids) < 2: |
---|
297 | return None |
---|
298 | |
---|
299 | all_sid = [] |
---|
300 | for rank_id in rank_ids: |
---|
301 | all_sid += self.get_rank_seqs(rank_id) |
---|
302 | del self.rank_seqs_map[rank_id] |
---|
303 | |
---|
304 | first_taxon = Taxonomy.split_rank_uid(rank_ids[0]) |
---|
305 | merge_lvl = Taxonomy.lowest_assigned_rank_level(first_taxon) |
---|
306 | new_name = name_prefix + first_taxon[merge_lvl] |
---|
307 | new_rank = first_taxon[:merge_lvl] + [new_name] |
---|
308 | new_rank_id = Taxonomy.get_rank_uid(new_rank) |
---|
309 | |
---|
310 | self.rank_seqs_map[new_rank_id] = all_sid |
---|
311 | |
---|
312 | for sid in all_sid: |
---|
313 | self.seq_ranks_map[sid] = new_rank |
---|
314 | |
---|
315 | return new_rank_id |
---|
316 | |
---|
317 | def load_taxonomy(self, tax_fname): |
---|
318 | with open(tax_fname) as fin: |
---|
319 | for line in fin: |
---|
320 | line = line.strip() |
---|
321 | toks = line.split("\t") |
---|
322 | sid = self.prefix + toks[0] |
---|
323 | ranks_str = toks[1] |
---|
324 | ranks = ranks_str.split(";") |
---|
325 | for i in range(len(ranks)): |
---|
326 | rank_name = ranks[i].strip() |
---|
327 | # if rank_name in GGTaxonomyFile.rank_placeholders: |
---|
328 | # rank_name = Taxonomy.EMPTY_RANK |
---|
329 | ranks[i] = rank_name |
---|
330 | rank_id = Taxonomy.get_rank_uid(ranks) |
---|
331 | |
---|
332 | self.seq_ranks_map[sid] = ranks |
---|
333 | self.rank_seqs_map[rank_id] = self.rank_seqs_map.get(rank_id, []) + [sid] |
---|
334 | |
---|
335 | def normalize_rank_names(self): |
---|
336 | invalid_chars = "[](),;:'" |
---|
337 | sub_chars = "_" * len(invalid_chars) |
---|
338 | trantab = maketrans(invalid_chars, sub_chars) |
---|
339 | corr_ranks = {} |
---|
340 | for sid, ranks in self.seq_ranks_map.iteritems(): |
---|
341 | old_rank_id = Taxonomy.get_rank_uid(ranks) |
---|
342 | for i in range(len(ranks)): |
---|
343 | if ranks[i] in corr_ranks: |
---|
344 | ranks[i] = corr_ranks[ranks[i]] |
---|
345 | else: |
---|
346 | new_rank_name = ranks[i].translate(trantab); |
---|
347 | if new_rank_name != ranks[i]: |
---|
348 | corr_ranks[ranks[i]] = new_rank_name |
---|
349 | ranks[i] = new_rank_name |
---|
350 | |
---|
351 | new_rank_id = Taxonomy.get_rank_uid(ranks) |
---|
352 | if new_rank_id != old_rank_id and old_rank_id in self.rank_seqs_map: |
---|
353 | self.rank_seqs_map[new_rank_id] = self.rank_seqs_map[old_rank_id] |
---|
354 | del self.rank_seqs_map[old_rank_id] |
---|
355 | self.uncorr_rank_ids[new_rank_id] = old_rank_id |
---|
356 | |
---|
357 | return corr_ranks |
---|
358 | |
---|
359 | def normalize_seq_ids(self): |
---|
360 | invalid_chars = "[](),;:' " |
---|
361 | sub_chars = "_" * len(invalid_chars) |
---|
362 | trantab = maketrans(invalid_chars, sub_chars) |
---|
363 | self.corr_seq_ids = {} |
---|
364 | for old_sid in self.seq_ranks_map.iterkeys(): |
---|
365 | new_sid = old_sid.translate(trantab); |
---|
366 | if new_sid != old_sid: |
---|
367 | self.rename_seq(old_sid, new_sid) |
---|
368 | self.corr_seq_ids[old_sid] = new_sid |
---|
369 | |
---|
370 | return self.corr_seq_ids |
---|
371 | |
---|
372 | def close_taxonomy_gaps(self): |
---|
373 | for sid, ranks in self.seq_ranks_map.iteritems(): |
---|
374 | last_rank = None |
---|
375 | gap_count = 0 |
---|
376 | for i in reversed(range(1, len(ranks))): |
---|
377 | if ranks[i] != Taxonomy.EMPTY_RANK: |
---|
378 | last_rank = ranks[i] |
---|
379 | elif last_rank: |
---|
380 | gap_count += 1 |
---|
381 | ranks[i] = "parent%d_" % gap_count + last_rank |
---|
382 | |
---|
383 | def check_for_duplicates(self, autofix=False): |
---|
384 | parent_map = {} |
---|
385 | dups = [] |
---|
386 | old_fixed = {} |
---|
387 | old_ranks = {} |
---|
388 | for sid, ranks in self.seq_ranks_map.iteritems(): |
---|
389 | for i in range(1, len(ranks)): |
---|
390 | if ranks[i] == Taxonomy.EMPTY_RANK: |
---|
391 | break |
---|
392 | parent = ranks[i-1] |
---|
393 | if not ranks[i] in parent_map: |
---|
394 | parent_map[ranks[i]] = sid |
---|
395 | else: |
---|
396 | old_sid = parent_map[ranks[i]] |
---|
397 | if self.get_seq_ranks(old_sid)[i-1] != parent: |
---|
398 | if autofix: |
---|
399 | orig_name = self.lineage_str(sid) |
---|
400 | orig_old_name = self.lineage_str(old_sid) |
---|
401 | |
---|
402 | if old_sid not in old_fixed: |
---|
403 | old_fixed[old_sid] = self.lineage_str(old_sid) |
---|
404 | old_ranks[old_sid] = set([]) |
---|
405 | |
---|
406 | if i not in old_ranks[old_sid]: |
---|
407 | self.seq_ranks_map[old_sid][i] += "_" + self.seq_ranks_map[old_sid][i-1] |
---|
408 | old_ranks[old_sid].add(i) |
---|
409 | |
---|
410 | self.seq_ranks_map[sid][i] = ranks[i] + "_" + parent |
---|
411 | dup_rec = (old_sid, orig_old_name, sid, orig_name, self.lineage_str(sid)) |
---|
412 | else: |
---|
413 | dup_rec = (old_sid, self.lineage_str(old_sid), sid, self.lineage_str(sid)) |
---|
414 | dups.append(dup_rec) |
---|
415 | |
---|
416 | if autofix: |
---|
417 | for sid, orig_name in old_fixed.iteritems(): |
---|
418 | dup_rec = (sid, orig_name, sid, orig_name, self.lineage_str(sid)) |
---|
419 | dups.append(dup_rec) |
---|
420 | |
---|
421 | return dups |
---|
422 | |
---|
423 | def check_for_disbalance(self, autofix=False): |
---|
424 | # the next block finds "orphan" ranks - could be used to decide which ranks to drop (not used now) |
---|
425 | if 0: |
---|
426 | child_map = {} |
---|
427 | for sid, ranks in self.seq_ranks_map.iteritems(): |
---|
428 | for i in range(1, len(ranks)): |
---|
429 | parent = "%d_%s" % (i-1, ranks[i-1]) |
---|
430 | if parent not in child_map: |
---|
431 | child_map[parent] = set([ranks[i]]) |
---|
432 | else: |
---|
433 | child_map[parent].add(ranks[i]) |
---|
434 | |
---|
435 | errs = [] |
---|
436 | for sid, ranks in self.seq_ranks_map.iteritems(): |
---|
437 | if len(ranks) > 7: |
---|
438 | if autofix: |
---|
439 | orig_name = self.lineage_str(sid) |
---|
440 | |
---|
441 | dropq = [] |
---|
442 | keepq = [] |
---|
443 | restq = [] |
---|
444 | for i in range(1, len(ranks)): |
---|
445 | # drop Subclass and Suborder, preserve Order and Family (based on common suffixes) |
---|
446 | if ranks[i].endswith("dae") or ranks[i].endswith("neae"): |
---|
447 | dropq += [i] |
---|
448 | elif ranks[i].endswith("ceae") or ranks[i].endswith("ales"): |
---|
449 | keepq += [i] |
---|
450 | else: |
---|
451 | restq += [i] |
---|
452 | |
---|
453 | to_remove = dropq + restq + keepq |
---|
454 | to_remove = to_remove[:len(ranks)-7] |
---|
455 | |
---|
456 | new_ranks = [] |
---|
457 | for i in range(len(ranks)): |
---|
458 | if i not in to_remove: |
---|
459 | new_ranks += [ranks[i]] |
---|
460 | |
---|
461 | self.seq_ranks_map[sid] = new_ranks |
---|
462 | |
---|
463 | err_rec = (sid, orig_name, self.lineage_str(sid)) |
---|
464 | else: |
---|
465 | err_rec = (sid, self.lineage_str(sid)) |
---|
466 | errs.append(err_rec) |
---|
467 | |
---|
468 | return errs |
---|
469 | |
---|
470 | |
---|
471 | class TaxTreeBuilder: |
---|
472 | ROOT_LABEL = "<<root>>" |
---|
473 | |
---|
474 | def __init__(self, config, taxonomy): |
---|
475 | self.tree_nodes = {} |
---|
476 | self.leaf_count = {} |
---|
477 | self.config = config |
---|
478 | self.taxonomy = taxonomy |
---|
479 | |
---|
480 | def prune_unifu_nodes(self, tree): |
---|
481 | for node in tree.traverse("preorder"): |
---|
482 | if len(node.children) == 1: |
---|
483 | node.delete() |
---|
484 | |
---|
485 | def add_tree_node(self, tree, nodeId, ranks, rank_level): |
---|
486 | if rank_level >= 0: |
---|
487 | parent_level = rank_level |
---|
488 | while ranks[parent_level] == Taxonomy.EMPTY_RANK: |
---|
489 | parent_level -= 1 |
---|
490 | parentId = Taxonomy.get_rank_uid(ranks, parent_level) |
---|
491 | else: |
---|
492 | parentId = TaxTreeBuilder.ROOT_LABEL |
---|
493 | parent_level = -1 |
---|
494 | |
---|
495 | if (parentId in self.tree_nodes): |
---|
496 | parentNode = self.tree_nodes[parentId] |
---|
497 | else: |
---|
498 | parentNode = self.add_tree_node(tree, parentId, ranks, parent_level-1) |
---|
499 | self.tree_nodes[parentId] = parentNode; |
---|
500 | |
---|
501 | # print "Adding node: %s, parent: %s, parent_level: %d" % (nodeId, parentId, parent_level) |
---|
502 | newNode = parentNode.add_child() |
---|
503 | newNode.add_feature("name", nodeId) |
---|
504 | return newNode |
---|
505 | |
---|
506 | def build(self, min_rank=0, max_seqs_per_leaf=1e9, clades_to_include=[], clades_to_ignore=[]): |
---|
507 | |
---|
508 | t0 = Tree() |
---|
509 | t0.add_feature("name", TaxTreeBuilder.ROOT_LABEL) |
---|
510 | self.tree_nodes[TaxTreeBuilder.ROOT_LABEL] = t0; |
---|
511 | self.leaf_count[TaxTreeBuilder.ROOT_LABEL] = 0; |
---|
512 | k = 0 |
---|
513 | added = 0 |
---|
514 | seq_ids = [] |
---|
515 | # sequences are leafs of the tree, so they always have the lowest taxonomy level (e.g. "species"+1) |
---|
516 | for sid, ranks in self.taxonomy.iteritems(): |
---|
517 | k += 1 |
---|
518 | if self.config.verbose and k % 1000 == 0: |
---|
519 | print "Processed nodes: ", k, ", added: ", added, ", skipped: ", k - added |
---|
520 | |
---|
521 | # filter by minimum rank level |
---|
522 | if ranks[min_rank] == Taxonomy.EMPTY_RANK: |
---|
523 | continue |
---|
524 | |
---|
525 | # filter by rank contraints (e.g. class Clostridia only) |
---|
526 | clade_is_ok = False |
---|
527 | |
---|
528 | # check against the inclusion list |
---|
529 | if len(clades_to_include) > 0: |
---|
530 | for (rank_level, rank_name) in clades_to_include: |
---|
531 | if ranks[rank_level] == rank_name: |
---|
532 | clade_is_ok = True |
---|
533 | break |
---|
534 | else: # default: include all |
---|
535 | clade_is_ok = True |
---|
536 | |
---|
537 | # if sequence is about to be included, check it against the ignore list |
---|
538 | if clade_is_ok: |
---|
539 | for (rank_level, rank_name) in clades_to_ignore: |
---|
540 | if ranks[rank_level] == rank_name: |
---|
541 | clade_is_ok = False |
---|
542 | break |
---|
543 | |
---|
544 | # final decision |
---|
545 | if not clade_is_ok: |
---|
546 | continue |
---|
547 | |
---|
548 | tax_seq_level = len(ranks) |
---|
549 | parent_level = tax_seq_level - 1 |
---|
550 | while ranks[parent_level] == Taxonomy.EMPTY_RANK: |
---|
551 | parent_level -= 1 |
---|
552 | parent_name = Taxonomy.get_rank_uid(ranks, parent_level) |
---|
553 | if parent_name in self.tree_nodes: |
---|
554 | parent_node = self.tree_nodes[parent_name] |
---|
555 | # max_seq_per_rank = max_seqs_per_leaf * (tax_seq_level - parent_level) |
---|
556 | if parent_level == tax_seq_level - 1: |
---|
557 | max_seq_per_rank = max_seqs_per_leaf # * (tax_seq_level - parent_level) |
---|
558 | if parent_name in self.leaf_count and self.leaf_count[parent_name] >= max_seq_per_rank: |
---|
559 | continue |
---|
560 | |
---|
561 | self.leaf_count[parent_name] = self.leaf_count.get(parent_name, 0) + 1 |
---|
562 | |
---|
563 | # all checks succeeded: add the sequence to the tree |
---|
564 | self.add_tree_node(t0, sid, ranks, parent_level) |
---|
565 | seq_ids += [sid] |
---|
566 | added += 1 |
---|
567 | |
---|
568 | self.config.log.debug("Total nodes in resulting tree: %d", added) |
---|
569 | |
---|
570 | if self.config.debug: |
---|
571 | reftax_fname = self.config.tmp_fname("%NAME%_mf_unpruned.tre") |
---|
572 | t0.write(outfile=reftax_fname, format=8) |
---|
573 | |
---|
574 | self.prune_unifu_nodes(t0) |
---|
575 | return t0, seq_ids |
---|