| 1 | #!/usr/bin/env python |
|---|
| 2 | |
|---|
| 3 | import sys |
|---|
| 4 | import re |
|---|
| 5 | from string import maketrans |
|---|
| 6 | from ete2 import Tree |
|---|
| 7 | |
|---|
| 8 | class TaxCode: |
|---|
| 9 | UNI_TAX_RANKS = { |
|---|
| 10 | 1: ("Domain", "d__"), |
|---|
| 11 | 2: ("Kingdom", "k__"), |
|---|
| 12 | 3: ("Subkingdom", "a__"), |
|---|
| 13 | 4: ("Phylum", "p__"), |
|---|
| 14 | 5: ("Subphylum", "a__"), |
|---|
| 15 | 6: ("Class", "c__"), |
|---|
| 16 | 7: ("Subclass", "x__"), |
|---|
| 17 | 8: ("Superorder", "e__"), |
|---|
| 18 | 9: ("Order", "o__"), |
|---|
| 19 | 10: ("Suborder", "h__"), |
|---|
| 20 | 11: ("Infraorder", "i__"), |
|---|
| 21 | 12: ("Superfamily", "j__"), |
|---|
| 22 | 13: ("Epifamily", "l__"), |
|---|
| 23 | 14: ("Family", "f__"), |
|---|
| 24 | 15: ("Subfamily", "m__"), |
|---|
| 25 | 16: ("Infrafamily", "n__"), |
|---|
| 26 | 17: ("Tribe", "t__"), |
|---|
| 27 | 18: ("Subtribe", "u__"), |
|---|
| 28 | 19: ("Infratribe", "v__"), |
|---|
| 29 | 20: ("Genus", "g__"), |
|---|
| 30 | 21: ("Species", "s__"), |
|---|
| 31 | 22: ("Subspecies", "b__"), |
|---|
| 32 | 23: ("Strain", "r__"), |
|---|
| 33 | 24: ("Isolate", "q__") |
|---|
| 34 | } |
|---|
| 35 | UNI_TAX_LEVELS = len(UNI_TAX_RANKS) |
|---|
| 36 | |
|---|
| 37 | # ranks of the standard 7-levels taxonomy |
|---|
| 38 | # kingdom, phylum, class, order, family, genus, species |
|---|
| 39 | STD_RANKS = [2, 4, 6, 9, 14, 20, 21] |
|---|
| 40 | |
|---|
| 41 | BAC_TAX_CODE = { |
|---|
| 42 | 2: ((), ("bacteria", "archaea")), # kingdom |
|---|
| 43 | 4: ((), ()), # phylum |
|---|
| 44 | 6: ((), ()), # class |
|---|
| 45 | 7: (("idae"), ()), # subclass |
|---|
| 46 | 9: (("ales"), ()), # order |
|---|
| 47 | 10: (("ineae"), ()), # suborder |
|---|
| 48 | 14: (("aceae"), ()), # family |
|---|
| 49 | 15: (("oideae"), ()), # subfamily |
|---|
| 50 | 20: ((), ()), # genus |
|---|
| 51 | 21: ((), ()), # species |
|---|
| 52 | 22: ((), ()), # subspecies |
|---|
| 53 | 23: ((), ()), # strain |
|---|
| 54 | 24: ((), ()) # isolate |
|---|
| 55 | } |
|---|
| 56 | |
|---|
| 57 | BOT_TAX_CODE = { |
|---|
| 58 | 1: ((), ("eukaryota")), # domain |
|---|
| 59 | 2: ((), ("plantae", "algae", "fungi")), # kingdom |
|---|
| 60 | 3: ((), ("dikarya")), # subkingdom |
|---|
| 61 | 4: (("phyta", "phycota", "mycota"), ()), # phylum |
|---|
| 62 | 5: (("phytina", "phycotina", "mycotina"), ()), # subphylum |
|---|
| 63 | 6: (("opsida", "phyceae", "mycetes"), ()), # class |
|---|
| 64 | 7: (("idae", "phycidae", "mycetidae"), ()), # subclass |
|---|
| 65 | 8: (("anae"), ()), # superorder |
|---|
| 66 | 9: (("ales"), ()), # order |
|---|
| 67 | 10: (("ineae"), ()), # suborder |
|---|
| 68 | 11: (("aria"), ()), # infraorder |
|---|
| 69 | 12: (("acea"), ()), # superfamily |
|---|
| 70 | 14: (("aceae"), ()), # family |
|---|
| 71 | 15: (("oideae"), ()), # subfamily |
|---|
| 72 | 17: (("eae"), ()), # tribe |
|---|
| 73 | 18: (("inae"), ()), # subtribe |
|---|
| 74 | 20: ((), ()), # genus |
|---|
| 75 | 21: ((), ()), # species |
|---|
| 76 | 22: ((), ()), # subspecies |
|---|
| 77 | 23: ((), ()), # specimen |
|---|
| 78 | 24: ((), ()) # isolate |
|---|
| 79 | } |
|---|
| 80 | |
|---|
| 81 | ZOO_TAX_CODE = { |
|---|
| 82 | 1: ((), ("eukaryota")), # domain |
|---|
| 83 | 2: ((), ("animalia")), # kingdom |
|---|
| 84 | 4: ((), ("chordata", "arthropoda", "mollusca", "nematoda")), # phylum |
|---|
| 85 | 5: ((), ("vertebrata", "myriapoda", "crustacea", "hexapoda")), # subphylum |
|---|
| 86 | 6: ((), ("mammalia", "aves", "reptilia", "amphibia", "insecta")), # class |
|---|
| 87 | 7: ((), ()), # subclass |
|---|
| 88 | 8: ((), ()), # superorder |
|---|
| 89 | 9: ((), ()), # order |
|---|
| 90 | 10: ((), ()), # suborder |
|---|
| 91 | 11: ((), ()), # infraorder |
|---|
| 92 | 12: (("oidea"), ()), # superfamily |
|---|
| 93 | 13: (("oidae"), ()), # epifamily |
|---|
| 94 | 14: (("idae"), ()), # family |
|---|
| 95 | 15: (("inae"), ()), # subfamily |
|---|
| 96 | 16: (("odd"), ()), # infrafamily |
|---|
| 97 | 17: (("ini"), ()), # tribe |
|---|
| 98 | 18: (("ina"), ()), # subtribe |
|---|
| 99 | 19: (("ad", "iti"), ()), # infratribe |
|---|
| 100 | 20: ((), ()), # genus |
|---|
| 101 | 21: ((), ()), # species |
|---|
| 102 | 22: ((), ()), # subspecies |
|---|
| 103 | 23: ((), ()), # specimen |
|---|
| 104 | 24: ((), ()) # isolate |
|---|
| 105 | } |
|---|
| 106 | |
|---|
| 107 | VIR_TAX_CODE = { |
|---|
| 108 | 2: ((), ("viruses")), # kingdom |
|---|
| 109 | 6: ((), ()), # class |
|---|
| 110 | 7: (("idae"), ()), # subclass |
|---|
| 111 | 9: (("virales"), ()), # order |
|---|
| 112 | 14: (("viridae"), ()), # family |
|---|
| 113 | 15: (("virinae"), ()), # subfamily |
|---|
| 114 | 20: (("virus"), ()), # genus |
|---|
| 115 | 21: ((" virus"), ()), # species |
|---|
| 116 | 23: ((), ()), # strain |
|---|
| 117 | 24: ((), ()) # isolate |
|---|
| 118 | } |
|---|
| 119 | |
|---|
| 120 | TAX_CODE_MAP = { |
|---|
| 121 | "bac": BAC_TAX_CODE, |
|---|
| 122 | "bot": BOT_TAX_CODE, |
|---|
| 123 | "zoo": ZOO_TAX_CODE, |
|---|
| 124 | "vir": VIR_TAX_CODE |
|---|
| 125 | } |
|---|
| 126 | |
|---|
| 127 | def __init__(self, tax_code_name): |
|---|
| 128 | self.tax_code = TaxCode.TAX_CODE_MAP.get(tax_code_name.lower(), None) |
|---|
| 129 | if not self.tax_code: |
|---|
| 130 | print "ERROR: Unknown taxonomic code: %s" % tax_code_name |
|---|
| 131 | sys.exit() |
|---|
| 132 | |
|---|
| 133 | @staticmethod |
|---|
| 134 | def rank_level_name(uni_rank_level): |
|---|
| 135 | return TaxCode.UNI_TAX_RANKS.get(uni_rank_level, ("Unknown", "?__")) |
|---|
| 136 | |
|---|
| 137 | def guess_rank_level(self, ranks, rank_level): |
|---|
| 138 | rank_name = re.sub("[\W_]+", "", ranks[rank_level].lower()) |
|---|
| 139 | |
|---|
| 140 | sorted_tax_levels = sorted(self.tax_code.keys()) |
|---|
| 141 | |
|---|
| 142 | real_level = 0 |
|---|
| 143 | |
|---|
| 144 | # first, try to guess rank level based on its name or name suffix |
|---|
| 145 | for lvl in sorted_tax_levels: |
|---|
| 146 | (suffix, exact_match) = self.tax_code[lvl] |
|---|
| 147 | if rank_name.endswith(suffix) or rank_name in exact_match: |
|---|
| 148 | real_level = lvl |
|---|
| 149 | break |
|---|
| 150 | |
|---|
| 151 | # if name-based identification failed, try to derive rank level from its parent |
|---|
| 152 | if real_level == 0: |
|---|
| 153 | if rank_level == 0: # kingdom |
|---|
| 154 | real_level = 2 |
|---|
| 155 | else: |
|---|
| 156 | parent_level = self.guess_rank_level(ranks, rank_level-1) |
|---|
| 157 | if parent_level == 0: |
|---|
| 158 | return 0 |
|---|
| 159 | idx = sorted_tax_levels.index(parent_level) |
|---|
| 160 | for i in range(idx+1, len(sorted_tax_levels)): |
|---|
| 161 | lvl = sorted_tax_levels[i] |
|---|
| 162 | suffix = self.tax_code[lvl][0] |
|---|
| 163 | if lvl in TaxCode.STD_RANKS: |
|---|
| 164 | real_level = lvl |
|---|
| 165 | break |
|---|
| 166 | |
|---|
| 167 | return real_level |
|---|
| 168 | |
|---|
| 169 | def guess_rank_level_name(self, ranks, rank_level): |
|---|
| 170 | real_level = self.guess_rank_level(ranks, rank_level) |
|---|
| 171 | return TaxCode.rank_level_name(real_level) |
|---|
| 172 | |
|---|
| 173 | |
|---|
| 174 | class Taxonomy: |
|---|
| 175 | EMPTY_RANK = "-" |
|---|
| 176 | RANK_UID_DELIM = "@@" |
|---|
| 177 | |
|---|
| 178 | RANK_PLACEHOLDERS = ["k__", "p__", "c__", "o__", "f__", "g__", "s__"] |
|---|
| 179 | |
|---|
| 180 | @staticmethod |
|---|
| 181 | def lineage_str(ranks): |
|---|
| 182 | return ";".join(ranks).strip(';') |
|---|
| 183 | |
|---|
| 184 | @staticmethod |
|---|
| 185 | def lowest_assigned_rank_level(ranks): |
|---|
| 186 | rank_level = len(ranks)-1 |
|---|
| 187 | while rank_level >= 0 and ranks[rank_level] == Taxonomy.EMPTY_RANK: |
|---|
| 188 | rank_level -= 1 |
|---|
| 189 | return rank_level |
|---|
| 190 | |
|---|
| 191 | @staticmethod |
|---|
| 192 | def lowest_assigned_rank(ranks): |
|---|
| 193 | rank_level = Taxonomy.lowest_assigned_rank_level(ranks) |
|---|
| 194 | if rank_level >= 0: |
|---|
| 195 | return ranks[rank_level] |
|---|
| 196 | else: |
|---|
| 197 | return None |
|---|
| 198 | |
|---|
| 199 | @staticmethod |
|---|
| 200 | def get_rank_uid(ranks, rank_level=-1): |
|---|
| 201 | if rank_level == -1: |
|---|
| 202 | rank_level = Taxonomy.lowest_assigned_rank_level(ranks) |
|---|
| 203 | return Taxonomy.RANK_UID_DELIM.join(ranks[:rank_level+1]) |
|---|
| 204 | |
|---|
| 205 | @staticmethod |
|---|
| 206 | def split_rank_uid(rank_uid, min_lvls=0): |
|---|
| 207 | ranks = rank_uid.split(Taxonomy.RANK_UID_DELIM) |
|---|
| 208 | if len(ranks) < min_lvls: |
|---|
| 209 | return ranks + [Taxonomy.EMPTY_RANK] * (min_lvls - len(ranks)) |
|---|
| 210 | else: |
|---|
| 211 | return ranks |
|---|
| 212 | |
|---|
| 213 | @staticmethod |
|---|
| 214 | def rank_uid_to_lineage_str(rank_uid, min_lvls=0): |
|---|
| 215 | ranks = Taxonomy.split_rank_uid(rank_uid, min_lvls) |
|---|
| 216 | return Taxonomy.lineage_str(ranks) |
|---|
| 217 | |
|---|
| 218 | def __init__(self, prefix="", tax_fname="", tax_map=None): |
|---|
| 219 | self.prefix = prefix |
|---|
| 220 | |
|---|
| 221 | tree_nodes = [] |
|---|
| 222 | self.common_ranks = set([]) |
|---|
| 223 | self.seq_ranks_map = {} |
|---|
| 224 | self.rank_seqs_map = {} |
|---|
| 225 | self.corr_seq_ids = {} |
|---|
| 226 | self.uncorr_rank_ids = {} |
|---|
| 227 | if tax_map: |
|---|
| 228 | for sid, ranks in tax_map.iteritems(): |
|---|
| 229 | self.seq_ranks_map[sid] = ranks |
|---|
| 230 | rank_id = Taxonomy.get_rank_uid(ranks) |
|---|
| 231 | self.rank_seqs_map[rank_id] = self.rank_seqs_map.get(rank_id, []) + [sid] |
|---|
| 232 | elif tax_fname: |
|---|
| 233 | self.load_taxonomy(tax_fname) |
|---|
| 234 | |
|---|
| 235 | def get_common_ranks(self): |
|---|
| 236 | allranks = list(self.seq_ranks_map.items()) |
|---|
| 237 | numitems = len(allranks) |
|---|
| 238 | if numitems > 0: |
|---|
| 239 | self.common_ranks = set(allranks[0][1]) |
|---|
| 240 | for i in range(1, numitems): |
|---|
| 241 | curr_set = set(allranks[i][1]) |
|---|
| 242 | inters = self.common_ranks & curr_set |
|---|
| 243 | self.common_ranks = inters |
|---|
| 244 | return self.common_ranks |
|---|
| 245 | else: |
|---|
| 246 | return set([]) |
|---|
| 247 | |
|---|
| 248 | def seq_count(self): |
|---|
| 249 | return len(self.seq_ranks_map) |
|---|
| 250 | |
|---|
| 251 | def items(self): |
|---|
| 252 | return self.seq_ranks_map.items() |
|---|
| 253 | |
|---|
| 254 | def iteritems(self): |
|---|
| 255 | return self.seq_ranks_map.items() |
|---|
| 256 | |
|---|
| 257 | def get_map(self): |
|---|
| 258 | return self.seq_ranks_map |
|---|
| 259 | |
|---|
| 260 | def remove_seq(self, seqid): |
|---|
| 261 | rank_id = self.seq_rank_id(seqid) |
|---|
| 262 | self.rank_seqs_map[rank_id].remove(seq_id) |
|---|
| 263 | del self.seq_ranks_map[seqid] |
|---|
| 264 | |
|---|
| 265 | def rename_seq(self, old_seqid, new_seqid): |
|---|
| 266 | rank_id = self.seq_rank_id(old_seqid) |
|---|
| 267 | self.rank_seqs_map[rank_id].remove(old_seqid) |
|---|
| 268 | self.rank_seqs_map[rank_id].append(new_seqid) |
|---|
| 269 | self.seq_ranks_map[new_seqid] = self.seq_ranks_map[old_seqid] |
|---|
| 270 | del self.seq_ranks_map[old_seqid] |
|---|
| 271 | |
|---|
| 272 | def get_seq_ranks(self, seq_id): |
|---|
| 273 | if not seq_id.startswith(self.prefix): |
|---|
| 274 | seq_id = self.prefix + seq_id |
|---|
| 275 | seq_id = self.corr_seq_ids.get(seq_id, seq_id) |
|---|
| 276 | return self.seq_ranks_map[seq_id] |
|---|
| 277 | |
|---|
| 278 | def seq_lineage_str(self, seq_id): |
|---|
| 279 | ranks = list(self.get_seq_ranks(seq_id)) |
|---|
| 280 | return Taxonomy.lineage_str(ranks) |
|---|
| 281 | |
|---|
| 282 | def seq_rank_id(self, seq_id): |
|---|
| 283 | ranks = list(self.get_seq_ranks(seq_id)) |
|---|
| 284 | return Taxonomy.get_rank_uid(ranks) |
|---|
| 285 | |
|---|
| 286 | def get_rank_seqs(self, rank_id): |
|---|
| 287 | return self.rank_seqs_map.get(rank_id, []) |
|---|
| 288 | |
|---|
| 289 | def get_rank_seq_count(self, rank_id): |
|---|
| 290 | return len(self.rank_seqs_map.get(rank_id, [])) |
|---|
| 291 | |
|---|
| 292 | def get_uncorr_rank_id(self, rank_id): |
|---|
| 293 | return self.uncorr_rank_ids.get(rank_id, rank_id) |
|---|
| 294 | |
|---|
| 295 | def merge_ranks(self, rank_ids, name_prefix="__TAXCLUSTER__"): |
|---|
| 296 | if len(rank_ids) < 2: |
|---|
| 297 | return None |
|---|
| 298 | |
|---|
| 299 | all_sid = [] |
|---|
| 300 | for rank_id in rank_ids: |
|---|
| 301 | all_sid += self.get_rank_seqs(rank_id) |
|---|
| 302 | del self.rank_seqs_map[rank_id] |
|---|
| 303 | |
|---|
| 304 | first_taxon = Taxonomy.split_rank_uid(rank_ids[0]) |
|---|
| 305 | merge_lvl = Taxonomy.lowest_assigned_rank_level(first_taxon) |
|---|
| 306 | new_name = name_prefix + first_taxon[merge_lvl] |
|---|
| 307 | new_rank = first_taxon[:merge_lvl] + [new_name] |
|---|
| 308 | new_rank_id = Taxonomy.get_rank_uid(new_rank) |
|---|
| 309 | |
|---|
| 310 | self.rank_seqs_map[new_rank_id] = all_sid |
|---|
| 311 | |
|---|
| 312 | for sid in all_sid: |
|---|
| 313 | self.seq_ranks_map[sid] = new_rank |
|---|
| 314 | |
|---|
| 315 | return new_rank_id |
|---|
| 316 | |
|---|
| 317 | def load_taxonomy(self, tax_fname): |
|---|
| 318 | with open(tax_fname) as fin: |
|---|
| 319 | for line in fin: |
|---|
| 320 | line = line.strip() |
|---|
| 321 | toks = line.split("\t") |
|---|
| 322 | sid = self.prefix + toks[0] |
|---|
| 323 | ranks_str = toks[1] |
|---|
| 324 | ranks = ranks_str.split(";") |
|---|
| 325 | for i in range(len(ranks)): |
|---|
| 326 | rank_name = ranks[i].strip() |
|---|
| 327 | # if rank_name in GGTaxonomyFile.rank_placeholders: |
|---|
| 328 | # rank_name = Taxonomy.EMPTY_RANK |
|---|
| 329 | ranks[i] = rank_name |
|---|
| 330 | rank_id = Taxonomy.get_rank_uid(ranks) |
|---|
| 331 | |
|---|
| 332 | self.seq_ranks_map[sid] = ranks |
|---|
| 333 | self.rank_seqs_map[rank_id] = self.rank_seqs_map.get(rank_id, []) + [sid] |
|---|
| 334 | |
|---|
| 335 | def normalize_rank_names(self): |
|---|
| 336 | invalid_chars = "[](),;:'" |
|---|
| 337 | sub_chars = "_" * len(invalid_chars) |
|---|
| 338 | trantab = maketrans(invalid_chars, sub_chars) |
|---|
| 339 | corr_ranks = {} |
|---|
| 340 | for sid, ranks in self.seq_ranks_map.iteritems(): |
|---|
| 341 | old_rank_id = Taxonomy.get_rank_uid(ranks) |
|---|
| 342 | for i in range(len(ranks)): |
|---|
| 343 | if ranks[i] in corr_ranks: |
|---|
| 344 | ranks[i] = corr_ranks[ranks[i]] |
|---|
| 345 | else: |
|---|
| 346 | new_rank_name = ranks[i].translate(trantab); |
|---|
| 347 | if new_rank_name != ranks[i]: |
|---|
| 348 | corr_ranks[ranks[i]] = new_rank_name |
|---|
| 349 | ranks[i] = new_rank_name |
|---|
| 350 | |
|---|
| 351 | new_rank_id = Taxonomy.get_rank_uid(ranks) |
|---|
| 352 | if new_rank_id != old_rank_id and old_rank_id in self.rank_seqs_map: |
|---|
| 353 | self.rank_seqs_map[new_rank_id] = self.rank_seqs_map[old_rank_id] |
|---|
| 354 | del self.rank_seqs_map[old_rank_id] |
|---|
| 355 | self.uncorr_rank_ids[new_rank_id] = old_rank_id |
|---|
| 356 | |
|---|
| 357 | return corr_ranks |
|---|
| 358 | |
|---|
| 359 | def normalize_seq_ids(self): |
|---|
| 360 | invalid_chars = "[](),;:' " |
|---|
| 361 | sub_chars = "_" * len(invalid_chars) |
|---|
| 362 | trantab = maketrans(invalid_chars, sub_chars) |
|---|
| 363 | self.corr_seq_ids = {} |
|---|
| 364 | for old_sid in self.seq_ranks_map.iterkeys(): |
|---|
| 365 | new_sid = old_sid.translate(trantab); |
|---|
| 366 | if new_sid != old_sid: |
|---|
| 367 | self.rename_seq(old_sid, new_sid) |
|---|
| 368 | self.corr_seq_ids[old_sid] = new_sid |
|---|
| 369 | |
|---|
| 370 | return self.corr_seq_ids |
|---|
| 371 | |
|---|
| 372 | def close_taxonomy_gaps(self): |
|---|
| 373 | for sid, ranks in self.seq_ranks_map.iteritems(): |
|---|
| 374 | last_rank = None |
|---|
| 375 | gap_count = 0 |
|---|
| 376 | for i in reversed(range(1, len(ranks))): |
|---|
| 377 | if ranks[i] != Taxonomy.EMPTY_RANK: |
|---|
| 378 | last_rank = ranks[i] |
|---|
| 379 | elif last_rank: |
|---|
| 380 | gap_count += 1 |
|---|
| 381 | ranks[i] = "parent%d_" % gap_count + last_rank |
|---|
| 382 | |
|---|
| 383 | def check_for_duplicates(self, autofix=False): |
|---|
| 384 | parent_map = {} |
|---|
| 385 | dups = [] |
|---|
| 386 | old_fixed = {} |
|---|
| 387 | old_ranks = {} |
|---|
| 388 | for sid, ranks in self.seq_ranks_map.iteritems(): |
|---|
| 389 | for i in range(1, len(ranks)): |
|---|
| 390 | if ranks[i] == Taxonomy.EMPTY_RANK: |
|---|
| 391 | break |
|---|
| 392 | parent = ranks[i-1] |
|---|
| 393 | if not ranks[i] in parent_map: |
|---|
| 394 | parent_map[ranks[i]] = sid |
|---|
| 395 | else: |
|---|
| 396 | old_sid = parent_map[ranks[i]] |
|---|
| 397 | if self.get_seq_ranks(old_sid)[i-1] != parent: |
|---|
| 398 | if autofix: |
|---|
| 399 | orig_name = self.lineage_str(sid) |
|---|
| 400 | orig_old_name = self.lineage_str(old_sid) |
|---|
| 401 | |
|---|
| 402 | if old_sid not in old_fixed: |
|---|
| 403 | old_fixed[old_sid] = self.lineage_str(old_sid) |
|---|
| 404 | old_ranks[old_sid] = set([]) |
|---|
| 405 | |
|---|
| 406 | if i not in old_ranks[old_sid]: |
|---|
| 407 | self.seq_ranks_map[old_sid][i] += "_" + self.seq_ranks_map[old_sid][i-1] |
|---|
| 408 | old_ranks[old_sid].add(i) |
|---|
| 409 | |
|---|
| 410 | self.seq_ranks_map[sid][i] = ranks[i] + "_" + parent |
|---|
| 411 | dup_rec = (old_sid, orig_old_name, sid, orig_name, self.lineage_str(sid)) |
|---|
| 412 | else: |
|---|
| 413 | dup_rec = (old_sid, self.lineage_str(old_sid), sid, self.lineage_str(sid)) |
|---|
| 414 | dups.append(dup_rec) |
|---|
| 415 | |
|---|
| 416 | if autofix: |
|---|
| 417 | for sid, orig_name in old_fixed.iteritems(): |
|---|
| 418 | dup_rec = (sid, orig_name, sid, orig_name, self.lineage_str(sid)) |
|---|
| 419 | dups.append(dup_rec) |
|---|
| 420 | |
|---|
| 421 | return dups |
|---|
| 422 | |
|---|
| 423 | def check_for_disbalance(self, autofix=False): |
|---|
| 424 | # the next block finds "orphan" ranks - could be used to decide which ranks to drop (not used now) |
|---|
| 425 | if 0: |
|---|
| 426 | child_map = {} |
|---|
| 427 | for sid, ranks in self.seq_ranks_map.iteritems(): |
|---|
| 428 | for i in range(1, len(ranks)): |
|---|
| 429 | parent = "%d_%s" % (i-1, ranks[i-1]) |
|---|
| 430 | if parent not in child_map: |
|---|
| 431 | child_map[parent] = set([ranks[i]]) |
|---|
| 432 | else: |
|---|
| 433 | child_map[parent].add(ranks[i]) |
|---|
| 434 | |
|---|
| 435 | errs = [] |
|---|
| 436 | for sid, ranks in self.seq_ranks_map.iteritems(): |
|---|
| 437 | if len(ranks) > 7: |
|---|
| 438 | if autofix: |
|---|
| 439 | orig_name = self.lineage_str(sid) |
|---|
| 440 | |
|---|
| 441 | dropq = [] |
|---|
| 442 | keepq = [] |
|---|
| 443 | restq = [] |
|---|
| 444 | for i in range(1, len(ranks)): |
|---|
| 445 | # drop Subclass and Suborder, preserve Order and Family (based on common suffixes) |
|---|
| 446 | if ranks[i].endswith("dae") or ranks[i].endswith("neae"): |
|---|
| 447 | dropq += [i] |
|---|
| 448 | elif ranks[i].endswith("ceae") or ranks[i].endswith("ales"): |
|---|
| 449 | keepq += [i] |
|---|
| 450 | else: |
|---|
| 451 | restq += [i] |
|---|
| 452 | |
|---|
| 453 | to_remove = dropq + restq + keepq |
|---|
| 454 | to_remove = to_remove[:len(ranks)-7] |
|---|
| 455 | |
|---|
| 456 | new_ranks = [] |
|---|
| 457 | for i in range(len(ranks)): |
|---|
| 458 | if i not in to_remove: |
|---|
| 459 | new_ranks += [ranks[i]] |
|---|
| 460 | |
|---|
| 461 | self.seq_ranks_map[sid] = new_ranks |
|---|
| 462 | |
|---|
| 463 | err_rec = (sid, orig_name, self.lineage_str(sid)) |
|---|
| 464 | else: |
|---|
| 465 | err_rec = (sid, self.lineage_str(sid)) |
|---|
| 466 | errs.append(err_rec) |
|---|
| 467 | |
|---|
| 468 | return errs |
|---|
| 469 | |
|---|
| 470 | |
|---|
| 471 | class TaxTreeBuilder: |
|---|
| 472 | ROOT_LABEL = "<<root>>" |
|---|
| 473 | |
|---|
| 474 | def __init__(self, config, taxonomy): |
|---|
| 475 | self.tree_nodes = {} |
|---|
| 476 | self.leaf_count = {} |
|---|
| 477 | self.config = config |
|---|
| 478 | self.taxonomy = taxonomy |
|---|
| 479 | |
|---|
| 480 | def prune_unifu_nodes(self, tree): |
|---|
| 481 | for node in tree.traverse("preorder"): |
|---|
| 482 | if len(node.children) == 1: |
|---|
| 483 | node.delete() |
|---|
| 484 | |
|---|
| 485 | def add_tree_node(self, tree, nodeId, ranks, rank_level): |
|---|
| 486 | if rank_level >= 0: |
|---|
| 487 | parent_level = rank_level |
|---|
| 488 | while ranks[parent_level] == Taxonomy.EMPTY_RANK: |
|---|
| 489 | parent_level -= 1 |
|---|
| 490 | parentId = Taxonomy.get_rank_uid(ranks, parent_level) |
|---|
| 491 | else: |
|---|
| 492 | parentId = TaxTreeBuilder.ROOT_LABEL |
|---|
| 493 | parent_level = -1 |
|---|
| 494 | |
|---|
| 495 | if (parentId in self.tree_nodes): |
|---|
| 496 | parentNode = self.tree_nodes[parentId] |
|---|
| 497 | else: |
|---|
| 498 | parentNode = self.add_tree_node(tree, parentId, ranks, parent_level-1) |
|---|
| 499 | self.tree_nodes[parentId] = parentNode; |
|---|
| 500 | |
|---|
| 501 | # print "Adding node: %s, parent: %s, parent_level: %d" % (nodeId, parentId, parent_level) |
|---|
| 502 | newNode = parentNode.add_child() |
|---|
| 503 | newNode.add_feature("name", nodeId) |
|---|
| 504 | return newNode |
|---|
| 505 | |
|---|
| 506 | def build(self, min_rank=0, max_seqs_per_leaf=1e9, clades_to_include=[], clades_to_ignore=[]): |
|---|
| 507 | |
|---|
| 508 | t0 = Tree() |
|---|
| 509 | t0.add_feature("name", TaxTreeBuilder.ROOT_LABEL) |
|---|
| 510 | self.tree_nodes[TaxTreeBuilder.ROOT_LABEL] = t0; |
|---|
| 511 | self.leaf_count[TaxTreeBuilder.ROOT_LABEL] = 0; |
|---|
| 512 | k = 0 |
|---|
| 513 | added = 0 |
|---|
| 514 | seq_ids = [] |
|---|
| 515 | # sequences are leafs of the tree, so they always have the lowest taxonomy level (e.g. "species"+1) |
|---|
| 516 | for sid, ranks in self.taxonomy.iteritems(): |
|---|
| 517 | k += 1 |
|---|
| 518 | if self.config.verbose and k % 1000 == 0: |
|---|
| 519 | print "Processed nodes: ", k, ", added: ", added, ", skipped: ", k - added |
|---|
| 520 | |
|---|
| 521 | # filter by minimum rank level |
|---|
| 522 | if ranks[min_rank] == Taxonomy.EMPTY_RANK: |
|---|
| 523 | continue |
|---|
| 524 | |
|---|
| 525 | # filter by rank contraints (e.g. class Clostridia only) |
|---|
| 526 | clade_is_ok = False |
|---|
| 527 | |
|---|
| 528 | # check against the inclusion list |
|---|
| 529 | if len(clades_to_include) > 0: |
|---|
| 530 | for (rank_level, rank_name) in clades_to_include: |
|---|
| 531 | if ranks[rank_level] == rank_name: |
|---|
| 532 | clade_is_ok = True |
|---|
| 533 | break |
|---|
| 534 | else: # default: include all |
|---|
| 535 | clade_is_ok = True |
|---|
| 536 | |
|---|
| 537 | # if sequence is about to be included, check it against the ignore list |
|---|
| 538 | if clade_is_ok: |
|---|
| 539 | for (rank_level, rank_name) in clades_to_ignore: |
|---|
| 540 | if ranks[rank_level] == rank_name: |
|---|
| 541 | clade_is_ok = False |
|---|
| 542 | break |
|---|
| 543 | |
|---|
| 544 | # final decision |
|---|
| 545 | if not clade_is_ok: |
|---|
| 546 | continue |
|---|
| 547 | |
|---|
| 548 | tax_seq_level = len(ranks) |
|---|
| 549 | parent_level = tax_seq_level - 1 |
|---|
| 550 | while ranks[parent_level] == Taxonomy.EMPTY_RANK: |
|---|
| 551 | parent_level -= 1 |
|---|
| 552 | parent_name = Taxonomy.get_rank_uid(ranks, parent_level) |
|---|
| 553 | if parent_name in self.tree_nodes: |
|---|
| 554 | parent_node = self.tree_nodes[parent_name] |
|---|
| 555 | # max_seq_per_rank = max_seqs_per_leaf * (tax_seq_level - parent_level) |
|---|
| 556 | if parent_level == tax_seq_level - 1: |
|---|
| 557 | max_seq_per_rank = max_seqs_per_leaf # * (tax_seq_level - parent_level) |
|---|
| 558 | if parent_name in self.leaf_count and self.leaf_count[parent_name] >= max_seq_per_rank: |
|---|
| 559 | continue |
|---|
| 560 | |
|---|
| 561 | self.leaf_count[parent_name] = self.leaf_count.get(parent_name, 0) + 1 |
|---|
| 562 | |
|---|
| 563 | # all checks succeeded: add the sequence to the tree |
|---|
| 564 | self.add_tree_node(t0, sid, ranks, parent_level) |
|---|
| 565 | seq_ids += [sid] |
|---|
| 566 | added += 1 |
|---|
| 567 | |
|---|
| 568 | self.config.log.debug("Total nodes in resulting tree: %d", added) |
|---|
| 569 | |
|---|
| 570 | if self.config.debug: |
|---|
| 571 | reftax_fname = self.config.tmp_fname("%NAME%_mf_unpruned.tre") |
|---|
| 572 | t0.write(outfile=reftax_fname, format=8) |
|---|
| 573 | |
|---|
| 574 | self.prune_unifu_nodes(t0) |
|---|
| 575 | return t0, seq_ids |
|---|