├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── docs ├── index.html ├── multitax.html ├── multitax │ ├── multitax.html │ └── utils.html └── search.js ├── make_docs.sh ├── multitax ├── __init__.py ├── customtx.py ├── dummytx.py ├── greengenestx.py ├── gtdbtx.py ├── multitax.py ├── ncbitx.py ├── otttx.py ├── silvatx.py └── utils.py ├── pyproject.toml ├── setup.py └── tests └── multitax ├── data_minimal ├── custom.tsv.gz ├── custom2.tsv.gz ├── custom_unit_test.tsv.gz ├── gg.txt.gz ├── gtdb_ar.tsv.gz ├── gtdb_ar_metadata.tsv.gz ├── gtdb_bac.tsv.gz ├── gtdb_bac_metadata.tsv.gz ├── ncbi.tar.gz ├── ott.tgz └── silva.txt.gz ├── integration ├── test_common.py ├── test_empty.py └── test_online.py ├── unit ├── test_functions.py └── test_init.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # Packages 4 | *.egg 5 | *.egg-info 6 | dist 7 | build 8 | eggs 9 | parts 10 | bin 11 | var 12 | sdist 13 | develop-eggs 14 | .installed.cfg 15 | lib 16 | lib64 17 | __pycache__ 18 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.9" 4 | - "3.10" 5 | - "3.11" 6 | - "3.12" 7 | - "3.13" 8 | 9 | before_install: 10 | - python -m pip install coverage 11 | - python -m pip install setuptools importlib-metadata --upgrade # fix bug setuptools py37 12 | 13 | install: 14 | - python setup.py install 15 | 16 | script: 17 | - python -m unittest discover -s tests/multitax/unit/ -v 18 | - python -m unittest discover -s tests/multitax/integration/ -v 19 | - python -m coverage run --omit="/usr/*,tests/*" -m unittest discover -s tests/multitax/unit/ -v 20 | - python -m coverage run --append --omit="/usr/*,tests/*" -m unittest discover -s tests/multitax/integration/ -v 21 | 22 | after_success: 23 | - python -m coverage xml -o coverage_py.xml 24 | - curl -Os https://uploader.codecov.io/latest/linux/codecov; 25 | - chmod +x codecov; 26 | - ./codecov --nonZero -X search --file coverage_py.xml; 27 | 28 | notifications: 29 | email: false 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Vitor C. Piro - pirovc.github.io 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MultiTax [![Build Status](https://app.travis-ci.com/pirovc/multitax.svg?token=q6Nfx8pLHh8hV3hLz3Pq&branch=main)](https://app.travis-ci.com/pirovc/multitax) [![codecov](https://codecov.io/gh/pirovc/multitax/branch/main/graph/badge.svg)](https://codecov.io/gh/pirovc/multitax) [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/multitax/README.html) 2 | 3 | Python package to obtain, parse and explore biological taxonomies 4 | 5 | ## Description 6 | 7 | MultiTax is a Python package that provides a common and generalized set of functions to download, parse, filter, explore, translate, convert and write multiple biological taxonomies (**GTDB, NCBI, Silva, Greengenes, Open Tree taxonomy**) and custom-formatted taxonomies. Its main goals are: 8 | 9 | - Be fast, intuitive, generalized and easy to use 10 | - Explore different taxonomies with the same set of commands 11 | - Enable integration and compatibility with multiple taxonomies 12 | - Translate taxonomies (partially implemented) 13 | - Convert taxonomies (not yet implemented) 14 | 15 | MultiTax does not link sequence identifiers to taxonomic nodes; it handles the taxonomy alone. Integration with sequence or external identifiers is planned, but not yet implemented. 16 | 17 | ## API Documentation 18 | 19 | https://pirovc.github.io/multitax/ 20 | 21 | ## Installation 22 | 23 | ### pip 24 | 25 | ```bash 26 | pip install multitax 27 | ``` 28 | 29 | ### conda 30 | 31 | ```bash 32 | conda install -c bioconda multitax 33 | ``` 34 | 35 | ### local 36 | 37 | ```bash 38 | git clone https://github.com/pirovc/multitax.git 39 | cd multitax 40 | python setup.py install --record files.txt 41 | ``` 42 | 43 | ## Basic usage with GTDB 44 | 45 | ```python 46 | from multitax import GtdbTx 47 | 48 | # Download and parse taxonomy 49 | tax = GtdbTx() 50 | 51 | # Get lineage for the Escherichia genus 52 | tax.lineage("g__Escherichia") 53 | # ['1', 'd__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacterales', 'f__Enterobacteriaceae', 'g__Escherichia'] 54 | ``` 55 | 56 | ## Examples 57 | 58 | - [List of functions](https://pirovc.github.io/multitax/multitax/multitax.html) 59 | 60 | ### Load 61 | 62 | ```python 63 | from multitax import GtdbTx # or NcbiTx, SilvaTx, ...
64 | 65 | # Download and parse in memory 66 | tax = GtdbTx() 67 | 68 | # Parse local files 69 | tax = GtdbTx(files=["bac120_taxonomy.tsv.gz", "ar122_taxonomy.tsv.gz"]) 70 | 71 | # Download, write and parse files 72 | tax = GtdbTx(output_prefix="my/path/") 73 | 74 | # Download and filter only specific branch 75 | tax = GtdbTx(root_node="p__Proteobacteria") 76 | ``` 77 | 78 | ### Explore 79 | 80 | ```python 81 | # List parent node 82 | tax.parent("g__Escherichia") 83 | # f__Enterobacteriaceae 84 | 85 | # List children nodes 86 | tax.children("g__Escherichia") 87 | # ['s__Escherichia coli', 88 | # 's__Escherichia albertii', 89 | # 's__Escherichia marmotae', 90 | # 's__Escherichia fergusonii', 91 | # 's__Escherichia sp005843885', 92 | # 's__Escherichia ruysiae', 93 | # 's__Escherichia sp001660175', 94 | # 's__Escherichia sp004211955', 95 | # 's__Escherichia sp002965065', 96 | # 's__Escherichia coli_E'] 97 | 98 | # Get parent node from a defined rank 99 | tax.parent_rank("s__Lentisphaera araneosa", "phylum") 100 | # 'p__Verrucomicrobiota' 101 | 102 | # Get the closest parent from a list of ranks 103 | tax.closest_parent("s__Lentisphaera araneosa", ranks=["phylum", "class", "family"]) 104 | # 'f__Lentisphaeraceae' 105 | 106 | # Get lineage 107 | tax.lineage("g__Escherichia") 108 | # ['1', 'd__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacterales', 'f__Enterobacteriaceae', 'g__Escherichia'] 109 | 110 | # Get lineage of names 111 | tax.name_lineage("g__Escherichia") 112 | # ['root', 'Bacteria', 'Proteobacteria', 'Gammaproteobacteria', 'Enterobacterales', 'Enterobacteriaceae', 'Escherichia'] 113 | 114 | # Get lineage of ranks 115 | tax.rank_lineage("g__Escherichia") 116 | # ['root', 'domain', 'phylum', 'class', 'order', 'family', 'genus'] 117 | 118 | # Get lineage with defined ranks and root node 119 | tax.lineage("g__Escherichia", root_node="p__Proteobacteria", ranks=["phylum", "class", "family", "genus"]) 120 | # ['p__Proteobacteria', 'c__Gammaproteobacteria', 'f__Enterobacteriaceae', 'g__Escherichia'] 121 | 122 | # Build lineages in memory for faster access 123 | tax.build_lineages() 124 | 125 | # Get leaf nodes 126 | tax.leaves("p__Hadarchaeota") 127 | # ['s__DG-33 sp004375695', 's__DG-33 sp001515185', 's__Hadarchaeum yellowstonense', 's__B75-G9 sp003661465', 's__WYZ-LMO6 sp004347925', 's__B88-G9 sp003660555'] 128 | 129 | # Search names and filter by rank 130 | tax.search_name("Escherichia", exact=False, rank="genus") 131 | # ['g__Escherichia', 'g__Escherichia_C'] 132 | 133 | # Show stats of loaded tax 134 | tax.stats() 135 | #{'leaves': 31910, 136 | # 'names': 45503, 137 | # 'nodes': 45503, 138 | # 'ranked_leaves': Counter({'species': 31910}), 139 | # 'ranked_nodes': Counter({'species': 31910, 140 | # 'genus': 9428, 141 | # 'family': 2600, 142 | # 'order': 1034, 143 | # 'class': 379, 144 | # 'phylum': 149, 145 | # 'domain': 2, 146 | # 'root': 1}), 147 | # 'ranks': 45503} 148 | ``` 149 | 150 | ### Filter 151 | 152 | ```python 153 | # Filter ancestors (desc=True for descendants) 154 | tax.filter(["g__Escherichia", "s__Pseudomonas aeruginosa"]) 155 | tax.stats() 156 | #{'leaves': 2, 157 | # 'names': 11, 158 | # 'nodes': 11, 159 | # 'ranked_leaves': Counter({'genus': 1, 'species': 1}), 160 | # 'ranked_nodes': Counter({'genus': 2, 161 | # 'family': 2, 162 | # 'order': 2, 163 | # 'class': 1, 164 | # 'phylum': 1, 165 | # 'domain': 1, 166 | # 'species': 1, 167 | # 'root': 1}), 168 | # 'ranks': 11} 169 | ``` 170 | 171 | ### Add, remove, prune 172 | 173 | ```python 174 | # Add 
node to the tree 175 | tax.add("my_custom_node", "g__Escherichia", name="my custom name", rank="strain") 176 | tax.lineage("my_custom_node") 177 | # ['1', 'd__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacterales', 'f__Enterobacteriaceae', 'g__Escherichia', 'my_custom_node'] 178 | 179 | # Remove node from tree (warning: removing parent nodes may break tree -> use check_consistency) 180 | tax.remove("s__Pseudomonas aeruginosa", check_consistency=True) 181 | 182 | # Prune (remove) full branches of the tree under a certain node 183 | tax.prune("g__Escherichia") 184 | ``` 185 | 186 | ### Translate 187 | 188 | ```python 189 | # GTDB to NCBI 190 | from multitax import GtdbTx, NcbiTx 191 | ncbi_tax = NcbiTx() 192 | gtdb_tax = GtdbTx() 193 | 194 | # Build translation 195 | gtdb_tax.build_translation(ncbi_tax) 196 | 197 | # Check translated nodes 198 | gtdb_tax.translate("g__Escherichia") 199 | # {'1301', '547', '561', '570', '590', '620'} 200 | ``` 201 | 202 | ### Write 203 | 204 | ```python 205 | # Write tax to file 206 | tax.write("custom_tax.tsv", cols=["node", "rank", "name_lineage"]) 207 | 208 | #g__Escherichia genus root|Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae|Escherichia 209 | #f__Enterobacteriaceae family root|Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae 210 | #o__Enterobacterales order root|Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacterales 211 | #c__Gammaproteobacteria class root|Bacteria|Proteobacteria|Gammaproteobacteria 212 | #... 213 | ``` 214 | 215 | ### The same applies to other taxonomies 216 | 217 | ```python 218 | # NCBI 219 | from multitax import NcbiTx 220 | tax = NcbiTx() 221 | tax.lineage("561") 222 | # ['1', '131567', '2', '1224', '1236', '91347', '543', '561'] 223 | 224 | # Silva 225 | from multitax import SilvaTx 226 | tax = SilvaTx() 227 | tax.lineage("46463") 228 | # ['1', '3', '2375', '3303', '46449', '46454', '46463'] 229 | 230 | # Open Tree taxonomy 231 | from multitax import OttTx 232 | tax = OttTx() 233 | tax.lineage("474503") 234 | # ['805080', '93302', '844192', '248067', '822744', '768012', '424023', '474503'] 235 | 236 | # GreenGenes 237 | from multitax import GreengenesTx 238 | tax = GreengenesTx() 239 | tax.lineage("f__Enterobacteriaceae") 240 | # ['1', 'k__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacteriales', 'f__Enterobacteriaceae'] 241 | ``` 242 | 243 | ## LCA integration 244 | 245 | Using pylca: https://github.com/pirovc/pylca 246 | 247 | ```bash 248 | conda install -c bioconda pylca 249 | ``` 250 | 251 | ```python 252 | from pylca.pylca import LCA 253 | from multitax import GtdbTx 254 | 255 | # Download and parse GTDB Taxonomy 256 | tax = GtdbTx() 257 | 258 | # Build LCA structure 259 | lca = LCA(tax._nodes) 260 | 261 | # Get LCA 262 | lca("s__Escherichia dysenteriae", "s__Pseudomonas aeruginosa") 263 | # 'c__Gammaproteobacteria' 264 | ``` 265 | 266 | ## Details 267 | 268 | - After downloading/parsing the desired taxonomies, MultiTax works fully offline. 269 | - Taxonomies are parsed into `nodes`. Each node is annotated with a `name` and a `rank`. 270 | - Some taxonomies have a numeric taxonomic identifier (e.g. NCBI) and others use the rank + name as an identifier (e.g. GTDB). In MultiTax all identifiers are treated as strings. 271 | - A single root node is defined by default for each taxonomy (or `1` when not defined).
This can be changed with `root_node` when loading the taxonomy (as well as annotations `root_parent`, `root_name`, `root_rank`). If the `root_node` already exists, the tree will be filtered. 272 | - Standard values for unknown/undefined nodes can be configured with `undefined_node`,`undefined_name` and `undefined_rank`. Those are default values returned when nodes/names/ranks are not found. 273 | - Taxonomy files are automatically downloaded or can be loaded from disk (`files` parameter). Alternative `urls` can be provided. When downloaded, files are handled in memory. It is possible to save the downloaded file to disk with `output_prefix`. 274 | 275 | ## Translation between taxonomies 276 | 277 | Partially implemented. The goal is to map different taxonomies if the linkage data is available. That's what is currently availble. 278 | 279 | 280 | |from/to |NCBI |GTDB |SILVA |OTT |GG | 281 | |--------|---------|-------|----------|--------|------| 282 | |NCBI |- |PART |[part] |[part] |no | 283 | |GTDB |FULL |- |[part] |no |[part]| 284 | |SILVA |[full] |[part] |- |[part] |no | 285 | |OTT |[part] |no |[part] |- |no | 286 | |GG |no |[part] |no |no |- | 287 | 288 | Legend: 289 | 290 | - full: complete translation available 291 | - part: partial translation available 292 | - no: no translation possible 293 | - []: not yet implemented 294 | 295 | ### Files and information about specific translations 296 | 297 | - NCBI <-> GTDB 298 | - GTDB is a subset of the NCBI repository, so the translation from NCBI to GTDB can be only partial 299 | - Translation in both ways is based on: https://data.gtdb.ecogenomic.org/releases/latest/ar53_metadata.tsv.gz and https://data.gtdb.ecogenomic.org/releases/latest/bac120_metadata.tsv.gz 300 | 301 | --- 302 | 303 | ## Further ideas to be implemented 304 | 305 | - More translations 306 | - Conversion between taxonomies (write on specific format) 307 | 308 | 309 | ## Similar projects 310 | 311 | - https://github.com/FOI-Bioinformatics/flextaxd 312 | - https://github.com/shenwei356/taxonkit 313 | - https://github.com/bioforensics/pytaxonkit 314 | - https://github.com/chanzuckerberg/taxoniq 315 | - https://github.com/sherrillmix/taxonomizr 316 | - https://github.com/etetoolkit/ete 317 | - https://github.com/apcamargo/taxopy -------------------------------------------------------------------------------- /docs/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /docs/search.js: -------------------------------------------------------------------------------- 1 | window.pdocSearch = (function(){ 2 | /** elasticlunr - http://weixsong.github.io * Copyright (C) 2017 Oliver Nightingale * Copyright (C) 2017 Wei Song * MIT Licensed */!function(){function e(e){if(null===e||"object"!=typeof e)return e;var t=e.constructor();for(var n in e)e.hasOwnProperty(n)&&(t[n]=e[n]);return t}var t=function(e){var n=new t.Index;return n.pipeline.add(t.trimmer,t.stopWordFilter,t.stemmer),e&&e.call(n,n),n};t.version="0.9.5",lunr=t,t.utils={},t.utils.warn=function(e){return function(t){e.console&&console.warn&&console.warn(t)}}(this),t.utils.toString=function(e){return void 0===e||null===e?"":e.toString()},t.EventEmitter=function(){this.events={}},t.EventEmitter.prototype.addListener=function(){var e=Array.prototype.slice.call(arguments),t=e.pop(),n=e;if("function"!=typeof t)throw new TypeError("last argument must be a 
function");n.forEach(function(e){this.hasHandler(e)||(this.events[e]=[]),this.events[e].push(t)},this)},t.EventEmitter.prototype.removeListener=function(e,t){if(this.hasHandler(e)){var n=this.events[e].indexOf(t);-1!==n&&(this.events[e].splice(n,1),0==this.events[e].length&&delete this.events[e])}},t.EventEmitter.prototype.emit=function(e){if(this.hasHandler(e)){var t=Array.prototype.slice.call(arguments,1);this.events[e].forEach(function(e){e.apply(void 0,t)},this)}},t.EventEmitter.prototype.hasHandler=function(e){return e in this.events},t.tokenizer=function(e){if(!arguments.length||null===e||void 0===e)return[];if(Array.isArray(e)){var n=e.filter(function(e){return null===e||void 0===e?!1:!0});n=n.map(function(e){return t.utils.toString(e).toLowerCase()});var i=[];return n.forEach(function(e){var n=e.split(t.tokenizer.seperator);i=i.concat(n)},this),i}return e.toString().trim().toLowerCase().split(t.tokenizer.seperator)},t.tokenizer.defaultSeperator=/[\s\-]+/,t.tokenizer.seperator=t.tokenizer.defaultSeperator,t.tokenizer.setSeperator=function(e){null!==e&&void 0!==e&&"object"==typeof e&&(t.tokenizer.seperator=e)},t.tokenizer.resetSeperator=function(){t.tokenizer.seperator=t.tokenizer.defaultSeperator},t.tokenizer.getSeperator=function(){return t.tokenizer.seperator},t.Pipeline=function(){this._queue=[]},t.Pipeline.registeredFunctions={},t.Pipeline.registerFunction=function(e,n){n in t.Pipeline.registeredFunctions&&t.utils.warn("Overwriting existing registered function: "+n),e.label=n,t.Pipeline.registeredFunctions[n]=e},t.Pipeline.getRegisteredFunction=function(e){return e in t.Pipeline.registeredFunctions!=!0?null:t.Pipeline.registeredFunctions[e]},t.Pipeline.warnIfFunctionNotRegistered=function(e){var n=e.label&&e.label in this.registeredFunctions;n||t.utils.warn("Function is not registered with pipeline. 
This may cause problems when serialising the index.\n",e)},t.Pipeline.load=function(e){var n=new t.Pipeline;return e.forEach(function(e){var i=t.Pipeline.getRegisteredFunction(e);if(!i)throw new Error("Cannot load un-registered function: "+e);n.add(i)}),n},t.Pipeline.prototype.add=function(){var e=Array.prototype.slice.call(arguments);e.forEach(function(e){t.Pipeline.warnIfFunctionNotRegistered(e),this._queue.push(e)},this)},t.Pipeline.prototype.after=function(e,n){t.Pipeline.warnIfFunctionNotRegistered(n);var i=this._queue.indexOf(e);if(-1===i)throw new Error("Cannot find existingFn");this._queue.splice(i+1,0,n)},t.Pipeline.prototype.before=function(e,n){t.Pipeline.warnIfFunctionNotRegistered(n);var i=this._queue.indexOf(e);if(-1===i)throw new Error("Cannot find existingFn");this._queue.splice(i,0,n)},t.Pipeline.prototype.remove=function(e){var t=this._queue.indexOf(e);-1!==t&&this._queue.splice(t,1)},t.Pipeline.prototype.run=function(e){for(var t=[],n=e.length,i=this._queue.length,o=0;n>o;o++){for(var r=e[o],s=0;i>s&&(r=this._queue[s](r,o,e),void 0!==r&&null!==r);s++);void 0!==r&&null!==r&&t.push(r)}return t},t.Pipeline.prototype.reset=function(){this._queue=[]},t.Pipeline.prototype.get=function(){return this._queue},t.Pipeline.prototype.toJSON=function(){return this._queue.map(function(e){return t.Pipeline.warnIfFunctionNotRegistered(e),e.label})},t.Index=function(){this._fields=[],this._ref="id",this.pipeline=new t.Pipeline,this.documentStore=new t.DocumentStore,this.index={},this.eventEmitter=new t.EventEmitter,this._idfCache={},this.on("add","remove","update",function(){this._idfCache={}}.bind(this))},t.Index.prototype.on=function(){var e=Array.prototype.slice.call(arguments);return this.eventEmitter.addListener.apply(this.eventEmitter,e)},t.Index.prototype.off=function(e,t){return this.eventEmitter.removeListener(e,t)},t.Index.load=function(e){e.version!==t.version&&t.utils.warn("version mismatch: current "+t.version+" importing "+e.version);var n=new this;n._fields=e.fields,n._ref=e.ref,n.documentStore=t.DocumentStore.load(e.documentStore),n.pipeline=t.Pipeline.load(e.pipeline),n.index={};for(var i in e.index)n.index[i]=t.InvertedIndex.load(e.index[i]);return n},t.Index.prototype.addField=function(e){return this._fields.push(e),this.index[e]=new t.InvertedIndex,this},t.Index.prototype.setRef=function(e){return this._ref=e,this},t.Index.prototype.saveDocument=function(e){return this.documentStore=new t.DocumentStore(e),this},t.Index.prototype.addDoc=function(e,n){if(e){var n=void 0===n?!0:n,i=e[this._ref];this.documentStore.addDoc(i,e),this._fields.forEach(function(n){var o=this.pipeline.run(t.tokenizer(e[n]));this.documentStore.addFieldLength(i,n,o.length);var r={};o.forEach(function(e){e in r?r[e]+=1:r[e]=1},this);for(var s in r){var u=r[s];u=Math.sqrt(u),this.index[n].addToken(s,{ref:i,tf:u})}},this),n&&this.eventEmitter.emit("add",e,this)}},t.Index.prototype.removeDocByRef=function(e){if(e&&this.documentStore.isDocStored()!==!1&&this.documentStore.hasDoc(e)){var t=this.documentStore.getDoc(e);this.removeDoc(t,!1)}},t.Index.prototype.removeDoc=function(e,n){if(e){var n=void 0===n?!0:n,i=e[this._ref];this.documentStore.hasDoc(i)&&(this.documentStore.removeDoc(i),this._fields.forEach(function(n){var o=this.pipeline.run(t.tokenizer(e[n]));o.forEach(function(e){this.index[n].removeToken(e,i)},this)},this),n&&this.eventEmitter.emit("remove",e,this))}},t.Index.prototype.updateDoc=function(e,t){var t=void 
0===t?!0:t;this.removeDocByRef(e[this._ref],!1),this.addDoc(e,!1),t&&this.eventEmitter.emit("update",e,this)},t.Index.prototype.idf=function(e,t){var n="@"+t+"/"+e;if(Object.prototype.hasOwnProperty.call(this._idfCache,n))return this._idfCache[n];var i=this.index[t].getDocFreq(e),o=1+Math.log(this.documentStore.length/(i+1));return this._idfCache[n]=o,o},t.Index.prototype.getFields=function(){return this._fields.slice()},t.Index.prototype.search=function(e,n){if(!e)return[];e="string"==typeof e?{any:e}:JSON.parse(JSON.stringify(e));var i=null;null!=n&&(i=JSON.stringify(n));for(var o=new t.Configuration(i,this.getFields()).get(),r={},s=Object.keys(e),u=0;u0&&t.push(e);for(var i in n)"docs"!==i&&"df"!==i&&this.expandToken(e+i,t,n[i]);return t},t.InvertedIndex.prototype.toJSON=function(){return{root:this.root}},t.Configuration=function(e,n){var e=e||"";if(void 0==n||null==n)throw new Error("fields should not be null");this.config={};var i;try{i=JSON.parse(e),this.buildUserConfig(i,n)}catch(o){t.utils.warn("user configuration parse failed, will use default configuration"),this.buildDefaultConfig(n)}},t.Configuration.prototype.buildDefaultConfig=function(e){this.reset(),e.forEach(function(e){this.config[e]={boost:1,bool:"OR",expand:!1}},this)},t.Configuration.prototype.buildUserConfig=function(e,n){var i="OR",o=!1;if(this.reset(),"bool"in e&&(i=e.bool||i),"expand"in e&&(o=e.expand||o),"fields"in e)for(var r in e.fields)if(n.indexOf(r)>-1){var s=e.fields[r],u=o;void 0!=s.expand&&(u=s.expand),this.config[r]={boost:s.boost||0===s.boost?s.boost:1,bool:s.bool||i,expand:u}}else t.utils.warn("field name in user configuration not found in index instance fields");else this.addAllFields2UserConfig(i,o,n)},t.Configuration.prototype.addAllFields2UserConfig=function(e,t,n){n.forEach(function(n){this.config[n]={boost:1,bool:e,expand:t}},this)},t.Configuration.prototype.get=function(){return this.config},t.Configuration.prototype.reset=function(){this.config={}},lunr.SortedSet=function(){this.length=0,this.elements=[]},lunr.SortedSet.load=function(e){var t=new this;return t.elements=e,t.length=e.length,t},lunr.SortedSet.prototype.add=function(){var e,t;for(e=0;e1;){if(r===e)return o;e>r&&(t=o),r>e&&(n=o),i=n-t,o=t+Math.floor(i/2),r=this.elements[o]}return r===e?o:-1},lunr.SortedSet.prototype.locationFor=function(e){for(var t=0,n=this.elements.length,i=n-t,o=t+Math.floor(i/2),r=this.elements[o];i>1;)e>r&&(t=o),r>e&&(n=o),i=n-t,o=t+Math.floor(i/2),r=this.elements[o];return r>e?o:e>r?o+1:void 0},lunr.SortedSet.prototype.intersect=function(e){for(var t=new lunr.SortedSet,n=0,i=0,o=this.length,r=e.length,s=this.elements,u=e.elements;;){if(n>o-1||i>r-1)break;s[n]!==u[i]?s[n]u[i]&&i++:(t.add(s[n]),n++,i++)}return t},lunr.SortedSet.prototype.clone=function(){var e=new lunr.SortedSet;return e.elements=this.toArray(),e.length=e.elements.length,e},lunr.SortedSet.prototype.union=function(e){var t,n,i;this.length>=e.length?(t=this,n=e):(t=e,n=this),i=t.clone();for(var o=0,r=n.toArray();o

[pdoc search index entries for the multitax package: module, class and function documentation (CustomTx, DummyTx, GreengenesTx, GtdbTx, NcbiTx, OttTx, SilvaTx, multitax.multitax.MultiTax and its methods, multitax.utils helpers), each entry carrying the rendered docstring, signature, bases and default values]

\n", "signature": "(message, category, filename, lineno, file=None, line=None)", "funcdef": "def"}]; 4 | 5 | // mirrored in build-search-index.js (part 1) 6 | // Also split on html tags. this is a cheap heuristic, but good enough. 7 | elasticlunr.tokenizer.setSeperator(/[\s\-.;&_'"=,()]+|<[^>]*>/); 8 | 9 | let searchIndex; 10 | if (docs._isPrebuiltIndex) { 11 | console.info("using precompiled search index"); 12 | searchIndex = elasticlunr.Index.load(docs); 13 | } else { 14 | console.time("building search index"); 15 | // mirrored in build-search-index.js (part 2) 16 | searchIndex = elasticlunr(function () { 17 | this.pipeline.remove(elasticlunr.stemmer); 18 | this.pipeline.remove(elasticlunr.stopWordFilter); 19 | this.addField("qualname"); 20 | this.addField("fullname"); 21 | this.addField("annotation"); 22 | this.addField("default_value"); 23 | this.addField("signature"); 24 | this.addField("bases"); 25 | this.addField("doc"); 26 | this.setRef("fullname"); 27 | }); 28 | for (let doc of docs) { 29 | searchIndex.addDoc(doc); 30 | } 31 | console.timeEnd("building search index"); 32 | } 33 | 34 | return (term) => searchIndex.search(term, { 35 | fields: { 36 | qualname: {boost: 4}, 37 | fullname: {boost: 2}, 38 | annotation: {boost: 2}, 39 | default_value: {boost: 2}, 40 | signature: {boost: 2}, 41 | bases: {boost: 2}, 42 | doc: {boost: 1}, 43 | }, 44 | expand: true 45 | }); 46 | })(); -------------------------------------------------------------------------------- /make_docs.sh: -------------------------------------------------------------------------------- 1 | pdoc -o docs multitax multitax/multitax.py multitax/utils.py 2 | -------------------------------------------------------------------------------- /multitax/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.3.2" 2 | 3 | __all__ = ( 4 | 'CustomTx', 5 | 'DummyTx', 6 | 'GreengenesTx', 7 | 'GtdbTx', 8 | 'NcbiTx', 9 | 'OttTx', 10 | 'SilvaTx', 11 | ) 12 | 13 | from .customtx import CustomTx 14 | from .dummytx import DummyTx 15 | from .greengenestx import GreengenesTx 16 | from .gtdbtx import GtdbTx 17 | from .ncbitx import NcbiTx 18 | from .otttx import OttTx 19 | from .silvatx import SilvaTx 20 | -------------------------------------------------------------------------------- /multitax/customtx.py: -------------------------------------------------------------------------------- 1 | from .multitax import MultiTax 2 | import warnings 3 | 4 | 5 | class CustomTx(MultiTax): 6 | 7 | _required_cols = ["node", "parent"] 8 | _possible_cols = ["node", "parent", "rank", "name"] 9 | 10 | def __init__(self, cols: list = ["node", "parent", "rank", "name"], sep: str = "\t", **kwargs): 11 | """ 12 | CustomTx() 13 | 14 | Parameters: 15 | * **cols** *[list, dict]*: List of fields to be parsed or a dictionary with {field: column index}. 
Options: "node", "parent", "rank", "name" 16 | * **sep** *[str]*: Separator of fields 17 | * **\*\*kwargs** defined at `multitax.multitax.MultiTax` 18 | 19 | Example: 20 | 21 | tax_custom1 = CustomTx(files="my_custom_tax.tsv", cols=["node","parent","rank"]) 22 | tax_custom2 = CustomTx(files="my_custom_tax.tsv", cols={"node": 0, "parent": 1, "name": 5, "rank": 3}) 23 | """ 24 | 25 | self._cols = self._parse_cols(cols) 26 | self._sep = sep 27 | super().__init__(**kwargs) 28 | 29 | def __repr__(self): 30 | stats = ['{}={}'.format(k, repr(v)) for (k, v) in self.stats().items()] 31 | return 'CustomTx({})'.format(', '.join(stats)) 32 | 33 | def _build_translation(self, target_tax, files: list = None, urls: list = None): 34 | warnings.warn("Translation between taxonomies [" + self.__class__.__name__ + 35 | "," + target_tax.__class__.__name__ + "] not yet implemented.") 36 | return {} 37 | 38 | def _parse(self, fhs, **kwargs): 39 | nodes = {} 40 | ranks = {} 41 | names = {} 42 | for source, fh in fhs.items(): 43 | for line in fh: 44 | try: 45 | fields = line.rstrip().split(self._sep) 46 | except: 47 | fields = line.decode().rstrip().split(self._sep) 48 | 49 | node = fields[self._cols["node"]] 50 | nodes[node] = fields[self._cols["parent"]] 51 | if "name" in self._cols: 52 | names[node] = fields[self._cols["name"]] 53 | if "rank" in self._cols: 54 | ranks[node] = fields[self._cols["rank"]] 55 | 56 | return nodes, ranks, names 57 | 58 | def _parse_cols(self, cols): 59 | if isinstance(cols, list): 60 | cols = {c: i for i, c in enumerate(cols)} 61 | 62 | for rc in self._required_cols: 63 | if rc not in cols: 64 | raise ValueError(rc + " is a required column") 65 | 66 | for c in cols: 67 | if c not in self._possible_cols: 68 | raise ValueError(c + " is not a valid column: " + 69 | ",".join(self._possible_cols)) 70 | 71 | return cols 72 | -------------------------------------------------------------------------------- /multitax/dummytx.py: -------------------------------------------------------------------------------- 1 | from .multitax import MultiTax 2 | 3 | 4 | class DummyTx(MultiTax): 5 | 6 | def __init__(self, **kwargs): 7 | """ 8 | DummyTx() - Dummy empty taxonomy 9 | 10 | Parameters: 11 | 12 | * \*\*kwargs defined at `multitax.multitax.MultiTax` 13 | """ 14 | super().__init__(**kwargs) 15 | 16 | def __repr__(self): 17 | stats = ['{}={}'.format(k, repr(v)) for (k, v) in self.stats().items()] 18 | return 'DummyTx({})'.format(', '.join(stats)) 19 | -------------------------------------------------------------------------------- /multitax/greengenestx.py: -------------------------------------------------------------------------------- 1 | from .multitax import MultiTax 2 | import warnings 3 | 4 | 5 | class GreengenesTx(MultiTax): 6 | _default_urls = [ 7 | "https://gg-sg-web.s3-us-west-2.amazonaws.com/downloads/greengenes_database/gg_13_5/gg_13_5_taxonomy.txt.gz"] 8 | _rank_codes = [("k__", "kingdom"), 9 | ("p__", "phylum"), 10 | ("c__", "class"), 11 | ("o__", "order"), 12 | ("f__", "family"), 13 | ("g__", "genus"), 14 | ("s__", "species")] 15 | 16 | def __init__(self, **kwargs): 17 | # forwards.tsv 18 | self._forwards = {} 19 | super().__init__(**kwargs) 20 | 21 | def __repr__(self): 22 | stats = ['{}={}'.format(k, repr(v)) for (k, v) in self.stats().items()] 23 | return 'GreengenesTx({})'.format(', '.join(stats)) 24 | 25 | def _build_translation(self, target_tax, files: list = None, urls: list = None): 26 | warnings.warn("Translation between taxonomies [" + self.__class__.__name__ + 27 | "," + 
target_tax.__class__.__name__ + "] not yet implemented.") 28 | return {} 29 | 30 | def _parse(self, fhs, **kwargs): 31 | nodes = {} 32 | ranks = {} 33 | names = {} 34 | 35 | for source, fh in fhs.items(): 36 | for line in fh: 37 | try: 38 | _, lineage = line.rstrip().split('\t') 39 | except: 40 | _, lineage = line.decode().rstrip().split('\t') 41 | lin = lineage.split("; ") 42 | for i in range(len(lin))[::-1]: 43 | # assert rank 44 | assert lin[i][:3] == self._rank_codes[i][0] 45 | # taxid = "c__Deinococci", rank = "class", name = "Deinococci" 46 | taxid = lin[i] 47 | name = lin[i][3:] 48 | if not name: 49 | continue # empty entry "s__" 50 | rank = self._rank_codes[i][1] 51 | if i == 0: 52 | parent_taxid = self._default_root_node 53 | else: 54 | parent_taxid = lin[i-1] 55 | if taxid not in nodes: 56 | nodes[taxid] = parent_taxid 57 | names[taxid] = name 58 | ranks[taxid] = rank 59 | 60 | return nodes, ranks, names 61 | -------------------------------------------------------------------------------- /multitax/gtdbtx.py: -------------------------------------------------------------------------------- 1 | from .multitax import MultiTax 2 | from .utils import check_file 3 | from .utils import open_files 4 | from .utils import download_files 5 | import warnings 6 | 7 | 8 | class GtdbTx(MultiTax): 9 | 10 | _default_urls = ["https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/ar53_taxonomy.tsv.gz", 11 | "https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/bac120_taxonomy.tsv.gz"] 12 | _rank_codes = [("d__", "domain"), 13 | ("p__", "phylum"), 14 | ("c__", "class"), 15 | ("o__", "order"), 16 | ("f__", "family"), 17 | ("g__", "genus"), 18 | ("s__", "species")] 19 | 20 | def __init__(self, **kwargs): 21 | super().__init__(**kwargs) 22 | 23 | def __repr__(self): 24 | stats = ['{}={}'.format(k, repr(v)) for (k, v) in self.stats().items()] 25 | return 'GtdbTx({})'.format(', '.join(stats)) 26 | 27 | def _build_translation(self, target_tax, files: list = None, urls: list = None): 28 | translated_nodes = {} 29 | if target_tax.__class__.__name__ == "NcbiTx": 30 | 31 | if files: 32 | fhs = open_files(files) 33 | else: 34 | _urls = ["https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/ar53_metadata.tsv.gz", 35 | "https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/bac120_metadata.tsv.gz"] 36 | fhs = download_files( 37 | urls=urls if urls else _urls, retry_attempts=3) 38 | 39 | accession_col = 0 40 | gtdb_taxonomy_col = 19 41 | ncbi_taxid_col = 80 42 | 43 | for source, fh in fhs.items(): 44 | for line in fh: 45 | try: 46 | fields = line.rstrip().split('\t') 47 | except: 48 | fields = line.decode().rstrip().split('\t') 49 | 50 | # skip header 51 | if fields[accession_col] == "accession": 52 | continue 53 | 54 | print(fields) 55 | ncbi_leaf_node = target_tax.latest(fields[ncbi_taxid_col]) 56 | if ncbi_leaf_node != target_tax.undefined_node: 57 | ncbi_nodes = target_tax.lineage(ncbi_leaf_node, ranks=[ 58 | "superkingdom", "phylum", "class", 59 | "order", "family", "genus", "species"]) 60 | else: 61 | continue 62 | 63 | # Build GTDB lineage from leaf (species on given lineage) 64 | # to accomodate possible changes in the loaded tax 65 | gtdb_leaf_node = fields[gtdb_taxonomy_col].split(";")[-1] 66 | if gtdb_leaf_node != self.undefined_node: 67 | gtdb_nodes = self.lineage(gtdb_leaf_node, ranks=[ 68 | "domain", "phylum", "class", "order", 69 | "family", "genus", "species"]) 70 | else: 71 | continue 72 | 73 | # Match ranks 74 | for i, gtdb_n in enumerate(gtdb_nodes): 75 | if 
ncbi_nodes[i] != target_tax.undefined_node and gtdb_n != self.undefined_node: 76 | if gtdb_n not in translated_nodes: 77 | translated_nodes[gtdb_n] = set() 78 | translated_nodes[gtdb_n].add(ncbi_nodes[i]) 79 | 80 | else: 81 | warnings.warn("Translation between taxonomies [" + self.__class__.__name__ + 82 | "," + target_tax.__class__.__name__ + "] not yet implemented.") 83 | 84 | return translated_nodes 85 | 86 | def _parse(self, fhs, **kwargs): 87 | nodes = {} 88 | ranks = {} 89 | names = {} 90 | for source, fh in fhs.items(): 91 | for line in fh: 92 | try: 93 | _, lineage = line.rstrip().split('\t') 94 | except: 95 | _, lineage = line.decode().rstrip().split('\t') 96 | lin = lineage.split(";") 97 | for i in range(len(lin))[::-1]: 98 | # assert rank 99 | assert lin[i][:3] == self._rank_codes[i][0] 100 | # taxid = "c__Deinococci", rank = "class", name = "Deinococci" 101 | taxid = lin[i] 102 | name = lin[i][3:] 103 | # empty entry "s__" 104 | if not name: 105 | continue 106 | rank = self._rank_codes[i][1] 107 | if i == 0: 108 | parent_taxid = self._default_root_node 109 | else: 110 | parent_taxid = lin[i-1] 111 | if taxid not in nodes: 112 | nodes[taxid] = parent_taxid 113 | names[taxid] = name 114 | ranks[taxid] = rank 115 | 116 | return nodes, ranks, names 117 | -------------------------------------------------------------------------------- /multitax/multitax.py: -------------------------------------------------------------------------------- 1 | from .utils import * 2 | from collections import Counter 3 | from . import __version__ 4 | 5 | class MultiTax(object): 6 | 7 | version = __version__ 8 | 9 | _default_urls = [] 10 | _default_root_node = "1" 11 | 12 | def __init__(self, 13 | files: list = None, 14 | urls: list = None, 15 | output_prefix: str = None, 16 | root_node: str = None, 17 | root_parent: str = "0", 18 | root_name: str = None, 19 | root_rank: str = None, 20 | undefined_node: str = None, 21 | undefined_name: str = None, 22 | undefined_rank: str = None, 23 | build_name_nodes: bool = False, 24 | build_node_children: bool = False, 25 | build_rank_nodes: bool = False, 26 | extended_names: bool = False): 27 | """ 28 | Main constructor of MultiTax and sub-classes 29 | 30 | Parameters: 31 | * **files** *[str, list]*: One or more local files to parse. 32 | * **urls** *[str, list]*: One or more urls to download and parse. 33 | * **output_prefix** *[str]*: Directory to write downloaded files. 34 | * **root_node** *[str]*: Define an alternative root node. 35 | * **root_parent** *[str]*: Define the root parent node identifier. 36 | * **root_name** *[str]*: Define an alternative root name. Set to None to use original name. 37 | * **root_rank** *[str]*: Define an alternative root rank. Set to None to use original name. 38 | * **undefined_node** *[str]*: Define a default return value for undefined nodes. 39 | * **undefined_name** *[str]*: Define a default return value for undefined names. 40 | * **undefined_rank** *[str]*: Define a default return value for undefined ranks. 41 | * **build_node_children** *[bool]*: Build node,children dict (otherwise it will be created on first use). 42 | * **build_name_nodes** *[bool]*: Build name,nodes dict (otherwise it will be created on first use). 43 | * **build_rank_nodes** *[bool]*: Build rank,nodes dict (otherwise it will be created on first use). 44 | * **extended_names** *[bool]*: Parse extended names if available. 
45 | 46 | Example: 47 | 48 | tax_ncbi = NcbiTx() 49 | tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"]) 50 | tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"]) 51 | tax_ott = OttTx(root_node="844192") 52 | tax_gg = GreengenesTx(output_prefix="save/to/prefix_") 53 | """ 54 | if files: 55 | if isinstance(files, str): 56 | files = [files] 57 | for file in files: 58 | check_file(file) 59 | 60 | if output_prefix: 61 | check_dir(output_prefix) 62 | 63 | # Main structures 64 | self._nodes = {} 65 | self._ranks = {} 66 | self._names = {} 67 | # Aux. structures 68 | self._lineages = {} 69 | self._name_nodes = {} 70 | self._node_children = {} 71 | self._rank_nodes = {} 72 | self._translated_nodes = {} 73 | 74 | # Store source of tax files (url or file) 75 | self.sources = [] 76 | 77 | # Open/Download/Write files 78 | fhs = {} 79 | if files: 80 | fhs = open_files(files) 81 | elif urls or self._default_urls: 82 | fhs = download_files(urls=urls if urls else self._default_urls, 83 | output_prefix=output_prefix, 84 | retry_attempts=3) 85 | 86 | if fhs: 87 | # Parse taxonomy 88 | self._nodes, self._ranks, self._names = self._parse( 89 | fhs, extended_names=extended_names) 90 | close_files(fhs) 91 | # Save sources for stats (files or urls) 92 | self.sources = list(fhs.keys()) 93 | 94 | # Set undefined values 95 | self.undefined_node = undefined_node 96 | self.undefined_name = undefined_name 97 | self.undefined_rank = undefined_rank 98 | 99 | # Set root values 100 | self._set_root_node(root=root_node if root_node else self._default_root_node, 101 | parent=root_parent, name=root_name, rank=root_rank) 102 | 103 | # build auxiliary structures 104 | if build_node_children: 105 | self._node_children = reverse_dict(self._nodes) 106 | if build_name_nodes: 107 | self._name_nodes = reverse_dict(self._names) 108 | if build_rank_nodes: 109 | self._rank_nodes = reverse_dict(self._ranks) 110 | 111 | self.check_consistency() 112 | 113 | def _exact_name(self, text: str, names: dict): 114 | """ 115 | Returns list of nodes of a given exact name (case sensitive). 116 | """ 117 | if text in names: 118 | return names[text] 119 | else: 120 | return [] 121 | 122 | def _parse(self, fhs: dict): 123 | """ 124 | main function to be overloaded 125 | receives a dictionary with {"url/file": file handler} 126 | return nodes, ranks and names dicts 127 | """ 128 | return {}, {}, {} 129 | 130 | def _partial_name(self, text: str, names: dict): 131 | """ 132 | Searches names containing a certain text (case sensitive) and return their respective nodes. 133 | """ 134 | matching_nodes = set() 135 | for name in names: 136 | if text in name: 137 | matching_nodes.update(names[name]) 138 | return list(matching_nodes) 139 | 140 | def _recurse_leaves(self, node: str): 141 | """ 142 | Recursive function returning leaf nodes 143 | """ 144 | children = self.children(node) 145 | if not children: 146 | return [node] 147 | leaves = [] 148 | for child in children: 149 | leaves.extend(self._recurse_leaves(child)) 150 | return leaves 151 | 152 | def _remove(self, node: str): 153 | """ 154 | Removes node from taxonomy, no checking, for internal use 155 | """ 156 | del self._nodes[node] 157 | if node in self._names: 158 | del self._names[node] 159 | if node in self._ranks: 160 | del self._ranks[node] 161 | 162 | def _reset_aux_data(self): 163 | """ 164 | Reset aux. 
data structures 165 | """ 166 | self._lineages = {} 167 | self._name_nodes = {} 168 | self._node_children = {} 169 | self._rank_nodes = {} 170 | self._translated_nodes = {} 171 | 172 | def _set_root_node(self, root: str, parent: str, name: str, rank: str): 173 | """ 174 | Set root node of the tree. 175 | The files are parsed based on the self._default_root_node for each class 176 | A user-defined root node can be: 177 | 1) internal: will filter the tree acodingly and delete the default root_node 178 | 2) external: will add node and link to the default 179 | """ 180 | 181 | # Set parent/root with defaults 182 | self.root_parent = parent 183 | self.root_node = self._default_root_node 184 | self._nodes[self.root_node] = self.root_parent 185 | 186 | # Default root node is the top by definition 187 | if root != self._default_root_node: 188 | if root in self._nodes: 189 | # Not default but exists on tree, filter only descendants 190 | self.filter(root, desc=True) 191 | # Remove entry for _default_root_node 192 | self._remove(self._default_root_node) 193 | else: 194 | # Not on tree, link default node with new root 195 | self._nodes[self._default_root_node] = root 196 | # Change root to user defined 197 | self.root_node = root 198 | # Set/Update new root node parent link 199 | self._nodes[self.root_node] = self.root_parent 200 | 201 | # User-defined rank/name. 202 | # If provided, insert manually, 203 | # If None, check if is in the tree (defined in the given tax) 204 | # otherwise insert default "root" 205 | if name: 206 | self._names[self.root_node] = name 207 | elif self.root_node not in self._names: 208 | self._names[self.root_node] = "root" 209 | # Set static name 210 | self.root_name = self._names[self.root_node] 211 | 212 | if rank: 213 | self._ranks[self.root_node] = rank 214 | elif self.root_node not in self._ranks: 215 | self._ranks[self.root_node] = "root" 216 | # Set static rank 217 | self.root_rank = self._ranks[self.root_node] 218 | 219 | def add(self, node: str, parent: str, name: str = None, rank: str = None): 220 | """ 221 | Add node to taxonomy. 222 | Deletes built lineages and translations. 223 | """ 224 | if parent not in self._nodes: 225 | raise ValueError("Parent node [" + parent + "] not found.") 226 | elif node in self._nodes: 227 | raise ValueError("Node [" + node + "] already present.") 228 | 229 | self._nodes[node] = parent 230 | self._names[node] = name if name is not None else self.undefined_name 231 | self._ranks[node] = rank if rank is not None else self.undefined_rank 232 | self._reset_aux_data() 233 | 234 | def build_lineages(self, root_node: str = None, ranks: list = None): 235 | """ 236 | Stores lineages in memory for faster access. 237 | It is valid for lineage(), rank_lineage() and name_lineage(). 238 | If keyword arguments (root_node, ranks) are used in those functions stored lineages are not used. 239 | 240 | Returns: None 241 | """ 242 | self.clear_lineages() 243 | for node in self._nodes: 244 | self._lineages[node] = self.lineage( 245 | node=node, root_node=root_node, ranks=ranks) 246 | 247 | def build_translation(self, tax, files: list = None, urls: list = None): 248 | """ 249 | Create a translation of current taxonomy to another 250 | 251 | Parameters: 252 | 253 | * **tax** [MultiTax]: A target taxonomy to be translated to. 254 | * **files** *[str, list]*: One or more local files to parse. 255 | * **urls** *[str, list]*: One or more urls to download and parse. 
256 | 257 | Example: 258 | 259 | from multitax import GtdbTx, NcbiTx 260 | gtdb_tax = GtdbTx() 261 | ncbi_tax = NcbiTx() 262 | 263 | # Automatically download translation files 264 | gtdb_tax.build_translation(ncbi_tax) 265 | gtdb_tax.translate("g__Escherichia") 266 | {'1301', '547', '561', '570', '590', '620'} 267 | 268 | # Using local files (NCBI <-> GTDB) 269 | ncbi_tax.build_translation(gtdb_tax, files=["ar53_metadata.tsv.gz", "bac120_metadata.tsv.gz"]) 270 | ncbi_tax.translate("620") 271 | {'g__Escherichia', 'g__Proteus', 'g__Serratia'} 272 | """ 273 | if files: 274 | if isinstance(files, str): 275 | files = [files] 276 | for file in files: 277 | check_file(file) 278 | 279 | self._translated_nodes = self._build_translation(tax, files, urls) 280 | 281 | def children(self, node: str): 282 | """ 283 | Returns list of direct children nodes of a given node. 284 | """ 285 | # Setup on first use 286 | if not self._node_children: 287 | self._node_children = reverse_dict(self._nodes) 288 | if node in self._node_children: 289 | return self._node_children[node] 290 | else: 291 | return [] 292 | 293 | def check_consistency(self): 294 | """ 295 | Checks consistency of the tree 296 | 297 | Returns: raise an Exception otherwise None 298 | """ 299 | if self.root_node not in self._nodes: 300 | raise ValueError("Root node [" + self.root_node + "] not found.") 301 | if self.root_parent in self._nodes: 302 | raise ValueError( 303 | "Root parent [" + self.root_parent + "] found but should not be on tree.") 304 | if self.undefined_node in self._nodes: 305 | raise ValueError( 306 | "Undefined node [" + self.undefined_node + "] found but should not be on tree.") 307 | 308 | # Difference between values and keys should be only root_parent 309 | lost_nodes = set(self._nodes.values()).difference(self._nodes) 310 | if self.root_parent not in lost_nodes: 311 | raise ValueError( 312 | "Root parent [" + self.root_parent + "] not properly defined.") 313 | # Remove root_parent from lost nodes to report only missing 314 | lost_nodes.remove(self.root_parent) 315 | if len(lost_nodes) > 0: 316 | raise ValueError("Parent nodes missing: " + ",".join(lost_nodes)) 317 | 318 | return None 319 | 320 | def clear_lineages(self): 321 | """ 322 | Clear built lineages. 323 | 324 | Returns: None 325 | """ 326 | self._lineages = {} 327 | 328 | def closest_parent(self, node: str, ranks: str): 329 | """ 330 | Returns the closest parent node based on a defined list of ranks 331 | """ 332 | # Rank of node is already on the list 333 | if self.rank(node) in ranks: 334 | return node 335 | else: 336 | # check lineage from back to front until find a valid node 337 | for n in self.lineage(node, ranks=ranks)[::-1]: 338 | if n != self.undefined_node: 339 | return n 340 | # nothing found 341 | return self.undefined_node 342 | 343 | def filter(self, nodes: list, desc: bool = False): 344 | """ 345 | Filters taxonomy given a list of nodes. 346 | By default keep all the ancestors of the given nodes. 347 | If desc=True, keep all descendants instead. 348 | Deletes built lineages and translations. 
349 | 350 | Example: 351 | 352 | from multitax import GtdbTx 353 | tax = GtdbTx() 354 | 355 | tax.lineage('s__Enterovibrio marina') 356 | # ['1', 'd__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacterales', 'f__Vibrionaceae', 'g__Enterovibrio', 's__Enterovibrio marina'] 357 | # Keep only ancestors of 'g__Enterovibrio' 358 | tax.filter('g__Enterovibrio') 359 | 360 | # Reload taxonomy 361 | tax = GtdbTx() 362 | # Keep only descendants of 'g__Enterovibrio' 363 | tax.filter('g__Enterovibrio', desc=True) 364 | """ 365 | if isinstance(nodes, str): 366 | nodes = [nodes] 367 | 368 | # Keep track of nodes to be filtered out 369 | filtered_nodes = set(self._nodes) 370 | # Always keep root 371 | filtered_nodes.discard(self.root_node) 372 | 373 | if desc: 374 | # Keep descendants of the given nodes 375 | for node in nodes: 376 | # Check if node exists (skips root) 377 | if node in filtered_nodes: 378 | # For each leaf of the selected nodes 379 | for leaf in self.leaves(node): 380 | # Build lineage of each leaf up-to node itself 381 | for n in self.lineage(leaf, root_node=node): 382 | # Discard nodes from set to be kept 383 | filtered_nodes.discard(n) 384 | # Link node to root 385 | self._nodes[node] = self.root_node 386 | else: 387 | # Keep ancestors of the given nodes (full lineage up-to root) 388 | for node in nodes: 389 | # ranks=[] in case build_lineages() was used with specific ranks 390 | for n in self.lineage(node, ranks=[]): 391 | # Discard nodes from set to be kept 392 | filtered_nodes.discard(n) 393 | 394 | # Delete filtered nodes 395 | for node in filtered_nodes: 396 | self._remove(node) 397 | 398 | # Delete aux. data structures 399 | self._reset_aux_data() 400 | self.check_consistency() 401 | 402 | def latest(self, node: str): 403 | """ 404 | Returns latest/updated version of a given node. 405 | If node is already the latests, returns itself. 406 | Mainly used for NCBI (merged.dmp) and OTT (forwards.tsv) 407 | """ 408 | if node in self._nodes: 409 | return node 410 | else: 411 | return self.undefined_node 412 | 413 | def leaves(self, node: str = None): 414 | """ 415 | Returns a list of leaf nodes of a given node. 416 | """ 417 | if node is None or node == self.root_node: 418 | # Leaves are nodes not contained in _nodes.values() ("parents") 419 | return list(set(self._nodes).difference(self._nodes.values())) 420 | elif node in self._nodes: 421 | return self._recurse_leaves(node) 422 | else: 423 | return [] 424 | 425 | def lineage(self, node: str, root_node: str = None, ranks: list = None): 426 | """ 427 | Returns a list with the lineage of a given node. 428 | If ranks is provided, returns only nodes annotated with such ranks. 429 | If root_node is provided, use it instead of default root of tree. 
430 | """ 431 | # If lineages were built with build_lineages() with matching params 432 | if node in self._lineages and root_node is None and ranks is None: 433 | return self._lineages[node] 434 | else: 435 | if not root_node: 436 | root_node = self.root_node 437 | 438 | n = node 439 | if ranks: 440 | # Fixed length lineage 441 | lin = [self.undefined_node] * len(ranks) 442 | # Loop until end of the tree (in case chosen root is not on lineage) 443 | while n != self.undefined_node: 444 | r = self.rank(n) 445 | if r in ranks: 446 | lin[ranks.index(r)] = n 447 | # If node is root, break (after adding) 448 | if n == root_node: 449 | break 450 | n = self.parent(n) 451 | else: 452 | # Full lineage 453 | lin = [] 454 | # Loop until end of the tree (in case chosen root is not on lineage) 455 | while n != self.undefined_node: 456 | lin.append(n) 457 | # If node is root, break (after adding) 458 | if n == root_node: 459 | break 460 | n = self.parent(n) 461 | # Reverse order 462 | lin = lin[::-1] 463 | 464 | # last iteration node (n) != root_node: didn't find the root, invalid lineage 465 | if n != root_node: 466 | return [] 467 | else: 468 | return lin 469 | 470 | def name(self, node: str): 471 | """ 472 | Returns name of a given node. 473 | """ 474 | if node in self._names: 475 | return self._names[node] 476 | else: 477 | return self.undefined_name 478 | 479 | def name_lineage(self, node: str, root_node: str = None, ranks: list = None): 480 | """ 481 | Returns a list with the name lineage of a given node. 482 | """ 483 | return list(map(self.name, 484 | self.lineage(node=node, 485 | root_node=root_node, 486 | ranks=ranks))) 487 | 488 | def nodes_rank(self, rank: str): 489 | """ 490 | Returns list of nodes of a given rank. 491 | """ 492 | # Setup on first use 493 | if not self._rank_nodes: 494 | self._rank_nodes = reverse_dict(self._ranks) 495 | if rank in self._rank_nodes: 496 | return self._rank_nodes[rank] 497 | else: 498 | return [] 499 | 500 | def parent(self, node: str): 501 | """ 502 | Returns the direct parent node of a given node. 503 | """ 504 | if node in self._nodes: 505 | return self._nodes[node] 506 | else: 507 | return self.undefined_node 508 | 509 | def parent_rank(self, node: str, rank: str): 510 | """ 511 | Returns the parent node of a given rank in the specified rank. 512 | """ 513 | parent = self.lineage(node=node, ranks=[rank]) 514 | return parent[0] if parent else self.undefined_node 515 | 516 | def prune(self, nodes: list): 517 | """ 518 | Prunes branches of the tree under the given nodes. 519 | Deletes built lineages and translations. 520 | """ 521 | 522 | if isinstance(nodes, str): 523 | nodes = [nodes] 524 | 525 | del_nodes = set() 526 | for node in nodes: 527 | if node not in self._nodes: 528 | raise ValueError("Node [" + node + "] not found.") 529 | for leaf in self.leaves(node): 530 | for n in self.lineage(leaf, root_node=node)[1:]: 531 | del_nodes.add(n) 532 | 533 | for n in del_nodes: 534 | self._remove(n) 535 | 536 | self._reset_aux_data() 537 | 538 | def rank(self, node: str): 539 | """ 540 | Returns the rank of a given node. 541 | """ 542 | if node in self._ranks: 543 | return self._ranks[node] 544 | else: 545 | return self.undefined_rank 546 | 547 | def rank_lineage(self, node: str, root_node: str = None, ranks: list = None): 548 | """ 549 | Returns a list with the rank lineage of a given node. 
550 | """ 551 | return list(map(self.rank, 552 | self.lineage(node=node, 553 | root_node=root_node, 554 | ranks=ranks))) 555 | 556 | def remove(self, node: str, check_consistency: bool = False): 557 | """ 558 | Removes node from taxonomy. Can break the tree if a parent node is removed. To remove a certain branch, use prune. 559 | Running check consistency after removing a node is recommended. 560 | Deletes built lineages and translations. 561 | """ 562 | if node not in self._nodes: 563 | raise ValueError("Node [" + node + "] not found.") 564 | self._remove(node) 565 | self._reset_aux_data() 566 | if check_consistency: 567 | self.check_consistency() 568 | 569 | def search_name(self, text: str, rank: str = None, exact: bool = True): 570 | """ 571 | Search node by exact or partial name 572 | 573 | Parameters: 574 | * **text** *[str]*: Text to search. 575 | * **rank** *[str]*: Filter results by rank. 576 | * **exact** *[bool]*: Exact or partial name search (both case sensitive). 577 | 578 | Returns: list of matching nodes 579 | """ 580 | # Setup on first use 581 | if not self._name_nodes: 582 | self._name_nodes = reverse_dict(self._names) 583 | 584 | if exact: 585 | ret = self._exact_name(text, self._name_nodes) 586 | else: 587 | ret = self._partial_name(text, self._name_nodes) 588 | 589 | # Only return nodes of chosen rank 590 | if rank: 591 | return filter_function(ret, self.rank, rank) 592 | else: 593 | return ret 594 | 595 | def stats(self): 596 | """ 597 | Returns a dict with general numbers of the taxonomic tree 598 | 599 | Example: 600 | 601 | from pprint import pprint 602 | from multitax import GtdbTx 603 | tax = GtdbTx() 604 | 605 | pprint(tax.stats()) 606 | {'leaves': 30238, 607 | 'names': 42739, 608 | 'nodes': 42739, 609 | 'ranked_leaves': Counter({'species': 30238}), 610 | 'ranked_nodes': Counter({'species': 30238, 611 | 'genus': 8778, 612 | 'family': 2323, 613 | 'order': 930, 614 | 'class': 337, 615 | 'phylum': 131, 616 | 'domain': 1, 617 | 'root': 1}), 618 | 'ranks': 42739} 619 | """ 620 | s = {} 621 | s["nodes"] = len(self._nodes) 622 | s["ranks"] = len(self._ranks) 623 | s["names"] = len(self._names) 624 | all_leaves = self.leaves(self.root_node) 625 | s["leaves"] = len(all_leaves) 626 | s["ranked_nodes"] = Counter(self._ranks.values()) 627 | s["ranked_leaves"] = Counter(map(self.rank, all_leaves)) 628 | 629 | return s 630 | 631 | def translate(self, node: str): 632 | """ 633 | Returns the translated node from another taxonomy. Translated nodes are generated with the build_translation function. 634 | """ 635 | if node in self._translated_nodes: 636 | return self._translated_nodes[node] 637 | else: 638 | return [] 639 | 640 | def write(self, 641 | output_file: str, 642 | cols: list = ["node", "parent", "rank", "name"], 643 | sep: str = "\t", 644 | sep_multi: str = "|", 645 | ranks: list = None, 646 | gz: bool = False): 647 | """ 648 | Writes loaded taxonomy to a file. 
649 | 650 | Parameters: 651 | * **cols** *[list]*: Options: "node", "latest", "parent", "rank", "name", "leaves", "children", "lineage", "rank_lineage", "name_lineage" 652 | * **sep** *[str]*: Separator of fields 653 | * **sep_multi** *[str]*: Separator of multi-valued fields 654 | * **ranks** *[list]*: Ranks to report 655 | * **gz** *[bool]*: Gzip output 656 | 657 | Returns: None 658 | """ 659 | import gzip 660 | if gz: 661 | output_file = output_file if output_file.endswith( 662 | ".gz") else output_file + ".gz" 663 | check_no_file(output_file) 664 | outf = gzip.open(output_file, "wt") 665 | else: 666 | check_no_file(output_file) 667 | outf = open(output_file, "w") 668 | 669 | write_field = {"node": lambda node: node, 670 | "latest": self.latest, 671 | "parent": self.parent, 672 | "rank": self.rank, 673 | "name": self.name, 674 | "leaves": lambda node: join_check(self.leaves(node), sep_multi), 675 | "children": lambda node: join_check(self.children(node), sep_multi), 676 | "lineage": lambda node: join_check(self.lineage(node, ranks=ranks), sep_multi), 677 | "rank_lineage": lambda node: join_check(self.rank_lineage(node, ranks=ranks), sep_multi), 678 | "name_lineage": lambda node: join_check(self.name_lineage(node, ranks=ranks), sep_multi)} 679 | 680 | for c in cols: 681 | if c not in write_field: 682 | raise ValueError( 683 | "Field [" + c + "] is not valid. Options: " + ",".join(write_field)) 684 | 685 | if ranks: 686 | for rank in ranks: 687 | for node in self.nodes_rank(rank): 688 | print(*[write_field[c](node) 689 | for c in cols], sep=sep, end="\n", file=outf) 690 | else: 691 | for node in self._nodes: 692 | print(*[write_field[c](node) 693 | for c in cols], sep=sep, end="\n", file=outf) 694 | 695 | outf.close() 696 | -------------------------------------------------------------------------------- /multitax/ncbitx.py: -------------------------------------------------------------------------------- 1 | from .multitax import MultiTax 2 | from .utils import filter_function 3 | from .utils import check_file 4 | from .utils import open_files 5 | from .utils import download_files 6 | import warnings 7 | 8 | 9 | class NcbiTx(MultiTax): 10 | _default_urls = ["https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz"] 11 | 12 | def __init__(self, **kwargs): 13 | self._merged = {} 14 | self._extended_name_nodes = {} 15 | super().__init__(**kwargs) 16 | 17 | def __repr__(self): 18 | stats = ['{}={}'.format(k, repr(v)) for (k, v) in self.stats().items()] 19 | return 'NcbiTx({})'.format(', '.join(stats)) 20 | 21 | def _build_translation(self, target_tax, files: list = None, urls: list = None): 22 | translated_nodes = {} 23 | if target_tax.__class__.__name__ == "GtdbTx": 24 | 25 | if files: 26 | fhs = open_files(files) 27 | else: 28 | _urls = ["https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/ar53_metadata.tsv.gz", 29 | "https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/bac120_metadata.tsv.gz"] 30 | fhs = download_files( 31 | urls=urls if urls else _urls, retry_attempts=3) 32 | 33 | 34 | accession_col = 0 35 | gtdb_taxonomy_col = 19 36 | ncbi_taxid_col = 80 37 | 38 | for source, fh in fhs.items(): 39 | for line in fh: 40 | try: 41 | fields = line.rstrip().split('\t') 42 | except: 43 | fields = line.decode().rstrip().split('\t') 44 | 45 | # skip header 46 | if fields[accession_col] == "accession": 47 | continue 48 | 49 | # Build GTDB lineage from leaf (species on given lineage) 50 | # to accomodate possible changes in the loaded tax 51 | gtdb_leaf_node = 
fields[gtdb_taxonomy_col].split(";")[-1] 52 | if gtdb_leaf_node != target_tax.undefined_node: 53 | gtdb_nodes = target_tax.lineage(gtdb_leaf_node, ranks=[ 54 | "domain", "phylum", "class", "order", 55 | "family", "genus", "species"]) 56 | else: 57 | continue 58 | 59 | # Build NCBI lineage from leaf 60 | ncbi_leaf_node = self.latest(fields[ncbi_taxid_col]) 61 | if ncbi_leaf_node != self.undefined_node: 62 | # Additional add connection from leaf to species on GTDB 63 | # that could represent strain, etc on NCBI tax 64 | if ncbi_leaf_node not in translated_nodes: 65 | translated_nodes[ncbi_leaf_node] = set() 66 | translated_nodes[ncbi_leaf_node].add( 67 | gtdb_leaf_node) 68 | ncbi_nodes = self.lineage(ncbi_leaf_node, ranks=[ 69 | "superkingdom", "phylum", "class", "order", 70 | "family", "genus", "species"]) 71 | else: 72 | continue 73 | 74 | # Match ranks 75 | for i, ncbi_n in enumerate(ncbi_nodes): 76 | if gtdb_nodes[i] != target_tax.undefined_node and ncbi_n != self.undefined_node: 77 | if ncbi_n not in translated_nodes: 78 | translated_nodes[ncbi_n] = set() 79 | translated_nodes[ncbi_n].add(gtdb_nodes[i]) 80 | 81 | else: 82 | warnings.warn("Translation between taxonomies [" + self.__class__.__name__ + 83 | "," + target_tax.__class__.__name__ + "] not yet implemented.") 84 | 85 | return translated_nodes 86 | 87 | def _parse(self, fhs, **kwargs): 88 | fhs_list = list(fhs.values()) 89 | # One element tar.gz -> taxdump.tar.gz 90 | if len(fhs_list) == 1 and list(fhs)[0].endswith(".tar.gz"): 91 | nodes, ranks, names, self._merged = self._parse_taxdump( 92 | fhs_list[0], extended_names=kwargs["extended_names"]) 93 | else: 94 | # nodes.dmp 95 | nodes, ranks = self._parse_nodes(fhs_list[0]) 96 | 97 | # [names.dmp] 98 | if len(fhs) >= 2: 99 | names = self._parse_names( 100 | fhs_list[1], extended_names=kwargs["extended_names"]) 101 | else: 102 | names = {} 103 | 104 | # [merged.dmp] 105 | if len(fhs) == 3: 106 | self._merged = self._parse_merged(fhs_list[2]) 107 | return nodes, ranks, names 108 | 109 | def _parse_merged(self, fh): 110 | merged = {} 111 | for line in fh: 112 | try: 113 | old_taxid, _, new_taxid, _ = line.split('\t', 3) 114 | except: 115 | old_taxid, _, new_taxid, _ = line.decode().split('\t', 3) 116 | merged[old_taxid] = new_taxid 117 | return merged 118 | 119 | def _parse_names(self, fh, extended_names): 120 | names = {} 121 | for line in fh: 122 | try: 123 | node, name, _, name_class = line.split('\t|\t') 124 | except: 125 | node, name, _, name_class = line.decode().split('\t|\t') 126 | if name_class.replace('\t|\n', '') == "scientific name": 127 | names[node] = name 128 | elif extended_names: 129 | if name not in self._extended_name_nodes: 130 | self._extended_name_nodes[name] = [] 131 | self._extended_name_nodes[name].append(node) 132 | 133 | return names 134 | 135 | def _parse_nodes(self, fh): 136 | nodes = {} 137 | ranks = {} 138 | for line in fh: 139 | try: 140 | taxid, parent_taxid, rank, _ = line.split('\t|\t', 3) 141 | except: 142 | taxid, parent_taxid, rank, _ = line.decode().split('\t|\t', 3) 143 | ranks[taxid] = rank 144 | nodes[taxid] = parent_taxid 145 | return nodes, ranks 146 | 147 | def _parse_taxdump(self, fh_taxdump, extended_names): 148 | with fh_taxdump.extractfile('nodes.dmp') as fh_nodes: 149 | nodes, ranks = self._parse_nodes(fh_nodes) 150 | with fh_taxdump.extractfile('names.dmp') as fh_names: 151 | names = self._parse_names(fh_names, extended_names=extended_names) 152 | with fh_taxdump.extractfile('merged.dmp') as fh_merged: 153 | merged = 
self._parse_merged(fh_merged) 154 | return nodes, ranks, names, merged 155 | 156 | def latest(self, node: str): 157 | n = super().latest(node) 158 | if n == self.undefined_node: 159 | n = self.merged(node) 160 | return n 161 | 162 | def merged(self, node: str): 163 | """ 164 | Returns relative entry from the merged.dmp file of a given node. 165 | """ 166 | if node in self._merged: 167 | return self._merged[node] 168 | else: 169 | return self.undefined_node 170 | 171 | def search_name(self, text: str, rank: str = None, exact: bool = True, force_extended: bool = False): 172 | """ 173 | Search node by exact or partial name. 174 | 175 | Default order (can be skipped with **force_extended=True**): 176 | 177 | 1) Search names defined as "scientific name" on nodes.dmp 178 | 179 | 2) If nothing was found, search text in all other categories (must be activated with NcbiTx(**extended_names=True**)) 180 | 181 | Parameters: 182 | * **text** *[str]*: Text to search. 183 | * **rank** *[str]*: Filter results by rank. 184 | * **exact** *[bool]*: Exact or partial name search (both case sensitive). 185 | * **force_extended** *[bool]*: Search for text in all categories at once. 186 | 187 | Returns: list of matching nodes 188 | """ 189 | n = super().search_name(text, rank=rank, exact=exact) 190 | if n and not force_extended: 191 | return n 192 | else: 193 | if exact: 194 | ret = self._exact_name(text, self._extended_name_nodes) 195 | else: 196 | ret = self._partial_name(text, self._extended_name_nodes) 197 | 198 | # Only return nodes of chosen rank 199 | if rank: 200 | ret = filter_function(ret, self.rank, rank) 201 | 202 | return list(set(n + ret)) 203 | 204 | def stats(self): 205 | s = super().stats() 206 | if self._merged: 207 | s["merged"] = len(self._merged) 208 | if self._extended_name_nodes: 209 | s["extended_names"] = len(self._extended_name_nodes) 210 | return s 211 | -------------------------------------------------------------------------------- /multitax/otttx.py: -------------------------------------------------------------------------------- 1 | from .multitax import MultiTax 2 | from .utils import filter_function 3 | import warnings 4 | 5 | 6 | class OttTx(MultiTax): 7 | _default_urls = ["http://files.opentreeoflife.org/ott/ott3.4/ott3.4.tgz"] 8 | _default_root_node = "805080" 9 | 10 | def __init__(self, **kwargs): 11 | self._forwards = {} 12 | self._extended_name_nodes = {} 13 | super().__init__(**kwargs) 14 | 15 | def __repr__(self): 16 | stats = ['{}={}'.format(k, repr(v)) for (k, v) in self.stats().items()] 17 | return 'OttTx({})'.format(', '.join(stats)) 18 | 19 | def _build_translation(self, target_tax, files: list = None, urls: list = None): 20 | warnings.warn("Translation between taxonomies [" + self.__class__.__name__ + 21 | "," + target_tax.__class__.__name__ + "] not yet implemented.") 22 | return {} 23 | 24 | def _parse(self, fhs, **kwargs): 25 | fhs_list = list(fhs.values()) 26 | if len(fhs_list) == 1 and list(fhs)[0].endswith(".tgz"): 27 | nodes, ranks, names = self._parse_ott( 28 | fhs_list[0], extended_names=kwargs["extended_names"]) 29 | else: 30 | # nodes.dmp 31 | nodes, ranks, names = self._parse_taxonomy(fhs_list[0]) 32 | # [forwards.tsv] 33 | if len(fhs) >= 2: 34 | self._forwards = self._parse_forwards(fhs_list[1]) 35 | if len(fhs) == 3 and kwargs["extended_names"]: 36 | self._extended_name_nodes = self._parse_synonyms(fhs_list[2]) 37 | 38 | return nodes, ranks, names 39 | 40 | def _parse_forwards(self, fh): 41 | forwards = {} 42 | # skip first line header 43 | next(fh) 44 
| for line in fh: 45 | try: 46 | old_taxid, new_taxid = line.rstrip().split('\t') 47 | except: 48 | old_taxid, new_taxid = line.decode().rstrip().split('\t') 49 | forwards[old_taxid] = new_taxid 50 | return forwards 51 | 52 | def _parse_ott(self, fh_taxdump, extended_names): 53 | # Get files inside folder by name 54 | for e in fh_taxdump.getnames(): 55 | if e.endswith("taxonomy.tsv"): 56 | tax = e 57 | if e.endswith("forwards.tsv"): 58 | fwr = e 59 | if e.endswith("synonyms.tsv"): 60 | syn = e 61 | 62 | with fh_taxdump.extractfile(tax) as fh_nodes: 63 | nodes, ranks, names = self._parse_taxonomy(fh_nodes) 64 | with fh_taxdump.extractfile(fwr) as fh_forwards: 65 | self._forwards = self._parse_forwards(fh_forwards) 66 | if extended_names: 67 | with fh_taxdump.extractfile(syn) as fh_synonyms: 68 | self._extended_name_nodes = self._parse_synonyms(fh_synonyms) 69 | return nodes, ranks, names 70 | 71 | def _parse_synonyms(self, fh): 72 | synonyms = {} 73 | # skip first line header 74 | next(fh) 75 | for line in fh: 76 | try: 77 | name, taxid, _ = line.split('\t|\t', 2) 78 | except: 79 | name, taxid, _ = line.decode().split('\t|\t', 2) 80 | if name not in synonyms: 81 | synonyms[name] = [] 82 | synonyms[name].append(taxid) 83 | 84 | return synonyms 85 | 86 | def _parse_taxonomy(self, fh): 87 | nodes = {} 88 | ranks = {} 89 | names = {} 90 | # skip first line header 91 | next(fh) 92 | for line in fh: 93 | try: 94 | taxid, parent_taxid, name, rank, _ = line.split('\t|\t', 4) 95 | except: 96 | taxid, parent_taxid, name, rank, _ = line.decode().split('\t|\t', 4) 97 | ranks[taxid] = rank 98 | nodes[taxid] = parent_taxid 99 | names[taxid] = name 100 | return nodes, ranks, names 101 | 102 | def forwards(self, node: str): 103 | """ 104 | Returns relative entry from the forwards.tsv file of a given node. 105 | """ 106 | if node in self._forwards: 107 | return self._forwards[node] 108 | else: 109 | return self.undefined_node 110 | 111 | def latest(self, node: str): 112 | n = super().latest(node) 113 | if n == self.undefined_node: 114 | n = self.forwards(node) 115 | return n 116 | 117 | def search_name(self, text: str, rank: str = None, exact: bool = True, force_extended: bool = False): 118 | """ 119 | Search node by exact or partial name. 120 | 121 | Default order (can be skipped with **force_extended=True**): 122 | 123 | 1) Search default names defined on "taxonomy.tsv" 124 | 125 | 2) If nothing was found, search in all other names defined on "synonyms.tsv" (must be activated with OttTx(**extended_names=True**)) 126 | 127 | Parameters: 128 | * **text** *[str]*: Text to search. 129 | * **rank** *[str]*: Filter results by rank. 130 | * **exact** *[bool]*: Exact or partial name search (both case sensitive). 131 | * **force_extended** *[bool]*: Search for text in all categories at once. 
132 | 133 | Returns: list of matching nodes 134 | """ 135 | n = super().search_name(text, rank=rank, exact=exact) 136 | if n and not force_extended: 137 | return n 138 | else: 139 | if exact: 140 | ret = self._exact_name(text, self._extended_name_nodes) 141 | else: 142 | ret = self._partial_name(text, self._extended_name_nodes) 143 | 144 | # Only return nodes of chosen rank 145 | if rank: 146 | ret = filter_function(ret, self.rank, rank) 147 | 148 | return list(set(n + ret)) 149 | 150 | def stats(self): 151 | s = super().stats() 152 | if self._forwards: 153 | s["forwards"] = len(self._forwards) 154 | if self._extended_name_nodes: 155 | s["extended_names"] = len(self._extended_name_nodes) 156 | return s 157 | -------------------------------------------------------------------------------- /multitax/silvatx.py: -------------------------------------------------------------------------------- 1 | from .multitax import MultiTax 2 | import warnings 3 | 4 | 5 | class SilvaTx(MultiTax): 6 | _default_urls = [ 7 | "https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_ssu_138.1.txt.gz"] 8 | 9 | def __init__(self, **kwargs): 10 | super().__init__(**kwargs) 11 | 12 | def __repr__(self): 13 | stats = ['{}={}'.format(k, repr(v)) for (k, v) in self.stats().items()] 14 | return 'SilvaTx({})'.format(', '.join(stats)) 15 | 16 | def _build_translation(self, target_tax, files: list = None, urls: list = None): 17 | warnings.warn("Translation between taxonomies [" + self.__class__.__name__ + 18 | "," + target_tax.__class__.__name__ + "] not yet implemented.") 19 | return {} 20 | 21 | def _parse(self, fhs, **kwargs): 22 | nodes = {} 23 | ranks = {} 24 | names = {} 25 | 26 | lin = {} 27 | for source, fh in fhs.items(): 28 | for line in fh: 29 | try: 30 | name_lineage, taxid, rank, _ = line.split('\t', 3) 31 | except: 32 | name_lineage, taxid, rank, _ = line.decode().split('\t', 3) 33 | # Remove last char ";" 34 | lineage = name_lineage[:-1] 35 | name = lineage.split(";")[-1] 36 | # Save lineage to build tree 37 | lin[lineage] = taxid 38 | names[taxid] = name 39 | ranks[taxid] = rank 40 | 41 | # Build parent node connection 42 | for lineage, taxid in lin.items(): 43 | t = taxid 44 | l = lineage.split(";")[:-1] 45 | while l: 46 | parent_taxid = lin[";".join(l)] 47 | if t not in nodes: 48 | nodes[t] = parent_taxid 49 | t = parent_taxid 50 | del l[-1] # remove last element 51 | # Connect last node to root 52 | if t not in nodes: 53 | nodes[t] = self._default_root_node 54 | 55 | return nodes, ranks, names 56 | -------------------------------------------------------------------------------- /multitax/utils.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import io 3 | import os 4 | import tarfile 5 | import urllib.request 6 | import zlib 7 | import warnings 8 | from collections import OrderedDict 9 | from urllib.error import HTTPError 10 | 11 | 12 | def check_dir(prefix: str): 13 | abs_path = os.path.dirname(os.path.abspath(prefix)) 14 | if not os.path.exists(abs_path): 15 | raise NotADirectoryError(abs_path) 16 | 17 | 18 | def check_file(file: str): 19 | if not os.path.isfile(file): 20 | raise FileNotFoundError(file + " file do not exist") 21 | if os.path.getsize(file) == 0: 22 | raise FileNotFoundError(file + " file is empty") 23 | 24 | 25 | def check_no_file(file: str): 26 | if os.path.isfile(file): 27 | raise FileExistsError(file) 28 | 29 | 30 | def close_files(fhs: dict): 31 | """ 32 | Parameters: 33 | * **fhs** *[dict]*: {file: 
file handler} 34 | 35 | Returns: Nothing 36 | """ 37 | for fh in fhs.values(): 38 | fh.close() 39 | 40 | 41 | def download_files(urls: list, output_prefix: str = None, retry_attempts: int = 1): 42 | """ 43 | Download and open files (memory/stream) or write to disk (multitax.utils.save_urls) 44 | 45 | Parameters: 46 | * **urls** *[list]*: List of files to download (text, ".gz", ".tar.gz", ".tgz") 47 | * **output_prefix** *[str]*: Output directory to save files 48 | 49 | Returns: 50 | * OrderedDict {file: file handler} (same order as input) 51 | """ 52 | if isinstance(urls, str): 53 | urls = [urls] 54 | 55 | att = 0 56 | while att < retry_attempts: 57 | att += 1 58 | try: 59 | # If output is provided, save files and parse from disc 60 | if output_prefix: 61 | files = save_urls(urls, output_prefix) 62 | return open_files(files) 63 | else: 64 | # stream contents from url 65 | fhs = OrderedDict() 66 | for url in urls: 67 | if url.endswith(".tar.gz") or url.endswith(".tgz"): 68 | # tar files have mixed headers and content 69 | # whole file should be loaded in memory first and not streamed 70 | fhs[url] = tarfile.open( 71 | fileobj=load_url_mem(url), mode='r:gz') 72 | elif url.endswith(".gz"): 73 | fhs[url] = gzip.open( 74 | urllib.request.urlopen(url), mode="rb") 75 | fhs[url].peek(1) # peek into file to check if is valid 76 | else: 77 | fhs[url] = urllib.request.urlopen(url) 78 | 79 | return fhs 80 | except (HTTPError, zlib.error, tarfile.TarError): 81 | warnings.warn( 82 | "Download failed, trying again (" + str(att) + "/" + str(retry_attempts) + ")", UserWarning) 83 | 84 | raise Exception("One or more files could not be downloaded: " + 85 | ", ".join(urls)) 86 | 87 | 88 | def filter_function(elements, function, value): 89 | return [elements[i] for i, v in enumerate(map(function, elements)) if v == value] 90 | 91 | 92 | def join_check(elements, sep: str): 93 | if elements: 94 | return sep.join(map(str, elements)) 95 | else: 96 | return "" 97 | 98 | 99 | def load_url_mem(url: str): 100 | """ 101 | Parameters: 102 | * **url** *[str]*: URL to load into memory 103 | 104 | Returns: 105 | * io.BytesIO of the requested url 106 | """ 107 | urlstream = urllib.request.urlopen(url) 108 | # From https://stackoverflow.com/questions/18623842/read-contents-tarfile-into-python-seeking-backwards-is-not-allowed 109 | tmpfile = io.BytesIO() 110 | while True: 111 | s = urlstream.read(io.DEFAULT_BUFFER_SIZE) 112 | if not s: 113 | break 114 | tmpfile.write(s) 115 | urlstream.close() 116 | tmpfile.seek(0) 117 | return tmpfile 118 | 119 | 120 | def open_files(files: list): 121 | """ 122 | Parameters: 123 | * **files** *[list]*: List of files to open (text, ".gz", ".tar.gz", ".tgz") 124 | 125 | Returns: 126 | * OrderedDict {file: file handler} (same order as input) 127 | """ 128 | 129 | fhs = OrderedDict() 130 | for file in files: 131 | if file.endswith(".tar.gz") or file.endswith(".tgz"): 132 | fhs[file] = tarfile.open(file, mode='r:gz') 133 | elif file.endswith(".gz"): 134 | fhs[file] = gzip.open(file, "rt") 135 | else: 136 | fhs[file] = open(file, "r") 137 | return fhs 138 | 139 | 140 | def reverse_dict(d: dict): 141 | rd = {} 142 | for k, v in d.items(): 143 | if v not in rd: 144 | rd[v] = [] 145 | rd[v].append(k) 146 | return rd 147 | 148 | 149 | def save_urls(urls: list, output_prefix: str): 150 | """ 151 | Parameters: 152 | * **urls** *[list]*: List of urls to download 153 | * **output_prefix** *[str]*: Output directory to save files 154 | 155 | Returns: 156 | * list of files saved 157 | """ 158 | files = [] 
159 | for url in urls: 160 | outfile = output_prefix + os.path.basename(url) 161 | check_no_file(outfile) 162 | urlstream = urllib.request.urlopen(url) 163 | with open(outfile, 'b+w') as f: 164 | f.write(urlstream.read()) 165 | urlstream.close() 166 | files.append(outfile) 167 | return files 168 | 169 | 170 | def warning_on_one_line(message, category, filename, lineno, file=None, line=None): 171 | return '%s:%s: %s: %s\n' % (filename, lineno, category.__name__, message) 172 | 173 | 174 | warnings.formatwarning = warning_on_one_line 175 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=42", 4 | "wheel" 5 | ] 6 | build-backend = "setuptools.build_meta" -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import re 4 | 5 | from setuptools import setup 6 | 7 | with open("README.md", "r", encoding="utf-8") as fh: 8 | long_description = fh.read() 9 | 10 | setup( 11 | name="multitax", 12 | version="1.3.2", 13 | url="https://www.github.com/pirovc/multitax", 14 | license="MIT", 15 | author="Vitor C. Piro", 16 | description="Python package to obtain, parse and explore biological and custom taxonomies", 17 | long_description=long_description, 18 | long_description_content_type="text/markdown", 19 | packages=["multitax"], 20 | python_requires=">=3.4", 21 | classifiers=[ 22 | 'License :: OSI Approved :: MIT License', 23 | 'Programming Language :: Python :: 3.9', 24 | 'Programming Language :: Python :: 3.10', 25 | 'Programming Language :: Python :: 3.11', 26 | 'Programming Language :: Python :: 3.12', 27 | 'Programming Language :: Python :: 3.13', 28 | ], 29 | ) 30 | -------------------------------------------------------------------------------- /tests/multitax/data_minimal/custom.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/multitax/daed499cec926848a3d5ea2e36545d823e3a4a81/tests/multitax/data_minimal/custom.tsv.gz -------------------------------------------------------------------------------- /tests/multitax/data_minimal/custom2.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/multitax/daed499cec926848a3d5ea2e36545d823e3a4a81/tests/multitax/data_minimal/custom2.tsv.gz -------------------------------------------------------------------------------- /tests/multitax/data_minimal/custom_unit_test.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/multitax/daed499cec926848a3d5ea2e36545d823e3a4a81/tests/multitax/data_minimal/custom_unit_test.tsv.gz -------------------------------------------------------------------------------- /tests/multitax/data_minimal/gg.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/multitax/daed499cec926848a3d5ea2e36545d823e3a4a81/tests/multitax/data_minimal/gg.txt.gz -------------------------------------------------------------------------------- /tests/multitax/data_minimal/gtdb_ar.tsv.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/pirovc/multitax/daed499cec926848a3d5ea2e36545d823e3a4a81/tests/multitax/data_minimal/gtdb_ar.tsv.gz -------------------------------------------------------------------------------- /tests/multitax/data_minimal/gtdb_ar_metadata.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/multitax/daed499cec926848a3d5ea2e36545d823e3a4a81/tests/multitax/data_minimal/gtdb_ar_metadata.tsv.gz -------------------------------------------------------------------------------- /tests/multitax/data_minimal/gtdb_bac.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/multitax/daed499cec926848a3d5ea2e36545d823e3a4a81/tests/multitax/data_minimal/gtdb_bac.tsv.gz -------------------------------------------------------------------------------- /tests/multitax/data_minimal/gtdb_bac_metadata.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/multitax/daed499cec926848a3d5ea2e36545d823e3a4a81/tests/multitax/data_minimal/gtdb_bac_metadata.tsv.gz -------------------------------------------------------------------------------- /tests/multitax/data_minimal/ncbi.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/multitax/daed499cec926848a3d5ea2e36545d823e3a4a81/tests/multitax/data_minimal/ncbi.tar.gz -------------------------------------------------------------------------------- /tests/multitax/data_minimal/ott.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/multitax/daed499cec926848a3d5ea2e36545d823e3a4a81/tests/multitax/data_minimal/ott.tgz -------------------------------------------------------------------------------- /tests/multitax/data_minimal/silva.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/multitax/daed499cec926848a3d5ea2e36545d823e3a4a81/tests/multitax/data_minimal/silva.txt.gz -------------------------------------------------------------------------------- /tests/multitax/integration/test_common.py: -------------------------------------------------------------------------------- 1 | from multitax import GreengenesTx, GtdbTx, NcbiTx, OttTx, SilvaTx, CustomTx 2 | from tests.multitax.utils import setup_dir, uncompress_gzip, uncompress_tar_gzip 3 | import unittest 4 | import os 5 | import sys 6 | import random 7 | import io 8 | 9 | 10 | sys.path.append("tests/multitax/") 11 | 12 | 13 | class TestCommon(unittest.TestCase): 14 | 15 | tmp_dir = "tests/multitax/integration/tmp_common/" 16 | data_dir = "tests/multitax/data_minimal/" 17 | #data_dir = "tests/multitax/data_complete/" 18 | 19 | taxonomies = {} 20 | taxonomies["gtdb"] = {"class": GtdbTx, 21 | "params": {"files": [data_dir + "gtdb_ar.tsv.gz", 22 | data_dir + "gtdb_bac.tsv.gz"]}} 23 | taxonomies["ncbi"] = {"class": NcbiTx, 24 | "params": {"files": [data_dir + "ncbi.tar.gz"]}} 25 | taxonomies["silva"] = {"class": SilvaTx, 26 | "params": {"files": [data_dir + "silva.txt.gz"]}} 27 | taxonomies["ott"] = {"class": OttTx, 28 | "params": {"files": [data_dir + "ott.tgz"]}} 29 | taxonomies["greengenes"] = {"class": GreengenesTx, 30 | "params": {"files": [data_dir + "gg.txt.gz"]}} 31 | taxonomies["custom"] = {"class": CustomTx, 32 | "params": {"files": [data_dir + 
"custom.tsv.gz", 33 | data_dir + "custom2.tsv.gz"]}} 34 | 35 | @classmethod 36 | def setUpClass(self): 37 | setup_dir(self.tmp_dir) 38 | 39 | def test_basic(self): 40 | """ 41 | Basic test with files 42 | """ 43 | for t in self.taxonomies: 44 | tax = self.taxonomies[t]["class"](**self.taxonomies[t]["params"]) 45 | self.assertGreater(tax.stats()["nodes"], 0, t + " failed") 46 | 47 | def test_print(self): 48 | """ 49 | Test output of printing tax object instance 50 | """ 51 | for t in self.taxonomies: 52 | tax = self.taxonomies[t]["class"](**self.taxonomies[t]["params"]) 53 | out = io.StringIO() 54 | sys.stdout = out 55 | print(tax) 56 | sys.stdout = sys.__stdout__ 57 | self.assertEqual(out.getvalue().lower().startswith(t), True) 58 | 59 | def test_urls(self): 60 | """ 61 | Using urls instead of files 62 | """ 63 | for t in self.taxonomies: 64 | # simulate url with "file://" and absolute path 65 | urls = ["file://" + os.path.abspath(file) 66 | for file in self.taxonomies[t]["params"]["files"]] 67 | tax = self.taxonomies[t]["class"](urls=urls) 68 | self.assertGreater( 69 | tax.stats()["nodes"], 0, t + " failed with urls") 70 | 71 | def test_fail_to_download(self): 72 | """ 73 | Using wrong urls should fail (using ncbi) 74 | """ 75 | with self.assertRaises(Exception): 76 | with self.assertWarns(UserWarning): 77 | tax = self.taxonomies["ncbi"]["class"]( 78 | urls=["www.thisisnotawebsite.com/neither/a/file", "fasfafsafasfasf"]) 79 | 80 | def test_urls_output_prefix(self): 81 | """ 82 | Using urls and saving files on disk 83 | """ 84 | for t in self.taxonomies: 85 | # simulate url with "file://" and absolute path 86 | urls = ["file://" + os.path.abspath(file) 87 | for file in self.taxonomies[t]["params"]["files"]] 88 | tax = self.taxonomies[t]["class"]( 89 | urls=urls, output_prefix=self.tmp_dir) 90 | self.assertGreater( 91 | tax.stats()["nodes"], 0, t + " failed with urls and output_prefix") 92 | 93 | def test_gzip_uncompressed(self): 94 | """ 95 | Using uncompressed gzip files ("gtdb", "silva", "greengenes", "custom") 96 | """ 97 | for t in self.taxonomies: 98 | if t in ["gtdb", "silva", "greengenes", "custom"]: 99 | uncompressed = [] 100 | for file in self.taxonomies[t]["params"]["files"]: 101 | if file.endswith(".gz"): 102 | outfile = self.tmp_dir + os.path.basename(file)[:-3] 103 | uncompress_gzip(file, outfile) 104 | uncompressed.append(outfile) 105 | 106 | if uncompressed: 107 | # Check if results are equal with compressed and uncompressed files 108 | tax_compressed = self.taxonomies[t]["class"]( 109 | **self.taxonomies[t]["params"]) 110 | tax_uncompressed = self.taxonomies[t]["class"]( 111 | files=uncompressed) 112 | self.assertEqual(tax_compressed.stats(), tax_uncompressed.stats( 113 | ), t + " failed with uncompressed files") 114 | 115 | def test_tar_gzip_uncompressed_ncbi(self): 116 | """ 117 | Using uncompressed tar gzip files for ncbi 118 | """ 119 | 120 | # Ncbi 121 | tax_compressed = self.taxonomies["ncbi"]["class"]( 122 | **self.taxonomies["ncbi"]["params"]) 123 | uncompressed_files = uncompress_tar_gzip( 124 | f=self.taxonomies["ncbi"]["params"]["files"][0], outd=self.tmp_dir) 125 | self.assertIn("nodes.dmp", uncompressed_files) 126 | self.assertIn("names.dmp", uncompressed_files) 127 | self.assertIn("merged.dmp", uncompressed_files) 128 | tax_uncompressed = self.taxonomies["ncbi"]["class"](files=[self.tmp_dir + "nodes.dmp", 129 | self.tmp_dir + "names.dmp", 130 | self.tmp_dir + "merged.dmp"]) 131 | # Results of compressed and uncompressed should match 132 | 
self.assertEqual(tax_uncompressed.stats(), tax_compressed.stats()) 133 | 134 | # Ncbi with extended names 135 | ext_ncbi_conf = self.taxonomies["ncbi"].copy() 136 | ext_ncbi_conf["params"]["extended_names"] = True 137 | tax_compressed = ext_ncbi_conf["class"](**ext_ncbi_conf["params"]) 138 | uncompressed_files = uncompress_tar_gzip( 139 | f=ext_ncbi_conf["params"]["files"][0], outd=self.tmp_dir) 140 | self.assertIn("nodes.dmp", uncompressed_files) 141 | self.assertIn("names.dmp", uncompressed_files) 142 | self.assertIn("merged.dmp", uncompressed_files) 143 | tax_uncompressed = ext_ncbi_conf["class"](files=[self.tmp_dir + "nodes.dmp", 144 | self.tmp_dir + "names.dmp", 145 | self.tmp_dir + "merged.dmp"], 146 | extended_names=True) 147 | # Results of compressed and uncompressed should match 148 | self.assertEqual(tax_uncompressed.stats(), tax_compressed.stats()) 149 | 150 | def test_tar_gzip_uncompressed_ott(self): 151 | """ 152 | Using uncompressed tar gzip files for ott 153 | """ 154 | # Ott 155 | tax_compressed = self.taxonomies["ott"]["class"]( 156 | **self.taxonomies["ott"]["params"]) 157 | uncompressed_files = uncompress_tar_gzip( 158 | f=self.taxonomies["ott"]["params"]["files"][0], outd=self.tmp_dir) 159 | self.assertIn("taxonomy.tsv", uncompressed_files) 160 | self.assertIn("forwards.tsv", uncompressed_files) 161 | tax_uncompressed = self.taxonomies["ott"]["class"](files=[self.tmp_dir + "taxonomy.tsv", 162 | self.tmp_dir + "forwards.tsv"]) 163 | # Results of compressed and uncompressed should match 164 | self.assertEqual(tax_uncompressed.stats(), tax_compressed.stats()) 165 | 166 | # Ott with extended names (synonyms.tsv) 167 | ext_ott_conf = self.taxonomies["ott"].copy() 168 | ext_ott_conf["params"]["extended_names"] = True 169 | tax_compressed = ext_ott_conf["class"](**ext_ott_conf["params"]) 170 | uncompressed_files = uncompress_tar_gzip( 171 | f=ext_ott_conf["params"]["files"][0], outd=self.tmp_dir) 172 | self.assertIn("taxonomy.tsv", uncompressed_files) 173 | self.assertIn("forwards.tsv", uncompressed_files) 174 | self.assertIn("synonyms.tsv", uncompressed_files) 175 | tax_uncompressed = ext_ott_conf["class"](files=[self.tmp_dir + "taxonomy.tsv", 176 | self.tmp_dir + "forwards.tsv", 177 | self.tmp_dir + "synonyms.tsv"], 178 | extended_names=True) 179 | # Results of compressed and uncompressed should match 180 | self.assertEqual(tax_uncompressed.stats(), tax_compressed.stats()) 181 | 182 | def test_inconsistent(self): 183 | """ 184 | Test parsing inconsistent taxonomies 185 | """ 186 | for t in self.taxonomies: 187 | # Delete root 188 | tax = self.taxonomies[t]["class"](**self.taxonomies[t]["params"]) 189 | tax.remove(tax.root_node) 190 | with self.assertRaises(ValueError): 191 | tax.check_consistency() 192 | 193 | # Delete random node (parent from random leaf) 194 | tax = self.taxonomies[t]["class"](**self.taxonomies[t]["params"]) 195 | tax.remove(tax.parent(random.choice(tax.leaves()))) 196 | with self.assertRaises(ValueError): 197 | tax.check_consistency() 198 | 199 | # Delete random leaf (do not generate inconsistency) 200 | tax = self.taxonomies[t]["class"](**self.taxonomies[t]["params"]) 201 | tax._remove(random.choice(tax.leaves())) 202 | self.assertEqual(tax.check_consistency(), None) 203 | -------------------------------------------------------------------------------- /tests/multitax/integration/test_empty.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from multitax.multitax import MultiTax 3 | from multitax 
import DummyTx 4 | 5 | 6 | class TestDummy(unittest.TestCase): 7 | 8 | def test_multitax(self): 9 | tax = MultiTax() 10 | stats = tax.stats() 11 | # Only root node 12 | self.assertEqual(stats["nodes"], 1) 13 | # No input sources 14 | self.assertFalse(tax.sources) 15 | 16 | def test_dummy(self): 17 | tax = DummyTx() 18 | stats = tax.stats() 19 | # Only root node 20 | self.assertEqual(stats["nodes"], 1) 21 | # No input sources 22 | self.assertFalse(tax.sources) 23 | -------------------------------------------------------------------------------- /tests/multitax/integration/test_online.py: -------------------------------------------------------------------------------- 1 | from multitax import GreengenesTx, GtdbTx, NcbiTx, OttTx, SilvaTx, CustomTx 2 | from tests.multitax.utils import setup_dir, uncompress_gzip, uncompress_tar_gzip 3 | import unittest 4 | import os 5 | import sys 6 | import random 7 | 8 | sys.path.append("tests/multitax/") 9 | 10 | 11 | @unittest.skip('Skip online by default') 12 | class TestOnline(unittest.TestCase): 13 | 14 | tmp_dir = "tests/multitax/integration/tmp_online/" 15 | 16 | taxonomies = {} 17 | taxonomies["gtdb"] = {"class": GtdbTx} 18 | taxonomies["ncbi"] = {"class": NcbiTx} 19 | taxonomies["silva"] = {"class": SilvaTx} 20 | taxonomies["ott"] = {"class": OttTx} 21 | taxonomies["greengenes"] = {"class": GreengenesTx} 22 | # todo test online custom 23 | 24 | @classmethod 25 | def setUpClass(self): 26 | setup_dir(self.tmp_dir) 27 | 28 | def test_online_default(self): 29 | """ 30 | Default test online 31 | """ 32 | for t in self.taxonomies: 33 | tax = self.taxonomies[t]["class"]() 34 | self.assertGreater(tax.stats()["nodes"], 0, t + " failed") 35 | 36 | def test_online_output_prefix(self): 37 | """ 38 | Saving files on disk 39 | """ 40 | for t in self.taxonomies: 41 | tax = self.taxonomies[t]["class"](output_prefix=self.tmp_dir) 42 | self.assertGreater( 43 | tax.stats()["nodes"], 0, t + " failed with urls and output_prefix") 44 | -------------------------------------------------------------------------------- /tests/multitax/unit/test_functions.py: -------------------------------------------------------------------------------- 1 | from multitax.utils import check_file 2 | from multitax import * 3 | from tests.multitax.utils import setup_dir 4 | import unittest 5 | 6 | 7 | class TestFunctions(unittest.TestCase): 8 | # test data (14 nodes) 9 | # 10 | # rank-1 (root) 1 ___________ 11 | # / \ \ 12 | # rank-2 2.1 2.2 ______ \ 13 | # / \ \ \ \ 14 | # rank-3 3.1 3.2 3.4 \ \ 15 | # / / \ \ \ \ 16 | # rank-4 *4.1 *4.2 *4.3 *4.4 *4.5 *4.6 17 | # / | 18 | # rank-5 *5.1 *5.2 19 | # 20 | # names: 1: Node1, 2.1: Node2.1, ...,5.2: Node5.2 21 | 22 | test_file = "tests/multitax/data_minimal/custom_unit_test.tsv.gz" 23 | tmp_dir = "tests/multitax/unit/tmp_functions/" 24 | 25 | @classmethod 26 | def setUpClass(self): 27 | setup_dir(self.tmp_dir) 28 | 29 | def test_children(self): 30 | """ 31 | test children function 32 | """ 33 | tax = CustomTx(files=self.test_file) 34 | self.assertCountEqual(tax.children("1"), ["2.1", "2.2", "4.6"]) 35 | self.assertCountEqual(tax.children("2.1"), ["3.1", "3.2"]) 36 | self.assertCountEqual(tax.children("2.2"), ["3.4", "4.5"]) 37 | self.assertCountEqual(tax.children("4.4"), ["5.1", "5.2"]) 38 | self.assertCountEqual(tax.children("5.2"), []) 39 | self.assertCountEqual(tax.children("XXX"), []) 40 | 41 | def test_search_name(self): 42 | """ 43 | test search_name function 44 | """ 45 | 46 | # Exact matches 47 | tax = CustomTx(files=self.test_file) 48 | 
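        # Note: the assertions below rely on search_name() matching complete names by
        # default ("Node2." yields nothing); partial matching is exercised further
        # down with exact=False.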
self.assertCountEqual(tax.search_name("Node1"), ["1"]) 49 | self.assertCountEqual(tax.search_name("Node2.1"), ["2.1"]) 50 | self.assertCountEqual(tax.search_name("Node5.2"), ["5.2"]) 51 | self.assertCountEqual(tax.search_name("Node2."), []) 52 | 53 | # not exact matches 54 | tax = CustomTx(files=self.test_file) 55 | self.assertCountEqual(tax.search_name( 56 | "Node2", exact=False), ["2.1", "2.2"]) 57 | self.assertCountEqual(tax.search_name("Node2", exact=True), []) 58 | self.assertCountEqual(tax.search_name("Node1", exact=False), ["1"]) 59 | self.assertCountEqual(tax.search_name("NotThere", exact=False), []) 60 | 61 | # Changing root name 62 | tax = CustomTx(files=self.test_file, root_name="AnotherRootName") 63 | self.assertCountEqual(tax.search_name("Node1", exact=False), []) 64 | self.assertCountEqual(tax.search_name( 65 | "AnotherRootName", exact=True), ["1"]) 66 | self.assertCountEqual(tax.search_name("Another", exact=False), ["1"]) 67 | 68 | # With specific rank 69 | tax = CustomTx(files=self.test_file) 70 | self.assertCountEqual(tax.search_name( 71 | "Node2.1", exact=True, rank="rank-2"), ["2.1"]) 72 | self.assertCountEqual(tax.search_name( 73 | "Node4.4", exact=True, rank="rank-4"), ["4.4"]) 74 | self.assertCountEqual(tax.search_name( 75 | "Node", exact=False, rank="rank-5"), ["5.1", "5.2"]) 76 | self.assertCountEqual(tax.search_name( 77 | "Node2.1", exact=True, rank="rank-3"), []) 78 | self.assertCountEqual(tax.search_name( 79 | "Node4.4", exact=True, rank="rank-1"), []) 80 | self.assertCountEqual(tax.search_name( 81 | "Node5", exact=False, rank="rank-XXX"), []) 82 | 83 | def test_nodes_rank(self): 84 | """ 85 | test nodes_rank function 86 | """ 87 | tax = CustomTx(files=self.test_file) 88 | self.assertCountEqual(tax.nodes_rank("rank-1"), ["1"]) 89 | self.assertCountEqual(tax.nodes_rank("rank-4"), 90 | ["4.1", "4.2", "4.3", "4.4", "4.5", "4.6"]) 91 | self.assertCountEqual(tax.nodes_rank("rank-9999"), []) 92 | 93 | def test_parent(self): 94 | """ 95 | test parent function 96 | """ 97 | tax = CustomTx(files=self.test_file) 98 | self.assertEqual(tax.parent("1"), tax.root_parent) 99 | self.assertEqual(tax.parent("3.2"), "2.1") 100 | self.assertEqual(tax.parent("5.2"), "4.4") 101 | self.assertEqual(tax.parent("PpQqRr"), tax.undefined_node) 102 | 103 | tax = CustomTx(files=self.test_file, undefined_node="NoNode") 104 | self.assertEqual(tax.parent("ABVCDE"), "NoNode") 105 | 106 | def test_rank(self): 107 | """ 108 | test rank function 109 | """ 110 | tax = CustomTx(files=self.test_file) 111 | self.assertEqual(tax.rank("4.1"), "rank-4") 112 | self.assertEqual(tax.rank("1"), "rank-1") 113 | self.assertEqual(tax.rank("5.2"), "rank-5") 114 | self.assertEqual(tax.rank("what"), tax.undefined_rank) 115 | 116 | tax = CustomTx(files=self.test_file, undefined_rank="NoRank") 117 | self.assertEqual(tax.rank("ABVCDE"), "NoRank") 118 | 119 | def test_name(self): 120 | """ 121 | test name function 122 | """ 123 | tax = CustomTx(files=self.test_file) 124 | self.assertEqual(tax.name("4.1"), "Node4.1") 125 | self.assertEqual(tax.name("1"), "Node1") 126 | self.assertEqual(tax.name("2.2"), "Node2.2") 127 | self.assertEqual(tax.name("ABVCDE"), tax.undefined_name) 128 | 129 | tax = CustomTx(files=self.test_file, undefined_name="NoName") 130 | self.assertEqual(tax.name("ABVCDE"), "NoName") 131 | 132 | def test_latest(self): 133 | """ 134 | test latest function 135 | """ 136 | tax = CustomTx(files=self.test_file) 137 | self.assertEqual(tax.latest("4.1"), "4.1") 138 | self.assertEqual(tax.latest("1"), "1") 139 | 
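        # For a custom taxonomy without merged/forwarded identifiers, latest() is
        # expected to return the queried node itself, or undefined_node if unknown.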
self.assertEqual(tax.latest("4.6"), "4.6") 140 | self.assertEqual(tax.latest("XxXxXx"), tax.undefined_node) 141 | 142 | def test_leaves(self): 143 | """ 144 | test leaves function 145 | """ 146 | tax = CustomTx(files=self.test_file) 147 | self.assertCountEqual( 148 | tax.leaves(), ["4.1", "4.2", "4.3", "4.5", "4.6", "5.1", "5.2"]) 149 | self.assertCountEqual(tax.leaves( 150 | "1"), ["4.1", "4.2", "4.3", "5.1", "5.2", "4.5", "4.6"]) 151 | self.assertCountEqual(tax.leaves("2.2"), ["5.1", "5.2", "4.5"]) 152 | self.assertCountEqual(tax.leaves("4.4"), ["5.1", "5.2"]) 153 | self.assertCountEqual(tax.leaves("5.1"), ["5.1"]) 154 | self.assertCountEqual(tax.leaves("999.999"), []) 155 | 156 | def test_lineage(self): 157 | """ 158 | test lineage function 159 | """ 160 | tax = CustomTx(files=self.test_file) 161 | # Use only assertEqual instead of assertCountEqual -> order matters 162 | self.assertEqual(tax.lineage("5.2"), ["1", "2.2", "3.4", "4.4", "5.2"]) 163 | self.assertEqual(tax.lineage("3.2"), ["1", "2.1", "3.2"]) 164 | self.assertEqual(tax.lineage("4.6"), ["1", "4.6"]) 165 | self.assertEqual(tax.lineage("1"), ["1"]) 166 | self.assertEqual(tax.lineage("9999"), []) 167 | 168 | # with ranks 169 | self.assertEqual(tax.lineage("5.2", ranks=["rank-1", "rank-3", "rank-5"]), 170 | ["1", "3.4", "5.2"]) 171 | self.assertEqual(tax.lineage("5.2", ranks=["rank-3", "rank-5", "rank-1"]), 172 | ["3.4", "5.2", "1"]) 173 | self.assertEqual(tax.lineage("4.5", ranks=["rank-1"]), 174 | ["1"]) 175 | self.assertEqual(tax.lineage("3.2", ranks=["rank-4", "rank-5"]), 176 | [tax.undefined_node, tax.undefined_node]) 177 | self.assertEqual(tax.lineage("4.5", ranks=["rank-1", "rank-2", "rank-3", "rank-4", "rank-5"]), 178 | ["1", "2.2", tax.undefined_node, "4.5", tax.undefined_node]) 179 | self.assertEqual(tax.lineage("4.6", ranks=["xxxx", "yyy"]), 180 | [tax.undefined_node, tax.undefined_node]) 181 | # Invalid lineage 182 | self.assertEqual(tax.lineage("ZZZ", ranks=["xxxx", "yyy"]), 183 | []) 184 | 185 | # with root_node 186 | self.assertEqual(tax.lineage("5.2", root_node="2.2"), 187 | ["2.2", "3.4", "4.4", "5.2"]) 188 | self.assertEqual(tax.lineage("4.2", root_node="2.1"), 189 | ["2.1", "3.2", "4.2"]) 190 | self.assertEqual(tax.lineage("4.5", root_node="2.2"), 191 | ["2.2", "4.5"]) 192 | # Invalid lineage 193 | self.assertEqual(tax.lineage("5.2", root_node="2.1"), 194 | []) 195 | self.assertEqual(tax.lineage("3.1", root_node="4.1"), 196 | []) 197 | self.assertEqual(tax.lineage("XXX", root_node="YYY"), 198 | []) 199 | 200 | # with both 201 | self.assertEqual(tax.lineage("5.2", root_node="2.2", ranks=["rank-3", "rank-4"]), 202 | ["3.4", "4.4"]) 203 | self.assertEqual(tax.lineage("5.1", root_node="3.4", ranks=["rank-3", "rank-5"]), 204 | ["3.4", "5.1"]) 205 | self.assertEqual(tax.lineage("4.1", root_node="2.1", ranks=["rank-1", "rank-2", "rank-3", "rank-5"]), 206 | [tax.undefined_node, "2.1", "3.1", tax.undefined_node]) 207 | self.assertEqual(tax.lineage("4.1", root_node="2.1", ranks=["rank-1", "rank-5"]), 208 | [tax.undefined_node, tax.undefined_node]) 209 | self.assertEqual(tax.lineage("4.1", root_node="2.1", ranks=["XXXXX"]), 210 | [tax.undefined_node]) 211 | # Invalid lineage 212 | self.assertEqual(tax.lineage("4.1", root_node="5.1", ranks=["rank-1", "rank-2", "rank-3", "rank-5"]), 213 | []) 214 | self.assertEqual(tax.lineage("XXXX", root_node="2.2", ranks=["rank-3", "rank-4"]), 215 | []) 216 | 217 | def test_rank_lineage(self): 218 | """ 219 | test rank_lineage function 220 | """ 221 | tax = CustomTx(files=self.test_file) 
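        # rank_lineage() mirrors lineage() but returns the rank of each node along
        # the lineage, in root-to-node order.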
222 | self.assertEqual(tax.rank_lineage("5.2"), [ 223 | "rank-1", "rank-2", "rank-3", "rank-4", "rank-5"]) 224 | self.assertEqual(tax.rank_lineage("4.6"), ["rank-1", "rank-4"]) 225 | self.assertEqual(tax.rank_lineage("1"), ["rank-1"]) 226 | self.assertEqual(tax.rank_lineage("9999"), []) 227 | 228 | # with ranks or root_node 229 | self.assertEqual(tax.rank_lineage("5.2", ranks=["rank-1", "rank-3"]), 230 | ["rank-1", "rank-3"]) 231 | self.assertEqual(tax.rank_lineage("5.2", ranks=["rank-1", "XXX", "rank-3"]), 232 | ["rank-1", tax.undefined_rank, "rank-3"]) 233 | self.assertEqual(tax.rank_lineage("ZZZ", ranks=["rank-1", "XXX", "rank-3"]), 234 | []) 235 | self.assertEqual(tax.rank_lineage("5.2", root_node="2.2"), 236 | ["rank-2", "rank-3", "rank-4", "rank-5"]) 237 | self.assertEqual(tax.rank_lineage("5.2", root_node="2.1"), 238 | []) 239 | self.assertEqual(tax.rank_lineage("XXX", root_node="YYY"), 240 | []) 241 | 242 | # with both 243 | self.assertEqual(tax.rank_lineage("5.2", root_node="2.2", ranks=["rank-3", "rank-4"]), 244 | ["rank-3", "rank-4"]) 245 | self.assertEqual(tax.rank_lineage("4.1", root_node="2.1", ranks=["rank-1", "rank-2", "rank-3", "rank-5"]), 246 | [tax.undefined_rank, "rank-2", "rank-3", tax.undefined_rank]) 247 | self.assertEqual(tax.rank_lineage("4.1", root_node="2.1", ranks=["rank-1", "rank-5"]), 248 | [tax.undefined_rank, tax.undefined_rank]) 249 | self.assertEqual(tax.rank_lineage("4.1", root_node="5.1", ranks=["rank-1", "rank-2", "rank-3", "rank-5"]), 250 | []) 251 | self.assertEqual(tax.rank_lineage("XXXX", root_node="ZZZ", ranks=["CCC", "VVV"]), 252 | []) 253 | 254 | def test_name_lineage(self): 255 | """ 256 | test rank_lineage function 257 | """ 258 | tax = CustomTx(files=self.test_file) 259 | self.assertEqual(tax.name_lineage("5.2"), [ 260 | "Node1", "Node2.2", "Node3.4", "Node4.4", "Node5.2"]) 261 | self.assertEqual(tax.name_lineage("4.6"), ["Node1", "Node4.6"]) 262 | self.assertEqual(tax.name_lineage("1"), ["Node1"]) 263 | self.assertEqual(tax.name_lineage("9999"), []) 264 | 265 | # with ranks or root_node 266 | self.assertEqual(tax.name_lineage("5.2", ranks=["rank-1", "rank-3"]), 267 | ["Node1", "Node3.4"]) 268 | self.assertEqual(tax.name_lineage("5.2", ranks=["rank-1", "XXX", "rank-3"]), 269 | ["Node1", tax.undefined_name, "Node3.4"]) 270 | self.assertEqual(tax.name_lineage("ZZZ", ranks=["rank-1", "XXX", "rank-3"]), 271 | []) 272 | self.assertEqual(tax.name_lineage("5.2", root_node="2.2"), 273 | ["Node2.2", "Node3.4", "Node4.4", "Node5.2"]) 274 | self.assertEqual(tax.name_lineage("5.2", root_node="2.1"), 275 | []) 276 | self.assertEqual(tax.name_lineage("XXX", root_node="YYY"), 277 | []) 278 | 279 | # with both 280 | self.assertEqual(tax.name_lineage("5.2", root_node="2.2", ranks=["rank-3", "rank-4"]), 281 | ["Node3.4", "Node4.4"]) 282 | self.assertEqual(tax.name_lineage("4.1", root_node="2.1", ranks=["rank-1", "rank-2", "rank-3", "rank-5"]), 283 | [tax.undefined_name, "Node2.1", "Node3.1", tax.undefined_name]) 284 | self.assertEqual(tax.name_lineage("4.1", root_node="2.1", ranks=["rank-1", "rank-5"]), 285 | [tax.undefined_name, tax.undefined_name]) 286 | self.assertEqual(tax.name_lineage("4.1", root_node="5.1", ranks=["rank-1", "rank-2", "rank-3", "rank-5"]), 287 | []) 288 | self.assertEqual(tax.name_lineage("XXXX", root_node="ZZZ", ranks=["CCC", "VVV"]), 289 | []) 290 | 291 | def test_parent_rank(self): 292 | """ 293 | test parent_rank function 294 | """ 295 | tax = CustomTx(files=self.test_file) 296 | self.assertEqual(tax.parent_rank("5.2", "rank-3"), 
"3.4") 297 | self.assertEqual(tax.parent_rank("4.1", "rank-2"), "2.1") 298 | self.assertEqual(tax.parent_rank("3.2", "rank-1"), "1") 299 | self.assertEqual(tax.parent_rank("3.2", "rank-4"), tax.undefined_node) 300 | self.assertEqual(tax.parent_rank("2.2", "XXXX"), tax.undefined_node) 301 | self.assertEqual(tax.parent_rank("CCCC", "XXXX"), tax.undefined_node) 302 | 303 | def test_closest_parent(self): 304 | """ 305 | test closest_parent function 306 | """ 307 | tax = CustomTx(files=self.test_file) 308 | self.assertEqual(tax.closest_parent( 309 | "5.2", ["rank-1", "rank-3"]), "3.4") 310 | self.assertEqual(tax.closest_parent( 311 | "5.2", ["rank-1", "rank-3", "rank-4"]), "4.4") 312 | self.assertEqual(tax.closest_parent( 313 | "5.2", ["rank-1", "rank-3", "rank-4", "rank-5"]), "5.2") 314 | self.assertEqual(tax.closest_parent( 315 | "5.2", ["rank-1", "rank-3", "rank-4", "rank-5", "XXXXX"]), "5.2") 316 | self.assertEqual(tax.closest_parent( 317 | "3.4", ["rank-1", "rank-4", "rank-5"]), "1") 318 | self.assertEqual(tax.closest_parent( 319 | "4.6", ["rank-1", "rank-2", "rank-3", "rank-5"]), "1") 320 | self.assertEqual(tax.closest_parent( 321 | "4.6", ["rank-2", "rank-3", "rank-5"]), tax.undefined_node) 322 | self.assertEqual(tax.closest_parent( 323 | "3.4", ["X", "Y", "Z"]), tax.undefined_node) 324 | self.assertEqual(tax.closest_parent("3.4", []), "3.4") 325 | 326 | def test_stats(self): 327 | """ 328 | test stats function 329 | """ 330 | tax = CustomTx(files=self.test_file) 331 | stats = tax.stats() 332 | self.assertEqual(stats["nodes"], 14) 333 | self.assertEqual(stats["names"], 14) 334 | self.assertEqual(stats["ranks"], 14) 335 | self.assertEqual(stats["leaves"], 7) 336 | self.assertEqual(len(stats["ranked_nodes"]), 5) 337 | self.assertEqual(sum(stats["ranked_nodes"].values()), stats["nodes"]) 338 | self.assertEqual(sum(stats["ranked_leaves"].values()), stats["leaves"]) 339 | self.assertCountEqual(list(stats["ranked_leaves"].keys()), [ 340 | "rank-4", "rank-5"]) 341 | 342 | def test_build_lineages(self): 343 | """ 344 | test build_lineages function 345 | """ 346 | # build full lineages 347 | tax = CustomTx(files=self.test_file) 348 | self.assertEqual(len(tax._lineages), 0) 349 | tax.build_lineages() 350 | self.assertEqual(len(tax._lineages), 14) 351 | self.assertEqual(tax.lineage("5.2"), ["1", "2.2", "3.4", "4.4", "5.2"]) 352 | self.assertEqual(tax.lineage("XXX"), []) 353 | # do not use stored lineage with keyword arguments 354 | self.assertEqual(tax.lineage("5.2", root_node="2.2"), 355 | ["2.2", "3.4", "4.4", "5.2"]) 356 | self.assertEqual(tax.lineage( 357 | "5.2", ranks=["rank-2", "rank-4"]), ["2.2", "4.4"]) 358 | self.assertEqual(tax.lineage("5.2", root_node="2.2", ranks=[ 359 | "rank-2", "rank-4"]), ["2.2", "4.4"]) 360 | 361 | # build filtered lineages 362 | tax.clear_lineages() 363 | self.assertEqual(len(tax._lineages), 0) 364 | tax.build_lineages(root_node="2.2", ranks=["rank-2", "rank-4"]) 365 | self.assertEqual(len(tax._lineages), 14) 366 | self.assertEqual(tax.lineage("5.2"), ["2.2", "4.4"]) 367 | self.assertEqual(tax.lineage("XXX"), []) 368 | # do not use stored lineage with keyword arguments 369 | self.assertEqual(tax.lineage("5.2", root_node="3.4"), 370 | ["3.4", "4.4", "5.2"]) 371 | self.assertEqual(tax.lineage("5.2", ranks=[]), [ 372 | "1", "2.2", "3.4", "4.4", "5.2"]) 373 | self.assertEqual(tax.lineage("5.2", root_node="2.2", ranks=[ 374 | "rank-2", "rank-5"]), ["2.2", "5.2"]) 375 | 376 | def test_clear_lineages(self): 377 | """ 378 | test clear_lineages function 379 | """ 380 | 
tax = CustomTx(files=self.test_file)
381 |         self.assertEqual(len(tax._lineages), 0)
382 |         tax.build_lineages()
383 |         self.assertEqual(len(tax._lineages), 14)
384 |         tax.clear_lineages()
385 |         self.assertEqual(len(tax._lineages), 0)
386 |         self.assertEqual(tax.lineage("5.2"), ["1", "2.2", "3.4", "4.4", "5.2"])
387 |         self.assertEqual(tax.lineage("XXX"), [])
388 | 
389 |     def test_translation(self):
390 |         """
391 |         test build_translation and translate functions (GTDB<->NCBI)
392 |         """
393 |         gtdb_tax = GtdbTx(files=["tests/multitax/data_minimal/gtdb_ar.tsv.gz",
394 |                                  "tests/multitax/data_minimal/gtdb_bac.tsv.gz"])
395 |         ncbi_tax = NcbiTx(files="tests/multitax/data_minimal/ncbi.tar.gz")
396 | 
397 |         # GTDB->NCBI
398 |         # Should be no translation yet (g__Paenibacillus is contained in both test sets)
399 |         self.assertCountEqual(gtdb_tax.translate("g__Paenibacillus"), [])
400 |         gtdb_tax.build_translation(ncbi_tax, files=[
401 |                                    "tests/multitax/data_minimal/gtdb_ar_metadata.tsv.gz", "tests/multitax/data_minimal/gtdb_bac_metadata.tsv.gz"])
402 |         self.assertCountEqual(gtdb_tax.translate(
403 |             "g__Paenibacillus"), ["44249"])
404 | 
405 |         # NCBI->GTDB
406 |         # Should be no translation yet (g__Paenibacillus is contained in both test sets)
407 |         self.assertCountEqual(ncbi_tax.translate("44249"), [])
408 |         ncbi_tax.build_translation(gtdb_tax, files=[
409 |                                    "tests/multitax/data_minimal/gtdb_ar_metadata.tsv.gz", "tests/multitax/data_minimal/gtdb_bac_metadata.tsv.gz"])
410 |         self.assertCountEqual(ncbi_tax.translate("44249"), ["g__Paenibacillus"])
411 | 
412 |         # Other translations not yet implemented
413 |         ott_tax = OttTx(files="tests/multitax/data_minimal/ott.tgz")
414 |         silva_tax = SilvaTx(files="tests/multitax/data_minimal/silva.txt.gz")
415 |         gg_tax = GreengenesTx(files="tests/multitax/data_minimal/gg.txt.gz")
416 |         with self.assertWarns(UserWarning):
417 |             ncbi_tax.build_translation(ott_tax)
418 |             ncbi_tax.build_translation(silva_tax)
419 |             ncbi_tax.build_translation(gg_tax)
420 |             gtdb_tax.build_translation(ott_tax)
421 |             gtdb_tax.build_translation(silva_tax)
422 |             gtdb_tax.build_translation(gg_tax)
423 |             ott_tax.build_translation(silva_tax)
424 |             ott_tax.build_translation(gg_tax)
425 |             ott_tax.build_translation(gtdb_tax)
426 |             ott_tax.build_translation(ncbi_tax)
427 |             gg_tax.build_translation(ott_tax)
428 |             gg_tax.build_translation(silva_tax)
429 |             gg_tax.build_translation(gtdb_tax)
430 |             gg_tax.build_translation(ncbi_tax)
431 | 
432 |     def test_check_consistency(self):
433 |         """
434 |         test check_consistency function
435 |         """
436 |         tax = CustomTx(files=self.test_file)
437 |         self.assertEqual(tax.check_consistency(), None)
438 |         # delete node
439 |         del tax._nodes["3.4"]
440 |         with self.assertRaises(ValueError):
441 |             tax.check_consistency()
442 | 
443 |         tax = CustomTx(files=self.test_file)
444 |         # delete leaf node
445 |         del tax._nodes["5.2"]
446 |         self.assertEqual(tax.check_consistency(), None)
447 | 
448 |         tax = CustomTx(files=self.test_file)
449 |         # delete root
450 |         del tax._nodes["1"]
451 |         # should raise error
452 |         with self.assertRaises(ValueError):
453 |             tax.check_consistency()
454 | 
455 |     def test_filter(self):
456 |         """
457 |         test filter function
458 |         """
459 |         # Ancestors
460 |         tax = CustomTx(files=self.test_file)
461 |         tax.filter("4.5")
462 |         self.assertEqual(tax.stats()["nodes"], 3)
463 |         self.assertCountEqual(tax.lineage("4.5"), ["1", "2.2", "4.5"])
464 |         self.assertCountEqual(tax.leaves("1"), ["4.5"])
465 | 
466 |         tax = CustomTx(files=self.test_file)
467 |         tax.filter(["4.5", "XXXX"])
468 | 
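        # Unknown nodes passed to filter() should be ignored: the result below
        # matches filtering "4.5" alone.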
self.assertEqual(tax.stats()["nodes"], 3) 469 | self.assertCountEqual(tax.lineage("4.5"), ["1", "2.2", "4.5"]) 470 | self.assertCountEqual(tax.leaves("1"), ["4.5"]) 471 | 472 | tax = CustomTx(files=self.test_file) 473 | tax.filter(["4.1", "5.1", "5.2"]) 474 | self.assertEqual(tax.stats()["nodes"], 9) 475 | self.assertCountEqual(tax.lineage("4.1"), ["1", "2.1", "3.1", "4.1"]) 476 | self.assertCountEqual(tax.leaves("1"), ["4.1", "5.1", "5.2"]) 477 | 478 | tax = CustomTx(files=self.test_file) 479 | tax.filter("XXXX") 480 | self.assertEqual(tax.stats()["nodes"], 1) 481 | 482 | # Descendants 483 | tax = CustomTx(files=self.test_file) 484 | tax.filter("3.4", desc=True) 485 | self.assertEqual(tax.stats()["nodes"], 5) 486 | self.assertCountEqual(tax.lineage("3.4"), ["1", "3.4"]) 487 | self.assertCountEqual(tax.leaves("1"), ["5.1", "5.2"]) 488 | 489 | tax = CustomTx(files=self.test_file) 490 | tax.filter(["XXXXX", "3.4"], desc=True) 491 | self.assertEqual(tax.stats()["nodes"], 5) 492 | self.assertCountEqual(tax.lineage("3.4"), ["1", "3.4"]) 493 | self.assertCountEqual(tax.leaves("1"), ["5.1", "5.2"]) 494 | 495 | tax = CustomTx(files=self.test_file) 496 | tax.filter(["3.2", "4.4"], desc=True) 497 | self.assertEqual(tax.stats()["nodes"], 7) 498 | self.assertCountEqual(tax.lineage("5.2"), ["1", "4.4", "5.2"]) 499 | self.assertCountEqual(tax.lineage("4.5"), []) 500 | self.assertCountEqual(tax.leaves("1"), ["4.2", "4.3", "5.1", "5.2"]) 501 | 502 | tax = CustomTx(files=self.test_file) 503 | self.assertEqual(tax.stats()["nodes"], 14) 504 | tax.filter("XXXXX", desc=True) 505 | self.assertEqual(tax.stats()["nodes"], 1) 506 | 507 | def test_add(self): 508 | """ 509 | test add function 510 | """ 511 | tax = CustomTx(files=self.test_file) 512 | # Add leaf node 5.3 to parent 4.4 513 | tax.add("5.3", "4.4") 514 | self.assertEqual(tax.check_consistency(), None) 515 | self.assertEqual(tax.parent("5.3"), "4.4") 516 | self.assertEqual(tax.name("5.3"), tax.undefined_name) 517 | self.assertEqual(tax.rank("5.3"), tax.undefined_rank) 518 | 519 | # Add another leaf on the 5.3 with name and rank 520 | tax.add("6.1", "5.3", name="Node6.1", rank="rank-6") 521 | self.assertEqual(tax.check_consistency(), None) 522 | self.assertEqual(tax.parent("6.1"), "5.3") 523 | self.assertEqual(tax.name("6.1"), "Node6.1") 524 | self.assertEqual(tax.rank("6.1"), "rank-6") 525 | self.assertEqual(tax.lineage("6.1"), [ 526 | "1", "2.2", "3.4", "4.4", "5.3", "6.1"]) 527 | 528 | # Add node without valid parent, raises ValueError 529 | with self.assertRaises(ValueError): 530 | tax.add("6.2", "XXX") 531 | 532 | # Add already existing node 533 | with self.assertRaises(ValueError): 534 | tax.add("5.1", "4.4") 535 | 536 | def test_remove(self): 537 | """ 538 | test remove function 539 | """ 540 | tax = CustomTx(files=self.test_file) 541 | tax.remove("5.2") 542 | self.assertEqual(tax.latest("5.2"), tax.undefined_node) 543 | self.assertEqual(tax.parent("5.2"), tax.undefined_node) 544 | self.assertEqual(tax.name("5.2"), tax.undefined_node) 545 | self.assertEqual(tax.rank("5.2"), tax.undefined_node) 546 | self.assertEqual(tax.lineage("5.2"), []) 547 | 548 | # Initialize aux structures and clear them after removing node 549 | tax = CustomTx(files=self.test_file, build_name_nodes=True, 550 | build_node_children=True, build_rank_nodes=True) 551 | self.assertNotEqual(len(tax._name_nodes), 0) 552 | self.assertNotEqual(len(tax._node_children), 0) 553 | self.assertNotEqual(len(tax._rank_nodes), 0) 554 | tax.remove("5.2") 555 | 
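        # Removing a node is expected to reset the auxiliary lookup structures
        # built at init (name, children and rank indexes), as asserted below.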
self.assertEqual(len(tax._name_nodes), 0)
556 |         self.assertEqual(len(tax._node_children), 0)
557 |         self.assertEqual(len(tax._rank_nodes), 0)
558 | 
559 |         # with check_consistency
560 |         tax.remove("5.1", check_consistency=True)
561 | 
562 |         # Removing node that breaks the tree (allowed)
563 |         tax.remove("3.1")
564 |         # node is removed anyway
565 |         self.assertEqual(tax.latest("3.1"), tax.undefined_node)
566 |         with self.assertRaises(ValueError):
567 |             tax.check_consistency()
568 | 
569 |         # Removing and raising exception
570 |         with self.assertRaises(ValueError):
571 |             tax.remove("3.2", check_consistency=True)
572 |         # node is removed anyway
573 |         self.assertEqual(tax.latest("3.2"), tax.undefined_node)
574 | 
575 |         # Removing root
576 |         tax.remove("1")
577 |         with self.assertRaises(ValueError):
578 |             tax.check_consistency()
579 | 
580 |         # Removing node not present
581 |         with self.assertRaises(ValueError):
582 |             tax.remove("XXX")
583 | 
584 |     def test_prune(self):
585 |         """
586 |         test prune function
587 |         """
588 |         tax = CustomTx(files=self.test_file)
589 | 
590 |         self.assertCountEqual(tax.leaves("4.4"), ["5.1", "5.2"])
591 |         tax.prune("4.4")
592 |         self.assertEqual(tax.check_consistency(), None)
593 |         self.assertCountEqual(tax.leaves("4.4"), ["4.4"])
594 | 
595 |         # Prune leaf node (nothing changes)
596 |         self.assertCountEqual(tax.leaves("4.6"), ["4.6"])
597 |         tax.prune("4.6")
598 |         self.assertEqual(tax.check_consistency(), None)
599 |         self.assertCountEqual(tax.leaves("4.6"), ["4.6"])
600 | 
601 |         # Prune multiple overlapping nodes
602 |         self.assertCountEqual(tax.leaves("2.1"), ["4.1", "4.2", "4.3"])
603 |         self.assertCountEqual(tax.leaves("3.2"), ["4.2", "4.3"])
604 |         tax.prune(["2.1", "3.2"])
605 |         self.assertEqual(tax.check_consistency(), None)
606 |         self.assertCountEqual(tax.leaves("2.1"), ["2.1"])
607 |         self.assertCountEqual(tax.leaves("3.2"), [])
608 | 
609 |         # Restart tax
610 |         tax = CustomTx(files=self.test_file)
611 |         # Prune multiple overlapping nodes (reversed)
612 |         self.assertCountEqual(tax.leaves("2.1"), ["4.1", "4.2", "4.3"])
613 |         self.assertCountEqual(tax.leaves("3.2"), ["4.2", "4.3"])
614 |         tax.prune(["3.2", "2.1"])
615 |         self.assertEqual(tax.check_consistency(), None)
616 |         self.assertCountEqual(tax.leaves("2.1"), ["2.1"])
617 |         self.assertCountEqual(tax.leaves("3.2"), [])
618 | 
619 |         # Pruning node not present
620 |         with self.assertRaises(ValueError):
621 |             tax.prune("XXX")
622 | 
623 |         # Pruning root node
624 |         tax.prune(tax.root_node)
625 |         self.assertEqual(len(tax._nodes), 1)
626 | 
627 |     def test_write(self):
628 |         """
629 |         test write function
630 |         """
631 |         tax = CustomTx(files=self.test_file)
632 |         outfile = self.tmp_dir + "default.tsv"
633 |         tax.write(outfile)
634 |         self.assertEqual(check_file(outfile), None)
635 | 
636 |         tax = CustomTx(files=self.test_file)
637 |         outfile = self.tmp_dir + "ranks.tsv"
638 |         tax.write(outfile,
639 |                   ranks=["rank-2", "rank-4"],
640 |                   cols=["node", "rank", "lineage", "rank_lineage", "name_lineage"])
641 |         self.assertEqual(check_file(outfile), None)
642 | 
643 |         tax = CustomTx(files=self.test_file)
644 |         outfile = self.tmp_dir + "all_cols.tsv"
645 |         tax.write(outfile,
646 |                   cols=["node", "latest", "parent", "rank", "name", "leaves", "children", "lineage", "rank_lineage", "name_lineage"])
647 |         self.assertEqual(check_file(outfile), None)
648 | 
649 |         tax = CustomTx(files=self.test_file)
650 |         outfile = self.tmp_dir + "sep_comma.tsv"
651 |         tax.write(outfile,
652 |                   sep=",")
653 |         self.assertEqual(check_file(outfile), None)
654 | 
655 |         tax = 
CustomTx(files=self.test_file) 656 | outfile = self.tmp_dir + "sep_multi_underline.tsv" 657 | tax.write(outfile, 658 | cols=["node", "lineage", "children", "leaves"], 659 | sep_multi="_") 660 | self.assertEqual(check_file(outfile), None) 661 | 662 | def test_ott_forwards(self): 663 | """ 664 | Test forwards functionality (ott only) 665 | """ 666 | # forwards.tsv 667 | # id replacement 668 | # 5044012 4603004 669 | # 391495 391494 670 | 671 | tax = OttTx(files="tests/multitax/data_minimal/ott.tgz") 672 | self.assertEqual(len(tax._forwards), 2) 673 | 674 | self.assertEqual(tax.parent("5044012"), tax.undefined_node) 675 | self.assertEqual(tax.latest("5044012"), "4603004") 676 | self.assertNotEqual(tax.parent( 677 | tax.latest("5044012")), tax.undefined_node) 678 | 679 | self.assertEqual(tax.parent("391495"), tax.undefined_node) 680 | self.assertEqual(tax.latest("391495"), "391494") 681 | self.assertNotEqual(tax.parent( 682 | tax.latest("391495")), tax.undefined_node) 683 | 684 | def test_ncbi_merged(self): 685 | """ 686 | Test merged functionality (ncbi only) 687 | """ 688 | # merged.dmp 689 | # 1235230 | 459525 | 690 | # 1235908 | 363999 | 691 | 692 | tax = NcbiTx(files="tests/multitax/data_minimal/ncbi.tar.gz") 693 | self.assertEqual(len(tax._merged), 2) 694 | 695 | self.assertEqual(tax.parent("1235230"), tax.undefined_node) 696 | self.assertEqual(tax.latest("1235230"), "459525") 697 | self.assertNotEqual(tax.parent( 698 | tax.latest("1235230")), tax.undefined_node) 699 | 700 | self.assertEqual(tax.parent("1235908"), tax.undefined_node) 701 | self.assertEqual(tax.latest("1235908"), "363999") 702 | self.assertNotEqual(tax.parent( 703 | tax.latest("1235908")), tax.undefined_node) 704 | 705 | def test_ncbi_extended_names(self): 706 | """ 707 | Test extended names functionality (ncbi) 708 | """ 709 | # on names.dmp 710 | # 363999 | Xylariaceae sp. 5129 | | includes | 711 | # 363999 | Xylariaceae sp. 5151 | | includes | 712 | # 363999 | Xylariaceae sp. 
5228 | | includes | 713 | # 37990 | mitosporic Xylariaceae | | includes | 714 | # 37990 | Xylariaceae | | scientific name | 715 | 716 | tax = NcbiTx(files="tests/multitax/data_minimal/ncbi.tar.gz", 717 | extended_names=False) 718 | tax_ex = NcbiTx( 719 | files="tests/multitax/data_minimal/ncbi.tar.gz", extended_names=True) 720 | 721 | # Exact match on scientific name 722 | self.assertCountEqual(tax.search_name("Xylariaceae"), ["37990"]) 723 | self.assertCountEqual(tax_ex.search_name("Xylariaceae"), ["37990"]) 724 | # All scientific names 725 | self.assertCountEqual(tax.search_name( 726 | "Xylariaceae", exact=False), ["37990"]) 727 | self.assertCountEqual(tax_ex.search_name( 728 | "Xylariaceae", exact=False), ["37990"]) 729 | # Exact match on scientific name forcing extended 730 | self.assertCountEqual(tax.search_name("Xylariaceae"), ["37990"]) 731 | self.assertCountEqual(tax_ex.search_name( 732 | "Xylariaceae", force_extended=True), ["37990"]) 733 | # All names 734 | self.assertCountEqual(tax.search_name( 735 | "Xylariaceae", exact=False), ["37990"]) 736 | self.assertCountEqual(tax_ex.search_name( 737 | "Xylariaceae", exact=False, force_extended=True), ["37990", "363999"]) 738 | # Exact name available only on extended 739 | self.assertCountEqual(tax.search_name( 740 | "mitosporic Xylariaceae", exact=True), []) 741 | self.assertCountEqual(tax_ex.search_name( 742 | "mitosporic Xylariaceae", exact=True), ["37990"]) 743 | # Partial name available only on extended 744 | self.assertCountEqual(tax.search_name( 745 | "Xylariaceae sp.", exact=False), []) 746 | self.assertCountEqual(tax_ex.search_name( 747 | "Xylariaceae sp.", exact=False), ["363999"]) 748 | 749 | def test_ott_extended_names(self): 750 | """ 751 | Test extended names functionality (ott) 752 | """ 753 | # on taxonomy.tsv 754 | # 4622 | 470454 | Haemophilus sp. CCUG 32367 | species | silva:EU909664,ncbi:554010 | | sibling_higher | 755 | # 4621 | 470454 | Haemophilus sp. CCUG 35214 | species | silva:EU909665,ncbi:554011 | | sibling_higher | 756 | # 158636 | 470454 | Haemophilus sp. CCUG 30218 | species | silva:EU909662,ncbi:554007 | | sibling_higher | 757 | # 391494 | 470454 | Haemophilus sp. CCUG 31732 | species | silva:EU909663,ncbi:554009 | | sibling_higher | 758 | # 525972 | 470454 | Haemophilus pittmaniae HK 85 | no rank - terminal | silva:AFUV01000004,ncbi:1035188 | 759 | # 788108 | 470454 | Haemophilus sputorum | species | silva:JF506644,ncbi:1078480,gbif:7522132 | 760 | # 470454 | 1098176 | Haemophilus | genus | silva:A16379/#6,ncbi:724,worms:571392,gbif:3219815,irmng:1307220 | | | 761 | # on synonyms.tsv 762 | # Hemophilus | 470454 | synonym | Hemophilus (synonym for Haemophilus) | gbif:3219815,irmng:1307220 | 763 | # Haemophilus sp. HK 85 | 525972 | equivalent name | Haemophilus sp. HK 85 (synonym for Haemophilus pittmaniae HK 85) | ncbi:1035188 | 764 | # Haemophilus sp. CCUG 26672 | 788108 | includes | Haemophilus sp. CCUG 26672 (synonym for Haemophilus sputorum) | ncbi:1078480 | 765 | # Haemophilus sp. CCUG 47809 | 788108 | includes | Haemophilus sp. 
CCUG 47809 (synonym for Haemophilus sputorum) | ncbi:1078480 | 766 | 767 | tax = OttTx(files="tests/multitax/data_minimal/ott.tgz", 768 | extended_names=False) 769 | tax_ex = OttTx( 770 | files="tests/multitax/data_minimal/ott.tgz", extended_names=True) 771 | 772 | # Exact match on scientific name 773 | self.assertCountEqual(tax.search_name("Haemophilus"), ["470454"]) 774 | self.assertCountEqual(tax_ex.search_name("Haemophilus"), ["470454"]) 775 | # All scientific names 776 | self.assertCountEqual(tax.search_name("Haemophilus sp.", exact=False), [ 777 | "391494", "158636", "4621", "4622"]) 778 | self.assertCountEqual(tax_ex.search_name("Haemophilus sp.", exact=False), [ 779 | "391494", "158636", "4621", "4622"]) 780 | # Exact match on scientific name forcing extended 781 | self.assertCountEqual(tax.search_name("Haemophilus"), ["470454"]) 782 | self.assertCountEqual(tax_ex.search_name( 783 | "Haemophilus", force_extended=True), ["470454"]) 784 | # All names 785 | self.assertCountEqual(tax.search_name("Haemophilus sp. CCUG", exact=False), [ 786 | "391494", "158636", "4621", "4622"]) 787 | self.assertCountEqual(tax_ex.search_name("Haemophilus sp. CCUG", exact=False, force_extended=True), [ 788 | "391494", "158636", "4621", "4622", "788108"]) 789 | # Exact name available only on extended 790 | self.assertCountEqual(tax.search_name( 791 | "Haemophilus sp. HK 85", exact=True), []) 792 | self.assertCountEqual(tax_ex.search_name( 793 | "Haemophilus sp. HK 85", exact=True), ["525972"]) 794 | # Partial name available only on extended 795 | self.assertCountEqual(tax.search_name("CCUG 26672", exact=False), []) 796 | self.assertCountEqual(tax_ex.search_name( 797 | "CCUG 26672", exact=False), ["788108"]) 798 | -------------------------------------------------------------------------------- /tests/multitax/unit/test_init.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from multitax import CustomTx 3 | from multitax.multitax import MultiTax 4 | 5 | 6 | class TestInit(unittest.TestCase): 7 | # test data (14 nodes) 8 | # 9 | # rank-1 (root) 1 ___________ 10 | # / \ \ 11 | # rank-2 2.1 2.2 ______ \ 12 | # / \ \ \ \ 13 | # rank-3 3.1 3.2 3.4 \ \ 14 | # / / \ \ \ \ 15 | # rank-4 *4.1 *4.2 *4.3 *4.4 *4.5 *4.6 16 | # / | 17 | # rank-5 *5.1 *5.2 18 | # 19 | # names: 1: Node1, 2.1: Node2.1, ...,5.2: Node5.2 20 | 21 | test_file = "tests/multitax/data_minimal/custom_unit_test.tsv.gz" 22 | 23 | def test_default(self): 24 | """ 25 | test default values on empty init 26 | """ 27 | # Empty tax 28 | tax = MultiTax() 29 | self.assertEqual(tax.root_parent, "0") 30 | self.assertEqual(tax.root_node, tax._default_root_node) 31 | self.assertEqual(tax.root_name, "root") 32 | self.assertEqual(tax.root_rank, "root") 33 | 34 | self.assertEqual(tax._default_urls, []) 35 | self.assertEqual(tax._default_root_node, "1") 36 | self.assertEqual(tax._nodes, {tax.root_node: '0'}) 37 | self.assertEqual(tax._names, {tax.root_node: 'root'}) 38 | self.assertEqual(tax._ranks, {tax.root_node: 'root'}) 39 | self.assertEqual(tax._lineages, {}) 40 | self.assertEqual(tax._name_nodes, {}) 41 | self.assertEqual(tax._node_children, {}) 42 | self.assertEqual(tax._rank_nodes, {}) 43 | self.assertEqual(tax._translated_nodes, {}) 44 | 45 | self.assertEqual(tax.undefined_node, None) 46 | self.assertEqual(tax.undefined_name, None) 47 | self.assertEqual(tax.undefined_rank, None) 48 | self.assertEqual(tax.sources, []) 49 | 50 | tax = CustomTx(files=self.test_file) 51 | 
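        # With a taxonomy file loaded, the structural defaults (root_parent "0",
        # root node "1") are expected to stay the same, while root name and rank
        # now come from the file.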
self.assertEqual(tax.root_parent, "0") 52 | self.assertEqual(tax.root_node, tax._default_root_node) 53 | self.assertEqual(tax.root_name, "Node1") 54 | self.assertEqual(tax.root_rank, "rank-1") 55 | 56 | self.assertEqual(tax._default_urls, []) 57 | self.assertEqual(tax._default_root_node, "1") 58 | self.assertEqual(tax._nodes[tax.root_node], "0") 59 | self.assertEqual(tax._names[tax.root_node], "Node1") 60 | self.assertEqual(tax._ranks[tax.root_node], "rank-1") 61 | self.assertEqual(tax._lineages, {}) 62 | self.assertEqual(tax._name_nodes, {}) 63 | self.assertEqual(tax._node_children, {}) 64 | self.assertEqual(tax._rank_nodes, {}) 65 | self.assertEqual(tax._translated_nodes, {}) 66 | 67 | self.assertEqual(tax.undefined_node, None) 68 | self.assertEqual(tax.undefined_name, None) 69 | self.assertEqual(tax.undefined_rank, None) 70 | self.assertEqual(tax.sources, [self.test_file]) 71 | 72 | def test_root_values(self): 73 | """ 74 | test init changing root values 75 | """ 76 | 77 | # New root, not on tree 78 | tax = MultiTax(root_node="root_n", root_parent="root_p", 79 | root_name="newRootName", root_rank="newRootRank") 80 | self.assertEqual(tax.root_node, "root_n") 81 | self.assertEqual(tax.root_parent, "root_p") 82 | # Create new root node and link old default (1) {"root_n": "root_p", "1": "root_p"} 83 | self.assertEqual(tax._nodes, { 84 | tax.root_node: tax.root_parent, tax._default_root_node: tax.root_node}) 85 | self.assertEqual(tax.root_name, 'newRootName') 86 | self.assertEqual(tax._names, {tax.root_node: 'newRootName'}) 87 | self.assertEqual(tax.root_rank, 'newRootRank') 88 | self.assertEqual(tax._ranks, {tax.root_node: 'newRootRank'}) 89 | 90 | # Root is a new node not in nodes 91 | tax = CustomTx(files=self.test_file, root_node="root_n", 92 | root_parent="root_p", root_name="newRootName", root_rank="newRootRank") 93 | self.assertEqual(tax.root_node, "root_n") 94 | self.assertEqual(tax.root_parent, "root_p") 95 | self.assertEqual(tax.stats()["nodes"], 15) 96 | 97 | # Create new root node and link old default (1) {"root_n": "root_p", "1": "root_p"} 98 | self.assertEqual(tax.parent(tax.root_node), tax.root_parent) 99 | self.assertEqual(tax.name(tax.root_node), 'newRootName') 100 | self.assertEqual(tax.rank(tax.root_node), 'newRootRank') 101 | # Default root is linked to new root 102 | self.assertEqual(tax.parent(tax._default_root_node), tax.root_node) 103 | self.assertEqual(tax.name(tax._default_root_node), "Node1") 104 | self.assertEqual(tax.rank(tax._default_root_node), "rank-1") 105 | 106 | # Root is an existing node in nodes, but not default, filter tree under node 107 | tax = CustomTx(files=self.test_file, root_node="4.4", root_parent="root_p", 108 | root_name="newRootName", root_rank="newRootRank") 109 | self.assertEqual(tax.root_node, "4.4") 110 | self.assertEqual(tax.root_parent, "root_p") 111 | self.assertEqual(tax.stats()["nodes"], 3) 112 | 113 | # Create new root node and link old default (1) {"root_n": "root_p", "1": "root_p"} 114 | self.assertEqual(tax.parent(tax.root_node), tax.root_parent) 115 | self.assertEqual(tax.name(tax.root_node), 'newRootName') 116 | self.assertEqual(tax.rank(tax.root_node), 'newRootRank') 117 | # default root should not exist 118 | self.assertEqual(tax.parent(tax._default_root_node), 119 | tax.undefined_node) 120 | self.assertEqual(tax.name(tax._default_root_node), tax.undefined_name) 121 | self.assertEqual(tax.rank(tax._default_root_node), tax.undefined_rank) 122 | 123 | def test_undefined_values(self): 124 | """ 125 | test init changing undefined 
values
126 |         """
127 |         tax = MultiTax(undefined_node="unode",
128 |                        undefined_rank="urank", undefined_name="uname")
129 |         self.assertEqual(tax.undefined_node, "unode")
130 |         self.assertEqual(tax.undefined_name, "uname")
131 |         self.assertEqual(tax.undefined_rank, "urank")
132 |         self.assertEqual(tax.parent("XXX"), "unode")
133 |         self.assertEqual(tax.rank("XXX"), "urank")
134 |         self.assertEqual(tax.name("XXX"), "uname")
135 | 
136 |         tax = CustomTx(files=self.test_file, undefined_node="unode",
137 |                        undefined_rank="urank", undefined_name="uname")
138 |         self.assertEqual(tax.undefined_node, "unode")
139 |         self.assertEqual(tax.undefined_name, "uname")
140 |         self.assertEqual(tax.undefined_rank, "urank")
141 |         self.assertEqual(tax.parent("XXX"), "unode")
142 |         self.assertEqual(tax.rank("XXX"), "urank")
143 |         self.assertEqual(tax.name("XXX"), "uname")
144 | 
145 |     def test_build_values(self):
146 |         """
147 |         test init building auxiliary lookup structures (name, children and rank)
148 |         """
149 |         tax = MultiTax(build_node_children=True,
150 |                        build_name_nodes=True, build_rank_nodes=True)
151 |         self.assertEqual(tax._name_nodes, {
152 |                          tax.name(tax.root_node): [tax.root_node]})
153 |         self.assertEqual(tax._node_children, {
154 |                          tax.root_parent: [tax.root_node]})
155 |         self.assertEqual(tax._rank_nodes, {"root": [tax.root_node]})
156 | 
157 |         tax = CustomTx(files=self.test_file, build_node_children=True,
158 |                        build_name_nodes=True, build_rank_nodes=True)
159 |         self.assertNotEqual(len(tax._name_nodes), 0)
160 |         self.assertNotEqual(len(tax._node_children), 0)
161 |         self.assertNotEqual(len(tax._rank_nodes), 0)
162 | --------------------------------------------------------------------------------
/tests/multitax/utils.py:
--------------------------------------------------------------------------------
1 | import shutil
2 | import os
3 | import gzip
4 | import tarfile
5 | 
6 | 
7 | def setup_dir(d):
8 |     shutil.rmtree(d, ignore_errors=True)
9 |     os.makedirs(d)
10 | 
11 | 
12 | def uncompress_gzip(f, outf):
13 |     with gzip.open(f, 'r') as f_in, open(outf, 'wb') as f_out:
14 |         shutil.copyfileobj(f_in, f_out)
15 | 
16 | 
17 | def uncompress_tar_gzip(f, outd):
18 |     # Extract all regular files to outd, flattening internal directories
19 |     files = []
20 |     with tarfile.open(f) as tar_in:
21 |         for member in tar_in.getmembers():
22 |             if member.isreg():
23 |                 member.name = os.path.basename(member.name)
24 |                 files.append(member.name)
25 |                 tar_in.extract(member, outd)
26 |     return files
27 | --------------------------------------------------------------------------------