├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── docs ├── index.html ├── multitax.html ├── multitax │ ├── multitax.html │ └── utils.html └── search.js ├── make_docs.sh ├── multitax ├── __init__.py ├── customtx.py ├── dummytx.py ├── greengenestx.py ├── gtdbtx.py ├── multitax.py ├── ncbitx.py ├── otttx.py ├── silvatx.py └── utils.py ├── pyproject.toml ├── setup.py └── tests └── multitax ├── data_minimal ├── custom.tsv.gz ├── custom2.tsv.gz ├── custom_unit_test.tsv.gz ├── gg.txt.gz ├── gtdb_ar.tsv.gz ├── gtdb_ar_metadata.tsv.gz ├── gtdb_bac.tsv.gz ├── gtdb_bac_metadata.tsv.gz ├── ncbi.tar.gz ├── ott.tgz └── silva.txt.gz ├── integration ├── test_common.py ├── test_empty.py └── test_online.py ├── unit ├── test_functions.py └── test_init.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # Packages 4 | *.egg 5 | *.egg-info 6 | dist 7 | build 8 | eggs 9 | parts 10 | bin 11 | var 12 | sdist 13 | develop-eggs 14 | .installed.cfg 15 | lib 16 | lib64 17 | __pycache__ 18 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.9" 4 | - "3.10" 5 | - "3.11" 6 | - "3.12" 7 | - "3.13" 8 | 9 | before_install: 10 | - python -m pip install coverage 11 | - python -m pip install setuptools importlib-metadata --upgrade # fix bug setuptools py37 12 | 13 | install: 14 | - python setup.py install 15 | 16 | script: 17 | - python -m unittest discover -s tests/multitax/unit/ -v 18 | - python -m unittest discover -s tests/multitax/integration/ -v 19 | - python -m coverage run --omit="/usr/*,tests/*" -m unittest discover -s tests/multitax/unit/ -v 20 | - python -m coverage run --append --omit="/usr/*,tests/*" -m unittest discover -s tests/multitax/integration/ -v 21 | 22 | after_success: 23 | - python -m coverage xml -o coverage_py.xml 24 | - curl -Os https://uploader.codecov.io/latest/linux/codecov; 25 | - chmod +x codecov; 26 | - ./codecov --nonZero -X search --file coverage_py.xml; 27 | 28 | notifications: 29 | email: false 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Vitor C. Piro - pirovc.github.io 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MultiTax [![Build Status](https://app.travis-ci.com/pirovc/multitax.svg?token=q6Nfx8pLHh8hV3hLz3Pq&branch=main)](https://app.travis-ci.com/pirovc/multitax) [![codecov](https://codecov.io/gh/pirovc/multitax/branch/main/graph/badge.svg)](https://codecov.io/gh/pirovc/multitax) [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/multitax/README.html) 2 | 3 | Python package to obtain, parse and explore biological taxonomies 4 | 5 | ## Description 6 | 7 | MultiTax is a Python package that provides a common and generalized set of functions to download, parse, filter, explore, translate, convert and write multiple biological taxonomies (**GTDB, NCBI, Silva, Greengenes, Open Tree taxonomy**) and custom-formatted taxonomies. Its main goals are: 8 | 9 | - Be fast, intuitive, generalized and easy to use 10 | - Explore different taxonomies with the same set of commands 11 | - Enable integration and compatibility with multiple taxonomies 12 | - Translate taxonomies (partially implemented) 13 | - Convert taxonomies (not yet implemented) 14 | 15 | MultiTax does not link sequence identifiers to taxonomic nodes; it handles the taxonomy alone. Integration with sequence or external identifiers is planned, but not yet implemented. 16 | 17 | ## API Documentation 18 | 19 | https://pirovc.github.io/multitax/ 20 | 21 | ## Installation 22 | 23 | ### pip 24 | 25 | ```bash 26 | pip install multitax 27 | ``` 28 | 29 | ### conda 30 | 31 | ```bash 32 | conda install -c bioconda multitax 33 | ``` 34 | 35 | ### local 36 | 37 | ```bash 38 | git clone https://github.com/pirovc/multitax.git 39 | cd multitax 40 | python setup.py install --record files.txt 41 | ``` 42 | 43 | ## Basic usage with GTDB 44 | 45 | ```python 46 | from multitax import GtdbTx 47 | 48 | # Download and parse taxonomy 49 | tax = GtdbTx() 50 | 51 | # Get lineage for the Escherichia genus 52 | tax.lineage("g__Escherichia") 53 | # ['1', 'd__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacterales', 'f__Enterobacteriaceae', 'g__Escherichia'] 54 | ``` 55 | 56 | ## Examples 57 | 58 | - [List of functions](https://pirovc.github.io/multitax/multitax/multitax.html) 59 | 60 | ### Load 61 | 62 | ```python 63 | from multitax import GtdbTx # or NcbiTx, SilvaTx, ...
64 | 65 | # Download and parse in memory 66 | tax = GtdbTx() 67 | 68 | # Parse local files 69 | tax = GtdbTx(files=["bac120_taxonomy.tsv.gz", "ar122_taxonomy.tsv.gz"]) 70 | 71 | # Download, write and parse files 72 | tax = GtdbTx(output_prefix="my/path/") 73 | 74 | # Download and filter only specific branch 75 | tax = GtdbTx(root_node="p__Proteobacteria") 76 | ``` 77 | 78 | ### Explore 79 | 80 | ```python 81 | # List parent node 82 | tax.parent("g__Escherichia") 83 | # f__Enterobacteriaceae 84 | 85 | # List children nodes 86 | tax.children("g__Escherichia") 87 | # ['s__Escherichia coli', 88 | # 's__Escherichia albertii', 89 | # 's__Escherichia marmotae', 90 | # 's__Escherichia fergusonii', 91 | # 's__Escherichia sp005843885', 92 | # 's__Escherichia ruysiae', 93 | # 's__Escherichia sp001660175', 94 | # 's__Escherichia sp004211955', 95 | # 's__Escherichia sp002965065', 96 | # 's__Escherichia coli_E'] 97 | 98 | # Get parent node from a defined rank 99 | tax.parent_rank("s__Lentisphaera araneosa", "phylum") 100 | # 'p__Verrucomicrobiota' 101 | 102 | # Get the closest parent from a list of ranks 103 | tax.closest_parent("s__Lentisphaera araneosa", ranks=["phylum", "class", "family"]) 104 | # 'f__Lentisphaeraceae' 105 | 106 | # Get lineage 107 | tax.lineage("g__Escherichia") 108 | # ['1', 'd__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacterales', 'f__Enterobacteriaceae', 'g__Escherichia'] 109 | 110 | # Get lineage of names 111 | tax.name_lineage("g__Escherichia") 112 | # ['root', 'Bacteria', 'Proteobacteria', 'Gammaproteobacteria', 'Enterobacterales', 'Enterobacteriaceae', 'Escherichia'] 113 | 114 | # Get lineage of ranks 115 | tax.rank_lineage("g__Escherichia") 116 | # ['root', 'domain', 'phylum', 'class', 'order', 'family', 'genus'] 117 | 118 | # Get lineage with defined ranks and root node 119 | tax.lineage("g__Escherichia", root_node="p__Proteobacteria", ranks=["phylum", "class", "family", "genus"]) 120 | # ['p__Proteobacteria', 'c__Gammaproteobacteria', 'f__Enterobacteriaceae', 'g__Escherichia'] 121 | 122 | # Build lineages in memory for faster access 123 | tax.build_lineages() 124 | 125 | # Get leaf nodes 126 | tax.leaves("p__Hadarchaeota") 127 | # ['s__DG-33 sp004375695', 's__DG-33 sp001515185', 's__Hadarchaeum yellowstonense', 's__B75-G9 sp003661465', 's__WYZ-LMO6 sp004347925', 's__B88-G9 sp003660555'] 128 | 129 | # Search names and filter by rank 130 | tax.search_name("Escherichia", exact=False, rank="genus") 131 | # ['g__Escherichia', 'g__Escherichia_C'] 132 | 133 | # Show stats of loaded tax 134 | tax.stats() 135 | #{'leaves': 31910, 136 | # 'names': 45503, 137 | # 'nodes': 45503, 138 | # 'ranked_leaves': Counter({'species': 31910}), 139 | # 'ranked_nodes': Counter({'species': 31910, 140 | # 'genus': 9428, 141 | # 'family': 2600, 142 | # 'order': 1034, 143 | # 'class': 379, 144 | # 'phylum': 149, 145 | # 'domain': 2, 146 | # 'root': 1}), 147 | # 'ranks': 45503} 148 | ``` 149 | 150 | ### Filter 151 | 152 | ```python 153 | # Filter ancestors (desc=True for descendants) 154 | tax.filter(["g__Escherichia", "s__Pseudomonas aeruginosa"]) 155 | tax.stats() 156 | #{'leaves': 2, 157 | # 'names': 11, 158 | # 'nodes': 11, 159 | # 'ranked_leaves': Counter({'genus': 1, 'species': 1}), 160 | # 'ranked_nodes': Counter({'genus': 2, 161 | # 'family': 2, 162 | # 'order': 2, 163 | # 'class': 1, 164 | # 'phylum': 1, 165 | # 'domain': 1, 166 | # 'species': 1, 167 | # 'root': 1}), 168 | # 'ranks': 11} 169 | ``` 170 | 171 | ### Add, remove, prune 172 | 173 | ```python 174 | # Add 
node to the tree 175 | tax.add("my_custom_node", "g__Escherichia", name="my custom name", rank="strain") 176 | tax.lineage("my_custom_node") 177 | # ['1', 'd__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacterales', 'f__Enterobacteriaceae', 'g__Escherichia', 'my_custom_node'] 178 | 179 | # Remove node from tree (warning: removing parent nodes may break tree -> use check_consistency) 180 | tax.remove("s__Pseudomonas aeruginosa", check_consistency=True) 181 | 182 | # Prune (remove) full branches of the tree under a certain node 183 | tax.prune("g__Escherichia") 184 | ``` 185 | 186 | ### Translate 187 | 188 | ```python 189 | # GTDB to NCBI 190 | from multitax import GtdbTx, NcbiTx 191 | ncbi_tax = NcbiTx() 192 | gtdb_tax = GtdbTx() 193 | 194 | # Build translation 195 | gtdb_tax.build_translation(ncbi_tax) 196 | 197 | # Check translated nodes 198 | gtdb_tax.translate("g__Escherichia") 199 | # {'1301', '547', '561', '570', '590', '620'} 200 | ``` 201 | 202 | ### Write 203 | 204 | ```python 205 | # Write tax to file 206 | tax.write("custom_tax.tsv", cols=["node", "rank", "name_lineage"]) 207 | 208 | #g__Escherichia genus root|Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae|Escherichia 209 | #f__Enterobacteriaceae family root|Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacterales|Enterobacteriaceae 210 | #o__Enterobacterales order root|Bacteria|Proteobacteria|Gammaproteobacteria|Enterobacterales 211 | #c__Gammaproteobacteria class root|Bacteria|Proteobacteria|Gammaproteobacteria 212 | #... 213 | ``` 214 | 215 | ### The same applies to other taxonomies 216 | 217 | ```python 218 | # NCBI 219 | from multitax import NcbiTx 220 | tax = NcbiTx() 221 | tax.lineage("561") 222 | # ['1', '131567', '2', '1224', '1236', '91347', '543', '561'] 223 | 224 | # Silva 225 | from multitax import SilvaTx 226 | tax = SilvaTx() 227 | tax.lineage("46463") 228 | # ['1', '3', '2375', '3303', '46449', '46454', '46463'] 229 | 230 | # Open Tree taxonomy 231 | from multitax import OttTx 232 | tax = OttTx() 233 | tax.lineage("474503") 234 | # ['805080', '93302', '844192', '248067', '822744', '768012', '424023', '474503'] 235 | 236 | # GreenGenes 237 | from multitax import GreengenesTx 238 | tax = GreengenesTx() 239 | tax.lineage("f__Enterobacteriaceae") 240 | # ['1', 'k__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacteriales', 'f__Enterobacteriaceae'] 241 | ``` 242 | 243 | ## LCA integration 244 | 245 | Using pylca: https://github.com/pirovc/pylca 246 | 247 | ```bash 248 | conda install -c bioconda pylca 249 | ``` 250 | 251 | ```python 252 | from pylca.pylca import LCA 253 | from multitax import GtdbTx 254 | 255 | # Download and parse GTDB Taxonomy 256 | tax = GtdbTx() 257 | 258 | # Build LCA structure 259 | lca = LCA(tax._nodes) 260 | 261 | # Get LCA 262 | lca("s__Escherichia dysenteriae", "s__Pseudomonas aeruginosa") 263 | # 'c__Gammaproteobacteria' 264 | ``` 265 | 266 | ## Details 267 | 268 | - After downloading/parsing the desired taxonomies, MultiTax works fully offline. 269 | - Taxonomies are parsed into `nodes`. Each node is annotated with a `name` and a `rank`. 270 | - Some taxonomies have a numeric taxonomic identifier (e.g. NCBI) and others use the rank + name as an identifier (e.g. GTDB). In MultiTax all identifiers are treated as strings. 271 | - A single root node is defined by default for each taxonomy (or `1` when not defined).
This can be changed with `root_node` when loading the taxonomy (as well as annotations `root_parent`, `root_name`, `root_rank`). If the `root_node` already exists, the tree will be filtered. 272 | - Standard values for unknown/undefined nodes can be configured with `undefined_node`,`undefined_name` and `undefined_rank`. Those are default values returned when nodes/names/ranks are not found. 273 | - Taxonomy files are automatically downloaded or can be loaded from disk (`files` parameter). Alternative `urls` can be provided. When downloaded, files are handled in memory. It is possible to save the downloaded file to disk with `output_prefix`. 274 | 275 | ## Translation between taxonomies 276 | 277 | Partially implemented. The goal is to map different taxonomies if the linkage data is available. That's what is currently availble. 278 | 279 | 280 | |from/to |NCBI |GTDB |SILVA |OTT |GG | 281 | |--------|---------|-------|----------|--------|------| 282 | |NCBI |- |PART |[part] |[part] |no | 283 | |GTDB |FULL |- |[part] |no |[part]| 284 | |SILVA |[full] |[part] |- |[part] |no | 285 | |OTT |[part] |no |[part] |- |no | 286 | |GG |no |[part] |no |no |- | 287 | 288 | Legend: 289 | 290 | - full: complete translation available 291 | - part: partial translation available 292 | - no: no translation possible 293 | - []: not yet implemented 294 | 295 | ### Files and information about specific translations 296 | 297 | - NCBI <-> GTDB 298 | - GTDB is a subset of the NCBI repository, so the translation from NCBI to GTDB can be only partial 299 | - Translation in both ways is based on: https://data.gtdb.ecogenomic.org/releases/latest/ar53_metadata.tsv.gz and https://data.gtdb.ecogenomic.org/releases/latest/bac120_metadata.tsv.gz 300 | 301 | --- 302 | 303 | ## Further ideas to be implemented 304 | 305 | - More translations 306 | - Conversion between taxonomies (write on specific format) 307 | 308 | 309 | ## Similar projects 310 | 311 | - https://github.com/FOI-Bioinformatics/flextaxd 312 | - https://github.com/shenwei356/taxonkit 313 | - https://github.com/bioforensics/pytaxonkit 314 | - https://github.com/chanzuckerberg/taxoniq 315 | - https://github.com/sherrillmix/taxonomizr 316 | - https://github.com/etetoolkit/ete 317 | - https://github.com/apcamargo/taxopy -------------------------------------------------------------------------------- /docs/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /docs/search.js: -------------------------------------------------------------------------------- 1 | window.pdocSearch = (function(){ 2 | /** elasticlunr - http://weixsong.github.io * Copyright (C) 2017 Oliver Nightingale * Copyright (C) 2017 Wei Song * MIT Licensed */!function(){function e(e){if(null===e||"object"!=typeof e)return e;var t=e.constructor();for(var n in e)e.hasOwnProperty(n)&&(t[n]=e[n]);return t}var t=function(e){var n=new t.Index;return n.pipeline.add(t.trimmer,t.stopWordFilter,t.stemmer),e&&e.call(n,n),n};t.version="0.9.5",lunr=t,t.utils={},t.utils.warn=function(e){return function(t){e.console&&console.warn&&console.warn(t)}}(this),t.utils.toString=function(e){return void 0===e||null===e?"":e.toString()},t.EventEmitter=function(){this.events={}},t.EventEmitter.prototype.addListener=function(){var e=Array.prototype.slice.call(arguments),t=e.pop(),n=e;if("function"!=typeof t)throw new TypeError("last argument must be a 
function");n.forEach(function(e){this.hasHandler(e)||(this.events[e]=[]),this.events[e].push(t)},this)},t.EventEmitter.prototype.removeListener=function(e,t){if(this.hasHandler(e)){var n=this.events[e].indexOf(t);-1!==n&&(this.events[e].splice(n,1),0==this.events[e].length&&delete this.events[e])}},t.EventEmitter.prototype.emit=function(e){if(this.hasHandler(e)){var t=Array.prototype.slice.call(arguments,1);this.events[e].forEach(function(e){e.apply(void 0,t)},this)}},t.EventEmitter.prototype.hasHandler=function(e){return e in this.events},t.tokenizer=function(e){if(!arguments.length||null===e||void 0===e)return[];if(Array.isArray(e)){var n=e.filter(function(e){return null===e||void 0===e?!1:!0});n=n.map(function(e){return t.utils.toString(e).toLowerCase()});var i=[];return n.forEach(function(e){var n=e.split(t.tokenizer.seperator);i=i.concat(n)},this),i}return e.toString().trim().toLowerCase().split(t.tokenizer.seperator)},t.tokenizer.defaultSeperator=/[\s\-]+/,t.tokenizer.seperator=t.tokenizer.defaultSeperator,t.tokenizer.setSeperator=function(e){null!==e&&void 0!==e&&"object"==typeof e&&(t.tokenizer.seperator=e)},t.tokenizer.resetSeperator=function(){t.tokenizer.seperator=t.tokenizer.defaultSeperator},t.tokenizer.getSeperator=function(){return t.tokenizer.seperator},t.Pipeline=function(){this._queue=[]},t.Pipeline.registeredFunctions={},t.Pipeline.registerFunction=function(e,n){n in t.Pipeline.registeredFunctions&&t.utils.warn("Overwriting existing registered function: "+n),e.label=n,t.Pipeline.registeredFunctions[n]=e},t.Pipeline.getRegisteredFunction=function(e){return e in t.Pipeline.registeredFunctions!=!0?null:t.Pipeline.registeredFunctions[e]},t.Pipeline.warnIfFunctionNotRegistered=function(e){var n=e.label&&e.label in this.registeredFunctions;n||t.utils.warn("Function is not registered with pipeline. 
This may cause problems when serialising the index.\n",e)},t.Pipeline.load=function(e){var n=new t.Pipeline;return e.forEach(function(e){var i=t.Pipeline.getRegisteredFunction(e);if(!i)throw new Error("Cannot load un-registered function: "+e);n.add(i)}),n},t.Pipeline.prototype.add=function(){var e=Array.prototype.slice.call(arguments);e.forEach(function(e){t.Pipeline.warnIfFunctionNotRegistered(e),this._queue.push(e)},this)},t.Pipeline.prototype.after=function(e,n){t.Pipeline.warnIfFunctionNotRegistered(n);var i=this._queue.indexOf(e);if(-1===i)throw new Error("Cannot find existingFn");this._queue.splice(i+1,0,n)},t.Pipeline.prototype.before=function(e,n){t.Pipeline.warnIfFunctionNotRegistered(n);var i=this._queue.indexOf(e);if(-1===i)throw new Error("Cannot find existingFn");this._queue.splice(i,0,n)},t.Pipeline.prototype.remove=function(e){var t=this._queue.indexOf(e);-1!==t&&this._queue.splice(t,1)},t.Pipeline.prototype.run=function(e){for(var t=[],n=e.length,i=this._queue.length,o=0;n>o;o++){for(var r=e[o],s=0;i>s&&(r=this._queue[s](r,o,e),void 0!==r&&null!==r);s++);void 0!==r&&null!==r&&t.push(r)}return t},t.Pipeline.prototype.reset=function(){this._queue=[]},t.Pipeline.prototype.get=function(){return this._queue},t.Pipeline.prototype.toJSON=function(){return this._queue.map(function(e){return t.Pipeline.warnIfFunctionNotRegistered(e),e.label})},t.Index=function(){this._fields=[],this._ref="id",this.pipeline=new t.Pipeline,this.documentStore=new t.DocumentStore,this.index={},this.eventEmitter=new t.EventEmitter,this._idfCache={},this.on("add","remove","update",function(){this._idfCache={}}.bind(this))},t.Index.prototype.on=function(){var e=Array.prototype.slice.call(arguments);return this.eventEmitter.addListener.apply(this.eventEmitter,e)},t.Index.prototype.off=function(e,t){return this.eventEmitter.removeListener(e,t)},t.Index.load=function(e){e.version!==t.version&&t.utils.warn("version mismatch: current "+t.version+" importing "+e.version);var n=new this;n._fields=e.fields,n._ref=e.ref,n.documentStore=t.DocumentStore.load(e.documentStore),n.pipeline=t.Pipeline.load(e.pipeline),n.index={};for(var i in e.index)n.index[i]=t.InvertedIndex.load(e.index[i]);return n},t.Index.prototype.addField=function(e){return this._fields.push(e),this.index[e]=new t.InvertedIndex,this},t.Index.prototype.setRef=function(e){return this._ref=e,this},t.Index.prototype.saveDocument=function(e){return this.documentStore=new t.DocumentStore(e),this},t.Index.prototype.addDoc=function(e,n){if(e){var n=void 0===n?!0:n,i=e[this._ref];this.documentStore.addDoc(i,e),this._fields.forEach(function(n){var o=this.pipeline.run(t.tokenizer(e[n]));this.documentStore.addFieldLength(i,n,o.length);var r={};o.forEach(function(e){e in r?r[e]+=1:r[e]=1},this);for(var s in r){var u=r[s];u=Math.sqrt(u),this.index[n].addToken(s,{ref:i,tf:u})}},this),n&&this.eventEmitter.emit("add",e,this)}},t.Index.prototype.removeDocByRef=function(e){if(e&&this.documentStore.isDocStored()!==!1&&this.documentStore.hasDoc(e)){var t=this.documentStore.getDoc(e);this.removeDoc(t,!1)}},t.Index.prototype.removeDoc=function(e,n){if(e){var n=void 0===n?!0:n,i=e[this._ref];this.documentStore.hasDoc(i)&&(this.documentStore.removeDoc(i),this._fields.forEach(function(n){var o=this.pipeline.run(t.tokenizer(e[n]));o.forEach(function(e){this.index[n].removeToken(e,i)},this)},this),n&&this.eventEmitter.emit("remove",e,this))}},t.Index.prototype.updateDoc=function(e,t){var t=void 
0===t?!0:t;this.removeDocByRef(e[this._ref],!1),this.addDoc(e,!1),t&&this.eventEmitter.emit("update",e,this)},t.Index.prototype.idf=function(e,t){var n="@"+t+"/"+e;if(Object.prototype.hasOwnProperty.call(this._idfCache,n))return this._idfCache[n];var i=this.index[t].getDocFreq(e),o=1+Math.log(this.documentStore.length/(i+1));return this._idfCache[n]=o,o},t.Index.prototype.getFields=function(){return this._fields.slice()},t.Index.prototype.search=function(e,n){if(!e)return[];e="string"==typeof e?{any:e}:JSON.parse(JSON.stringify(e));var i=null;null!=n&&(i=JSON.stringify(n));for(var o=new t.Configuration(i,this.getFields()).get(),r={},s=Object.keys(e),u=0;u0&&t.push(e);for(var i in n)"docs"!==i&&"df"!==i&&this.expandToken(e+i,t,n[i]);return t},t.InvertedIndex.prototype.toJSON=function(){return{root:this.root}},t.Configuration=function(e,n){var e=e||"";if(void 0==n||null==n)throw new Error("fields should not be null");this.config={};var i;try{i=JSON.parse(e),this.buildUserConfig(i,n)}catch(o){t.utils.warn("user configuration parse failed, will use default configuration"),this.buildDefaultConfig(n)}},t.Configuration.prototype.buildDefaultConfig=function(e){this.reset(),e.forEach(function(e){this.config[e]={boost:1,bool:"OR",expand:!1}},this)},t.Configuration.prototype.buildUserConfig=function(e,n){var i="OR",o=!1;if(this.reset(),"bool"in e&&(i=e.bool||i),"expand"in e&&(o=e.expand||o),"fields"in e)for(var r in e.fields)if(n.indexOf(r)>-1){var s=e.fields[r],u=o;void 0!=s.expand&&(u=s.expand),this.config[r]={boost:s.boost||0===s.boost?s.boost:1,bool:s.bool||i,expand:u}}else t.utils.warn("field name in user configuration not found in index instance fields");else this.addAllFields2UserConfig(i,o,n)},t.Configuration.prototype.addAllFields2UserConfig=function(e,t,n){n.forEach(function(n){this.config[n]={boost:1,bool:e,expand:t}},this)},t.Configuration.prototype.get=function(){return this.config},t.Configuration.prototype.reset=function(){this.config={}},lunr.SortedSet=function(){this.length=0,this.elements=[]},lunr.SortedSet.load=function(e){var t=new this;return t.elements=e,t.length=e.length,t},lunr.SortedSet.prototype.add=function(){var e,t;for(e=0;e1;){if(r===e)return o;e>r&&(t=o),r>e&&(n=o),i=n-t,o=t+Math.floor(i/2),r=this.elements[o]}return r===e?o:-1},lunr.SortedSet.prototype.locationFor=function(e){for(var t=0,n=this.elements.length,i=n-t,o=t+Math.floor(i/2),r=this.elements[o];i>1;)e>r&&(t=o),r>e&&(n=o),i=n-t,o=t+Math.floor(i/2),r=this.elements[o];return r>e?o:e>r?o+1:void 0},lunr.SortedSet.prototype.intersect=function(e){for(var t=new lunr.SortedSet,n=0,i=0,o=this.length,r=e.length,s=this.elements,u=e.elements;;){if(n>o-1||i>r-1)break;s[n]!==u[i]?s[n]u[i]&&i++:(t.add(s[n]),n++,i++)}return t},lunr.SortedSet.prototype.clone=function(){var e=new lunr.SortedSet;return e.elements=this.toArray(),e.length=e.elements.length,e},lunr.SortedSet.prototype.union=function(e){var t,n,i;this.length>=e.length?(t=this,n=e):(t=e,n=this),i=t.clone();for(var o=0,r=n.toArray();o

[pdoc search index entries for the multitax package: module, class and function documentation (CustomTx, DummyTx, GreengenesTx, GtdbTx, NcbiTx, OttTx, SilvaTx, multitax.multitax.MultiTax and its methods, multitax.utils helpers), each entry carrying the rendered docstring, signature, bases and default values]

\n", "signature": "(message, category, filename, lineno, file=None, line=None)", "funcdef": "def"}]; 4 | 5 | // mirrored in build-search-index.js (part 1) 6 | // Also split on html tags. this is a cheap heuristic, but good enough. 7 | elasticlunr.tokenizer.setSeperator(/[\s\-.;&_'"=,()]+|<[^>]*>/); 8 | 9 | let searchIndex; 10 | if (docs._isPrebuiltIndex) { 11 | console.info("using precompiled search index"); 12 | searchIndex = elasticlunr.Index.load(docs); 13 | } else { 14 | console.time("building search index"); 15 | // mirrored in build-search-index.js (part 2) 16 | searchIndex = elasticlunr(function () { 17 | this.pipeline.remove(elasticlunr.stemmer); 18 | this.pipeline.remove(elasticlunr.stopWordFilter); 19 | this.addField("qualname"); 20 | this.addField("fullname"); 21 | this.addField("annotation"); 22 | this.addField("default_value"); 23 | this.addField("signature"); 24 | this.addField("bases"); 25 | this.addField("doc"); 26 | this.setRef("fullname"); 27 | }); 28 | for (let doc of docs) { 29 | searchIndex.addDoc(doc); 30 | } 31 | console.timeEnd("building search index"); 32 | } 33 | 34 | return (term) => searchIndex.search(term, { 35 | fields: { 36 | qualname: {boost: 4}, 37 | fullname: {boost: 2}, 38 | annotation: {boost: 2}, 39 | default_value: {boost: 2}, 40 | signature: {boost: 2}, 41 | bases: {boost: 2}, 42 | doc: {boost: 1}, 43 | }, 44 | expand: true 45 | }); 46 | })(); -------------------------------------------------------------------------------- /make_docs.sh: -------------------------------------------------------------------------------- 1 | pdoc -o docs multitax multitax/multitax.py multitax/utils.py 2 | -------------------------------------------------------------------------------- /multitax/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.3.2" 2 | 3 | __all__ = ( 4 | 'CustomTx', 5 | 'DummyTx', 6 | 'GreengenesTx', 7 | 'GtdbTx', 8 | 'NcbiTx', 9 | 'OttTx', 10 | 'SilvaTx', 11 | ) 12 | 13 | from .customtx import CustomTx 14 | from .dummytx import DummyTx 15 | from .greengenestx import GreengenesTx 16 | from .gtdbtx import GtdbTx 17 | from .ncbitx import NcbiTx 18 | from .otttx import OttTx 19 | from .silvatx import SilvaTx 20 | -------------------------------------------------------------------------------- /multitax/customtx.py: -------------------------------------------------------------------------------- 1 | from .multitax import MultiTax 2 | import warnings 3 | 4 | 5 | class CustomTx(MultiTax): 6 | 7 | _required_cols = ["node", "parent"] 8 | _possible_cols = ["node", "parent", "rank", "name"] 9 | 10 | def __init__(self, cols: list = ["node", "parent", "rank", "name"], sep: str = "\t", **kwargs): 11 | """ 12 | CustomTx() 13 | 14 | Parameters: 15 | * **cols** *[list, dict]*: List of fields to be parsed or a dictionary with {field: column index}. 
Options: "node", "parent", "rank", "name" 16 | * **sep** *[str]*: Separator of fields 17 | * **\*\*kwargs** defined at `multitax.multitax.MultiTax` 18 | 19 | Example: 20 | 21 | tax_custom1 = CustomTx(files="my_custom_tax.tsv", cols=["node","parent","rank"]) 22 | tax_custom2 = CustomTx(files="my_custom_tax.tsv", cols={"node": 0, "parent": 1, "name": 5, "rank": 3}) 23 | """ 24 | 25 | self._cols = self._parse_cols(cols) 26 | self._sep = sep 27 | super().__init__(**kwargs) 28 | 29 | def __repr__(self): 30 | stats = ['{}={}'.format(k, repr(v)) for (k, v) in self.stats().items()] 31 | return 'CustomTx({})'.format(', '.join(stats)) 32 | 33 | def _build_translation(self, target_tax, files: list = None, urls: list = None): 34 | warnings.warn("Translation between taxonomies [" + self.__class__.__name__ + 35 | "," + target_tax.__class__.__name__ + "] not yet implemented.") 36 | return {} 37 | 38 | def _parse(self, fhs, **kwargs): 39 | nodes = {} 40 | ranks = {} 41 | names = {} 42 | for source, fh in fhs.items(): 43 | for line in fh: 44 | try: 45 | fields = line.rstrip().split(self._sep) 46 | except: 47 | fields = line.decode().rstrip().split(self._sep) 48 | 49 | node = fields[self._cols["node"]] 50 | nodes[node] = fields[self._cols["parent"]] 51 | if "name" in self._cols: 52 | names[node] = fields[self._cols["name"]] 53 | if "rank" in self._cols: 54 | ranks[node] = fields[self._cols["rank"]] 55 | 56 | return nodes, ranks, names 57 | 58 | def _parse_cols(self, cols): 59 | if isinstance(cols, list): 60 | cols = {c: i for i, c in enumerate(cols)} 61 | 62 | for rc in self._required_cols: 63 | if rc not in cols: 64 | raise ValueError(rc + " is a required column") 65 | 66 | for c in cols: 67 | if c not in self._possible_cols: 68 | raise ValueError(c + " is not a valid column: " + 69 | ",".join(self._possible_cols)) 70 | 71 | return cols 72 | -------------------------------------------------------------------------------- /multitax/dummytx.py: -------------------------------------------------------------------------------- 1 | from .multitax import MultiTax 2 | 3 | 4 | class DummyTx(MultiTax): 5 | 6 | def __init__(self, **kwargs): 7 | """ 8 | DummyTx() - Dummy empty taxonomy 9 | 10 | Parameters: 11 | 12 | * \*\*kwargs defined at `multitax.multitax.MultiTax` 13 | """ 14 | super().__init__(**kwargs) 15 | 16 | def __repr__(self): 17 | stats = ['{}={}'.format(k, repr(v)) for (k, v) in self.stats().items()] 18 | return 'DummyTx({})'.format(', '.join(stats)) 19 | -------------------------------------------------------------------------------- /multitax/greengenestx.py: -------------------------------------------------------------------------------- 1 | from .multitax import MultiTax 2 | import warnings 3 | 4 | 5 | class GreengenesTx(MultiTax): 6 | _default_urls = [ 7 | "https://gg-sg-web.s3-us-west-2.amazonaws.com/downloads/greengenes_database/gg_13_5/gg_13_5_taxonomy.txt.gz"] 8 | _rank_codes = [("k__", "kingdom"), 9 | ("p__", "phylum"), 10 | ("c__", "class"), 11 | ("o__", "order"), 12 | ("f__", "family"), 13 | ("g__", "genus"), 14 | ("s__", "species")] 15 | 16 | def __init__(self, **kwargs): 17 | # forwards.tsv 18 | self._forwards = {} 19 | super().__init__(**kwargs) 20 | 21 | def __repr__(self): 22 | stats = ['{}={}'.format(k, repr(v)) for (k, v) in self.stats().items()] 23 | return 'GreengenesTx({})'.format(', '.join(stats)) 24 | 25 | def _build_translation(self, target_tax, files: list = None, urls: list = None): 26 | warnings.warn("Translation between taxonomies [" + self.__class__.__name__ + 27 | "," + 
target_tax.__class__.__name__ + "] not yet implemented.") 28 | return {} 29 | 30 | def _parse(self, fhs, **kwargs): 31 | nodes = {} 32 | ranks = {} 33 | names = {} 34 | 35 | for source, fh in fhs.items(): 36 | for line in fh: 37 | try: 38 | _, lineage = line.rstrip().split('\t') 39 | except: 40 | _, lineage = line.decode().rstrip().split('\t') 41 | lin = lineage.split("; ") 42 | for i in range(len(lin))[::-1]: 43 | # assert rank 44 | assert lin[i][:3] == self._rank_codes[i][0] 45 | # taxid = "c__Deinococci", rank = "class", name = "Deinococci" 46 | taxid = lin[i] 47 | name = lin[i][3:] 48 | if not name: 49 | continue # empty entry "s__" 50 | rank = self._rank_codes[i][1] 51 | if i == 0: 52 | parent_taxid = self._default_root_node 53 | else: 54 | parent_taxid = lin[i-1] 55 | if taxid not in nodes: 56 | nodes[taxid] = parent_taxid 57 | names[taxid] = name 58 | ranks[taxid] = rank 59 | 60 | return nodes, ranks, names 61 | -------------------------------------------------------------------------------- /multitax/gtdbtx.py: -------------------------------------------------------------------------------- 1 | from .multitax import MultiTax 2 | from .utils import check_file 3 | from .utils import open_files 4 | from .utils import download_files 5 | import warnings 6 | 7 | 8 | class GtdbTx(MultiTax): 9 | 10 | _default_urls = ["https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/ar53_taxonomy.tsv.gz", 11 | "https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/bac120_taxonomy.tsv.gz"] 12 | _rank_codes = [("d__", "domain"), 13 | ("p__", "phylum"), 14 | ("c__", "class"), 15 | ("o__", "order"), 16 | ("f__", "family"), 17 | ("g__", "genus"), 18 | ("s__", "species")] 19 | 20 | def __init__(self, **kwargs): 21 | super().__init__(**kwargs) 22 | 23 | def __repr__(self): 24 | stats = ['{}={}'.format(k, repr(v)) for (k, v) in self.stats().items()] 25 | return 'GtdbTx({})'.format(', '.join(stats)) 26 | 27 | def _build_translation(self, target_tax, files: list = None, urls: list = None): 28 | translated_nodes = {} 29 | if target_tax.__class__.__name__ == "NcbiTx": 30 | 31 | if files: 32 | fhs = open_files(files) 33 | else: 34 | _urls = ["https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/ar53_metadata.tsv.gz", 35 | "https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/bac120_metadata.tsv.gz"] 36 | fhs = download_files( 37 | urls=urls if urls else _urls, retry_attempts=3) 38 | 39 | accession_col = 0 40 | gtdb_taxonomy_col = 19 41 | ncbi_taxid_col = 80 42 | 43 | for source, fh in fhs.items(): 44 | for line in fh: 45 | try: 46 | fields = line.rstrip().split('\t') 47 | except: 48 | fields = line.decode().rstrip().split('\t') 49 | 50 | # skip header 51 | if fields[accession_col] == "accession": 52 | continue 53 | 54 | print(fields) 55 | ncbi_leaf_node = target_tax.latest(fields[ncbi_taxid_col]) 56 | if ncbi_leaf_node != target_tax.undefined_node: 57 | ncbi_nodes = target_tax.lineage(ncbi_leaf_node, ranks=[ 58 | "superkingdom", "phylum", "class", 59 | "order", "family", "genus", "species"]) 60 | else: 61 | continue 62 | 63 | # Build GTDB lineage from leaf (species on given lineage) 64 | # to accomodate possible changes in the loaded tax 65 | gtdb_leaf_node = fields[gtdb_taxonomy_col].split(";")[-1] 66 | if gtdb_leaf_node != self.undefined_node: 67 | gtdb_nodes = self.lineage(gtdb_leaf_node, ranks=[ 68 | "domain", "phylum", "class", "order", 69 | "family", "genus", "species"]) 70 | else: 71 | continue 72 | 73 | # Match ranks 74 | for i, gtdb_n in enumerate(gtdb_nodes): 75 | if 
ncbi_nodes[i] != target_tax.undefined_node and gtdb_n != self.undefined_node: 76 | if gtdb_n not in translated_nodes: 77 | translated_nodes[gtdb_n] = set() 78 | translated_nodes[gtdb_n].add(ncbi_nodes[i]) 79 | 80 | else: 81 | warnings.warn("Translation between taxonomies [" + self.__class__.__name__ + 82 | "," + target_tax.__class__.__name__ + "] not yet implemented.") 83 | 84 | return translated_nodes 85 | 86 | def _parse(self, fhs, **kwargs): 87 | nodes = {} 88 | ranks = {} 89 | names = {} 90 | for source, fh in fhs.items(): 91 | for line in fh: 92 | try: 93 | _, lineage = line.rstrip().split('\t') 94 | except: 95 | _, lineage = line.decode().rstrip().split('\t') 96 | lin = lineage.split(";") 97 | for i in range(len(lin))[::-1]: 98 | # assert rank 99 | assert lin[i][:3] == self._rank_codes[i][0] 100 | # taxid = "c__Deinococci", rank = "class", name = "Deinococci" 101 | taxid = lin[i] 102 | name = lin[i][3:] 103 | # empty entry "s__" 104 | if not name: 105 | continue 106 | rank = self._rank_codes[i][1] 107 | if i == 0: 108 | parent_taxid = self._default_root_node 109 | else: 110 | parent_taxid = lin[i-1] 111 | if taxid not in nodes: 112 | nodes[taxid] = parent_taxid 113 | names[taxid] = name 114 | ranks[taxid] = rank 115 | 116 | return nodes, ranks, names 117 | -------------------------------------------------------------------------------- /multitax/multitax.py: -------------------------------------------------------------------------------- 1 | from .utils import * 2 | from collections import Counter 3 | from . import __version__ 4 | 5 | class MultiTax(object): 6 | 7 | version = __version__ 8 | 9 | _default_urls = [] 10 | _default_root_node = "1" 11 | 12 | def __init__(self, 13 | files: list = None, 14 | urls: list = None, 15 | output_prefix: str = None, 16 | root_node: str = None, 17 | root_parent: str = "0", 18 | root_name: str = None, 19 | root_rank: str = None, 20 | undefined_node: str = None, 21 | undefined_name: str = None, 22 | undefined_rank: str = None, 23 | build_name_nodes: bool = False, 24 | build_node_children: bool = False, 25 | build_rank_nodes: bool = False, 26 | extended_names: bool = False): 27 | """ 28 | Main constructor of MultiTax and sub-classes 29 | 30 | Parameters: 31 | * **files** *[str, list]*: One or more local files to parse. 32 | * **urls** *[str, list]*: One or more urls to download and parse. 33 | * **output_prefix** *[str]*: Directory to write downloaded files. 34 | * **root_node** *[str]*: Define an alternative root node. 35 | * **root_parent** *[str]*: Define the root parent node identifier. 36 | * **root_name** *[str]*: Define an alternative root name. Set to None to use original name. 37 | * **root_rank** *[str]*: Define an alternative root rank. Set to None to use original name. 38 | * **undefined_node** *[str]*: Define a default return value for undefined nodes. 39 | * **undefined_name** *[str]*: Define a default return value for undefined names. 40 | * **undefined_rank** *[str]*: Define a default return value for undefined ranks. 41 | * **build_node_children** *[bool]*: Build node,children dict (otherwise it will be created on first use). 42 | * **build_name_nodes** *[bool]*: Build name,nodes dict (otherwise it will be created on first use). 43 | * **build_rank_nodes** *[bool]*: Build rank,nodes dict (otherwise it will be created on first use). 44 | * **extended_names** *[bool]*: Parse extended names if available. 
45 | 46 | Example: 47 | 48 | tax_ncbi = NcbiTx() 49 | tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"]) 50 | tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"]) 51 | tax_ott = OttTx(root_node="844192") 52 | tax_gg = GreengenesTx(output_prefix="save/to/prefix_") 53 | """ 54 | if files: 55 | if isinstance(files, str): 56 | files = [files] 57 | for file in files: 58 | check_file(file) 59 | 60 | if output_prefix: 61 | check_dir(output_prefix) 62 | 63 | # Main structures 64 | self._nodes = {} 65 | self._ranks = {} 66 | self._names = {} 67 | # Aux. structures 68 | self._lineages = {} 69 | self._name_nodes = {} 70 | self._node_children = {} 71 | self._rank_nodes = {} 72 | self._translated_nodes = {} 73 | 74 | # Store source of tax files (url or file) 75 | self.sources = [] 76 | 77 | # Open/Download/Write files 78 | fhs = {} 79 | if files: 80 | fhs = open_files(files) 81 | elif urls or self._default_urls: 82 | fhs = download_files(urls=urls if urls else self._default_urls, 83 | output_prefix=output_prefix, 84 | retry_attempts=3) 85 | 86 | if fhs: 87 | # Parse taxonomy 88 | self._nodes, self._ranks, self._names = self._parse( 89 | fhs, extended_names=extended_names) 90 | close_files(fhs) 91 | # Save sources for stats (files or urls) 92 | self.sources = list(fhs.keys()) 93 | 94 | # Set undefined values 95 | self.undefined_node = undefined_node 96 | self.undefined_name = undefined_name 97 | self.undefined_rank = undefined_rank 98 | 99 | # Set root values 100 | self._set_root_node(root=root_node if root_node else self._default_root_node, 101 | parent=root_parent, name=root_name, rank=root_rank) 102 | 103 | # build auxiliary structures 104 | if build_node_children: 105 | self._node_children = reverse_dict(self._nodes) 106 | if build_name_nodes: 107 | self._name_nodes = reverse_dict(self._names) 108 | if build_rank_nodes: 109 | self._rank_nodes = reverse_dict(self._ranks) 110 | 111 | self.check_consistency() 112 | 113 | def _exact_name(self, text: str, names: dict): 114 | """ 115 | Returns list of nodes of a given exact name (case sensitive). 116 | """ 117 | if text in names: 118 | return names[text] 119 | else: 120 | return [] 121 | 122 | def _parse(self, fhs: dict): 123 | """ 124 | main function to be overloaded 125 | receives a dictionary with {"url/file": file handler} 126 | return nodes, ranks and names dicts 127 | """ 128 | return {}, {}, {} 129 | 130 | def _partial_name(self, text: str, names: dict): 131 | """ 132 | Searches names containing a certain text (case sensitive) and return their respective nodes. 133 | """ 134 | matching_nodes = set() 135 | for name in names: 136 | if text in name: 137 | matching_nodes.update(names[name]) 138 | return list(matching_nodes) 139 | 140 | def _recurse_leaves(self, node: str): 141 | """ 142 | Recursive function returning leaf nodes 143 | """ 144 | children = self.children(node) 145 | if not children: 146 | return [node] 147 | leaves = [] 148 | for child in children: 149 | leaves.extend(self._recurse_leaves(child)) 150 | return leaves 151 | 152 | def _remove(self, node: str): 153 | """ 154 | Removes node from taxonomy, no checking, for internal use 155 | """ 156 | del self._nodes[node] 157 | if node in self._names: 158 | del self._names[node] 159 | if node in self._ranks: 160 | del self._ranks[node] 161 | 162 | def _reset_aux_data(self): 163 | """ 164 | Reset aux. 
data structures 165 | """ 166 | self._lineages = {} 167 | self._name_nodes = {} 168 | self._node_children = {} 169 | self._rank_nodes = {} 170 | self._translated_nodes = {} 171 | 172 | def _set_root_node(self, root: str, parent: str, name: str, rank: str): 173 | """ 174 | Set root node of the tree. 175 | The files are parsed based on the self._default_root_node for each class 176 | A user-defined root node can be: 177 | 1) internal: will filter the tree acodingly and delete the default root_node 178 | 2) external: will add node and link to the default 179 | """ 180 | 181 | # Set parent/root with defaults 182 | self.root_parent = parent 183 | self.root_node = self._default_root_node 184 | self._nodes[self.root_node] = self.root_parent 185 | 186 | # Default root node is the top by definition 187 | if root != self._default_root_node: 188 | if root in self._nodes: 189 | # Not default but exists on tree, filter only descendants 190 | self.filter(root, desc=True) 191 | # Remove entry for _default_root_node 192 | self._remove(self._default_root_node) 193 | else: 194 | # Not on tree, link default node with new root 195 | self._nodes[self._default_root_node] = root 196 | # Change root to user defined 197 | self.root_node = root 198 | # Set/Update new root node parent link 199 | self._nodes[self.root_node] = self.root_parent 200 | 201 | # User-defined rank/name. 202 | # If provided, insert manually, 203 | # If None, check if is in the tree (defined in the given tax) 204 | # otherwise insert default "root" 205 | if name: 206 | self._names[self.root_node] = name 207 | elif self.root_node not in self._names: 208 | self._names[self.root_node] = "root" 209 | # Set static name 210 | self.root_name = self._names[self.root_node] 211 | 212 | if rank: 213 | self._ranks[self.root_node] = rank 214 | elif self.root_node not in self._ranks: 215 | self._ranks[self.root_node] = "root" 216 | # Set static rank 217 | self.root_rank = self._ranks[self.root_node] 218 | 219 | def add(self, node: str, parent: str, name: str = None, rank: str = None): 220 | """ 221 | Add node to taxonomy. 222 | Deletes built lineages and translations. 223 | """ 224 | if parent not in self._nodes: 225 | raise ValueError("Parent node [" + parent + "] not found.") 226 | elif node in self._nodes: 227 | raise ValueError("Node [" + node + "] already present.") 228 | 229 | self._nodes[node] = parent 230 | self._names[node] = name if name is not None else self.undefined_name 231 | self._ranks[node] = rank if rank is not None else self.undefined_rank 232 | self._reset_aux_data() 233 | 234 | def build_lineages(self, root_node: str = None, ranks: list = None): 235 | """ 236 | Stores lineages in memory for faster access. 237 | It is valid for lineage(), rank_lineage() and name_lineage(). 238 | If keyword arguments (root_node, ranks) are used in those functions stored lineages are not used. 239 | 240 | Returns: None 241 | """ 242 | self.clear_lineages() 243 | for node in self._nodes: 244 | self._lineages[node] = self.lineage( 245 | node=node, root_node=root_node, ranks=ranks) 246 | 247 | def build_translation(self, tax, files: list = None, urls: list = None): 248 | """ 249 | Create a translation of current taxonomy to another 250 | 251 | Parameters: 252 | 253 | * **tax** [MultiTax]: A target taxonomy to be translated to. 254 | * **files** *[str, list]*: One or more local files to parse. 255 | * **urls** *[str, list]*: One or more urls to download and parse. 
256 | 257 | Example: 258 | 259 | from multitax import GtdbTx, NcbiTx 260 | gtdb_tax = GtdbTx() 261 | ncbi_tax = NcbiTx() 262 | 263 | # Automatically download translation files 264 | gtdb_tax.build_translation(ncbi_tax) 265 | gtdb_tax.translate("g__Escherichia") 266 | {'1301', '547', '561', '570', '590', '620'} 267 | 268 | # Using local files (NCBI <-> GTDB) 269 | ncbi_tax.build_translation(gtdb_tax, files=["ar53_metadata.tsv.gz", "bac120_metadata.tsv.gz"]) 270 | ncbi_tax.translate("620") 271 | {'g__Escherichia', 'g__Proteus', 'g__Serratia'} 272 | """ 273 | if files: 274 | if isinstance(files, str): 275 | files = [files] 276 | for file in files: 277 | check_file(file) 278 | 279 | self._translated_nodes = self._build_translation(tax, files, urls) 280 | 281 | def children(self, node: str): 282 | """ 283 | Returns list of direct children nodes of a given node. 284 | """ 285 | # Setup on first use 286 | if not self._node_children: 287 | self._node_children = reverse_dict(self._nodes) 288 | if node in self._node_children: 289 | return self._node_children[node] 290 | else: 291 | return [] 292 | 293 | def check_consistency(self): 294 | """ 295 | Checks consistency of the tree 296 | 297 | Returns: raise an Exception otherwise None 298 | """ 299 | if self.root_node not in self._nodes: 300 | raise ValueError("Root node [" + self.root_node + "] not found.") 301 | if self.root_parent in self._nodes: 302 | raise ValueError( 303 | "Root parent [" + self.root_parent + "] found but should not be on tree.") 304 | if self.undefined_node in self._nodes: 305 | raise ValueError( 306 | "Undefined node [" + self.undefined_node + "] found but should not be on tree.") 307 | 308 | # Difference between values and keys should be only root_parent 309 | lost_nodes = set(self._nodes.values()).difference(self._nodes) 310 | if self.root_parent not in lost_nodes: 311 | raise ValueError( 312 | "Root parent [" + self.root_parent + "] not properly defined.") 313 | # Remove root_parent from lost nodes to report only missing 314 | lost_nodes.remove(self.root_parent) 315 | if len(lost_nodes) > 0: 316 | raise ValueError("Parent nodes missing: " + ",".join(lost_nodes)) 317 | 318 | return None 319 | 320 | def clear_lineages(self): 321 | """ 322 | Clear built lineages. 323 | 324 | Returns: None 325 | """ 326 | self._lineages = {} 327 | 328 | def closest_parent(self, node: str, ranks: str): 329 | """ 330 | Returns the closest parent node based on a defined list of ranks 331 | """ 332 | # Rank of node is already on the list 333 | if self.rank(node) in ranks: 334 | return node 335 | else: 336 | # check lineage from back to front until find a valid node 337 | for n in self.lineage(node, ranks=ranks)[::-1]: 338 | if n != self.undefined_node: 339 | return n 340 | # nothing found 341 | return self.undefined_node 342 | 343 | def filter(self, nodes: list, desc: bool = False): 344 | """ 345 | Filters taxonomy given a list of nodes. 346 | By default keep all the ancestors of the given nodes. 347 | If desc=True, keep all descendants instead. 348 | Deletes built lineages and translations. 
349 | 350 | Example: 351 | 352 | from multitax import GtdbTx 353 | tax = GtdbTx() 354 | 355 | tax.lineage('s__Enterovibrio marina') 356 | # ['1', 'd__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacterales', 'f__Vibrionaceae', 'g__Enterovibrio', 's__Enterovibrio marina'] 357 | # Keep only ancestors of 'g__Enterovibrio' 358 | tax.filter('g__Enterovibrio') 359 | 360 | # Reload taxonomy 361 | tax = GtdbTx() 362 | # Keep only descendants of 'g__Enterovibrio' 363 | tax.filter('g__Enterovibrio', desc=True) 364 | """ 365 | if isinstance(nodes, str): 366 | nodes = [nodes] 367 | 368 | # Keep track of nodes to be filtered out 369 | filtered_nodes = set(self._nodes) 370 | # Always keep root 371 | filtered_nodes.discard(self.root_node) 372 | 373 | if desc: 374 | # Keep descendants of the given nodes 375 | for node in nodes: 376 | # Check if node exists (skips root) 377 | if node in filtered_nodes: 378 | # For each leaf of the selected nodes 379 | for leaf in self.leaves(node): 380 | # Build lineage of each leaf up-to node itself 381 | for n in self.lineage(leaf, root_node=node): 382 | # Discard nodes from set to be kept 383 | filtered_nodes.discard(n) 384 | # Link node to root 385 | self._nodes[node] = self.root_node 386 | else: 387 | # Keep ancestors of the given nodes (full lineage up-to root) 388 | for node in nodes: 389 | # ranks=[] in case build_lineages() was used with specific ranks 390 | for n in self.lineage(node, ranks=[]): 391 | # Discard nodes from set to be kept 392 | filtered_nodes.discard(n) 393 | 394 | # Delete filtered nodes 395 | for node in filtered_nodes: 396 | self._remove(node) 397 | 398 | # Delete aux. data structures 399 | self._reset_aux_data() 400 | self.check_consistency() 401 | 402 | def latest(self, node: str): 403 | """ 404 | Returns latest/updated version of a given node. 405 | If node is already the latests, returns itself. 406 | Mainly used for NCBI (merged.dmp) and OTT (forwards.tsv) 407 | """ 408 | if node in self._nodes: 409 | return node 410 | else: 411 | return self.undefined_node 412 | 413 | def leaves(self, node: str = None): 414 | """ 415 | Returns a list of leaf nodes of a given node. 416 | """ 417 | if node is None or node == self.root_node: 418 | # Leaves are nodes not contained in _nodes.values() ("parents") 419 | return list(set(self._nodes).difference(self._nodes.values())) 420 | elif node in self._nodes: 421 | return self._recurse_leaves(node) 422 | else: 423 | return [] 424 | 425 | def lineage(self, node: str, root_node: str = None, ranks: list = None): 426 | """ 427 | Returns a list with the lineage of a given node. 428 | If ranks is provided, returns only nodes annotated with such ranks. 429 | If root_node is provided, use it instead of default root of tree. 
430 | """ 431 | # If lineages were built with build_lineages() with matching params 432 | if node in self._lineages and root_node is None and ranks is None: 433 | return self._lineages[node] 434 | else: 435 | if not root_node: 436 | root_node = self.root_node 437 | 438 | n = node 439 | if ranks: 440 | # Fixed length lineage 441 | lin = [self.undefined_node] * len(ranks) 442 | # Loop until end of the tree (in case chosen root is not on lineage) 443 | while n != self.undefined_node: 444 | r = self.rank(n) 445 | if r in ranks: 446 | lin[ranks.index(r)] = n 447 | # If node is root, break (after adding) 448 | if n == root_node: 449 | break 450 | n = self.parent(n) 451 | else: 452 | # Full lineage 453 | lin = [] 454 | # Loop until end of the tree (in case chosen root is not on lineage) 455 | while n != self.undefined_node: 456 | lin.append(n) 457 | # If node is root, break (after adding) 458 | if n == root_node: 459 | break 460 | n = self.parent(n) 461 | # Reverse order 462 | lin = lin[::-1] 463 | 464 | # last iteration node (n) != root_node: didn't find the root, invalid lineage 465 | if n != root_node: 466 | return [] 467 | else: 468 | return lin 469 | 470 | def name(self, node: str): 471 | """ 472 | Returns name of a given node. 473 | """ 474 | if node in self._names: 475 | return self._names[node] 476 | else: 477 | return self.undefined_name 478 | 479 | def name_lineage(self, node: str, root_node: str = None, ranks: list = None): 480 | """ 481 | Returns a list with the name lineage of a given node. 482 | """ 483 | return list(map(self.name, 484 | self.lineage(node=node, 485 | root_node=root_node, 486 | ranks=ranks))) 487 | 488 | def nodes_rank(self, rank: str): 489 | """ 490 | Returns list of nodes of a given rank. 491 | """ 492 | # Setup on first use 493 | if not self._rank_nodes: 494 | self._rank_nodes = reverse_dict(self._ranks) 495 | if rank in self._rank_nodes: 496 | return self._rank_nodes[rank] 497 | else: 498 | return [] 499 | 500 | def parent(self, node: str): 501 | """ 502 | Returns the direct parent node of a given node. 503 | """ 504 | if node in self._nodes: 505 | return self._nodes[node] 506 | else: 507 | return self.undefined_node 508 | 509 | def parent_rank(self, node: str, rank: str): 510 | """ 511 | Returns the parent node of a given rank in the specified rank. 512 | """ 513 | parent = self.lineage(node=node, ranks=[rank]) 514 | return parent[0] if parent else self.undefined_node 515 | 516 | def prune(self, nodes: list): 517 | """ 518 | Prunes branches of the tree under the given nodes. 519 | Deletes built lineages and translations. 520 | """ 521 | 522 | if isinstance(nodes, str): 523 | nodes = [nodes] 524 | 525 | del_nodes = set() 526 | for node in nodes: 527 | if node not in self._nodes: 528 | raise ValueError("Node [" + node + "] not found.") 529 | for leaf in self.leaves(node): 530 | for n in self.lineage(leaf, root_node=node)[1:]: 531 | del_nodes.add(n) 532 | 533 | for n in del_nodes: 534 | self._remove(n) 535 | 536 | self._reset_aux_data() 537 | 538 | def rank(self, node: str): 539 | """ 540 | Returns the rank of a given node. 541 | """ 542 | if node in self._ranks: 543 | return self._ranks[node] 544 | else: 545 | return self.undefined_rank 546 | 547 | def rank_lineage(self, node: str, root_node: str = None, ranks: list = None): 548 | """ 549 | Returns a list with the rank lineage of a given node. 
550 | """ 551 | return list(map(self.rank, 552 | self.lineage(node=node, 553 | root_node=root_node, 554 | ranks=ranks))) 555 | 556 | def remove(self, node: str, check_consistency: bool = False): 557 | """ 558 | Removes node from taxonomy. Can break the tree if a parent node is removed. To remove a certain branch, use prune. 559 | Running check consistency after removing a node is recommended. 560 | Deletes built lineages and translations. 561 | """ 562 | if node not in self._nodes: 563 | raise ValueError("Node [" + node + "] not found.") 564 | self._remove(node) 565 | self._reset_aux_data() 566 | if check_consistency: 567 | self.check_consistency() 568 | 569 | def search_name(self, text: str, rank: str = None, exact: bool = True): 570 | """ 571 | Search node by exact or partial name 572 | 573 | Parameters: 574 | * **text** *[str]*: Text to search. 575 | * **rank** *[str]*: Filter results by rank. 576 | * **exact** *[bool]*: Exact or partial name search (both case sensitive). 577 | 578 | Returns: list of matching nodes 579 | """ 580 | # Setup on first use 581 | if not self._name_nodes: 582 | self._name_nodes = reverse_dict(self._names) 583 | 584 | if exact: 585 | ret = self._exact_name(text, self._name_nodes) 586 | else: 587 | ret = self._partial_name(text, self._name_nodes) 588 | 589 | # Only return nodes of chosen rank 590 | if rank: 591 | return filter_function(ret, self.rank, rank) 592 | else: 593 | return ret 594 | 595 | def stats(self): 596 | """ 597 | Returns a dict with general numbers of the taxonomic tree 598 | 599 | Example: 600 | 601 | from pprint import pprint 602 | from multitax import GtdbTx 603 | tax = GtdbTx() 604 | 605 | pprint(tax.stats()) 606 | {'leaves': 30238, 607 | 'names': 42739, 608 | 'nodes': 42739, 609 | 'ranked_leaves': Counter({'species': 30238}), 610 | 'ranked_nodes': Counter({'species': 30238, 611 | 'genus': 8778, 612 | 'family': 2323, 613 | 'order': 930, 614 | 'class': 337, 615 | 'phylum': 131, 616 | 'domain': 1, 617 | 'root': 1}), 618 | 'ranks': 42739} 619 | """ 620 | s = {} 621 | s["nodes"] = len(self._nodes) 622 | s["ranks"] = len(self._ranks) 623 | s["names"] = len(self._names) 624 | all_leaves = self.leaves(self.root_node) 625 | s["leaves"] = len(all_leaves) 626 | s["ranked_nodes"] = Counter(self._ranks.values()) 627 | s["ranked_leaves"] = Counter(map(self.rank, all_leaves)) 628 | 629 | return s 630 | 631 | def translate(self, node: str): 632 | """ 633 | Returns the translated node from another taxonomy. Translated nodes are generated with the build_translation function. 634 | """ 635 | if node in self._translated_nodes: 636 | return self._translated_nodes[node] 637 | else: 638 | return [] 639 | 640 | def write(self, 641 | output_file: str, 642 | cols: list = ["node", "parent", "rank", "name"], 643 | sep: str = "\t", 644 | sep_multi: str = "|", 645 | ranks: list = None, 646 | gz: bool = False): 647 | """ 648 | Writes loaded taxonomy to a file. 
649 | 650 | Parameters: 651 | * **cols** *[list]*: Options: "node", "latest", "parent", "rank", "name", "leaves", "children", "lineage", "rank_lineage", "name_lineage" 652 | * **sep** *[str]*: Separator of fields 653 | * **sep_multi** *[str]*: Separator of multi-valued fields 654 | * **ranks** *[list]*: Ranks to report 655 | * **gz** *[bool]*: Gzip output 656 | 657 | Returns: None 658 | """ 659 | import gzip 660 | if gz: 661 | output_file = output_file if output_file.endswith( 662 | ".gz") else output_file + ".gz" 663 | check_no_file(output_file) 664 | outf = gzip.open(output_file, "wt") 665 | else: 666 | check_no_file(output_file) 667 | outf = open(output_file, "w") 668 | 669 | write_field = {"node": lambda node: node, 670 | "latest": self.latest, 671 | "parent": self.parent, 672 | "rank": self.rank, 673 | "name": self.name, 674 | "leaves": lambda node: join_check(self.leaves(node), sep_multi), 675 | "children": lambda node: join_check(self.children(node), sep_multi), 676 | "lineage": lambda node: join_check(self.lineage(node, ranks=ranks), sep_multi), 677 | "rank_lineage": lambda node: join_check(self.rank_lineage(node, ranks=ranks), sep_multi), 678 | "name_lineage": lambda node: join_check(self.name_lineage(node, ranks=ranks), sep_multi)} 679 | 680 | for c in cols: 681 | if c not in write_field: 682 | raise ValueError( 683 | "Field [" + c + "] is not valid. Options: " + ",".join(write_field)) 684 | 685 | if ranks: 686 | for rank in ranks: 687 | for node in self.nodes_rank(rank): 688 | print(*[write_field[c](node) 689 | for c in cols], sep=sep, end="\n", file=outf) 690 | else: 691 | for node in self._nodes: 692 | print(*[write_field[c](node) 693 | for c in cols], sep=sep, end="\n", file=outf) 694 | 695 | outf.close() 696 | -------------------------------------------------------------------------------- /multitax/ncbitx.py: -------------------------------------------------------------------------------- 1 | from .multitax import MultiTax 2 | from .utils import filter_function 3 | from .utils import check_file 4 | from .utils import open_files 5 | from .utils import download_files 6 | import warnings 7 | 8 | 9 | class NcbiTx(MultiTax): 10 | _default_urls = ["https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz"] 11 | 12 | def __init__(self, **kwargs): 13 | self._merged = {} 14 | self._extended_name_nodes = {} 15 | super().__init__(**kwargs) 16 | 17 | def __repr__(self): 18 | stats = ['{}={}'.format(k, repr(v)) for (k, v) in self.stats().items()] 19 | return 'NcbiTx({})'.format(', '.join(stats)) 20 | 21 | def _build_translation(self, target_tax, files: list = None, urls: list = None): 22 | translated_nodes = {} 23 | if target_tax.__class__.__name__ == "GtdbTx": 24 | 25 | if files: 26 | fhs = open_files(files) 27 | else: 28 | _urls = ["https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/ar53_metadata.tsv.gz", 29 | "https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/bac120_metadata.tsv.gz"] 30 | fhs = download_files( 31 | urls=urls if urls else _urls, retry_attempts=3) 32 | 33 | 34 | accession_col = 0 35 | gtdb_taxonomy_col = 19 36 | ncbi_taxid_col = 80 37 | 38 | for source, fh in fhs.items(): 39 | for line in fh: 40 | try: 41 | fields = line.rstrip().split('\t') 42 | except: 43 | fields = line.decode().rstrip().split('\t') 44 | 45 | # skip header 46 | if fields[accession_col] == "accession": 47 | continue 48 | 49 | # Build GTDB lineage from leaf (species on given lineage) 50 | # to accomodate possible changes in the loaded tax 51 | gtdb_leaf_node = 
fields[gtdb_taxonomy_col].split(";")[-1] 52 | if gtdb_leaf_node != target_tax.undefined_node: 53 | gtdb_nodes = target_tax.lineage(gtdb_leaf_node, ranks=[ 54 | "domain", "phylum", "class", "order", 55 | "family", "genus", "species"]) 56 | else: 57 | continue 58 | 59 | # Build NCBI lineage from leaf 60 | ncbi_leaf_node = self.latest(fields[ncbi_taxid_col]) 61 | if ncbi_leaf_node != self.undefined_node: 62 | # Additional add connection from leaf to species on GTDB 63 | # that could represent strain, etc on NCBI tax 64 | if ncbi_leaf_node not in translated_nodes: 65 | translated_nodes[ncbi_leaf_node] = set() 66 | translated_nodes[ncbi_leaf_node].add( 67 | gtdb_leaf_node) 68 | ncbi_nodes = self.lineage(ncbi_leaf_node, ranks=[ 69 | "superkingdom", "phylum", "class", "order", 70 | "family", "genus", "species"]) 71 | else: 72 | continue 73 | 74 | # Match ranks 75 | for i, ncbi_n in enumerate(ncbi_nodes): 76 | if gtdb_nodes[i] != target_tax.undefined_node and ncbi_n != self.undefined_node: 77 | if ncbi_n not in translated_nodes: 78 | translated_nodes[ncbi_n] = set() 79 | translated_nodes[ncbi_n].add(gtdb_nodes[i]) 80 | 81 | else: 82 | warnings.warn("Translation between taxonomies [" + self.__class__.__name__ + 83 | "," + target_tax.__class__.__name__ + "] not yet implemented.") 84 | 85 | return translated_nodes 86 | 87 | def _parse(self, fhs, **kwargs): 88 | fhs_list = list(fhs.values()) 89 | # One element tar.gz -> taxdump.tar.gz 90 | if len(fhs_list) == 1 and list(fhs)[0].endswith(".tar.gz"): 91 | nodes, ranks, names, self._merged = self._parse_taxdump( 92 | fhs_list[0], extended_names=kwargs["extended_names"]) 93 | else: 94 | # nodes.dmp 95 | nodes, ranks = self._parse_nodes(fhs_list[0]) 96 | 97 | # [names.dmp] 98 | if len(fhs) >= 2: 99 | names = self._parse_names( 100 | fhs_list[1], extended_names=kwargs["extended_names"]) 101 | else: 102 | names = {} 103 | 104 | # [merged.dmp] 105 | if len(fhs) == 3: 106 | self._merged = self._parse_merged(fhs_list[2]) 107 | return nodes, ranks, names 108 | 109 | def _parse_merged(self, fh): 110 | merged = {} 111 | for line in fh: 112 | try: 113 | old_taxid, _, new_taxid, _ = line.split('\t', 3) 114 | except: 115 | old_taxid, _, new_taxid, _ = line.decode().split('\t', 3) 116 | merged[old_taxid] = new_taxid 117 | return merged 118 | 119 | def _parse_names(self, fh, extended_names): 120 | names = {} 121 | for line in fh: 122 | try: 123 | node, name, _, name_class = line.split('\t|\t') 124 | except: 125 | node, name, _, name_class = line.decode().split('\t|\t') 126 | if name_class.replace('\t|\n', '') == "scientific name": 127 | names[node] = name 128 | elif extended_names: 129 | if name not in self._extended_name_nodes: 130 | self._extended_name_nodes[name] = [] 131 | self._extended_name_nodes[name].append(node) 132 | 133 | return names 134 | 135 | def _parse_nodes(self, fh): 136 | nodes = {} 137 | ranks = {} 138 | for line in fh: 139 | try: 140 | taxid, parent_taxid, rank, _ = line.split('\t|\t', 3) 141 | except: 142 | taxid, parent_taxid, rank, _ = line.decode().split('\t|\t', 3) 143 | ranks[taxid] = rank 144 | nodes[taxid] = parent_taxid 145 | return nodes, ranks 146 | 147 | def _parse_taxdump(self, fh_taxdump, extended_names): 148 | with fh_taxdump.extractfile('nodes.dmp') as fh_nodes: 149 | nodes, ranks = self._parse_nodes(fh_nodes) 150 | with fh_taxdump.extractfile('names.dmp') as fh_names: 151 | names = self._parse_names(fh_names, extended_names=extended_names) 152 | with fh_taxdump.extractfile('merged.dmp') as fh_merged: 153 | merged = 
self._parse_merged(fh_merged) 154 | return nodes, ranks, names, merged 155 | 156 | def latest(self, node: str): 157 | n = super().latest(node) 158 | if n == self.undefined_node: 159 | n = self.merged(node) 160 | return n 161 | 162 | def merged(self, node: str): 163 | """ 164 | Returns relative entry from the merged.dmp file of a given node. 165 | """ 166 | if node in self._merged: 167 | return self._merged[node] 168 | else: 169 | return self.undefined_node 170 | 171 | def search_name(self, text: str, rank: str = None, exact: bool = True, force_extended: bool = False): 172 | """ 173 | Search node by exact or partial name. 174 | 175 | Default order (can be skipped with **force_extended=True**): 176 | 177 | 1) Search names defined as "scientific name" on nodes.dmp 178 | 179 | 2) If nothing was found, search text in all other categories (must be activated with NcbiTx(**extended_names=True**)) 180 | 181 | Parameters: 182 | * **text** *[str]*: Text to search. 183 | * **rank** *[str]*: Filter results by rank. 184 | * **exact** *[bool]*: Exact or partial name search (both case sensitive). 185 | * **force_extended** *[bool]*: Search for text in all categories at once. 186 | 187 | Returns: list of matching nodes 188 | """ 189 | n = super().search_name(text, rank=rank, exact=exact) 190 | if n and not force_extended: 191 | return n 192 | else: 193 | if exact: 194 | ret = self._exact_name(text, self._extended_name_nodes) 195 | else: 196 | ret = self._partial_name(text, self._extended_name_nodes) 197 | 198 | # Only return nodes of chosen rank 199 | if rank: 200 | ret = filter_function(ret, self.rank, rank) 201 | 202 | return list(set(n + ret)) 203 | 204 | def stats(self): 205 | s = super().stats() 206 | if self._merged: 207 | s["merged"] = len(self._merged) 208 | if self._extended_name_nodes: 209 | s["extended_names"] = len(self._extended_name_nodes) 210 | return s 211 | -------------------------------------------------------------------------------- /multitax/otttx.py: -------------------------------------------------------------------------------- 1 | from .multitax import MultiTax 2 | from .utils import filter_function 3 | import warnings 4 | 5 | 6 | class OttTx(MultiTax): 7 | _default_urls = ["http://files.opentreeoflife.org/ott/ott3.4/ott3.4.tgz"] 8 | _default_root_node = "805080" 9 | 10 | def __init__(self, **kwargs): 11 | self._forwards = {} 12 | self._extended_name_nodes = {} 13 | super().__init__(**kwargs) 14 | 15 | def __repr__(self): 16 | stats = ['{}={}'.format(k, repr(v)) for (k, v) in self.stats().items()] 17 | return 'OttTx({})'.format(', '.join(stats)) 18 | 19 | def _build_translation(self, target_tax, files: list = None, urls: list = None): 20 | warnings.warn("Translation between taxonomies [" + self.__class__.__name__ + 21 | "," + target_tax.__class__.__name__ + "] not yet implemented.") 22 | return {} 23 | 24 | def _parse(self, fhs, **kwargs): 25 | fhs_list = list(fhs.values()) 26 | if len(fhs_list) == 1 and list(fhs)[0].endswith(".tgz"): 27 | nodes, ranks, names = self._parse_ott( 28 | fhs_list[0], extended_names=kwargs["extended_names"]) 29 | else: 30 | # nodes.dmp 31 | nodes, ranks, names = self._parse_taxonomy(fhs_list[0]) 32 | # [forwards.tsv] 33 | if len(fhs) >= 2: 34 | self._forwards = self._parse_forwards(fhs_list[1]) 35 | if len(fhs) == 3 and kwargs["extended_names"]: 36 | self._extended_name_nodes = self._parse_synonyms(fhs_list[2]) 37 | 38 | return nodes, ranks, names 39 | 40 | def _parse_forwards(self, fh): 41 | forwards = {} 42 | # skip first line header 43 | next(fh) 44 
| for line in fh: 45 | try: 46 | old_taxid, new_taxid = line.rstrip().split('\t') 47 | except: 48 | old_taxid, new_taxid = line.decode().rstrip().split('\t') 49 | forwards[old_taxid] = new_taxid 50 | return forwards 51 | 52 | def _parse_ott(self, fh_taxdump, extended_names): 53 | # Get files inside folder by name 54 | for e in fh_taxdump.getnames(): 55 | if e.endswith("taxonomy.tsv"): 56 | tax = e 57 | if e.endswith("forwards.tsv"): 58 | fwr = e 59 | if e.endswith("synonyms.tsv"): 60 | syn = e 61 | 62 | with fh_taxdump.extractfile(tax) as fh_nodes: 63 | nodes, ranks, names = self._parse_taxonomy(fh_nodes) 64 | with fh_taxdump.extractfile(fwr) as fh_forwards: 65 | self._forwards = self._parse_forwards(fh_forwards) 66 | if extended_names: 67 | with fh_taxdump.extractfile(syn) as fh_synonyms: 68 | self._extended_name_nodes = self._parse_synonyms(fh_synonyms) 69 | return nodes, ranks, names 70 | 71 | def _parse_synonyms(self, fh): 72 | synonyms = {} 73 | # skip first line header 74 | next(fh) 75 | for line in fh: 76 | try: 77 | name, taxid, _ = line.split('\t|\t', 2) 78 | except: 79 | name, taxid, _ = line.decode().split('\t|\t', 2) 80 | if name not in synonyms: 81 | synonyms[name] = [] 82 | synonyms[name].append(taxid) 83 | 84 | return synonyms 85 | 86 | def _parse_taxonomy(self, fh): 87 | nodes = {} 88 | ranks = {} 89 | names = {} 90 | # skip first line header 91 | next(fh) 92 | for line in fh: 93 | try: 94 | taxid, parent_taxid, name, rank, _ = line.split('\t|\t', 4) 95 | except: 96 | taxid, parent_taxid, name, rank, _ = line.decode().split('\t|\t', 4) 97 | ranks[taxid] = rank 98 | nodes[taxid] = parent_taxid 99 | names[taxid] = name 100 | return nodes, ranks, names 101 | 102 | def forwards(self, node: str): 103 | """ 104 | Returns relative entry from the forwards.tsv file of a given node. 105 | """ 106 | if node in self._forwards: 107 | return self._forwards[node] 108 | else: 109 | return self.undefined_node 110 | 111 | def latest(self, node: str): 112 | n = super().latest(node) 113 | if n == self.undefined_node: 114 | n = self.forwards(node) 115 | return n 116 | 117 | def search_name(self, text: str, rank: str = None, exact: bool = True, force_extended: bool = False): 118 | """ 119 | Search node by exact or partial name. 120 | 121 | Default order (can be skipped with **force_extended=True**): 122 | 123 | 1) Search default names defined on "taxonomy.tsv" 124 | 125 | 2) If nothing was found, search in all other names defined on "synonyms.tsv" (must be activated with OttTx(**extended_names=True**)) 126 | 127 | Parameters: 128 | * **text** *[str]*: Text to search. 129 | * **rank** *[str]*: Filter results by rank. 130 | * **exact** *[bool]*: Exact or partial name search (both case sensitive). 131 | * **force_extended** *[bool]*: Search for text in all categories at once. 
132 | 133 | Returns: list of matching nodes 134 | """ 135 | n = super().search_name(text, rank=rank, exact=exact) 136 | if n and not force_extended: 137 | return n 138 | else: 139 | if exact: 140 | ret = self._exact_name(text, self._extended_name_nodes) 141 | else: 142 | ret = self._partial_name(text, self._extended_name_nodes) 143 | 144 | # Only return nodes of chosen rank 145 | if rank: 146 | ret = filter_function(ret, self.rank, rank) 147 | 148 | return list(set(n + ret)) 149 | 150 | def stats(self): 151 | s = super().stats() 152 | if self._forwards: 153 | s["forwards"] = len(self._forwards) 154 | if self._extended_name_nodes: 155 | s["extended_names"] = len(self._extended_name_nodes) 156 | return s 157 | -------------------------------------------------------------------------------- /multitax/silvatx.py: -------------------------------------------------------------------------------- 1 | from .multitax import MultiTax 2 | import warnings 3 | 4 | 5 | class SilvaTx(MultiTax): 6 | _default_urls = [ 7 | "https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_ssu_138.1.txt.gz"] 8 | 9 | def __init__(self, **kwargs): 10 | super().__init__(**kwargs) 11 | 12 | def __repr__(self): 13 | stats = ['{}={}'.format(k, repr(v)) for (k, v) in self.stats().items()] 14 | return 'SilvaTx({})'.format(', '.join(stats)) 15 | 16 | def _build_translation(self, target_tax, files: list = None, urls: list = None): 17 | warnings.warn("Translation between taxonomies [" + self.__class__.__name__ + 18 | "," + target_tax.__class__.__name__ + "] not yet implemented.") 19 | return {} 20 | 21 | def _parse(self, fhs, **kwargs): 22 | nodes = {} 23 | ranks = {} 24 | names = {} 25 | 26 | lin = {} 27 | for source, fh in fhs.items(): 28 | for line in fh: 29 | try: 30 | name_lineage, taxid, rank, _ = line.split('\t', 3) 31 | except: 32 | name_lineage, taxid, rank, _ = line.decode().split('\t', 3) 33 | # Remove last char ";" 34 | lineage = name_lineage[:-1] 35 | name = lineage.split(";")[-1] 36 | # Save lineage to build tree 37 | lin[lineage] = taxid 38 | names[taxid] = name 39 | ranks[taxid] = rank 40 | 41 | # Build parent node connection 42 | for lineage, taxid in lin.items(): 43 | t = taxid 44 | l = lineage.split(";")[:-1] 45 | while l: 46 | parent_taxid = lin[";".join(l)] 47 | if t not in nodes: 48 | nodes[t] = parent_taxid 49 | t = parent_taxid 50 | del l[-1] # remove last element 51 | # Connect last node to root 52 | if t not in nodes: 53 | nodes[t] = self._default_root_node 54 | 55 | return nodes, ranks, names 56 | -------------------------------------------------------------------------------- /multitax/utils.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import io 3 | import os 4 | import tarfile 5 | import urllib.request 6 | import zlib 7 | import warnings 8 | from collections import OrderedDict 9 | from urllib.error import HTTPError 10 | 11 | 12 | def check_dir(prefix: str): 13 | abs_path = os.path.dirname(os.path.abspath(prefix)) 14 | if not os.path.exists(abs_path): 15 | raise NotADirectoryError(abs_path) 16 | 17 | 18 | def check_file(file: str): 19 | if not os.path.isfile(file): 20 | raise FileNotFoundError(file + " file do not exist") 21 | if os.path.getsize(file) == 0: 22 | raise FileNotFoundError(file + " file is empty") 23 | 24 | 25 | def check_no_file(file: str): 26 | if os.path.isfile(file): 27 | raise FileExistsError(file) 28 | 29 | 30 | def close_files(fhs: dict): 31 | """ 32 | Parameters: 33 | * **fhs** *[dict]*: {file: 
file handler} 34 | 35 | Returns: Nothing 36 | """ 37 | for fh in fhs.values(): 38 | fh.close() 39 | 40 | 41 | def download_files(urls: list, output_prefix: str = None, retry_attempts: int = 1): 42 | """ 43 | Download and open files (memory/stream) or write to disk (multitax.utils.save_urls) 44 | 45 | Parameters: 46 | * **urls** *[list]*: List of files to download (text, ".gz", ".tar.gz", ".tgz") 47 | * **output_prefix** *[str]*: Output directory to save files 48 | 49 | Returns: 50 | * OrderedDict {file: file handler} (same order as input) 51 | """ 52 | if isinstance(urls, str): 53 | urls = [urls] 54 | 55 | att = 0 56 | while att < retry_attempts: 57 | att += 1 58 | try: 59 | # If output is provided, save files and parse from disc 60 | if output_prefix: 61 | files = save_urls(urls, output_prefix) 62 | return open_files(files) 63 | else: 64 | # stream contents from url 65 | fhs = OrderedDict() 66 | for url in urls: 67 | if url.endswith(".tar.gz") or url.endswith(".tgz"): 68 | # tar files have mixed headers and content 69 | # whole file should be loaded in memory first and not streamed 70 | fhs[url] = tarfile.open( 71 | fileobj=load_url_mem(url), mode='r:gz') 72 | elif url.endswith(".gz"): 73 | fhs[url] = gzip.open( 74 | urllib.request.urlopen(url), mode="rb") 75 | fhs[url].peek(1) # peek into file to check if is valid 76 | else: 77 | fhs[url] = urllib.request.urlopen(url) 78 | 79 | return fhs 80 | except (HTTPError, zlib.error, tarfile.TarError): 81 | warnings.warn( 82 | "Download failed, trying again (" + str(att) + "/" + str(retry_attempts) + ")", UserWarning) 83 | 84 | raise Exception("One or more files could not be downloaded: " + 85 | ", ".join(urls)) 86 | 87 | 88 | def filter_function(elements, function, value): 89 | return [elements[i] for i, v in enumerate(map(function, elements)) if v == value] 90 | 91 | 92 | def join_check(elements, sep: str): 93 | if elements: 94 | return sep.join(map(str, elements)) 95 | else: 96 | return "" 97 | 98 | 99 | def load_url_mem(url: str): 100 | """ 101 | Parameters: 102 | * **url** *[str]*: URL to load into memory 103 | 104 | Returns: 105 | * io.BytesIO of the requested url 106 | """ 107 | urlstream = urllib.request.urlopen(url) 108 | # From https://stackoverflow.com/questions/18623842/read-contents-tarfile-into-python-seeking-backwards-is-not-allowed 109 | tmpfile = io.BytesIO() 110 | while True: 111 | s = urlstream.read(io.DEFAULT_BUFFER_SIZE) 112 | if not s: 113 | break 114 | tmpfile.write(s) 115 | urlstream.close() 116 | tmpfile.seek(0) 117 | return tmpfile 118 | 119 | 120 | def open_files(files: list): 121 | """ 122 | Parameters: 123 | * **files** *[list]*: List of files to open (text, ".gz", ".tar.gz", ".tgz") 124 | 125 | Returns: 126 | * OrderedDict {file: file handler} (same order as input) 127 | """ 128 | 129 | fhs = OrderedDict() 130 | for file in files: 131 | if file.endswith(".tar.gz") or file.endswith(".tgz"): 132 | fhs[file] = tarfile.open(file, mode='r:gz') 133 | elif file.endswith(".gz"): 134 | fhs[file] = gzip.open(file, "rt") 135 | else: 136 | fhs[file] = open(file, "r") 137 | return fhs 138 | 139 | 140 | def reverse_dict(d: dict): 141 | rd = {} 142 | for k, v in d.items(): 143 | if v not in rd: 144 | rd[v] = [] 145 | rd[v].append(k) 146 | return rd 147 | 148 | 149 | def save_urls(urls: list, output_prefix: str): 150 | """ 151 | Parameters: 152 | * **urls** *[list]*: List of urls to download 153 | * **output_prefix** *[str]*: Output directory to save files 154 | 155 | Returns: 156 | * list of files saved 157 | """ 158 | files = [] 
159 | for url in urls: 160 | outfile = output_prefix + os.path.basename(url) 161 | check_no_file(outfile) 162 | urlstream = urllib.request.urlopen(url) 163 | with open(outfile, 'b+w') as f: 164 | f.write(urlstream.read()) 165 | urlstream.close() 166 | files.append(outfile) 167 | return files 168 | 169 | 170 | def warning_on_one_line(message, category, filename, lineno, file=None, line=None): 171 | return '%s:%s: %s: %s\n' % (filename, lineno, category.__name__, message) 172 | 173 | 174 | warnings.formatwarning = warning_on_one_line 175 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=42", 4 | "wheel" 5 | ] 6 | build-backend = "setuptools.build_meta" -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | import re 4 | 5 | from setuptools import setup 6 | 7 | with open("README.md", "r", encoding="utf-8") as fh: 8 | long_description = fh.read() 9 | 10 | setup( 11 | name="multitax", 12 | version="1.3.2", 13 | url="https://www.github.com/pirovc/multitax", 14 | license="MIT", 15 | author="Vitor C. Piro", 16 | description="Python package to obtain, parse and explore biological and custom taxonomies", 17 | long_description=long_description, 18 | long_description_content_type="text/markdown", 19 | packages=["multitax"], 20 | python_requires=">=3.4", 21 | classifiers=[ 22 | 'License :: OSI Approved :: MIT License', 23 | 'Programming Language :: Python :: 3.9', 24 | 'Programming Language :: Python :: 3.10', 25 | 'Programming Language :: Python :: 3.11', 26 | 'Programming Language :: Python :: 3.12', 27 | 'Programming Language :: Python :: 3.13', 28 | ], 29 | ) 30 | -------------------------------------------------------------------------------- /tests/multitax/data_minimal/custom.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/multitax/daed499cec926848a3d5ea2e36545d823e3a4a81/tests/multitax/data_minimal/custom.tsv.gz -------------------------------------------------------------------------------- /tests/multitax/data_minimal/custom2.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/multitax/daed499cec926848a3d5ea2e36545d823e3a4a81/tests/multitax/data_minimal/custom2.tsv.gz -------------------------------------------------------------------------------- /tests/multitax/data_minimal/custom_unit_test.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/multitax/daed499cec926848a3d5ea2e36545d823e3a4a81/tests/multitax/data_minimal/custom_unit_test.tsv.gz -------------------------------------------------------------------------------- /tests/multitax/data_minimal/gg.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/multitax/daed499cec926848a3d5ea2e36545d823e3a4a81/tests/multitax/data_minimal/gg.txt.gz -------------------------------------------------------------------------------- /tests/multitax/data_minimal/gtdb_ar.tsv.gz: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/pirovc/multitax/daed499cec926848a3d5ea2e36545d823e3a4a81/tests/multitax/data_minimal/gtdb_ar.tsv.gz -------------------------------------------------------------------------------- /tests/multitax/data_minimal/gtdb_ar_metadata.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/multitax/daed499cec926848a3d5ea2e36545d823e3a4a81/tests/multitax/data_minimal/gtdb_ar_metadata.tsv.gz -------------------------------------------------------------------------------- /tests/multitax/data_minimal/gtdb_bac.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/multitax/daed499cec926848a3d5ea2e36545d823e3a4a81/tests/multitax/data_minimal/gtdb_bac.tsv.gz -------------------------------------------------------------------------------- /tests/multitax/data_minimal/gtdb_bac_metadata.tsv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/multitax/daed499cec926848a3d5ea2e36545d823e3a4a81/tests/multitax/data_minimal/gtdb_bac_metadata.tsv.gz -------------------------------------------------------------------------------- /tests/multitax/data_minimal/ncbi.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/multitax/daed499cec926848a3d5ea2e36545d823e3a4a81/tests/multitax/data_minimal/ncbi.tar.gz -------------------------------------------------------------------------------- /tests/multitax/data_minimal/ott.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/multitax/daed499cec926848a3d5ea2e36545d823e3a4a81/tests/multitax/data_minimal/ott.tgz -------------------------------------------------------------------------------- /tests/multitax/data_minimal/silva.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pirovc/multitax/daed499cec926848a3d5ea2e36545d823e3a4a81/tests/multitax/data_minimal/silva.txt.gz -------------------------------------------------------------------------------- /tests/multitax/integration/test_common.py: -------------------------------------------------------------------------------- 1 | from multitax import GreengenesTx, GtdbTx, NcbiTx, OttTx, SilvaTx, CustomTx 2 | from tests.multitax.utils import setup_dir, uncompress_gzip, uncompress_tar_gzip 3 | import unittest 4 | import os 5 | import sys 6 | import random 7 | import io 8 | 9 | 10 | sys.path.append("tests/multitax/") 11 | 12 | 13 | class TestCommon(unittest.TestCase): 14 | 15 | tmp_dir = "tests/multitax/integration/tmp_common/" 16 | data_dir = "tests/multitax/data_minimal/" 17 | #data_dir = "tests/multitax/data_complete/" 18 | 19 | taxonomies = {} 20 | taxonomies["gtdb"] = {"class": GtdbTx, 21 | "params": {"files": [data_dir + "gtdb_ar.tsv.gz", 22 | data_dir + "gtdb_bac.tsv.gz"]}} 23 | taxonomies["ncbi"] = {"class": NcbiTx, 24 | "params": {"files": [data_dir + "ncbi.tar.gz"]}} 25 | taxonomies["silva"] = {"class": SilvaTx, 26 | "params": {"files": [data_dir + "silva.txt.gz"]}} 27 | taxonomies["ott"] = {"class": OttTx, 28 | "params": {"files": [data_dir + "ott.tgz"]}} 29 | taxonomies["greengenes"] = {"class": GreengenesTx, 30 | "params": {"files": [data_dir + "gg.txt.gz"]}} 31 | taxonomies["custom"] = {"class": CustomTx, 32 | "params": {"files": [data_dir + 
"custom.tsv.gz", 33 | data_dir + "custom2.tsv.gz"]}} 34 | 35 | @classmethod 36 | def setUpClass(self): 37 | setup_dir(self.tmp_dir) 38 | 39 | def test_basic(self): 40 | """ 41 | Basic test with files 42 | """ 43 | for t in self.taxonomies: 44 | tax = self.taxonomies[t]["class"](**self.taxonomies[t]["params"]) 45 | self.assertGreater(tax.stats()["nodes"], 0, t + " failed") 46 | 47 | def test_print(self): 48 | """ 49 | Test output of printing tax object instance 50 | """ 51 | for t in self.taxonomies: 52 | tax = self.taxonomies[t]["class"](**self.taxonomies[t]["params"]) 53 | out = io.StringIO() 54 | sys.stdout = out 55 | print(tax) 56 | sys.stdout = sys.__stdout__ 57 | self.assertEqual(out.getvalue().lower().startswith(t), True) 58 | 59 | def test_urls(self): 60 | """ 61 | Using urls instead of files 62 | """ 63 | for t in self.taxonomies: 64 | # simulate url with "file://" and absolute path 65 | urls = ["file://" + os.path.abspath(file) 66 | for file in self.taxonomies[t]["params"]["files"]] 67 | tax = self.taxonomies[t]["class"](urls=urls) 68 | self.assertGreater( 69 | tax.stats()["nodes"], 0, t + " failed with urls") 70 | 71 | def test_fail_to_download(self): 72 | """ 73 | Using wrong urls should fail (using ncbi) 74 | """ 75 | with self.assertRaises(Exception): 76 | with self.assertWarns(UserWarning): 77 | tax = self.taxonomies["ncbi"]["class"]( 78 | urls=["www.thisisnotawebsite.com/neither/a/file", "fasfafsafasfasf"]) 79 | 80 | def test_urls_output_prefix(self): 81 | """ 82 | Using urls and saving files on disk 83 | """ 84 | for t in self.taxonomies: 85 | # simulate url with "file://" and absolute path 86 | urls = ["file://" + os.path.abspath(file) 87 | for file in self.taxonomies[t]["params"]["files"]] 88 | tax = self.taxonomies[t]["class"]( 89 | urls=urls, output_prefix=self.tmp_dir) 90 | self.assertGreater( 91 | tax.stats()["nodes"], 0, t + " failed with urls and output_prefix") 92 | 93 | def test_gzip_uncompressed(self): 94 | """ 95 | Using uncompressed gzip files ("gtdb", "silva", "greengenes", "custom") 96 | """ 97 | for t in self.taxonomies: 98 | if t in ["gtdb", "silva", "greengenes", "custom"]: 99 | uncompressed = [] 100 | for file in self.taxonomies[t]["params"]["files"]: 101 | if file.endswith(".gz"): 102 | outfile = self.tmp_dir + os.path.basename(file)[:-3] 103 | uncompress_gzip(file, outfile) 104 | uncompressed.append(outfile) 105 | 106 | if uncompressed: 107 | # Check if results are equal with compressed and uncompressed files 108 | tax_compressed = self.taxonomies[t]["class"]( 109 | **self.taxonomies[t]["params"]) 110 | tax_uncompressed = self.taxonomies[t]["class"]( 111 | files=uncompressed) 112 | self.assertEqual(tax_compressed.stats(), tax_uncompressed.stats( 113 | ), t + " failed with uncompressed files") 114 | 115 | def test_tar_gzip_uncompressed_ncbi(self): 116 | """ 117 | Using uncompressed tar gzip files for ncbi 118 | """ 119 | 120 | # Ncbi 121 | tax_compressed = self.taxonomies["ncbi"]["class"]( 122 | **self.taxonomies["ncbi"]["params"]) 123 | uncompressed_files = uncompress_tar_gzip( 124 | f=self.taxonomies["ncbi"]["params"]["files"][0], outd=self.tmp_dir) 125 | self.assertIn("nodes.dmp", uncompressed_files) 126 | self.assertIn("names.dmp", uncompressed_files) 127 | self.assertIn("merged.dmp", uncompressed_files) 128 | tax_uncompressed = self.taxonomies["ncbi"]["class"](files=[self.tmp_dir + "nodes.dmp", 129 | self.tmp_dir + "names.dmp", 130 | self.tmp_dir + "merged.dmp"]) 131 | # Results of compressed and uncompressed should match 132 | 
self.assertEqual(tax_uncompressed.stats(), tax_compressed.stats()) 133 | 134 | # Ncbi with extended names 135 | ext_ncbi_conf = self.taxonomies["ncbi"].copy() 136 | ext_ncbi_conf["params"]["extended_names"] = True 137 | tax_compressed = ext_ncbi_conf["class"](**ext_ncbi_conf["params"]) 138 | uncompressed_files = uncompress_tar_gzip( 139 | f=ext_ncbi_conf["params"]["files"][0], outd=self.tmp_dir) 140 | self.assertIn("nodes.dmp", uncompressed_files) 141 | self.assertIn("names.dmp", uncompressed_files) 142 | self.assertIn("merged.dmp", uncompressed_files) 143 | tax_uncompressed = ext_ncbi_conf["class"](files=[self.tmp_dir + "nodes.dmp", 144 | self.tmp_dir + "names.dmp", 145 | self.tmp_dir + "merged.dmp"], 146 | extended_names=True) 147 | # Results of compressed and uncompressed should match 148 | self.assertEqual(tax_uncompressed.stats(), tax_compressed.stats()) 149 | 150 | def test_tar_gzip_uncompressed_ott(self): 151 | """ 152 | Using uncompressed tar gzip files for ott 153 | """ 154 | # Ott 155 | tax_compressed = self.taxonomies["ott"]["class"]( 156 | **self.taxonomies["ott"]["params"]) 157 | uncompressed_files = uncompress_tar_gzip( 158 | f=self.taxonomies["ott"]["params"]["files"][0], outd=self.tmp_dir) 159 | self.assertIn("taxonomy.tsv", uncompressed_files) 160 | self.assertIn("forwards.tsv", uncompressed_files) 161 | tax_uncompressed = self.taxonomies["ott"]["class"](files=[self.tmp_dir + "taxonomy.tsv", 162 | self.tmp_dir + "forwards.tsv"]) 163 | # Results of compressed and uncompressed should match 164 | self.assertEqual(tax_uncompressed.stats(), tax_compressed.stats()) 165 | 166 | # Ott with extended names (synonyms.tsv) 167 | ext_ott_conf = self.taxonomies["ott"].copy() 168 | ext_ott_conf["params"]["extended_names"] = True 169 | tax_compressed = ext_ott_conf["class"](**ext_ott_conf["params"]) 170 | uncompressed_files = uncompress_tar_gzip( 171 | f=ext_ott_conf["params"]["files"][0], outd=self.tmp_dir) 172 | self.assertIn("taxonomy.tsv", uncompressed_files) 173 | self.assertIn("forwards.tsv", uncompressed_files) 174 | self.assertIn("synonyms.tsv", uncompressed_files) 175 | tax_uncompressed = ext_ott_conf["class"](files=[self.tmp_dir + "taxonomy.tsv", 176 | self.tmp_dir + "forwards.tsv", 177 | self.tmp_dir + "synonyms.tsv"], 178 | extended_names=True) 179 | # Results of compressed and uncompressed should match 180 | self.assertEqual(tax_uncompressed.stats(), tax_compressed.stats()) 181 | 182 | def test_inconsistent(self): 183 | """ 184 | Test parsing inconsistent taxonomies 185 | """ 186 | for t in self.taxonomies: 187 | # Delete root 188 | tax = self.taxonomies[t]["class"](**self.taxonomies[t]["params"]) 189 | tax.remove(tax.root_node) 190 | with self.assertRaises(ValueError): 191 | tax.check_consistency() 192 | 193 | # Delete random node (parent from random leaf) 194 | tax = self.taxonomies[t]["class"](**self.taxonomies[t]["params"]) 195 | tax.remove(tax.parent(random.choice(tax.leaves()))) 196 | with self.assertRaises(ValueError): 197 | tax.check_consistency() 198 | 199 | # Delete random leaf (do not generate inconsistency) 200 | tax = self.taxonomies[t]["class"](**self.taxonomies[t]["params"]) 201 | tax._remove(random.choice(tax.leaves())) 202 | self.assertEqual(tax.check_consistency(), None) 203 | -------------------------------------------------------------------------------- /tests/multitax/integration/test_empty.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from multitax.multitax import MultiTax 3 | from multitax 
import DummyTx 4 | 5 | 6 | class TestDummy(unittest.TestCase): 7 | 8 | def test_multitax(self): 9 | tax = MultiTax() 10 | stats = tax.stats() 11 | # Only root node 12 | self.assertEqual(stats["nodes"], 1) 13 | # No input sources 14 | self.assertFalse(tax.sources) 15 | 16 | def test_dummy(self): 17 | tax = DummyTx() 18 | stats = tax.stats() 19 | # Only root node 20 | self.assertEqual(stats["nodes"], 1) 21 | # No input sources 22 | self.assertFalse(tax.sources) 23 | -------------------------------------------------------------------------------- /tests/multitax/integration/test_online.py: -------------------------------------------------------------------------------- 1 | from multitax import GreengenesTx, GtdbTx, NcbiTx, OttTx, SilvaTx, CustomTx 2 | from tests.multitax.utils import setup_dir, uncompress_gzip, uncompress_tar_gzip 3 | import unittest 4 | import os 5 | import sys 6 | import random 7 | 8 | sys.path.append("tests/multitax/") 9 | 10 | 11 | @unittest.skip('Skip online by default') 12 | class TestOnline(unittest.TestCase): 13 | 14 | tmp_dir = "tests/multitax/integration/tmp_online/" 15 | 16 | taxonomies = {} 17 | taxonomies["gtdb"] = {"class": GtdbTx} 18 | taxonomies["ncbi"] = {"class": NcbiTx} 19 | taxonomies["silva"] = {"class": SilvaTx} 20 | taxonomies["ott"] = {"class": OttTx} 21 | taxonomies["greengenes"] = {"class": GreengenesTx} 22 | # todo test online custom 23 | 24 | @classmethod 25 | def setUpClass(self): 26 | setup_dir(self.tmp_dir) 27 | 28 | def test_online_default(self): 29 | """ 30 | Default test online 31 | """ 32 | for t in self.taxonomies: 33 | tax = self.taxonomies[t]["class"]() 34 | self.assertGreater(tax.stats()["nodes"], 0, t + " failed") 35 | 36 | def test_online_output_prefix(self): 37 | """ 38 | Saving files on disk 39 | """ 40 | for t in self.taxonomies: 41 | tax = self.taxonomies[t]["class"](output_prefix=self.tmp_dir) 42 | self.assertGreater( 43 | tax.stats()["nodes"], 0, t + " failed with urls and output_prefix") 44 | -------------------------------------------------------------------------------- /tests/multitax/unit/test_functions.py: -------------------------------------------------------------------------------- 1 | from multitax.utils import check_file 2 | from multitax import * 3 | from tests.multitax.utils import setup_dir 4 | import unittest 5 | 6 | 7 | class TestFunctions(unittest.TestCase): 8 | # test data (14 nodes) 9 | # 10 | # rank-1 (root) 1 ___________ 11 | # / \ \ 12 | # rank-2 2.1 2.2 ______ \ 13 | # / \ \ \ \ 14 | # rank-3 3.1 3.2 3.4 \ \ 15 | # / / \ \ \ \ 16 | # rank-4 *4.1 *4.2 *4.3 *4.4 *4.5 *4.6 17 | # / | 18 | # rank-5 *5.1 *5.2 19 | # 20 | # names: 1: Node1, 2.1: Node2.1, ...,5.2: Node5.2 21 | 22 | test_file = "tests/multitax/data_minimal/custom_unit_test.tsv.gz" 23 | tmp_dir = "tests/multitax/unit/tmp_functions/" 24 | 25 | @classmethod 26 | def setUpClass(self): 27 | setup_dir(self.tmp_dir) 28 | 29 | def test_children(self): 30 | """ 31 | test children function 32 | """ 33 | tax = CustomTx(files=self.test_file) 34 | self.assertCountEqual(tax.children("1"), ["2.1", "2.2", "4.6"]) 35 | self.assertCountEqual(tax.children("2.1"), ["3.1", "3.2"]) 36 | self.assertCountEqual(tax.children("2.2"), ["3.4", "4.5"]) 37 | self.assertCountEqual(tax.children("4.4"), ["5.1", "5.2"]) 38 | self.assertCountEqual(tax.children("5.2"), []) 39 | self.assertCountEqual(tax.children("XXX"), []) 40 | 41 | def test_search_name(self): 42 | """ 43 | test search_name function 44 | """ 45 | 46 | # Exact matches 47 | tax = CustomTx(files=self.test_file) 48 | 
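        # Note: the assertions below rely on search_name() matching complete names by
        # default ("Node2." yields nothing); partial matching is exercised further
        # down with exact=False.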
self.assertCountEqual(tax.search_name("Node1"), ["1"]) 49 | self.assertCountEqual(tax.search_name("Node2.1"), ["2.1"]) 50 | self.assertCountEqual(tax.search_name("Node5.2"), ["5.2"]) 51 | self.assertCountEqual(tax.search_name("Node2."), []) 52 | 53 | # not exact matches 54 | tax = CustomTx(files=self.test_file) 55 | self.assertCountEqual(tax.search_name( 56 | "Node2", exact=False), ["2.1", "2.2"]) 57 | self.assertCountEqual(tax.search_name("Node2", exact=True), []) 58 | self.assertCountEqual(tax.search_name("Node1", exact=False), ["1"]) 59 | self.assertCountEqual(tax.search_name("NotThere", exact=False), []) 60 | 61 | # Changing root name 62 | tax = CustomTx(files=self.test_file, root_name="AnotherRootName") 63 | self.assertCountEqual(tax.search_name("Node1", exact=False), []) 64 | self.assertCountEqual(tax.search_name( 65 | "AnotherRootName", exact=True), ["1"]) 66 | self.assertCountEqual(tax.search_name("Another", exact=False), ["1"]) 67 | 68 | # With specific rank 69 | tax = CustomTx(files=self.test_file) 70 | self.assertCountEqual(tax.search_name( 71 | "Node2.1", exact=True, rank="rank-2"), ["2.1"]) 72 | self.assertCountEqual(tax.search_name( 73 | "Node4.4", exact=True, rank="rank-4"), ["4.4"]) 74 | self.assertCountEqual(tax.search_name( 75 | "Node", exact=False, rank="rank-5"), ["5.1", "5.2"]) 76 | self.assertCountEqual(tax.search_name( 77 | "Node2.1", exact=True, rank="rank-3"), []) 78 | self.assertCountEqual(tax.search_name( 79 | "Node4.4", exact=True, rank="rank-1"), []) 80 | self.assertCountEqual(tax.search_name( 81 | "Node5", exact=False, rank="rank-XXX"), []) 82 | 83 | def test_nodes_rank(self): 84 | """ 85 | test nodes_rank function 86 | """ 87 | tax = CustomTx(files=self.test_file) 88 | self.assertCountEqual(tax.nodes_rank("rank-1"), ["1"]) 89 | self.assertCountEqual(tax.nodes_rank("rank-4"), 90 | ["4.1", "4.2", "4.3", "4.4", "4.5", "4.6"]) 91 | self.assertCountEqual(tax.nodes_rank("rank-9999"), []) 92 | 93 | def test_parent(self): 94 | """ 95 | test parent function 96 | """ 97 | tax = CustomTx(files=self.test_file) 98 | self.assertEqual(tax.parent("1"), tax.root_parent) 99 | self.assertEqual(tax.parent("3.2"), "2.1") 100 | self.assertEqual(tax.parent("5.2"), "4.4") 101 | self.assertEqual(tax.parent("PpQqRr"), tax.undefined_node) 102 | 103 | tax = CustomTx(files=self.test_file, undefined_node="NoNode") 104 | self.assertEqual(tax.parent("ABVCDE"), "NoNode") 105 | 106 | def test_rank(self): 107 | """ 108 | test rank function 109 | """ 110 | tax = CustomTx(files=self.test_file) 111 | self.assertEqual(tax.rank("4.1"), "rank-4") 112 | self.assertEqual(tax.rank("1"), "rank-1") 113 | self.assertEqual(tax.rank("5.2"), "rank-5") 114 | self.assertEqual(tax.rank("what"), tax.undefined_rank) 115 | 116 | tax = CustomTx(files=self.test_file, undefined_rank="NoRank") 117 | self.assertEqual(tax.rank("ABVCDE"), "NoRank") 118 | 119 | def test_name(self): 120 | """ 121 | test name function 122 | """ 123 | tax = CustomTx(files=self.test_file) 124 | self.assertEqual(tax.name("4.1"), "Node4.1") 125 | self.assertEqual(tax.name("1"), "Node1") 126 | self.assertEqual(tax.name("2.2"), "Node2.2") 127 | self.assertEqual(tax.name("ABVCDE"), tax.undefined_name) 128 | 129 | tax = CustomTx(files=self.test_file, undefined_name="NoName") 130 | self.assertEqual(tax.name("ABVCDE"), "NoName") 131 | 132 | def test_latest(self): 133 | """ 134 | test latest function 135 | """ 136 | tax = CustomTx(files=self.test_file) 137 | self.assertEqual(tax.latest("4.1"), "4.1") 138 | self.assertEqual(tax.latest("1"), "1") 139 | 
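        # For a custom taxonomy without merged/forwarded identifiers, latest() is
        # expected to return the queried node itself, or undefined_node if unknown.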
self.assertEqual(tax.latest("4.6"), "4.6") 140 | self.assertEqual(tax.latest("XxXxXx"), tax.undefined_node) 141 | 142 | def test_leaves(self): 143 | """ 144 | test leaves function 145 | """ 146 | tax = CustomTx(files=self.test_file) 147 | self.assertCountEqual( 148 | tax.leaves(), ["4.1", "4.2", "4.3", "4.5", "4.6", "5.1", "5.2"]) 149 | self.assertCountEqual(tax.leaves( 150 | "1"), ["4.1", "4.2", "4.3", "5.1", "5.2", "4.5", "4.6"]) 151 | self.assertCountEqual(tax.leaves("2.2"), ["5.1", "5.2", "4.5"]) 152 | self.assertCountEqual(tax.leaves("4.4"), ["5.1", "5.2"]) 153 | self.assertCountEqual(tax.leaves("5.1"), ["5.1"]) 154 | self.assertCountEqual(tax.leaves("999.999"), []) 155 | 156 | def test_lineage(self): 157 | """ 158 | test lineage function 159 | """ 160 | tax = CustomTx(files=self.test_file) 161 | # Use only assertEqual instead of assertCountEqual -> order matters 162 | self.assertEqual(tax.lineage("5.2"), ["1", "2.2", "3.4", "4.4", "5.2"]) 163 | self.assertEqual(tax.lineage("3.2"), ["1", "2.1", "3.2"]) 164 | self.assertEqual(tax.lineage("4.6"), ["1", "4.6"]) 165 | self.assertEqual(tax.lineage("1"), ["1"]) 166 | self.assertEqual(tax.lineage("9999"), []) 167 | 168 | # with ranks 169 | self.assertEqual(tax.lineage("5.2", ranks=["rank-1", "rank-3", "rank-5"]), 170 | ["1", "3.4", "5.2"]) 171 | self.assertEqual(tax.lineage("5.2", ranks=["rank-3", "rank-5", "rank-1"]), 172 | ["3.4", "5.2", "1"]) 173 | self.assertEqual(tax.lineage("4.5", ranks=["rank-1"]), 174 | ["1"]) 175 | self.assertEqual(tax.lineage("3.2", ranks=["rank-4", "rank-5"]), 176 | [tax.undefined_node, tax.undefined_node]) 177 | self.assertEqual(tax.lineage("4.5", ranks=["rank-1", "rank-2", "rank-3", "rank-4", "rank-5"]), 178 | ["1", "2.2", tax.undefined_node, "4.5", tax.undefined_node]) 179 | self.assertEqual(tax.lineage("4.6", ranks=["xxxx", "yyy"]), 180 | [tax.undefined_node, tax.undefined_node]) 181 | # Invalid lineage 182 | self.assertEqual(tax.lineage("ZZZ", ranks=["xxxx", "yyy"]), 183 | []) 184 | 185 | # with root_node 186 | self.assertEqual(tax.lineage("5.2", root_node="2.2"), 187 | ["2.2", "3.4", "4.4", "5.2"]) 188 | self.assertEqual(tax.lineage("4.2", root_node="2.1"), 189 | ["2.1", "3.2", "4.2"]) 190 | self.assertEqual(tax.lineage("4.5", root_node="2.2"), 191 | ["2.2", "4.5"]) 192 | # Invalid lineage 193 | self.assertEqual(tax.lineage("5.2", root_node="2.1"), 194 | []) 195 | self.assertEqual(tax.lineage("3.1", root_node="4.1"), 196 | []) 197 | self.assertEqual(tax.lineage("XXX", root_node="YYY"), 198 | []) 199 | 200 | # with both 201 | self.assertEqual(tax.lineage("5.2", root_node="2.2", ranks=["rank-3", "rank-4"]), 202 | ["3.4", "4.4"]) 203 | self.assertEqual(tax.lineage("5.1", root_node="3.4", ranks=["rank-3", "rank-5"]), 204 | ["3.4", "5.1"]) 205 | self.assertEqual(tax.lineage("4.1", root_node="2.1", ranks=["rank-1", "rank-2", "rank-3", "rank-5"]), 206 | [tax.undefined_node, "2.1", "3.1", tax.undefined_node]) 207 | self.assertEqual(tax.lineage("4.1", root_node="2.1", ranks=["rank-1", "rank-5"]), 208 | [tax.undefined_node, tax.undefined_node]) 209 | self.assertEqual(tax.lineage("4.1", root_node="2.1", ranks=["XXXXX"]), 210 | [tax.undefined_node]) 211 | # Invalid lineage 212 | self.assertEqual(tax.lineage("4.1", root_node="5.1", ranks=["rank-1", "rank-2", "rank-3", "rank-5"]), 213 | []) 214 | self.assertEqual(tax.lineage("XXXX", root_node="2.2", ranks=["rank-3", "rank-4"]), 215 | []) 216 | 217 | def test_rank_lineage(self): 218 | """ 219 | test rank_lineage function 220 | """ 221 | tax = CustomTx(files=self.test_file) 
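        # rank_lineage() mirrors lineage() but returns the rank of each node along
        # the lineage, in root-to-node order.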
222 | self.assertEqual(tax.rank_lineage("5.2"), [ 223 | "rank-1", "rank-2", "rank-3", "rank-4", "rank-5"]) 224 | self.assertEqual(tax.rank_lineage("4.6"), ["rank-1", "rank-4"]) 225 | self.assertEqual(tax.rank_lineage("1"), ["rank-1"]) 226 | self.assertEqual(tax.rank_lineage("9999"), []) 227 | 228 | # with ranks or root_node 229 | self.assertEqual(tax.rank_lineage("5.2", ranks=["rank-1", "rank-3"]), 230 | ["rank-1", "rank-3"]) 231 | self.assertEqual(tax.rank_lineage("5.2", ranks=["rank-1", "XXX", "rank-3"]), 232 | ["rank-1", tax.undefined_rank, "rank-3"]) 233 | self.assertEqual(tax.rank_lineage("ZZZ", ranks=["rank-1", "XXX", "rank-3"]), 234 | []) 235 | self.assertEqual(tax.rank_lineage("5.2", root_node="2.2"), 236 | ["rank-2", "rank-3", "rank-4", "rank-5"]) 237 | self.assertEqual(tax.rank_lineage("5.2", root_node="2.1"), 238 | []) 239 | self.assertEqual(tax.rank_lineage("XXX", root_node="YYY"), 240 | []) 241 | 242 | # with both 243 | self.assertEqual(tax.rank_lineage("5.2", root_node="2.2", ranks=["rank-3", "rank-4"]), 244 | ["rank-3", "rank-4"]) 245 | self.assertEqual(tax.rank_lineage("4.1", root_node="2.1", ranks=["rank-1", "rank-2", "rank-3", "rank-5"]), 246 | [tax.undefined_rank, "rank-2", "rank-3", tax.undefined_rank]) 247 | self.assertEqual(tax.rank_lineage("4.1", root_node="2.1", ranks=["rank-1", "rank-5"]), 248 | [tax.undefined_rank, tax.undefined_rank]) 249 | self.assertEqual(tax.rank_lineage("4.1", root_node="5.1", ranks=["rank-1", "rank-2", "rank-3", "rank-5"]), 250 | []) 251 | self.assertEqual(tax.rank_lineage("XXXX", root_node="ZZZ", ranks=["CCC", "VVV"]), 252 | []) 253 | 254 | def test_name_lineage(self): 255 | """ 256 | test rank_lineage function 257 | """ 258 | tax = CustomTx(files=self.test_file) 259 | self.assertEqual(tax.name_lineage("5.2"), [ 260 | "Node1", "Node2.2", "Node3.4", "Node4.4", "Node5.2"]) 261 | self.assertEqual(tax.name_lineage("4.6"), ["Node1", "Node4.6"]) 262 | self.assertEqual(tax.name_lineage("1"), ["Node1"]) 263 | self.assertEqual(tax.name_lineage("9999"), []) 264 | 265 | # with ranks or root_node 266 | self.assertEqual(tax.name_lineage("5.2", ranks=["rank-1", "rank-3"]), 267 | ["Node1", "Node3.4"]) 268 | self.assertEqual(tax.name_lineage("5.2", ranks=["rank-1", "XXX", "rank-3"]), 269 | ["Node1", tax.undefined_name, "Node3.4"]) 270 | self.assertEqual(tax.name_lineage("ZZZ", ranks=["rank-1", "XXX", "rank-3"]), 271 | []) 272 | self.assertEqual(tax.name_lineage("5.2", root_node="2.2"), 273 | ["Node2.2", "Node3.4", "Node4.4", "Node5.2"]) 274 | self.assertEqual(tax.name_lineage("5.2", root_node="2.1"), 275 | []) 276 | self.assertEqual(tax.name_lineage("XXX", root_node="YYY"), 277 | []) 278 | 279 | # with both 280 | self.assertEqual(tax.name_lineage("5.2", root_node="2.2", ranks=["rank-3", "rank-4"]), 281 | ["Node3.4", "Node4.4"]) 282 | self.assertEqual(tax.name_lineage("4.1", root_node="2.1", ranks=["rank-1", "rank-2", "rank-3", "rank-5"]), 283 | [tax.undefined_name, "Node2.1", "Node3.1", tax.undefined_name]) 284 | self.assertEqual(tax.name_lineage("4.1", root_node="2.1", ranks=["rank-1", "rank-5"]), 285 | [tax.undefined_name, tax.undefined_name]) 286 | self.assertEqual(tax.name_lineage("4.1", root_node="5.1", ranks=["rank-1", "rank-2", "rank-3", "rank-5"]), 287 | []) 288 | self.assertEqual(tax.name_lineage("XXXX", root_node="ZZZ", ranks=["CCC", "VVV"]), 289 | []) 290 | 291 | def test_parent_rank(self): 292 | """ 293 | test parent_rank function 294 | """ 295 | tax = CustomTx(files=self.test_file) 296 | self.assertEqual(tax.parent_rank("5.2", "rank-3"), 
"3.4") 297 | self.assertEqual(tax.parent_rank("4.1", "rank-2"), "2.1") 298 | self.assertEqual(tax.parent_rank("3.2", "rank-1"), "1") 299 | self.assertEqual(tax.parent_rank("3.2", "rank-4"), tax.undefined_node) 300 | self.assertEqual(tax.parent_rank("2.2", "XXXX"), tax.undefined_node) 301 | self.assertEqual(tax.parent_rank("CCCC", "XXXX"), tax.undefined_node) 302 | 303 | def test_closest_parent(self): 304 | """ 305 | test closest_parent function 306 | """ 307 | tax = CustomTx(files=self.test_file) 308 | self.assertEqual(tax.closest_parent( 309 | "5.2", ["rank-1", "rank-3"]), "3.4") 310 | self.assertEqual(tax.closest_parent( 311 | "5.2", ["rank-1", "rank-3", "rank-4"]), "4.4") 312 | self.assertEqual(tax.closest_parent( 313 | "5.2", ["rank-1", "rank-3", "rank-4", "rank-5"]), "5.2") 314 | self.assertEqual(tax.closest_parent( 315 | "5.2", ["rank-1", "rank-3", "rank-4", "rank-5", "XXXXX"]), "5.2") 316 | self.assertEqual(tax.closest_parent( 317 | "3.4", ["rank-1", "rank-4", "rank-5"]), "1") 318 | self.assertEqual(tax.closest_parent( 319 | "4.6", ["rank-1", "rank-2", "rank-3", "rank-5"]), "1") 320 | self.assertEqual(tax.closest_parent( 321 | "4.6", ["rank-2", "rank-3", "rank-5"]), tax.undefined_node) 322 | self.assertEqual(tax.closest_parent( 323 | "3.4", ["X", "Y", "Z"]), tax.undefined_node) 324 | self.assertEqual(tax.closest_parent("3.4", []), "3.4") 325 | 326 | def test_stats(self): 327 | """ 328 | test stats function 329 | """ 330 | tax = CustomTx(files=self.test_file) 331 | stats = tax.stats() 332 | self.assertEqual(stats["nodes"], 14) 333 | self.assertEqual(stats["names"], 14) 334 | self.assertEqual(stats["ranks"], 14) 335 | self.assertEqual(stats["leaves"], 7) 336 | self.assertEqual(len(stats["ranked_nodes"]), 5) 337 | self.assertEqual(sum(stats["ranked_nodes"].values()), stats["nodes"]) 338 | self.assertEqual(sum(stats["ranked_leaves"].values()), stats["leaves"]) 339 | self.assertCountEqual(list(stats["ranked_leaves"].keys()), [ 340 | "rank-4", "rank-5"]) 341 | 342 | def test_build_lineages(self): 343 | """ 344 | test build_lineages function 345 | """ 346 | # build full lineages 347 | tax = CustomTx(files=self.test_file) 348 | self.assertEqual(len(tax._lineages), 0) 349 | tax.build_lineages() 350 | self.assertEqual(len(tax._lineages), 14) 351 | self.assertEqual(tax.lineage("5.2"), ["1", "2.2", "3.4", "4.4", "5.2"]) 352 | self.assertEqual(tax.lineage("XXX"), []) 353 | # do not use stored lineage with keyword arguments 354 | self.assertEqual(tax.lineage("5.2", root_node="2.2"), 355 | ["2.2", "3.4", "4.4", "5.2"]) 356 | self.assertEqual(tax.lineage( 357 | "5.2", ranks=["rank-2", "rank-4"]), ["2.2", "4.4"]) 358 | self.assertEqual(tax.lineage("5.2", root_node="2.2", ranks=[ 359 | "rank-2", "rank-4"]), ["2.2", "4.4"]) 360 | 361 | # build filtered lineages 362 | tax.clear_lineages() 363 | self.assertEqual(len(tax._lineages), 0) 364 | tax.build_lineages(root_node="2.2", ranks=["rank-2", "rank-4"]) 365 | self.assertEqual(len(tax._lineages), 14) 366 | self.assertEqual(tax.lineage("5.2"), ["2.2", "4.4"]) 367 | self.assertEqual(tax.lineage("XXX"), []) 368 | # do not use stored lineage with keyword arguments 369 | self.assertEqual(tax.lineage("5.2", root_node="3.4"), 370 | ["3.4", "4.4", "5.2"]) 371 | self.assertEqual(tax.lineage("5.2", ranks=[]), [ 372 | "1", "2.2", "3.4", "4.4", "5.2"]) 373 | self.assertEqual(tax.lineage("5.2", root_node="2.2", ranks=[ 374 | "rank-2", "rank-5"]), ["2.2", "5.2"]) 375 | 376 | def test_clear_lineages(self): 377 | """ 378 | test clear_lineages function 379 | """ 380 | 
tax = CustomTx(files=self.test_file)
381 |         self.assertEqual(len(tax._lineages), 0)
382 |         tax.build_lineages()
383 |         self.assertEqual(len(tax._lineages), 14)
384 |         tax.clear_lineages()
385 |         self.assertEqual(len(tax._lineages), 0)
386 |         self.assertEqual(tax.lineage("5.2"), ["1", "2.2", "3.4", "4.4", "5.2"])
387 |         self.assertEqual(tax.lineage("XXX"), [])
388 | 
389 |     def test_translation(self):
390 |         """
391 |         test build_translation and translate functions (GTDB<->NCBI)
392 |         """
393 |         gtdb_tax = GtdbTx(files=["tests/multitax/data_minimal/gtdb_ar.tsv.gz",
394 |                                  "tests/multitax/data_minimal/gtdb_bac.tsv.gz"])
395 |         ncbi_tax = NcbiTx(files="tests/multitax/data_minimal/ncbi.tar.gz")
396 | 
397 |         # GTDB->NCBI
398 |         # Should be no translation yet (g__Paenibacillus is contained in both test sets)
399 |         self.assertCountEqual(gtdb_tax.translate("g__Paenibacillus"), [])
400 |         gtdb_tax.build_translation(ncbi_tax, files=[
401 |                                    "tests/multitax/data_minimal/gtdb_ar_metadata.tsv.gz", "tests/multitax/data_minimal/gtdb_bac_metadata.tsv.gz"])
402 |         self.assertCountEqual(gtdb_tax.translate(
403 |             "g__Paenibacillus"), ["44249"])
404 | 
405 |         # NCBI->GTDB
406 |         # Should be no translation yet (g__Paenibacillus is contained in both test sets)
407 |         self.assertCountEqual(ncbi_tax.translate("44249"), [])
408 |         ncbi_tax.build_translation(gtdb_tax, files=[
409 |                                    "tests/multitax/data_minimal/gtdb_ar_metadata.tsv.gz", "tests/multitax/data_minimal/gtdb_bac_metadata.tsv.gz"])
410 |         self.assertCountEqual(ncbi_tax.translate("44249"), ["g__Paenibacillus"])
411 | 
412 |         # Other translations not yet implemented
413 |         ott_tax = OttTx(files="tests/multitax/data_minimal/ott.tgz")
414 |         silva_tax = SilvaTx(files="tests/multitax/data_minimal/silva.txt.gz")
415 |         gg_tax = GreengenesTx(files="tests/multitax/data_minimal/gg.txt.gz")
416 |         with self.assertWarns(UserWarning):
417 |             ncbi_tax.build_translation(ott_tax)
418 |             ncbi_tax.build_translation(silva_tax)
419 |             ncbi_tax.build_translation(gg_tax)
420 |             gtdb_tax.build_translation(ott_tax)
421 |             gtdb_tax.build_translation(silva_tax)
422 |             gtdb_tax.build_translation(gg_tax)
423 |             ott_tax.build_translation(silva_tax)
424 |             ott_tax.build_translation(gg_tax)
425 |             ott_tax.build_translation(gtdb_tax)
426 |             ott_tax.build_translation(ncbi_tax)
427 |             gg_tax.build_translation(ott_tax)
428 |             gg_tax.build_translation(silva_tax)
429 |             gg_tax.build_translation(gtdb_tax)
430 |             gg_tax.build_translation(ncbi_tax)
431 | 
432 |     def test_check_consistency(self):
433 |         """
434 |         test check_consistency function
435 |         """
436 |         tax = CustomTx(files=self.test_file)
437 |         self.assertEqual(tax.check_consistency(), None)
438 |         # delete node
439 |         del tax._nodes["3.4"]
440 |         with self.assertRaises(ValueError):
441 |             tax.check_consistency()
442 | 
443 |         tax = CustomTx(files=self.test_file)
444 |         # delete leaf node
445 |         del tax._nodes["5.2"]
446 |         self.assertEqual(tax.check_consistency(), None)
447 | 
448 |         tax = CustomTx(files=self.test_file)
449 |         # delete root
450 |         del tax._nodes["1"]
451 |         # should raise error
452 |         with self.assertRaises(ValueError):
453 |             tax.check_consistency()
454 | 
455 |     def test_filter(self):
456 |         """
457 |         test filter function
458 |         """
459 |         # Ancestors
460 |         tax = CustomTx(files=self.test_file)
461 |         tax.filter("4.5")
462 |         self.assertEqual(tax.stats()["nodes"], 3)
463 |         self.assertCountEqual(tax.lineage("4.5"), ["1", "2.2", "4.5"])
464 |         self.assertCountEqual(tax.leaves("1"), ["4.5"])
465 | 
466 |         tax = CustomTx(files=self.test_file)
467 |         tax.filter(["4.5", "XXXX"])
468 | 
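        # Unknown nodes passed to filter() should be ignored: the result below
        # matches filtering "4.5" alone.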
self.assertEqual(tax.stats()["nodes"], 3) 469 | self.assertCountEqual(tax.lineage("4.5"), ["1", "2.2", "4.5"]) 470 | self.assertCountEqual(tax.leaves("1"), ["4.5"]) 471 | 472 | tax = CustomTx(files=self.test_file) 473 | tax.filter(["4.1", "5.1", "5.2"]) 474 | self.assertEqual(tax.stats()["nodes"], 9) 475 | self.assertCountEqual(tax.lineage("4.1"), ["1", "2.1", "3.1", "4.1"]) 476 | self.assertCountEqual(tax.leaves("1"), ["4.1", "5.1", "5.2"]) 477 | 478 | tax = CustomTx(files=self.test_file) 479 | tax.filter("XXXX") 480 | self.assertEqual(tax.stats()["nodes"], 1) 481 | 482 | # Descendants 483 | tax = CustomTx(files=self.test_file) 484 | tax.filter("3.4", desc=True) 485 | self.assertEqual(tax.stats()["nodes"], 5) 486 | self.assertCountEqual(tax.lineage("3.4"), ["1", "3.4"]) 487 | self.assertCountEqual(tax.leaves("1"), ["5.1", "5.2"]) 488 | 489 | tax = CustomTx(files=self.test_file) 490 | tax.filter(["XXXXX", "3.4"], desc=True) 491 | self.assertEqual(tax.stats()["nodes"], 5) 492 | self.assertCountEqual(tax.lineage("3.4"), ["1", "3.4"]) 493 | self.assertCountEqual(tax.leaves("1"), ["5.1", "5.2"]) 494 | 495 | tax = CustomTx(files=self.test_file) 496 | tax.filter(["3.2", "4.4"], desc=True) 497 | self.assertEqual(tax.stats()["nodes"], 7) 498 | self.assertCountEqual(tax.lineage("5.2"), ["1", "4.4", "5.2"]) 499 | self.assertCountEqual(tax.lineage("4.5"), []) 500 | self.assertCountEqual(tax.leaves("1"), ["4.2", "4.3", "5.1", "5.2"]) 501 | 502 | tax = CustomTx(files=self.test_file) 503 | self.assertEqual(tax.stats()["nodes"], 14) 504 | tax.filter("XXXXX", desc=True) 505 | self.assertEqual(tax.stats()["nodes"], 1) 506 | 507 | def test_add(self): 508 | """ 509 | test add function 510 | """ 511 | tax = CustomTx(files=self.test_file) 512 | # Add leaf node 5.3 to parent 4.4 513 | tax.add("5.3", "4.4") 514 | self.assertEqual(tax.check_consistency(), None) 515 | self.assertEqual(tax.parent("5.3"), "4.4") 516 | self.assertEqual(tax.name("5.3"), tax.undefined_name) 517 | self.assertEqual(tax.rank("5.3"), tax.undefined_rank) 518 | 519 | # Add another leaf on the 5.3 with name and rank 520 | tax.add("6.1", "5.3", name="Node6.1", rank="rank-6") 521 | self.assertEqual(tax.check_consistency(), None) 522 | self.assertEqual(tax.parent("6.1"), "5.3") 523 | self.assertEqual(tax.name("6.1"), "Node6.1") 524 | self.assertEqual(tax.rank("6.1"), "rank-6") 525 | self.assertEqual(tax.lineage("6.1"), [ 526 | "1", "2.2", "3.4", "4.4", "5.3", "6.1"]) 527 | 528 | # Add node without valid parent, raises ValueError 529 | with self.assertRaises(ValueError): 530 | tax.add("6.2", "XXX") 531 | 532 | # Add already existing node 533 | with self.assertRaises(ValueError): 534 | tax.add("5.1", "4.4") 535 | 536 | def test_remove(self): 537 | """ 538 | test remove function 539 | """ 540 | tax = CustomTx(files=self.test_file) 541 | tax.remove("5.2") 542 | self.assertEqual(tax.latest("5.2"), tax.undefined_node) 543 | self.assertEqual(tax.parent("5.2"), tax.undefined_node) 544 | self.assertEqual(tax.name("5.2"), tax.undefined_node) 545 | self.assertEqual(tax.rank("5.2"), tax.undefined_node) 546 | self.assertEqual(tax.lineage("5.2"), []) 547 | 548 | # Initialize aux structures and clear them after removing node 549 | tax = CustomTx(files=self.test_file, build_name_nodes=True, 550 | build_node_children=True, build_rank_nodes=True) 551 | self.assertNotEqual(len(tax._name_nodes), 0) 552 | self.assertNotEqual(len(tax._node_children), 0) 553 | self.assertNotEqual(len(tax._rank_nodes), 0) 554 | tax.remove("5.2") 555 | 
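        # Removing a node is expected to reset the auxiliary lookup structures
        # built at init (name, children and rank indexes), as asserted below.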
self.assertEqual(len(tax._name_nodes), 0)
556 |         self.assertEqual(len(tax._node_children), 0)
557 |         self.assertEqual(len(tax._rank_nodes), 0)
558 | 
559 |         # with check_consistency
560 |         tax.remove("5.1", check_consistency=True)
561 | 
562 |         # Removing node that breaks the tree (allowed)
563 |         tax.remove("3.1")
564 |         # node is removed anyway
565 |         self.assertEqual(tax.latest("3.1"), tax.undefined_node)
566 |         with self.assertRaises(ValueError):
567 |             tax.check_consistency()
568 | 
569 |         # Removing and raising exception
570 |         with self.assertRaises(ValueError):
571 |             tax.remove("3.2", check_consistency=True)
572 |         # node is removed anyway
573 |         self.assertEqual(tax.latest("3.2"), tax.undefined_node)
574 | 
575 |         # Removing root
576 |         tax.remove("1")
577 |         with self.assertRaises(ValueError):
578 |             tax.check_consistency()
579 | 
580 |         # Removing node not present
581 |         with self.assertRaises(ValueError):
582 |             tax.remove("XXX")
583 | 
584 |     def test_prune(self):
585 |         """
586 |         test prune function
587 |         """
588 |         tax = CustomTx(files=self.test_file)
589 | 
590 |         self.assertCountEqual(tax.leaves("4.4"), ["5.1", "5.2"])
591 |         tax.prune("4.4")
592 |         self.assertEqual(tax.check_consistency(), None)
593 |         self.assertCountEqual(tax.leaves("4.4"), ["4.4"])
594 | 
595 |         # Prune leaf node (nothing changes)
596 |         self.assertCountEqual(tax.leaves("4.6"), ["4.6"])
597 |         tax.prune("4.6")
598 |         self.assertEqual(tax.check_consistency(), None)
599 |         self.assertCountEqual(tax.leaves("4.6"), ["4.6"])
600 | 
601 |         # Prune multiple overlapping nodes
602 |         self.assertCountEqual(tax.leaves("2.1"), ["4.1", "4.2", "4.3"])
603 |         self.assertCountEqual(tax.leaves("3.2"), ["4.2", "4.3"])
604 |         tax.prune(["2.1", "3.2"])
605 |         self.assertEqual(tax.check_consistency(), None)
606 |         self.assertCountEqual(tax.leaves("2.1"), ["2.1"])
607 |         self.assertCountEqual(tax.leaves("3.2"), [])
608 | 
609 |         # Restart tax
610 |         tax = CustomTx(files=self.test_file)
611 |         # Prune multiple overlapping nodes (reversed)
612 |         self.assertCountEqual(tax.leaves("2.1"), ["4.1", "4.2", "4.3"])
613 |         self.assertCountEqual(tax.leaves("3.2"), ["4.2", "4.3"])
614 |         tax.prune(["3.2", "2.1"])
615 |         self.assertEqual(tax.check_consistency(), None)
616 |         self.assertCountEqual(tax.leaves("2.1"), ["2.1"])
617 |         self.assertCountEqual(tax.leaves("3.2"), [])
618 | 
619 |         # Pruning node not present
620 |         with self.assertRaises(ValueError):
621 |             tax.prune("XXX")
622 | 
623 |         # Pruning root node
624 |         tax.prune(tax.root_node)
625 |         self.assertEqual(len(tax._nodes), 1)
626 | 
627 |     def test_write(self):
628 |         """
629 |         test write function
630 |         """
631 |         tax = CustomTx(files=self.test_file)
632 |         outfile = self.tmp_dir + "default.tsv"
633 |         tax.write(outfile)
634 |         self.assertEqual(check_file(outfile), None)
635 | 
636 |         tax = CustomTx(files=self.test_file)
637 |         outfile = self.tmp_dir + "ranks.tsv"
638 |         tax.write(outfile,
639 |                   ranks=["rank-2", "rank-4"],
640 |                   cols=["node", "rank", "lineage", "rank_lineage", "name_lineage"])
641 |         self.assertEqual(check_file(outfile), None)
642 | 
643 |         tax = CustomTx(files=self.test_file)
644 |         outfile = self.tmp_dir + "all_cols.tsv"
645 |         tax.write(outfile,
646 |                   cols=["node", "latest", "parent", "rank", "name", "leaves", "children", "lineage", "rank_lineage", "name_lineage"])
647 |         self.assertEqual(check_file(outfile), None)
648 | 
649 |         tax = CustomTx(files=self.test_file)
650 |         outfile = self.tmp_dir + "sep_comma.tsv"
651 |         tax.write(outfile,
652 |                   sep=",")
653 |         self.assertEqual(check_file(outfile), None)
654 | 
655 |         tax = 
CustomTx(files=self.test_file) 656 | outfile = self.tmp_dir + "sep_multi_underline.tsv" 657 | tax.write(outfile, 658 | cols=["node", "lineage", "children", "leaves"], 659 | sep_multi="_") 660 | self.assertEqual(check_file(outfile), None) 661 | 662 | def test_ott_forwards(self): 663 | """ 664 | Test forwards functionality (ott only) 665 | """ 666 | # forwards.tsv 667 | # id replacement 668 | # 5044012 4603004 669 | # 391495 391494 670 | 671 | tax = OttTx(files="tests/multitax/data_minimal/ott.tgz") 672 | self.assertEqual(len(tax._forwards), 2) 673 | 674 | self.assertEqual(tax.parent("5044012"), tax.undefined_node) 675 | self.assertEqual(tax.latest("5044012"), "4603004") 676 | self.assertNotEqual(tax.parent( 677 | tax.latest("5044012")), tax.undefined_node) 678 | 679 | self.assertEqual(tax.parent("391495"), tax.undefined_node) 680 | self.assertEqual(tax.latest("391495"), "391494") 681 | self.assertNotEqual(tax.parent( 682 | tax.latest("391495")), tax.undefined_node) 683 | 684 | def test_ncbi_merged(self): 685 | """ 686 | Test merged functionality (ncbi only) 687 | """ 688 | # merged.dmp 689 | # 1235230 | 459525 | 690 | # 1235908 | 363999 | 691 | 692 | tax = NcbiTx(files="tests/multitax/data_minimal/ncbi.tar.gz") 693 | self.assertEqual(len(tax._merged), 2) 694 | 695 | self.assertEqual(tax.parent("1235230"), tax.undefined_node) 696 | self.assertEqual(tax.latest("1235230"), "459525") 697 | self.assertNotEqual(tax.parent( 698 | tax.latest("1235230")), tax.undefined_node) 699 | 700 | self.assertEqual(tax.parent("1235908"), tax.undefined_node) 701 | self.assertEqual(tax.latest("1235908"), "363999") 702 | self.assertNotEqual(tax.parent( 703 | tax.latest("1235908")), tax.undefined_node) 704 | 705 | def test_ncbi_extended_names(self): 706 | """ 707 | Test extended names functionality (ncbi) 708 | """ 709 | # on names.dmp 710 | # 363999 | Xylariaceae sp. 5129 | | includes | 711 | # 363999 | Xylariaceae sp. 5151 | | includes | 712 | # 363999 | Xylariaceae sp. 
5228 | | includes | 713 | # 37990 | mitosporic Xylariaceae | | includes | 714 | # 37990 | Xylariaceae | | scientific name | 715 | 716 | tax = NcbiTx(files="tests/multitax/data_minimal/ncbi.tar.gz", 717 | extended_names=False) 718 | tax_ex = NcbiTx( 719 | files="tests/multitax/data_minimal/ncbi.tar.gz", extended_names=True) 720 | 721 | # Exact match on scientific name 722 | self.assertCountEqual(tax.search_name("Xylariaceae"), ["37990"]) 723 | self.assertCountEqual(tax_ex.search_name("Xylariaceae"), ["37990"]) 724 | # All scientific names 725 | self.assertCountEqual(tax.search_name( 726 | "Xylariaceae", exact=False), ["37990"]) 727 | self.assertCountEqual(tax_ex.search_name( 728 | "Xylariaceae", exact=False), ["37990"]) 729 | # Exact match on scientific name forcing extended 730 | self.assertCountEqual(tax.search_name("Xylariaceae"), ["37990"]) 731 | self.assertCountEqual(tax_ex.search_name( 732 | "Xylariaceae", force_extended=True), ["37990"]) 733 | # All names 734 | self.assertCountEqual(tax.search_name( 735 | "Xylariaceae", exact=False), ["37990"]) 736 | self.assertCountEqual(tax_ex.search_name( 737 | "Xylariaceae", exact=False, force_extended=True), ["37990", "363999"]) 738 | # Exact name available only on extended 739 | self.assertCountEqual(tax.search_name( 740 | "mitosporic Xylariaceae", exact=True), []) 741 | self.assertCountEqual(tax_ex.search_name( 742 | "mitosporic Xylariaceae", exact=True), ["37990"]) 743 | # Partial name available only on extended 744 | self.assertCountEqual(tax.search_name( 745 | "Xylariaceae sp.", exact=False), []) 746 | self.assertCountEqual(tax_ex.search_name( 747 | "Xylariaceae sp.", exact=False), ["363999"]) 748 | 749 | def test_ott_extended_names(self): 750 | """ 751 | Test extended names functionality (ott) 752 | """ 753 | # on taxonomy.tsv 754 | # 4622 | 470454 | Haemophilus sp. CCUG 32367 | species | silva:EU909664,ncbi:554010 | | sibling_higher | 755 | # 4621 | 470454 | Haemophilus sp. CCUG 35214 | species | silva:EU909665,ncbi:554011 | | sibling_higher | 756 | # 158636 | 470454 | Haemophilus sp. CCUG 30218 | species | silva:EU909662,ncbi:554007 | | sibling_higher | 757 | # 391494 | 470454 | Haemophilus sp. CCUG 31732 | species | silva:EU909663,ncbi:554009 | | sibling_higher | 758 | # 525972 | 470454 | Haemophilus pittmaniae HK 85 | no rank - terminal | silva:AFUV01000004,ncbi:1035188 | 759 | # 788108 | 470454 | Haemophilus sputorum | species | silva:JF506644,ncbi:1078480,gbif:7522132 | 760 | # 470454 | 1098176 | Haemophilus | genus | silva:A16379/#6,ncbi:724,worms:571392,gbif:3219815,irmng:1307220 | | | 761 | # on synonyms.tsv 762 | # Hemophilus | 470454 | synonym | Hemophilus (synonym for Haemophilus) | gbif:3219815,irmng:1307220 | 763 | # Haemophilus sp. HK 85 | 525972 | equivalent name | Haemophilus sp. HK 85 (synonym for Haemophilus pittmaniae HK 85) | ncbi:1035188 | 764 | # Haemophilus sp. CCUG 26672 | 788108 | includes | Haemophilus sp. CCUG 26672 (synonym for Haemophilus sputorum) | ncbi:1078480 | 765 | # Haemophilus sp. CCUG 47809 | 788108 | includes | Haemophilus sp. 
CCUG 47809 (synonym for Haemophilus sputorum) | ncbi:1078480 | 766 | 767 | tax = OttTx(files="tests/multitax/data_minimal/ott.tgz", 768 | extended_names=False) 769 | tax_ex = OttTx( 770 | files="tests/multitax/data_minimal/ott.tgz", extended_names=True) 771 | 772 | # Exact match on scientific name 773 | self.assertCountEqual(tax.search_name("Haemophilus"), ["470454"]) 774 | self.assertCountEqual(tax_ex.search_name("Haemophilus"), ["470454"]) 775 | # All scientific names 776 | self.assertCountEqual(tax.search_name("Haemophilus sp.", exact=False), [ 777 | "391494", "158636", "4621", "4622"]) 778 | self.assertCountEqual(tax_ex.search_name("Haemophilus sp.", exact=False), [ 779 | "391494", "158636", "4621", "4622"]) 780 | # Exact match on scientific name forcing extended 781 | self.assertCountEqual(tax.search_name("Haemophilus"), ["470454"]) 782 | self.assertCountEqual(tax_ex.search_name( 783 | "Haemophilus", force_extended=True), ["470454"]) 784 | # All names 785 | self.assertCountEqual(tax.search_name("Haemophilus sp. CCUG", exact=False), [ 786 | "391494", "158636", "4621", "4622"]) 787 | self.assertCountEqual(tax_ex.search_name("Haemophilus sp. CCUG", exact=False, force_extended=True), [ 788 | "391494", "158636", "4621", "4622", "788108"]) 789 | # Exact name available only on extended 790 | self.assertCountEqual(tax.search_name( 791 | "Haemophilus sp. HK 85", exact=True), []) 792 | self.assertCountEqual(tax_ex.search_name( 793 | "Haemophilus sp. HK 85", exact=True), ["525972"]) 794 | # Partial name available only on extended 795 | self.assertCountEqual(tax.search_name("CCUG 26672", exact=False), []) 796 | self.assertCountEqual(tax_ex.search_name( 797 | "CCUG 26672", exact=False), ["788108"]) 798 | -------------------------------------------------------------------------------- /tests/multitax/unit/test_init.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from multitax import CustomTx 3 | from multitax.multitax import MultiTax 4 | 5 | 6 | class TestInit(unittest.TestCase): 7 | # test data (14 nodes) 8 | # 9 | # rank-1 (root) 1 ___________ 10 | # / \ \ 11 | # rank-2 2.1 2.2 ______ \ 12 | # / \ \ \ \ 13 | # rank-3 3.1 3.2 3.4 \ \ 14 | # / / \ \ \ \ 15 | # rank-4 *4.1 *4.2 *4.3 *4.4 *4.5 *4.6 16 | # / | 17 | # rank-5 *5.1 *5.2 18 | # 19 | # names: 1: Node1, 2.1: Node2.1, ...,5.2: Node5.2 20 | 21 | test_file = "tests/multitax/data_minimal/custom_unit_test.tsv.gz" 22 | 23 | def test_default(self): 24 | """ 25 | test default values on empty init 26 | """ 27 | # Empty tax 28 | tax = MultiTax() 29 | self.assertEqual(tax.root_parent, "0") 30 | self.assertEqual(tax.root_node, tax._default_root_node) 31 | self.assertEqual(tax.root_name, "root") 32 | self.assertEqual(tax.root_rank, "root") 33 | 34 | self.assertEqual(tax._default_urls, []) 35 | self.assertEqual(tax._default_root_node, "1") 36 | self.assertEqual(tax._nodes, {tax.root_node: '0'}) 37 | self.assertEqual(tax._names, {tax.root_node: 'root'}) 38 | self.assertEqual(tax._ranks, {tax.root_node: 'root'}) 39 | self.assertEqual(tax._lineages, {}) 40 | self.assertEqual(tax._name_nodes, {}) 41 | self.assertEqual(tax._node_children, {}) 42 | self.assertEqual(tax._rank_nodes, {}) 43 | self.assertEqual(tax._translated_nodes, {}) 44 | 45 | self.assertEqual(tax.undefined_node, None) 46 | self.assertEqual(tax.undefined_name, None) 47 | self.assertEqual(tax.undefined_rank, None) 48 | self.assertEqual(tax.sources, []) 49 | 50 | tax = CustomTx(files=self.test_file) 51 | 
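        # With a taxonomy file loaded, the structural defaults (root_parent "0",
        # root node "1") are expected to stay the same, while root name and rank
        # now come from the file.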
self.assertEqual(tax.root_parent, "0") 52 | self.assertEqual(tax.root_node, tax._default_root_node) 53 | self.assertEqual(tax.root_name, "Node1") 54 | self.assertEqual(tax.root_rank, "rank-1") 55 | 56 | self.assertEqual(tax._default_urls, []) 57 | self.assertEqual(tax._default_root_node, "1") 58 | self.assertEqual(tax._nodes[tax.root_node], "0") 59 | self.assertEqual(tax._names[tax.root_node], "Node1") 60 | self.assertEqual(tax._ranks[tax.root_node], "rank-1") 61 | self.assertEqual(tax._lineages, {}) 62 | self.assertEqual(tax._name_nodes, {}) 63 | self.assertEqual(tax._node_children, {}) 64 | self.assertEqual(tax._rank_nodes, {}) 65 | self.assertEqual(tax._translated_nodes, {}) 66 | 67 | self.assertEqual(tax.undefined_node, None) 68 | self.assertEqual(tax.undefined_name, None) 69 | self.assertEqual(tax.undefined_rank, None) 70 | self.assertEqual(tax.sources, [self.test_file]) 71 | 72 | def test_root_values(self): 73 | """ 74 | test init changing root values 75 | """ 76 | 77 | # New root, not on tree 78 | tax = MultiTax(root_node="root_n", root_parent="root_p", 79 | root_name="newRootName", root_rank="newRootRank") 80 | self.assertEqual(tax.root_node, "root_n") 81 | self.assertEqual(tax.root_parent, "root_p") 82 | # Create new root node and link old default (1) {"root_n": "root_p", "1": "root_p"} 83 | self.assertEqual(tax._nodes, { 84 | tax.root_node: tax.root_parent, tax._default_root_node: tax.root_node}) 85 | self.assertEqual(tax.root_name, 'newRootName') 86 | self.assertEqual(tax._names, {tax.root_node: 'newRootName'}) 87 | self.assertEqual(tax.root_rank, 'newRootRank') 88 | self.assertEqual(tax._ranks, {tax.root_node: 'newRootRank'}) 89 | 90 | # Root is a new node not in nodes 91 | tax = CustomTx(files=self.test_file, root_node="root_n", 92 | root_parent="root_p", root_name="newRootName", root_rank="newRootRank") 93 | self.assertEqual(tax.root_node, "root_n") 94 | self.assertEqual(tax.root_parent, "root_p") 95 | self.assertEqual(tax.stats()["nodes"], 15) 96 | 97 | # Create new root node and link old default (1) {"root_n": "root_p", "1": "root_p"} 98 | self.assertEqual(tax.parent(tax.root_node), tax.root_parent) 99 | self.assertEqual(tax.name(tax.root_node), 'newRootName') 100 | self.assertEqual(tax.rank(tax.root_node), 'newRootRank') 101 | # Default root is linked to new root 102 | self.assertEqual(tax.parent(tax._default_root_node), tax.root_node) 103 | self.assertEqual(tax.name(tax._default_root_node), "Node1") 104 | self.assertEqual(tax.rank(tax._default_root_node), "rank-1") 105 | 106 | # Root is an existing node in nodes, but not default, filter tree under node 107 | tax = CustomTx(files=self.test_file, root_node="4.4", root_parent="root_p", 108 | root_name="newRootName", root_rank="newRootRank") 109 | self.assertEqual(tax.root_node, "4.4") 110 | self.assertEqual(tax.root_parent, "root_p") 111 | self.assertEqual(tax.stats()["nodes"], 3) 112 | 113 | # Create new root node and link old default (1) {"root_n": "root_p", "1": "root_p"} 114 | self.assertEqual(tax.parent(tax.root_node), tax.root_parent) 115 | self.assertEqual(tax.name(tax.root_node), 'newRootName') 116 | self.assertEqual(tax.rank(tax.root_node), 'newRootRank') 117 | # default root should not exist 118 | self.assertEqual(tax.parent(tax._default_root_node), 119 | tax.undefined_node) 120 | self.assertEqual(tax.name(tax._default_root_node), tax.undefined_name) 121 | self.assertEqual(tax.rank(tax._default_root_node), tax.undefined_rank) 122 | 123 | def test_undefined_values(self): 124 | """ 125 | test init changing undefined 
values
126 |         """
127 |         tax = MultiTax(undefined_node="unode",
128 |                        undefined_rank="urank", undefined_name="uname")
129 |         self.assertEqual(tax.undefined_node, "unode")
130 |         self.assertEqual(tax.undefined_name, "uname")
131 |         self.assertEqual(tax.undefined_rank, "urank")
132 |         self.assertEqual(tax.parent("XXX"), "unode")
133 |         self.assertEqual(tax.rank("XXX"), "urank")
134 |         self.assertEqual(tax.name("XXX"), "uname")
135 | 
136 |         tax = CustomTx(files=self.test_file, undefined_node="unode",
137 |                        undefined_rank="urank", undefined_name="uname")
138 |         self.assertEqual(tax.undefined_node, "unode")
139 |         self.assertEqual(tax.undefined_name, "uname")
140 |         self.assertEqual(tax.undefined_rank, "urank")
141 |         self.assertEqual(tax.parent("XXX"), "unode")
142 |         self.assertEqual(tax.rank("XXX"), "urank")
143 |         self.assertEqual(tax.name("XXX"), "uname")
144 | 
145 |     def test_build_values(self):
146 |         """
147 |         test init building auxiliary lookup structures (name, children and rank)
148 |         """
149 |         tax = MultiTax(build_node_children=True,
150 |                        build_name_nodes=True, build_rank_nodes=True)
151 |         self.assertEqual(tax._name_nodes, {
152 |                          tax.name(tax.root_node): [tax.root_node]})
153 |         self.assertEqual(tax._node_children, {
154 |                          tax.root_parent: [tax.root_node]})
155 |         self.assertEqual(tax._rank_nodes, {"root": [tax.root_node]})
156 | 
157 |         tax = CustomTx(files=self.test_file, build_node_children=True,
158 |                        build_name_nodes=True, build_rank_nodes=True)
159 |         self.assertNotEqual(len(tax._name_nodes), 0)
160 |         self.assertNotEqual(len(tax._node_children), 0)
161 |         self.assertNotEqual(len(tax._rank_nodes), 0)
162 | --------------------------------------------------------------------------------
/tests/multitax/utils.py:
--------------------------------------------------------------------------------
1 | import shutil
2 | import os
3 | import gzip
4 | import tarfile
5 | 
6 | 
7 | def setup_dir(d):
8 |     shutil.rmtree(d, ignore_errors=True)
9 |     os.makedirs(d)
10 | 
11 | 
12 | def uncompress_gzip(f, outf):
13 |     with gzip.open(f, 'r') as f_in, open(outf, 'wb') as f_out:
14 |         shutil.copyfileobj(f_in, f_out)
15 | 
16 | 
17 | def uncompress_tar_gzip(f, outd):
18 |     # Extract all regular files to outd, flattening internal directories
19 |     files = []
20 |     with tarfile.open(f) as tar_in:
21 |         for member in tar_in.getmembers():
22 |             if member.isreg():
23 |                 member.name = os.path.basename(member.name)
24 |                 files.append(member.name)
25 |                 tar_in.extract(member, outd)
26 |     return files
27 | --------------------------------------------------------------------------------