\n"}, {"fullname": "multitax.CustomTx", "modulename": "multitax", "qualname": "CustomTx", "type": "class", "doc": "\n", "bases": "multitax.multitax.MultiTax"}, {"fullname": "multitax.CustomTx.__init__", "modulename": "multitax", "qualname": "CustomTx.__init__", "type": "function", "doc": "CustomTx()
\n\nParameters:
\n\n\n- cols [list, dict]: List of fields to be parsed or a dictionary with {field: column index}. Options: \"node\", \"parent\", \"rank\", \"name\"
\n- sep [str]: Separator of fields
\n- **kwargs defined at
multitax.multitax.MultiTax
\n
\n\nExample:
\n\ntax_custom1 = CustomTx(files=\"my_custom_tax.tsv\", cols=[\"node\",\"parent\",\"rank\"])\ntax_custom2 = CustomTx(files=\"my_custom_tax.tsv\", cols={\"node\": 0, \"parent\": 1, \"name\": 5, \"rank\": 3})\n
\n", "signature": "(\n self,\n cols: list = ['node', 'parent', 'rank', 'name'],\n sep: str = '\\t',\n **kwargs\n)", "funcdef": "def"}, {"fullname": "multitax.DummyTx", "modulename": "multitax", "qualname": "DummyTx", "type": "class", "doc": "\n", "bases": "multitax.multitax.MultiTax"}, {"fullname": "multitax.DummyTx.__init__", "modulename": "multitax", "qualname": "DummyTx.__init__", "type": "function", "doc": "DummyTx() - Dummy empty taxonomy
\n\nParameters:
\n\n\n- **kwargs defined at
multitax.multitax.MultiTax
\n
\n", "signature": "(self, **kwargs)", "funcdef": "def"}, {"fullname": "multitax.GreengenesTx", "modulename": "multitax", "qualname": "GreengenesTx", "type": "class", "doc": "\n", "bases": "multitax.multitax.MultiTax"}, {"fullname": "multitax.GreengenesTx.__init__", "modulename": "multitax", "qualname": "GreengenesTx.__init__", "type": "function", "doc": "Main constructor of MultiTax and sub-classes
\n\nParameters:
\n\n\n- files [str, list]: One or more local files to parse.
\n- urls [str, list]: One or more urls to download and parse.
\n- output_prefix [str]: Directory to write downloaded files.
\n- root_node [str]: Define an alternative root node.
\n- root_parent [str]: Define the root parent node identifier.
\n- root_name [str]: Define an alternative root name. Set to None to use original name.
\n- root_rank [str]: Define an alternative root rank. Set to None to use original name.
\n- undefined_node [str]: Define a default return value for undefined nodes.
\n- undefined_name [str]: Define a default return value for undefined names.
\n- undefined_rank [str]: Define a default return value for undefined ranks.
\n- build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
\n- build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
\n- build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
\n- extended_names [bool]: Parse extended names if available.
\n
\n\nExample:
\n\ntax_ncbi = NcbiTx()\ntax_gtdb = GtdbTx(files=[\"file1.gz\", \"file2.txt\"])\ntax_silva = SilvaTx(urls=[\"https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz\"])\ntax_ott = OttTx(root_node=\"844192\")\ntax_gg = GreengenesTx(output_prefix=\"save/to/prefix_\")\n
\n", "signature": "(self, **kwargs)", "funcdef": "def"}, {"fullname": "multitax.GtdbTx", "modulename": "multitax", "qualname": "GtdbTx", "type": "class", "doc": "\n", "bases": "multitax.multitax.MultiTax"}, {"fullname": "multitax.GtdbTx.__init__", "modulename": "multitax", "qualname": "GtdbTx.__init__", "type": "function", "doc": "Main constructor of MultiTax and sub-classes
\n\nParameters:
\n\n\n- files [str, list]: One or more local files to parse.
\n- urls [str, list]: One or more urls to download and parse.
\n- output_prefix [str]: Directory to write downloaded files.
\n- root_node [str]: Define an alternative root node.
\n- root_parent [str]: Define the root parent node identifier.
\n- root_name [str]: Define an alternative root name. Set to None to use original name.
\n- root_rank [str]: Define an alternative root rank. Set to None to use original name.
\n- undefined_node [str]: Define a default return value for undefined nodes.
\n- undefined_name [str]: Define a default return value for undefined names.
\n- undefined_rank [str]: Define a default return value for undefined ranks.
\n- build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
\n- build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
\n- build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
\n- extended_names [bool]: Parse extended names if available.
\n
\n\nExample:
\n\ntax_ncbi = NcbiTx()\ntax_gtdb = GtdbTx(files=[\"file1.gz\", \"file2.txt\"])\ntax_silva = SilvaTx(urls=[\"https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz\"])\ntax_ott = OttTx(root_node=\"844192\")\ntax_gg = GreengenesTx(output_prefix=\"save/to/prefix_\")\n
\n", "signature": "(self, **kwargs)", "funcdef": "def"}, {"fullname": "multitax.NcbiTx", "modulename": "multitax", "qualname": "NcbiTx", "type": "class", "doc": "\n", "bases": "multitax.multitax.MultiTax"}, {"fullname": "multitax.NcbiTx.__init__", "modulename": "multitax", "qualname": "NcbiTx.__init__", "type": "function", "doc": "Main constructor of MultiTax and sub-classes
\n\nParameters:
\n\n\n- files [str, list]: One or more local files to parse.
\n- urls [str, list]: One or more urls to download and parse.
\n- output_prefix [str]: Directory to write downloaded files.
\n- root_node [str]: Define an alternative root node.
\n- root_parent [str]: Define the root parent node identifier.
\n- root_name [str]: Define an alternative root name. Set to None to use original name.
\n- root_rank [str]: Define an alternative root rank. Set to None to use original name.
\n- undefined_node [str]: Define a default return value for undefined nodes.
\n- undefined_name [str]: Define a default return value for undefined names.
\n- undefined_rank [str]: Define a default return value for undefined ranks.
\n- build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
\n- build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
\n- build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
\n- extended_names [bool]: Parse extended names if available.
\n
\n\nExample:
\n\ntax_ncbi = NcbiTx()\ntax_gtdb = GtdbTx(files=[\"file1.gz\", \"file2.txt\"])\ntax_silva = SilvaTx(urls=[\"https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz\"])\ntax_ott = OttTx(root_node=\"844192\")\ntax_gg = GreengenesTx(output_prefix=\"save/to/prefix_\")\n
\n", "signature": "(self, **kwargs)", "funcdef": "def"}, {"fullname": "multitax.NcbiTx.latest", "modulename": "multitax", "qualname": "NcbiTx.latest", "type": "function", "doc": "Returns latest/updated version of a given node.\nIf node is already the latests, returns itself.\nMainly used for NCBI (merged.dmp) and OTT (forwards.tsv)
\n", "signature": "(self, node: str)", "funcdef": "def"}, {"fullname": "multitax.NcbiTx.merged", "modulename": "multitax", "qualname": "NcbiTx.merged", "type": "function", "doc": "Returns relative entry from the merged.dmp file of a given node.
\n", "signature": "(self, node: str)", "funcdef": "def"}, {"fullname": "multitax.NcbiTx.search_name", "modulename": "multitax", "qualname": "NcbiTx.search_name", "type": "function", "doc": "Search node by exact or partial name.
\n\nDefault order (can be skipped with force_extended=True):
\n\n1) Search names defined as \"scientific name\" on nodes.dmp
\n\n2) If nothing was found, search text in all other categories (must be activated with NcbiTx(extended_names=True))
\n\nParameters:
\n\n\n- text [str]: Text to search.
\n- rank [str]: Filter results by rank.
\n- exact [bool]: Exact or partial name search (both case sensitive).
\n- force_extended [bool]: Search for text in all categories at once.
\n
\n\nReturns: list of matching nodes
\n", "signature": "(\n self,\n text: str,\n rank: str = None,\n exact: bool = True,\n force_extended: bool = False\n)", "funcdef": "def"}, {"fullname": "multitax.NcbiTx.stats", "modulename": "multitax", "qualname": "NcbiTx.stats", "type": "function", "doc": "Returns a dict with general numbers of the taxonomic tree
\n\nExample:
\n\nfrom pprint import pprint\nfrom multitax import GtdbTx\ntax = GtdbTx()\n\npprint(tax.stats())\n{'leaves': 30238,\n 'names': 42739,\n 'nodes': 42739,\n 'ranked_leaves': Counter({'species': 30238}),\n 'ranked_nodes': Counter({'species': 30238,\n 'genus': 8778,\n 'family': 2323,\n 'order': 930,\n 'class': 337,\n 'phylum': 131,\n 'domain': 1,\n 'root': 1}),\n 'ranks': 42739}\n
\n", "signature": "(self)", "funcdef": "def"}, {"fullname": "multitax.OttTx", "modulename": "multitax", "qualname": "OttTx", "type": "class", "doc": "\n", "bases": "multitax.multitax.MultiTax"}, {"fullname": "multitax.OttTx.__init__", "modulename": "multitax", "qualname": "OttTx.__init__", "type": "function", "doc": "Main constructor of MultiTax and sub-classes
\n\nParameters:
\n\n\n- files [str, list]: One or more local files to parse.
\n- urls [str, list]: One or more urls to download and parse.
\n- output_prefix [str]: Directory to write downloaded files.
\n- root_node [str]: Define an alternative root node.
\n- root_parent [str]: Define the root parent node identifier.
\n- root_name [str]: Define an alternative root name. Set to None to use original name.
\n- root_rank [str]: Define an alternative root rank. Set to None to use original name.
\n- undefined_node [str]: Define a default return value for undefined nodes.
\n- undefined_name [str]: Define a default return value for undefined names.
\n- undefined_rank [str]: Define a default return value for undefined ranks.
\n- build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
\n- build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
\n- build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
\n- extended_names [bool]: Parse extended names if available.
\n
\n\nExample:
\n\ntax_ncbi = NcbiTx()\ntax_gtdb = GtdbTx(files=[\"file1.gz\", \"file2.txt\"])\ntax_silva = SilvaTx(urls=[\"https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz\"])\ntax_ott = OttTx(root_node=\"844192\")\ntax_gg = GreengenesTx(output_prefix=\"save/to/prefix_\")\n
\n", "signature": "(self, **kwargs)", "funcdef": "def"}, {"fullname": "multitax.OttTx.forwards", "modulename": "multitax", "qualname": "OttTx.forwards", "type": "function", "doc": "Returns relative entry from the forwards.tsv file of a given node.
\n", "signature": "(self, node: str)", "funcdef": "def"}, {"fullname": "multitax.OttTx.latest", "modulename": "multitax", "qualname": "OttTx.latest", "type": "function", "doc": "Returns latest/updated version of a given node.\nIf node is already the latests, returns itself.\nMainly used for NCBI (merged.dmp) and OTT (forwards.tsv)
\n", "signature": "(self, node: str)", "funcdef": "def"}, {"fullname": "multitax.OttTx.search_name", "modulename": "multitax", "qualname": "OttTx.search_name", "type": "function", "doc": "Search node by exact or partial name.
\n\nDefault order (can be skipped with force_extended=True):
\n\n1) Search default names defined on \"taxonomy.tsv\"
\n\n2) If nothing was found, search in all other names defined on \"synonyms.tsv\" (must be activated with OttTx(extended_names=True))
\n\nParameters:
\n\n\n- text [str]: Text to search.
\n- rank [str]: Filter results by rank.
\n- exact [bool]: Exact or partial name search (both case sensitive).
\n- force_extended [bool]: Search for text in all categories at once.
\n
\n\nReturns: list of matching nodes
\n", "signature": "(\n self,\n text: str,\n rank: str = None,\n exact: bool = True,\n force_extended: bool = False\n)", "funcdef": "def"}, {"fullname": "multitax.OttTx.stats", "modulename": "multitax", "qualname": "OttTx.stats", "type": "function", "doc": "Returns a dict with general numbers of the taxonomic tree
\n\nExample:
\n\nfrom pprint import pprint\nfrom multitax import GtdbTx\ntax = GtdbTx()\n\npprint(tax.stats())\n{'leaves': 30238,\n 'names': 42739,\n 'nodes': 42739,\n 'ranked_leaves': Counter({'species': 30238}),\n 'ranked_nodes': Counter({'species': 30238,\n 'genus': 8778,\n 'family': 2323,\n 'order': 930,\n 'class': 337,\n 'phylum': 131,\n 'domain': 1,\n 'root': 1}),\n 'ranks': 42739}\n
\n", "signature": "(self)", "funcdef": "def"}, {"fullname": "multitax.SilvaTx", "modulename": "multitax", "qualname": "SilvaTx", "type": "class", "doc": "\n", "bases": "multitax.multitax.MultiTax"}, {"fullname": "multitax.SilvaTx.__init__", "modulename": "multitax", "qualname": "SilvaTx.__init__", "type": "function", "doc": "Main constructor of MultiTax and sub-classes
\n\nParameters:
\n\n\n- files [str, list]: One or more local files to parse.
\n- urls [str, list]: One or more urls to download and parse.
\n- output_prefix [str]: Directory to write downloaded files.
\n- root_node [str]: Define an alternative root node.
\n- root_parent [str]: Define the root parent node identifier.
\n- root_name [str]: Define an alternative root name. Set to None to use original name.
\n- root_rank [str]: Define an alternative root rank. Set to None to use original name.
\n- undefined_node [str]: Define a default return value for undefined nodes.
\n- undefined_name [str]: Define a default return value for undefined names.
\n- undefined_rank [str]: Define a default return value for undefined ranks.
\n- build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
\n- build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
\n- build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
\n- extended_names [bool]: Parse extended names if available.
\n
\n\nExample:
\n\ntax_ncbi = NcbiTx()\ntax_gtdb = GtdbTx(files=[\"file1.gz\", \"file2.txt\"])\ntax_silva = SilvaTx(urls=[\"https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz\"])\ntax_ott = OttTx(root_node=\"844192\")\ntax_gg = GreengenesTx(output_prefix=\"save/to/prefix_\")\n
\n", "signature": "(self, **kwargs)", "funcdef": "def"}, {"fullname": "multitax.multitax", "modulename": "multitax.multitax", "type": "module", "doc": "\n"}, {"fullname": "multitax.multitax.MultiTax", "modulename": "multitax.multitax", "qualname": "MultiTax", "type": "class", "doc": "\n"}, {"fullname": "multitax.multitax.MultiTax.__init__", "modulename": "multitax.multitax", "qualname": "MultiTax.__init__", "type": "function", "doc": "Main constructor of MultiTax and sub-classes
\n\nParameters:
\n\n\n- files [str, list]: One or more local files to parse.
\n- urls [str, list]: One or more urls to download and parse.
\n- output_prefix [str]: Directory to write downloaded files.
\n- root_node [str]: Define an alternative root node.
\n- root_parent [str]: Define the root parent node identifier.
\n- root_name [str]: Define an alternative root name. Set to None to use original name.
\n- root_rank [str]: Define an alternative root rank. Set to None to use original name.
\n- undefined_node [str]: Define a default return value for undefined nodes.
\n- undefined_name [str]: Define a default return value for undefined names.
\n- undefined_rank [str]: Define a default return value for undefined ranks.
\n- build_node_children [bool]: Build node,children dict (otherwise it will be created on first use).
\n- build_name_nodes [bool]: Build name,nodes dict (otherwise it will be created on first use).
\n- build_rank_nodes [bool]: Build rank,nodes dict (otherwise it will be created on first use).
\n- extended_names [bool]: Parse extended names if available.
\n
\n\nExample:
\n\ntax_ncbi = NcbiTx()\ntax_gtdb = GtdbTx(files=[\"file1.gz\", \"file2.txt\"])\ntax_silva = SilvaTx(urls=[\"https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz\"])\ntax_ott = OttTx(root_node=\"844192\")\ntax_gg = GreengenesTx(output_prefix=\"save/to/prefix_\")\n
\n", "signature": "(\n self,\n files: list = None,\n urls: list = None,\n output_prefix: str = None,\n root_node: str = None,\n root_parent: str = '0',\n root_name: str = None,\n root_rank: str = None,\n undefined_node: str = None,\n undefined_name: str = None,\n undefined_rank: str = None,\n build_name_nodes: bool = False,\n build_node_children: bool = False,\n build_rank_nodes: bool = False,\n extended_names: bool = False\n)", "funcdef": "def"}, {"fullname": "multitax.multitax.MultiTax.version", "modulename": "multitax.multitax", "qualname": "MultiTax.version", "type": "variable", "doc": "\n", "default_value": " = '1.3.1'"}, {"fullname": "multitax.multitax.MultiTax.add", "modulename": "multitax.multitax", "qualname": "MultiTax.add", "type": "function", "doc": "Add node to taxonomy.\nDeletes built lineages and translations.
\n", "signature": "(self, node: str, parent: str, name: str = None, rank: str = None)", "funcdef": "def"}, {"fullname": "multitax.multitax.MultiTax.build_lineages", "modulename": "multitax.multitax", "qualname": "MultiTax.build_lineages", "type": "function", "doc": "Stores lineages in memory for faster access.\nIt is valid for lineage(), rank_lineage() and name_lineage().\nIf keyword arguments (root_node, ranks) are used in those functions stored lineages are not used.
\n\nReturns: None
\n", "signature": "(self, root_node: str = None, ranks: list = None)", "funcdef": "def"}, {"fullname": "multitax.multitax.MultiTax.build_translation", "modulename": "multitax.multitax", "qualname": "MultiTax.build_translation", "type": "function", "doc": "Create a translation of current taxonomy to another
\n\nParameters:
\n\n\n- tax [MultiTax]: A target taxonomy to be translated to.
\n- files [str, list]: One or more local files to parse.
\n- urls [str, list]: One or more urls to download and parse.
\n
\n\nExample:
\n\nfrom multitax import GtdbTx, NcbiTx\ngtdb_tax = GtdbTx()\nncbi_tax = NcbiTx()\n\n# Automatically download translation files\ngtdb_tax.build_translation(ncbi_tax)\ngtdb_tax.translate(\"g__Escherichia\")\n {'1301', '547', '561', '570', '590', '620'}\n\n# Using local files (NCBI <-> GTDB)\nncbi_tax.build_translation(gtdb_tax, files=[\"ar53_metadata.tar.gz\", \"bac120_metadata.tar.gz\"])\nncbi_tax.translate(\"620\")\n {'g__Escherichia', 'g__Proteus', 'g__Serratia'}\n
\n", "signature": "(self, tax, files: list = None, urls: list = None)", "funcdef": "def"}, {"fullname": "multitax.multitax.MultiTax.children", "modulename": "multitax.multitax", "qualname": "MultiTax.children", "type": "function", "doc": "Returns list of direct children nodes of a given node.
\n", "signature": "(self, node: str)", "funcdef": "def"}, {"fullname": "multitax.multitax.MultiTax.check_consistency", "modulename": "multitax.multitax", "qualname": "MultiTax.check_consistency", "type": "function", "doc": "Checks consistency of the tree
\n\nReturns: raise an Exception otherwise None
\n", "signature": "(self)", "funcdef": "def"}, {"fullname": "multitax.multitax.MultiTax.clear_lineages", "modulename": "multitax.multitax", "qualname": "MultiTax.clear_lineages", "type": "function", "doc": "Clear built lineages.
\n\nReturns: None
\n", "signature": "(self)", "funcdef": "def"}, {"fullname": "multitax.multitax.MultiTax.closest_parent", "modulename": "multitax.multitax", "qualname": "MultiTax.closest_parent", "type": "function", "doc": "Returns the closest parent node based on a defined list of ranks
\n", "signature": "(self, node: str, ranks: str)", "funcdef": "def"}, {"fullname": "multitax.multitax.MultiTax.filter", "modulename": "multitax.multitax", "qualname": "MultiTax.filter", "type": "function", "doc": "Filters taxonomy given a list of nodes.\nBy default keep all the ancestors of the given nodes.\nIf desc=True, keep all descendants instead.\nDeletes built lineages and translations.
\n\nExample:
\n\nfrom multitax import GtdbTx\ntax = GtdbTx()\n\ntax.lineage('s__Enterovibrio marina')\n# ['1', 'd__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacterales', 'f__Vibrionaceae', 'g__Enterovibrio', 's__Enterovibrio marina']\n# Keep only ancestors of 'g__Enterovibrio'\ntax.filter('g__Enterovibrio')\n\n# Reload taxonomy\ntax = GtdbTx()\n# Keep only descendants of 'g__Enterovibrio'\ntax.filter('g__Enterovibrio', desc=True)\n
\n", "signature": "(self, nodes: list, desc: bool = False)", "funcdef": "def"}, {"fullname": "multitax.multitax.MultiTax.latest", "modulename": "multitax.multitax", "qualname": "MultiTax.latest", "type": "function", "doc": "Returns latest/updated version of a given node.\nIf node is already the latests, returns itself.\nMainly used for NCBI (merged.dmp) and OTT (forwards.tsv)
\n", "signature": "(self, node: str)", "funcdef": "def"}, {"fullname": "multitax.multitax.MultiTax.leaves", "modulename": "multitax.multitax", "qualname": "MultiTax.leaves", "type": "function", "doc": "Returns a list of leaf nodes of a given node.
\n", "signature": "(self, node: str = None)", "funcdef": "def"}, {"fullname": "multitax.multitax.MultiTax.lineage", "modulename": "multitax.multitax", "qualname": "MultiTax.lineage", "type": "function", "doc": "Returns a list with the lineage of a given node.\nIf ranks is provided, returns only nodes annotated with such ranks.\nIf root_node is provided, use it instead of default root of tree.
\n", "signature": "(self, node: str, root_node: str = None, ranks: list = None)", "funcdef": "def"}, {"fullname": "multitax.multitax.MultiTax.name", "modulename": "multitax.multitax", "qualname": "MultiTax.name", "type": "function", "doc": "Returns name of a given node.
\n", "signature": "(self, node: str)", "funcdef": "def"}, {"fullname": "multitax.multitax.MultiTax.name_lineage", "modulename": "multitax.multitax", "qualname": "MultiTax.name_lineage", "type": "function", "doc": "Returns a list with the name lineage of a given node.
\n", "signature": "(self, node: str, root_node: str = None, ranks: list = None)", "funcdef": "def"}, {"fullname": "multitax.multitax.MultiTax.nodes_rank", "modulename": "multitax.multitax", "qualname": "MultiTax.nodes_rank", "type": "function", "doc": "Returns list of nodes of a given rank.
\n", "signature": "(self, rank: str)", "funcdef": "def"}, {"fullname": "multitax.multitax.MultiTax.parent", "modulename": "multitax.multitax", "qualname": "MultiTax.parent", "type": "function", "doc": "Returns the direct parent node of a given node.
\n", "signature": "(self, node: str)", "funcdef": "def"}, {"fullname": "multitax.multitax.MultiTax.parent_rank", "modulename": "multitax.multitax", "qualname": "MultiTax.parent_rank", "type": "function", "doc": "Returns the parent node of a given rank in the specified rank.
\n", "signature": "(self, node: str, rank: str)", "funcdef": "def"}, {"fullname": "multitax.multitax.MultiTax.prune", "modulename": "multitax.multitax", "qualname": "MultiTax.prune", "type": "function", "doc": "Prunes branches of the tree under the given nodes.\nDeletes built lineages and translations.
\n", "signature": "(self, nodes: list)", "funcdef": "def"}, {"fullname": "multitax.multitax.MultiTax.rank", "modulename": "multitax.multitax", "qualname": "MultiTax.rank", "type": "function", "doc": "Returns the rank of a given node.
\n", "signature": "(self, node: str)", "funcdef": "def"}, {"fullname": "multitax.multitax.MultiTax.rank_lineage", "modulename": "multitax.multitax", "qualname": "MultiTax.rank_lineage", "type": "function", "doc": "Returns a list with the rank lineage of a given node.
\n", "signature": "(self, node: str, root_node: str = None, ranks: list = None)", "funcdef": "def"}, {"fullname": "multitax.multitax.MultiTax.remove", "modulename": "multitax.multitax", "qualname": "MultiTax.remove", "type": "function", "doc": "Removes node from taxonomy. Can break the tree if a parent node is removed. To remove a certain branch, use prune.\nRunning check consistency after removing a node is recommended.\nDeletes built lineages and translations.
\n", "signature": "(self, node: str, check_consistency: bool = False)", "funcdef": "def"}, {"fullname": "multitax.multitax.MultiTax.search_name", "modulename": "multitax.multitax", "qualname": "MultiTax.search_name", "type": "function", "doc": "Search node by exact or partial name
\n\nParameters:
\n\n\n- text [str]: Text to search.
\n- rank [str]: Filter results by rank.
\n- exact [bool]: Exact or partial name search (both case sensitive).
\n
\n\nReturns: list of matching nodes
\n", "signature": "(self, text: str, rank: str = None, exact: bool = True)", "funcdef": "def"}, {"fullname": "multitax.multitax.MultiTax.stats", "modulename": "multitax.multitax", "qualname": "MultiTax.stats", "type": "function", "doc": "Returns a dict with general numbers of the taxonomic tree
\n\nExample:
\n\nfrom pprint import pprint\nfrom multitax import GtdbTx\ntax = GtdbTx()\n\npprint(tax.stats())\n{'leaves': 30238,\n 'names': 42739,\n 'nodes': 42739,\n 'ranked_leaves': Counter({'species': 30238}),\n 'ranked_nodes': Counter({'species': 30238,\n 'genus': 8778,\n 'family': 2323,\n 'order': 930,\n 'class': 337,\n 'phylum': 131,\n 'domain': 1,\n 'root': 1}),\n 'ranks': 42739}\n
\n", "signature": "(self)", "funcdef": "def"}, {"fullname": "multitax.multitax.MultiTax.translate", "modulename": "multitax.multitax", "qualname": "MultiTax.translate", "type": "function", "doc": "Returns the translated node from another taxonomy. Translated nodes are generated with the build_translation function.
\n", "signature": "(self, node: str)", "funcdef": "def"}, {"fullname": "multitax.multitax.MultiTax.write", "modulename": "multitax.multitax", "qualname": "MultiTax.write", "type": "function", "doc": "Writes loaded taxonomy to a file.
\n\nParameters:
\n\n\n- cols [list]: Options: \"node\", \"latest\", \"parent\", \"rank\", \"name\", \"leaves\", \"children\", \"lineage\", \"rank_lineage\", \"name_lineage\"
\n- sep [str]: Separator of fields
\n- sep_multi [str]: Separator of multi-valued fields
\n- ranks [list]: Ranks to report
\n- gz [bool]: Gzip output
\n
\n\nReturns: None
\n", "signature": "(\n self,\n output_file: str,\n cols: list = ['node', 'parent', 'rank', 'name'],\n sep: str = '\\t',\n sep_multi: str = '|',\n ranks: list = None,\n gz: bool = False\n)", "funcdef": "def"}, {"fullname": "multitax.utils", "modulename": "multitax.utils", "type": "module", "doc": "\n"}, {"fullname": "multitax.utils.check_dir", "modulename": "multitax.utils", "qualname": "check_dir", "type": "function", "doc": "\n", "signature": "(prefix: str)", "funcdef": "def"}, {"fullname": "multitax.utils.check_file", "modulename": "multitax.utils", "qualname": "check_file", "type": "function", "doc": "\n", "signature": "(file: str)", "funcdef": "def"}, {"fullname": "multitax.utils.check_no_file", "modulename": "multitax.utils", "qualname": "check_no_file", "type": "function", "doc": "\n", "signature": "(file: str)", "funcdef": "def"}, {"fullname": "multitax.utils.close_files", "modulename": "multitax.utils", "qualname": "close_files", "type": "function", "doc": "Parameters:
\n\n\n- fhs [dict]: {file: file handler}
\n
\n\nReturns: Nothing
\n", "signature": "(fhs: dict)", "funcdef": "def"}, {"fullname": "multitax.utils.download_files", "modulename": "multitax.utils", "qualname": "download_files", "type": "function", "doc": "Download and open files (memory/stream) or write to disk (multitax.utils.save_urls)
\n\nParameters:
\n\n\n- urls [list]: List of files to download (text, \".gz\", \".tar.gz\", \".tgz\")
\n- output_prefix [str]: Output directory to save files
\n
\n\nReturns:
\n\n\n- OrderedDict {file: file handler} (same order as input)
\n
\n", "signature": "(urls: list, output_prefix: str = None, retry_attempts: int = 1)", "funcdef": "def"}, {"fullname": "multitax.utils.filter_function", "modulename": "multitax.utils", "qualname": "filter_function", "type": "function", "doc": "\n", "signature": "(elements, function, value)", "funcdef": "def"}, {"fullname": "multitax.utils.join_check", "modulename": "multitax.utils", "qualname": "join_check", "type": "function", "doc": "\n", "signature": "(elements, sep: str)", "funcdef": "def"}, {"fullname": "multitax.utils.load_url_mem", "modulename": "multitax.utils", "qualname": "load_url_mem", "type": "function", "doc": "Parameters:
\n\n\n- url [str]: URL to load into memory
\n
\n\nReturns:
\n\n\n- io.BytesIO of the requested url
\n
\n", "signature": "(url: str)", "funcdef": "def"}, {"fullname": "multitax.utils.open_files", "modulename": "multitax.utils", "qualname": "open_files", "type": "function", "doc": "Parameters:
\n\n\n- files [list]: List of files to open (text, \".gz\", \".tar.gz\", \".tgz\")
\n
\n\nReturns:
\n\n\n- OrderedDict {file: file handler} (same order as input)
\n
\n", "signature": "(files: list)", "funcdef": "def"}, {"fullname": "multitax.utils.reverse_dict", "modulename": "multitax.utils", "qualname": "reverse_dict", "type": "function", "doc": "\n", "signature": "(d: dict)", "funcdef": "def"}, {"fullname": "multitax.utils.save_urls", "modulename": "multitax.utils", "qualname": "save_urls", "type": "function", "doc": "Parameters:
\n\n\n- urls [list]: List of urls to download
\n- output_prefix [str]: Output directory to save files
\n
\n\nReturns:
\n\n\n", "signature": "(urls: list, output_prefix: str)", "funcdef": "def"}, {"fullname": "multitax.utils.warning_on_one_line", "modulename": "multitax.utils", "qualname": "warning_on_one_line", "type": "function", "doc": "\n", "signature": "(message, category, filename, lineno, file=None, line=None)", "funcdef": "def"}];
4 |
5 | // mirrored in build-search-index.js (part 1)
6 | // Also split on HTML tags. This is a cheap heuristic, but good enough.
7 | elasticlunr.tokenizer.setSeperator(/[\s\-.;&_'"=,()]+|<[^>]*>/);
8 |
9 | let searchIndex;
10 | if (docs._isPrebuiltIndex) {
11 | console.info("using precompiled search index");
12 | searchIndex = elasticlunr.Index.load(docs);
13 | } else {
14 | console.time("building search index");
15 | // mirrored in build-search-index.js (part 2)
16 | searchIndex = elasticlunr(function () {
17 | this.pipeline.remove(elasticlunr.stemmer);
18 | this.pipeline.remove(elasticlunr.stopWordFilter);
19 | this.addField("qualname");
20 | this.addField("fullname");
21 | this.addField("annotation");
22 | this.addField("default_value");
23 | this.addField("signature");
24 | this.addField("bases");
25 | this.addField("doc");
26 | this.setRef("fullname");
27 | });
28 | for (let doc of docs) {
29 | searchIndex.addDoc(doc);
30 | }
31 | console.timeEnd("building search index");
32 | }
33 |
34 | return (term) => searchIndex.search(term, {
35 | fields: {
36 | qualname: {boost: 4},
37 | fullname: {boost: 2},
38 | annotation: {boost: 2},
39 | default_value: {boost: 2},
40 | signature: {boost: 2},
41 | bases: {boost: 2},
42 | doc: {boost: 1},
43 | },
44 | expand: true
45 | });
46 | })();
--------------------------------------------------------------------------------
/make_docs.sh:
--------------------------------------------------------------------------------
# Generate the API documentation into docs/ with pdoc, covering the
# multitax package plus its multitax.py and utils.py modules.
pdoc -o docs \
    multitax \
    multitax/multitax.py \
    multitax/utils.py
2 |
--------------------------------------------------------------------------------
/multitax/__init__.py:
--------------------------------------------------------------------------------
# Version string of the multitax package.
__version__ = "1.3.2"

# Public API of the package: one taxonomy class per supported source.
# Each name is imported further down from its own submodule.
__all__ = (
    "CustomTx",
    "DummyTx",
    "GreengenesTx",
    "GtdbTx",
    "NcbiTx",
    "OttTx",
    "SilvaTx",
)
12 |
13 | from .customtx import CustomTx
14 | from .dummytx import DummyTx
15 | from .greengenestx import GreengenesTx
16 | from .gtdbtx import GtdbTx
17 | from .ncbitx import NcbiTx
18 | from .otttx import OttTx
19 | from .silvatx import SilvaTx
20 |
--------------------------------------------------------------------------------
/multitax/customtx.py:
--------------------------------------------------------------------------------
1 | from .multitax import MultiTax
2 | import warnings
3 |
4 |
class CustomTx(MultiTax):
    """
    Taxonomy parsed from user-provided delimited text files.

    Each line is split on a separator and the configured columns are read as
    node, parent and (optionally) rank and name.
    """

    _required_cols = ["node", "parent"]
    _possible_cols = ["node", "parent", "rank", "name"]

    def __init__(self, cols: list = ["node", "parent", "rank", "name"], sep: str = "\t", **kwargs):
        r"""
        CustomTx()

        Parameters:
        * **cols** *[list, dict]*: List of fields to be parsed or a dictionary with {field: column index}. Options: "node", "parent", "rank", "name"
        * **sep** *[str]*: Separator of fields
        * **\*\*kwargs** defined at `multitax.multitax.MultiTax`

        Example:

            tax_custom1 = CustomTx(files="my_custom_tax.tsv", cols=["node","parent","rank"])
            tax_custom2 = CustomTx(files="my_custom_tax.tsv", cols={"node": 0, "parent": 1, "name": 5, "rank": 3})
        """
        # Column layout and separator must be set before the parent
        # constructor runs, because it calls self._parse().
        self._cols = self._parse_cols(cols)
        self._sep = sep
        super().__init__(**kwargs)

    def __repr__(self):
        stats = ['{}={}'.format(k, repr(v)) for (k, v) in self.stats().items()]
        return 'CustomTx({})'.format(', '.join(stats))

    def _build_translation(self, target_tax, files: list = None, urls: list = None):
        """
        Translation is not available for custom taxonomies.

        Emits a warning and returns an empty dict.
        """
        warnings.warn("Translation between taxonomies [" + self.__class__.__name__ +
                      "," + target_tax.__class__.__name__ + "] not yet implemented.")
        return {}

    def _parse(self, fhs, **kwargs):
        """
        Parse the opened files into nodes, ranks and names dicts.

        fhs is a dict {source: file handle}; lines may be str or bytes.
        Returns a tuple (nodes, ranks, names), each keyed by node.
        """
        nodes = {}
        ranks = {}
        names = {}
        for fh in fhs.values():
            for line in fh:
                # Handles may yield bytes; decode explicitly instead of
                # relying on a bare except around str.split().
                if isinstance(line, bytes):
                    line = line.decode()
                fields = line.rstrip().split(self._sep)

                node = fields[self._cols["node"]]
                nodes[node] = fields[self._cols["parent"]]
                if "name" in self._cols:
                    names[node] = fields[self._cols["name"]]
                if "rank" in self._cols:
                    ranks[node] = fields[self._cols["rank"]]

        return nodes, ranks, names

    def _parse_cols(self, cols):
        """
        Normalize cols into a dict {field: column index} and validate it.

        A list/tuple is converted using the position of each field as its
        column index. Raises ValueError if a required column is missing or an
        unknown column is given.
        """
        if isinstance(cols, (list, tuple)):
            cols = {c: i for i, c in enumerate(cols)}
        elif isinstance(cols, dict):
            # Work on a copy so the caller's dict is not shared with the instance
            cols = dict(cols)

        for rc in self._required_cols:
            if rc not in cols:
                raise ValueError(rc + " is a required column")

        for c in cols:
            if c not in self._possible_cols:
                raise ValueError(c + " is not a valid column: " +
                                 ",".join(self._possible_cols))

        return cols
72 |
--------------------------------------------------------------------------------
/multitax/dummytx.py:
--------------------------------------------------------------------------------
1 | from .multitax import MultiTax
2 |
3 |
class DummyTx(MultiTax):
    """
    Empty taxonomy: defines no default urls and no parser of its own.
    """

    def __init__(self, **kwargs):
        r"""
        DummyTx() - Dummy empty taxonomy

        Parameters:

        * \*\*kwargs defined at `multitax.multitax.MultiTax`
        """
        super().__init__(**kwargs)

    def __repr__(self):
        stats = ['{}={}'.format(k, repr(v)) for (k, v) in self.stats().items()]
        return 'DummyTx({})'.format(', '.join(stats))
19 |
--------------------------------------------------------------------------------
/multitax/greengenestx.py:
--------------------------------------------------------------------------------
1 | from .multitax import MultiTax
2 | import warnings
3 |
4 |
class GreengenesTx(MultiTax):
    """
    Greengenes taxonomy parsed from a two-column (id, lineage) tab-separated file.

    Node identifiers are the prefixed lineage entries (e.g. "c__Deinococci").
    """

    _default_urls = [
        "https://gg-sg-web.s3-us-west-2.amazonaws.com/downloads/greengenes_database/gg_13_5/gg_13_5_taxonomy.txt.gz"]
    # Rank prefix and rank name by position in the lineage
    _rank_codes = [("k__", "kingdom"),
                   ("p__", "phylum"),
                   ("c__", "class"),
                   ("o__", "order"),
                   ("f__", "family"),
                   ("g__", "genus"),
                   ("s__", "species")]

    def __init__(self, **kwargs):
        # No docstring on purpose: leaves MultiTax.__init__ as the documented constructor.
        # forwards.tsv
        self._forwards = {}
        super().__init__(**kwargs)

    def __repr__(self):
        stats = ['{}={}'.format(k, repr(v)) for (k, v) in self.stats().items()]
        return 'GreengenesTx({})'.format(', '.join(stats))

    def _build_translation(self, target_tax, files: list = None, urls: list = None):
        """
        Translation is not available for Greengenes.

        Emits a warning and returns an empty dict.
        """
        warnings.warn("Translation between taxonomies [" + self.__class__.__name__ +
                      "," + target_tax.__class__.__name__ + "] not yet implemented.")
        return {}

    def _parse(self, fhs, **kwargs):
        """
        Parse the opened files into nodes, ranks and names dicts.

        fhs is a dict {source: file handle}; lines may be str or bytes and
        must have exactly two tab-separated fields, the second being the
        lineage separated by "; ".
        Returns a tuple (nodes, ranks, names), each keyed by node.
        """
        nodes = {}
        ranks = {}
        names = {}

        for fh in fhs.values():
            for line in fh:
                # Handles may yield bytes; decode explicitly so that a
                # malformed line raises its own error instead of being
                # masked by a bare except.
                if isinstance(line, bytes):
                    line = line.decode()
                _, lineage = line.rstrip().split('\t')
                lin = lineage.split("; ")
                # Walk the lineage from the last entry back to the first
                for i in reversed(range(len(lin))):
                    # assert rank
                    assert lin[i][:3] == self._rank_codes[i][0], \
                        "Unexpected rank prefix [" + lin[i][:3] + "] in lineage: " + lineage
                    # taxid = "c__Deinococci", rank = "class", name = "Deinococci"
                    taxid = lin[i]
                    name = lin[i][3:]
                    if not name:
                        continue  # empty entry "s__"
                    rank = self._rank_codes[i][1]
                    if i == 0:
                        parent_taxid = self._default_root_node
                    else:
                        parent_taxid = lin[i-1]
                    # First occurrence of a taxid defines its parent/name/rank
                    if taxid not in nodes:
                        nodes[taxid] = parent_taxid
                        names[taxid] = name
                        ranks[taxid] = rank

        return nodes, ranks, names
61 |
--------------------------------------------------------------------------------
/multitax/gtdbtx.py:
--------------------------------------------------------------------------------
1 | from .multitax import MultiTax
2 | from .utils import check_file
3 | from .utils import open_files
4 | from .utils import download_files
5 | import warnings
6 |
7 |
class GtdbTx(MultiTax):
    """
    GTDB taxonomy parsed from two-column (id, lineage) tab-separated files.

    Node identifiers are the prefixed lineage entries (e.g. "c__Deinococci").
    """

    _default_urls = ["https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/ar53_taxonomy.tsv.gz",
                     "https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/bac120_taxonomy.tsv.gz"]
    # Rank prefix and rank name by position in the lineage
    _rank_codes = [("d__", "domain"),
                   ("p__", "phylum"),
                   ("c__", "class"),
                   ("o__", "order"),
                   ("f__", "family"),
                   ("g__", "genus"),
                   ("s__", "species")]

    def __init__(self, **kwargs):
        # No docstring on purpose: leaves MultiTax.__init__ as the documented constructor.
        super().__init__(**kwargs)

    def __repr__(self):
        stats = ['{}={}'.format(k, repr(v)) for (k, v) in self.stats().items()]
        return 'GtdbTx({})'.format(', '.join(stats))

    def _build_translation(self, target_tax, files: list = None, urls: list = None):
        """
        Build a dict {gtdb node: set of ncbi nodes} from GTDB metadata files.

        Only a target of class NcbiTx is supported; for any other target a
        warning is emitted and an empty dict is returned. Metadata is read
        from files if given, otherwise downloaded from urls (or the default
        metadata urls). Nodes are matched rank by rank along both lineages.
        """
        translated_nodes = {}
        if target_tax.__class__.__name__ != "NcbiTx":
            warnings.warn("Translation between taxonomies [" + self.__class__.__name__ +
                          "," + target_tax.__class__.__name__ + "] not yet implemented.")
            return translated_nodes

        if files:
            fhs = open_files(files)
        else:
            _urls = ["https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/ar53_metadata.tsv.gz",
                     "https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/bac120_metadata.tsv.gz"]
            fhs = download_files(
                urls=urls if urls else _urls, retry_attempts=3)
        # NOTE(review): the handles in fhs are not closed here - confirm
        # whether the utils helpers require an explicit close.

        # Column positions in the metadata table
        accession_col = 0
        gtdb_taxonomy_col = 19
        ncbi_taxid_col = 80

        # Equivalent ranks, position by position
        ncbi_ranks = ["superkingdom", "phylum", "class",
                      "order", "family", "genus", "species"]
        gtdb_ranks = ["domain", "phylum", "class", "order",
                      "family", "genus", "species"]

        for fh in fhs.values():
            for line in fh:
                # Handles may yield bytes; decode explicitly instead of
                # relying on a bare except around str.split().
                if isinstance(line, bytes):
                    line = line.decode()
                fields = line.rstrip().split('\t')

                # skip header
                if fields[accession_col] == "accession":
                    continue

                ncbi_leaf_node = target_tax.latest(fields[ncbi_taxid_col])
                if ncbi_leaf_node == target_tax.undefined_node:
                    continue
                ncbi_nodes = target_tax.lineage(ncbi_leaf_node, ranks=ncbi_ranks)

                # Build GTDB lineage from leaf (species on given lineage)
                # to accommodate possible changes in the loaded tax
                gtdb_leaf_node = fields[gtdb_taxonomy_col].split(";")[-1]
                if gtdb_leaf_node == self.undefined_node:
                    continue
                gtdb_nodes = self.lineage(gtdb_leaf_node, ranks=gtdb_ranks)

                # Match ranks (zip stops at the shorter lineage, so an
                # empty/invalid lineage on either side yields no matches)
                for gtdb_n, ncbi_n in zip(gtdb_nodes, ncbi_nodes):
                    if ncbi_n != target_tax.undefined_node and gtdb_n != self.undefined_node:
                        translated_nodes.setdefault(gtdb_n, set()).add(ncbi_n)

        return translated_nodes

    def _parse(self, fhs, **kwargs):
        """
        Parse the opened files into nodes, ranks and names dicts.

        fhs is a dict {source: file handle}; lines may be str or bytes and
        must have exactly two tab-separated fields, the second being the
        lineage separated by ";".
        Returns a tuple (nodes, ranks, names), each keyed by node.
        """
        nodes = {}
        ranks = {}
        names = {}
        for fh in fhs.values():
            for line in fh:
                # Handles may yield bytes; decode explicitly so that a
                # malformed line raises its own error instead of being
                # masked by a bare except.
                if isinstance(line, bytes):
                    line = line.decode()
                _, lineage = line.rstrip().split('\t')
                lin = lineage.split(";")
                # Walk the lineage from the last entry back to the first
                for i in reversed(range(len(lin))):
                    # assert rank
                    assert lin[i][:3] == self._rank_codes[i][0], \
                        "Unexpected rank prefix [" + lin[i][:3] + "] in lineage: " + lineage
                    # taxid = "c__Deinococci", rank = "class", name = "Deinococci"
                    taxid = lin[i]
                    name = lin[i][3:]
                    # empty entry "s__"
                    if not name:
                        continue
                    rank = self._rank_codes[i][1]
                    if i == 0:
                        parent_taxid = self._default_root_node
                    else:
                        parent_taxid = lin[i-1]
                    # First occurrence of a taxid defines its parent/name/rank
                    if taxid not in nodes:
                        nodes[taxid] = parent_taxid
                        names[taxid] = name
                        ranks[taxid] = rank

        return nodes, ranks, names
117 |
--------------------------------------------------------------------------------
/multitax/multitax.py:
--------------------------------------------------------------------------------
1 | from .utils import *
2 | from collections import Counter
3 | from . import __version__
4 |
class MultiTax(object):
    """
    Base taxonomy class.

    Stores the tree as three dicts keyed by node (node->parent, node->rank,
    node->name) plus auxiliary structures built on demand. Sub-classes
    provide the parsing of a specific taxonomy by overriding _parse().
    """

    version = __version__

    _default_urls = []
    _default_root_node = "1"

    def __init__(self,
                 files: list = None,
                 urls: list = None,
                 output_prefix: str = None,
                 root_node: str = None,
                 root_parent: str = "0",
                 root_name: str = None,
                 root_rank: str = None,
                 undefined_node: str = None,
                 undefined_name: str = None,
                 undefined_rank: str = None,
                 build_name_nodes: bool = False,
                 build_node_children: bool = False,
                 build_rank_nodes: bool = False,
                 extended_names: bool = False):
        """
        Main constructor of MultiTax and sub-classes

        Parameters:
        * **files** *[str, list]*: One or more local files to parse.
        * **urls** *[str, list]*: One or more urls to download and parse.
        * **output_prefix** *[str]*: Directory to write downloaded files.
        * **root_node** *[str]*: Define an alternative root node.
        * **root_parent** *[str]*: Define the root parent node identifier.
        * **root_name** *[str]*: Define an alternative root name. Set to None to use original name.
        * **root_rank** *[str]*: Define an alternative root rank. Set to None to use original rank.
        * **undefined_node** *[str]*: Define a default return value for undefined nodes.
        * **undefined_name** *[str]*: Define a default return value for undefined names.
        * **undefined_rank** *[str]*: Define a default return value for undefined ranks.
        * **build_node_children** *[bool]*: Build node,children dict (otherwise it will be created on first use).
        * **build_name_nodes** *[bool]*: Build name,nodes dict (otherwise it will be created on first use).
        * **build_rank_nodes** *[bool]*: Build rank,nodes dict (otherwise it will be created on first use).
        * **extended_names** *[bool]*: Parse extended names if available.

        Example:

            tax_ncbi = NcbiTx()
            tax_gtdb = GtdbTx(files=["file1.gz", "file2.txt"])
            tax_silva = SilvaTx(urls=["https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_lsu_138.1.txt.gz"])
            tax_ott = OttTx(root_node="844192")
            tax_gg = GreengenesTx(output_prefix="save/to/prefix_")
        """
        if files:
            if isinstance(files, str):
                files = [files]
            for file in files:
                check_file(file)

        if output_prefix:
            check_dir(output_prefix)

        # Main structures
        self._nodes = {}
        self._ranks = {}
        self._names = {}
        # Aux. structures
        self._lineages = {}
        self._name_nodes = {}
        self._node_children = {}
        self._rank_nodes = {}
        self._translated_nodes = {}

        # Store source of tax files (url or file)
        self.sources = []

        # Open/Download/Write files
        fhs = {}
        if files:
            fhs = open_files(files)
        elif urls or self._default_urls:
            fhs = download_files(urls=urls if urls else self._default_urls,
                                 output_prefix=output_prefix,
                                 retry_attempts=3)

        if fhs:
            try:
                # Parse taxonomy
                self._nodes, self._ranks, self._names = self._parse(
                    fhs, extended_names=extended_names)
            finally:
                # Release the handles even if parsing fails
                close_files(fhs)
            # Save sources for stats (files or urls)
            self.sources = list(fhs.keys())

        # Set undefined values
        self.undefined_node = undefined_node
        self.undefined_name = undefined_name
        self.undefined_rank = undefined_rank

        # Set root values
        self._set_root_node(root=root_node if root_node else self._default_root_node,
                            parent=root_parent, name=root_name, rank=root_rank)

        # build auxiliary structures
        if build_node_children:
            self._node_children = reverse_dict(self._nodes)
        if build_name_nodes:
            self._name_nodes = reverse_dict(self._names)
        if build_rank_nodes:
            self._rank_nodes = reverse_dict(self._ranks)

        self.check_consistency()

    def _exact_name(self, text: str, names: dict):
        """
        Returns list of nodes of a given exact name (case sensitive).
        """
        return names.get(text, [])

    def _parse(self, fhs: dict, **kwargs):
        """
        main function to be overloaded
        receives a dictionary with {"url/file": file handler}
        and keyword options (e.g. extended_names) passed by the constructor
        return nodes, ranks and names dicts
        """
        return {}, {}, {}

    def _partial_name(self, text: str, names: dict):
        """
        Searches names containing a certain text (case sensitive) and return their respective nodes.
        """
        matching_nodes = set()
        for name in names:
            if text in name:
                matching_nodes.update(names[name])
        return list(matching_nodes)

    def _recurse_leaves(self, node: str):
        """
        Recursive function returning leaf nodes
        """
        children = self.children(node)
        if not children:
            return [node]
        leaves = []
        for child in children:
            leaves.extend(self._recurse_leaves(child))
        return leaves

    def _remove(self, node: str):
        """
        Removes node from taxonomy, no checking, for internal use
        """
        del self._nodes[node]
        if node in self._names:
            del self._names[node]
        if node in self._ranks:
            del self._ranks[node]

    def _reset_aux_data(self):
        """
        Reset aux. data structures
        """
        self._lineages = {}
        self._name_nodes = {}
        self._node_children = {}
        self._rank_nodes = {}
        self._translated_nodes = {}

    def _set_root_node(self, root: str, parent: str, name: str, rank: str):
        """
        Set root node of the tree.
        The files are parsed based on the self._default_root_node for each class
        A user-defined root node can be:
            1) internal: will filter the tree accordingly and delete the default root_node
            2) external: will add node and link to the default
        """

        # Set parent/root with defaults
        self.root_parent = parent
        self.root_node = self._default_root_node
        self._nodes[self.root_node] = self.root_parent

        # Default root node is the top by definition
        if root != self._default_root_node:
            if root in self._nodes:
                # Not default but exists on tree, filter only descendants
                self.filter(root, desc=True)
                # Remove entry for _default_root_node
                self._remove(self._default_root_node)
            else:
                # Not on tree, link default node with new root
                self._nodes[self._default_root_node] = root
            # Change root to user defined
            self.root_node = root
            # Set/Update new root node parent link
            self._nodes[self.root_node] = self.root_parent

        # User-defined rank/name.
        # If provided, insert manually,
        # If None, check if is in the tree (defined in the given tax)
        # otherwise insert default "root"
        if name:
            self._names[self.root_node] = name
        elif self.root_node not in self._names:
            self._names[self.root_node] = "root"
        # Set static name
        self.root_name = self._names[self.root_node]

        if rank:
            self._ranks[self.root_node] = rank
        elif self.root_node not in self._ranks:
            self._ranks[self.root_node] = "root"
        # Set static rank
        self.root_rank = self._ranks[self.root_node]

    def add(self, node: str, parent: str, name: str = None, rank: str = None):
        """
        Add node to taxonomy.
        Deletes built lineages and translations.

        Raises ValueError if the parent is not on the tree or the node already is.
        """
        if parent not in self._nodes:
            raise ValueError("Parent node [" + parent + "] not found.")
        elif node in self._nodes:
            raise ValueError("Node [" + node + "] already present.")

        self._nodes[node] = parent
        self._names[node] = name if name is not None else self.undefined_name
        self._ranks[node] = rank if rank is not None else self.undefined_rank
        self._reset_aux_data()

    def build_lineages(self, root_node: str = None, ranks: list = None):
        """
        Stores lineages in memory for faster access.
        It is valid for lineage(), rank_lineage() and name_lineage().
        If keyword arguments (root_node, ranks) are used in those functions stored lineages are not used.

        Returns: None
        """
        self.clear_lineages()
        for node in self._nodes:
            self._lineages[node] = self.lineage(
                node=node, root_node=root_node, ranks=ranks)

    def build_translation(self, tax, files: list = None, urls: list = None):
        """
        Create a translation of current taxonomy to another

        Parameters:

        * **tax** [MultiTax]: A target taxonomy to be translated to.
        * **files** *[str, list]*: One or more local files to parse.
        * **urls** *[str, list]*: One or more urls to download and parse.

        Example:

            from multitax import GtdbTx, NcbiTx
            gtdb_tax = GtdbTx()
            ncbi_tax = NcbiTx()

            # Automatically download translation files
            gtdb_tax.build_translation(ncbi_tax)
            gtdb_tax.translate("g__Escherichia")
                {'1301', '547', '561', '570', '590', '620'}

            # Using local files (NCBI <-> GTDB)
            ncbi_tax.build_translation(gtdb_tax, files=["ar53_metadata.tsv.gz", "bac120_metadata.tsv.gz"])
            ncbi_tax.translate("620")
                {'g__Escherichia', 'g__Proteus', 'g__Serratia'}
        """
        if files:
            if isinstance(files, str):
                files = [files]
            for file in files:
                check_file(file)

        self._translated_nodes = self._build_translation(tax, files, urls)

    def children(self, node: str):
        """
        Returns list of direct children nodes of a given node.
        """
        # Setup on first use
        if not self._node_children:
            self._node_children = reverse_dict(self._nodes)
        return self._node_children.get(node, [])

    def check_consistency(self):
        """
        Checks consistency of the tree

        Returns: raise an Exception otherwise None
        """
        if self.root_node not in self._nodes:
            raise ValueError("Root node [" + str(self.root_node) + "] not found.")
        if self.root_parent in self._nodes:
            raise ValueError(
                "Root parent [" + str(self.root_parent) + "] found but should not be on tree.")
        if self.undefined_node in self._nodes:
            # str() so a non-str undefined node (e.g. None) still yields
            # the intended ValueError instead of a TypeError
            raise ValueError(
                "Undefined node [" + str(self.undefined_node) + "] found but should not be on tree.")

        # Difference between values and keys should be only root_parent
        lost_nodes = set(self._nodes.values()).difference(self._nodes)
        if self.root_parent not in lost_nodes:
            raise ValueError(
                "Root parent [" + str(self.root_parent) + "] not properly defined.")
        # Remove root_parent from lost nodes to report only missing
        lost_nodes.remove(self.root_parent)
        if len(lost_nodes) > 0:
            raise ValueError("Parent nodes missing: " + ",".join(map(str, lost_nodes)))

        return None

    def clear_lineages(self):
        """
        Clear built lineages.

        Returns: None
        """
        self._lineages = {}

    def closest_parent(self, node: str, ranks: list):
        """
        Returns the closest parent node based on a defined list of ranks
        """
        # Rank of node is already on the list
        if self.rank(node) in ranks:
            return node
        else:
            # check lineage from back to front until find a valid node
            for n in self.lineage(node, ranks=ranks)[::-1]:
                if n != self.undefined_node:
                    return n
        # nothing found
        return self.undefined_node

    def filter(self, nodes: list, desc: bool = False):
        """
        Filters taxonomy given a list of nodes.
        By default keep all the ancestors of the given nodes.
        If desc=True, keep all descendants instead.
        Deletes built lineages and translations.

        Example:

            from multitax import GtdbTx
            tax = GtdbTx()

            tax.lineage('s__Enterovibrio marina')
            # ['1', 'd__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacterales', 'f__Vibrionaceae', 'g__Enterovibrio', 's__Enterovibrio marina']
            # Keep only ancestors of 'g__Enterovibrio'
            tax.filter('g__Enterovibrio')

            # Reload taxonomy
            tax = GtdbTx()
            # Keep only descendants of 'g__Enterovibrio'
            tax.filter('g__Enterovibrio', desc=True)
        """
        if isinstance(nodes, str):
            nodes = [nodes]

        # Keep track of nodes to be filtered out
        filtered_nodes = set(self._nodes)
        # Always keep root
        filtered_nodes.discard(self.root_node)

        if desc:
            # Keep descendants of the given nodes
            for node in nodes:
                # Check if node exists (skips root)
                if node in filtered_nodes:
                    # For each leaf of the selected nodes
                    for leaf in self.leaves(node):
                        # Build lineage of each leaf up-to node itself
                        for n in self.lineage(leaf, root_node=node):
                            # Discard nodes from set to be kept
                            filtered_nodes.discard(n)
                    # Link node to root
                    self._nodes[node] = self.root_node
        else:
            # Keep ancestors of the given nodes (full lineage up-to root)
            for node in nodes:
                # ranks=[] in case build_lineages() was used with specific ranks
                for n in self.lineage(node, ranks=[]):
                    # Discard nodes from set to be kept
                    filtered_nodes.discard(n)

        # Delete filtered nodes
        for node in filtered_nodes:
            self._remove(node)

        # Delete aux. data structures
        self._reset_aux_data()
        self.check_consistency()

    def latest(self, node: str):
        """
        Returns latest/updated version of a given node.
        If node is already the latest, returns itself.
        Mainly used for NCBI (merged.dmp) and OTT (forwards.tsv)
        """
        if node in self._nodes:
            return node
        else:
            return self.undefined_node

    def leaves(self, node: str = None):
        """
        Returns a list of leaf nodes of a given node.
        """
        if node is None or node == self.root_node:
            # Leaves are nodes not contained in _nodes.values() ("parents")
            return list(set(self._nodes).difference(self._nodes.values()))
        elif node in self._nodes:
            return self._recurse_leaves(node)
        else:
            return []

    def lineage(self, node: str, root_node: str = None, ranks: list = None):
        """
        Returns a list with the lineage of a given node.
        If ranks is provided, returns only nodes annotated with such ranks.
        If root_node is provided, use it instead of default root of tree.
        """
        # If lineages were built with build_lineages() with matching params
        if node in self._lineages and root_node is None and ranks is None:
            return self._lineages[node]
        else:
            if not root_node:
                root_node = self.root_node

            n = node
            if ranks:
                # Fixed length lineage
                lin = [self.undefined_node] * len(ranks)
                # Loop until end of the tree (in case chosen root is not on lineage)
                while n != self.undefined_node:
                    r = self.rank(n)
                    if r in ranks:
                        lin[ranks.index(r)] = n
                    # If node is root, break (after adding)
                    if n == root_node:
                        break
                    n = self.parent(n)
            else:
                # Full lineage
                lin = []
                # Loop until end of the tree (in case chosen root is not on lineage)
                while n != self.undefined_node:
                    lin.append(n)
                    # If node is root, break (after adding)
                    if n == root_node:
                        break
                    n = self.parent(n)
                # Reverse order
                lin = lin[::-1]

            # last iteration node (n) != root_node: didn't find the root, invalid lineage
            if n != root_node:
                return []
            else:
                return lin

    def name(self, node: str):
        """
        Returns name of a given node.
        """
        if node in self._names:
            return self._names[node]
        else:
            return self.undefined_name

    def name_lineage(self, node: str, root_node: str = None, ranks: list = None):
        """
        Returns a list with the name lineage of a given node.
        """
        return list(map(self.name,
                        self.lineage(node=node,
                                     root_node=root_node,
                                     ranks=ranks)))

    def nodes_rank(self, rank: str):
        """
        Returns list of nodes of a given rank.
        """
        # Setup on first use
        if not self._rank_nodes:
            self._rank_nodes = reverse_dict(self._ranks)
        return self._rank_nodes.get(rank, [])

    def parent(self, node: str):
        """
        Returns the direct parent node of a given node.
        """
        if node in self._nodes:
            return self._nodes[node]
        else:
            return self.undefined_node

    def parent_rank(self, node: str, rank: str):
        """
        Returns the parent node of a given node in the specified rank.
        """
        parent = self.lineage(node=node, ranks=[rank])
        return parent[0] if parent else self.undefined_node

    def prune(self, nodes: list):
        """
        Prunes branches of the tree under the given nodes.
        Deletes built lineages and translations.

        Raises ValueError if a given node is not on the tree.
        """

        if isinstance(nodes, str):
            nodes = [nodes]

        del_nodes = set()
        for node in nodes:
            if node not in self._nodes:
                raise ValueError("Node [" + node + "] not found.")
            for leaf in self.leaves(node):
                # [1:] keeps the given node itself, removing only what is below
                for n in self.lineage(leaf, root_node=node)[1:]:
                    del_nodes.add(n)

        for n in del_nodes:
            self._remove(n)

        self._reset_aux_data()

    def rank(self, node: str):
        """
        Returns the rank of a given node.
        """
        if node in self._ranks:
            return self._ranks[node]
        else:
            return self.undefined_rank

    def rank_lineage(self, node: str, root_node: str = None, ranks: list = None):
        """
        Returns a list with the rank lineage of a given node.
        """
        return list(map(self.rank,
                        self.lineage(node=node,
                                     root_node=root_node,
                                     ranks=ranks)))

    def remove(self, node: str, check_consistency: bool = False):
        """
        Removes node from taxonomy. Can break the tree if a parent node is removed. To remove a certain branch, use prune.
        Running check consistency after removing a node is recommended.
        Deletes built lineages and translations.
        """
        if node not in self._nodes:
            raise ValueError("Node [" + node + "] not found.")
        self._remove(node)
        self._reset_aux_data()
        if check_consistency:
            self.check_consistency()

    def search_name(self, text: str, rank: str = None, exact: bool = True):
        """
        Search node by exact or partial name

        Parameters:
        * **text** *[str]*: Text to search.
        * **rank** *[str]*: Filter results by rank.
        * **exact** *[bool]*: Exact or partial name search (both case sensitive).

        Returns: list of matching nodes
        """
        # Setup on first use
        if not self._name_nodes:
            self._name_nodes = reverse_dict(self._names)

        if exact:
            ret = self._exact_name(text, self._name_nodes)
        else:
            ret = self._partial_name(text, self._name_nodes)

        # Only return nodes of chosen rank
        if rank:
            return filter_function(ret, self.rank, rank)
        else:
            return ret

    def stats(self):
        """
        Returns a dict with general numbers of the taxonomic tree

        Example:

            from pprint import pprint
            from multitax import GtdbTx
            tax = GtdbTx()

            pprint(tax.stats())
            {'leaves': 30238,
             'names': 42739,
             'nodes': 42739,
             'ranked_leaves': Counter({'species': 30238}),
             'ranked_nodes': Counter({'species': 30238,
                                      'genus': 8778,
                                      'family': 2323,
                                      'order': 930,
                                      'class': 337,
                                      'phylum': 131,
                                      'domain': 1,
                                      'root': 1}),
             'ranks': 42739}
        """
        s = {}
        s["nodes"] = len(self._nodes)
        s["ranks"] = len(self._ranks)
        s["names"] = len(self._names)
        all_leaves = self.leaves(self.root_node)
        s["leaves"] = len(all_leaves)
        s["ranked_nodes"] = Counter(self._ranks.values())
        s["ranked_leaves"] = Counter(map(self.rank, all_leaves))

        return s

    def translate(self, node: str):
        """
        Returns the translated node from another taxonomy. Translated nodes are generated with the build_translation function.
        """
        if node in self._translated_nodes:
            return self._translated_nodes[node]
        else:
            return []

    def write(self,
              output_file: str,
              cols: list = ["node", "parent", "rank", "name"],
              sep: str = "\t",
              sep_multi: str = "|",
              ranks: list = None,
              gz: bool = False):
        """
        Writes loaded taxonomy to a file.

        Parameters:
        * **cols** *[list]*: Options: "node", "latest", "parent", "rank", "name", "leaves", "children", "lineage", "rank_lineage", "name_lineage"
        * **sep** *[str]*: Separator of fields
        * **sep_multi** *[str]*: Separator of multi-valued fields
        * **ranks** *[list]*: Ranks to report
        * **gz** *[bool]*: Gzip output

        Returns: None

        Raises ValueError if a field in cols is not a valid option.
        """
        import gzip

        if gz and not output_file.endswith(".gz"):
            output_file = output_file + ".gz"
        check_no_file(output_file)

        write_field = {"node": lambda node: node,
                       "latest": self.latest,
                       "parent": self.parent,
                       "rank": self.rank,
                       "name": self.name,
                       "leaves": lambda node: join_check(self.leaves(node), sep_multi),
                       "children": lambda node: join_check(self.children(node), sep_multi),
                       "lineage": lambda node: join_check(self.lineage(node, ranks=ranks), sep_multi),
                       "rank_lineage": lambda node: join_check(self.rank_lineage(node, ranks=ranks), sep_multi),
                       "name_lineage": lambda node: join_check(self.name_lineage(node, ranks=ranks), sep_multi)}

        # Validate requested fields before creating the output file, so an
        # invalid field does not leave an empty file and an open handle behind
        for c in cols:
            if c not in write_field:
                raise ValueError(
                    "Field [" + c + "] is not valid. Options: " + ",".join(write_field))

        opener = gzip.open(output_file, "wt") if gz else open(output_file, "w")
        # Context manager closes the file even if writing a row fails
        with opener as outf:
            if ranks:
                for rank in ranks:
                    for node in self.nodes_rank(rank):
                        print(*[write_field[c](node)
                                for c in cols], sep=sep, end="\n", file=outf)
            else:
                for node in self._nodes:
                    print(*[write_field[c](node)
                            for c in cols], sep=sep, end="\n", file=outf)
696 |
--------------------------------------------------------------------------------
/multitax/ncbitx.py:
--------------------------------------------------------------------------------
1 | from .multitax import MultiTax
2 | from .utils import filter_function
3 | from .utils import check_file
4 | from .utils import open_files
5 | from .utils import download_files
6 | import warnings
7 |
8 |
class NcbiTx(MultiTax):
    # NCBI taxonomy, parsed either from a single taxdump ".tar.gz" archive or
    # from separate files given in the order nodes.dmp [names.dmp [merged.dmp]].
    _default_urls = ["https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz"]

    def __init__(self, **kwargs):
        # Containers are created before delegating to the parent constructor.
        # NOTE(review): presumably MultiTax.__init__ triggers _parse(), which
        # fills them -- confirm against MultiTax.
        self._merged = {}
        self._extended_name_nodes = {}
        super().__init__(**kwargs)

    def __repr__(self):
        stats = ['{}={}'.format(k, repr(v)) for (k, v) in self.stats().items()]
        return 'NcbiTx({})'.format(', '.join(stats))

    def _build_translation(self, target_tax, files: list = None, urls: list = None):
        """
        Build a mapping {ncbi node: set of target nodes} towards **target_tax**.

        Only GtdbTx is supported as target; any other target emits a warning
        and yields an empty mapping.

        Parameters:
        * **target_tax**: Taxonomy to translate to.
        * **files** *[list]*: Local GTDB metadata files (take precedence over urls).
        * **urls** *[list]*: Alternative urls for the GTDB metadata files.

        Returns: dict {node: set(translated nodes)}
        """
        translated_nodes = {}
        if target_tax.__class__.__name__ != "GtdbTx":
            warnings.warn("Translation between taxonomies [" + self.__class__.__name__ +
                          "," + target_tax.__class__.__name__ + "] not yet implemented.")
            return translated_nodes

        if files:
            fhs = open_files(files)
        else:
            _urls = ["https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/ar53_metadata.tsv.gz",
                     "https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/bac120_metadata.tsv.gz"]
            fhs = download_files(
                urls=urls if urls else _urls, retry_attempts=3)

        # Fixed column positions in the metadata table.
        # NOTE(review): these are hard-coded; confirm they match the GTDB
        # release being parsed.
        accession_col = 0
        gtdb_taxonomy_col = 19
        ncbi_taxid_col = 80

        try:
            for fh in fhs.values():
                for line in fh:
                    # Streams opened from urls yield bytes, local files yield str
                    if isinstance(line, bytes):
                        line = line.decode()
                    fields = line.rstrip().split('\t')

                    # skip header
                    if fields[accession_col] == "accession":
                        continue

                    # Build GTDB lineage from leaf (species on given lineage)
                    # to accommodate possible changes in the loaded tax
                    gtdb_leaf_node = fields[gtdb_taxonomy_col].split(";")[-1]
                    if gtdb_leaf_node == target_tax.undefined_node:
                        continue
                    gtdb_nodes = target_tax.lineage(gtdb_leaf_node, ranks=[
                        "domain", "phylum", "class", "order",
                        "family", "genus", "species"])

                    # Build NCBI lineage from leaf
                    ncbi_leaf_node = self.latest(fields[ncbi_taxid_col])
                    if ncbi_leaf_node == self.undefined_node:
                        continue
                    # Additionally connect the leaf to the GTDB species,
                    # since the NCBI leaf could represent a strain, etc.
                    translated_nodes.setdefault(
                        ncbi_leaf_node, set()).add(gtdb_leaf_node)
                    ncbi_nodes = self.lineage(ncbi_leaf_node, ranks=[
                        "superkingdom", "phylum", "class", "order",
                        "family", "genus", "species"])

                    # Match ranks position by position
                    for ncbi_n, gtdb_n in zip(ncbi_nodes, gtdb_nodes):
                        if gtdb_n != target_tax.undefined_node and ncbi_n != self.undefined_node:
                            translated_nodes.setdefault(
                                ncbi_n, set()).add(gtdb_n)
        finally:
            # Handles were opened by this method, so release them here
            for fh in fhs.values():
                fh.close()

        return translated_nodes

    def _parse(self, fhs, **kwargs):
        """
        Parse the given file handlers ({source: handler}, in input order).

        A single source ending with ".tar.gz" is treated as a taxdump archive;
        otherwise the handlers are taken as nodes.dmp [names.dmp [merged.dmp]].

        Returns: tuple (nodes, ranks, names)
        """
        extended_names = kwargs.get("extended_names", False)
        fhs_list = list(fhs.values())
        # One element tar.gz -> taxdump.tar.gz
        if len(fhs_list) == 1 and list(fhs)[0].endswith(".tar.gz"):
            nodes, ranks, names, self._merged = self._parse_taxdump(
                fhs_list[0], extended_names=extended_names)
        else:
            # nodes.dmp
            nodes, ranks = self._parse_nodes(fhs_list[0])

            # [names.dmp]
            if len(fhs) >= 2:
                names = self._parse_names(
                    fhs_list[1], extended_names=extended_names)
            else:
                names = {}

            # [merged.dmp]
            if len(fhs) == 3:
                self._merged = self._parse_merged(fhs_list[2])
        return nodes, ranks, names

    def _parse_merged(self, fh):
        """
        Parse merged.dmp lines (str or bytes) into {old taxid: new taxid}.
        """
        merged = {}
        for line in fh:
            if isinstance(line, bytes):
                line = line.decode()
            old_taxid, _, new_taxid, _ = line.split('\t', 3)
            merged[old_taxid] = new_taxid
        return merged

    def _parse_names(self, fh, extended_names):
        """
        Parse names.dmp lines (str or bytes) into {node: scientific name}.

        When **extended_names** is set, every other name class is collected
        into self._extended_name_nodes as {name: [nodes]}.
        """
        names = {}
        for line in fh:
            if isinstance(line, bytes):
                line = line.decode()
            node, name, _, name_class = line.split('\t|\t')
            if name_class.replace('\t|\n', '') == "scientific name":
                names[node] = name
            elif extended_names:
                self._extended_name_nodes.setdefault(name, []).append(node)

        return names

    def _parse_nodes(self, fh):
        """
        Parse nodes.dmp lines (str or bytes).

        Returns: tuple ({taxid: parent taxid}, {taxid: rank})
        """
        nodes = {}
        ranks = {}
        for line in fh:
            if isinstance(line, bytes):
                line = line.decode()
            taxid, parent_taxid, rank, _ = line.split('\t|\t', 3)
            ranks[taxid] = rank
            nodes[taxid] = parent_taxid
        return nodes, ranks

    def _parse_taxdump(self, fh_taxdump, extended_names):
        """
        Parse nodes.dmp, names.dmp and merged.dmp out of an open tar archive.

        Returns: tuple (nodes, ranks, names, merged)
        """
        with fh_taxdump.extractfile('nodes.dmp') as fh_nodes:
            nodes, ranks = self._parse_nodes(fh_nodes)
        with fh_taxdump.extractfile('names.dmp') as fh_names:
            names = self._parse_names(fh_names, extended_names=extended_names)
        with fh_taxdump.extractfile('merged.dmp') as fh_merged:
            merged = self._parse_merged(fh_merged)
        return nodes, ranks, names, merged

    def latest(self, node: str):
        # Fall back to the merged.dmp entry when the parent lookup is undefined
        n = super().latest(node)
        if n == self.undefined_node:
            n = self.merged(node)
        return n

    def merged(self, node: str):
        """
        Returns relative entry from the merged.dmp file of a given node.
        """
        return self._merged.get(node, self.undefined_node)

    def search_name(self, text: str, rank: str = None, exact: bool = True, force_extended: bool = False):
        """
        Search node by exact or partial name.

        Default order (can be skipped with **force_extended=True**):

        1) Search names defined as "scientific name" on names.dmp

        2) If nothing was found, search text in all other categories (must be activated with NcbiTx(**extended_names=True**))

        Parameters:
        * **text** *[str]*: Text to search.
        * **rank** *[str]*: Filter results by rank.
        * **exact** *[bool]*: Exact or partial name search (both case sensitive).
        * **force_extended** *[bool]*: Search for text in all categories at once.

        Returns: list of matching nodes
        """
        n = super().search_name(text, rank=rank, exact=exact)
        if n and not force_extended:
            return n

        if exact:
            ret = self._exact_name(text, self._extended_name_nodes)
        else:
            ret = self._partial_name(text, self._extended_name_nodes)

        # Only return nodes of chosen rank
        if rank:
            ret = filter_function(ret, self.rank, rank)

        # Merge both result lists without duplicates
        return list(set(n + ret))

    def stats(self):
        # Extend the parent stats with NCBI specific counters (only if loaded)
        s = super().stats()
        if self._merged:
            s["merged"] = len(self._merged)
        if self._extended_name_nodes:
            s["extended_names"] = len(self._extended_name_nodes)
        return s
211 |
--------------------------------------------------------------------------------
/multitax/otttx.py:
--------------------------------------------------------------------------------
1 | from .multitax import MultiTax
2 | from .utils import filter_function
3 | import warnings
4 |
5 |
class OttTx(MultiTax):
    # Open Tree of Life taxonomy, parsed either from a single ".tgz" archive
    # or from separate files given in the order
    # taxonomy.tsv [forwards.tsv [synonyms.tsv]].
    _default_urls = ["http://files.opentreeoflife.org/ott/ott3.4/ott3.4.tgz"]
    _default_root_node = "805080"

    def __init__(self, **kwargs):
        # Containers are created before delegating to the parent constructor.
        # NOTE(review): presumably MultiTax.__init__ triggers _parse(), which
        # fills them -- confirm against MultiTax.
        self._forwards = {}
        self._extended_name_nodes = {}
        super().__init__(**kwargs)

    def __repr__(self):
        stats = ['{}={}'.format(k, repr(v)) for (k, v) in self.stats().items()]
        return 'OttTx({})'.format(', '.join(stats))

    def _build_translation(self, target_tax, files: list = None, urls: list = None):
        """
        Translation from OTT is not implemented: warns and returns an empty dict.
        """
        warnings.warn("Translation between taxonomies [" + self.__class__.__name__ +
                      "," + target_tax.__class__.__name__ + "] not yet implemented.")
        return {}

    def _parse(self, fhs, **kwargs):
        """
        Parse the given file handlers ({source: handler}, in input order).

        A single source ending with ".tgz" is treated as the OTT archive;
        otherwise the handlers are taken as taxonomy.tsv [forwards.tsv [synonyms.tsv]].

        Returns: tuple (nodes, ranks, names)
        """
        extended_names = kwargs.get("extended_names", False)
        fhs_list = list(fhs.values())
        if len(fhs_list) == 1 and list(fhs)[0].endswith(".tgz"):
            nodes, ranks, names = self._parse_ott(
                fhs_list[0], extended_names=extended_names)
        else:
            # taxonomy.tsv
            nodes, ranks, names = self._parse_taxonomy(fhs_list[0])
            # [forwards.tsv]
            if len(fhs) >= 2:
                self._forwards = self._parse_forwards(fhs_list[1])
            # [synonyms.tsv]
            if len(fhs) == 3 and extended_names:
                self._extended_name_nodes = self._parse_synonyms(fhs_list[2])

        return nodes, ranks, names

    def _parse_forwards(self, fh):
        """
        Parse forwards.tsv lines (str or bytes) into {old taxid: new taxid}.
        """
        forwards = {}
        # skip first line header (an empty file simply yields no entries)
        next(fh, None)
        for line in fh:
            if isinstance(line, bytes):
                line = line.decode()
            old_taxid, new_taxid = line.rstrip().split('\t')
            forwards[old_taxid] = new_taxid
        return forwards

    def _parse_ott(self, fh_taxdump, extended_names):
        """
        Parse taxonomy.tsv, forwards.tsv and (optionally) synonyms.tsv out of
        an open tar archive. Members are located by file name suffix, since
        they sit inside a folder in the archive.

        Raises: FileNotFoundError if the archive has no taxonomy.tsv

        Returns: tuple (nodes, ranks, names)
        """
        tax = fwr = syn = None
        # Get files inside folder by name
        for e in fh_taxdump.getnames():
            if e.endswith("taxonomy.tsv"):
                tax = e
            if e.endswith("forwards.tsv"):
                fwr = e
            if e.endswith("synonyms.tsv"):
                syn = e

        if tax is None:
            raise FileNotFoundError("taxonomy.tsv not found in archive")

        with fh_taxdump.extractfile(tax) as fh_nodes:
            nodes, ranks, names = self._parse_taxonomy(fh_nodes)
        # forwards/synonyms are optional, as in the plain-files code path
        if fwr is not None:
            with fh_taxdump.extractfile(fwr) as fh_forwards:
                self._forwards = self._parse_forwards(fh_forwards)
        if extended_names and syn is not None:
            with fh_taxdump.extractfile(syn) as fh_synonyms:
                self._extended_name_nodes = self._parse_synonyms(fh_synonyms)
        return nodes, ranks, names

    def _parse_synonyms(self, fh):
        """
        Parse synonyms.tsv lines (str or bytes) into {name: [taxids]}.
        """
        synonyms = {}
        # skip first line header (an empty file simply yields no entries)
        next(fh, None)
        for line in fh:
            if isinstance(line, bytes):
                line = line.decode()
            name, taxid, _ = line.split('\t|\t', 2)
            synonyms.setdefault(name, []).append(taxid)

        return synonyms

    def _parse_taxonomy(self, fh):
        """
        Parse taxonomy.tsv lines (str or bytes).

        Returns: tuple ({taxid: parent taxid}, {taxid: rank}, {taxid: name})
        """
        nodes = {}
        ranks = {}
        names = {}
        # skip first line header (an empty file simply yields no entries)
        next(fh, None)
        for line in fh:
            if isinstance(line, bytes):
                line = line.decode()
            taxid, parent_taxid, name, rank, _ = line.split('\t|\t', 4)
            ranks[taxid] = rank
            nodes[taxid] = parent_taxid
            names[taxid] = name
        return nodes, ranks, names

    def forwards(self, node: str):
        """
        Returns relative entry from the forwards.tsv file of a given node.
        """
        return self._forwards.get(node, self.undefined_node)

    def latest(self, node: str):
        # Fall back to the forwards.tsv entry when the parent lookup is undefined
        n = super().latest(node)
        if n == self.undefined_node:
            n = self.forwards(node)
        return n

    def search_name(self, text: str, rank: str = None, exact: bool = True, force_extended: bool = False):
        """
        Search node by exact or partial name.

        Default order (can be skipped with **force_extended=True**):

        1) Search default names defined on "taxonomy.tsv"

        2) If nothing was found, search in all other names defined on "synonyms.tsv" (must be activated with OttTx(**extended_names=True**))

        Parameters:
        * **text** *[str]*: Text to search.
        * **rank** *[str]*: Filter results by rank.
        * **exact** *[bool]*: Exact or partial name search (both case sensitive).
        * **force_extended** *[bool]*: Search for text in all categories at once.

        Returns: list of matching nodes
        """
        n = super().search_name(text, rank=rank, exact=exact)
        if n and not force_extended:
            return n

        if exact:
            ret = self._exact_name(text, self._extended_name_nodes)
        else:
            ret = self._partial_name(text, self._extended_name_nodes)

        # Only return nodes of chosen rank
        if rank:
            ret = filter_function(ret, self.rank, rank)

        # Merge both result lists without duplicates
        return list(set(n + ret))

    def stats(self):
        # Extend the parent stats with OTT specific counters (only if loaded)
        s = super().stats()
        if self._forwards:
            s["forwards"] = len(self._forwards)
        if self._extended_name_nodes:
            s["extended_names"] = len(self._extended_name_nodes)
        return s
157 |
--------------------------------------------------------------------------------
/multitax/silvatx.py:
--------------------------------------------------------------------------------
1 | from .multitax import MultiTax
2 | import warnings
3 |
4 |
class SilvaTx(MultiTax):
    # SILVA taxonomy, parsed from the tab-separated taxonomy export.
    _default_urls = [
        "https://www.arb-silva.de/fileadmin/silva_databases/current/Exports/taxonomy/tax_slv_ssu_138.1.txt.gz"]

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def __repr__(self):
        stats = ['{}={}'.format(k, repr(v)) for (k, v) in self.stats().items()]
        return 'SilvaTx({})'.format(', '.join(stats))

    def _build_translation(self, target_tax, files: list = None, urls: list = None):
        """
        Translation from SILVA is not implemented: warns and returns an empty dict.
        """
        warnings.warn("Translation between taxonomies [" + self.__class__.__name__ +
                      "," + target_tax.__class__.__name__ + "] not yet implemented.")
        return {}

    def _parse(self, fhs, **kwargs):
        """
        Parse the given file handlers ({source: handler}).

        Each line holds "name lineage<TAB>taxid<TAB>rank<TAB>...", where the
        lineage is a ";"-separated (and ";"-terminated) list of names. Parent
        links are rebuilt by looking up each lineage prefix.

        Returns: tuple (nodes, ranks, names)
        """
        nodes = {}
        ranks = {}
        names = {}

        # {lineage without trailing ";": taxid}
        lin = {}
        for fh in fhs.values():
            for line in fh:
                # Streams opened from urls yield bytes, local files yield str
                if isinstance(line, bytes):
                    line = line.decode()
                name_lineage, taxid, rank, _ = line.split('\t', 3)
                # Remove last char ";"
                lineage = name_lineage[:-1]
                name = lineage.split(";")[-1]
                # Save lineage to build tree
                lin[lineage] = taxid
                names[taxid] = name
                ranks[taxid] = rank

        # Build parent node connection
        for lineage, taxid in lin.items():
            t = taxid
            ancestors = lineage.split(";")[:-1]
            while ancestors:
                parent_taxid = lin[";".join(ancestors)]
                if t not in nodes:
                    nodes[t] = parent_taxid
                t = parent_taxid
                ancestors.pop()  # move one level up
            # Connect last node to root
            if t not in nodes:
                nodes[t] = self._default_root_node

        return nodes, ranks, names
56 |
--------------------------------------------------------------------------------
/multitax/utils.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | import io
3 | import os
4 | import tarfile
5 | import urllib.request
6 | import zlib
7 | import warnings
8 | from collections import OrderedDict
9 | from urllib.error import HTTPError
10 |
11 |
def check_dir(prefix: str):
    """
    Ensure the directory that would contain **prefix** exists.

    Parameters:
    * **prefix** *[str]*: File path or prefix to check

    Raises: NotADirectoryError with the absolute parent path if it is missing
    """
    parent = os.path.dirname(os.path.abspath(prefix))
    if os.path.exists(parent):
        return
    raise NotADirectoryError(parent)
16 |
17 |
def check_file(file: str):
    """
    Ensure **file** is an existing, non-empty regular file.

    Parameters:
    * **file** *[str]*: Path to check

    Raises: FileNotFoundError if the file is missing or has size zero
    """
    is_present = os.path.isfile(file)
    if not is_present:
        raise FileNotFoundError(file + " file do not exist")
    if not os.path.getsize(file):
        raise FileNotFoundError(file + " file is empty")
23 |
24 |
def check_no_file(file: str):
    """
    Ensure **file** does not exist yet (guards against overwriting).

    Parameters:
    * **file** *[str]*: Path to check

    Raises: FileExistsError if a regular file is already at that path
    """
    already_there = os.path.isfile(file)
    if already_there:
        raise FileExistsError(file)
28 |
29 |
def close_files(fhs: dict):
    """
    Close every file handler of the given mapping.

    Parameters:
    * **fhs** *[dict]*: {file: file handler}

    Returns: Nothing
    """
    for source in fhs:
        fhs[source].close()
39 |
40 |
def download_files(urls: list, output_prefix: str = None, retry_attempts: int = 1):
    """
    Download and open files (memory/stream) or write to disk (multitax.utils.save_urls)

    Parameters:
    * **urls** *[list]*: List of files to download (text, ".gz", ".tar.gz", ".tgz")
    * **output_prefix** *[str]*: Output directory to save files
    * **retry_attempts** *[int]*: Number of download attempts before giving up

    Raises: Exception if no attempt succeeded (the last download error is chained as cause)

    Returns:
    * OrderedDict {file: file handler} (same order as input)
    """
    if isinstance(urls, str):
        urls = [urls]

    last_error = None
    for att in range(1, retry_attempts + 1):
        # Fresh mapping per attempt so failed attempts can be cleaned up
        fhs = OrderedDict()
        try:
            # If output is provided, save files and parse from disc
            if output_prefix:
                files = save_urls(urls, output_prefix)
                return open_files(files)

            # stream contents from url
            for url in urls:
                if url.endswith(".tar.gz") or url.endswith(".tgz"):
                    # tar files have mixed headers and content
                    # whole file should be loaded in memory first and not streamed
                    fhs[url] = tarfile.open(
                        fileobj=load_url_mem(url), mode='r:gz')
                elif url.endswith(".gz"):
                    fhs[url] = gzip.open(
                        urllib.request.urlopen(url), mode="rb")
                    fhs[url].peek(1)  # peek into file to check if is valid
                else:
                    fhs[url] = urllib.request.urlopen(url)

            return fhs
        except Exception as err:
            # Do not leak handlers opened before the failure
            for fh in fhs.values():
                fh.close()
            # Only download/decompression errors are retried, others propagate
            if not isinstance(err, (HTTPError, zlib.error, tarfile.TarError)):
                raise
            last_error = err
            warnings.warn(
                "Download failed, trying again (" + str(att) + "/" + str(retry_attempts) + ")", UserWarning)

    raise Exception("One or more files could not be downloaded: " +
                    ", ".join(urls)) from last_error
86 |
87 |
def filter_function(elements, function, value):
    """
    Keep the elements for which function(element) equals **value**.

    Returns: list of matching elements (input order preserved)
    """
    kept = []
    for element in elements:
        if function(element) == value:
            kept.append(element)
    return kept
90 |
91 |
def join_check(elements, sep: str):
    """
    Join the string form of **elements** with **sep**.

    Returns: joined str, or "" when **elements** is empty/None
    """
    if not elements:
        return ""
    return sep.join(str(element) for element in elements)
97 |
98 |
def load_url_mem(url: str):
    """
    Load the full content of a url into memory, giving a seekable object
    (needed e.g. by tarfile, which cannot read from a plain stream).

    Parameters:
    * **url** *[str]*: URL to load into memory

    Returns:
    * io.BytesIO of the requested url (positioned at the start)
    """
    # From https://stackoverflow.com/questions/18623842/read-contents-tarfile-into-python-seeking-backwards-is-not-allowed
    tmpfile = io.BytesIO()
    # Context manager guarantees the connection is closed even if a read fails
    with urllib.request.urlopen(url) as urlstream:
        while True:
            chunk = urlstream.read(io.DEFAULT_BUFFER_SIZE)
            if not chunk:
                break
            tmpfile.write(chunk)
    tmpfile.seek(0)
    return tmpfile
118 |
119 |
def open_files(files: list):
    """
    Open local files, choosing the reader by file extension.

    Parameters:
    * **files** *[list]*: List of files to open (text, ".gz", ".tar.gz", ".tgz"); a single str is also accepted

    Returns:
    * OrderedDict {file: file handler} (same order as input)
    """
    # Accept a single path, same as download_files does for urls
    if isinstance(files, str):
        files = [files]

    fhs = OrderedDict()
    try:
        for file in files:
            if file.endswith((".tar.gz", ".tgz")):
                fhs[file] = tarfile.open(file, mode='r:gz')
            elif file.endswith(".gz"):
                fhs[file] = gzip.open(file, "rt")
            else:
                fhs[file] = open(file, "r")
    except Exception:
        # Do not leak the handlers opened before the failing file
        for fh in fhs.values():
            fh.close()
        raise
    return fhs
138 |
139 |
def reverse_dict(d: dict):
    """
    Invert a mapping, grouping the keys that share the same value.

    Parameters:
    * **d** *[dict]*: {key: value}

    Returns: dict {value: [keys]} (keys in iteration order of **d**)
    """
    grouped = {}
    for key, val in d.items():
        grouped.setdefault(val, []).append(key)
    return grouped
147 |
148 |
def save_urls(urls: list, output_prefix: str):
    """
    Download urls and write them to disk, named by the url base name.

    Parameters:
    * **urls** *[list]*: List of urls to download; a single str is also accepted
    * **output_prefix** *[str]*: Output directory to save files (prepended as-is to the file name)

    Raises: FileExistsError (via check_no_file) if a target file already exists

    Returns:
    * list of files saved
    """
    # Accept a single url, same as download_files does
    if isinstance(urls, str):
        urls = [urls]

    files = []
    for url in urls:
        outfile = output_prefix + os.path.basename(url)
        check_no_file(outfile)
        # Both the connection and the output file are closed even on failure
        with urllib.request.urlopen(url) as urlstream, open(outfile, "wb") as f:
            f.write(urlstream.read())
        files.append(outfile)
    return files
168 |
169 |
def warning_on_one_line(message, category, filename, lineno, file=None, line=None):
    """
    Format a warning as a single "file:line: Category: message" line.

    Signature follows warnings.formatwarning; **file** and **line** are unused.
    """
    parts = (filename, lineno, category.__name__, message)
    return '%s:%s: %s: %s\n' % parts


# Install the compact formatter for every warning shown by this process
warnings.formatwarning = warning_on_one_line
175 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
# Build configuration read by the packaging frontend.
[build-system]
# Packages that must be available to build the project.
requires = [
    "setuptools>=42",
    "wheel"
]
# Backend that performs the actual build.
build-backend = "setuptools.build_meta"
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import io
2 | import os
3 | import re
4 |
5 | from setuptools import setup
6 |
# Reuse the README content as the package long description.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

# Package metadata handed to setuptools.
setup(
    name="multitax",
    version="1.3.2",
    url="https://www.github.com/pirovc/multitax",
    license="MIT",
    author="Vitor C. Piro",
    description="Python package to obtain, parse and explore biological and custom taxonomies",
    long_description=long_description,
    # README is markdown, so declare the content type explicitly
    long_description_content_type="text/markdown",
    packages=["multitax"],
    # NOTE(review): python_requires allows >=3.4 while the classifiers below
    # only list 3.9-3.13 -- confirm which range is intended.
    python_requires=">=3.4",
    classifiers=[
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3.9',
        'Programming Language :: Python :: 3.10',
        'Programming Language :: Python :: 3.11',
        'Programming Language :: Python :: 3.12',
        'Programming Language :: Python :: 3.13',
    ],
)
30 |
--------------------------------------------------------------------------------
/tests/multitax/data_minimal/custom.tsv.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirovc/multitax/daed499cec926848a3d5ea2e36545d823e3a4a81/tests/multitax/data_minimal/custom.tsv.gz
--------------------------------------------------------------------------------
/tests/multitax/data_minimal/custom2.tsv.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirovc/multitax/daed499cec926848a3d5ea2e36545d823e3a4a81/tests/multitax/data_minimal/custom2.tsv.gz
--------------------------------------------------------------------------------
/tests/multitax/data_minimal/custom_unit_test.tsv.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirovc/multitax/daed499cec926848a3d5ea2e36545d823e3a4a81/tests/multitax/data_minimal/custom_unit_test.tsv.gz
--------------------------------------------------------------------------------
/tests/multitax/data_minimal/gg.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirovc/multitax/daed499cec926848a3d5ea2e36545d823e3a4a81/tests/multitax/data_minimal/gg.txt.gz
--------------------------------------------------------------------------------
/tests/multitax/data_minimal/gtdb_ar.tsv.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirovc/multitax/daed499cec926848a3d5ea2e36545d823e3a4a81/tests/multitax/data_minimal/gtdb_ar.tsv.gz
--------------------------------------------------------------------------------
/tests/multitax/data_minimal/gtdb_ar_metadata.tsv.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirovc/multitax/daed499cec926848a3d5ea2e36545d823e3a4a81/tests/multitax/data_minimal/gtdb_ar_metadata.tsv.gz
--------------------------------------------------------------------------------
/tests/multitax/data_minimal/gtdb_bac.tsv.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirovc/multitax/daed499cec926848a3d5ea2e36545d823e3a4a81/tests/multitax/data_minimal/gtdb_bac.tsv.gz
--------------------------------------------------------------------------------
/tests/multitax/data_minimal/gtdb_bac_metadata.tsv.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirovc/multitax/daed499cec926848a3d5ea2e36545d823e3a4a81/tests/multitax/data_minimal/gtdb_bac_metadata.tsv.gz
--------------------------------------------------------------------------------
/tests/multitax/data_minimal/ncbi.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirovc/multitax/daed499cec926848a3d5ea2e36545d823e3a4a81/tests/multitax/data_minimal/ncbi.tar.gz
--------------------------------------------------------------------------------
/tests/multitax/data_minimal/ott.tgz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirovc/multitax/daed499cec926848a3d5ea2e36545d823e3a4a81/tests/multitax/data_minimal/ott.tgz
--------------------------------------------------------------------------------
/tests/multitax/data_minimal/silva.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pirovc/multitax/daed499cec926848a3d5ea2e36545d823e3a4a81/tests/multitax/data_minimal/silva.txt.gz
--------------------------------------------------------------------------------
/tests/multitax/integration/test_common.py:
--------------------------------------------------------------------------------
1 | from multitax import GreengenesTx, GtdbTx, NcbiTx, OttTx, SilvaTx, CustomTx
2 | from tests.multitax.utils import setup_dir, uncompress_gzip, uncompress_tar_gzip
3 | import unittest
4 | import os
5 | import sys
6 | import random
7 | import io
8 |
9 |
10 | sys.path.append("tests/multitax/")
11 |
12 |
class TestCommon(unittest.TestCase):
    # Integration tests executed against every taxonomy listed in `taxonomies`,
    # using the small data sets shipped in data_dir.

    tmp_dir = "tests/multitax/integration/tmp_common/"
    data_dir = "tests/multitax/data_minimal/"
    #data_dir = "tests/multitax/data_complete/"

    # {taxonomy key: {"class": taxonomy class, "params": constructor kwargs}}
    # Shared by all tests: must never be modified in place.
    taxonomies = {}
    taxonomies["gtdb"] = {"class": GtdbTx,
                          "params": {"files": [data_dir + "gtdb_ar.tsv.gz",
                                               data_dir + "gtdb_bac.tsv.gz"]}}
    taxonomies["ncbi"] = {"class": NcbiTx,
                          "params": {"files": [data_dir + "ncbi.tar.gz"]}}
    taxonomies["silva"] = {"class": SilvaTx,
                           "params": {"files": [data_dir + "silva.txt.gz"]}}
    taxonomies["ott"] = {"class": OttTx,
                         "params": {"files": [data_dir + "ott.tgz"]}}
    taxonomies["greengenes"] = {"class": GreengenesTx,
                                "params": {"files": [data_dir + "gg.txt.gz"]}}
    taxonomies["custom"] = {"class": CustomTx,
                            "params": {"files": [data_dir + "custom.tsv.gz",
                                                 data_dir + "custom2.tsv.gz"]}}

    @classmethod
    def setUpClass(cls):
        setup_dir(cls.tmp_dir)

    def _new_tax(self, t):
        # Fresh taxonomy instance built with the default test parameters
        return self.taxonomies[t]["class"](**self.taxonomies[t]["params"])

    def test_basic(self):
        """
        Basic test with files
        """
        for t in self.taxonomies:
            with self.subTest(taxonomy=t):
                tax = self._new_tax(t)
                self.assertGreater(tax.stats()["nodes"], 0, t + " failed")

    def test_print(self):
        """
        Test output of printing tax object instance
        """
        for t in self.taxonomies:
            with self.subTest(taxonomy=t):
                tax = self._new_tax(t)
                out = io.StringIO()
                # Restore whatever stream was active (a test runner may have
                # replaced sys.stdout), even if printing fails
                previous_stdout = sys.stdout
                sys.stdout = out
                try:
                    print(tax)
                finally:
                    sys.stdout = previous_stdout
                self.assertTrue(out.getvalue().lower().startswith(t))

    def test_urls(self):
        """
        Using urls instead of files
        """
        for t in self.taxonomies:
            with self.subTest(taxonomy=t):
                # simulate url with "file://" and absolute path
                urls = ["file://" + os.path.abspath(file)
                        for file in self.taxonomies[t]["params"]["files"]]
                tax = self.taxonomies[t]["class"](urls=urls)
                self.assertGreater(
                    tax.stats()["nodes"], 0, t + " failed with urls")

    def test_fail_to_download(self):
        """
        Using wrong urls should fail (using ncbi)
        """
        with self.assertRaises(Exception):
            with self.assertWarns(UserWarning):
                self.taxonomies["ncbi"]["class"](
                    urls=["www.thisisnotawebsite.com/neither/a/file", "fasfafsafasfasf"])

    def test_urls_output_prefix(self):
        """
        Using urls and saving files on disk
        """
        for t in self.taxonomies:
            with self.subTest(taxonomy=t):
                # simulate url with "file://" and absolute path
                urls = ["file://" + os.path.abspath(file)
                        for file in self.taxonomies[t]["params"]["files"]]
                tax = self.taxonomies[t]["class"](
                    urls=urls, output_prefix=self.tmp_dir)
                self.assertGreater(
                    tax.stats()["nodes"], 0, t + " failed with urls and output_prefix")

    def test_gzip_uncompressed(self):
        """
        Using uncompressed gzip files ("gtdb", "silva", "greengenes", "custom")
        """
        for t in self.taxonomies:
            if t not in ["gtdb", "silva", "greengenes", "custom"]:
                continue
            with self.subTest(taxonomy=t):
                uncompressed = []
                for file in self.taxonomies[t]["params"]["files"]:
                    if file.endswith(".gz"):
                        # Same base name without the ".gz" suffix
                        outfile = self.tmp_dir + os.path.basename(file)[:-3]
                        uncompress_gzip(file, outfile)
                        uncompressed.append(outfile)

                if uncompressed:
                    # Check if results are equal with compressed and uncompressed files
                    tax_compressed = self._new_tax(t)
                    tax_uncompressed = self.taxonomies[t]["class"](
                        files=uncompressed)
                    self.assertEqual(tax_compressed.stats(), tax_uncompressed.stats(),
                                     t + " failed with uncompressed files")

    def test_tar_gzip_uncompressed_ncbi(self):
        """
        Using uncompressed tar gzip files for ncbi
        """
        ncbi_class = self.taxonomies["ncbi"]["class"]
        ncbi_params = self.taxonomies["ncbi"]["params"]
        dmp_files = [self.tmp_dir + "nodes.dmp",
                     self.tmp_dir + "names.dmp",
                     self.tmp_dir + "merged.dmp"]

        # Ncbi
        tax_compressed = ncbi_class(**ncbi_params)
        uncompressed_files = uncompress_tar_gzip(
            f=ncbi_params["files"][0], outd=self.tmp_dir)
        self.assertIn("nodes.dmp", uncompressed_files)
        self.assertIn("names.dmp", uncompressed_files)
        self.assertIn("merged.dmp", uncompressed_files)
        tax_uncompressed = ncbi_class(files=dmp_files)
        # Results of compressed and uncompressed should match
        self.assertEqual(tax_uncompressed.stats(), tax_compressed.stats())

        # Ncbi with extended names
        # New dict: the shared class-level params must stay untouched
        ext_params = dict(ncbi_params, extended_names=True)
        tax_compressed = ncbi_class(**ext_params)
        uncompressed_files = uncompress_tar_gzip(
            f=ext_params["files"][0], outd=self.tmp_dir)
        self.assertIn("nodes.dmp", uncompressed_files)
        self.assertIn("names.dmp", uncompressed_files)
        self.assertIn("merged.dmp", uncompressed_files)
        tax_uncompressed = ncbi_class(files=dmp_files, extended_names=True)
        # Results of compressed and uncompressed should match
        self.assertEqual(tax_uncompressed.stats(), tax_compressed.stats())

    def test_tar_gzip_uncompressed_ott(self):
        """
        Using uncompressed tar gzip files for ott
        """
        ott_class = self.taxonomies["ott"]["class"]
        ott_params = self.taxonomies["ott"]["params"]

        # Ott
        tax_compressed = ott_class(**ott_params)
        uncompressed_files = uncompress_tar_gzip(
            f=ott_params["files"][0], outd=self.tmp_dir)
        self.assertIn("taxonomy.tsv", uncompressed_files)
        self.assertIn("forwards.tsv", uncompressed_files)
        tax_uncompressed = ott_class(files=[self.tmp_dir + "taxonomy.tsv",
                                            self.tmp_dir + "forwards.tsv"])
        # Results of compressed and uncompressed should match
        self.assertEqual(tax_uncompressed.stats(), tax_compressed.stats())

        # Ott with extended names (synonyms.tsv)
        # New dict: the shared class-level params must stay untouched
        ext_params = dict(ott_params, extended_names=True)
        tax_compressed = ott_class(**ext_params)
        uncompressed_files = uncompress_tar_gzip(
            f=ext_params["files"][0], outd=self.tmp_dir)
        self.assertIn("taxonomy.tsv", uncompressed_files)
        self.assertIn("forwards.tsv", uncompressed_files)
        self.assertIn("synonyms.tsv", uncompressed_files)
        tax_uncompressed = ott_class(files=[self.tmp_dir + "taxonomy.tsv",
                                            self.tmp_dir + "forwards.tsv",
                                            self.tmp_dir + "synonyms.tsv"],
                                     extended_names=True)
        # Results of compressed and uncompressed should match
        self.assertEqual(tax_uncompressed.stats(), tax_compressed.stats())

    def test_inconsistent(self):
        """
        Test parsing inconsistent taxonomies
        """
        for t in self.taxonomies:
            with self.subTest(taxonomy=t):
                # Delete root
                tax = self._new_tax(t)
                tax.remove(tax.root_node)
                with self.assertRaises(ValueError):
                    tax.check_consistency()

                # Delete random node (parent from random leaf)
                tax = self._new_tax(t)
                tax.remove(tax.parent(random.choice(tax.leaves())))
                with self.assertRaises(ValueError):
                    tax.check_consistency()

                # Delete random leaf (do not generate inconsistency)
                tax = self._new_tax(t)
                tax._remove(random.choice(tax.leaves()))
                self.assertEqual(tax.check_consistency(), None)
--------------------------------------------------------------------------------
/tests/multitax/integration/test_empty.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from multitax.multitax import MultiTax
3 | from multitax import DummyTx
4 |
5 |
6 | class TestDummy(unittest.TestCase):
7 |
8 | def test_multitax(self):
9 | tax = MultiTax()
10 | stats = tax.stats()
11 | # Only root node
12 | self.assertEqual(stats["nodes"], 1)
13 | # No input sources
14 | self.assertFalse(tax.sources)
15 |
16 | def test_dummy(self):
17 | tax = DummyTx()
18 | stats = tax.stats()
19 | # Only root node
20 | self.assertEqual(stats["nodes"], 1)
21 | # No input sources
22 | self.assertFalse(tax.sources)
23 |
--------------------------------------------------------------------------------
/tests/multitax/integration/test_online.py:
--------------------------------------------------------------------------------
1 | from multitax import GreengenesTx, GtdbTx, NcbiTx, OttTx, SilvaTx, CustomTx
2 | from tests.multitax.utils import setup_dir, uncompress_gzip, uncompress_tar_gzip
3 | import unittest
4 | import os
5 | import sys
6 | import random
7 |
8 | sys.path.append("tests/multitax/")
9 |
10 |
11 | @unittest.skip('Skip online by default')
12 | class TestOnline(unittest.TestCase):
13 |
14 | tmp_dir = "tests/multitax/integration/tmp_online/"
15 |
16 | taxonomies = {}
17 | taxonomies["gtdb"] = {"class": GtdbTx}
18 | taxonomies["ncbi"] = {"class": NcbiTx}
19 | taxonomies["silva"] = {"class": SilvaTx}
20 | taxonomies["ott"] = {"class": OttTx}
21 | taxonomies["greengenes"] = {"class": GreengenesTx}
22 | # todo test online custom
23 |
24 | @classmethod
25 | def setUpClass(self):
26 | setup_dir(self.tmp_dir)
27 |
28 | def test_online_default(self):
29 | """
30 | Default test online
31 | """
32 | for t in self.taxonomies:
33 | tax = self.taxonomies[t]["class"]()
34 | self.assertGreater(tax.stats()["nodes"], 0, t + " failed")
35 |
36 | def test_online_output_prefix(self):
37 | """
38 | Saving files on disk
39 | """
40 | for t in self.taxonomies:
41 | tax = self.taxonomies[t]["class"](output_prefix=self.tmp_dir)
42 | self.assertGreater(
43 | tax.stats()["nodes"], 0, t + " failed with urls and output_prefix")
44 |
--------------------------------------------------------------------------------
/tests/multitax/unit/test_functions.py:
--------------------------------------------------------------------------------
1 | from multitax.utils import check_file
2 | from multitax import *
3 | from tests.multitax.utils import setup_dir
4 | import unittest
5 |
6 |
7 | class TestFunctions(unittest.TestCase):
8 | # test data (14 nodes)
9 | #
10 | # rank-1 (root) 1 ___________
11 | # / \ \
12 | # rank-2 2.1 2.2 ______ \
13 | # / \ \ \ \
14 | # rank-3 3.1 3.2 3.4 \ \
15 | # / / \ \ \ \
16 | # rank-4 *4.1 *4.2 *4.3 *4.4 *4.5 *4.6
17 | # / |
18 | # rank-5 *5.1 *5.2
19 | #
20 | # names: 1: Node1, 2.1: Node2.1, ...,5.2: Node5.2
21 |
22 | test_file = "tests/multitax/data_minimal/custom_unit_test.tsv.gz"
23 | tmp_dir = "tests/multitax/unit/tmp_functions/"
24 |
25 | @classmethod
26 | def setUpClass(self):
27 | setup_dir(self.tmp_dir)
28 |
29 | def test_children(self):
30 | """
31 | test children function
32 | """
33 | tax = CustomTx(files=self.test_file)
34 | self.assertCountEqual(tax.children("1"), ["2.1", "2.2", "4.6"])
35 | self.assertCountEqual(tax.children("2.1"), ["3.1", "3.2"])
36 | self.assertCountEqual(tax.children("2.2"), ["3.4", "4.5"])
37 | self.assertCountEqual(tax.children("4.4"), ["5.1", "5.2"])
38 | self.assertCountEqual(tax.children("5.2"), [])
39 | self.assertCountEqual(tax.children("XXX"), [])
40 |
41 | def test_search_name(self):
42 | """
43 | test search_name function
44 | """
45 |
46 | # Exact matches
47 | tax = CustomTx(files=self.test_file)
48 | self.assertCountEqual(tax.search_name("Node1"), ["1"])
49 | self.assertCountEqual(tax.search_name("Node2.1"), ["2.1"])
50 | self.assertCountEqual(tax.search_name("Node5.2"), ["5.2"])
51 | self.assertCountEqual(tax.search_name("Node2."), [])
52 |
53 | # not exact matches
54 | tax = CustomTx(files=self.test_file)
55 | self.assertCountEqual(tax.search_name(
56 | "Node2", exact=False), ["2.1", "2.2"])
57 | self.assertCountEqual(tax.search_name("Node2", exact=True), [])
58 | self.assertCountEqual(tax.search_name("Node1", exact=False), ["1"])
59 | self.assertCountEqual(tax.search_name("NotThere", exact=False), [])
60 |
61 | # Changing root name
62 | tax = CustomTx(files=self.test_file, root_name="AnotherRootName")
63 | self.assertCountEqual(tax.search_name("Node1", exact=False), [])
64 | self.assertCountEqual(tax.search_name(
65 | "AnotherRootName", exact=True), ["1"])
66 | self.assertCountEqual(tax.search_name("Another", exact=False), ["1"])
67 |
68 | # With specific rank
69 | tax = CustomTx(files=self.test_file)
70 | self.assertCountEqual(tax.search_name(
71 | "Node2.1", exact=True, rank="rank-2"), ["2.1"])
72 | self.assertCountEqual(tax.search_name(
73 | "Node4.4", exact=True, rank="rank-4"), ["4.4"])
74 | self.assertCountEqual(tax.search_name(
75 | "Node", exact=False, rank="rank-5"), ["5.1", "5.2"])
76 | self.assertCountEqual(tax.search_name(
77 | "Node2.1", exact=True, rank="rank-3"), [])
78 | self.assertCountEqual(tax.search_name(
79 | "Node4.4", exact=True, rank="rank-1"), [])
80 | self.assertCountEqual(tax.search_name(
81 | "Node5", exact=False, rank="rank-XXX"), [])
82 |
83 | def test_nodes_rank(self):
84 | """
85 | test nodes_rank function
86 | """
87 | tax = CustomTx(files=self.test_file)
88 | self.assertCountEqual(tax.nodes_rank("rank-1"), ["1"])
89 | self.assertCountEqual(tax.nodes_rank("rank-4"),
90 | ["4.1", "4.2", "4.3", "4.4", "4.5", "4.6"])
91 | self.assertCountEqual(tax.nodes_rank("rank-9999"), [])
92 |
93 | def test_parent(self):
94 | """
95 | test parent function
96 | """
97 | tax = CustomTx(files=self.test_file)
98 | self.assertEqual(tax.parent("1"), tax.root_parent)
99 | self.assertEqual(tax.parent("3.2"), "2.1")
100 | self.assertEqual(tax.parent("5.2"), "4.4")
101 | self.assertEqual(tax.parent("PpQqRr"), tax.undefined_node)
102 |
103 | tax = CustomTx(files=self.test_file, undefined_node="NoNode")
104 | self.assertEqual(tax.parent("ABVCDE"), "NoNode")
105 |
106 | def test_rank(self):
107 | """
108 | test rank function
109 | """
110 | tax = CustomTx(files=self.test_file)
111 | self.assertEqual(tax.rank("4.1"), "rank-4")
112 | self.assertEqual(tax.rank("1"), "rank-1")
113 | self.assertEqual(tax.rank("5.2"), "rank-5")
114 | self.assertEqual(tax.rank("what"), tax.undefined_rank)
115 |
116 | tax = CustomTx(files=self.test_file, undefined_rank="NoRank")
117 | self.assertEqual(tax.rank("ABVCDE"), "NoRank")
118 |
119 | def test_name(self):
120 | """
121 | test name function
122 | """
123 | tax = CustomTx(files=self.test_file)
124 | self.assertEqual(tax.name("4.1"), "Node4.1")
125 | self.assertEqual(tax.name("1"), "Node1")
126 | self.assertEqual(tax.name("2.2"), "Node2.2")
127 | self.assertEqual(tax.name("ABVCDE"), tax.undefined_name)
128 |
129 | tax = CustomTx(files=self.test_file, undefined_name="NoName")
130 | self.assertEqual(tax.name("ABVCDE"), "NoName")
131 |
132 | def test_latest(self):
133 | """
134 | test latest function
135 | """
136 | tax = CustomTx(files=self.test_file)
137 | self.assertEqual(tax.latest("4.1"), "4.1")
138 | self.assertEqual(tax.latest("1"), "1")
139 | self.assertEqual(tax.latest("4.6"), "4.6")
140 | self.assertEqual(tax.latest("XxXxXx"), tax.undefined_node)
141 |
142 | def test_leaves(self):
143 | """
144 | test leaves function
145 | """
146 | tax = CustomTx(files=self.test_file)
147 | self.assertCountEqual(
148 | tax.leaves(), ["4.1", "4.2", "4.3", "4.5", "4.6", "5.1", "5.2"])
149 | self.assertCountEqual(tax.leaves(
150 | "1"), ["4.1", "4.2", "4.3", "5.1", "5.2", "4.5", "4.6"])
151 | self.assertCountEqual(tax.leaves("2.2"), ["5.1", "5.2", "4.5"])
152 | self.assertCountEqual(tax.leaves("4.4"), ["5.1", "5.2"])
153 | self.assertCountEqual(tax.leaves("5.1"), ["5.1"])
154 | self.assertCountEqual(tax.leaves("999.999"), [])
155 |
156 | def test_lineage(self):
157 | """
158 | test lineage function
159 | """
160 | tax = CustomTx(files=self.test_file)
161 | # Use only assertEqual instead of assertCountEqual -> order matters
162 | self.assertEqual(tax.lineage("5.2"), ["1", "2.2", "3.4", "4.4", "5.2"])
163 | self.assertEqual(tax.lineage("3.2"), ["1", "2.1", "3.2"])
164 | self.assertEqual(tax.lineage("4.6"), ["1", "4.6"])
165 | self.assertEqual(tax.lineage("1"), ["1"])
166 | self.assertEqual(tax.lineage("9999"), [])
167 |
168 | # with ranks
169 | self.assertEqual(tax.lineage("5.2", ranks=["rank-1", "rank-3", "rank-5"]),
170 | ["1", "3.4", "5.2"])
171 | self.assertEqual(tax.lineage("5.2", ranks=["rank-3", "rank-5", "rank-1"]),
172 | ["3.4", "5.2", "1"])
173 | self.assertEqual(tax.lineage("4.5", ranks=["rank-1"]),
174 | ["1"])
175 | self.assertEqual(tax.lineage("3.2", ranks=["rank-4", "rank-5"]),
176 | [tax.undefined_node, tax.undefined_node])
177 | self.assertEqual(tax.lineage("4.5", ranks=["rank-1", "rank-2", "rank-3", "rank-4", "rank-5"]),
178 | ["1", "2.2", tax.undefined_node, "4.5", tax.undefined_node])
179 | self.assertEqual(tax.lineage("4.6", ranks=["xxxx", "yyy"]),
180 | [tax.undefined_node, tax.undefined_node])
181 | # Invalid lineage
182 | self.assertEqual(tax.lineage("ZZZ", ranks=["xxxx", "yyy"]),
183 | [])
184 |
185 | # with root_node
186 | self.assertEqual(tax.lineage("5.2", root_node="2.2"),
187 | ["2.2", "3.4", "4.4", "5.2"])
188 | self.assertEqual(tax.lineage("4.2", root_node="2.1"),
189 | ["2.1", "3.2", "4.2"])
190 | self.assertEqual(tax.lineage("4.5", root_node="2.2"),
191 | ["2.2", "4.5"])
192 | # Invalid lineage
193 | self.assertEqual(tax.lineage("5.2", root_node="2.1"),
194 | [])
195 | self.assertEqual(tax.lineage("3.1", root_node="4.1"),
196 | [])
197 | self.assertEqual(tax.lineage("XXX", root_node="YYY"),
198 | [])
199 |
200 | # with both
201 | self.assertEqual(tax.lineage("5.2", root_node="2.2", ranks=["rank-3", "rank-4"]),
202 | ["3.4", "4.4"])
203 | self.assertEqual(tax.lineage("5.1", root_node="3.4", ranks=["rank-3", "rank-5"]),
204 | ["3.4", "5.1"])
205 | self.assertEqual(tax.lineage("4.1", root_node="2.1", ranks=["rank-1", "rank-2", "rank-3", "rank-5"]),
206 | [tax.undefined_node, "2.1", "3.1", tax.undefined_node])
207 | self.assertEqual(tax.lineage("4.1", root_node="2.1", ranks=["rank-1", "rank-5"]),
208 | [tax.undefined_node, tax.undefined_node])
209 | self.assertEqual(tax.lineage("4.1", root_node="2.1", ranks=["XXXXX"]),
210 | [tax.undefined_node])
211 | # Invalid lineage
212 | self.assertEqual(tax.lineage("4.1", root_node="5.1", ranks=["rank-1", "rank-2", "rank-3", "rank-5"]),
213 | [])
214 | self.assertEqual(tax.lineage("XXXX", root_node="2.2", ranks=["rank-3", "rank-4"]),
215 | [])
216 |
217 | def test_rank_lineage(self):
218 | """
219 | test rank_lineage function
220 | """
221 | tax = CustomTx(files=self.test_file)
222 | self.assertEqual(tax.rank_lineage("5.2"), [
223 | "rank-1", "rank-2", "rank-3", "rank-4", "rank-5"])
224 | self.assertEqual(tax.rank_lineage("4.6"), ["rank-1", "rank-4"])
225 | self.assertEqual(tax.rank_lineage("1"), ["rank-1"])
226 | self.assertEqual(tax.rank_lineage("9999"), [])
227 |
228 | # with ranks or root_node
229 | self.assertEqual(tax.rank_lineage("5.2", ranks=["rank-1", "rank-3"]),
230 | ["rank-1", "rank-3"])
231 | self.assertEqual(tax.rank_lineage("5.2", ranks=["rank-1", "XXX", "rank-3"]),
232 | ["rank-1", tax.undefined_rank, "rank-3"])
233 | self.assertEqual(tax.rank_lineage("ZZZ", ranks=["rank-1", "XXX", "rank-3"]),
234 | [])
235 | self.assertEqual(tax.rank_lineage("5.2", root_node="2.2"),
236 | ["rank-2", "rank-3", "rank-4", "rank-5"])
237 | self.assertEqual(tax.rank_lineage("5.2", root_node="2.1"),
238 | [])
239 | self.assertEqual(tax.rank_lineage("XXX", root_node="YYY"),
240 | [])
241 |
242 | # with both
243 | self.assertEqual(tax.rank_lineage("5.2", root_node="2.2", ranks=["rank-3", "rank-4"]),
244 | ["rank-3", "rank-4"])
245 | self.assertEqual(tax.rank_lineage("4.1", root_node="2.1", ranks=["rank-1", "rank-2", "rank-3", "rank-5"]),
246 | [tax.undefined_rank, "rank-2", "rank-3", tax.undefined_rank])
247 | self.assertEqual(tax.rank_lineage("4.1", root_node="2.1", ranks=["rank-1", "rank-5"]),
248 | [tax.undefined_rank, tax.undefined_rank])
249 | self.assertEqual(tax.rank_lineage("4.1", root_node="5.1", ranks=["rank-1", "rank-2", "rank-3", "rank-5"]),
250 | [])
251 | self.assertEqual(tax.rank_lineage("XXXX", root_node="ZZZ", ranks=["CCC", "VVV"]),
252 | [])
253 |
254 | def test_name_lineage(self):
255 | """
256 | test name_lineage function
257 | """
258 | tax = CustomTx(files=self.test_file)
259 | self.assertEqual(tax.name_lineage("5.2"), [
260 | "Node1", "Node2.2", "Node3.4", "Node4.4", "Node5.2"])
261 | self.assertEqual(tax.name_lineage("4.6"), ["Node1", "Node4.6"])
262 | self.assertEqual(tax.name_lineage("1"), ["Node1"])
263 | self.assertEqual(tax.name_lineage("9999"), [])
264 |
265 | # with ranks or root_node
266 | self.assertEqual(tax.name_lineage("5.2", ranks=["rank-1", "rank-3"]),
267 | ["Node1", "Node3.4"])
268 | self.assertEqual(tax.name_lineage("5.2", ranks=["rank-1", "XXX", "rank-3"]),
269 | ["Node1", tax.undefined_name, "Node3.4"])
270 | self.assertEqual(tax.name_lineage("ZZZ", ranks=["rank-1", "XXX", "rank-3"]),
271 | [])
272 | self.assertEqual(tax.name_lineage("5.2", root_node="2.2"),
273 | ["Node2.2", "Node3.4", "Node4.4", "Node5.2"])
274 | self.assertEqual(tax.name_lineage("5.2", root_node="2.1"),
275 | [])
276 | self.assertEqual(tax.name_lineage("XXX", root_node="YYY"),
277 | [])
278 |
279 | # with both
280 | self.assertEqual(tax.name_lineage("5.2", root_node="2.2", ranks=["rank-3", "rank-4"]),
281 | ["Node3.4", "Node4.4"])
282 | self.assertEqual(tax.name_lineage("4.1", root_node="2.1", ranks=["rank-1", "rank-2", "rank-3", "rank-5"]),
283 | [tax.undefined_name, "Node2.1", "Node3.1", tax.undefined_name])
284 | self.assertEqual(tax.name_lineage("4.1", root_node="2.1", ranks=["rank-1", "rank-5"]),
285 | [tax.undefined_name, tax.undefined_name])
286 | self.assertEqual(tax.name_lineage("4.1", root_node="5.1", ranks=["rank-1", "rank-2", "rank-3", "rank-5"]),
287 | [])
288 | self.assertEqual(tax.name_lineage("XXXX", root_node="ZZZ", ranks=["CCC", "VVV"]),
289 | [])
290 |
291 | def test_parent_rank(self):
292 | """
293 | test parent_rank function
294 | """
295 | tax = CustomTx(files=self.test_file)
296 | self.assertEqual(tax.parent_rank("5.2", "rank-3"), "3.4")
297 | self.assertEqual(tax.parent_rank("4.1", "rank-2"), "2.1")
298 | self.assertEqual(tax.parent_rank("3.2", "rank-1"), "1")
299 | self.assertEqual(tax.parent_rank("3.2", "rank-4"), tax.undefined_node)
300 | self.assertEqual(tax.parent_rank("2.2", "XXXX"), tax.undefined_node)
301 | self.assertEqual(tax.parent_rank("CCCC", "XXXX"), tax.undefined_node)
302 |
303 | def test_closest_parent(self):
304 | """
305 | test closest_parent function
306 | """
307 | tax = CustomTx(files=self.test_file)
308 | self.assertEqual(tax.closest_parent(
309 | "5.2", ["rank-1", "rank-3"]), "3.4")
310 | self.assertEqual(tax.closest_parent(
311 | "5.2", ["rank-1", "rank-3", "rank-4"]), "4.4")
312 | self.assertEqual(tax.closest_parent(
313 | "5.2", ["rank-1", "rank-3", "rank-4", "rank-5"]), "5.2")
314 | self.assertEqual(tax.closest_parent(
315 | "5.2", ["rank-1", "rank-3", "rank-4", "rank-5", "XXXXX"]), "5.2")
316 | self.assertEqual(tax.closest_parent(
317 | "3.4", ["rank-1", "rank-4", "rank-5"]), "1")
318 | self.assertEqual(tax.closest_parent(
319 | "4.6", ["rank-1", "rank-2", "rank-3", "rank-5"]), "1")
320 | self.assertEqual(tax.closest_parent(
321 | "4.6", ["rank-2", "rank-3", "rank-5"]), tax.undefined_node)
322 | self.assertEqual(tax.closest_parent(
323 | "3.4", ["X", "Y", "Z"]), tax.undefined_node)
324 | self.assertEqual(tax.closest_parent("3.4", []), "3.4")
325 |
326 | def test_stats(self):
327 | """
328 | test stats function
329 | """
330 | tax = CustomTx(files=self.test_file)
331 | stats = tax.stats()
332 | self.assertEqual(stats["nodes"], 14)
333 | self.assertEqual(stats["names"], 14)
334 | self.assertEqual(stats["ranks"], 14)
335 | self.assertEqual(stats["leaves"], 7)
336 | self.assertEqual(len(stats["ranked_nodes"]), 5)
337 | self.assertEqual(sum(stats["ranked_nodes"].values()), stats["nodes"])
338 | self.assertEqual(sum(stats["ranked_leaves"].values()), stats["leaves"])
339 | self.assertCountEqual(list(stats["ranked_leaves"].keys()), [
340 | "rank-4", "rank-5"])
341 |
342 | def test_build_lineages(self):
343 | """
344 | test build_lineages function
345 | """
346 | # build full lineages
347 | tax = CustomTx(files=self.test_file)
348 | self.assertEqual(len(tax._lineages), 0)
349 | tax.build_lineages()
350 | self.assertEqual(len(tax._lineages), 14)
351 | self.assertEqual(tax.lineage("5.2"), ["1", "2.2", "3.4", "4.4", "5.2"])
352 | self.assertEqual(tax.lineage("XXX"), [])
353 | # do not use stored lineage with keyword arguments
354 | self.assertEqual(tax.lineage("5.2", root_node="2.2"),
355 | ["2.2", "3.4", "4.4", "5.2"])
356 | self.assertEqual(tax.lineage(
357 | "5.2", ranks=["rank-2", "rank-4"]), ["2.2", "4.4"])
358 | self.assertEqual(tax.lineage("5.2", root_node="2.2", ranks=[
359 | "rank-2", "rank-4"]), ["2.2", "4.4"])
360 |
361 | # build filtered lineages
362 | tax.clear_lineages()
363 | self.assertEqual(len(tax._lineages), 0)
364 | tax.build_lineages(root_node="2.2", ranks=["rank-2", "rank-4"])
365 | self.assertEqual(len(tax._lineages), 14)
366 | self.assertEqual(tax.lineage("5.2"), ["2.2", "4.4"])
367 | self.assertEqual(tax.lineage("XXX"), [])
368 | # do not use stored lineage with keyword arguments
369 | self.assertEqual(tax.lineage("5.2", root_node="3.4"),
370 | ["3.4", "4.4", "5.2"])
371 | self.assertEqual(tax.lineage("5.2", ranks=[]), [
372 | "1", "2.2", "3.4", "4.4", "5.2"])
373 | self.assertEqual(tax.lineage("5.2", root_node="2.2", ranks=[
374 | "rank-2", "rank-5"]), ["2.2", "5.2"])
375 |
376 | def test_clear_lineages(self):
377 | """
378 | test clear_lineages function
379 | """
380 | tax = CustomTx(files=self.test_file)
381 | self.assertEqual(len(tax._lineages), 0)
382 | tax.build_lineages()
383 | self.assertEqual(len(tax._lineages), 14)
384 | tax.clear_lineages()
385 | self.assertEqual(len(tax._lineages), 0)
386 | self.assertEqual(tax.lineage("5.2"), ["1", "2.2", "3.4", "4.4", "5.2"])
387 | self.assertEqual(tax.lineage("XXX"), [])
388 |
389 | def test_translation(self):
390 | """
391 | test build_translation and translate functions (GTDB<->NCBI)
392 | """
393 | gtdb_tax = GtdbTx(files=["tests/multitax/data_minimal/gtdb_ar.tsv.gz",
394 | "tests/multitax/data_minimal/gtdb_bac.tsv.gz"])
395 | ncbi_tax = NcbiTx(files="tests/multitax/data_minimal/ncbi.tar.gz")
396 |
397 | # GTDB->NCBI
398 | # Should be no translation yet (g__Paenibacillus is contained in both test sets)
399 | self.assertCountEqual(gtdb_tax.translate("g__Paenibacillus"), [])
400 | gtdb_tax.build_translation(ncbi_tax, files=[
401 | "tests/multitax/data_minimal/gtdb_ar_metadata.tsv.gz", "tests/multitax/data_minimal/gtdb_bac_metadata.tsv.gz"])
402 | self.assertCountEqual(gtdb_tax.translate(
403 | "g__Paenibacillus"), ["44249"])
404 |
405 | # NCBI->GTDB
406 | # Should be no translation yet (g__Paenibacillus is contained in both test sets)
407 | self.assertCountEqual(ncbi_tax.translate("44249"), [])
408 | ncbi_tax.build_translation(gtdb_tax, files=[
409 | "tests/multitax/data_minimal/gtdb_ar_metadata.tsv.gz", "tests/multitax/data_minimal/gtdb_bac_metadata.tsv.gz"])
410 | self.assertCountEqual(ncbi_tax.translate("44249"), ["g__Paenibacillus"])
411 |
412 | # Other translations not yet implemented
413 | ott_tax = OttTx(files="tests/multitax/data_minimal/ott.tgz")
414 | silva_tax = SilvaTx(files="tests/multitax/data_minimal/silva.txt.gz")
415 | gg_tax = GreengenesTx(files="tests/multitax/data_minimal/gg.txt.gz")
416 | with self.assertWarns(UserWarning):
417 | ncbi_tax.build_translation(ott_tax)
418 | ncbi_tax.build_translation(silva_tax)
419 | ncbi_tax.build_translation(gg_tax)
420 | gtdb_tax.build_translation(ott_tax)
421 | gtdb_tax.build_translation(silva_tax)
422 | gtdb_tax.build_translation(gg_tax)
423 | ott_tax.build_translation(silva_tax)
424 | ott_tax.build_translation(gg_tax)
425 | ott_tax.build_translation(gtdb_tax)
426 | ott_tax.build_translation(ncbi_tax)
427 | gg_tax.build_translation(ott_tax)
428 | gg_tax.build_translation(silva_tax)
429 | gg_tax.build_translation(gtdb_tax)
430 | gg_tax.build_translation(ncbi_tax)
431 |
432 | def test_check_consistency(self):
433 | """
434 | test check_consistency function
435 | """
436 | tax = CustomTx(files=self.test_file)
437 | self.assertEqual(tax.check_consistency(), None)
438 | # delete node
439 | del tax._nodes["3.4"]
440 | with self.assertRaises(ValueError):
441 | tax.check_consistency()
442 |
443 | tax = CustomTx(files=self.test_file)
444 | # delete leaf node
445 | del tax._nodes["5.2"]
446 | self.assertEqual(tax.check_consistency(), None)
447 |
448 | tax = CustomTx(files=self.test_file)
449 | # delete root
450 | del tax._nodes["1"]
451 | # should raise error
452 | with self.assertRaises(ValueError):
453 | tax.check_consistency()
454 |
455 | def test_filter(self):
456 | """
457 | test filter function
458 | """
459 | # Ancestors
460 | tax = CustomTx(files=self.test_file)
461 | tax.filter("4.5")
462 | self.assertEqual(tax.stats()["nodes"], 3)
463 | self.assertCountEqual(tax.lineage("4.5"), ["1", "2.2", "4.5"])
464 | self.assertCountEqual(tax.leaves("1"), ["4.5"])
465 |
466 | tax = CustomTx(files=self.test_file)
467 | tax.filter(["4.5", "XXXX"])
468 | self.assertEqual(tax.stats()["nodes"], 3)
469 | self.assertCountEqual(tax.lineage("4.5"), ["1", "2.2", "4.5"])
470 | self.assertCountEqual(tax.leaves("1"), ["4.5"])
471 |
472 | tax = CustomTx(files=self.test_file)
473 | tax.filter(["4.1", "5.1", "5.2"])
474 | self.assertEqual(tax.stats()["nodes"], 9)
475 | self.assertCountEqual(tax.lineage("4.1"), ["1", "2.1", "3.1", "4.1"])
476 | self.assertCountEqual(tax.leaves("1"), ["4.1", "5.1", "5.2"])
477 |
478 | tax = CustomTx(files=self.test_file)
479 | tax.filter("XXXX")
480 | self.assertEqual(tax.stats()["nodes"], 1)
481 |
482 | # Descendants
483 | tax = CustomTx(files=self.test_file)
484 | tax.filter("3.4", desc=True)
485 | self.assertEqual(tax.stats()["nodes"], 5)
486 | self.assertCountEqual(tax.lineage("3.4"), ["1", "3.4"])
487 | self.assertCountEqual(tax.leaves("1"), ["5.1", "5.2"])
488 |
489 | tax = CustomTx(files=self.test_file)
490 | tax.filter(["XXXXX", "3.4"], desc=True)
491 | self.assertEqual(tax.stats()["nodes"], 5)
492 | self.assertCountEqual(tax.lineage("3.4"), ["1", "3.4"])
493 | self.assertCountEqual(tax.leaves("1"), ["5.1", "5.2"])
494 |
495 | tax = CustomTx(files=self.test_file)
496 | tax.filter(["3.2", "4.4"], desc=True)
497 | self.assertEqual(tax.stats()["nodes"], 7)
498 | self.assertCountEqual(tax.lineage("5.2"), ["1", "4.4", "5.2"])
499 | self.assertCountEqual(tax.lineage("4.5"), [])
500 | self.assertCountEqual(tax.leaves("1"), ["4.2", "4.3", "5.1", "5.2"])
501 |
502 | tax = CustomTx(files=self.test_file)
503 | self.assertEqual(tax.stats()["nodes"], 14)
504 | tax.filter("XXXXX", desc=True)
505 | self.assertEqual(tax.stats()["nodes"], 1)
506 |
507 | def test_add(self):
508 | """
509 | test add function
510 | """
511 | tax = CustomTx(files=self.test_file)
512 | # Add leaf node 5.3 to parent 4.4
513 | tax.add("5.3", "4.4")
514 | self.assertEqual(tax.check_consistency(), None)
515 | self.assertEqual(tax.parent("5.3"), "4.4")
516 | self.assertEqual(tax.name("5.3"), tax.undefined_name)
517 | self.assertEqual(tax.rank("5.3"), tax.undefined_rank)
518 |
519 | # Add another leaf on the 5.3 with name and rank
520 | tax.add("6.1", "5.3", name="Node6.1", rank="rank-6")
521 | self.assertEqual(tax.check_consistency(), None)
522 | self.assertEqual(tax.parent("6.1"), "5.3")
523 | self.assertEqual(tax.name("6.1"), "Node6.1")
524 | self.assertEqual(tax.rank("6.1"), "rank-6")
525 | self.assertEqual(tax.lineage("6.1"), [
526 | "1", "2.2", "3.4", "4.4", "5.3", "6.1"])
527 |
528 | # Add node without valid parent, raises ValueError
529 | with self.assertRaises(ValueError):
530 | tax.add("6.2", "XXX")
531 |
532 | # Add already existing node
533 | with self.assertRaises(ValueError):
534 | tax.add("5.1", "4.4")
535 |
536 | def test_remove(self):
537 | """
538 | test remove function
539 | """
540 | tax = CustomTx(files=self.test_file)
541 | tax.remove("5.2")
542 | self.assertEqual(tax.latest("5.2"), tax.undefined_node)
543 | self.assertEqual(tax.parent("5.2"), tax.undefined_node)
544 | self.assertEqual(tax.name("5.2"), tax.undefined_node)
545 | self.assertEqual(tax.rank("5.2"), tax.undefined_node)
546 | self.assertEqual(tax.lineage("5.2"), [])
547 |
548 | # Initialize aux structures and clear them after removing node
549 | tax = CustomTx(files=self.test_file, build_name_nodes=True,
550 | build_node_children=True, build_rank_nodes=True)
551 | self.assertNotEqual(len(tax._name_nodes), 0)
552 | self.assertNotEqual(len(tax._node_children), 0)
553 | self.assertNotEqual(len(tax._rank_nodes), 0)
554 | tax.remove("5.2")
555 | self.assertEqual(len(tax._name_nodes), 0)
556 | self.assertEqual(len(tax._node_children), 0)
557 | self.assertEqual(len(tax._rank_nodes), 0)
558 |
559 | # with check_consistency
560 | tax.remove("5.1", check_consistency=True)
561 |
562 | # Removing node that breaks the tree (allowed)
563 | tax.remove("3.1")
564 | # node is removed anyway
565 | self.assertEqual(tax.latest("3.1"), tax.undefined_node)
566 | with self.assertRaises(ValueError):
567 | tax.check_consistency()
568 |
569 | # Removing and raising execption
570 | with self.assertRaises(ValueError):
571 | tax.remove("3.2", check_consistency=True)
572 | # node is removed anyway
573 | self.assertEqual(tax.latest("3.2"), tax.undefined_node)
574 |
575 | # Removing root
576 | tax.remove("1")
577 | with self.assertRaises(ValueError):
578 | tax.check_consistency()
579 |
580 | # Removing node not present
581 | with self.assertRaises(ValueError):
582 | tax.remove("XXX")
583 |
584 | def test_prune(self):
585 | """
586 | test prune function
587 | """
588 | tax = CustomTx(files=self.test_file)
589 |
590 | self.assertCountEqual(tax.leaves("4.4"), ["5.1", "5.2"])
591 | tax.prune("4.4")
592 | self.assertEqual(tax.check_consistency(), None)
593 | self.assertCountEqual(tax.leaves("4.4"), ["4.4"])
594 |
595 | # Prune leaf node (nothing changes)
596 | self.assertCountEqual(tax.leaves("4.6"), ["4.6"])
597 | tax.prune("4.6")
598 | self.assertEqual(tax.check_consistency(), None)
599 | self.assertCountEqual(tax.leaves("4.6"), ["4.6"])
600 |
601 | # Prune multiple overlapping nodes
602 | self.assertCountEqual(tax.leaves("2.1"), ["4.1", "4.2", "4.3"])
603 | self.assertCountEqual(tax.leaves("3.2"), ["4.2", "4.3"])
604 | tax.prune(["2.1", "3.2"])
605 | self.assertEqual(tax.check_consistency(), None)
606 | self.assertCountEqual(tax.leaves("2.1"), ["2.1"])
607 | self.assertCountEqual(tax.leaves("3.2"), [])
608 |
609 | # Restart tax
610 | tax = CustomTx(files=self.test_file)
611 | # Prune multiple overlapping nodes (reversed)
612 | self.assertCountEqual(tax.leaves("2.1"), ["4.1", "4.2", "4.3"])
613 | self.assertCountEqual(tax.leaves("3.2"), ["4.2", "4.3"])
614 | tax.prune(["3.2", "2.1"])
615 | self.assertEqual(tax.check_consistency(), None)
616 | self.assertCountEqual(tax.leaves("2.1"), ["2.1"])
617 | self.assertCountEqual(tax.leaves("3.2"), [])
618 |
619 | # Pruning node not present
620 | with self.assertRaises(ValueError):
621 | tax.prune("XXX")
622 |
623 | # Pruning root node
624 | tax.prune(tax.root_node)
625 | self.assertEqual(len(tax._nodes), 1)
626 |
627 | def test_write(self):
628 | """
629 | test write function
630 | """
631 | tax = CustomTx(files=self.test_file)
632 | outfile = self.tmp_dir + "default.tsv"
633 | tax.write(outfile)
634 | self.assertEqual(check_file(outfile), None)
635 |
636 | tax = CustomTx(files=self.test_file)
637 | outfile = self.tmp_dir + "ranks.tsv"
638 | tax.write(outfile,
639 | ranks=["rank-2", "rank-4"],
640 | cols=["node", "rank", "lineage", "rank_lineage", "name_lineage"])
641 | self.assertEqual(check_file(outfile), None)
642 |
643 | tax = CustomTx(files=self.test_file)
644 | outfile = self.tmp_dir + "all_cols.tsv"
645 | tax.write(outfile,
646 | cols=["node", "latest", "parent", "rank", "name", "leaves", "children", "lineage", "rank_lineage", "name_lineage"])
647 | self.assertEqual(check_file(outfile), None)
648 |
649 | tax = CustomTx(files=self.test_file)
650 | outfile = self.tmp_dir + "sep_comma.tsv"
651 | tax.write(outfile,
652 | sep=",")
653 | self.assertEqual(check_file(outfile), None)
654 |
655 | tax = CustomTx(files=self.test_file)
656 | outfile = self.tmp_dir + "sep_multi_underline.tsv"
657 | tax.write(outfile,
658 | cols=["node", "lineage", "children", "leaves"],
659 | sep_multi="_")
660 | self.assertEqual(check_file(outfile), None)
661 |
662 | def test_ott_forwards(self):
663 | """
664 | Test forwards functionality (ott only)
665 | """
666 | # forwards.tsv
667 | # id replacement
668 | # 5044012 4603004
669 | # 391495 391494
670 |
671 | tax = OttTx(files="tests/multitax/data_minimal/ott.tgz")
672 | self.assertEqual(len(tax._forwards), 2)
673 |
674 | self.assertEqual(tax.parent("5044012"), tax.undefined_node)
675 | self.assertEqual(tax.latest("5044012"), "4603004")
676 | self.assertNotEqual(tax.parent(
677 | tax.latest("5044012")), tax.undefined_node)
678 |
679 | self.assertEqual(tax.parent("391495"), tax.undefined_node)
680 | self.assertEqual(tax.latest("391495"), "391494")
681 | self.assertNotEqual(tax.parent(
682 | tax.latest("391495")), tax.undefined_node)
683 |
684 | def test_ncbi_merged(self):
685 | """
686 | Test merged functionality (ncbi only)
687 | """
688 | # merged.dmp
689 | # 1235230 | 459525 |
690 | # 1235908 | 363999 |
691 |
692 | tax = NcbiTx(files="tests/multitax/data_minimal/ncbi.tar.gz")
693 | self.assertEqual(len(tax._merged), 2)
694 |
695 | self.assertEqual(tax.parent("1235230"), tax.undefined_node)
696 | self.assertEqual(tax.latest("1235230"), "459525")
697 | self.assertNotEqual(tax.parent(
698 | tax.latest("1235230")), tax.undefined_node)
699 |
700 | self.assertEqual(tax.parent("1235908"), tax.undefined_node)
701 | self.assertEqual(tax.latest("1235908"), "363999")
702 | self.assertNotEqual(tax.parent(
703 | tax.latest("1235908")), tax.undefined_node)
704 |
705 | def test_ncbi_extended_names(self):
706 | """
707 | Test extended names functionality (ncbi)
708 | """
709 | # on names.dmp
710 | # 363999 | Xylariaceae sp. 5129 | | includes |
711 | # 363999 | Xylariaceae sp. 5151 | | includes |
712 | # 363999 | Xylariaceae sp. 5228 | | includes |
713 | # 37990 | mitosporic Xylariaceae | | includes |
714 | # 37990 | Xylariaceae | | scientific name |
715 |
716 | tax = NcbiTx(files="tests/multitax/data_minimal/ncbi.tar.gz",
717 | extended_names=False)
718 | tax_ex = NcbiTx(
719 | files="tests/multitax/data_minimal/ncbi.tar.gz", extended_names=True)
720 |
721 | # Exact match on scientific name
722 | self.assertCountEqual(tax.search_name("Xylariaceae"), ["37990"])
723 | self.assertCountEqual(tax_ex.search_name("Xylariaceae"), ["37990"])
724 | # All scientific names
725 | self.assertCountEqual(tax.search_name(
726 | "Xylariaceae", exact=False), ["37990"])
727 | self.assertCountEqual(tax_ex.search_name(
728 | "Xylariaceae", exact=False), ["37990"])
729 | # Exact match on scientific name forcing extended
730 | self.assertCountEqual(tax.search_name("Xylariaceae"), ["37990"])
731 | self.assertCountEqual(tax_ex.search_name(
732 | "Xylariaceae", force_extended=True), ["37990"])
733 | # All names
734 | self.assertCountEqual(tax.search_name(
735 | "Xylariaceae", exact=False), ["37990"])
736 | self.assertCountEqual(tax_ex.search_name(
737 | "Xylariaceae", exact=False, force_extended=True), ["37990", "363999"])
738 | # Exact name available only on extended
739 | self.assertCountEqual(tax.search_name(
740 | "mitosporic Xylariaceae", exact=True), [])
741 | self.assertCountEqual(tax_ex.search_name(
742 | "mitosporic Xylariaceae", exact=True), ["37990"])
743 | # Partial name available only on extended
744 | self.assertCountEqual(tax.search_name(
745 | "Xylariaceae sp.", exact=False), [])
746 | self.assertCountEqual(tax_ex.search_name(
747 | "Xylariaceae sp.", exact=False), ["363999"])
748 |
749 | def test_ott_extended_names(self):
750 | """
751 | Test extended names functionality (ott)
752 | """
753 | # on taxonomy.tsv
754 | # 4622 | 470454 | Haemophilus sp. CCUG 32367 | species | silva:EU909664,ncbi:554010 | | sibling_higher |
755 | # 4621 | 470454 | Haemophilus sp. CCUG 35214 | species | silva:EU909665,ncbi:554011 | | sibling_higher |
756 | # 158636 | 470454 | Haemophilus sp. CCUG 30218 | species | silva:EU909662,ncbi:554007 | | sibling_higher |
757 | # 391494 | 470454 | Haemophilus sp. CCUG 31732 | species | silva:EU909663,ncbi:554009 | | sibling_higher |
758 | # 525972 | 470454 | Haemophilus pittmaniae HK 85 | no rank - terminal | silva:AFUV01000004,ncbi:1035188 |
759 | # 788108 | 470454 | Haemophilus sputorum | species | silva:JF506644,ncbi:1078480,gbif:7522132 |
760 | # 470454 | 1098176 | Haemophilus | genus | silva:A16379/#6,ncbi:724,worms:571392,gbif:3219815,irmng:1307220 | | |
761 | # on synonyms.tsv
762 | # Hemophilus | 470454 | synonym | Hemophilus (synonym for Haemophilus) | gbif:3219815,irmng:1307220 |
763 | # Haemophilus sp. HK 85 | 525972 | equivalent name | Haemophilus sp. HK 85 (synonym for Haemophilus pittmaniae HK 85) | ncbi:1035188 |
764 | # Haemophilus sp. CCUG 26672 | 788108 | includes | Haemophilus sp. CCUG 26672 (synonym for Haemophilus sputorum) | ncbi:1078480 |
765 | # Haemophilus sp. CCUG 47809 | 788108 | includes | Haemophilus sp. CCUG 47809 (synonym for Haemophilus sputorum) | ncbi:1078480 |
766 |
767 | tax = OttTx(files="tests/multitax/data_minimal/ott.tgz",
768 | extended_names=False)
769 | tax_ex = OttTx(
770 | files="tests/multitax/data_minimal/ott.tgz", extended_names=True)
771 |
772 | # Exact match on scientific name
773 | self.assertCountEqual(tax.search_name("Haemophilus"), ["470454"])
774 | self.assertCountEqual(tax_ex.search_name("Haemophilus"), ["470454"])
775 | # All scientific names
776 | self.assertCountEqual(tax.search_name("Haemophilus sp.", exact=False), [
777 | "391494", "158636", "4621", "4622"])
778 | self.assertCountEqual(tax_ex.search_name("Haemophilus sp.", exact=False), [
779 | "391494", "158636", "4621", "4622"])
780 | # Exact match on scientific name forcing extended
781 | self.assertCountEqual(tax.search_name("Haemophilus"), ["470454"])
782 | self.assertCountEqual(tax_ex.search_name(
783 | "Haemophilus", force_extended=True), ["470454"])
784 | # All names
785 | self.assertCountEqual(tax.search_name("Haemophilus sp. CCUG", exact=False), [
786 | "391494", "158636", "4621", "4622"])
787 | self.assertCountEqual(tax_ex.search_name("Haemophilus sp. CCUG", exact=False, force_extended=True), [
788 | "391494", "158636", "4621", "4622", "788108"])
789 | # Exact name available only on extended
790 | self.assertCountEqual(tax.search_name(
791 | "Haemophilus sp. HK 85", exact=True), [])
792 | self.assertCountEqual(tax_ex.search_name(
793 | "Haemophilus sp. HK 85", exact=True), ["525972"])
794 | # Partial name available only on extended
795 | self.assertCountEqual(tax.search_name("CCUG 26672", exact=False), [])
796 | self.assertCountEqual(tax_ex.search_name(
797 | "CCUG 26672", exact=False), ["788108"])
798 |
--------------------------------------------------------------------------------
/tests/multitax/unit/test_init.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from multitax import CustomTx
3 | from multitax.multitax import MultiTax
4 |
5 |
class TestInit(unittest.TestCase):
    """
    Tests for the constructor options of MultiTax and CustomTx:
    defaults, root overrides, undefined values and build flags.
    """
    # test data (14 nodes)
    #
    # rank-1 (root)              1 ___________
    #                           / \           \
    # rank-2                 2.1    2.2 ______  \
    #                       / \       \       \  \
    # rank-3             3.1    3.2    3.4     \  \
    #                   /      / \       \      \  \
    # rank-4         *4.1  *4.2   *4.3   *4.4  *4.5 *4.6
    #                                   /  |
    # rank-5                       *5.1   *5.2
    #
    # names: 1: Node1, 2.1: Node2.1, ...,5.2: Node5.2

    test_file = "tests/multitax/data_minimal/custom_unit_test.tsv.gz"

    def _assert_default_undefined(self, tax):
        """Assert that all undefined_* values of tax are None."""
        self.assertIsNone(tax.undefined_node)
        self.assertIsNone(tax.undefined_name)
        self.assertIsNone(tax.undefined_rank)

    def _assert_custom_undefined(self, tax):
        """Assert that tax reports the custom undefined values used in test_undefined_values."""
        self.assertEqual(tax.undefined_node, "unode")
        self.assertEqual(tax.undefined_name, "uname")
        self.assertEqual(tax.undefined_rank, "urank")
        # "XXX" is not a node of the tree, lookups should give the undefined values
        self.assertEqual(tax.parent("XXX"), "unode")
        self.assertEqual(tax.rank("XXX"), "urank")
        self.assertEqual(tax.name("XXX"), "uname")

    def _assert_empty_aux_structures(self, tax):
        """Assert that lineages, lookup tables and translations of tax are empty."""
        self.assertEqual(tax._lineages, {})
        self.assertEqual(tax._name_nodes, {})
        self.assertEqual(tax._node_children, {})
        self.assertEqual(tax._rank_nodes, {})
        self.assertEqual(tax._translated_nodes, {})

    def test_default(self):
        """
        test default values on empty init and on init from the test file
        """
        # Empty tax
        tax = MultiTax()
        self.assertEqual(tax.root_parent, "0")
        self.assertEqual(tax.root_node, tax._default_root_node)
        self.assertEqual(tax.root_name, "root")
        self.assertEqual(tax.root_rank, "root")

        self.assertEqual(tax._default_urls, [])
        self.assertEqual(tax._default_root_node, "1")
        self.assertEqual(tax._nodes, {tax.root_node: "0"})
        self.assertEqual(tax._names, {tax.root_node: "root"})
        self.assertEqual(tax._ranks, {tax.root_node: "root"})
        self._assert_empty_aux_structures(tax)

        self._assert_default_undefined(tax)
        self.assertEqual(tax.sources, [])

        # Tax parsed from the test file, no further options
        tax = CustomTx(files=self.test_file)
        self.assertEqual(tax.root_parent, "0")
        self.assertEqual(tax.root_node, tax._default_root_node)
        self.assertEqual(tax.root_name, "Node1")
        self.assertEqual(tax.root_rank, "rank-1")

        self.assertEqual(tax._default_urls, [])
        self.assertEqual(tax._default_root_node, "1")
        self.assertEqual(tax._nodes[tax.root_node], "0")
        self.assertEqual(tax._names[tax.root_node], "Node1")
        self.assertEqual(tax._ranks[tax.root_node], "rank-1")
        self._assert_empty_aux_structures(tax)

        self._assert_default_undefined(tax)
        self.assertEqual(tax.sources, [self.test_file])

    def test_root_values(self):
        """
        test init changing root values
        """

        # New root, not on tree
        tax = MultiTax(root_node="root_n", root_parent="root_p",
                       root_name="newRootName", root_rank="newRootRank")
        self.assertEqual(tax.root_node, "root_n")
        self.assertEqual(tax.root_parent, "root_p")
        # New root node is created and the old default (1) is linked to it:
        # {"root_n": "root_p", "1": "root_n"}
        self.assertEqual(tax._nodes, {
                         tax.root_node: tax.root_parent,
                         tax._default_root_node: tax.root_node})
        self.assertEqual(tax.root_name, "newRootName")
        self.assertEqual(tax._names, {tax.root_node: "newRootName"})
        self.assertEqual(tax.root_rank, "newRootRank")
        self.assertEqual(tax._ranks, {tax.root_node: "newRootRank"})

        # Root is a new node not in nodes
        tax = CustomTx(files=self.test_file, root_node="root_n",
                       root_parent="root_p", root_name="newRootName",
                       root_rank="newRootRank")
        self.assertEqual(tax.root_node, "root_n")
        self.assertEqual(tax.root_parent, "root_p")
        # 14 nodes from the file plus the new root
        self.assertEqual(tax.stats()["nodes"], 15)

        # New root node carries the requested parent, name and rank
        self.assertEqual(tax.parent(tax.root_node), tax.root_parent)
        self.assertEqual(tax.name(tax.root_node), "newRootName")
        self.assertEqual(tax.rank(tax.root_node), "newRootRank")
        # Default root is linked to new root
        self.assertEqual(tax.parent(tax._default_root_node), tax.root_node)
        self.assertEqual(tax.name(tax._default_root_node), "Node1")
        self.assertEqual(tax.rank(tax._default_root_node), "rank-1")

        # Root is an existing node in nodes, but not default, filter tree under node
        tax = CustomTx(files=self.test_file, root_node="4.4",
                       root_parent="root_p", root_name="newRootName",
                       root_rank="newRootRank")
        self.assertEqual(tax.root_node, "4.4")
        self.assertEqual(tax.root_parent, "root_p")
        # Only 4.4 and its children 5.1 and 5.2 are kept
        self.assertEqual(tax.stats()["nodes"], 3)

        # Existing node used as root takes the requested parent, name and rank
        self.assertEqual(tax.parent(tax.root_node), tax.root_parent)
        self.assertEqual(tax.name(tax.root_node), "newRootName")
        self.assertEqual(tax.rank(tax.root_node), "newRootRank")
        # default root should not exist
        self.assertEqual(tax.parent(tax._default_root_node),
                         tax.undefined_node)
        self.assertEqual(tax.name(tax._default_root_node), tax.undefined_name)
        self.assertEqual(tax.rank(tax._default_root_node), tax.undefined_rank)

    def test_undefined_values(self):
        """
        test init changing undefined values
        """
        tax = MultiTax(undefined_node="unode",
                       undefined_rank="urank", undefined_name="uname")
        self._assert_custom_undefined(tax)

        tax = CustomTx(files=self.test_file, undefined_node="unode",
                       undefined_rank="urank", undefined_name="uname")
        self._assert_custom_undefined(tax)

    def test_build_values(self):
        """
        test init changing build values (lookup tables built at init)
        """
        # Empty tax: lookup tables contain only the root node
        tax = MultiTax(build_node_children=True,
                       build_name_nodes=True, build_rank_nodes=True)
        self.assertEqual(tax._name_nodes, {
                         tax.name(tax.root_node): [tax.root_node]})
        self.assertEqual(tax._node_children, {
                         tax.root_parent: [tax.root_node]})
        self.assertEqual(tax._rank_nodes, {"root": [tax.root_node]})

        # Tax from file: lookup tables are populated
        tax = CustomTx(files=self.test_file, build_node_children=True,
                       build_name_nodes=True, build_rank_nodes=True)
        self.assertNotEqual(len(tax._name_nodes), 0)
        self.assertNotEqual(len(tax._node_children), 0)
        self.assertNotEqual(len(tax._rank_nodes), 0)
162 |
--------------------------------------------------------------------------------
/tests/multitax/utils.py:
--------------------------------------------------------------------------------
1 | import shutil
2 | import os
3 | import gzip
4 | import tarfile
5 |
6 |
def setup_dir(d):
    """Recreate directory d from scratch, discarding any previous content."""
    # Removal errors (e.g. d does not exist yet) are deliberately ignored
    shutil.rmtree(path=d, ignore_errors=True)
    os.makedirs(name=d, exist_ok=False)
10 |
11 |
def uncompress_gzip(f, outf):
    """Decompress the gzip file f and write its raw content to the file outf."""
    # The compressed input is opened first, the output file is created afterwards
    with gzip.open(f, "rb") as packed_in:
        with open(outf, "wb") as plain_out:
            shutil.copyfileobj(packed_in, plain_out)
15 |
16 |
def uncompress_tar_gzip(f, outd):
    """
    Extract every regular file of the tar archive f directly into outd,
    dropping the directories stored inside the archive.

    Returns the list of extracted file names (base names only).
    """
    extracted = []
    with tarfile.open(f) as archive:
        for entry in archive.getmembers():
            # Skip anything that is not a regular file
            if not entry.isreg():
                continue
            # Renaming the member to its base name makes extract() place it
            # directly under outd instead of under its archived path
            flat_name = os.path.basename(entry.name)
            entry.name = flat_name
            extracted.append(flat_name)
            archive.extract(entry, outd)
    return extracted
27 |
--------------------------------------------------------------------------------