├── stringmeup
│   ├── __init__.py
│   ├── taxonomy.py
│   └── stringmeup.py
├── setup.py
├── LICENSE
├── .gitignore
└── README.md

/stringmeup/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | from stringmeup.stringmeup import __version__
 3 | 
 4 | setup(
 5 |     name="StringMeUp",
 6 |     version=__version__,
 7 |     url="https://github.com/danisven/stringmeup",
 8 |     description="A post-processing tool to reclassify Kraken 2 output based on the confidence score and/or minimum minimizer hit groups.",
 9 |     license="MIT",
10 | 
11 |     # Author details
12 |     author='Daniel Svensson',
13 |     author_email='daniel.svensson@umu.se',
14 | 
15 |     keywords="Bioinformatics NGS kraken2",
16 |     classifiers=[
17 |         'Development Status :: 4 - Beta',
18 |         'License :: OSI Approved :: MIT License',
19 |         'Programming Language :: Python :: 3'
20 |     ],
21 |     install_requires=['dataclasses'],
22 |     packages=find_packages(exclude=['contrib', 'docs', 'test*'], include=['stringmeup']),
23 |     entry_points={'console_scripts': [
24 |         'stringmeup=stringmeup.stringmeup:stringmeup',
25 |         # 'kraken2-taxonomy=kraken2_confidence_recal.taxonomy:main',
26 |     ]})
27 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Daniel Svensson
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
 22 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Kraken 2
  2 | *.report
  3 | *.kraken2
  4 | 
  5 | # NCBI taxonomy files
  6 | *.dmp
  7 | 
  8 | # Kraken2_confidence_recal
  9 | *.verbose
 10 | *.pickle
 11 | 
 12 | # Byte-compiled / optimized / DLL files
 13 | __pycache__/
 14 | *.py[cod]
 15 | *$py.class
 16 | 
 17 | # C extensions
 18 | *.so
 19 | 
 20 | # Distribution / packaging
 21 | .Python
 22 | build/
 23 | develop-eggs/
 24 | dist/
 25 | downloads/
 26 | eggs/
 27 | .eggs/
 28 | lib/
 29 | lib64/
 30 | parts/
 31 | sdist/
 32 | var/
 33 | wheels/
 34 | pip-wheel-metadata/
 35 | share/python-wheels/
 36 | *.egg-info/
 37 | .installed.cfg
 38 | *.egg
 39 | MANIFEST
 40 | 
 41 | # PyInstaller
 42 | # Usually these files are written by a python script from a template
 43 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
 44 | *.manifest
 45 | *.spec
 46 | 
 47 | # Installer logs
 48 | pip-log.txt
 49 | pip-delete-this-directory.txt
 50 | 
 51 | # Unit test / coverage reports
 52 | htmlcov/
 53 | .tox/
 54 | .nox/
 55 | .coverage
 56 | .coverage.*
 57 | .cache
 58 | nosetests.xml
 59 | coverage.xml
 60 | *.cover
 61 | *.py,cover
 62 | .hypothesis/
 63 | .pytest_cache/
 64 | 
 65 | # Translations
 66 | *.mo
 67 | *.pot
 68 | 
 69 | # Django stuff:
 70 | *.log
 71 | local_settings.py
 72 | db.sqlite3
 73 | db.sqlite3-journal
 74 | 
 75 | # Flask stuff:
 76 | instance/
 77 | .webassets-cache
 78 | 
 79 | # Scrapy stuff:
 80 | .scrapy
 81 | 
 82 | # Sphinx documentation
 83 | docs/_build/
 84 | 
 85 | # PyBuilder
 86 | target/
 87 | 
 88 | # Jupyter Notebook
 89 | .ipynb_checkpoints
 90 | 
 91 | # IPython
 92 | profile_default/
 93 | ipython_config.py
 94 | 
 95 | # pyenv
 96 | .python-version
 97 | 
 98 | # pipenv
 99 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
100 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
101 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
102 | # install all needed dependencies.
103 | #Pipfile.lock
104 | 
105 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
106 | __pypackages__/
107 | 
108 | # Celery stuff
109 | celerybeat-schedule
110 | celerybeat.pid
111 | 
112 | # SageMath parsed files
113 | *.sage.py
114 | 
115 | # Environments
116 | .env
117 | .venv
118 | env/
119 | venv/
120 | ENV/
121 | env.bak/
122 | venv.bak/
123 | 
124 | # Spyder project settings
125 | .spyderproject
126 | .spyproject
127 | 
128 | # Rope project settings
129 | .ropeproject
130 | 
131 | # mkdocs documentation
132 | /site
133 | 
134 | # mypy
135 | .mypy_cache/
136 | .dmypy.json
137 | dmypy.json
138 | 
139 | # Pyre type checker
140 | .pyre/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | [![Anaconda-Server Badge](https://anaconda.org/bioconda/stringmeup/badges/installer/conda.svg)](https://conda.anaconda.org/bioconda)
 2 | 
 3 | # StringMeUp
 4 | 
 5 | A post-processing tool for [Kraken 2] read classifications. Run Kraken 2 **once** and re-classify the reads with any confidence score stringency of your choice afterwards, saving you lots of compute time. Creates Kraken 2 style report and read classification files.
 6 | 
 7 | For additional insight into your Kraken 2 classifications, try out [KrakMeOpen] - a downstream analysis toolkit for Kraken 2 classification quality metrics.
 8 | 
 9 | ## Installation
10 | 
11 | StringMeUp is available through conda. To install it, simply run the following command:
12 | 
13 | `conda install -c conda-forge -c bioconda stringmeup`
14 | 
15 | ## Usage
16 | 
17 | A good start is to run `stringmeup --help`.
18 | 
19 | ## About the confidence score
20 | 
21 | The confidence score (CS) for a given read _R_ classified to a given node _J_ is calculated by dividing the number of k-mers that hit any node in the clade rooted at node _J_ (N) by the total number of k-mers that were queried against the database (M). Any k-mer with an ambiguous nucleotide is not queried against the database, and is thus not part of M.
22 | 
23 | CS = N / M
24 | 
25 | If the CS for a given read _R_ at a given node _J_ is equal to or larger than the specified cutoff, read _R_ is classified to node _J_. If not, the CS of read _R_ is calculated for the parent of node _J_. This is repeated until the CS >= the CS cutoff, or until we reach the root of the taxonomy. If the CS < the CS cutoff at the root, the read is deemed unclassified.
26 | 
27 | ## Reclassifying Kraken 2 output
28 | 
29 | To reclassify reads classified by Kraken 2 with a confidence cutoff of 0.1:
30 | 
31 | `stringmeup --names names.dmp --nodes nodes.dmp 0.1 original_classifications.kraken2`
32 | 
33 | Where:
34 | * original_classifications.kraken2 is the output file from Kraken 2 that contains the read-by-read classifications.
35 | * names.dmp and nodes.dmp are the same NCBI taxonomy files that were used to build the database that produced the classifications in original_classifications.kraken2.
36 | 
37 | This command would output a Kraken 2 style report to stdout. Adding `--output_report <FILENAME>` would save the report in a file.
38 | 
39 | To save the read-by-read classifications, add `--output_classifications <FILENAME>` to the command.
40 | 
41 | To save a verbose version of the read-by-read classifications, add `--output_verbose <FILENAME>` to the command. The verbose version of the read-by-read classifications will contain the following columns:
42 | 
43 | | Column | Explanation |
44 | |--------|-------------|
45 | | READ_ID | The ID of the read |
46 | | READ_LENGTH | The length of the read (same as Kraken 2 output) |
47 | | MINIMIZER_HIT_GROUPS* | The number of minimizer hit groups found during Kraken 2 classification* |
48 | | TAX_LVL_MOVES | How many levels in the taxonomy the read moved during reclassification |
49 | | ORIGINAL_TAXID | The taxID that the read was classified to originally |
50 | | NEW_TAXID | The taxID that the read was reclassified to |
51 | | ORIGINAL_CONFIDENCE | The original confidence score |
52 | | NEW_CONFIDENCE | The confidence score at the taxID that the read was reclassified to |
53 | | MAX_CONFIDENCE | The maximum confidence that the read can have |
54 | | ORIGINAL_TAX_LVL | The taxonomic rank of the originally classified taxID |
55 | | NEW_TAX_LVL | The taxonomic rank of the reclassified taxID |
56 | | ORIGINAL_NAME | The scientific name of the original taxID |
57 | | NEW_NAME | The scientific name of the reclassified taxID |
58 | | KMER_STRING | The k-mer string (same as Kraken 2 output) |
59 | 
60 | *: Only present if the forked version of Kraken 2 was used for the initial classification.
61 | 
62 | ## Reclassifying with minimum hit groups
63 | 
64 | This option requires an input file that was produced with my [fork] of Kraken 2.
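Before adding that option, it may help to see what the confidence-score reclassification described above boils down to. The following is a minimal sketch for illustration only, not StringMeUp's actual implementation (see `stringmeup/stringmeup.py`, which also caches lineages and short-circuits reads that can never reach the cutoff); `tree` is assumed to be a `stringmeup.taxonomy.TaxonomyTree` built from names.dmp and nodes.dmp, and `kmer_hits` a `{tax_id: n_kmers}` dict parsed from the k-mer string column:

```python
def reclassify(kmer_hits, taxid, cutoff, tree):
    total = sum(kmer_hits.values())           # M: all queried (non-ambiguous) k-mers
    node = taxid                              # start at the original classification
    while True:
        clade = tree.get_clade([node])[node]  # all tax_ids in the clade rooted at node
        n = sum(k for t, k in kmer_hits.items() if t in clade)
        if n / total >= cutoff:               # CS = N / M
            return node                       # classified at this node
        if node == 1:                         # reached the root without passing the cutoff
            return 0                          # the read is deemed unclassified
        node = tree.get_parent([node])[node]  # move up to the parent and try again
```

Reclassifying with minimum hit groups simply adds one more gate in front of this loop, as described next.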
65 | 
66 | Add `--minimum_hit_groups <INT>` to the command. A read can only be considered classified if its number of minimizer hit groups is at or above the minimum_hit_groups setting.
67 | 
68 | [Kraken 2]: https://github.com/DerrickWood/kraken2
69 | [KrakMeOpen]: https://github.com/danisven/KrakMeOpen
70 | [fork]: https://github.com/danisven/kraken2
71 | 
--------------------------------------------------------------------------------
/stringmeup/taxonomy.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | import argparse
  4 | import logging
  5 | from collections import namedtuple
  6 | from dataclasses import dataclass, field
  7 | from os import path
  8 | 
  9 | logging.basicConfig(
 10 |     format='%(asctime)s %(levelname)-8s %(message)s',
 11 |     level=logging.INFO,
 12 |     datefmt='%Y-%m-%d [%H:%M:%S]')
 13 | log = logging.getLogger(path.basename(__file__))
 14 | 
 15 | # TODO: make it possible to use scientific names in the same way as tax_id
 16 | @dataclass
 17 | class Node:
 18 |     name: str = None
 19 |     genbank_common_name: str = None
 20 |     rank: str = None
 21 |     parent: int = None
 22 |     children: list = field(default_factory=list)
 23 | 
 24 | 
 25 | Rank = namedtuple('Rank', ['rank_name', 'rank_code', 'rank_depth'])
 26 | 
 27 | # Using the same rank codes as Kraken 2 (https://github.com/DerrickWood/kraken2/blob/master/src/reports.cc)
 28 | translate_rank2code = {
 29 |     'superkingdom': 'D',
 30 |     'kingdom': 'K',
 31 |     'phylum': 'P',
 32 |     'class': 'C',
 33 |     'order': 'O',
 34 |     'family': 'F',
 35 |     'genus': 'G',
 36 |     'species': 'S'
 37 | }
 38 | 
 39 | 
 40 | class TaxonomyTreeException(Exception):
 41 |     pass
 42 | 
 43 | 
 44 | class TaxonomyTree:
 45 |     """
 46 |     Creates a representation of the taxonomy in the files names.dmp and
 47 |     nodes.dmp of a kraken2 database.
 48 | 
 49 |     Inspired by https://github.com/frallain/NCBI_taxonomy_tree.
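
    A minimal usage example (assuming names.dmp and nodes.dmp from a standard
    NCBI taxonomy dump, in which 9606 is Homo sapiens and 9605 the genus Homo):

        tree = TaxonomyTree(nodes_filename='nodes.dmp', names_filename='names.dmp')
        tree.get_name([9606])    # -> {9606: 'Homo sapiens'}
        tree.get_parent([9606])  # -> {9606: 9605}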
50 | """ 51 | 52 | def __init__(self, nodes_filename, names_filename): 53 | self.nodes_filename = nodes_filename 54 | self.names_filename = names_filename 55 | self.wanted_name_types = set( 56 | ['scientific name', 'genbank common name']) 57 | 58 | # Main data structure 59 | self.taxonomy = {} 60 | 61 | self.byranks = {} 62 | self.leaves = set() 63 | 64 | # "Memory" data structure to be populated at function calls 65 | # For faster response in case of same query is asked again 66 | self.lineages = {} 67 | self.distances = {} 68 | self.lca_mappings = {} 69 | 70 | # Add nodes to self.taxonomy 71 | self.construct_tree() 72 | 73 | def construct_tree(self): 74 | """ 75 | Reads a names.dmp and nodes.dmp file, and constructs a taxonomy tree 76 | representation: 77 | {tax_id#1: Node('name', 'genbank_common_name', 'rank', 'parent', 'children'), 78 | tax_id#2: Node('name', 'genbank_common_name', 'rank', 'parent', 'children'), 79 | ..., 80 | tax_id#N: ...} 81 | """ 82 | 83 | log.info("Constructing taxonomy tree...") 84 | taxid2name = {} 85 | 86 | try: 87 | log.info('Mapping taxonomic ID to scientific and genbank common names from "{names_file}"...'.format(names_file=self.names_filename)) 88 | # TODO: check so that names.dmp conforms to expected format 89 | with open(self.names_filename, 'r') as f: 90 | for name_line in f: 91 | name_info = name_line.split('|') 92 | name_type = name_info[3].strip() 93 | if name_type not in self.wanted_name_types: 94 | continue 95 | 96 | tax_id = int(name_info[0].strip()) 97 | if tax_id not in taxid2name: 98 | taxid2name[tax_id] = { 99 | 'scientific_name': None, 100 | 'genbank_common_name': None} 101 | 102 | tax_name = name_info[1].strip() 103 | 104 | if name_type == 'scientific name': 105 | if taxid2name[tax_id]['scientific_name'] is not None: 106 | # Some logical checking, should only be one scientific name for a tax_id 107 | raise TaxonomyTreeException("Found more than one scientific name for a unique tax_id. The tax_id was '{}'".format(tax_id)) 108 | taxid2name[tax_id]['scientific_name'] = tax_name 109 | 110 | elif name_type == 'genbank common name': 111 | if taxid2name[tax_id]['genbank_common_name'] is not None: 112 | # Some logical checking, should only be one genbank common name for a tax_id 113 | raise TaxonomyTreeException("Found more than one genbank common name for a unique tax_id. The tax_id was '{}'".format(tax_id)) 114 | taxid2name[tax_id]['genbank_common_name'] = tax_name 115 | 116 | else: 117 | raise TaxonomyTreeException("Logical error. Should not end up here. 
name_type was '{}'".format(name_type))
118 | 
119 |         except FileNotFoundError:
120 |             log.exception('Could not find the file "{names_file}".'.format(names_file=self.names_filename))
121 |             raise
122 | 
123 |         try:
124 |             log.info('Reading taxonomy from "{nodes_file}"...'.format(nodes_file=self.nodes_filename))
125 |             # TODO: check so that nodes.dmp conforms to expected format
126 |             with open(self.nodes_filename, 'r') as f:
127 |                 for tax_line in f:
128 |                     tax_info = tax_line.split('|')[0:3]
129 |                     tax_id = int(tax_info[0].strip())
130 |                     tax_parent = int(tax_info[1].strip())
131 |                     tax_rank = tax_info[2].strip()
132 |                     tax_scientific_name = taxid2name[tax_id]['scientific_name']
133 |                     tax_common_name = taxid2name[tax_id]['genbank_common_name']
134 | 
135 |                     if tax_id in self.taxonomy:
136 |                         # We already inserted the current tax_id as a parent of another node
137 |                         self.taxonomy[tax_id].rank = tax_rank
138 |                         self.taxonomy[tax_id].parent = tax_parent
139 |                     else:
140 |                         node = Node(
141 |                             name=tax_scientific_name,
142 |                             genbank_common_name=tax_common_name,
143 |                             rank=tax_rank,
144 |                             parent=tax_parent,
145 |                             children=[])
146 |                         self.taxonomy[tax_id] = node
147 |                         self.leaves.add(tax_id)
148 | 
149 |                     if tax_parent in self.taxonomy:
150 |                         self.taxonomy[tax_parent].children.append(tax_id)
151 |                         if tax_parent in self.leaves:
152 |                             self.leaves.remove(tax_parent)
153 |                     else:
154 |                         parent_node = Node(
155 |                             name=taxid2name[tax_parent]['scientific_name'],
156 |                             genbank_common_name=taxid2name[tax_parent]['genbank_common_name'],
157 |                             rank=None,
158 |                             parent=None,
159 |                             children=[tax_id])
160 |                         self.taxonomy[tax_parent] = parent_node
161 | 
162 |                     # Save the tax_id to its corresponding rank set
163 |                     if tax_rank in self.byranks:
164 |                         self.byranks[tax_rank].add(tax_id)
165 |                     else:
166 |                         self.byranks[tax_rank] = set([tax_id])
167 | 
168 |         except FileNotFoundError:
169 |             log.exception('Could not find the nodes file "{nodes_file}".'.format(nodes_file=self.nodes_filename))
170 |             raise
171 | 
172 |         # Adjust the root (the root is tax_id=1, and its parent is also tax_id=1)
173 |         root_children = self.taxonomy[1].children
174 |         root_children.remove(1)
175 |         self.taxonomy[1].parent = None
176 |         self.taxonomy[1].children = root_children
177 |         log.info("Taxonomy tree built.")
178 | 
179 | 
180 |     def translate2taxid(self, scientific_names_list):
181 |         """
182 |         Will return the tax_ids for the scientific names listed in the input
183 |         list. If no tax_id can be found for a name, its list will be empty.
184 |         More than one tax_id may be found for any given scientific name - they
185 |         will all be added to the list of tax_ids returned for that name.
186 |         Returns:
187 |             {<scientific name>: [tax_id_1, tax_id_2]}
188 |         """
189 |         self._verify_list(scientific_names_list)
190 |         tax_id_dict = {k: list() for k in scientific_names_list}
191 | 
192 |         if len(tax_id_dict) != len(scientific_names_list):
193 |             log.warning('You entered duplicated names in the input list for translate2taxid.')
194 | 
195 |         for tax_id in self.taxonomy:
196 |             if self.taxonomy[tax_id].name in tax_id_dict:
197 |                 name = self.taxonomy[tax_id].name
198 |                 tax_id_dict[name].append(tax_id)
199 |             else:
200 |                 # continue search
201 |                 continue
202 | 
203 |         return tax_id_dict
204 | 
205 | 
206 |     def _get_property(self, tax_id, property):
207 |         """
208 |         Internal function to fetch the value of a single property of a Node in the taxonomy dictionary.
209 |         Raises an exception if tax_id does not exist in the taxonomy tree.
210 |         Raises an exception if the taxonomy tree isn't built yet.
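        E.g. _get_property(9606, 'rank') returns 'species' when a standard
        NCBI taxonomy dump has been loaded.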
211 | """ 212 | if self.taxonomy: 213 | try: 214 | property_value = getattr(self.taxonomy[tax_id], property) 215 | except KeyError: 216 | log.exception('Could not find tax_id={tax_id} in the taxonomy tree.'.format(tax_id=tax_id)) 217 | raise 218 | except AttributeError: 219 | log.exception('There is no such field ("{field}") in the namedtuple.'.format(field=property)) 220 | raise 221 | else: 222 | log.exception('You have not built the taxonomy tree yet.') 223 | raise TaxonomyTreeException('You have not built the taxonomy tree yet.') 224 | 225 | return property_value 226 | 227 | def _verify_list(self, putative_list): 228 | """ 229 | Internal helper function to check that input lists are indeed lists. 230 | """ 231 | try: 232 | assert isinstance(putative_list, list) 233 | except AssertionError: 234 | log.exception('Input must be a list. You input "{input}", of type {input_type}'.format( 235 | input=putative_list, input_type=type(putative_list))) 236 | raise 237 | 238 | def get_name(self, tax_id_list): 239 | """ 240 | Returns the names of the tax_ids in the input list. 241 | """ 242 | self._verify_list(tax_id_list) 243 | name_dict = {} 244 | for tax_id in tax_id_list: 245 | name_dict[tax_id] = self._get_property(tax_id, 'name') 246 | return name_dict 247 | 248 | def get_common_name(self, tax_id_list): 249 | """ 250 | Returns the genbank common names of the tax_ids in the input list. 251 | """ 252 | self._verify_list(tax_id_list) 253 | name_dict = {} 254 | for tax_id in tax_id_list: 255 | name_dict[tax_id] = self._get_property(tax_id, 'genbank_common_name') 256 | return name_dict 257 | 258 | def get_children(self, tax_id_list): 259 | """ 260 | Returns the direct descending children of each tax_id. 261 | """ 262 | self._verify_list(tax_id_list) 263 | children_dict = {} 264 | for tax_id in tax_id_list: 265 | children_dict[tax_id] = self._get_property(tax_id, 'children') 266 | return children_dict 267 | 268 | def get_parent(self, tax_id_list): 269 | """ 270 | Returns the parent of each tax_id. 271 | """ 272 | self._verify_list(tax_id_list) 273 | parent_dict = {} 274 | for tax_id in tax_id_list: 275 | parent_dict[tax_id] = self._get_property(tax_id, 'parent') 276 | return parent_dict 277 | 278 | def get_distance(self, tax_id_1, tax_id_2): 279 | """ 280 | Return the distance between two tax_ids. The distance is defined as 281 | the number of edges that need to be traversed to get from tax_id_1 to 282 | tax_id_2. 283 | 284 | Distance between a parent and child is 1, distance between two genera 285 | in the same family (where the family node is the direct parent of both 286 | genera) is 2, etc. 287 | 288 | All edges between two tax_ids are counted, so the distance between two 289 | ranks in one part of the tree can be different from that in another 290 | part of the tree (depending on tree structure). 291 | """ 292 | 293 | def one_way_distance(tax_id_ancestor, tax_id): 294 | """ 295 | Find the distance (number of steps) between the 296 | ancestor (tax_id_ancestor) and the taxon (tax_id). 
297 | """ 298 | 299 | # Lineage of the descendant tax_id (of which ancestor tax_id is part of) 300 | lineage = self.get_lineage([tax_id])[tax_id] 301 | 302 | # The indices of both tax_ids in the lineage 303 | ancestor_index = lineage.index(tax_id_ancestor) 304 | tax_id_index = lineage.index(tax_id) 305 | 306 | distance = tax_id_index - ancestor_index 307 | 308 | return distance 309 | 310 | distance = None 311 | 312 | # Extra calcs to check for distance from self.distances 313 | tax_id_small = min(tax_id_1, tax_id_2) 314 | tax_id_large = max(tax_id_1, tax_id_2) 315 | 316 | # self.distances is ordered... smallest tax_id always goes first 317 | if tax_id_small in self.distances: 318 | if tax_id_large in self.distances[tax_id_small]: 319 | distance = self.distances[tax_id_small][tax_id_large] 320 | else: 321 | self.distances[tax_id_small] = {} 322 | 323 | # Do we need to calculate the distance? 324 | if distance is None: 325 | 326 | # Lowest common ancestor 327 | lca = self.get_lca(tax_id_1, tax_id_2) 328 | 329 | # Sum of distances between both tax_ids and the LCA makes the total distance 330 | distance_1 = one_way_distance(lca, tax_id_1) 331 | distance_2 = one_way_distance(lca, tax_id_2) 332 | distance = distance_1 + distance_2 333 | 334 | # Save distance for faster response next time 335 | self.distances[tax_id_small][tax_id_large] = distance 336 | 337 | return distance 338 | 339 | def get_rank(self, tax_id_list): 340 | """ 341 | Returns the rank of each tax_id. 342 | """ 343 | self._verify_list(tax_id_list) 344 | rank_dict = {} 345 | for tax_id in tax_id_list: 346 | rank_dict[tax_id] = self._get_property(tax_id, 'rank') 347 | return rank_dict 348 | 349 | def get_rank_code(self, tax_id_list): 350 | """ 351 | Returns the rank, rank code, and rank offset for each tax_id. 352 | For example: 353 | tax_id 314295 is rank 'superfamily'. That rank has no rank code in the 354 | original Kraken 2 reports (see translate_rank2code dict above). Same 355 | goes for all of the 'no rank' tax_ids. Instead, 314295 is considered to 356 | be an 'order' but at the depth of 4, i.e. 4 steps down from the tax_id 357 | of rank 'order' that is closes above it in the lineage. The rank code 358 | is therefore O, and the depth is 4. So the full rank code is O4. 359 | 360 | Returns a dict of namedtupes, one for each tax_id in the supplied list. 361 | """ 362 | rank_dict = self.get_rank(tax_id_list) 363 | rank_code_dict = {} 364 | for tax_id in rank_dict: 365 | rank = rank_dict[tax_id] 366 | rank_code = '' 367 | current_node = tax_id 368 | 369 | # Find the rank code for this node or the one above 370 | while not rank_code: 371 | if rank in translate_rank2code: 372 | rank_code = translate_rank2code[rank] 373 | elif current_node == 1: 374 | # Special case for root, as it has rank 'no rank' 375 | rank_code = 'R' 376 | else: 377 | current_node = self.get_parent([current_node])[current_node] 378 | rank = self.get_rank([current_node])[current_node] 379 | 380 | rank_depth = self.get_distance(current_node, tax_id) 381 | rank_name = self.get_rank([tax_id])[tax_id] 382 | 383 | rank_tuple = Rank( 384 | rank_name=rank_name, 385 | rank_code=rank_code, 386 | rank_depth=rank_depth) 387 | 388 | rank_code_dict[tax_id] = rank_tuple 389 | 390 | return rank_code_dict 391 | 392 | def get_node(self, tax_id_list): 393 | """ 394 | Returns the node instances of the supplied tax_ids. 
395 | """ 396 | #TODO: Use this fnc in other fncs when getting nodes from self.taxonomy 397 | self._verify_list(tax_id_list) 398 | node_dict = {} 399 | 400 | if self.taxonomy: 401 | for tax_id in tax_id_list: 402 | try: 403 | node = self.taxonomy[tax_id] 404 | except KeyError: 405 | log.exception('Could not find tax_id={tax_id} in the taxonomy tree.'.format(tax_id=tax_id)) 406 | raise 407 | node_dict[tax_id] = node 408 | else: 409 | log.exception('You have not built the taxonomy tree yet.') 410 | raise TaxonomyTreeException('You have not built the taxonomy tree yet.') 411 | 412 | return node_dict 413 | 414 | def get_lineage(self, tax_id_list): 415 | """ 416 | For each tax_id, returns the input tax_id and the tax_ids of its 417 | ancestors. 418 | """ 419 | self._verify_list(tax_id_list) 420 | lineage_dict = {} 421 | 422 | for tax_id in tax_id_list: 423 | if tax_id in self.lineages: 424 | # Lineage has already been calculated, retrieve it 425 | lineage_dict[tax_id] = self.lineages[tax_id] 426 | continue 427 | 428 | lineage = [tax_id] 429 | node = self.get_node([tax_id])[tax_id] 430 | 431 | while node.parent: 432 | lineage.append(node.parent) 433 | node = self.get_node([node.parent])[node.parent] 434 | 435 | lineage.reverse() 436 | lineage_dict[tax_id] = lineage 437 | 438 | # Save lineage for faster response next time 439 | self.lineages[tax_id] = lineage 440 | 441 | return lineage_dict 442 | 443 | def get_clade(self, tax_id_list): 444 | """ 445 | For each tax_id, returns all of the tax_ids of the clade rooted at the 446 | tax_id. 447 | 448 | returns: {tax_id#1: set(all tax_ids in node), 449 | tax_id#2: set(all tax_ids in node)} 450 | """ 451 | 452 | self._verify_list(tax_id_list) 453 | clade_dict = {} 454 | 455 | for tax_id in tax_id_list: 456 | node = self.get_node([tax_id])[tax_id] 457 | children_pool = set(node.children) 458 | clade = set([tax_id]) 459 | clade.update(children_pool) 460 | 461 | while children_pool: 462 | try: 463 | clade_taxon = children_pool.pop() 464 | except KeyError: 465 | break 466 | else: 467 | new_children = self.get_node([clade_taxon])[clade_taxon].children 468 | clade.update(new_children) 469 | children_pool.update(new_children) 470 | 471 | clade_dict[tax_id] = clade 472 | 473 | return clade_dict 474 | 475 | def get_leaves(self, tax_ids=[1]): 476 | """ 477 | Returns a {tax_id: set(leaf_taxids)} mapping of leaf node tax_ids for 478 | the clades rooted at the tax_ids. 479 | """ 480 | 481 | self._verify_list(tax_ids) 482 | clade_dict = {} 483 | 484 | def get_leaves_dfs(tax_id, clade_leaves, visited_nodes=None): 485 | if visited_nodes == None: 486 | visited_nodes = set() 487 | 488 | if tax_id not in visited_nodes: 489 | visited_nodes.add(tax_id) 490 | children = self.get_children([tax_id])[tax_id] 491 | if children: 492 | for child in children: 493 | get_leaves_dfs(child, clade_leaves, visited_nodes) 494 | else: 495 | clade_leaves.add(tax_id) 496 | 497 | return clade_leaves 498 | 499 | for tax_id in tax_ids: 500 | clade_leaves = set() 501 | clade_leaves = get_leaves_dfs(tax_id, clade_leaves) 502 | clade_dict[tax_id] = clade_leaves 503 | 504 | return clade_dict 505 | 506 | def get_lca(self, tax_id_1, tax_id_2): 507 | """ 508 | Get the tax_id of the lowest common ancestor (LCA) of two tax_ids. 509 | """ 510 | lca = None 511 | 512 | # Extra calcs to check for lca from self.lca_mappings 513 | tax_id_small = min(tax_id_1, tax_id_2) 514 | tax_id_large = max(tax_id_1, tax_id_2) 515 | 516 | # self.lca_mappings is ordered... 
smallest tax_id always goes first
517 |         if tax_id_small in self.lca_mappings:
518 |             if tax_id_large in self.lca_mappings[tax_id_small]:
519 |                 lca = self.lca_mappings[tax_id_small][tax_id_large]
520 |         else:
521 |             self.lca_mappings[tax_id_small] = {}
522 | 
523 |         if lca is None:
524 |             # Get lineages and convert to sets for fast operation
525 |             lineages = self.get_lineage([tax_id_1, tax_id_2])
526 |             lineage_1 = set(lineages[tax_id_1])
527 |             lineage_2 = set(lineages[tax_id_2])
528 | 
529 |             # Get only the common tax_ids between the lineages of tax_id 1 and 2
530 |             common_lineage = lineage_1.intersection(lineage_2)
531 | 
532 |             # The LCA will be the tax_id @ index (num(common_taxIDs) - 1)
533 |             lca = lineages[tax_id_1][len(common_lineage) - 1]
534 | 
535 |             # Save LCA for faster response next time
536 |             self.lca_mappings[tax_id_small][tax_id_large] = lca
537 | 
538 |         return lca
539 | 
540 |     def get_clade_rank_taxids(self, tax_ids, rank=None):
541 |         """
542 |         For each clade rooted at the input tax_ids, return all tax_ids that
543 |         represent taxa at the supplied rank, or all ranks. For example:
544 |         # get_clade_rank_taxids([1], 'phylum') -- returns all phyla in the whole tree
545 |         # get_clade_rank_taxids([2, 9443], 'genus') -- returns all genera in the clades rooted at 'Bacteria' and 'Primates'
546 |         # get_clade_rank_taxids([1]) -- returns all canonical ranks in the whole tree.
547 |         """
548 |         self._verify_list(tax_ids)
549 | 
550 |         canonical_ranks = translate_rank2code.values()
551 |         canonical_rank_weights = {rank: weight for weight, rank in enumerate(['R'] + list(canonical_ranks))}
552 |         clade_tax_rank_dict = {tax_id: dict() for tax_id in tax_ids}
553 | 
554 |         if rank:
555 |             rank = translate_rank2code[rank]
556 |         else:
557 |             rank = canonical_ranks
558 | 
559 |         def dfs(tax_id, visited_nodes=None, tax_lvl_dict=None, wanted_ranks=None):
560 |             """
561 |             Fnc to recursively search the taxonomy tree in a depth-first
562 |             fashion. Saves all tax_ids that are canonical (S/G/F etc) in
563 |             tax_lvl_dict.
564 |             """
565 |             if visited_nodes is None:
566 |                 visited_nodes = set()
567 | 
568 |             if wanted_ranks is None:
569 |                 wanted_ranks = set(rank)  # 'rank' from the enclosing scope: a single rank code or all canonical codes
570 | 
571 |             if tax_lvl_dict is None:
572 |                 tax_lvl_dict = {tax_lvl: set() for tax_lvl in wanted_ranks}
573 | 
574 |             if tax_id not in visited_nodes:
575 |                 visited_nodes.add(tax_id)
576 | 
577 |                 taxonomy_rank = self.get_rank_code([tax_id])[tax_id]
578 |                 rank_code = taxonomy_rank.rank_code
579 |                 if taxonomy_rank.rank_depth == 0:
580 |                     if rank_code in wanted_ranks:
581 |                         tax_lvl_dict[rank_code].add(tax_id)
582 | 
583 |                 rank_code_weight = canonical_rank_weights[rank_code]
584 | 
585 |                 # Keep going down the tree only if there's a wanted rank below the current rank
586 |                 if any([rank_code_weight < canonical_rank_weights[rank] for rank in wanted_ranks]):
587 |                     children = self.get_children([tax_id])[tax_id]
588 |                     for child in children:
589 |                         _ = dfs(child, visited_nodes, tax_lvl_dict, wanted_ranks)
590 | 
591 |             return tax_lvl_dict
592 | 
593 |         for tax_id in tax_ids:
594 |             tax_lvl_dict = dfs(tax_id, wanted_ranks=set(rank))
595 |             clade_tax_rank_dict[tax_id] = tax_lvl_dict
596 | 
597 |         return clade_tax_rank_dict
598 | 
599 |     def get_siblings(self, tax_id):
600 |         """
601 |         NB! This fnc hasn't been extensively tested, use at own risk.
602 | 
603 |         This fnc is similar to get_clade_rank_taxids, but I think it should
604 |         be faster.
605 | 606 | For a given tax_id X with any rank in ['S', 'G', 'F', 'O', 'C', 'P'], 607 | return all taxa with the same rank in the clade rooted at the parent 608 | of X. The parent is defined as the most recent ancestor of X that has 609 | a rank also in ['S', 'G', 'F', 'O', 'C', 'P']. 610 | 611 | For example, if the tax_id 3352 (Pinus taeda, a species) is submitted 612 | to the function, it will return all other species in the genus Pinus 613 | (3337). Conversely, if the genus Pinus (3337) is submitted, the 614 | function will return all genera in the family Pinaceae (3318). 615 | """ 616 | # TODO: Test this more. 617 | # TODO: In line with other exposed functions in this class, it should take a list of taxids instead of a single one. 618 | 619 | tax_id_rank = self.get_rank_code([tax_id])[tax_id] 620 | rank = tax_id_rank.rank_code 621 | rank_codes = ['S', 'G', 'F', 'O', 'C', 'P'] 622 | 623 | if tax_id_rank.rank_depth != 0: 624 | raise TaxonomyTreeException("Can only work with ranks of level {}.".format(rank_codes)) 625 | 626 | def get_parent(tax_id): 627 | parent_rank_ok = False 628 | current_tax_id = tax_id 629 | while not parent_rank_ok: 630 | parent = self.get_parent([current_tax_id])[current_tax_id] 631 | taxonomy_rank = self.get_rank_code([parent])[parent] 632 | if taxonomy_rank.rank_code in rank_codes and taxonomy_rank.rank_depth == 0: 633 | parent_rank_ok = True 634 | elif parent == 1: 635 | parent_rank_ok = True 636 | else: 637 | current_tax_id = parent 638 | 639 | return parent 640 | 641 | parent = get_parent(tax_id) 642 | 643 | visited_nodes = set() 644 | siblings = set() 645 | 646 | def dfs(tax_id, wanted_rank): 647 | if tax_id not in visited_nodes: 648 | visited_nodes.add(tax_id) 649 | taxonomy_rank = self.get_rank_code([tax_id])[tax_id] 650 | if taxonomy_rank.rank_code != wanted_rank: 651 | children = self.get_children([tax_id])[tax_id] 652 | for child in children: 653 | dfs(child, wanted_rank) 654 | else: 655 | siblings.add(tax_id) 656 | 657 | dfs(parent, rank) 658 | return siblings 659 | 660 | 661 | if __name__ == '__main__': 662 | parser = argparse.ArgumentParser() 663 | parser.add_argument('--nodes') 664 | parser.add_argument('--names') 665 | args = parser.parse_args() 666 | 667 | taxonomy_tree = TaxonomyTree(args.nodes, args.names) 668 | -------------------------------------------------------------------------------- /stringmeup/stringmeup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | __version__ = "0.1.5" 4 | 5 | import argparse 6 | import operator 7 | import logging 8 | import gzip 9 | import sys 10 | from stringmeup import taxonomy 11 | from dataclasses import dataclass 12 | from os import path 13 | 14 | logging.basicConfig( 15 | format='%(asctime)s %(levelname)-8s %(message)s', 16 | level=logging.INFO, 17 | datefmt='%Y-%m-%d [%H:%M:%S]') 18 | log = logging.getLogger(path.basename(__file__)) 19 | 20 | # TODO: make sure confidence_threshold is between 0 and 1 21 | # TODO: For the verbose output, also output (1) the number of kmers that hit in total, (2) the number of non-ambiguous kmers (queried). 
 22 | 
 23 | 
 24 | @dataclass
 25 | class ReadClassification:
 26 |     current_node: int = None
 27 |     original_conf: float = None
 28 |     recalculated_conf: float = None
 29 |     original_taxid: int = None
 30 |     reclassified_taxid: int = None
 31 |     original_rank_code: str = None
 32 |     reclassified_rank_code: str = None
 33 |     original_name: str = None
 34 |     reclassified_name: str = None
 35 |     reclassified_distance: int = None
 36 |     id: str = None
 37 |     length: str = None
 38 |     kmer_string: str = None
 39 |     classified: bool = False
 40 |     max_confidence: float = None
 41 |     minimizer_hit_groups: int = None
 42 | 
 43 | 
 44 | @dataclass
 45 | class ReportNode:
 46 |     ratio: str
 47 |     hits_at_clade: int
 48 |     hits_at_node: int
 49 |     rank_code: str
 50 |     rank_depth: int
 51 |     node_taxid: int
 52 |     name: str
 53 |     offset: int
 54 | 
 55 | 
 56 | def validate_input_file(putative_classifications_file, verbose_input, minimum_hit_groups, paired_input):
 57 |     """
 58 |     Perform simple validation of the input file.
 59 |     """
 60 | 
 61 |     log.debug('Validating input classifications file.')
 62 | 
 63 |     if not path.isfile(putative_classifications_file):
 64 |         log.error('Cannot find the specified file ({file}).'.format(
 65 |             file=putative_classifications_file))
 66 |         sys.exit()
 67 | 
 68 |     with read_file(putative_classifications_file) as f:
 69 |         line = f.readline()
 70 |         line_proc = line.strip()
 71 |         line_proc = line_proc.split('\t')
 72 | 
 73 |         # The following should be the case for a Kraken 2 output file
 74 |         # First, check that the number of columns in the input file conforms to the expected number
 75 |         if not verbose_input:
 76 |             num_cols = len(line_proc) == 5  # original type of kraken2 output file
 77 |         else:
 78 |             num_cols = len(line_proc) == 6  # 6 columns if the output was produced with the verbose version of kraken2 that outputs minimizer hit groups
 79 | 
 80 |         # Line must start with C or U (as in Classified/Unclassified)
 81 |         line_start = line_proc[0] in ['U', 'C']
 82 | 
 83 |         # If the data is paired
 84 |         if paired_input:
 85 |             # Must be information on both sides of the pipe character
 86 |             data_col_1 = len(line_proc[3].split('|')) == 2
 87 | 
 88 |             # If the data is paired in the 3rd column, it must also be paired in the last column
 89 |             if "|" in line_proc[-1]:
 90 |                 data_col_2 = len(line_proc[-1].split('|:|')) == 2
 91 |             else:
 92 |                 data_col_2 = False
 93 | 
 94 |         # If the input is from single end reads, at least the read length column (3rd) must be an int
 95 |         else:
 96 |             try:
 97 |                 int(line_proc[3])
 98 |             except ValueError:
 99 |                 data_col_1 = False
100 |             else:
101 |                 data_col_1 = True
102 | 
103 |             # And the last column should contain colons between kmer/taxon pairs
104 |             if ":" in line_proc[4]:
105 |                 data_col_2 = True
106 |             else:
107 |                 data_col_2 = False
108 | 
109 |         if num_cols and line_start and data_col_1 and data_col_2:
110 |             log.debug('Validation OK.')
111 |             return
112 |         else:
113 |             log.error('The classifications file is malformatted.')
114 |             log.debug('First line of input: {}'.format(line))
115 |             log.debug('num_cols: {}'.format(num_cols))
116 |             log.debug('line_start: {}'.format(line_start))
117 |             log.debug('data_col_1: {}'.format(data_col_1))
118 |             log.debug('data_col_2: {}'.format(data_col_2))
119 |             sys.exit()
120 | 
121 | 
122 | def is_paired_input(classifications_file):
123 |     """
124 |     Returns true if the input file appears to contain paired read data.
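    (Kraken 2 joins the two read lengths of a pair with a pipe character in
    the length column, e.g. '150|151', which is what this function looks for.)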
125 | """ 126 | with read_file(classifications_file) as f: 127 | line = f.readline() 128 | line_proc = line.strip() 129 | line_proc = line_proc.split('\t') 130 | 131 | # If column 4 contains a pipe character "|", the data is paired 132 | if "|" in line_proc[3]: 133 | return True 134 | 135 | 136 | def is_verbose_input(classifications_file): 137 | """ 138 | Returns true if input file consists of 6 columns instead of 5. 139 | """ 140 | 141 | with read_file(classifications_file) as f: 142 | line = f.readline() 143 | line_proc = line.strip() 144 | line_proc = line_proc.split('\t') 145 | if len(line_proc) == 6: 146 | return True 147 | else: 148 | return False 149 | 150 | 151 | def process_kmer_string(kmer_info_string, paired_input): 152 | """ 153 | Process a kmer info string (last column of a Kraken 2 output file), so that 154 | we get a dictionary mapping of tax_ids to total sum of kmer hits. 155 | Returns: 156 | {tax_id_#1: X kmer hits, 157 | tax_id_#2: Y kmer hits, 158 | ... 159 | tax_id_#N: Z kmer hits} 160 | """ 161 | kmer_info_string = kmer_info_string.split() 162 | 163 | # Kraken2 classifications file for paired data contain the "|:|" delimiter 164 | if paired_input: 165 | kmer_info_string.remove('|:|') 166 | 167 | # Messy list comprehension. Converts all "taxa":"num_kmer" string pairs 168 | # into integer tuples like (taxa, num_kmers), and saves them in a list. 169 | # Ambiguous kmers are not processed (discarded). 170 | kmer_classifications = [ 171 | (int(x[0]), int(x[1])) for x in ( 172 | kmer_info.split(':') for kmer_info in kmer_info_string) 173 | if x[0] != 'A'] 174 | 175 | # Further processes the (taxa, num_kmers) tuples into a dict where each 176 | # tax_id stores the total sum of kmer hits to that tax_id. 177 | taxa_kmer_dict = {} 178 | for kmer_info in kmer_classifications: 179 | if kmer_info[0] not in taxa_kmer_dict: 180 | taxa_kmer_dict[kmer_info[0]] = kmer_info[1] 181 | else: 182 | taxa_kmer_dict[kmer_info[0]] += kmer_info[1] 183 | 184 | return taxa_kmer_dict 185 | 186 | 187 | def reclassify_read(read, confidence_threshold, taxonomy_tree, verbose_input, minimum_hit_groups, taxa_lineages, paired_input): 188 | """ 189 | Sums the number of kmers that hit in the clade rooted at "current_node", 190 | and divides it with the total number of kmers queried against the database: 191 | confidence = clade_kmer_hits / total_kmer_hits 192 | 193 | If the confidence at a specific node is < confidence_threshold, we go one 194 | step up the taxonomy (to the parent node) and recalculates the confidence. 195 | This is repeated until confidence >= confidence_threshold. 196 | 197 | In this function it's envisionable to include other parameters for the 198 | classification... Right now I'm only considering the confidence score 199 | and minimum hit groups. 200 | """ 201 | # Process the kmer string into a dict of {tax_id: #kmers} key, value pairs 202 | taxa_kmer_dict = process_kmer_string(read.kmer_string, paired_input) 203 | 204 | # Make the current node the same as the original classification 205 | read.current_node = read.original_taxid 206 | 207 | # The total number of kmers that were interrogated against the 208 | # database (non-ambiguous): 209 | total_kmer_hits = sum(taxa_kmer_dict.values()) 210 | 211 | # Only interested in tax_ids that are in the database. A '0' signifies that 212 | # the kmer could not be assigned to any tax_id (missing from database). 
213 | assigned_taxa_set = set( 214 | [tax_id for tax_id in taxa_kmer_dict.keys() if tax_id != 0]) 215 | 216 | # Make a quick check to see if it is even possible to obtain the confidence 217 | # needed to make a classification. If it isn't we don't have to go through 218 | # the hassle of calculating the confidence at all parent nodes. Potentially 219 | # saving us a lot of time. 220 | doomed_to_fail = False 221 | total_hits = sum(taxa_kmer_dict[tax_id] for tax_id in assigned_taxa_set) 222 | max_confidence = total_hits / total_kmer_hits 223 | read.max_confidence = max_confidence 224 | 225 | # The read can't achieve a confidence high enough, so we mark it 226 | if max_confidence < confidence_threshold: 227 | doomed_to_fail = True 228 | 229 | # Filter minimizer_hit_groups 230 | if verbose_input: 231 | if read.minimizer_hit_groups < minimum_hit_groups: 232 | doomed_to_fail = True 233 | 234 | # The nr of kmers that hit within the clade rooted at the current node: 235 | num_hits_within_clade = 0 236 | 237 | while not read.classified: 238 | taxa_in_clade = set() 239 | 240 | # For each tax_id that kmers in the read were assigned to: 241 | for tax_id in assigned_taxa_set: 242 | 243 | # Get the lineage (all ancestors including itself) for the tax_id: 244 | if tax_id in taxa_lineages: 245 | lineage = taxa_lineages[tax_id] 246 | else: 247 | lineage = taxonomy_tree.get_lineage([tax_id])[tax_id] 248 | 249 | # Save lineage so we don't have to get it from taxonomy_tree 250 | # more than once. Also make it into a set, which is faster to 251 | # query (the order of tax_ids in the lineage is not important 252 | # here). 253 | taxa_lineages[tax_id] = set(lineage) 254 | 255 | # If the currently classified (read.current_node) tax_id is in the 256 | # lineage (parents) of tax_id, then tax_id must be in the clade 257 | # rooted at read.current_node - i.e. tax_id is a descendant of 258 | # current_node. 259 | if read.current_node in lineage: 260 | 261 | # There is no need to get the lineage of tax_id in future 262 | # iterations since it will always be in the clade rooted at 263 | # read.current_node (we only ever go up in the taxonomy). 264 | # Remember which tax_ids we have counted, so we can remove them 265 | # from the set, outside of the loop: 266 | taxa_in_clade.add(tax_id) 267 | 268 | # Instead, we just add the kmers that hit tax_id to the total 269 | # hits at the clade: 270 | num_hits_within_clade += taxa_kmer_dict[tax_id] 271 | 272 | # Remove the already counted tax_ids: 273 | if taxa_in_clade: 274 | assigned_taxa_set -= taxa_in_clade 275 | 276 | # The confidence value for the read pair classification at the current 277 | # taxonomic level: 278 | read.recalculated_conf = num_hits_within_clade / total_kmer_hits 279 | 280 | # Set the original confidence score 281 | if not read.original_conf: 282 | read.original_conf = read.recalculated_conf 283 | 284 | # If we can't achieve the confidence score cutoff, now is the time 285 | # to exit the loop and return the read (since we have calculated 286 | # the original confidence). 287 | if doomed_to_fail: 288 | read.recalculated_conf = max_confidence 289 | read.reclassified_taxid = 0 290 | return read, taxa_lineages 291 | 292 | # If the confidence at this node is sufficient, we classify it to 293 | # the current node (TaxID). 
294 |             if read.recalculated_conf >= confidence_threshold:
295 |                 read.classified = True
296 |                 read.reclassified_taxid = read.current_node
297 |                 return read, taxa_lineages
298 | 
299 |             # If the current node is the root, we stop (can't go higher up in the
300 |             # taxonomy).
301 |             elif read.current_node == 1:
302 |                 read.reclassified_taxid = 0
303 |                 return read, taxa_lineages
304 | 
305 |             # Otherwise, set the current_node to the parent and keep going.
306 |             else:
307 |                 read.current_node = taxonomy_tree.get_parent(
308 |                     [read.current_node])[read.current_node]
309 | 
310 | 
311 | def read_kraken_output(classifications_file, report_frequency=1000000):
312 |     """
313 |     This should work to read classifications from a kraken 2 output file. It's
314 |     not complete, but the backbone is there. The point is, we should not have
315 |     to run reclassification in order to produce a report file - we should be
316 |     able to just read an output file and work with the classifications as they
317 |     are. Could also just modify the main loop.
318 |     """
319 |     tax_dict = {'hits_at_node': {}, 'hits_at_clade': {}}
320 |     i = 0
321 |     with open(classifications_file, 'r') as f:
322 |         for line in f:
323 |             if line.startswith('C'):
324 |                 fields = line.strip().split('\t')
325 |                 tax_id = int(fields[2])
326 |                 if tax_id not in tax_dict['hits_at_node']:
327 |                     tax_dict['hits_at_node'][tax_id] = 1
328 |                 else:
329 |                     tax_dict['hits_at_node'][tax_id] += 1
330 |             i += 1
331 |             if i % report_frequency == 0:
332 |                 print('Processed {} lines'.format(i))
333 |     return tax_dict
334 | 
335 | def get_kraken2_report_content(tax_reads, taxonomy_tree, total_reads):
336 |     """
337 |     First calculates the cumulative clade read count (i.e. for each node, how
338 |     many reads are classified to the clades rooted at that node).
339 | 
340 |     Then, sorts the nodes in the order they will be printed in the report file.
341 |     The sorting works like this: perform a depth first search of the taxonomy,
342 |     starting at the root. At each node, continue with the depth first search
343 |     in the order of highest to lowest cumulative clade read counts among the
344 |     child nodes.
345 | 
346 |     There is probably a way of merging the internal functions dfs_ccrc and
347 |     dfs_sort, but I was in a hurry and this is what it is.
348 | 
349 |     total_reads: total reads in the kraken output file.
350 |     """
351 |     report_node_list = []
352 | 
353 |     # Depth First Search, cumulative clade read count
354 |     def dfs_ccrc(visited_nodes, taxonomy_tree, node_taxid):
355 |         if node_taxid not in visited_nodes:
356 |             visited_nodes.add(node_taxid)
357 | 
358 |             # The cumulative clade read count for the clade rooted at the current
359 |             # node will start at the number of hits at that node:
360 |             if node_taxid in tax_reads['hits_at_node']:
361 |                 hits_at_node = tax_reads['hits_at_node'][node_taxid]
362 |             else:
363 |                 hits_at_node = 0
364 |             clade_read_count = hits_at_node
365 | 
366 |             # Recursively search the children in the clade rooted at the
367 |             # current node:
368 |             for child in taxonomy_tree.get_children([node_taxid])[node_taxid]:
369 |                 clade_read_count += dfs_ccrc(visited_nodes, taxonomy_tree, child)
370 | 
371 |             # Only save the node if its clade read count is != 0:
372 |             if clade_read_count > 0:
373 |                 tax_reads['hits_at_clade'][node_taxid] = clade_read_count
374 | 
375 |         return clade_read_count  # This return ends up in the above for loop
376 | 
377 |     # Depth First Search, sorting for the hierarchy of the output report.
378 | # Moves down the tree and adds the nodes with highest cumulative clade 379 | # read count to report_node_list 380 | def dfs_sort(visited_nodes, taxonomy_tree, node_taxid, offset): 381 | if node_taxid not in visited_nodes: 382 | visited_nodes.add(node_taxid) 383 | 384 | # Find the cumluative read counts for all children of current node: 385 | children_cumulative_counts = {} 386 | for child in taxonomy_tree.get_children([node_taxid])[node_taxid]: 387 | if child in tax_reads['hits_at_clade']: 388 | children_cumulative_counts[child] = tax_reads['hits_at_clade'][child] 389 | 390 | # Get the hits to this node (to be output in the report): 391 | if node_taxid in tax_reads['hits_at_node']: 392 | hits_at_node = tax_reads['hits_at_node'][node_taxid] 393 | else: 394 | hits_at_node = 0 395 | 396 | # Get some information that will go into the report file. 397 | # 1) total number of reads at this node and at the clade rooted here (column 2 in output): 398 | hits_at_clade = tax_reads['hits_at_clade'][node_taxid] 399 | # 2) the ratio (column 1 in output): 400 | ratio_classified2clade = hits_at_clade / total_reads * 100 401 | # 3) the rank code and depth (column 4 in output): 402 | rank_tuple = taxonomy_tree.get_rank_code([node_taxid])[node_taxid] 403 | # 4) scientific name of the node (column 6 of output): 404 | name = taxonomy_tree.get_name([node_taxid])[node_taxid] 405 | 406 | # Construct the dataclass instance that holds the information 407 | # about this node that is printed to the report file: 408 | report_node = ReportNode( 409 | ratio="{0:.2f}".format(ratio_classified2clade), 410 | hits_at_clade=hits_at_clade, 411 | hits_at_node=hits_at_node, 412 | rank_code=rank_tuple.rank_code, 413 | rank_depth=rank_tuple.rank_depth, 414 | node_taxid=node_taxid, 415 | offset=offset, 416 | name=name) 417 | 418 | # Append it to the list. The order of the elements in the list is 419 | # the order the nodes will be printed. 420 | report_node_list.append(report_node) 421 | 422 | if len(children_cumulative_counts) > 0: 423 | # sorted_by_ccrc is a list of tuples [(tax_id#1, ccrc), (tax_id#2, ccrc)...] 
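            # (For example, with NCBI tax_ids, {2: 90000, 2157: 1200} sorts to
            # [(2, 90000), (2157, 1200)], so Bacteria is printed before Archaea.)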
424 | sorted_by_ccrc = sorted( 425 | children_cumulative_counts.items(), 426 | key=operator.itemgetter(1), 427 | reverse=True) 428 | 429 | # We are going one level deeper in the taxonomy, increment 430 | # the offset: 431 | offset += 1 432 | 433 | for child_tuple in sorted_by_ccrc: 434 | child_taxid = child_tuple[0] 435 | dfs_sort(visited_nodes, taxonomy_tree, child_taxid, offset) 436 | 437 | # Depth first search to get cumulative read counts for all clades: 438 | visited_nodes = set() 439 | log.info('Calculating cumulative clade read counts...') 440 | dfs_ccrc(visited_nodes, taxonomy_tree, 1) 441 | 442 | # Depth first search to sort the output according to largest cumulative 443 | # read count: 444 | visited_nodes = set() 445 | log.info('Sorting the order of the output in the report file...') 446 | dfs_sort(visited_nodes, taxonomy_tree, 1, 0) 447 | 448 | # Make sure to add the unclassified row that goes at the very top of the 449 | # kraken 2 report: 450 | num_unclassified_reads = total_reads - tax_reads['hits_at_clade'][1] 451 | ratio = num_unclassified_reads / total_reads * 100 452 | unclassified_node = ReportNode( 453 | ratio="{0:.2f}".format(ratio), 454 | hits_at_clade=num_unclassified_reads, 455 | hits_at_node=num_unclassified_reads, 456 | rank_code='U', 457 | rank_depth=0, 458 | node_taxid=0, 459 | name='unclassified', 460 | offset=0) 461 | 462 | report_node_list.insert(0, unclassified_node) 463 | 464 | return report_node_list 465 | 466 | 467 | def format_kraken2_report_row(report_node): 468 | """ 469 | Formats the row that will be output in the kraken 2 style report. Input 470 | is an instance of ReportNode. 471 | """ 472 | offset = 2 * report_node.offset * ' ' 473 | name = offset + report_node.name 474 | 475 | if report_node.rank_depth == 0: 476 | rank_depth = '' 477 | else: 478 | rank_depth = str(report_node.rank_depth) 479 | 480 | rank_code = report_node.rank_code + rank_depth 481 | report_row = '\t'.join([ 482 | str(report_node.ratio), 483 | str(report_node.hits_at_clade), 484 | str(report_node.hits_at_node), 485 | rank_code, 486 | str(report_node.node_taxid), 487 | name]) 488 | 489 | return report_row 490 | 491 | 492 | def make_kraken2_report(tax_reads, taxonomy_tree, total_reads, output_report): 493 | """ 494 | Gets the information that should be printed from 495 | get_kraken2_report_content. Formats the information and prints it to file 496 | or stdout. 497 | """ 498 | log.info('Creating report...') 499 | 500 | # Get the data to print 501 | report_node_list = get_kraken2_report_content( 502 | tax_reads, taxonomy_tree, total_reads) 503 | 504 | # If the output should go to file 505 | if output_report: 506 | with open(output_report, 'w') as f: 507 | for node in report_node_list: 508 | 509 | # Format the output 510 | report_row = format_kraken2_report_row(node) 511 | f.write(report_row + '\n') 512 | 513 | log.info('Report saved in {}.'.format(output_report)) 514 | 515 | # Otherwise, print to stdout 516 | else: 517 | for node in report_node_list: 518 | 519 | # Format the output 520 | report_row = format_kraken2_report_row(node) 521 | sys.stdout.write(report_row + '\n') 522 | 523 | 524 | def get_verbose_output(read, taxonomy_tree): 525 | """ 526 | Gets more information about the reclassification of a read. This is for 527 | the output_verbose option. 528 | 529 | read is an instance of ReadClassification. 
530 | """ 531 | # Variable renaming to make things more readable 532 | new_taxid = read.reclassified_taxid 533 | old_tax_id = read.original_taxid 534 | 535 | # Stuff that is conditional if we have a classified read or not. 536 | # TaxonomyTree doesn't cope with tax_id=0 537 | if read.classified: 538 | distance = taxonomy_tree.get_distance(new_taxid, old_tax_id) 539 | new_rank_tuple = taxonomy_tree.get_rank_code([new_taxid])[new_taxid] 540 | new_rank_depth = new_rank_tuple.rank_depth if new_rank_tuple.rank_depth != 0 else '' 541 | new_rank_code = str(new_rank_tuple.rank_code) + str(new_rank_depth) 542 | new_rank_name = taxonomy_tree.get_name([new_taxid])[new_taxid] 543 | else: 544 | distance = 'NaN' 545 | new_rank_code = 'U' 546 | new_rank_name = 'unclassified' 547 | 548 | # Distance information 549 | read.reclassified_distance = distance 550 | 551 | # Rank information 552 | old_rank_tuple = taxonomy_tree.get_rank_code([old_tax_id])[old_tax_id] 553 | old_rank_depth = old_rank_tuple.rank_depth if old_rank_tuple.rank_depth != 0 else '' 554 | old_rank_code = str(old_rank_tuple.rank_code) + str(old_rank_depth) 555 | read.original_rank_code = old_rank_code 556 | read.reclassified_rank_code = new_rank_code 557 | 558 | # Scientific name information 559 | old_rank_name = taxonomy_tree.get_name([old_tax_id])[old_tax_id] 560 | read.original_name = old_rank_name 561 | read.reclassified_name = new_rank_name 562 | 563 | return read 564 | 565 | 566 | def create_read(kraken2_read, verbose_input=False): 567 | """ 568 | Creates an instance of ReadClassification dataclass, that holds 569 | information about the read and its classification. 570 | """ 571 | # Process the read string so that its elements go into a list 572 | read_pair_proc = kraken2_read.strip() 573 | read_pair_proc = read_pair_proc.split('\t') 574 | 575 | # Create the read object 576 | read = ReadClassification( 577 | original_taxid=int(read_pair_proc[2]), 578 | id=read_pair_proc[1], 579 | length=read_pair_proc[3], 580 | kmer_string=read_pair_proc[-1]) 581 | 582 | if verbose_input: 583 | read.minimizer_hit_groups = int(read_pair_proc[4]) 584 | 585 | return read 586 | 587 | 588 | def main_loop(f_handle, tax_reads_dict, taxonomy_tree, args, report_frequency, taxa_lineages, paired_input, verbose_input=False, o_handle=None, v_handle=None): 589 | """ 590 | f_handle: classifications input file to read from. 591 | o_handle: output_classifications file to write to. 592 | v_handle: output_verbose file to write to. 
593 | """ 594 | def write_read_output(read): 595 | # read is an instance of ReadClassification 596 | classification = 'C' if read.classified else 'U' 597 | row_items = [ 598 | classification, 599 | read.id, 600 | read.reclassified_taxid, 601 | read.length, 602 | read.kmer_string] 603 | 604 | if verbose_input: 605 | row_items.insert(4, read.minimizer_hit_groups) 606 | 607 | row_string = '\t'.join([str(x) for x in row_items]) + '\n' 608 | _ = o_handle.write(row_string) # gzip write fnc returns output, therefore send to "_" 609 | 610 | def write_verbose_output(read): 611 | # read is an instance of ReadClassification 612 | row_items = [ 613 | read.id, 614 | read.length, 615 | read.reclassified_distance, 616 | read.original_taxid, 617 | read.reclassified_taxid, 618 | "{0:.2f}".format(read.original_conf), 619 | "{0:.2f}".format(read.recalculated_conf), 620 | "{0:.2f}".format(read.max_confidence), 621 | read.original_rank_code, 622 | read.reclassified_rank_code, 623 | read.original_name, 624 | read.reclassified_name, 625 | read.kmer_string] 626 | 627 | if verbose_input: 628 | row_items.insert(2, read.minimizer_hit_groups) 629 | 630 | row_string = '\t'.join([str(x) for x in row_items]) + '\n' 631 | _ = v_handle.write(row_string) # gzip write fnc returns output, therefore send to "_" 632 | 633 | # Parse the input file, read per read 634 | i = 0 635 | for read_pair in f_handle: 636 | 637 | # Only working with classified reads: 638 | if read_pair.startswith('C'): 639 | 640 | # Make an instance of ReadClassification to hold information 641 | # about the read and its classification 642 | read = create_read(read_pair, verbose_input) 643 | 644 | # Reclassify the read pair based on confidence 645 | read, taxa_lineages = reclassify_read( 646 | read, 647 | args.confidence_threshold, 648 | taxonomy_tree, 649 | verbose_input, 650 | args.minimum_hit_groups, 651 | taxa_lineages, 652 | paired_input) 653 | 654 | # Counter for number of reads per taxon/node 655 | if read.reclassified_taxid in tax_reads_dict['hits_at_node']: 656 | tax_reads_dict['hits_at_node'][read.reclassified_taxid] += 1 657 | else: 658 | tax_reads_dict['hits_at_node'][read.reclassified_taxid] = 1 659 | 660 | # Write the reclassified reads to file 661 | if o_handle: 662 | if read.classified or args.keep_unclassified: 663 | write_read_output(read) 664 | 665 | # Write verbose output about the reclassification 666 | if v_handle: 667 | read = get_verbose_output(read, taxonomy_tree) 668 | write_verbose_output(read) 669 | 670 | else: 671 | # Change here if you want to keep reads from the input file 672 | # that were initially unclassified. 673 | pass 674 | 675 | # Keep track of progress 676 | i += 1 677 | if i % report_frequency == 0: 678 | log.info('Processed {} reads...'.format(i)) 679 | 680 | log.info('Done processing reads. They were {} in total.'.format(i)) 681 | 682 | # Output a report file 683 | make_kraken2_report(tax_reads_dict, taxonomy_tree, i, args.output_report) # i is used to calculate the ratio of classified reads (col 1 in output file). 684 | 685 | 686 | def read_file(filename): 687 | """ 688 | Wrapper to read either gzipped or ordinary text file input. 689 | """ 690 | if filename.endswith('.gz'): 691 | return gzip.open(filename, 'rt') 692 | else: 693 | return open(filename, 'r') 694 | 695 | 696 | def write_file(filename, gz_output): 697 | """ 698 | Wrapper to write either gzipped or ordinary text file output. 
699 | """ 700 | if gz_output: 701 | return gzip.open(filename, 'wt') 702 | else: 703 | return open(filename, 'w') 704 | 705 | 706 | def get_arguments(): 707 | """ 708 | Wrapper function to get the command line arguments. Inserting this piece of code 709 | into its own function for conda compatibility. 710 | """ 711 | 712 | parser = argparse.ArgumentParser( 713 | prog='StringMeUp', 714 | usage='stringmeup --names --nodes [--output_report ] [--output_classifications ] [--output_verbose ] [--keep_unclassified] [--minimum_hit_groups INT] [--gz_output] [--help] confidence classifications', 715 | description='A post-processing tool to reclassify Kraken 2 output based on the confidence score and/or minimum minimizer hit groups.') 716 | parser.add_argument( 717 | 'confidence_threshold', 718 | metavar='confidence', 719 | type=float, 720 | help='The confidence score threshold to be used in reclassification [0-1].') 721 | parser.add_argument( 722 | 'original_classifications_file', 723 | metavar='classifications', 724 | type=str, 725 | help='Path to the Kraken 2 output file containing the individual read classifications.') 726 | parser.add_argument( 727 | '--output_report', 728 | metavar='FILE', 729 | type=str, 730 | help='File to save the Kraken 2 report in.') 731 | parser.add_argument( 732 | '--output_classifications', 733 | metavar='FILE', 734 | type=str, 735 | help='File to save the Kraken 2 read classifications in.') 736 | parser.add_argument( 737 | '--keep_unclassified', 738 | action='store_true', 739 | help='Specify if you want to output unclassified reads in addition to classified reads. NOTE(!): This script will always discard reads that are unclassified in the classifications input file, this flag will just make sure to keep previously classified reads even if they are reclassified as unclassified by this script. TIP(!): Always run Kraken2 with no confidence cutoff.') 740 | parser.add_argument( 741 | '--output_verbose', 742 | metavar='FILE', 743 | type=str, 744 | help='File to send verbose output to. This file will contain, for each read, (1) original classification, (2) new classification, (3) original confidence, (4), new confidence (5), original taxa name (6), new taxa name, (7) original rank, (8) new rank, (9) distance travelled (how many nodes was it lifted upwards in the taxonomy).') 745 | parser.add_argument( 746 | '--names', 747 | metavar='FILE', 748 | required=True, 749 | help='Taxonomy names dump file (names.dmp)') 750 | parser.add_argument( 751 | '--nodes', 752 | metavar='FILE', 753 | required=True, 754 | help='Taxonomy nodes dump file (nodes.dmp)') 755 | parser.add_argument( 756 | '--minimum_hit_groups', 757 | metavar='INT', 758 | type=int, 759 | help='The minimum number of hit groups a read needs to be classified. NOTE: You need to supply a classifications file (kraken2 output) that contain the "minimizer_hit_groups" column.') 760 | parser.add_argument( 761 | '--gz_output', 762 | action='store_true', 763 | help='Set this flag to output and in gzipped format (will add .gz extension to the filenames).' 764 | ) 765 | args = parser.parse_args() 766 | 767 | return args 768 | 769 | 770 | def stringmeup(): 771 | 772 | # Get the CL arguments 773 | args = get_arguments() 774 | 775 | # Some initial setup 776 | taxa_lineages = {} 777 | report_frequency = 10000000 # Will output progress every nth read 778 | tax_reads_dict = {'hits_at_node': {}, 'hits_at_clade': {}} 779 | 780 | # Was the input generated with https://github.com/danisven/kraken2 ? 
    verbose_input = is_verbose_input(args.original_classifications_file)

    # If so, warn if the user didn't ask for reclassification based on
    # minimizer hit groups
    if verbose_input:
        log.info("The input file appears to contain a column for minimizer hit groups.")

        if not args.minimum_hit_groups:
            log.warning('You didn\'t specify --minimum_hit_groups.')
            log.warning('Will NOT reclassify based on minimizer hit groups, setting minimum_hit_groups=1 (lowest possible setting).')
            args.minimum_hit_groups = 1

    # If not, warn if the user was planning to reclassify with minimizer hit groups
    else:
        if args.minimum_hit_groups:
            log.warning('You specified minimum_hit_groups={}, but did not supply an input file with a column for minimizer hit groups.'.format(args.minimum_hit_groups))
            log.warning('Will NOT reclassify based on minimizer hit groups.')
            args.minimum_hit_groups = None

    # Check if the input data is paired or not
    paired_input = is_paired_input(args.original_classifications_file)
    if paired_input:
        log.info('Classifications were made from paired-end data.')
    else:
        log.info('Classifications were made from single-read data.')

    # Perform a naive check of the input file
    validate_input_file(args.original_classifications_file, verbose_input, args.minimum_hit_groups, paired_input)

    # Create a TaxonomyTree from the user-provided names.dmp and nodes.dmp files
    taxonomy_tree = taxonomy.TaxonomyTree(names_filename=args.names, nodes_filename=args.nodes)

    # Filehandles-to-be
    o = None
    v = None

    # Open the classifications input file:
    with read_file(args.original_classifications_file) as f:
        log.info('Processing read classifications from "{file}".'.format(file=path.abspath(args.original_classifications_file)))

        # TODO: make sure output files are writable
        # If the user wants to save the read classifications to file, open the file
        if args.output_classifications:
            if args.gz_output:
                if not args.output_classifications.endswith('.gz'):
                    args.output_classifications += '.gz'
            log.info('Saving reclassified reads in {}.'.format(args.output_classifications))
            o = write_file(args.output_classifications, args.gz_output)

        # If the user wants to save the verbose classification output to file, open the file
        if args.output_verbose:
            if args.gz_output:
                if not args.output_verbose.endswith('.gz'):
                    args.output_verbose += '.gz'
            log.info('Saving verbose classification information in {}.'.format(args.output_verbose))
            v = write_file(args.output_verbose, args.gz_output)

        # Run the main loop (reclassification)
        main_loop(f, tax_reads_dict, taxonomy_tree, args, report_frequency, taxa_lineages, paired_input, verbose_input, o, v)

    # Remember to close the output files
    if o:
        o.close()
    if v:
        v.close()


if __name__ == '__main__':
    stringmeup()
--------------------------------------------------------------------------------