├── stringmeup
│   ├── __init__.py
│   ├── taxonomy.py
│   └── stringmeup.py
├── setup.py
├── LICENSE
├── .gitignore
└── README.md

/stringmeup/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | from stringmeup.stringmeup import __version__
 3 | 
 4 | setup(
 5 |     name="StringMeUp",
 6 |     version=__version__,
 7 |     url="https://github.com/danisven/stringmeup",
 8 |     description="A post-processing tool to reclassify Kraken 2 output based on the confidence score and/or minimum minimizer hit groups.",
 9 |     license="MIT",
10 | 
11 |     # Author details
12 |     author='Daniel Svensson',
13 |     author_email='daniel.svensson@umu.se',
14 | 
15 |     keywords="Bioinformatics NGS kraken2",
16 |     classifiers=[
17 |         'Development Status :: 4 - Beta',
18 |         'License :: OSI Approved :: MIT License',
19 |         'Programming Language :: Python :: 3'
20 |     ],
21 |     install_requires=['dataclasses'],
22 |     packages=find_packages(exclude=['contrib', 'docs', 'test*'], include=['stringmeup']),
23 |     entry_points={'console_scripts': [
24 |         'stringmeup=stringmeup.stringmeup:stringmeup',
25 |         # 'kraken2-taxonomy=kraken2_confidence_recal.taxonomy:main',
26 |     ]})
27 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Daniel Svensson
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
 22 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Kraken 2
  2 | *.report
  3 | *.kraken2
  4 | 
  5 | # NCBI taxonomy files
  6 | *.dmp
  7 | 
  8 | # Kraken2_confidence_recal
  9 | *.verbose
 10 | *.pickle
 11 | 
 12 | # Byte-compiled / optimized / DLL files
 13 | __pycache__/
 14 | *.py[cod]
 15 | *$py.class
 16 | 
 17 | # C extensions
 18 | *.so
 19 | 
 20 | # Distribution / packaging
 21 | .Python
 22 | build/
 23 | develop-eggs/
 24 | dist/
 25 | downloads/
 26 | eggs/
 27 | .eggs/
 28 | lib/
 29 | lib64/
 30 | parts/
 31 | sdist/
 32 | var/
 33 | wheels/
 34 | pip-wheel-metadata/
 35 | share/python-wheels/
 36 | *.egg-info/
 37 | .installed.cfg
 38 | *.egg
 39 | MANIFEST
 40 | 
 41 | # PyInstaller
 42 | # Usually these files are written by a python script from a template
 43 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
 44 | *.manifest
 45 | *.spec
 46 | 
 47 | # Installer logs
 48 | pip-log.txt
 49 | pip-delete-this-directory.txt
 50 | 
 51 | # Unit test / coverage reports
 52 | htmlcov/
 53 | .tox/
 54 | .nox/
 55 | .coverage
 56 | .coverage.*
 57 | .cache
 58 | nosetests.xml
 59 | coverage.xml
 60 | *.cover
 61 | *.py,cover
 62 | .hypothesis/
 63 | .pytest_cache/
 64 | 
 65 | # Translations
 66 | *.mo
 67 | *.pot
 68 | 
 69 | # Django stuff:
 70 | *.log
 71 | local_settings.py
 72 | db.sqlite3
 73 | db.sqlite3-journal
 74 | 
 75 | # Flask stuff:
 76 | instance/
 77 | .webassets-cache
 78 | 
 79 | # Scrapy stuff:
 80 | .scrapy
 81 | 
 82 | # Sphinx documentation
 83 | docs/_build/
 84 | 
 85 | # PyBuilder
 86 | target/
 87 | 
 88 | # Jupyter Notebook
 89 | .ipynb_checkpoints
 90 | 
 91 | # IPython
 92 | profile_default/
 93 | ipython_config.py
 94 | 
 95 | # pyenv
 96 | .python-version
 97 | 
 98 | # pipenv
 99 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
100 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
101 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
102 | # install all needed dependencies.
103 | #Pipfile.lock
104 | 
105 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
106 | __pypackages__/
107 | 
108 | # Celery stuff
109 | celerybeat-schedule
110 | celerybeat.pid
111 | 
112 | # SageMath parsed files
113 | *.sage.py
114 | 
115 | # Environments
116 | .env
117 | .venv
118 | env/
119 | venv/
120 | ENV/
121 | env.bak/
122 | venv.bak/
123 | 
124 | # Spyder project settings
125 | .spyderproject
126 | .spyproject
127 | 
128 | # Rope project settings
129 | .ropeproject
130 | 
131 | # mkdocs documentation
132 | /site
133 | 
134 | # mypy
135 | .mypy_cache/
136 | .dmypy.json
137 | dmypy.json
138 | 
139 | # Pyre type checker
140 | .pyre/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | [![Anaconda-Server Badge](https://anaconda.org/bioconda/stringmeup/badges/installer/conda.svg)](https://conda.anaconda.org/bioconda)
 2 | 
 3 | # StringMeUp
 4 | 
 5 | A post-processing tool for [Kraken 2] read classifications. Run Kraken 2 **once** and re-classify the reads with any confidence score stringency of your choice afterwards, saving you lots of compute time. Creates Kraken 2 style report and read classification files.
 6 | 
 7 | For additional insight into your Kraken 2 classifications, try out [KrakMeOpen] - a downstream analysis toolkit for Kraken 2 classification quality metrics.
 8 | 
 9 | ## Installation
10 | 
11 | StringMeUp is available through conda. To install it, simply run the following command:
12 | 
13 | `conda install -c conda-forge -c bioconda stringmeup`
14 | 
15 | ## Usage
16 | 
17 | A good start is to run `stringmeup --help`.
18 | 
19 | ## About the confidence score
20 | 
21 | The confidence score (CS) for a given read _R_ classified to a given node _J_ is calculated by dividing the number of k-mers that hit any node in the clade rooted at node _J_ (N) by the total number of k-mers that were queried against the database (M). Any k-mer with an ambiguous nucleotide is not queried against the database, and is thus not part of M.
22 | 
23 | CS = N / M
24 | 
25 | If the CS for a given read _R_ at a given node _J_ is equal to or larger than the specified cutoff, read _R_ is classified to node _J_. If not, the CS of read _R_ is calculated for the parent of node _J_. This is repeated until the CS >= the CS cutoff, or until we reach the root of the taxonomy. If the CS < the CS cutoff at the root, the read is deemed unclassified.
26 | 
27 | ## Reclassifying Kraken 2 output
28 | 
29 | To reclassify reads classified by Kraken 2 with a confidence cutoff of 0.1:
30 | 
31 | `stringmeup --names names.dmp --nodes nodes.dmp 0.1 original_classifications.kraken2`
32 | 
33 | Where:
34 | * original_classifications.kraken2 is the output file from Kraken 2 that contains the read-by-read classifications.
35 | * names.dmp and nodes.dmp are the same NCBI taxonomy files that were used to build the database that produced the classifications in original_classifications.kraken2.
36 | 
37 | This command would output a Kraken 2 style report to stdout. Adding `--output_report <FILENAME>` would save the report in a file.
38 | 
39 | To save the read-by-read classifications, add `--output_classifications <FILENAME>` to the command.
40 | 
41 | To save a verbose version of the read-by-read classifications, add `--output_verbose <FILENAME>` to the command. The verbose version of the read-by-read classifications will contain the following columns:
42 | 
43 | | Column | Explanation |
44 | |--------|-------------|
45 | | READ_ID | The ID of the read |
46 | | READ_LENGTH | The length of the read (same as Kraken 2 output) |
47 | | MINIMIZER_HIT_GROUPS* | The number of minimizer hit groups found during Kraken 2 classification* |
48 | | TAX_LVL_MOVES | How many levels in the taxonomy the read moved during reclassification |
49 | | ORIGINAL_TAXID | The taxID that the read was classified to originally |
50 | | NEW_TAXID | The taxID that the read was reclassified to |
51 | | ORIGINAL_CONFIDENCE | The original confidence score |
52 | | NEW_CONFIDENCE | The confidence score at the taxID that the read was reclassified to |
53 | | MAX_CONFIDENCE | The maximum confidence that the read can have |
54 | | ORIGINAL_TAX_LVL | The taxonomic rank of the originally classified taxID |
55 | | NEW_TAX_LVL | The taxonomic rank of the reclassified taxID |
56 | | ORIGINAL_NAME | The scientific name of the original taxID |
57 | | NEW_NAME | The scientific name of the reclassified taxID |
58 | | KMER_STRING | The k-mer string (same as Kraken 2 output) |
59 | 
60 | *: Only present if the forked version of Kraken 2 was used for the initial classification.
61 | 
62 | ## Reclassifying with minimum hit groups
63 | 
64 | This option requires an input file that was produced with my [fork] of Kraken 2.
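Before adding that option, it may help to see what the confidence-score reclassification described above boils down to. The following is a minimal sketch for illustration only, not StringMeUp's actual implementation (see `stringmeup/stringmeup.py`, which also caches lineages and short-circuits reads that can never reach the cutoff); `tree` is assumed to be a `stringmeup.taxonomy.TaxonomyTree` built from names.dmp and nodes.dmp, and `kmer_hits` a `{tax_id: n_kmers}` dict parsed from the k-mer string column:

```python
def reclassify(kmer_hits, taxid, cutoff, tree):
    total = sum(kmer_hits.values())           # M: all queried (non-ambiguous) k-mers
    node = taxid                              # start at the original classification
    while True:
        clade = tree.get_clade([node])[node]  # all tax_ids in the clade rooted at node
        n = sum(k for t, k in kmer_hits.items() if t in clade)
        if n / total >= cutoff:               # CS = N / M
            return node                       # classified at this node
        if node == 1:                         # reached the root without passing the cutoff
            return 0                          # the read is deemed unclassified
        node = tree.get_parent([node])[node]  # move up to the parent and try again
```

Reclassifying with minimum hit groups simply adds one more gate in front of this loop, as described next.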
65 | 
66 | Add `--minimum_hit_groups <INT>` to the command. A read can only be considered classified if its number of minimizer hit groups is at or above the minimum_hit_groups setting.
67 | 
68 | [Kraken 2]: https://github.com/DerrickWood/kraken2
69 | [KrakMeOpen]: https://github.com/danisven/KrakMeOpen
70 | [fork]: https://github.com/danisven/kraken2
71 | 
--------------------------------------------------------------------------------
/stringmeup/taxonomy.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | import argparse
  4 | import logging
  5 | from collections import namedtuple
  6 | from dataclasses import dataclass, field
  7 | from os import path
  8 | 
  9 | logging.basicConfig(
 10 |     format='%(asctime)s %(levelname)-8s %(message)s',
 11 |     level=logging.INFO,
 12 |     datefmt='%Y-%m-%d [%H:%M:%S]')
 13 | log = logging.getLogger(path.basename(__file__))
 14 | 
 15 | # TODO: make it possible to use scientific names in the same way as tax_id
 16 | @dataclass
 17 | class Node:
 18 |     name: str = None
 19 |     genbank_common_name: str = None
 20 |     rank: str = None
 21 |     parent: int = None
 22 |     children: list = field(default_factory=list)
 23 | 
 24 | 
 25 | Rank = namedtuple('Rank', ['rank_name', 'rank_code', 'rank_depth'])
 26 | 
 27 | # Using the same rank codes as Kraken 2 (https://github.com/DerrickWood/kraken2/blob/master/src/reports.cc)
 28 | translate_rank2code = {
 29 |     'superkingdom': 'D',
 30 |     'kingdom': 'K',
 31 |     'phylum': 'P',
 32 |     'class': 'C',
 33 |     'order': 'O',
 34 |     'family': 'F',
 35 |     'genus': 'G',
 36 |     'species': 'S'
 37 | }
 38 | 
 39 | 
 40 | class TaxonomyTreeException(Exception):
 41 |     pass
 42 | 
 43 | 
 44 | class TaxonomyTree:
 45 |     """
 46 |     Creates a representation of the taxonomy in the files names.dmp and
 47 |     nodes.dmp of a kraken2 database.
 48 | 
 49 |     Inspired by https://github.com/frallain/NCBI_taxonomy_tree.
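
    A minimal usage example (assuming names.dmp and nodes.dmp from a standard
    NCBI taxonomy dump, in which 9606 is Homo sapiens and 9605 the genus Homo):

        tree = TaxonomyTree(nodes_filename='nodes.dmp', names_filename='names.dmp')
        tree.get_name([9606])    # -> {9606: 'Homo sapiens'}
        tree.get_parent([9606])  # -> {9606: 9605}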
50 | """ 51 | 52 | def __init__(self, nodes_filename, names_filename): 53 | self.nodes_filename = nodes_filename 54 | self.names_filename = names_filename 55 | self.wanted_name_types = set( 56 | ['scientific name', 'genbank common name']) 57 | 58 | # Main data structure 59 | self.taxonomy = {} 60 | 61 | self.byranks = {} 62 | self.leaves = set() 63 | 64 | # "Memory" data structure to be populated at function calls 65 | # For faster response in case of same query is asked again 66 | self.lineages = {} 67 | self.distances = {} 68 | self.lca_mappings = {} 69 | 70 | # Add nodes to self.taxonomy 71 | self.construct_tree() 72 | 73 | def construct_tree(self): 74 | """ 75 | Reads a names.dmp and nodes.dmp file, and constructs a taxonomy tree 76 | representation: 77 | {tax_id#1: Node('name', 'genbank_common_name', 'rank', 'parent', 'children'), 78 | tax_id#2: Node('name', 'genbank_common_name', 'rank', 'parent', 'children'), 79 | ..., 80 | tax_id#N: ...} 81 | """ 82 | 83 | log.info("Constructing taxonomy tree...") 84 | taxid2name = {} 85 | 86 | try: 87 | log.info('Mapping taxonomic ID to scientific and genbank common names from "{names_file}"...'.format(names_file=self.names_filename)) 88 | # TODO: check so that names.dmp conforms to expected format 89 | with open(self.names_filename, 'r') as f: 90 | for name_line in f: 91 | name_info = name_line.split('|') 92 | name_type = name_info[3].strip() 93 | if name_type not in self.wanted_name_types: 94 | continue 95 | 96 | tax_id = int(name_info[0].strip()) 97 | if tax_id not in taxid2name: 98 | taxid2name[tax_id] = { 99 | 'scientific_name': None, 100 | 'genbank_common_name': None} 101 | 102 | tax_name = name_info[1].strip() 103 | 104 | if name_type == 'scientific name': 105 | if taxid2name[tax_id]['scientific_name'] is not None: 106 | # Some logical checking, should only be one scientific name for a tax_id 107 | raise TaxonomyTreeException("Found more than one scientific name for a unique tax_id. The tax_id was '{}'".format(tax_id)) 108 | taxid2name[tax_id]['scientific_name'] = tax_name 109 | 110 | elif name_type == 'genbank common name': 111 | if taxid2name[tax_id]['genbank_common_name'] is not None: 112 | # Some logical checking, should only be one genbank common name for a tax_id 113 | raise TaxonomyTreeException("Found more than one genbank common name for a unique tax_id. The tax_id was '{}'".format(tax_id)) 114 | taxid2name[tax_id]['genbank_common_name'] = tax_name 115 | 116 | else: 117 | raise TaxonomyTreeException("Logical error. Should not end up here. 
name_type was '{}'".format(name_type))
118 | 
119 |         except FileNotFoundError:
120 |             log.exception('Could not find the file "{names_file}".'.format(names_file=self.names_filename))
121 |             raise
122 | 
123 |         try:
124 |             log.info('Reading taxonomy from "{nodes_file}"...'.format(nodes_file=self.nodes_filename))
125 |             # TODO: check so that nodes.dmp conforms to expected format
126 |             with open(self.nodes_filename, 'r') as f:
127 |                 for tax_line in f:
128 |                     tax_info = tax_line.split('|')[0:3]
129 |                     tax_id = int(tax_info[0].strip())
130 |                     tax_parent = int(tax_info[1].strip())
131 |                     tax_rank = tax_info[2].strip()
132 |                     tax_scientific_name = taxid2name[tax_id]['scientific_name']
133 |                     tax_common_name = taxid2name[tax_id]['genbank_common_name']
134 | 
135 |                     if tax_id in self.taxonomy:
136 |                         # We already inserted the current tax_id as a parent of another node
137 |                         self.taxonomy[tax_id].rank = tax_rank
138 |                         self.taxonomy[tax_id].parent = tax_parent
139 |                     else:
140 |                         node = Node(
141 |                             name=tax_scientific_name,
142 |                             genbank_common_name=tax_common_name,
143 |                             rank=tax_rank,
144 |                             parent=tax_parent,
145 |                             children=[])
146 |                         self.taxonomy[tax_id] = node
147 |                         self.leaves.add(tax_id)
148 | 
149 |                     if tax_parent in self.taxonomy:
150 |                         self.taxonomy[tax_parent].children.append(tax_id)
151 |                         if tax_parent in self.leaves:
152 |                             self.leaves.remove(tax_parent)
153 |                     else:
154 |                         parent_node = Node(
155 |                             name=taxid2name[tax_parent]['scientific_name'],
156 |                             genbank_common_name=taxid2name[tax_parent]['genbank_common_name'],
157 |                             rank=None,
158 |                             parent=None,
159 |                             children=[tax_id])
160 |                         self.taxonomy[tax_parent] = parent_node
161 | 
162 |                     # Save the tax_id to its corresponding rank set
163 |                     if tax_rank in self.byranks:
164 |                         self.byranks[tax_rank].add(tax_id)
165 |                     else:
166 |                         self.byranks[tax_rank] = set([tax_id])
167 | 
168 |         except FileNotFoundError:
169 |             log.exception('Could not find the nodes file "{nodes_file}".'.format(nodes_file=self.nodes_filename))
170 |             raise
171 | 
172 |         # Adjust the root (the root is tax_id=1, and its parent is also tax_id=1)
173 |         root_children = self.taxonomy[1].children
174 |         root_children.remove(1)
175 |         self.taxonomy[1].parent = None
176 |         self.taxonomy[1].children = root_children
177 |         log.info("Taxonomy tree built.")
178 | 
179 | 
180 |     def translate2taxid(self, scientific_names_list):
181 |         """
182 |         Will return the tax_ids for the scientific names listed in the input
183 |         list. If no tax_id can be found for a name, its list will be empty.
184 |         More than one tax_id may be found for any given scientific name - they
185 |         will all be added to the list of tax_ids returned for that name.
186 |         Returns:
187 |             {<scientific name>: [tax_id_1, tax_id_2]}
188 |         """
189 |         self._verify_list(scientific_names_list)
190 |         tax_id_dict = {k: list() for k in scientific_names_list}
191 | 
192 |         if len(tax_id_dict) != len(scientific_names_list):
193 |             log.warning('You entered duplicated names in the input list for translate2taxid.')
194 | 
195 |         for tax_id in self.taxonomy:
196 |             if self.taxonomy[tax_id].name in tax_id_dict:
197 |                 name = self.taxonomy[tax_id].name
198 |                 tax_id_dict[name].append(tax_id)
199 |             else:
200 |                 # continue search
201 |                 continue
202 | 
203 |         return tax_id_dict
204 | 
205 | 
206 |     def _get_property(self, tax_id, property):
207 |         """
208 |         Internal function to fetch the value of a single property of a Node in the taxonomy dictionary.
209 |         Raises an exception if tax_id does not exist in the taxonomy tree.
210 |         Raises an exception if the taxonomy tree isn't built yet.
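        E.g. _get_property(9606, 'rank') returns 'species' when a standard
        NCBI taxonomy dump has been loaded.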
211 | """ 212 | if self.taxonomy: 213 | try: 214 | property_value = getattr(self.taxonomy[tax_id], property) 215 | except KeyError: 216 | log.exception('Could not find tax_id={tax_id} in the taxonomy tree.'.format(tax_id=tax_id)) 217 | raise 218 | except AttributeError: 219 | log.exception('There is no such field ("{field}") in the namedtuple.'.format(field=property)) 220 | raise 221 | else: 222 | log.exception('You have not built the taxonomy tree yet.') 223 | raise TaxonomyTreeException('You have not built the taxonomy tree yet.') 224 | 225 | return property_value 226 | 227 | def _verify_list(self, putative_list): 228 | """ 229 | Internal helper function to check that input lists are indeed lists. 230 | """ 231 | try: 232 | assert isinstance(putative_list, list) 233 | except AssertionError: 234 | log.exception('Input must be a list. You input "{input}", of type {input_type}'.format( 235 | input=putative_list, input_type=type(putative_list))) 236 | raise 237 | 238 | def get_name(self, tax_id_list): 239 | """ 240 | Returns the names of the tax_ids in the input list. 241 | """ 242 | self._verify_list(tax_id_list) 243 | name_dict = {} 244 | for tax_id in tax_id_list: 245 | name_dict[tax_id] = self._get_property(tax_id, 'name') 246 | return name_dict 247 | 248 | def get_common_name(self, tax_id_list): 249 | """ 250 | Returns the genbank common names of the tax_ids in the input list. 251 | """ 252 | self._verify_list(tax_id_list) 253 | name_dict = {} 254 | for tax_id in tax_id_list: 255 | name_dict[tax_id] = self._get_property(tax_id, 'genbank_common_name') 256 | return name_dict 257 | 258 | def get_children(self, tax_id_list): 259 | """ 260 | Returns the direct descending children of each tax_id. 261 | """ 262 | self._verify_list(tax_id_list) 263 | children_dict = {} 264 | for tax_id in tax_id_list: 265 | children_dict[tax_id] = self._get_property(tax_id, 'children') 266 | return children_dict 267 | 268 | def get_parent(self, tax_id_list): 269 | """ 270 | Returns the parent of each tax_id. 271 | """ 272 | self._verify_list(tax_id_list) 273 | parent_dict = {} 274 | for tax_id in tax_id_list: 275 | parent_dict[tax_id] = self._get_property(tax_id, 'parent') 276 | return parent_dict 277 | 278 | def get_distance(self, tax_id_1, tax_id_2): 279 | """ 280 | Return the distance between two tax_ids. The distance is defined as 281 | the number of edges that need to be traversed to get from tax_id_1 to 282 | tax_id_2. 283 | 284 | Distance between a parent and child is 1, distance between two genera 285 | in the same family (where the family node is the direct parent of both 286 | genera) is 2, etc. 287 | 288 | All edges between two tax_ids are counted, so the distance between two 289 | ranks in one part of the tree can be different from that in another 290 | part of the tree (depending on tree structure). 291 | """ 292 | 293 | def one_way_distance(tax_id_ancestor, tax_id): 294 | """ 295 | Find the distance (number of steps) between the 296 | ancestor (tax_id_ancestor) and the taxon (tax_id). 
297 | """ 298 | 299 | # Lineage of the descendant tax_id (of which ancestor tax_id is part of) 300 | lineage = self.get_lineage([tax_id])[tax_id] 301 | 302 | # The indices of both tax_ids in the lineage 303 | ancestor_index = lineage.index(tax_id_ancestor) 304 | tax_id_index = lineage.index(tax_id) 305 | 306 | distance = tax_id_index - ancestor_index 307 | 308 | return distance 309 | 310 | distance = None 311 | 312 | # Extra calcs to check for distance from self.distances 313 | tax_id_small = min(tax_id_1, tax_id_2) 314 | tax_id_large = max(tax_id_1, tax_id_2) 315 | 316 | # self.distances is ordered... smallest tax_id always goes first 317 | if tax_id_small in self.distances: 318 | if tax_id_large in self.distances[tax_id_small]: 319 | distance = self.distances[tax_id_small][tax_id_large] 320 | else: 321 | self.distances[tax_id_small] = {} 322 | 323 | # Do we need to calculate the distance? 324 | if distance is None: 325 | 326 | # Lowest common ancestor 327 | lca = self.get_lca(tax_id_1, tax_id_2) 328 | 329 | # Sum of distances between both tax_ids and the LCA makes the total distance 330 | distance_1 = one_way_distance(lca, tax_id_1) 331 | distance_2 = one_way_distance(lca, tax_id_2) 332 | distance = distance_1 + distance_2 333 | 334 | # Save distance for faster response next time 335 | self.distances[tax_id_small][tax_id_large] = distance 336 | 337 | return distance 338 | 339 | def get_rank(self, tax_id_list): 340 | """ 341 | Returns the rank of each tax_id. 342 | """ 343 | self._verify_list(tax_id_list) 344 | rank_dict = {} 345 | for tax_id in tax_id_list: 346 | rank_dict[tax_id] = self._get_property(tax_id, 'rank') 347 | return rank_dict 348 | 349 | def get_rank_code(self, tax_id_list): 350 | """ 351 | Returns the rank, rank code, and rank offset for each tax_id. 352 | For example: 353 | tax_id 314295 is rank 'superfamily'. That rank has no rank code in the 354 | original Kraken 2 reports (see translate_rank2code dict above). Same 355 | goes for all of the 'no rank' tax_ids. Instead, 314295 is considered to 356 | be an 'order' but at the depth of 4, i.e. 4 steps down from the tax_id 357 | of rank 'order' that is closes above it in the lineage. The rank code 358 | is therefore O, and the depth is 4. So the full rank code is O4. 359 | 360 | Returns a dict of namedtupes, one for each tax_id in the supplied list. 361 | """ 362 | rank_dict = self.get_rank(tax_id_list) 363 | rank_code_dict = {} 364 | for tax_id in rank_dict: 365 | rank = rank_dict[tax_id] 366 | rank_code = '' 367 | current_node = tax_id 368 | 369 | # Find the rank code for this node or the one above 370 | while not rank_code: 371 | if rank in translate_rank2code: 372 | rank_code = translate_rank2code[rank] 373 | elif current_node == 1: 374 | # Special case for root, as it has rank 'no rank' 375 | rank_code = 'R' 376 | else: 377 | current_node = self.get_parent([current_node])[current_node] 378 | rank = self.get_rank([current_node])[current_node] 379 | 380 | rank_depth = self.get_distance(current_node, tax_id) 381 | rank_name = self.get_rank([tax_id])[tax_id] 382 | 383 | rank_tuple = Rank( 384 | rank_name=rank_name, 385 | rank_code=rank_code, 386 | rank_depth=rank_depth) 387 | 388 | rank_code_dict[tax_id] = rank_tuple 389 | 390 | return rank_code_dict 391 | 392 | def get_node(self, tax_id_list): 393 | """ 394 | Returns the node instances of the supplied tax_ids. 
395 | """ 396 | #TODO: Use this fnc in other fncs when getting nodes from self.taxonomy 397 | self._verify_list(tax_id_list) 398 | node_dict = {} 399 | 400 | if self.taxonomy: 401 | for tax_id in tax_id_list: 402 | try: 403 | node = self.taxonomy[tax_id] 404 | except KeyError: 405 | log.exception('Could not find tax_id={tax_id} in the taxonomy tree.'.format(tax_id=tax_id)) 406 | raise 407 | node_dict[tax_id] = node 408 | else: 409 | log.exception('You have not built the taxonomy tree yet.') 410 | raise TaxonomyTreeException('You have not built the taxonomy tree yet.') 411 | 412 | return node_dict 413 | 414 | def get_lineage(self, tax_id_list): 415 | """ 416 | For each tax_id, returns the input tax_id and the tax_ids of its 417 | ancestors. 418 | """ 419 | self._verify_list(tax_id_list) 420 | lineage_dict = {} 421 | 422 | for tax_id in tax_id_list: 423 | if tax_id in self.lineages: 424 | # Lineage has already been calculated, retrieve it 425 | lineage_dict[tax_id] = self.lineages[tax_id] 426 | continue 427 | 428 | lineage = [tax_id] 429 | node = self.get_node([tax_id])[tax_id] 430 | 431 | while node.parent: 432 | lineage.append(node.parent) 433 | node = self.get_node([node.parent])[node.parent] 434 | 435 | lineage.reverse() 436 | lineage_dict[tax_id] = lineage 437 | 438 | # Save lineage for faster response next time 439 | self.lineages[tax_id] = lineage 440 | 441 | return lineage_dict 442 | 443 | def get_clade(self, tax_id_list): 444 | """ 445 | For each tax_id, returns all of the tax_ids of the clade rooted at the 446 | tax_id. 447 | 448 | returns: {tax_id#1: set(all tax_ids in node), 449 | tax_id#2: set(all tax_ids in node)} 450 | """ 451 | 452 | self._verify_list(tax_id_list) 453 | clade_dict = {} 454 | 455 | for tax_id in tax_id_list: 456 | node = self.get_node([tax_id])[tax_id] 457 | children_pool = set(node.children) 458 | clade = set([tax_id]) 459 | clade.update(children_pool) 460 | 461 | while children_pool: 462 | try: 463 | clade_taxon = children_pool.pop() 464 | except KeyError: 465 | break 466 | else: 467 | new_children = self.get_node([clade_taxon])[clade_taxon].children 468 | clade.update(new_children) 469 | children_pool.update(new_children) 470 | 471 | clade_dict[tax_id] = clade 472 | 473 | return clade_dict 474 | 475 | def get_leaves(self, tax_ids=[1]): 476 | """ 477 | Returns a {tax_id: set(leaf_taxids)} mapping of leaf node tax_ids for 478 | the clades rooted at the tax_ids. 479 | """ 480 | 481 | self._verify_list(tax_ids) 482 | clade_dict = {} 483 | 484 | def get_leaves_dfs(tax_id, clade_leaves, visited_nodes=None): 485 | if visited_nodes == None: 486 | visited_nodes = set() 487 | 488 | if tax_id not in visited_nodes: 489 | visited_nodes.add(tax_id) 490 | children = self.get_children([tax_id])[tax_id] 491 | if children: 492 | for child in children: 493 | get_leaves_dfs(child, clade_leaves, visited_nodes) 494 | else: 495 | clade_leaves.add(tax_id) 496 | 497 | return clade_leaves 498 | 499 | for tax_id in tax_ids: 500 | clade_leaves = set() 501 | clade_leaves = get_leaves_dfs(tax_id, clade_leaves) 502 | clade_dict[tax_id] = clade_leaves 503 | 504 | return clade_dict 505 | 506 | def get_lca(self, tax_id_1, tax_id_2): 507 | """ 508 | Get the tax_id of the lowest common ancestor (LCA) of two tax_ids. 509 | """ 510 | lca = None 511 | 512 | # Extra calcs to check for lca from self.lca_mappings 513 | tax_id_small = min(tax_id_1, tax_id_2) 514 | tax_id_large = max(tax_id_1, tax_id_2) 515 | 516 | # self.lca_mappings is ordered... 
smallest tax_id always goes first
517 |         if tax_id_small in self.lca_mappings:
518 |             if tax_id_large in self.lca_mappings[tax_id_small]:
519 |                 lca = self.lca_mappings[tax_id_small][tax_id_large]
520 |         else:
521 |             self.lca_mappings[tax_id_small] = {}
522 | 
523 |         if lca is None:
524 |             # Get lineages and convert to sets for fast operation
525 |             lineages = self.get_lineage([tax_id_1, tax_id_2])
526 |             lineage_1 = set(lineages[tax_id_1])
527 |             lineage_2 = set(lineages[tax_id_2])
528 | 
529 |             # Get only the common tax_ids between the lineages of tax_id 1 and 2
530 |             common_lineage = lineage_1.intersection(lineage_2)
531 | 
532 |             # The LCA will be the tax_id @ index (num(common_taxIDs) - 1)
533 |             lca = lineages[tax_id_1][len(common_lineage) - 1]
534 | 
535 |             # Save LCA for faster response next time
536 |             self.lca_mappings[tax_id_small][tax_id_large] = lca
537 | 
538 |         return lca
539 | 
540 |     def get_clade_rank_taxids(self, tax_ids, rank=None):
541 |         """
542 |         For each clade rooted at the input tax_ids, return all tax_ids that
543 |         represent taxa at the supplied rank, or all ranks. For example:
544 |         # get_clade_rank_taxids([1], 'phylum') -- returns all phyla in the whole tree
545 |         # get_clade_rank_taxids([2, 9443], 'genus') -- returns all genera in the clades rooted at 'Bacteria' and 'Primates'
546 |         # get_clade_rank_taxids([1]) -- returns all canonical ranks in the whole tree.
547 |         """
548 |         self._verify_list(tax_ids)
549 | 
550 |         canonical_ranks = translate_rank2code.values()
551 |         canonical_rank_weights = {rank: weight for weight, rank in enumerate(['R'] + list(canonical_ranks))}
552 |         clade_tax_rank_dict = {tax_id: dict() for tax_id in tax_ids}
553 | 
554 |         if rank:
555 |             rank = translate_rank2code[rank]
556 |         else:
557 |             rank = canonical_ranks
558 | 
559 |         def dfs(tax_id, visited_nodes=None, tax_lvl_dict=None, wanted_ranks=None):
560 |             """
561 |             Fnc to recursively search the taxonomy tree in a depth-first
562 |             fashion. Saves all tax_ids that are canonical (S/G/F etc) in
563 |             tax_lvl_dict.
564 |             """
565 |             if visited_nodes is None:
566 |                 visited_nodes = set()
567 | 
568 |             if wanted_ranks is None:
569 |                 wanted_ranks = set(rank)  # 'rank' from the enclosing scope: a single rank code or all canonical codes
570 | 
571 |             if tax_lvl_dict is None:
572 |                 tax_lvl_dict = {tax_lvl: set() for tax_lvl in wanted_ranks}
573 | 
574 |             if tax_id not in visited_nodes:
575 |                 visited_nodes.add(tax_id)
576 | 
577 |                 taxonomy_rank = self.get_rank_code([tax_id])[tax_id]
578 |                 rank_code = taxonomy_rank.rank_code
579 |                 if taxonomy_rank.rank_depth == 0:
580 |                     if rank_code in wanted_ranks:
581 |                         tax_lvl_dict[rank_code].add(tax_id)
582 | 
583 |                 rank_code_weight = canonical_rank_weights[rank_code]
584 | 
585 |                 # Keep going down the tree only if there's a wanted rank below the current rank
586 |                 if any([rank_code_weight < canonical_rank_weights[rank] for rank in wanted_ranks]):
587 |                     children = self.get_children([tax_id])[tax_id]
588 |                     for child in children:
589 |                         _ = dfs(child, visited_nodes, tax_lvl_dict, wanted_ranks)
590 | 
591 |             return tax_lvl_dict
592 | 
593 |         for tax_id in tax_ids:
594 |             tax_lvl_dict = dfs(tax_id, wanted_ranks=set(rank))
595 |             clade_tax_rank_dict[tax_id] = tax_lvl_dict
596 | 
597 |         return clade_tax_rank_dict
598 | 
599 |     def get_siblings(self, tax_id):
600 |         """
601 |         NB! This fnc hasn't been extensively tested, use at own risk.
602 | 
603 |         This fnc is similar to get_clade_rank_taxids, but I think it should
604 |         be faster.
605 | 606 | For a given tax_id X with any rank in ['S', 'G', 'F', 'O', 'C', 'P'], 607 | return all taxa with the same rank in the clade rooted at the parent 608 | of X. The parent is defined as the most recent ancestor of X that has 609 | a rank also in ['S', 'G', 'F', 'O', 'C', 'P']. 610 | 611 | For example, if the tax_id 3352 (Pinus taeda, a species) is submitted 612 | to the function, it will return all other species in the genus Pinus 613 | (3337). Conversely, if the genus Pinus (3337) is submitted, the 614 | function will return all genera in the family Pinaceae (3318). 615 | """ 616 | # TODO: Test this more. 617 | # TODO: In line with other exposed functions in this class, it should take a list of taxids instead of a single one. 618 | 619 | tax_id_rank = self.get_rank_code([tax_id])[tax_id] 620 | rank = tax_id_rank.rank_code 621 | rank_codes = ['S', 'G', 'F', 'O', 'C', 'P'] 622 | 623 | if tax_id_rank.rank_depth != 0: 624 | raise TaxonomyTreeException("Can only work with ranks of level {}.".format(rank_codes)) 625 | 626 | def get_parent(tax_id): 627 | parent_rank_ok = False 628 | current_tax_id = tax_id 629 | while not parent_rank_ok: 630 | parent = self.get_parent([current_tax_id])[current_tax_id] 631 | taxonomy_rank = self.get_rank_code([parent])[parent] 632 | if taxonomy_rank.rank_code in rank_codes and taxonomy_rank.rank_depth == 0: 633 | parent_rank_ok = True 634 | elif parent == 1: 635 | parent_rank_ok = True 636 | else: 637 | current_tax_id = parent 638 | 639 | return parent 640 | 641 | parent = get_parent(tax_id) 642 | 643 | visited_nodes = set() 644 | siblings = set() 645 | 646 | def dfs(tax_id, wanted_rank): 647 | if tax_id not in visited_nodes: 648 | visited_nodes.add(tax_id) 649 | taxonomy_rank = self.get_rank_code([tax_id])[tax_id] 650 | if taxonomy_rank.rank_code != wanted_rank: 651 | children = self.get_children([tax_id])[tax_id] 652 | for child in children: 653 | dfs(child, wanted_rank) 654 | else: 655 | siblings.add(tax_id) 656 | 657 | dfs(parent, rank) 658 | return siblings 659 | 660 | 661 | if __name__ == '__main__': 662 | parser = argparse.ArgumentParser() 663 | parser.add_argument('--nodes') 664 | parser.add_argument('--names') 665 | args = parser.parse_args() 666 | 667 | taxonomy_tree = TaxonomyTree(args.nodes, args.names) 668 | -------------------------------------------------------------------------------- /stringmeup/stringmeup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | __version__ = "0.1.5" 4 | 5 | import argparse 6 | import operator 7 | import logging 8 | import gzip 9 | import sys 10 | from stringmeup import taxonomy 11 | from dataclasses import dataclass 12 | from os import path 13 | 14 | logging.basicConfig( 15 | format='%(asctime)s %(levelname)-8s %(message)s', 16 | level=logging.INFO, 17 | datefmt='%Y-%m-%d [%H:%M:%S]') 18 | log = logging.getLogger(path.basename(__file__)) 19 | 20 | # TODO: make sure confidence_threshold is between 0 and 1 21 | # TODO: For the verbose output, also output (1) the number of kmers that hit in total, (2) the number of non-ambiguous kmers (queried). 
 22 | 
 23 | 
 24 | @dataclass
 25 | class ReadClassification:
 26 |     current_node: int = None
 27 |     original_conf: float = None
 28 |     recalculated_conf: float = None
 29 |     original_taxid: int = None
 30 |     reclassified_taxid: int = None
 31 |     original_rank_code: str = None
 32 |     reclassified_rank_code: str = None
 33 |     original_name: str = None
 34 |     reclassified_name: str = None
 35 |     reclassified_distance: int = None
 36 |     id: str = None
 37 |     length: str = None
 38 |     kmer_string: str = None
 39 |     classified: bool = False
 40 |     max_confidence: float = None
 41 |     minimizer_hit_groups: int = None
 42 | 
 43 | 
 44 | @dataclass
 45 | class ReportNode:
 46 |     ratio: str
 47 |     hits_at_clade: int
 48 |     hits_at_node: int
 49 |     rank_code: str
 50 |     rank_depth: int
 51 |     node_taxid: int
 52 |     name: str
 53 |     offset: int
 54 | 
 55 | 
 56 | def validate_input_file(putative_classifications_file, verbose_input, minimum_hit_groups, paired_input):
 57 |     """
 58 |     Perform simple validation of the input file.
 59 |     """
 60 | 
 61 |     log.debug('Validating input classifications file.')
 62 | 
 63 |     if not path.isfile(putative_classifications_file):
 64 |         log.error('Cannot find the specified file ({file}).'.format(
 65 |             file=putative_classifications_file))
 66 |         sys.exit()
 67 | 
 68 |     with read_file(putative_classifications_file) as f:
 69 |         line = f.readline()
 70 |         line_proc = line.strip()
 71 |         line_proc = line_proc.split('\t')
 72 | 
 73 |         # The following should be the case for a Kraken 2 output file
 74 |         # First, check that the number of columns in the input file conforms to the expected number
 75 |         if not verbose_input:
 76 |             num_cols = len(line_proc) == 5  # original type of kraken2 output file
 77 |         else:
 78 |             num_cols = len(line_proc) == 6  # 6 columns if the output was produced with the verbose version of kraken2 that outputs minimizer hit groups
 79 | 
 80 |         # Line must start with C or U (as in Classified/Unclassified)
 81 |         line_start = line_proc[0] in ['U', 'C']
 82 | 
 83 |         # If the data is paired
 84 |         if paired_input:
 85 |             # Must be information on both sides of the pipe character
 86 |             data_col_1 = len(line_proc[3].split('|')) == 2
 87 | 
 88 |             # If the data is paired in the 3rd column, it must also be paired in the last column
 89 |             if "|" in line_proc[-1]:
 90 |                 data_col_2 = len(line_proc[-1].split('|:|')) == 2
 91 |             else:
 92 |                 data_col_2 = False
 93 | 
 94 |         # If the input is from single end reads, at least the read length column (3rd) must be an int
 95 |         else:
 96 |             try:
 97 |                 int(line_proc[3])
 98 |             except ValueError:
 99 |                 data_col_1 = False
100 |             else:
101 |                 data_col_1 = True
102 | 
103 |             # And the last column should contain colons between kmer/taxon pairs
104 |             if ":" in line_proc[4]:
105 |                 data_col_2 = True
106 |             else:
107 |                 data_col_2 = False
108 | 
109 |         if num_cols and line_start and data_col_1 and data_col_2:
110 |             log.debug('Validation OK.')
111 |             return
112 |         else:
113 |             log.error('The classifications file is malformatted.')
114 |             log.debug('First line of input: {}'.format(line))
115 |             log.debug('num_cols: {}'.format(num_cols))
116 |             log.debug('line_start: {}'.format(line_start))
117 |             log.debug('data_col_1: {}'.format(data_col_1))
118 |             log.debug('data_col_2: {}'.format(data_col_2))
119 |             sys.exit()
120 | 
121 | 
122 | def is_paired_input(classifications_file):
123 |     """
124 |     Returns true if the input file appears to contain paired read data.
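    (Kraken 2 joins the two read lengths of a pair with a pipe character in
    the length column, e.g. '150|151', which is what this function looks for.)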
125 | """ 126 | with read_file(classifications_file) as f: 127 | line = f.readline() 128 | line_proc = line.strip() 129 | line_proc = line_proc.split('\t') 130 | 131 | # If column 4 contains a pipe character "|", the data is paired 132 | if "|" in line_proc[3]: 133 | return True 134 | 135 | 136 | def is_verbose_input(classifications_file): 137 | """ 138 | Returns true if input file consists of 6 columns instead of 5. 139 | """ 140 | 141 | with read_file(classifications_file) as f: 142 | line = f.readline() 143 | line_proc = line.strip() 144 | line_proc = line_proc.split('\t') 145 | if len(line_proc) == 6: 146 | return True 147 | else: 148 | return False 149 | 150 | 151 | def process_kmer_string(kmer_info_string, paired_input): 152 | """ 153 | Process a kmer info string (last column of a Kraken 2 output file), so that 154 | we get a dictionary mapping of tax_ids to total sum of kmer hits. 155 | Returns: 156 | {tax_id_#1: X kmer hits, 157 | tax_id_#2: Y kmer hits, 158 | ... 159 | tax_id_#N: Z kmer hits} 160 | """ 161 | kmer_info_string = kmer_info_string.split() 162 | 163 | # Kraken2 classifications file for paired data contain the "|:|" delimiter 164 | if paired_input: 165 | kmer_info_string.remove('|:|') 166 | 167 | # Messy list comprehension. Converts all "taxa":"num_kmer" string pairs 168 | # into integer tuples like (taxa, num_kmers), and saves them in a list. 169 | # Ambiguous kmers are not processed (discarded). 170 | kmer_classifications = [ 171 | (int(x[0]), int(x[1])) for x in ( 172 | kmer_info.split(':') for kmer_info in kmer_info_string) 173 | if x[0] != 'A'] 174 | 175 | # Further processes the (taxa, num_kmers) tuples into a dict where each 176 | # tax_id stores the total sum of kmer hits to that tax_id. 177 | taxa_kmer_dict = {} 178 | for kmer_info in kmer_classifications: 179 | if kmer_info[0] not in taxa_kmer_dict: 180 | taxa_kmer_dict[kmer_info[0]] = kmer_info[1] 181 | else: 182 | taxa_kmer_dict[kmer_info[0]] += kmer_info[1] 183 | 184 | return taxa_kmer_dict 185 | 186 | 187 | def reclassify_read(read, confidence_threshold, taxonomy_tree, verbose_input, minimum_hit_groups, taxa_lineages, paired_input): 188 | """ 189 | Sums the number of kmers that hit in the clade rooted at "current_node", 190 | and divides it with the total number of kmers queried against the database: 191 | confidence = clade_kmer_hits / total_kmer_hits 192 | 193 | If the confidence at a specific node is < confidence_threshold, we go one 194 | step up the taxonomy (to the parent node) and recalculates the confidence. 195 | This is repeated until confidence >= confidence_threshold. 196 | 197 | In this function it's envisionable to include other parameters for the 198 | classification... Right now I'm only considering the confidence score 199 | and minimum hit groups. 200 | """ 201 | # Process the kmer string into a dict of {tax_id: #kmers} key, value pairs 202 | taxa_kmer_dict = process_kmer_string(read.kmer_string, paired_input) 203 | 204 | # Make the current node the same as the original classification 205 | read.current_node = read.original_taxid 206 | 207 | # The total number of kmers that were interrogated against the 208 | # database (non-ambiguous): 209 | total_kmer_hits = sum(taxa_kmer_dict.values()) 210 | 211 | # Only interested in tax_ids that are in the database. A '0' signifies that 212 | # the kmer could not be assigned to any tax_id (missing from database). 
213 | assigned_taxa_set = set( 214 | [tax_id for tax_id in taxa_kmer_dict.keys() if tax_id != 0]) 215 | 216 | # Make a quick check to see if it is even possible to obtain the confidence 217 | # needed to make a classification. If it isn't we don't have to go through 218 | # the hassle of calculating the confidence at all parent nodes. Potentially 219 | # saving us a lot of time. 220 | doomed_to_fail = False 221 | total_hits = sum(taxa_kmer_dict[tax_id] for tax_id in assigned_taxa_set) 222 | max_confidence = total_hits / total_kmer_hits 223 | read.max_confidence = max_confidence 224 | 225 | # The read can't achieve a confidence high enough, so we mark it 226 | if max_confidence < confidence_threshold: 227 | doomed_to_fail = True 228 | 229 | # Filter minimizer_hit_groups 230 | if verbose_input: 231 | if read.minimizer_hit_groups < minimum_hit_groups: 232 | doomed_to_fail = True 233 | 234 | # The nr of kmers that hit within the clade rooted at the current node: 235 | num_hits_within_clade = 0 236 | 237 | while not read.classified: 238 | taxa_in_clade = set() 239 | 240 | # For each tax_id that kmers in the read were assigned to: 241 | for tax_id in assigned_taxa_set: 242 | 243 | # Get the lineage (all ancestors including itself) for the tax_id: 244 | if tax_id in taxa_lineages: 245 | lineage = taxa_lineages[tax_id] 246 | else: 247 | lineage = taxonomy_tree.get_lineage([tax_id])[tax_id] 248 | 249 | # Save lineage so we don't have to get it from taxonomy_tree 250 | # more than once. Also make it into a set, which is faster to 251 | # query (the order of tax_ids in the lineage is not important 252 | # here). 253 | taxa_lineages[tax_id] = set(lineage) 254 | 255 | # If the currently classified (read.current_node) tax_id is in the 256 | # lineage (parents) of tax_id, then tax_id must be in the clade 257 | # rooted at read.current_node - i.e. tax_id is a descendant of 258 | # current_node. 259 | if read.current_node in lineage: 260 | 261 | # There is no need to get the lineage of tax_id in future 262 | # iterations since it will always be in the clade rooted at 263 | # read.current_node (we only ever go up in the taxonomy). 264 | # Remember which tax_ids we have counted, so we can remove them 265 | # from the set, outside of the loop: 266 | taxa_in_clade.add(tax_id) 267 | 268 | # Instead, we just add the kmers that hit tax_id to the total 269 | # hits at the clade: 270 | num_hits_within_clade += taxa_kmer_dict[tax_id] 271 | 272 | # Remove the already counted tax_ids: 273 | if taxa_in_clade: 274 | assigned_taxa_set -= taxa_in_clade 275 | 276 | # The confidence value for the read pair classification at the current 277 | # taxonomic level: 278 | read.recalculated_conf = num_hits_within_clade / total_kmer_hits 279 | 280 | # Set the original confidence score 281 | if not read.original_conf: 282 | read.original_conf = read.recalculated_conf 283 | 284 | # If we can't achieve the confidence score cutoff, now is the time 285 | # to exit the loop and return the read (since we have calculated 286 | # the original confidence). 287 | if doomed_to_fail: 288 | read.recalculated_conf = max_confidence 289 | read.reclassified_taxid = 0 290 | return read, taxa_lineages 291 | 292 | # If the confidence at this node is sufficient, we classify it to 293 | # the current node (TaxID). 
294 |             if read.recalculated_conf >= confidence_threshold:
295 |                 read.classified = True
296 |                 read.reclassified_taxid = read.current_node
297 |                 return read, taxa_lineages
298 | 
299 |             # If the current node is the root, we stop (can't go higher up in the
300 |             # taxonomy).
301 |             elif read.current_node == 1:
302 |                 read.reclassified_taxid = 0
303 |                 return read, taxa_lineages
304 | 
305 |             # Otherwise, set the current_node to the parent and keep going.
306 |             else:
307 |                 read.current_node = taxonomy_tree.get_parent(
308 |                     [read.current_node])[read.current_node]
309 | 
310 | 
311 | def read_kraken_output(classifications_file, report_frequency=1000000):
312 |     """
313 |     This should work to read classifications from a kraken 2 output file. It's
314 |     not complete, but the backbone is there. The point is, we should not have
315 |     to run reclassification in order to produce a report file - we should be
316 |     able to just read an output file and work with the classifications as they
317 |     are. Could also just modify the main loop.
318 |     """
319 |     tax_dict = {'hits_at_node': {}, 'hits_at_clade': {}}
320 |     i = 0
321 |     with open(classifications_file, 'r') as f:
322 |         for line in f:
323 |             if line.startswith('C'):
324 |                 fields = line.strip().split('\t')
325 |                 tax_id = int(fields[2])
326 |                 if tax_id not in tax_dict['hits_at_node']:
327 |                     tax_dict['hits_at_node'][tax_id] = 1
328 |                 else:
329 |                     tax_dict['hits_at_node'][tax_id] += 1
330 |             i += 1
331 |             if i % report_frequency == 0:
332 |                 print('Processed {} lines'.format(i))
333 |     return tax_dict
334 | 
335 | def get_kraken2_report_content(tax_reads, taxonomy_tree, total_reads):
336 |     """
337 |     First calculates the cumulative clade read count (i.e. for each node, how
338 |     many reads are classified to the clades rooted at that node).
339 | 
340 |     Then, sorts the nodes in the order they will be printed in the report file.
341 |     The sorting works like this: perform a depth first search of the taxonomy,
342 |     starting at the root. At each node, continue with the depth first search
343 |     in the order of highest to lowest cumulative clade read counts among the
344 |     child nodes.
345 | 
346 |     There is probably a way of merging the internal functions dfs_ccrc and
347 |     dfs_sort, but I was in a hurry and this is what it is.
348 | 
349 |     total_reads: total reads in the kraken output file.
350 |     """
351 |     report_node_list = []
352 | 
353 |     # Depth First Search, cumulative clade read count
354 |     def dfs_ccrc(visited_nodes, taxonomy_tree, node_taxid):
355 |         if node_taxid not in visited_nodes:
356 |             visited_nodes.add(node_taxid)
357 | 
358 |             # The cumulative clade read count for the clade rooted at the current
359 |             # node will start at the number of hits at that node:
360 |             if node_taxid in tax_reads['hits_at_node']:
361 |                 hits_at_node = tax_reads['hits_at_node'][node_taxid]
362 |             else:
363 |                 hits_at_node = 0
364 |             clade_read_count = hits_at_node
365 | 
366 |             # Recursively search the children in the clade rooted at the
367 |             # current node:
368 |             for child in taxonomy_tree.get_children([node_taxid])[node_taxid]:
369 |                 clade_read_count += dfs_ccrc(visited_nodes, taxonomy_tree, child)
370 | 
371 |             # Only save the node if its clade read count is != 0:
372 |             if clade_read_count > 0:
373 |                 tax_reads['hits_at_clade'][node_taxid] = clade_read_count
374 | 
375 |         return clade_read_count  # This return ends up in the above for loop
376 | 
377 |     # Depth First Search, sorting for the hierarchy of the output report.
378 | # Moves down the tree and adds the nodes with highest cumulative clade 379 | # read count to report_node_list 380 | def dfs_sort(visited_nodes, taxonomy_tree, node_taxid, offset): 381 | if node_taxid not in visited_nodes: 382 | visited_nodes.add(node_taxid) 383 | 384 | # Find the cumluative read counts for all children of current node: 385 | children_cumulative_counts = {} 386 | for child in taxonomy_tree.get_children([node_taxid])[node_taxid]: 387 | if child in tax_reads['hits_at_clade']: 388 | children_cumulative_counts[child] = tax_reads['hits_at_clade'][child] 389 | 390 | # Get the hits to this node (to be output in the report): 391 | if node_taxid in tax_reads['hits_at_node']: 392 | hits_at_node = tax_reads['hits_at_node'][node_taxid] 393 | else: 394 | hits_at_node = 0 395 | 396 | # Get some information that will go into the report file. 397 | # 1) total number of reads at this node and at the clade rooted here (column 2 in output): 398 | hits_at_clade = tax_reads['hits_at_clade'][node_taxid] 399 | # 2) the ratio (column 1 in output): 400 | ratio_classified2clade = hits_at_clade / total_reads * 100 401 | # 3) the rank code and depth (column 4 in output): 402 | rank_tuple = taxonomy_tree.get_rank_code([node_taxid])[node_taxid] 403 | # 4) scientific name of the node (column 6 of output): 404 | name = taxonomy_tree.get_name([node_taxid])[node_taxid] 405 | 406 | # Construct the dataclass instance that holds the information 407 | # about this node that is printed to the report file: 408 | report_node = ReportNode( 409 | ratio="{0:.2f}".format(ratio_classified2clade), 410 | hits_at_clade=hits_at_clade, 411 | hits_at_node=hits_at_node, 412 | rank_code=rank_tuple.rank_code, 413 | rank_depth=rank_tuple.rank_depth, 414 | node_taxid=node_taxid, 415 | offset=offset, 416 | name=name) 417 | 418 | # Append it to the list. The order of the elements in the list is 419 | # the order the nodes will be printed. 420 | report_node_list.append(report_node) 421 | 422 | if len(children_cumulative_counts) > 0: 423 | # sorted_by_ccrc is a list of tuples [(tax_id#1, ccrc), (tax_id#2, ccrc)...] 
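            # (For example, with NCBI tax_ids, {2: 90000, 2157: 1200} sorts to
            # [(2, 90000), (2157, 1200)], so Bacteria is printed before Archaea.)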
424 | sorted_by_ccrc = sorted( 425 | children_cumulative_counts.items(), 426 | key=operator.itemgetter(1), 427 | reverse=True) 428 | 429 | # We are going one level deeper in the taxonomy, increment 430 | # the offset: 431 | offset += 1 432 | 433 | for child_tuple in sorted_by_ccrc: 434 | child_taxid = child_tuple[0] 435 | dfs_sort(visited_nodes, taxonomy_tree, child_taxid, offset) 436 | 437 | # Depth first search to get cumulative read counts for all clades: 438 | visited_nodes = set() 439 | log.info('Calculating cumulative clade read counts...') 440 | dfs_ccrc(visited_nodes, taxonomy_tree, 1) 441 | 442 | # Depth first search to sort the output according to largest cumulative 443 | # read count: 444 | visited_nodes = set() 445 | log.info('Sorting the order of the output in the report file...') 446 | dfs_sort(visited_nodes, taxonomy_tree, 1, 0) 447 | 448 | # Make sure to add the unclassified row that goes at the very top of the 449 | # kraken 2 report: 450 | num_unclassified_reads = total_reads - tax_reads['hits_at_clade'][1] 451 | ratio = num_unclassified_reads / total_reads * 100 452 | unclassified_node = ReportNode( 453 | ratio="{0:.2f}".format(ratio), 454 | hits_at_clade=num_unclassified_reads, 455 | hits_at_node=num_unclassified_reads, 456 | rank_code='U', 457 | rank_depth=0, 458 | node_taxid=0, 459 | name='unclassified', 460 | offset=0) 461 | 462 | report_node_list.insert(0, unclassified_node) 463 | 464 | return report_node_list 465 | 466 | 467 | def format_kraken2_report_row(report_node): 468 | """ 469 | Formats the row that will be output in the kraken 2 style report. Input 470 | is an instance of ReportNode. 471 | """ 472 | offset = 2 * report_node.offset * ' ' 473 | name = offset + report_node.name 474 | 475 | if report_node.rank_depth == 0: 476 | rank_depth = '' 477 | else: 478 | rank_depth = str(report_node.rank_depth) 479 | 480 | rank_code = report_node.rank_code + rank_depth 481 | report_row = '\t'.join([ 482 | str(report_node.ratio), 483 | str(report_node.hits_at_clade), 484 | str(report_node.hits_at_node), 485 | rank_code, 486 | str(report_node.node_taxid), 487 | name]) 488 | 489 | return report_row 490 | 491 | 492 | def make_kraken2_report(tax_reads, taxonomy_tree, total_reads, output_report): 493 | """ 494 | Gets the information that should be printed from 495 | get_kraken2_report_content. Formats the information and prints it to file 496 | or stdout. 497 | """ 498 | log.info('Creating report...') 499 | 500 | # Get the data to print 501 | report_node_list = get_kraken2_report_content( 502 | tax_reads, taxonomy_tree, total_reads) 503 | 504 | # If the output should go to file 505 | if output_report: 506 | with open(output_report, 'w') as f: 507 | for node in report_node_list: 508 | 509 | # Format the output 510 | report_row = format_kraken2_report_row(node) 511 | f.write(report_row + '\n') 512 | 513 | log.info('Report saved in {}.'.format(output_report)) 514 | 515 | # Otherwise, print to stdout 516 | else: 517 | for node in report_node_list: 518 | 519 | # Format the output 520 | report_row = format_kraken2_report_row(node) 521 | sys.stdout.write(report_row + '\n') 522 | 523 | 524 | def get_verbose_output(read, taxonomy_tree): 525 | """ 526 | Gets more information about the reclassification of a read. This is for 527 | the output_verbose option. 528 | 529 | read is an instance of ReadClassification. 
530 | """ 531 | # Variable renaming to make things more readable 532 | new_taxid = read.reclassified_taxid 533 | old_tax_id = read.original_taxid 534 | 535 | # Stuff that is conditional if we have a classified read or not. 536 | # TaxonomyTree doesn't cope with tax_id=0 537 | if read.classified: 538 | distance = taxonomy_tree.get_distance(new_taxid, old_tax_id) 539 | new_rank_tuple = taxonomy_tree.get_rank_code([new_taxid])[new_taxid] 540 | new_rank_depth = new_rank_tuple.rank_depth if new_rank_tuple.rank_depth != 0 else '' 541 | new_rank_code = str(new_rank_tuple.rank_code) + str(new_rank_depth) 542 | new_rank_name = taxonomy_tree.get_name([new_taxid])[new_taxid] 543 | else: 544 | distance = 'NaN' 545 | new_rank_code = 'U' 546 | new_rank_name = 'unclassified' 547 | 548 | # Distance information 549 | read.reclassified_distance = distance 550 | 551 | # Rank information 552 | old_rank_tuple = taxonomy_tree.get_rank_code([old_tax_id])[old_tax_id] 553 | old_rank_depth = old_rank_tuple.rank_depth if old_rank_tuple.rank_depth != 0 else '' 554 | old_rank_code = str(old_rank_tuple.rank_code) + str(old_rank_depth) 555 | read.original_rank_code = old_rank_code 556 | read.reclassified_rank_code = new_rank_code 557 | 558 | # Scientific name information 559 | old_rank_name = taxonomy_tree.get_name([old_tax_id])[old_tax_id] 560 | read.original_name = old_rank_name 561 | read.reclassified_name = new_rank_name 562 | 563 | return read 564 | 565 | 566 | def create_read(kraken2_read, verbose_input=False): 567 | """ 568 | Creates an instance of ReadClassification dataclass, that holds 569 | information about the read and its classification. 570 | """ 571 | # Process the read string so that its elements go into a list 572 | read_pair_proc = kraken2_read.strip() 573 | read_pair_proc = read_pair_proc.split('\t') 574 | 575 | # Create the read object 576 | read = ReadClassification( 577 | original_taxid=int(read_pair_proc[2]), 578 | id=read_pair_proc[1], 579 | length=read_pair_proc[3], 580 | kmer_string=read_pair_proc[-1]) 581 | 582 | if verbose_input: 583 | read.minimizer_hit_groups = int(read_pair_proc[4]) 584 | 585 | return read 586 | 587 | 588 | def main_loop(f_handle, tax_reads_dict, taxonomy_tree, args, report_frequency, taxa_lineages, paired_input, verbose_input=False, o_handle=None, v_handle=None): 589 | """ 590 | f_handle: classifications input file to read from. 591 | o_handle: output_classifications file to write to. 592 | v_handle: output_verbose file to write to. 
593 | """ 594 | def write_read_output(read): 595 | # read is an instance of ReadClassification 596 | classification = 'C' if read.classified else 'U' 597 | row_items = [ 598 | classification, 599 | read.id, 600 | read.reclassified_taxid, 601 | read.length, 602 | read.kmer_string] 603 | 604 | if verbose_input: 605 | row_items.insert(4, read.minimizer_hit_groups) 606 | 607 | row_string = '\t'.join([str(x) for x in row_items]) + '\n' 608 | _ = o_handle.write(row_string) # gzip write fnc returns output, therefore send to "_" 609 | 610 | def write_verbose_output(read): 611 | # read is an instance of ReadClassification 612 | row_items = [ 613 | read.id, 614 | read.length, 615 | read.reclassified_distance, 616 | read.original_taxid, 617 | read.reclassified_taxid, 618 | "{0:.2f}".format(read.original_conf), 619 | "{0:.2f}".format(read.recalculated_conf), 620 | "{0:.2f}".format(read.max_confidence), 621 | read.original_rank_code, 622 | read.reclassified_rank_code, 623 | read.original_name, 624 | read.reclassified_name, 625 | read.kmer_string] 626 | 627 | if verbose_input: 628 | row_items.insert(2, read.minimizer_hit_groups) 629 | 630 | row_string = '\t'.join([str(x) for x in row_items]) + '\n' 631 | _ = v_handle.write(row_string) # gzip write fnc returns output, therefore send to "_" 632 | 633 | # Parse the input file, read per read 634 | i = 0 635 | for read_pair in f_handle: 636 | 637 | # Only working with classified reads: 638 | if read_pair.startswith('C'): 639 | 640 | # Make an instance of ReadClassification to hold information 641 | # about the read and its classification 642 | read = create_read(read_pair, verbose_input) 643 | 644 | # Reclassify the read pair based on confidence 645 | read, taxa_lineages = reclassify_read( 646 | read, 647 | args.confidence_threshold, 648 | taxonomy_tree, 649 | verbose_input, 650 | args.minimum_hit_groups, 651 | taxa_lineages, 652 | paired_input) 653 | 654 | # Counter for number of reads per taxon/node 655 | if read.reclassified_taxid in tax_reads_dict['hits_at_node']: 656 | tax_reads_dict['hits_at_node'][read.reclassified_taxid] += 1 657 | else: 658 | tax_reads_dict['hits_at_node'][read.reclassified_taxid] = 1 659 | 660 | # Write the reclassified reads to file 661 | if o_handle: 662 | if read.classified or args.keep_unclassified: 663 | write_read_output(read) 664 | 665 | # Write verbose output about the reclassification 666 | if v_handle: 667 | read = get_verbose_output(read, taxonomy_tree) 668 | write_verbose_output(read) 669 | 670 | else: 671 | # Change here if you want to keep reads from the input file 672 | # that were initially unclassified. 673 | pass 674 | 675 | # Keep track of progress 676 | i += 1 677 | if i % report_frequency == 0: 678 | log.info('Processed {} reads...'.format(i)) 679 | 680 | log.info('Done processing reads. They were {} in total.'.format(i)) 681 | 682 | # Output a report file 683 | make_kraken2_report(tax_reads_dict, taxonomy_tree, i, args.output_report) # i is used to calculate the ratio of classified reads (col 1 in output file). 684 | 685 | 686 | def read_file(filename): 687 | """ 688 | Wrapper to read either gzipped or ordinary text file input. 689 | """ 690 | if filename.endswith('.gz'): 691 | return gzip.open(filename, 'rt') 692 | else: 693 | return open(filename, 'r') 694 | 695 | 696 | def write_file(filename, gz_output): 697 | """ 698 | Wrapper to write either gzipped or ordinary text file output. 
699 | """ 700 | if gz_output: 701 | return gzip.open(filename, 'wt') 702 | else: 703 | return open(filename, 'w') 704 | 705 | 706 | def get_arguments(): 707 | """ 708 | Wrapper function to get the command line arguments. Inserting this piece of code 709 | into its own function for conda compatibility. 710 | """ 711 | 712 | parser = argparse.ArgumentParser( 713 | prog='StringMeUp', 714 | usage='stringmeup --names --nodes [--output_report ] [--output_classifications ] [--output_verbose ] [--keep_unclassified] [--minimum_hit_groups INT] [--gz_output] [--help] confidence classifications', 715 | description='A post-processing tool to reclassify Kraken 2 output based on the confidence score and/or minimum minimizer hit groups.') 716 | parser.add_argument( 717 | 'confidence_threshold', 718 | metavar='confidence', 719 | type=float, 720 | help='The confidence score threshold to be used in reclassification [0-1].') 721 | parser.add_argument( 722 | 'original_classifications_file', 723 | metavar='classifications', 724 | type=str, 725 | help='Path to the Kraken 2 output file containing the individual read classifications.') 726 | parser.add_argument( 727 | '--output_report', 728 | metavar='FILE', 729 | type=str, 730 | help='File to save the Kraken 2 report in.') 731 | parser.add_argument( 732 | '--output_classifications', 733 | metavar='FILE', 734 | type=str, 735 | help='File to save the Kraken 2 read classifications in.') 736 | parser.add_argument( 737 | '--keep_unclassified', 738 | action='store_true', 739 | help='Specify if you want to output unclassified reads in addition to classified reads. NOTE(!): This script will always discard reads that are unclassified in the classifications input file, this flag will just make sure to keep previously classified reads even if they are reclassified as unclassified by this script. TIP(!): Always run Kraken2 with no confidence cutoff.') 740 | parser.add_argument( 741 | '--output_verbose', 742 | metavar='FILE', 743 | type=str, 744 | help='File to send verbose output to. This file will contain, for each read, (1) original classification, (2) new classification, (3) original confidence, (4), new confidence (5), original taxa name (6), new taxa name, (7) original rank, (8) new rank, (9) distance travelled (how many nodes was it lifted upwards in the taxonomy).') 745 | parser.add_argument( 746 | '--names', 747 | metavar='FILE', 748 | required=True, 749 | help='Taxonomy names dump file (names.dmp)') 750 | parser.add_argument( 751 | '--nodes', 752 | metavar='FILE', 753 | required=True, 754 | help='Taxonomy nodes dump file (nodes.dmp)') 755 | parser.add_argument( 756 | '--minimum_hit_groups', 757 | metavar='INT', 758 | type=int, 759 | help='The minimum number of hit groups a read needs to be classified. NOTE: You need to supply a classifications file (kraken2 output) that contain the "minimizer_hit_groups" column.') 760 | parser.add_argument( 761 | '--gz_output', 762 | action='store_true', 763 | help='Set this flag to output and in gzipped format (will add .gz extension to the filenames).' 764 | ) 765 | args = parser.parse_args() 766 | 767 | return args 768 | 769 | 770 | def stringmeup(): 771 | 772 | # Get the CL arguments 773 | args = get_arguments() 774 | 775 | # Some initial setup 776 | taxa_lineages = {} 777 | report_frequency = 10000000 # Will output progress every nth read 778 | tax_reads_dict = {'hits_at_node': {}, 'hits_at_clade': {}} 779 | 780 | # Was the input generated with https://github.com/danisven/kraken2 ? 
    verbose_input = is_verbose_input(args.original_classifications_file)

    # If so, warn if the user didn't ask for reclassification based on
    # minimizer hit groups
    if verbose_input:
        log.info("The input file appears to contain a column for minimizer hit groups.")

        if not args.minimum_hit_groups:
            log.warning('You didn\'t specify --minimum_hit_groups.')
            log.warning('Will NOT reclassify based on minimizer hit groups, setting minimum_hit_groups=1 (lowest possible setting).')
            args.minimum_hit_groups = 1

    # If not, warn if the user was planning to reclassify with minimizer hit groups
    else:
        if args.minimum_hit_groups:
            log.warning('You specified minimum_hit_groups={}, but did not supply an input file with a column for minimizer hit groups.'.format(args.minimum_hit_groups))
            log.warning('Will NOT reclassify based on minimizer hit groups.')
            args.minimum_hit_groups = None

    # Check if the input data is paired or not
    paired_input = is_paired_input(args.original_classifications_file)
    if paired_input:
        log.info('Classifications were made from paired-end data.')
    else:
        log.info('Classifications were made from single-read data.')

    # Perform a naive check of the input file
    validate_input_file(args.original_classifications_file, verbose_input, args.minimum_hit_groups, paired_input)

    # Create a TaxonomyTree from the user-provided names.dmp and nodes.dmp files
    taxonomy_tree = taxonomy.TaxonomyTree(names_filename=args.names, nodes_filename=args.nodes)

    # Filehandles-to-be
    o = None
    v = None

    # Open the classifications input file:
    with read_file(args.original_classifications_file) as f:
        log.info('Processing read classifications from "{file}".'.format(file=path.abspath(args.original_classifications_file)))

        # TODO: make sure output files are writable
        # If the user wants to save the read classifications to file, open the file
        if args.output_classifications:
            if args.gz_output:
                if not args.output_classifications.endswith('.gz'):
                    args.output_classifications += '.gz'
            log.info('Saving reclassified reads in {}.'.format(args.output_classifications))
            o = write_file(args.output_classifications, args.gz_output)

        # If the user wants to save the verbose classification output to file, open the file
        if args.output_verbose:
            if args.gz_output:
                if not args.output_verbose.endswith('.gz'):
                    args.output_verbose += '.gz'
            log.info('Saving verbose classification information in {}.'.format(args.output_verbose))
            v = write_file(args.output_verbose, args.gz_output)

        # Run the main loop (reclassification)
        main_loop(f, tax_reads_dict, taxonomy_tree, args, report_frequency, taxa_lineages, paired_input, verbose_input, o, v)

    # Remember to close the output files
    if o:
        o.close()
    if v:
        v.close()


if __name__ == '__main__':
    stringmeup()
--------------------------------------------------------------------------------